diff --git a/build.c b/build.c index 91867798..9959084b 100644 --- a/build.c +++ b/build.c @@ -22,7 +22,6 @@ Bool arg_crtlib = false; Bool arg_debinfo = false; Bool arg_developer = false; Bool arg_profiling = false; -Bool arg_profiler_sampling = false; Bool arg_unoptimized = false; /* ========================== * @@ -359,7 +358,6 @@ void OnBuild(StringList cli_args) if (StringEqual(arg, Lit("-debinfo"))) arg_debinfo = true; if (StringEqual(arg, Lit("-developer"))) arg_developer = true; if (StringEqual(arg, Lit("-profiling"))) arg_profiling = true; - if (StringEqual(arg, Lit("-sampling"))) arg_profiler_sampling = true; if (StringEqual(arg, Lit("-unoptimized"))) arg_unoptimized = true; } break; } @@ -404,7 +402,6 @@ void OnBuild(StringList cli_args) SH_PrintF(Lit("[%F]\n"), FmtStr(compiler)); if (arg_asan) SH_Print(Lit("[Asan Enabled]\n")); if (arg_profiling) SH_Print(Lit("[Profiling]\n")); - if (arg_profiler_sampling) SH_Print(Lit("[Profiler sampling]\n")); if (arg_developer) SH_Print(Lit("[Developer build]\n")); SH_Print(Lit("------------------------------\n\n")); } @@ -594,12 +591,6 @@ void OnBuild(StringList cli_args) } StringListAppend(&perm, &compile_args, StringF(&perm, Lit("-DTRACY_INCLUDE_PATH=\"%F\""), FmtStr(tracy_include_path))); - /* Tracy flags */ - StringListAppend(&perm, &compile_args, Lit("-DTRACY_ENABLE=1")); - if (!arg_profiler_sampling) { - StringListAppend(&perm, &compile_args, Lit("-DTRACY_NO_SAMPLING -DTRACY_NO_SYSTEM_TRACING -DTRACY_NO_CALLSTACK")); - } - /* Disable compiler warnings when compiling tracy client */ compile_warnings = (StringList) { 0 }; link_warnings = (StringList) { 0 }; @@ -607,14 +598,6 @@ void OnBuild(StringList cli_args) StringListAppend(&perm, &link_warnings, Lit("-Wno-everything")); } - /* Profiler sampling */ - if (arg_profiler_sampling) { - if (!arg_profiling) { - Error(Lit("Profiling must be enabled to use profiler sampling")); - OS_Exit(1); - } - } - if (!arg_msvc) { String incbin_dir = StringReplace(&perm, 
out_inc_dir_path, Lit("\\"), Lit("/")); StringListAppend(&perm, &compile_args, StringF(&perm, Lit("-DINCBIN_DIR_RAW=\"%F\""), FmtStr(incbin_dir))); diff --git a/src/app.c b/src/app.c index 7434b16d..c9dcc024 100644 --- a/src/app.c +++ b/src/app.c @@ -3,10 +3,9 @@ #include "string.h" #include "scratch.h" #include "sys.h" -#include "work.h" +#include "job.h" #include "user.h" #include "sim.h" -#include "sim.h" #include "playback.h" #include "log.h" #include "resource.h" @@ -248,8 +247,8 @@ void app_entry_point(struct string args_str) i32 num_logical_cores = (i32)sys_num_logical_processors(); //num_logical_cores = min(num_logical_cores, 8) + (max(num_logical_cores - 8, 0) / 2); /* Dumb heuristic to try and lessen e-core usage */ - i32 min_worker_count = 2; - i32 max_worker_count = 128; + i32 min_worker_count = JOB_MIN_WORKER_COUNT; + i32 max_worker_count = JOB_MAX_WORKER_COUNT; i32 target_worker_count = num_logical_cores - num_reserved_cores; worker_count = (u32)clamp_i32(target_worker_count, min_worker_count, max_worker_count); #endif @@ -319,20 +318,20 @@ void app_entry_point(struct string args_str) } /* Startup systems */ + job_startup(worker_count); + struct resource_startup_receipt resource_sr = resource_startup(); struct sock_startup_receipt sock_sr = sock_startup(); struct host_startup_receipt host_sr = host_startup(&sock_sr); - struct resource_startup_receipt resource_sr = resource_startup(); - struct work_startup_receipt work_sr = work_startup(worker_count); - struct gp_startup_receipt gp_sr = gp_startup(&work_sr); - struct asset_cache_startup_receipt asset_cache_sr = asset_cache_startup(&work_sr); + struct gp_startup_receipt gp_sr = gp_startup(); + struct asset_cache_startup_receipt asset_cache_sr = asset_cache_startup(); struct ttf_startup_receipt ttf_sr = ttf_startup(); - struct font_startup_receipt font_sr = font_startup(&work_sr, &gp_sr, &asset_cache_sr, &ttf_sr, &resource_sr); + struct font_startup_receipt font_sr = font_startup(&gp_sr, &asset_cache_sr, 
&ttf_sr, &resource_sr); struct sprite_startup_receipt sprite_sr = sprite_startup(&gp_sr, &resource_sr); struct mixer_startup_receipt mixer_sr = mixer_startup(); - struct sound_startup_receipt sound_sr = sound_startup(&work_sr, &asset_cache_sr, &resource_sr); + struct sound_startup_receipt sound_sr = sound_startup(&asset_cache_sr, &resource_sr); struct draw_startup_receipt draw_sr = draw_startup(&gp_sr, &font_sr); struct sim_startup_receipt sim_sr = sim_startup(); - struct user_startup_receipt user_sr = user_startup(&work_sr, &gp_sr, &font_sr, &sprite_sr, &draw_sr, &asset_cache_sr, &sound_sr, &mixer_sr, &host_sr, &sim_sr, connect_address, window); + struct user_startup_receipt user_sr = user_startup(&gp_sr, &font_sr, &sprite_sr, &draw_sr, &asset_cache_sr, &sound_sr, &mixer_sr, &host_sr, &sim_sr, connect_address, window); struct playback_startup_receipt playback_sr = playback_startup(&mixer_sr); (UNUSED)user_sr; diff --git a/src/asset_cache.c b/src/asset_cache.c index 075b7c41..2b15fa6d 100644 --- a/src/asset_cache.c +++ b/src/asset_cache.c @@ -5,8 +5,8 @@ #include "arena.h" #include "scratch.h" #include "util.h" -#include "work.h" #include "log.h" +#include "job.h" /* ========================== * * Global state @@ -35,10 +35,8 @@ GLOBAL struct { * Startup * ========================== */ -struct asset_cache_startup_receipt asset_cache_startup(struct work_startup_receipt *work_sr) +struct asset_cache_startup_receipt asset_cache_startup(void) { - (UNUSED)work_sr; - /* Init lookup */ G.lookup_mutex = sys_mutex_alloc(); /* Init store */ @@ -152,7 +150,7 @@ struct asset *asset_cache_touch(struct string key, u64 hash, b32 *is_first_touch .status = ASSET_STATUS_UNINITIALIZED, .hash = hash, .key = key_stored, - .work_ready_sf = sync_flag_alloc(), + .job_ready_sf = sync_flag_alloc(), .asset_ready_sf = sync_flag_alloc() }; if (is_first_touch) { @@ -173,13 +171,13 @@ struct asset *asset_cache_touch(struct string key, u64 hash, b32 *is_first_touch * Marking * 
========================== */ -/* Call this once asset work has been created */ +/* Call this once asset job has been created */ void asset_cache_mark_loading(struct asset *asset) { asset->status = ASSET_STATUS_LOADING; } -/* Call this once asset work has finished */ +/* Call this once asset job has finished */ void asset_cache_mark_ready(struct asset *asset, void *store_data) { asset->store_data = store_data; @@ -189,24 +187,23 @@ void asset_cache_mark_ready(struct asset *asset, void *store_data) } /* ========================== * - * Work + * Job * ========================== */ -/* NOTE: If an asset doesn't have any load work then call this function with `NULL` */ -void asset_cache_set_work(struct asset *asset, struct work_handle *handle) +void asset_cache_set_job(struct asset *asset, struct job_handle *job) { - asset->work = handle ? *handle : (struct work_handle) { 0 }; - sync_flag_set(&asset->work_ready_sf); + asset->job = job ? *job : (struct job_handle) { 0 }; + sync_flag_set(&asset->job_ready_sf); } void asset_cache_wait(struct asset *asset) { if (asset->status != ASSET_STATUS_READY) { - /* Wait for work to be set */ - sync_flag_wait(&asset->work_ready_sf); - /* Help with work */ - if (asset->work.gen != 0) { - work_help(asset->work); + /* Wait for job to be set */ + sync_flag_wait(&asset->job_ready_sf); + /* Wait on job */ + if (asset->job.gen != 0) { + job_wait(asset->job); } /* Wait for asset to be ready */ sync_flag_wait(&asset->asset_ready_sf); diff --git a/src/asset_cache.h b/src/asset_cache.h index e83cbbab..668432c3 100644 --- a/src/asset_cache.h +++ b/src/asset_cache.h @@ -2,10 +2,8 @@ #define ASSET_CACHE_H #include "sys.h" -#include "work.h" #include "util.h" - -struct work_startup_receipt; +#include "job.h" enum asset_status { ASSET_STATUS_NONE, @@ -21,9 +19,9 @@ struct asset { u64 hash; struct string key; - /* Managed via asset_cache_set_work */ - struct work_handle work; - struct sync_flag work_ready_sf; + /* Managed via asset_cache_set_job */ + 
struct job_handle job; + struct sync_flag job_ready_sf; /* Managed via asset_cache_mark_x functions */ enum asset_status status; @@ -41,14 +39,14 @@ struct asset_cache_store { }; struct asset_cache_startup_receipt { i32 _; }; -struct asset_cache_startup_receipt asset_cache_startup(struct work_startup_receipt *work_sr); +struct asset_cache_startup_receipt asset_cache_startup(void); struct asset *asset_cache_touch(struct string key, u64 hash, b32 *is_first_touch); void asset_cache_mark_loading(struct asset *asset); void asset_cache_mark_ready(struct asset *asset, void *store_data); -void asset_cache_set_work(struct asset *asset, struct work_handle *handle); +void asset_cache_set_job(struct asset *asset, struct job_handle *job); void asset_cache_wait(struct asset *asset); void *asset_cache_get_store_data(struct asset *asset); diff --git a/src/common.h b/src/common.h index 78b34fd2..68cc23f0 100644 --- a/src/common.h +++ b/src/common.h @@ -626,129 +626,18 @@ INLINE i64 clamp_i64(i64 v, i64 min, i64 max) { return v < min ? min : v > max ? INLINE f32 clamp_f32(f32 v, f32 min, f32 max) { return v < min ? min : v > max ? max : v; } INLINE f64 clamp_f64(f64 v, f64 min, f64 max) { return v < min ? min : v > max ? 
max : v; } -/* ========================== * - * Profiling - * ========================== */ - -#if PROFILING - -#include STRINGIZE(TRACY_INCLUDE_PATH) - -#define PROFILING_CAPTURE_FRAME_IMAGE 0 - -/* Clang/GCC cleanup macros */ -#if COMPILER_MSVC -# error "MSVC not supported for profiling (cleanup attributes are required for profiling markup)" -#else -# ifdef TRACY_NO_CALLSTACK -# define __prof static const struct ___tracy_source_location_data CAT(__tracy_source_location,__LINE__) = { NULL, __func__, __FILE__, (uint32_t)__LINE__, 0 }; __attribute((cleanup(__prof_zone_cleanup_func))) TracyCZoneCtx __tracy_zone_ctx = ___tracy_emit_zone_begin( &CAT(__tracy_source_location,__LINE__), true ); -# define __profscope(name) static const struct ___tracy_source_location_data CAT(__tracy_source_location,__LINE__) = { #name, __func__, __FILE__, (uint32_t)__LINE__, 0 }; __attribute((cleanup(__prof_zone_cleanup_func))) TracyCZoneCtx __tracy_zone_ctx = ___tracy_emit_zone_begin( &CAT(__tracy_source_location,__LINE__), true ); -# else -# define __prof static const struct ___tracy_source_location_data CAT(__tracy_source_location,__LINE__) = { NULL, __func__, __FILE__, (uint32_t)__LINE__, 0 }; __attribute((cleanup(__prof_zone_cleanup_func))) TracyCZoneCtx __tracy_zone_ctx = ___tracy_emit_zone_begin_callstack( &CAT(__tracy_source_location,__LINE__), TRACY_CALLSTACK, true ); -# define __profscope(name) static const struct ___tracy_source_location_data CAT(__tracy_source_location,__LINE__) = { #name, __func__, __FILE__, (uint32_t)__LINE__, 0 }; __attribute((cleanup(__prof_zone_cleanup_func))) TracyCZoneCtx __tracy_zone_ctx = ___tracy_emit_zone_begin_callstack( &CAT(__tracy_source_location,__LINE__), TRACY_CALLSTACK, true ); -# endif -# define __profscope_dx11(dx11_ctx, name, color) static const struct ___tracy_source_location_data CAT(__tracy_gpu_d3d11_source_location,__LINE__) = { #name, __func__, __FILE__, (uint32_t)__LINE__, BGR32(color) }; 
__attribute((cleanup(__prof_dx11_zone_cleanup_func))) TracyCD3D11ZoneCtx __tracy_d3d11_zone_ctx; ___tracy_d3d11_emit_zone_begin( dx11_ctx, &__tracy_d3d11_zone_ctx, &CAT(__tracy_gpu_d3d11_source_location,__LINE__), true); -# define __profscope_dx12(dx12_ctx, cmd_list, name, color) static const struct ___tracy_source_location_data CAT(__tracy_gpu_d3d12_source_location,__LINE__) = { #name, __func__, __FILE__, (uint32_t)__LINE__, BGR32(color) }; __attribute((cleanup(__prof_dx12_zone_cleanup_func))) TracyCD3D12ZoneCtx __tracy_d3d12_zone_ctx; ___tracy_d3d12_emit_zone_begin( dx12_ctx, cmd_list, &__tracy_d3d12_zone_ctx, &CAT(__tracy_gpu_d3d12_source_location,__LINE__), true); -#endif -INLINE void __prof_zone_cleanup_func(TracyCZoneCtx *ctx) { TracyCZoneEnd(*ctx); } -INLINE void __prof_dx11_zone_cleanup_func(TracyCD3D11ZoneCtx *ctx) { ___tracy_d3d11_emit_zone_end(*ctx); } -INLINE void __prof_dx12_zone_cleanup_func(TracyCD3D12ZoneCtx *ctx) { ___tracy_d3d12_emit_zone_end(*ctx); } - -#define __profalloc(ptr, size) TracyCAlloc((ptr), (size)) -#define __proffree(ptr) TracyCFree((ptr)) -#define __profmsg(txt, len, col) TracyCMessageC((txt), (len), BGR32(col)); -#define __profframe(name) TracyCFrameMarkNamed((name)) -#define __profthread(name) TracyCSetThreadName((name)) - -#define __proflock_ctx TracyCSharedLockCtx -#define __proflock_alloc(ctx) TracyCSharedLockAnnounce((ctx)) -#define __proflock_release(ctx) TracyCSharedLockTerminate((ctx)) -#define __proflock_before_exclusive_lock(ctx) TracyCSharedLockBeforeExclusiveLock((ctx)) -#define __proflock_after_exclusive_lock(ctx) TracyCSharedLockAfterExclusiveLock((ctx)) -#define __proflock_after_exclusive_unlock(ctx) TracyCSharedLockAfterExclusiveUnlock((ctx)) -#define __proflock_after_try_exclusive_lock(ctx, acquired) TracyCSharedLockAfterTryExclusiveLock((ctx), (acquired)) -#define __proflock_before_shared_lock(ctx) TracyCSharedLockBeforeSharedLock((ctx)) -#define __proflock_after_shared_lock(ctx) 
TracyCSharedLockAfterSharedLock((ctx)) -#define __proflock_after_shared_unlock(ctx) TracyCSharedLockAfterSharedUnlock((ctx)) -#define __proflock_after_try_shared_lock(ctx, acquired) TracyCSharedLockAfterTrySharedLock((ctx), (acquired)) -#define __proflock_mark(ctx) TracyCSharedLockMark((ctx)) -#define __proflock_custom_name(ctx, name, len) TracyCSharedLockCustomName((ctx), (name), (len)) - -#define __prof_dx11_ctx TracyCD3D11Ctx -#define __prof_dx11_ctx_alloc(ctx, device, device_ctx, name, name_len) ctx = ___tracy_d3d11_context_announce(device, device_ctx, name, name_len) -#define __prof_dx11_ctx_release(ctx) ___tracy_d3d11_context_terminate(ctx) -#define __prof_dx11_collect(ctx) ___tracy_d3d11_context_collect(ctx) - -#define __prof_dx12_ctx TracyCD3D12Ctx -#define __prof_dx12_ctx_alloc(ctx, device, queue, name, name_len) ctx = ___tracy_d3d12_context_announce(device, queue, name, name_len) -#define __prof_dx12_ctx_release(ctx) ___tracy_d3d12_context_terminate(ctx) -#define __prof_dx12_new_frame(ctx) ___tracy_d3d12_context_new_frame(ctx) -#define __prof_dx12_collect(ctx) ___tracy_d3d12_context_collect(ctx) - -enum __prof_plot_type { - __prof_plot_type_number = TracyPlotFormatNumber, - __prof_plot_type_memory = TracyPlotFormatMemory, - __prof_plot_type_percentage = TracyPlotFormatPercentage, - __prof_plot_type_watt = TracyPlotFormatWatt -}; -#define __prof_plot_init(name, type, step, fill, color) TracyCPlotConfig(name, type, step, fill, BGR32(color)) -#define __prof_plot(name, val) TracyCPlot(name, val) -#define __prof_plot_i(name, val) TracyCPlotI(name, val) - -#if PROFILING_CAPTURE_FRAME_IMAGE -# define __profframeimage(image, width, height, offset, flipped) TracyCFrameImage((image), (width), (height), (offset), (flipped)); -#else -# define __profframeimage(image, width, height, offset, flipped) -#endif /* PROFILING_CAPTURE_FRAME_IMAGE */ - -#else - -#define PROFILING_CAPTURE_FRAME_IMAGE 0 - -#define __prof -#define __profscope(name) -#define 
__profscope_dx11(dx11_ctx, name, color) -#define __profscope_dx12(dx11_ctx, queue, name, color) -#define __profalloc(ptr, size) -#define __proffree(ptr) -#define __profmsg(txt, len, col) -#define __profframe(name) -#define __profthread(name) -#define __profframeimage(image, width, height, offset, flipped) -#define __proflock_ctx -#define __proflock_alloc(ctx) -#define __proflock_release(ctx) -#define __proflock_before_exclusive_lock(ctx) -#define __proflock_after_exclusive_lock(ctx) -#define __proflock_after_exclusive_unlock(ctx) -#define __proflock_after_try_exclusive_lock(ctx, acquired) -#define __proflock_before_shared_lock(ctx) -#define __proflock_after_shared_lock(ctx) -#define __proflock_after_shared_unlock(ctx) -#define __proflock_after_try_shared_lock(ctx, acquired) -#define __proflock_mark(ctx) -#define __proflock_custom_name(ctx, name, len) -#define __prof_dx11_ctx -#define __prof_dx11_ctx_alloc(ctx, device, device_ctx, name, name_len) -#define __prof_dx11_ctx_release(ctx) -#define __prof_dx11_collect(ctx) -#define __prof_dx12_ctx -#define __prof_dx12_ctx_alloc(ctx, device, queue, name, name_len) -#define __prof_dx12_ctx_release(ctx) -#define __prof_dx12_new_frame(ctx) -#define __prof_dx12_collect(ctx) -#define __prof_plot_init(name, type, step, fill, color) -#define __prof_plot(name, val) -#define __prof_plot_i(name, val) - -#endif /* PROFILING */ - /* ========================== * * Configurable constants * ========================== */ #include "config.h" +/* ========================== * + * Profiling + * ========================== */ + +#include "prof_tracy.h" + #ifdef __cplusplus } #endif diff --git a/src/font.c b/src/font.c index 6ed42bb8..3978cbd8 100644 --- a/src/font.c +++ b/src/font.c @@ -1,7 +1,7 @@ #include "font.h" #include "arena.h" #include "ttf.h" -#include "work.h" +#include "job.h" #include "scratch.h" #include "asset_cache.h" #include "resource.h" @@ -41,13 +41,11 @@ GLOBAL struct { * Startup * ========================== */ -struct 
font_startup_receipt font_startup(struct work_startup_receipt *work_sr, - struct gp_startup_receipt *gp_sr, +struct font_startup_receipt font_startup(struct gp_startup_receipt *gp_sr, struct asset_cache_startup_receipt *asset_cache_sr, struct ttf_startup_receipt *ttf_sr, struct resource_startup_receipt *resource_sr) { - (UNUSED)work_sr; (UNUSED)gp_sr; (UNUSED)asset_cache_sr; (UNUSED)ttf_sr; @@ -91,12 +89,12 @@ INTERNAL void font_task_params_release(struct font_task_params *p) * Load * ========================== */ -INTERNAL WORK_TASK_FUNC_DEF(font_load_asset_task, vparams) +INTERNAL JOB_DEF(font_load_asset_job, job) { __prof; struct arena_temp scratch = scratch_begin_no_conflict(); - struct font_task_params *params = (struct font_task_params *)vparams; + struct font_task_params *params = job.sig; struct string path = STRING(params->path_len, (u8 *)params->path_cstr); f32 point_size = params->point_size; struct asset *asset = params->asset; @@ -163,7 +161,7 @@ INTERNAL WORK_TASK_FUNC_DEF(font_load_asset_task, vparams) } /* Returns the asset from the asset cache */ -struct asset *font_load_asset(struct string path, f32 point_size, b32 help) +struct asset *font_load_asset(struct string path, f32 point_size, b32 wait) { __prof; struct arena_temp scratch = scratch_begin_no_conflict(); @@ -192,13 +190,13 @@ struct asset *font_load_asset(struct string path, f32 point_size, b32 help) /* Push task */ asset_cache_mark_loading(asset); - struct work_handle wh = ZI; - if (help) { - wh = work_push_task_and_help(&font_load_asset_task, params, WORK_PRIORITY_NORMAL); + if (wait) { + job_dispatch_wait(1, font_load_asset_job, params); + asset_cache_set_job(asset, NULL); } else { - wh = work_push_task(&font_load_asset_task, params, WORK_PRIORITY_NORMAL); + struct job_handle job = job_dispatch_async(1, font_load_asset_job, params); + asset_cache_set_job(asset, &job); } - asset_cache_set_work(asset, &wh); } scratch_end(scratch); diff --git a/src/font.h b/src/font.h index 
193fb1c5..cef1c00d 100644 --- a/src/font.h +++ b/src/font.h @@ -5,7 +5,6 @@ #include "gp.h" struct asset; -struct work_startup_receipt; struct gp_startup_receipt; struct asset_cache_startup_receipt; struct ttf_startup_receipt; @@ -31,13 +30,12 @@ struct font { }; struct font_startup_receipt { i32 _; }; -struct font_startup_receipt font_startup(struct work_startup_receipt *work_sr, - struct gp_startup_receipt *gp_sr, +struct font_startup_receipt font_startup(struct gp_startup_receipt *gp_sr, struct asset_cache_startup_receipt *asset_cache_sr, struct ttf_startup_receipt *ttf_sr, struct resource_startup_receipt *resource_sr); -struct asset *font_load_asset(struct string path, f32 point_size, b32 help); +struct asset *font_load_asset(struct string path, f32 point_size, b32 wait); struct font *font_load_async(struct string path, f32 point_size); struct font *font_load(struct string path, f32 point_size); diff --git a/src/gp.h b/src/gp.h index e4dab4f3..97821baa 100644 --- a/src/gp.h +++ b/src/gp.h @@ -2,14 +2,13 @@ #define GP_H struct sys_window; -struct work_startup_receipt; /* ========================== * * Startup * ========================== */ struct gp_startup_receipt { i32 _; }; -struct gp_startup_receipt gp_startup(struct work_startup_receipt *work_sr); +struct gp_startup_receipt gp_startup(void); /* ========================== * * Resource diff --git a/src/gp_dx11.c b/src/gp_dx11.c index 07522aaa..b10760fd 100644 --- a/src/gp_dx11.c +++ b/src/gp_dx11.c @@ -223,8 +223,8 @@ struct dx11_shader_desc { GLOBAL struct { struct arena *arena; -#if PROFILING - struct __prof_dx11_ctx *profiling_ctx; +#if PROFILING_D3D + __prof_dx11_ctx(profiling_ctx); #endif ID3D11Device *dev; @@ -2007,7 +2007,7 @@ void gp_present(struct sys_window *window, struct v2i32 backbuffer_resolution, s * ========================== */ /* FIXME: enable this */ -#if PROFILING && PROFILING_CAPTURE_FRAME_IMAGE +#if PROFILING_CAPTURE_FRAME_IMAGE #define CAP_WIDTH 320 #define CAP_HEIGHT 180 diff --git 
a/src/gp_dx12.c b/src/gp_dx12.c index 65422fcd..16aa3120 100644 --- a/src/gp_dx12.c +++ b/src/gp_dx12.c @@ -7,7 +7,7 @@ #include "string.h" #include "scratch.h" #include "app.h" -#include "work.h" +#include "job.h" #include "log.h" #include "resource.h" #include "atomic.h" @@ -36,7 +36,7 @@ #pragma comment(lib, "dxguid") #pragma comment(lib, "d3dcompiler") -#if PROFILING +#if PROFILING_D3D /* For RegOpenKeyEx */ # include <windows.h> # pragma comment(lib, "advapi32") @@ -61,13 +61,13 @@ #if DX12_MULTI_QUEUE # define DX12_QUEUE_DIRECT 0 # define DX12_QUEUE_COMPUTE 1 -# define DX12_QUEUE_COPY_CRITICAL 2 +# define DX12_QUEUE_COPY 2 # define DX12_QUEUE_COPY_BACKGROUND 3 # define DX12_NUM_QUEUES 4 #else # define DX12_QUEUE_DIRECT 0 # define DX12_QUEUE_COMPUTE 0 -# define DX12_QUEUE_COPY_CRITICAL 0 +# define DX12_QUEUE_COPY 0 # define DX12_QUEUE_COPY_BACKGROUND 0 # define DX12_NUM_QUEUES 1 #endif @@ -142,8 +142,8 @@ struct command_queue { struct command_list_pool *cl_pool; -#if PROFILING - struct __prof_dx12_ctx *prof; +#if PROFILING_D3D + __prof_dx12_ctx(prof); #endif }; @@ -344,17 +344,16 @@ INTERNAL void dx12_init_pipelines(void); INTERNAL struct cpu_descriptor_heap *cpu_descriptor_heap_alloc(enum D3D12_DESCRIPTOR_HEAP_TYPE type); INTERNAL struct command_queue *command_queue_alloc(enum D3D12_COMMAND_LIST_TYPE type, enum D3D12_COMMAND_QUEUE_PRIORITY priority, struct string dbg_name); INTERNAL void command_queue_release(struct command_queue *cq); -INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(evictor_thread_entry_point, arg); +INTERNAL SYS_THREAD_DEF(evictor_thread_entry_point, arg); INTERNAL void fenced_release(void *data, enum fenced_release_kind kind); #if RESOURCE_RELOADING INTERNAL RESOURCE_WATCH_CALLBACK_FUNC_DEF(pipeline_resource_watch_callback, name); #endif -struct gp_startup_receipt gp_startup(struct work_startup_receipt *work_sr) +struct gp_startup_receipt gp_startup(void) { __prof; - (UNUSED)work_sr; /* Initialize command descriptor heaps pool */ 
G.command_descriptor_heaps_mutex = sys_mutex_alloc(); @@ -535,7 +534,7 @@ INTERNAL void dx12_init_device(void) } #endif -#if PROFILING +#if PROFILING_D3D /* Enable stable power state */ { b32 success = true; @@ -601,8 +600,8 @@ INTERNAL void dx12_init_objects(void) G.command_queues[i] = command_queue_alloc(D3D12_COMMAND_LIST_TYPE_DIRECT, D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, LIT("Direct queue")); } else if (i == DX12_QUEUE_COMPUTE) { G.command_queues[i] = command_queue_alloc(D3D12_COMMAND_LIST_TYPE_COMPUTE, D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, LIT("Compute queue")); - } else if (i == DX12_QUEUE_COPY_CRITICAL) { - G.command_queues[i] = command_queue_alloc(D3D12_COMMAND_LIST_TYPE_COPY, D3D12_COMMAND_QUEUE_PRIORITY_HIGH, LIT("High priority copy queue")); + } else if (i == DX12_QUEUE_COPY) { + G.command_queues[i] = command_queue_alloc(D3D12_COMMAND_LIST_TYPE_COPY, D3D12_COMMAND_QUEUE_PRIORITY_HIGH, LIT("Copy queue")); } else if (i == DX12_QUEUE_COPY_BACKGROUND) { G.command_queues[i] = command_queue_alloc(D3D12_COMMAND_LIST_TYPE_COPY, D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, LIT("Background copy queue")); } @@ -613,7 +612,7 @@ INTERNAL void dx12_init_objects(void) * Dx12 pipeline initialization * ========================== */ -INTERNAL void pipeline_alloc_from_desc(u64 num_pipelines, struct pipeline_desc *descs, struct pipeline **pipelines_out); +INTERNAL void pipeline_alloc(u64 num_pipelines, struct pipeline_desc *descs_in, struct pipeline **pipelines_out); INTERNAL void pipeline_register(u64 num_pipelines, struct pipeline **pipelines); INTERNAL void dx12_init_pipelines(void) @@ -666,7 +665,7 @@ INTERNAL void dx12_init_pipelines(void) ++num_pipelines; } struct pipeline **pipelines = arena_push_array(scratch.arena, struct pipeline *, num_pipelines); - pipeline_alloc_from_desc(num_pipelines, descs, pipelines); + pipeline_alloc(num_pipelines, descs, pipelines); for (u32 i = 0; i < num_pipelines; ++i) { struct pipeline *pipeline = pipelines[i]; if (!pipeline->success) { @@ 
-763,14 +762,14 @@ INTERNAL void dx12_include_handler_release(struct dx12_include_handler *handler) sys_mutex_release(handler->pipeline_mutex); } -enum shader_compile_task_kind { +enum shader_compile_job_kind { SHADER_COMPILE_TASK_KIND_VS, SHADER_COMPILE_TASK_KIND_PS }; -struct shader_compile_task_arg { +struct shader_compile_job_param { /* In */ - enum shader_compile_task_kind kind; + enum shader_compile_job_kind kind; struct pipeline *pipeline; struct shader_desc shader_desc; struct resource *shader_res; @@ -782,15 +781,20 @@ struct shader_compile_task_arg { i64 elapsed; }; -/* TODO: Compile shaders offline w/ dxc for performance & language features like static_assert */ -INTERNAL WORK_TASK_FUNC_DEF(shader_compile_task, comp_arg_raw) +struct shader_compile_job_sig { + struct shader_compile_job_param **params; +}; + +/* TODO: Compile shaders offline w/ dxc for performance & language features */ +INTERNAL JOB_DEF(shader_compile_job, job) { __prof; - struct shader_compile_task_arg *comp_arg = (struct shader_compile_task_arg *)comp_arg_raw; - enum shader_compile_task_kind kind = comp_arg->kind; - struct pipeline *pipeline = comp_arg->pipeline; - struct shader_desc shader_desc = comp_arg->shader_desc; - struct resource *shader_res = comp_arg->shader_res; + struct shader_compile_job_sig *sig = job.sig; + struct shader_compile_job_param *param = sig->params[job.id]; + enum shader_compile_job_kind kind = param->kind; + struct pipeline *pipeline = param->pipeline; + struct shader_desc shader_desc = param->shader_desc; + struct resource *shader_res = param->shader_res; struct arena_temp scratch = scratch_begin_no_conflict(); { @@ -846,10 +850,10 @@ INTERNAL WORK_TASK_FUNC_DEF(shader_compile_task, comp_arg_raw) } #endif - comp_arg->success = success; - comp_arg->blob = blob; - comp_arg->error_blob = error_blob; - comp_arg->elapsed = sys_time_ns() - start_ns; + param->success = success; + param->blob = blob; + param->error_blob = error_blob; + param->elapsed = sys_time_ns() 
- start_ns; } scratch_end(scratch); } @@ -858,11 +862,29 @@ INTERNAL WORK_TASK_FUNC_DEF(shader_compile_task, comp_arg_raw) * Pipeline * ========================== */ -INTERNAL WORK_TASK_FUNC_DEF(pipeline_load_task, load_arg_raw) +struct pipeline_init_job_sig { + struct pipeline_desc *descs_in; + struct pipeline **pipelines_out; +}; + +INTERNAL JOB_DEF(pipeline_init_job, job) { __prof; - struct pipeline *pipeline = (struct pipeline *)load_arg_raw; - struct pipeline_desc *desc = &pipeline->desc; + struct pipeline_init_job_sig *sig = job.sig; + struct pipeline_desc *desc = &sig->descs_in[job.id]; + struct pipeline **pipelines_out = sig->pipelines_out; + + struct pipeline *pipeline = NULL; + { + struct arena *pipeline_arena = arena_alloc(MEGABYTE(64)); + pipeline = arena_push(pipeline_arena, struct pipeline); + pipeline->arena = pipeline_arena; + pipelines_out[job.id] = pipeline; + } + pipeline->desc = *desc; + pipeline->name = string_copy(pipeline->arena, desc->name); + pipeline->hash = hash_fnv64(HASH_FNV64_BASIS, pipeline->name); + pipeline->dependencies = dict_init(pipeline->arena, 64); struct arena_temp scratch = scratch_begin_no_conflict(); { @@ -893,13 +915,13 @@ INTERNAL WORK_TASK_FUNC_DEF(pipeline_load_task, load_arg_raw) } } - struct shader_compile_task_arg vs = ZI; + struct shader_compile_job_param vs = ZI; vs.kind = SHADER_COMPILE_TASK_KIND_VS; vs.pipeline = pipeline; vs.shader_desc = desc->vs; vs.shader_res = &vs_res; - struct shader_compile_task_arg ps = ZI; + struct shader_compile_job_param ps = ZI; ps.kind = SHADER_COMPILE_TASK_KIND_PS; ps.pipeline = pipeline; ps.shader_desc = desc->ps; @@ -907,11 +929,9 @@ INTERNAL WORK_TASK_FUNC_DEF(pipeline_load_task, load_arg_raw) /* Compile shaders */ if (success) { - struct work_slate ws = work_slate_begin(); - work_slate_push_task(&ws, shader_compile_task, &vs); - work_slate_push_task(&ws, shader_compile_task, &ps); - struct work_handle work = work_slate_end_and_help(&ws, WORK_PRIORITY_HIGH); - work_wait(work); + 
struct shader_compile_job_param *params[] = { &vs, &ps }; + struct shader_compile_job_sig comp_sig = { .params = params }; + job_dispatch_wait(ARRAY_COUNT(params), shader_compile_job, &comp_sig); success = vs.success && ps.success; } @@ -1099,27 +1119,11 @@ INTERNAL WORK_TASK_FUNC_DEF(pipeline_load_task, load_arg_raw) scratch_end(scratch); } -INTERNAL void pipeline_alloc_from_desc(u64 num_pipelines, struct pipeline_desc *descs, struct pipeline **pipelines_out) +INTERNAL void pipeline_alloc(u64 num_pipelines, struct pipeline_desc *descs_in, struct pipeline **pipelines_out) { __prof; - struct work_slate ws = work_slate_begin(); - for (u64 i = 0; i < num_pipelines; ++i) { - struct pipeline_desc *desc = &descs[i]; - struct pipeline *pipeline = NULL; - { - struct arena *pipeline_arena = arena_alloc(MEGABYTE(64)); - pipeline = arena_push(pipeline_arena, struct pipeline); - pipeline->arena = pipeline_arena; - pipelines_out[i] = pipeline; - } - pipeline->desc = *desc; - pipeline->name = string_copy(pipeline->arena, desc->name); - pipeline->hash = hash_fnv64(HASH_FNV64_BASIS, pipeline->name); - pipeline->dependencies = dict_init(pipeline->arena, 64); - work_slate_push_task(&ws, pipeline_load_task, pipeline); - } - struct work_handle work = work_slate_end_and_help(&ws, WORK_PRIORITY_HIGH); - work_wait(work); + struct pipeline_init_job_sig sig = { .descs_in = descs_in, .pipelines_out = pipelines_out }; + job_dispatch_wait(num_pipelines, pipeline_init_job, &sig); } INTERNAL void pipeline_release_now(struct pipeline *pipeline) @@ -1262,7 +1266,7 @@ INTERNAL RESOURCE_WATCH_CALLBACK_FUNC_DEF(pipeline_resource_watch_callback, name /* Recompile dirty pipelines */ if (num_pipelines > 0) { struct pipeline **pipelines = arena_push_array(scratch.arena, struct pipeline *, num_pipelines); - pipeline_alloc_from_desc(num_pipelines, pipeline_descs, pipelines); + pipeline_alloc(num_pipelines, pipeline_descs, pipelines); { struct sys_lock lock = sys_mutex_lock_s(G.pipelines_mutex); for (u32 i 
= 0; i < num_pipelines; ++i) { @@ -1539,7 +1543,7 @@ INTERNAL void fenced_release(void *data, enum fenced_release_kind kind) u64 fr_targets[ARRAY_COUNT(G.fenced_release_targets)] = ZI; - /* Read fence values */ + /* Read current fence target values from command queues */ for (u32 i = 0; i < ARRAY_COUNT(G.command_queues); ++i) { struct command_queue *cq = G.command_queues[i]; struct sys_lock lock = sys_mutex_lock_s(cq->submit_fence_mutex); @@ -2803,7 +2807,7 @@ void gp_present(struct sys_window *window, struct v2i32 backresolution, struct g __profframe(0); } -#if PROFILING +#if PROFILING_D3D { __profscope(Mark queue frames); /* Lock because frame marks shouldn't occur while command lists are recording */ @@ -2828,7 +2832,7 @@ void gp_present(struct sys_window *window, struct v2i32 backresolution, struct g * Evictor thread * ========================== */ -INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(evictor_thread_entry_point, arg) +INTERNAL SYS_THREAD_DEF(evictor_thread_entry_point, arg) { __prof; (UNUSED)arg; diff --git a/src/host.c b/src/host.c index 9672bdd2..2aac9403 100644 --- a/src/host.c +++ b/src/host.c @@ -157,7 +157,7 @@ GLOBAL struct { i32 _; } G = ZI, DEBUG_ALIAS(G, G_host); -INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(host_receiver_thread_entry_point, arg); +INTERNAL SYS_THREAD_DEF(host_receiver_thread_entry_point, arg); INTERNAL void host_msg_assembler_release(struct host_msg_assembler *ma); /* ========================== * @@ -1061,7 +1061,7 @@ void host_update_end(struct host *host) * Receive thread * ========================== */ -INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(host_receiver_thread_entry_point, arg) +INTERNAL SYS_THREAD_DEF(host_receiver_thread_entry_point, arg) { u64 read_buff_size = KILOBYTE(64); struct arena *read_buff_arena = arena_alloc(read_buff_size); diff --git a/src/job.c b/src/job.c new file mode 100644 index 00000000..35336e2a --- /dev/null +++ b/src/job.c @@ -0,0 +1,296 @@ +#include "job.h" +#include "sys.h" +#include "arena.h" 
+#include "atomic.h" +#include "string.h" +#include "scratch.h" +#include "app.h" + +#if 0 +/* FIXME: Remove this (replace with sys_ wrappers) */ +#include +#endif + +struct worker_job { + struct sys_mutex *mutex; + i32 num_workers; + i32 num_dispatched; + + i32 count; + job_func *func; + void *sig; + + u64 gen; + struct sys_condition_variable *gen_cv; + + + struct worker_job *prev; + struct worker_job *next; + + struct worker_job *next_free; +}; + +/* ========================== * + * Global state + * ========================== */ + +struct worker_info { + i32 id; +}; + +GLOBAL struct { + struct sys_mutex *free_jobs_mutex; + struct arena *free_jobs_arena; + struct worker_job *first_free_job; + + struct sys_mutex *queued_jobs_mutex; + struct worker_job *first_queued_job; + struct worker_job *last_queued_job; + u64 num_queued_jobs; + + + u32 num_worker_threads; + b32 workers_shutdown; + struct sys_mutex *workers_wake_mutex; + struct sys_condition_variable *workers_wake_cv; + struct sys_thread *worker_threads[JOB_MAX_WORKERS]; +} G = ZI, DEBUG_ALIAS(G, G_job); + +/* ========================== * + * Startup + * ========================== */ + +INTERNAL SYS_THREAD_DEF(worker_thread_entry_point, thread_arg); +INTERNAL APP_EXIT_CALLBACK_FUNC_DEF(job_shutdown); + +void job_startup(i32 num_workers) +{ + __prof; + struct arena_temp scratch = scratch_begin_no_conflict(); + + G.free_jobs_mutex = sys_mutex_alloc(); + G.free_jobs_arena = arena_alloc(GIGABYTE(64)); + + G.queued_jobs_mutex = sys_mutex_alloc(); + + G.workers_wake_mutex = sys_mutex_alloc(); + G.workers_wake_cv = sys_condition_variable_alloc(); + + if (num_workers < JOB_MIN_WORKERS || num_workers > JOB_MAX_WORKERS) { + /* Invalid worker count */ + ASSERT(false); + } + G.num_worker_threads = num_workers; + for (u64 i = 0; i < G.num_worker_threads; ++i) { + u32 prefix = num_workers - i; /* For profiler sorting order */ + struct string name = string_format(scratch.arena, LIT("[P6%F] Worker #%F"), FMT_UINT(prefix), 
FMT_UINT(i)); + G.worker_threads[i] = sys_thread_alloc(worker_thread_entry_point, &i, name); + } + + app_register_exit_callback(job_shutdown); + + scratch_end(scratch); +} + +INTERNAL APP_EXIT_CALLBACK_FUNC_DEF(job_shutdown) +{ + __prof; + { + struct sys_lock lock = sys_mutex_lock_e(G.workers_wake_mutex); + G.workers_shutdown = true; + sys_condition_variable_signal(G.workers_wake_cv, U32_MAX); + sys_mutex_unlock(&lock); + } + for (u32 i = 0; i < G.num_worker_threads; ++i) { + struct sys_thread *thread = G.worker_threads[i]; + sys_thread_wait_release(thread); + } +} + +/* ========================== * + * Job + * ========================== */ + +struct job_handle job_dispatch_async(u32 count, job_func *job_func, void *sig) +{ + __prof; + + /* Allocate job */ + u64 gen = 0; + struct worker_job *job = NULL; + { + struct sys_mutex *old_mutex = NULL; + struct sys_condition_variable *old_cv = NULL; + { + struct sys_lock lock = sys_mutex_lock_e(G.free_jobs_mutex); + if (G.first_free_job) { + job = G.first_free_job; + G.first_free_job = job->next_free; + old_mutex = job->mutex; + old_cv = job->gen_cv; + gen = job->gen + 1; + } else { + job = arena_push_no_zero(G.free_jobs_arena, struct worker_job); + gen = 1; + } + sys_mutex_unlock(&lock); + } + MEMZERO_STRUCT(job); + if (old_mutex) { + job->mutex = old_mutex; + job->gen_cv = old_cv; + } else { + job->mutex = sys_mutex_alloc(); + job->gen_cv = sys_condition_variable_alloc(); + } + } + job->count = count; + job->func = job_func; + job->sig = sig; + job->gen = gen; + + /* Queue job */ + { + struct sys_lock lock = sys_mutex_lock_e(G.queued_jobs_mutex); + if (G.last_queued_job) { + G.last_queued_job->next = job; + } else { + G.first_queued_job = job; + } + G.last_queued_job = job; + sys_mutex_unlock(&lock); + } + + /* Signal workers */ + { + struct sys_lock lock = sys_mutex_lock_e(G.workers_wake_mutex); + sys_condition_variable_signal(G.workers_wake_cv, count); + sys_mutex_unlock(&lock); + } + + struct job_handle handle = ZI; + 
handle.job = job; + handle.gen = gen; + return handle; +} + +void job_dispatch_wait(u32 count, job_func *job_func, void *sig) +{ + __prof; + struct job_handle handle = job_dispatch_async(count, job_func, sig); + job_wait(handle); +} + +void job_wait(struct job_handle handle) +{ + __prof; + if (handle.job) { + struct worker_job *job = handle.job; + while (job->gen == handle.gen) { + struct sys_lock lock = sys_mutex_lock_s(job->mutex); + sys_condition_variable_wait(job->gen_cv, &lock); + sys_mutex_unlock(&lock); + } + } +} + +/* ========================== * + * Worker + * ========================== */ + +INTERNAL SYS_THREAD_DEF(worker_thread_entry_point, thread_arg) +{ + i32 worker_id = *(i32 *)thread_arg; + (UNUSED)worker_id; + + struct sys_lock workers_wake_lock = sys_mutex_lock_s(G.workers_wake_mutex); + while (!G.workers_shutdown) { + sys_mutex_unlock(&workers_wake_lock); + + /* Try to pick job from queue */ + i32 job_id = 0; + i32 job_count = 0; + struct worker_job *job = NULL; + { + struct sys_lock queue_lock = sys_mutex_lock_s(G.queued_jobs_mutex); + for (struct worker_job *tmp = G.first_queued_job; tmp && !job; tmp = tmp->next) { + struct sys_lock job_lock = sys_mutex_lock_e(tmp->mutex); + { + i32 tmp_id = tmp->num_dispatched++; + i32 tmp_count = tmp->count; + if (tmp_id < tmp_count) { + /* Pick job */ + ++tmp->num_workers; + job = tmp; + job_id = tmp_id; + job_count = tmp_count; + } + } + sys_mutex_unlock(&job_lock); + } + sys_mutex_unlock(&queue_lock); + } + + /* Remove job from queue */ + if (job_id == (job_count - 1)) { + struct sys_lock queue_lock = sys_mutex_lock_e(G.queued_jobs_mutex); + { + struct worker_job *prev = job->prev; + struct worker_job *next = job->next; + if (prev) { + prev->next = next; + } else { + G.first_queued_job = next; + } + if (next) { + next->prev = prev; + } else { + G.last_queued_job = prev; + } + --G.num_queued_jobs; + } + sys_mutex_unlock(&queue_lock); + } + + /* Execute job */ + if (job) { + struct job_data data = ZI; + 
data.sig = job->sig; + job_func *func = job->func; + b32 should_release = false; + while (job_id < job_count) { + { + data.id = job_id; + func(data); + } + { + struct sys_lock job_lock = sys_mutex_lock_e(job->mutex); + job_id = job->num_dispatched++; + if (job_id >= job_count) { + i32 num_workers = --job->num_workers; + if (num_workers == 0) { + ++job->gen; + should_release = true; + sys_condition_variable_signal(job->gen_cv, U32_MAX); + } + } + sys_mutex_unlock(&job_lock); + } + } + if (should_release) { + struct sys_lock fj_lock = sys_mutex_lock_e(G.free_jobs_mutex); + { + job->next_free = G.first_free_job; + G.first_free_job = job; + } + sys_mutex_unlock(&fj_lock); + } + } + + workers_wake_lock = sys_mutex_lock_s(G.workers_wake_mutex); + if (!G.workers_shutdown && !G.first_queued_job) { + __profscope(Worker sleep); + sys_condition_variable_wait(G.workers_wake_cv, &workers_wake_lock); + } + } +} diff --git a/src/job.h b/src/job.h new file mode 100644 index 00000000..4a8d0a7e --- /dev/null +++ b/src/job.h @@ -0,0 +1,34 @@ +#ifndef JOB_H +#define JOB_H + +#define JOB_MIN_WORKERS 2 +#define JOB_MAX_WORKERS 64 + +/* ========================== * + * Startup + * ========================== */ + +void job_startup(i32 num_workers); + +/* ========================== * + * Job + * ========================== */ + +struct job_data { + i32 id; + void *sig; +}; + +struct job_handle { + void *job; + u64 gen; +}; + +#define JOB_DEF(job_name, arg_name) void job_name(struct job_data arg_name) +typedef JOB_DEF(job_func, job_data); + +struct job_handle job_dispatch_async(u32 count, job_func *job_func, void *sig); +void job_dispatch_wait(u32 count, job_func *job_func, void *sig); +void job_wait(struct job_handle handle); + +#endif diff --git a/src/playback_wasapi.c b/src/playback_wasapi.c index 48e49d0f..44d07010 100644 --- a/src/playback_wasapi.c +++ b/src/playback_wasapi.c @@ -44,7 +44,6 @@ GLOBAL struct { IAudioRenderClient *playback; WAVEFORMATEX *buffer_format; u32 buffer_frames; 
- HANDLE mmtc_handle; } G = ZI, DEBUG_ALIAS(G, G_playback_wasapi); /* ========================== * @@ -53,7 +52,7 @@ GLOBAL struct { INTERNAL void wasapi_initialize(void); INTERNAL APP_EXIT_CALLBACK_FUNC_DEF(playback_shutdown); -INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(playback_thread_entry_point, arg); +INTERNAL SYS_THREAD_DEF(playback_thread_entry_point, arg); struct playback_startup_receipt playback_startup(struct mixer_startup_receipt *mixer_sr) { @@ -79,10 +78,6 @@ INTERNAL APP_EXIT_CALLBACK_FUNC_DEF(playback_shutdown) INTERNAL void wasapi_initialize(void) { - /* https://learn.microsoft.com/en-us/windows/win32/procthread/multimedia-class-scheduler-service#registry-settings */ - DWORD task = 0; - G.mmtc_handle = AvSetMmThreadCharacteristicsW(L"Pro Audio", &task); - u64 sample_rate = PLAYBACK_SAMPLE_RATE; u64 channel_count = 2; u32 channel_mask = SPEAKER_FRONT_LEFT | SPEAKER_FRONT_RIGHT; @@ -233,11 +228,15 @@ INTERNAL void wasapi_update_end(struct wasapi_buffer *wspbuf, struct mixed_pcm_f * Playback thread entry * ========================== */ -INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(playback_thread_entry_point, arg) +INTERNAL SYS_THREAD_DEF(playback_thread_entry_point, arg) { struct arena_temp scratch = scratch_begin_no_conflict(); (UNUSED)arg; + /* https://learn.microsoft.com/en-us/windows/win32/procthread/multimedia-class-scheduler-service#registry-settings */ + DWORD task = 0; + HANDLE mmc_handle = AvSetMmThreadCharacteristicsW(L"Pro Audio", &task); + ASSERT(mmc_handle); /* FIXME: If playback fails at any point and mixer stops advancing, we * need to halt mixer to prevent memory leak when sounds are played. 
*/ diff --git a/src/prof_tracy.h b/src/prof_tracy.h new file mode 100644 index 00000000..45e7d31f --- /dev/null +++ b/src/prof_tracy.h @@ -0,0 +1,139 @@ +#ifndef PROF_H +#define PROF_H + +#if COMPILER_MSVC +# error "MSVC not supported for profiling (cleanup attributes are required for profiling markup)" +#endif + +#if PROFILING + +/* Include tracy client */ +#define TRACY_ENABLE +#define TRACY_MANUAL_LIFETIME +#define TRACY_DELAYED_INIT +#if 1 +/* Disable system tracing (very slow) */ +# define TRACY_NO_CALLSTACK +# define TRACY_NO_SYSTEM_TRACING +#endif +#include STRINGIZE(TRACY_INCLUDE_PATH) + +#define PROFILING_CAPTURE_FRAME_IMAGE 0 +#define PROFILING_LOCKS 0 +#define PROFILING_D3D 1 +#define PROFILING_CMD_WSTR L"tracy-profiler.exe -a 127.0.0.1" + +/* Clang/GCC cleanup macros */ +#define __prof static const struct ___tracy_source_location_data CAT(__tracy_source_location,__LINE__) = { NULL, __func__, __FILE__, (uint32_t)__LINE__, 0 }; __attribute((cleanup(__prof_zone_cleanup_func))) TracyCZoneCtx __tracy_zone_ctx = ___tracy_emit_zone_begin( &CAT(__tracy_source_location,__LINE__), true ); +#define __profscope(name) static const struct ___tracy_source_location_data CAT(__tracy_source_location,__LINE__) = { #name, __func__, __FILE__, (uint32_t)__LINE__, 0 }; __attribute((cleanup(__prof_zone_cleanup_func))) TracyCZoneCtx __tracy_zone_ctx = ___tracy_emit_zone_begin( &CAT(__tracy_source_location,__LINE__), true ); +INLINE void __prof_zone_cleanup_func(TracyCZoneCtx *ctx) { TracyCZoneEnd(*ctx); } + +#define __profalloc(ptr, size) TracyCAlloc((ptr), (size)) +#define __proffree(ptr) TracyCFree((ptr)) +#define __profmsg(txt, len, col) TracyCMessageC((txt), (len), BGR32(col)); +#define __profframe(name) TracyCFrameMarkNamed((name)) +#define __profthread(name) TracyCSetThreadName((name)) + +enum __prof_plot_type { + __prof_plot_type_number = TracyPlotFormatNumber, + __prof_plot_type_memory = TracyPlotFormatMemory, + __prof_plot_type_percentage = TracyPlotFormatPercentage, + 
__prof_plot_type_watt = TracyPlotFormatWatt +}; +#define __prof_plot_init(name, type, step, fill, color) TracyCPlotConfig(name, type, step, fill, BGR32(color)) +#define __prof_plot(name, val) TracyCPlot(name, val) +#define __prof_plot_i(name, val) TracyCPlotI(name, val) +#define __prof_is_connected() ___tracy_connected() + +#else + +#define PROFILING_CAPTURE_FRAME_IMAGE 0 +#define PROFILING_LOCKS 0 +#define PROFILING_D3D 0 + +#define __prof +#define __profscope(name) +#define __profalloc(ptr, size) +#define __proffree(ptr) +#define __profmsg(txt, len, col) +#define __profframe(name) +#define __profthread(name) +#define __prof_plot_init(name, type, step, fill, color) +#define __prof_plot(name, val) +#define __prof_plot_i(name, val) +#define __prof_is_connected() 0 + +#endif /* PROFILING */ + +#if PROFILING_LOCKS +# define __proflock_ctx(name) struct TracyCSharedLockCtx *name +# define __proflock_alloc(ctx) TracyCSharedLockAnnounce((ctx)) +# define __proflock_release(ctx) TracyCSharedLockTerminate((ctx)) +# define __proflock_before_exclusive_lock(ctx) TracyCSharedLockBeforeExclusiveLock((ctx)) +# define __proflock_after_exclusive_lock(ctx) TracyCSharedLockAfterExclusiveLock((ctx)) +# define __proflock_after_exclusive_unlock(ctx) TracyCSharedLockAfterExclusiveUnlock((ctx)) +# define __proflock_after_try_exclusive_lock(ctx, acquired) TracyCSharedLockAfterTryExclusiveLock((ctx), (acquired)) +# define __proflock_before_shared_lock(ctx) TracyCSharedLockBeforeSharedLock((ctx)) +# define __proflock_after_shared_lock(ctx) TracyCSharedLockAfterSharedLock((ctx)) +# define __proflock_after_shared_unlock(ctx) TracyCSharedLockAfterSharedUnlock((ctx)) +# define __proflock_after_try_shared_lock(ctx, acquired) TracyCSharedLockAfterTrySharedLock((ctx), (acquired)) +# define __proflock_mark(ctx) TracyCSharedLockMark((ctx)) +# define __proflock_custom_name(ctx, name, len) TracyCSharedLockCustomName((ctx), (name), (len)) +#else +# define __proflock_alloc(ctx) +# define 
__proflock_release(ctx) +# define __proflock_before_exclusive_lock(ctx) +# define __proflock_after_exclusive_lock(ctx) +# define __proflock_after_exclusive_unlock(ctx) +# define __proflock_after_try_exclusive_lock(ctx, acquired) +# define __proflock_before_shared_lock(ctx) +# define __proflock_after_shared_lock(ctx) +# define __proflock_after_shared_unlock(ctx) +# define __proflock_after_try_shared_lock(ctx, acquired) +# define __proflock_mark(ctx) +# define __proflock_custom_name(ctx, name, len) +#endif /* PROFILING && PROFILING_LOCKS */ + +#if PROFILING_D3D +/* Dx11 */ +INLINE void __prof_dx11_zone_cleanup_func(TracyCD3D11ZoneCtx *ctx) { ___tracy_d3d11_emit_zone_end(*ctx); } +# define __profscope_dx11(dx11_ctx, name, color) static const struct ___tracy_source_location_data CAT(__tracy_gpu_d3d11_source_location,__LINE__) = { #name, __func__, __FILE__, (uint32_t)__LINE__, BGR32(color) }; __attribute((cleanup(__prof_dx11_zone_cleanup_func))) TracyCD3D11ZoneCtx __tracy_d3d11_zone_ctx; ___tracy_d3d11_emit_zone_begin( dx11_ctx, &__tracy_d3d11_zone_ctx, &CAT(__tracy_gpu_d3d11_source_location,__LINE__), true); +# define __prof_dx11_ctx(name) struct TracyCD3D11Ctx *name +# define __prof_dx11_ctx_alloc(ctx, device, device_ctx, name, name_len) ctx = ___tracy_d3d11_context_announce(device, device_ctx, name, name_len) +# define __prof_dx11_ctx_release(ctx) ___tracy_d3d11_context_terminate(ctx) +# define __prof_dx11_collect(ctx) ___tracy_d3d11_context_collect(ctx) +/* Dx12 */ +INLINE void __prof_dx12_zone_cleanup_func(TracyCD3D12ZoneCtx *ctx) { ___tracy_d3d12_emit_zone_end(*ctx); } +# define __profscope_dx12(dx12_ctx, cmd_list, name, color) static const struct ___tracy_source_location_data CAT(__tracy_gpu_d3d12_source_location,__LINE__) = { #name, __func__, __FILE__, (uint32_t)__LINE__, BGR32(color) }; __attribute((cleanup(__prof_dx12_zone_cleanup_func))) TracyCD3D12ZoneCtx __tracy_d3d12_zone_ctx; ___tracy_d3d12_emit_zone_begin( dx12_ctx, cmd_list, &__tracy_d3d12_zone_ctx, 
&CAT(__tracy_gpu_d3d12_source_location,__LINE__), true); +# define __prof_dx12_ctx(name) struct TracyCD3D12Ctx *name +# define __prof_dx12_ctx_alloc(ctx, device, queue, name, name_len) ctx = ___tracy_d3d12_context_announce(device, queue, name, name_len) +# define __prof_dx12_ctx_release(ctx) ___tracy_d3d12_context_terminate(ctx) +# define __prof_dx12_new_frame(ctx) ___tracy_d3d12_context_new_frame(ctx) +# define __prof_dx12_collect(ctx) ___tracy_d3d12_context_collect(ctx) +#else +# define __profscope_dx11(dx11_ctx, name, color) +# define __prof_dx11_ctx_alloc(ctx, device, device_ctx, name, name_len) +# define __prof_dx11_ctx_release(ctx) +# define __prof_dx11_collect(ctx) +# define __profscope_dx12(dx11_ctx, queue, name, color) +# define __prof_dx12_ctx_alloc(ctx, device, queue, name, name_len) +# define __prof_dx12_ctx_release(ctx) +# define __prof_dx12_new_frame(ctx) +# define __prof_dx12_collect(ctx) +#endif /* PROFILING_D3D */ + +#if PROFILING_CAPTURE_FRAME_IMAGE +# define __profframeimage(image, width, height, offset, flipped) TracyCFrameImage((image), (width), (height), (offset), (flipped)); +#else +# define __profframeimage(image, width, height, offset, flipped) +#endif /* PROFILING_CAPTURE_FRAME_IMAGE */ + +#ifdef TRACY_MANUAL_LIFETIME +# define __prof_startup ___tracy_startup_profiler() +# define __prof_shutdown ___tracy_shutdown_profiler() +#else +# define __prof_startup +# define __prof_shutdown +#endif /* TRACY_MANUAL_LIFETIME */ + +#endif diff --git a/src/resource.c b/src/resource.c index 2e1635df..2b2970ea 100644 --- a/src/resource.c +++ b/src/resource.c @@ -45,8 +45,8 @@ GLOBAL struct { * ========================== */ #if RESOURCE_RELOADING -INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(resource_watch_monitor_thread_entry_point, _); -INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(resource_watch_dispatcher_thread_entry_point, _); +INTERNAL SYS_THREAD_DEF(resource_watch_monitor_thread_entry_point, _); +INTERNAL 
SYS_THREAD_DEF(resource_watch_dispatcher_thread_entry_point, _); INTERNAL APP_EXIT_CALLBACK_FUNC_DEF(resource_shutdown); #endif @@ -182,7 +182,7 @@ void resource_register_watch_callback(resource_watch_callback *callback) sys_mutex_unlock(&lock); } -INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(resource_watch_monitor_thread_entry_point, _) +INTERNAL SYS_THREAD_DEF(resource_watch_monitor_thread_entry_point, _) { (UNUSED)_; struct arena_temp scratch = scratch_begin_no_conflict(); @@ -218,7 +218,7 @@ INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(resource_watch_monitor_thread_entry_poi #define WATCH_DISPATCHER_DELAY_SECONDS 0.050 #define WATCH_DISPATCHER_DEDUP_DICT_BINS 128 -INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(resource_watch_dispatcher_thread_entry_point, _) +INTERNAL SYS_THREAD_DEF(resource_watch_dispatcher_thread_entry_point, _) { (UNUSED)_; struct arena_temp scratch = scratch_begin_no_conflict(); diff --git a/src/sound.c b/src/sound.c index ee8ec0f3..e98df42e 100644 --- a/src/sound.c +++ b/src/sound.c @@ -6,7 +6,7 @@ #include "resource.h" #include "asset_cache.h" #include "mp3.h" -#include "work.h" +#include "job.h" struct sound_task_params { struct sound_task_params *next_free; @@ -35,11 +35,9 @@ GLOBAL struct { * Startup * ========================== */ -struct sound_startup_receipt sound_startup(struct work_startup_receipt *work_sr, - struct asset_cache_startup_receipt *asset_cache_sr, +struct sound_startup_receipt sound_startup(struct asset_cache_startup_receipt *asset_cache_sr, struct resource_startup_receipt *resource_sr) { - (UNUSED)work_sr; (UNUSED)asset_cache_sr; (UNUSED)resource_sr; @@ -81,10 +79,10 @@ INTERNAL void sound_task_params_release(struct sound_task_params *p) * Load * ========================== */ -INTERNAL WORK_TASK_FUNC_DEF(sound_load_asset_task, vparams) +INTERNAL JOB_DEF(sound_load_asset_job, job) { __prof; - struct sound_task_params *params = (struct sound_task_params *)vparams; + struct sound_task_params *params = job.sig; struct arena_temp 
scratch = scratch_begin_no_conflict(); struct string path = STRING(params->path_len, (u8 *)params->path_cstr); struct asset *asset = params->asset; @@ -156,7 +154,7 @@ INTERNAL WORK_TASK_FUNC_DEF(sound_load_asset_task, vparams) scratch_end(scratch); } -struct asset *sound_load_asset(struct string path, u32 flags, b32 help) +struct asset *sound_load_asset(struct string path, u32 flags, b32 wait) { __prof; struct arena_temp scratch = scratch_begin_no_conflict(); @@ -185,13 +183,13 @@ struct asset *sound_load_asset(struct string path, u32 flags, b32 help) /* Push task */ asset_cache_mark_loading(asset); - struct work_handle wh = ZI; - if (help) { - wh = work_push_task_and_help(&sound_load_asset_task, params, WORK_PRIORITY_NORMAL); + if (wait) { + job_dispatch_wait(1, sound_load_asset_job, params); + asset_cache_set_job(asset, NULL); } else { - wh = work_push_task(&sound_load_asset_task, params, WORK_PRIORITY_NORMAL); + struct job_handle job = job_dispatch_async(1, sound_load_asset_job, params); + asset_cache_set_job(asset, &job); } - asset_cache_set_work(asset, &wh); } scratch_end(scratch); diff --git a/src/sound.h b/src/sound.h index aecf766e..41872b9a 100644 --- a/src/sound.h +++ b/src/sound.h @@ -5,7 +5,6 @@ #define SOUND_FLAG_STEREO 0x1 struct asset; -struct work_startup_receipt; struct asset_cache_startup_receipt; struct resource_startup_receipt; @@ -15,8 +14,7 @@ struct sound { }; struct sound_startup_receipt { i32 _; }; -struct sound_startup_receipt sound_startup(struct work_startup_receipt *work_sr, - struct asset_cache_startup_receipt *asset_cache_sr, +struct sound_startup_receipt sound_startup(struct asset_cache_startup_receipt *asset_cache_sr, struct resource_startup_receipt *resource_sr); struct asset *sound_load_asset(struct string path, u32 flags, b32 wait); diff --git a/src/sprite.c b/src/sprite.c index 732f2b46..a704f937 100644 --- a/src/sprite.c +++ b/src/sprite.c @@ -6,7 +6,7 @@ #include "resource.h" #include "ase.h" #include "util.h" -#include 
"work.h" +#include "job.h" #include "atomic.h" #include "app.h" #include "gp.h" @@ -203,8 +203,8 @@ INTERNAL struct image_rgba generate_purple_black_image(struct arena *arena, u32 * ========================== */ INTERNAL APP_EXIT_CALLBACK_FUNC_DEF(sprite_shutdown); -INTERNAL WORK_TASK_FUNC_DEF(sprite_load_task, arg); -INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(sprite_evictor_thread_entry_point, arg); +INTERNAL JOB_DEF(sprite_load_job, arg); +INTERNAL SYS_THREAD_DEF(sprite_evictor_thread_entry_point, arg); #if RESOURCE_RELOADING INTERNAL RESOURCE_WATCH_CALLBACK_FUNC_DEF(sprite_resource_watch_callback, info); @@ -312,7 +312,7 @@ INTERNAL struct cache_entry_hash cache_entry_hash_from_tag_hash(u64 tag_hash, en * ========================== */ INTERNAL struct sprite_scope_cache_ref *scope_ensure_ref_from_ref(struct sprite_scope *scope, struct cache_ref ref); -INTERNAL void push_load_task(struct cache_ref ref, struct sprite_tag tag) +INTERNAL void push_load_job(struct cache_ref ref, struct sprite_tag tag) { struct load_cmd *cmd = NULL; { @@ -338,7 +338,7 @@ INTERNAL void push_load_task(struct cache_ref ref, struct sprite_tag tag) } /* Push work */ - work_push_task(&sprite_load_task, cmd, WORK_PRIORITY_NORMAL); + job_dispatch_async(1, sprite_load_job, cmd); } INTERNAL void cache_entry_load_texture(struct cache_ref ref, struct sprite_tag tag) @@ -1013,7 +1013,7 @@ INTERNAL void *data_from_tag_internal(struct sprite_scope *scope, struct sprite_ } } else { /* Allocate cmd */ - push_load_task(ref, tag); + push_load_job(ref, tag); } } } @@ -1141,13 +1141,13 @@ struct sprite_sheet_slice_array sprite_sheet_get_slices(struct sprite_sheet *she } /* ========================== * - * Load task + * Load job * ========================== */ -INTERNAL WORK_TASK_FUNC_DEF(sprite_load_task, arg) +INTERNAL JOB_DEF(sprite_load_job, job) { __prof; - struct load_cmd *cmd = (struct load_cmd *)arg; + struct load_cmd *cmd = job.sig; struct cache_ref ref = cmd->ref; switch (ref.e->kind) { @@ -1190,7 
+1190,7 @@ INTERNAL void reload_if_exists(struct sprite_scope *scope, struct sprite_tag tag if (existing_ref) { logf_info("Sprite resource file \"%F\" has changed for sprite [%F].", FMT_STR(tag.path), FMT_HEX(hash.v)); struct sprite_scope_cache_ref *scope_ref = cache_entry_from_tag(scope, tag, kind, true); - push_load_task(scope_ref->ref, tag); + push_load_job(scope_ref->ref, tag); } } @@ -1240,7 +1240,7 @@ INTERNAL SORT_COMPARE_FUNC_DEF(evict_sort, arg_a, arg_b, udata) * - The cache is over its memory budget and the node's last reference is longer ago than the grace period * - Resource reloading is enabled and the node is out of date due to a change to its original resource file */ -INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(sprite_evictor_thread_entry_point, arg) +INTERNAL SYS_THREAD_DEF(sprite_evictor_thread_entry_point, arg) { (UNUSED)arg; diff --git a/src/sys.h b/src/sys.h index 530e44a2..11d0ae40 100644 --- a/src/sys.h +++ b/src/sys.h @@ -403,12 +403,12 @@ struct thread_local_store *sys_thread_get_thread_local_store(void); #define SYS_THREAD_STACK_SIZE MEGABYTE(4) -#define SYS_THREAD_ENTRY_POINT_FUNC_DEF(name, arg_name) void name(void *arg_name) -typedef SYS_THREAD_ENTRY_POINT_FUNC_DEF(sys_thread_entry_point_func, data); +#define SYS_THREAD_DEF(name, arg_name) void name(void *arg_name) +typedef SYS_THREAD_DEF(sys_thread_func, data); /* Creates a new thread running in the supplied `entry_point` */ struct sys_thread *sys_thread_alloc( - sys_thread_entry_point_func *entry_point, + sys_thread_func *entry_point, void *thread_data, /* Passed as arg to `entry_point` */ struct string thread_name ); @@ -467,4 +467,10 @@ void sys_sleep_precise(f64 seconds); * (less cpu intensive) */ void sys_sleep(f64 seconds); +/* ========================== * + * Command line + * ========================== */ + +b32 sys_run_command(struct string cmd); + #endif diff --git a/src/sys_win32.c b/src/sys_win32.c index e9540921..b55ea886 100644 --- a/src/sys_win32.c +++ b/src/sys_win32.c @@ 
-5,7 +5,6 @@ #include "arena.h" #include "scratch.h" #include "atomic.h" -#include "work.h" #include "log.h" #include "math.h" #include "util.h" @@ -37,9 +36,10 @@ struct win32_mutex { SRWLOCK srwlock; struct win32_mutex *next_free; -#if PROFILING - struct __proflock_ctx *profiling_ctx; +#if PROFILING_LOCKS + __proflock_ctx(profiling_ctx); #endif + #if RTC u64 owner_tid; struct atomic_i64 count; @@ -55,7 +55,7 @@ struct win32_condition_variable { }; struct win32_thread { - sys_thread_entry_point_func *entry_point; + sys_thread_func *entry_point; void *thread_data; char thread_name_cstr[256]; wchar_t thread_name_wstr[256]; @@ -944,7 +944,7 @@ INTERNAL HWND win32_create_window(struct win32_window *window) return hwnd; } -INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(window_thread_entry_point, arg) +INTERNAL SYS_THREAD_DEF(window_thread_entry_point, arg) { struct win32_window *window = (struct win32_window *)arg; @@ -1588,12 +1588,12 @@ void sys_window_cursor_disable_clip(struct sys_window *sys_window) INTERNAL void win32_mutex_init(struct win32_mutex *m) { -#if PROFILING - struct __proflock_ctx *profiling_ctx = m->profiling_ctx; +#if PROFILING_LOCKS + __proflock_ctx(profiling_ctx) = m->profiling_ctx; #endif MEMZERO_STRUCT(m); m->srwlock = (SRWLOCK)SRWLOCK_INIT; -#if PROFILING +#if PROFILING_LOCKS if (!profiling_ctx) { __proflock_alloc(profiling_ctx); } @@ -1964,7 +1964,7 @@ INTERNAL DWORD WINAPI win32_thread_proc(LPVOID vt) return 0; } -struct sys_thread *sys_thread_alloc(sys_thread_entry_point_func *entry_point, void *thread_data, struct string thread_name) +struct sys_thread *sys_thread_alloc(sys_thread_func *entry_point, void *thread_data, struct string thread_name) { __prof; struct arena_temp scratch = scratch_begin_no_conflict(); @@ -2324,11 +2324,30 @@ void sys_sleep(f64 seconds) Sleep(ms); } +/* ========================== * + * Command line + * ========================== */ + +b32 sys_run_command(struct string cmd) +{ + b32 success = false; + { + struct arena_temp 
scratch = scratch_begin_no_conflict(); + wchar_t *cmd_wstr = wstr_from_string(scratch.arena, cmd); + STARTUPINFO si = ZI; + si.cb = sizeof(si); + PROCESS_INFORMATION pi = ZI; + success = CreateProcessW(NULL, cmd_wstr, NULL, NULL, FALSE, DETACHED_PROCESS, NULL, NULL, &si, &pi); + scratch_end(scratch); + } + return success; +} + /* ========================== * * Entry point * ========================== */ -INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(win32_app_thread_entry_point, arg) +INTERNAL SYS_THREAD_DEF(win32_app_thread_entry_point, arg) { (UNUSED)arg; struct arena_temp scratch = scratch_begin_no_conflict(); @@ -2343,6 +2362,7 @@ int CALLBACK wWinMain(_In_ HINSTANCE instance, _In_opt_ HINSTANCE prev_instance, (UNUSED)prev_instance; (UNUSED)cmdline_wstr; (UNUSED)show_code; + __prof_startup; u64 cmdline_len = wstr_len(cmdline_wstr, ARRAY_COUNT(G.cmdline_args_wstr) - 1); MEMCPY(G.cmdline_args_wstr, cmdline_wstr, cmdline_len * sizeof(*cmdline_wstr)); @@ -2532,6 +2552,26 @@ int CALLBACK wWinMain(_In_ HINSTANCE instance, _In_opt_ HINSTANCE prev_instance, return 1; } +#if PROFILING + /* Launch profiler */ + if (!__prof_is_connected()) { + __profscope(Launch profiler); + STARTUPINFO si = { sizeof(si) }; + PROCESS_INFORMATION pi = ZI; + wchar_t cmd[sizeof(PROFILING_CMD_WSTR)] = ZI; + MEMCPY(cmd, PROFILING_CMD_WSTR, sizeof(PROFILING_CMD_WSTR)); + b32 success = CreateProcessW(NULL, cmd, NULL, NULL, FALSE, DETACHED_PROCESS, NULL, NULL, &si, &pi); + if (success) { + while (!__prof_is_connected()) { + ix_pause(); + } + } else { + MessageBoxExW(NULL, L"Failed to launch tracy profiler using command " PROFILING_CMD_WSTR, L"Error", MB_ICONSTOP | MB_SETFOREGROUND | MB_TOPMOST, 0); + } + } +#endif + + __prof_shutdown; return 0; } diff --git a/src/thread_local.c b/src/thread_local.c index 3c001441..19d81d65 100644 --- a/src/thread_local.c +++ b/src/thread_local.c @@ -56,7 +56,7 @@ void thread_local_store_release(struct thread_local_store *t) arena_release(t->arena); } -void 
*_thread_local_var_eval(struct thread_local_var_meta *meta) +volatile void *_thread_local_var_eval(struct thread_local_var_meta *meta) { /* Register var if unregistered */ u64 id; diff --git a/src/thread_local.h b/src/thread_local.h index 3b4bc1b6..1c5ef45e 100644 --- a/src/thread_local.h +++ b/src/thread_local.h @@ -60,6 +60,6 @@ struct thread_local_var_meta { # define thread_local_var_eval(var_ptr) (void *)(_thread_local_var_eval(&(var_ptr)->meta)) #endif -void *_thread_local_var_eval(struct thread_local_var_meta *meta); +volatile void *_thread_local_var_eval(struct thread_local_var_meta *meta); #endif diff --git a/src/ttf_dwrite.cpp b/src/ttf_dwrite.cpp index ed203016..b879c874 100644 --- a/src/ttf_dwrite.cpp +++ b/src/ttf_dwrite.cpp @@ -76,6 +76,7 @@ struct ttf_startup_receipt ttf_startup(void) struct ttf_decode_result ttf_decode(struct arena *arena, struct string encoded, f32 point_size, u32 *cache_codes, u32 cache_codes_count) { + __prof; COLORREF bg_color = RGB32(0,0,0); COLORREF fg_color = RGB32(255,255,255); @@ -173,110 +174,113 @@ struct ttf_decode_result ttf_decode(struct arena *arena, struct string encoded, u32 out_offset_x = 0; u32 out_offset_y = 0; u32 row_height = 0; - for (u16 i = 0; i < glyph_count; ++i) { - /* Render glyph to target */ - DWRITE_GLYPH_RUN glyph_run = ZI; - glyph_run.fontFace = font_face; - glyph_run.fontEmSize = pixel_per_em; - glyph_run.glyphCount = 1; - glyph_run.glyphIndices = &i; + { + __profscope(Build atlas); + for (u16 i = 0; i < glyph_count; ++i) { + /* Render glyph to target */ + DWRITE_GLYPH_RUN glyph_run = ZI; + glyph_run.fontFace = font_face; + glyph_run.fontEmSize = pixel_per_em; + glyph_run.glyphCount = 1; + glyph_run.glyphIndices = &i; - RECT bounding_box = ZI; - error = render_target->DrawGlyphRun( - raster_target_x, - raster_target_y, - DWRITE_MEASURING_MODE_NATURAL, - &glyph_run, - rendering_params, - fg_color, - &bounding_box - ); + RECT bounding_box = ZI; + error = render_target->DrawGlyphRun( + raster_target_x, 
+ raster_target_y, + DWRITE_MEASURING_MODE_NATURAL, + &glyph_run, + rendering_params, + fg_color, + &bounding_box + ); - if (bounding_box.left < 0 - || bounding_box.top < 0 - || bounding_box.right > raster_target_w - || bounding_box.bottom > raster_target_h) { - /* Skip */ - continue; - } - - /* Compute glyph metrics */ - DWRITE_GLYPH_METRICS glyph_metrics = ZI; - - - error = font_face->GetDesignGlyphMetrics(&i, 1, &glyph_metrics, false); - - f32 off_x = (f32)bounding_box.left - raster_target_x; - f32 off_y = (f32)bounding_box.top - raster_target_y; - f32 advance = (f32)glyph_metrics.advanceWidth * pixel_per_design_unit; - i32 tex_w = bounding_box.right - bounding_box.left; - i32 tex_h = bounding_box.bottom - bounding_box.top; - - struct font_glyph *glyph = &glyphs[i]; - glyph->off_x = off_x; - glyph->off_y = off_y; - glyph->advance = round_up(advance); - glyph->width = (f32)tex_w; - glyph->height = (f32)tex_h; - - /* Get the bitmap */ - HBITMAP bitmap = (HBITMAP)GetCurrentObject(dc, OBJ_BITMAP); - DIBSECTION dib = ZI; - GetObject(bitmap, sizeof(dib), &dib); - - /* Start new row if necessary */ - if ((out_offset_x + tex_w) >= atlas_w) { - out_offset_y += row_height; - out_offset_x = 0; - row_height = 0; - } - - /* Grow atlas height */ - if ((out_offset_y + tex_h) > atlas_h) { - u64 diff = (out_offset_y + tex_h) - atlas_h; - /* NOTE: This allocation must be contiguous with the initial atlas - * allocation (IE: No non-atlas arena PUSHes) */ - arena_push_array(arena, u32, diff * atlas_w); - atlas_h += diff; - } - - /* Set bounding box metrics (now that we know atlas x & y) */ - glyph->atlas_rect = ZI; - glyph->atlas_rect.x = (f32)out_offset_x; - glyph->atlas_rect.y = (f32)out_offset_y; - glyph->atlas_rect.width = (f32)tex_w; - glyph->atlas_rect.height = (f32)tex_h; - - /* Fill atlas */ - u64 in_pitch = (u64)dib.dsBm.bmWidthBytes / 4; - u32 *in_data = (u32 *)dib.dsBm.bmBits; - u32 *out_data = atlas_memory; - for (i32 y = 0; y < tex_h; ++y) { - u64 out_y = out_offset_y 
+ y; - u64 in_y = (u64)bounding_box.top + y; - for (i32 x = 0; x < tex_w; ++x) { - u64 out_x = out_offset_x + x; - u64 in_x = (u64)bounding_box.left + x; - u32 *out_pixel = out_data + (out_x + (out_y * atlas_w)); - u32 *in_pixel = in_data + (in_x + (in_y * in_pitch)); - *out_pixel = RGBA32(0xFF, 0xFF, 0xFF, *in_pixel & 0xFF); + if (bounding_box.left < 0 + || bounding_box.top < 0 + || bounding_box.right > raster_target_w + || bounding_box.bottom > raster_target_h) { + /* Skip */ + continue; } - } - out_offset_x += tex_w; - /* Grow row height */ - if ((u32)tex_h > row_height) { - row_height = (u32)tex_h; - } + /* Compute glyph metrics */ + DWRITE_GLYPH_METRICS glyph_metrics = ZI; - /* Clear the render target */ - { - HGDIOBJ original = SelectObject(dc, GetStockObject(DC_PEN)); - SetDCPenColor(dc, bg_color); - SelectObject(dc, GetStockObject(DC_BRUSH)); - SetDCBrushColor(dc, bg_color); - Rectangle(dc, bounding_box.left, bounding_box.top, bounding_box.right, bounding_box.bottom); - SelectObject(dc, original); + + error = font_face->GetDesignGlyphMetrics(&i, 1, &glyph_metrics, false); + + f32 off_x = (f32)bounding_box.left - raster_target_x; + f32 off_y = (f32)bounding_box.top - raster_target_y; + f32 advance = (f32)glyph_metrics.advanceWidth * pixel_per_design_unit; + i32 tex_w = bounding_box.right - bounding_box.left; + i32 tex_h = bounding_box.bottom - bounding_box.top; + + struct font_glyph *glyph = &glyphs[i]; + glyph->off_x = off_x; + glyph->off_y = off_y; + glyph->advance = round_up(advance); + glyph->width = (f32)tex_w; + glyph->height = (f32)tex_h; + + /* Get the bitmap */ + HBITMAP bitmap = (HBITMAP)GetCurrentObject(dc, OBJ_BITMAP); + DIBSECTION dib = ZI; + GetObject(bitmap, sizeof(dib), &dib); + + /* Start new row if necessary */ + if ((out_offset_x + tex_w) >= atlas_w) { + out_offset_y += row_height; + out_offset_x = 0; + row_height = 0; + } + + /* Grow atlas height */ + if ((out_offset_y + tex_h) > atlas_h) { + u64 diff = (out_offset_y + tex_h) - atlas_h; + 
/* NOTE: This allocation must be contiguous with the initial atlas + * allocation (IE: No non-atlas arena PUSHes) */ + arena_push_array(arena, u32, diff * atlas_w); + atlas_h += diff; + } + + /* Set bounding box metrics (now that we know atlas x & y) */ + glyph->atlas_rect = ZI; + glyph->atlas_rect.x = (f32)out_offset_x; + glyph->atlas_rect.y = (f32)out_offset_y; + glyph->atlas_rect.width = (f32)tex_w; + glyph->atlas_rect.height = (f32)tex_h; + + /* Fill atlas */ + u64 in_pitch = (u64)dib.dsBm.bmWidthBytes / 4; + u32 *in_data = (u32 *)dib.dsBm.bmBits; + u32 *out_data = atlas_memory; + for (i32 y = 0; y < tex_h; ++y) { + u64 out_y = out_offset_y + y; + u64 in_y = (u64)bounding_box.top + y; + for (i32 x = 0; x < tex_w; ++x) { + u64 out_x = out_offset_x + x; + u64 in_x = (u64)bounding_box.left + x; + u32 *out_pixel = out_data + (out_x + (out_y * atlas_w)); + u32 *in_pixel = in_data + (in_x + (in_y * in_pitch)); + *out_pixel = RGBA32(0xFF, 0xFF, 0xFF, *in_pixel & 0xFF); + } + } + out_offset_x += tex_w; + + /* Grow row height */ + if ((u32)tex_h > row_height) { + row_height = (u32)tex_h; + } + + /* Clear the render target */ + { + HGDIOBJ original = SelectObject(dc, GetStockObject(DC_PEN)); + SetDCPenColor(dc, bg_color); + SelectObject(dc, GetStockObject(DC_BRUSH)); + SetDCBrushColor(dc, bg_color); + Rectangle(dc, bounding_box.left, bounding_box.top, bounding_box.right, bounding_box.bottom); + SelectObject(dc, original); + } } } diff --git a/src/user.c b/src/user.c index db6fdaf1..92c0604c 100644 --- a/src/user.c +++ b/src/user.c @@ -94,6 +94,7 @@ GLOBAL struct { i32 console_log_color_indices[LOG_LEVEL_COUNT]; f32 console_logs_height; b32 debug_console; + b32 profiler_launched; /* Window -> user */ struct sys_mutex *sys_events_mutex; @@ -175,6 +176,7 @@ GLOBAL READONLY enum user_bind_kind g_binds[SYS_BTN_COUNT] = { [SYS_BTN_F1] = USER_BIND_KIND_DEBUG_PAUSE, [SYS_BTN_F2] = USER_BIND_KIND_DEBUG_CAMERA, [SYS_BTN_F3] = USER_BIND_KIND_DEBUG_DRAW, + [SYS_BTN_F4] = 
USER_BIND_KIND_PROFILER, [SYS_BTN_GRAVE_ACCENT] = USER_BIND_KIND_DEBUG_CONSOLE, [SYS_BTN_F11] = USER_BIND_KIND_FULLSCREEN, [SYS_BTN_MWHEELUP] = USER_BIND_KIND_ZOOM_IN, @@ -195,12 +197,11 @@ GLOBAL READONLY enum user_bind_kind g_binds[SYS_BTN_COUNT] = { INTERNAL APP_EXIT_CALLBACK_FUNC_DEF(user_shutdown); INTERNAL LOG_EVENT_CALLBACK_FUNC_DEF(debug_console_log_callback, log); -INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(user_thread_entry_point, arg); -INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(user_local_sim_thread_entry_point, arg); +INTERNAL SYS_THREAD_DEF(user_thread_entry_point, arg); +INTERNAL SYS_THREAD_DEF(user_local_sim_thread_entry_point, arg); INTERNAL SYS_WINDOW_EVENT_CALLBACK_FUNC_DEF(window_event_callback, event); -struct user_startup_receipt user_startup(struct work_startup_receipt *work_sr, - struct gp_startup_receipt *gp_sr, +struct user_startup_receipt user_startup(struct gp_startup_receipt *gp_sr, struct font_startup_receipt *font_sr, struct sprite_startup_receipt *sprite_sr, struct draw_startup_receipt *draw_sr, @@ -212,7 +213,6 @@ struct user_startup_receipt user_startup(struct work_startup_receipt *work_sr, struct string connect_address_str, struct sys_window *window) { - (UNUSED)work_sr; (UNUSED)gp_sr; (UNUSED)font_sr; (UNUSED)sprite_sr; @@ -624,6 +624,7 @@ INTERNAL SORT_COMPARE_FUNC_DEF(ent_draw_order_cmp, arg_a, arg_b, udata) INTERNAL void user_update(void) { + __prof; struct arena_temp scratch = scratch_begin_no_conflict(); /* ========================== * @@ -868,6 +869,26 @@ INTERNAL void user_update(void) if (G.bind_states[USER_BIND_KIND_DEBUG_CAMERA].num_presses > 0) { G.debug_camera = !G.debug_camera; } + if (G.bind_states[USER_BIND_KIND_PROFILER].num_presses > 0) { + if (G.profiler_launched) { + logf_warning("Profiler already launched"); + } else { +#if PROFILING + __profscope(Launch profiler); + struct string cmd = string_from_wstr_no_limit(scratch.arena, PROFILING_CMD_WSTR); + logf_info("Launching profiler with command \"%F\"", 
FMT_STR(cmd)); + b32 success = sys_run_command(cmd); + if (success) { + G.profiler_launched = true; + logf_success("Launched profiler successfully"); + } else { + logf_error("Failed to launch profiler using command \"%F\" (is the executable in your PATH?)", FMT_STR(cmd)); + } +#else + logf_warning("Cannot launch profiler: Program is not in profiling mode"); +#endif + } + } { if (G.bind_states[USER_BIND_KIND_DEBUG_FOLLOW].num_presses > 0) { @@ -2104,15 +2125,17 @@ INTERNAL void user_update(void) * User thread * ========================== */ -INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(user_thread_entry_point, arg) +INTERNAL SYS_THREAD_DEF(user_thread_entry_point, arg) { (UNUSED)arg; i64 last_frame_ns = 0; i64 target_dt_ns = NS_FROM_SECONDS(USER_FPS_LIMIT > (0) ? (1.0 / USER_FPS_LIMIT) : 0); while (!atomic_i32_eval(&G.user_thread_shutdown)) { - __profscope(user_update_w_sleep); - sleep_frame(last_frame_ns, target_dt_ns); + { + __profscope(User sleep); + sleep_frame(last_frame_ns, target_dt_ns); + } last_frame_ns = sys_time_ns(); user_update(); } @@ -2191,7 +2214,7 @@ struct sim_decode_queue { }; -INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(user_local_sim_thread_entry_point, arg) +INTERNAL SYS_THREAD_DEF(user_local_sim_thread_entry_point, arg) { #if 0 struct host_listen_address local_listen_addr = host_listen_address_from_local_name(LIT("LOCAL_SIM")); @@ -2268,515 +2291,518 @@ INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(user_local_sim_thread_entry_point, arg) i64 step_dt_ns = NS_FROM_SECONDS(1) / SIM_TICKS_PER_SECOND; f64 compute_timescale = 1.0; while (!atomic_i32_eval(&G.local_sim_thread_shutdown)) { - __profscope(local_sim_loop); struct arena_temp scratch = scratch_begin_no_conflict(); { - __profscope(local_sim_sleep); + __profscope(Sim sleep); sleep_frame(real_time_ns, step_dt_ns * compute_timescale); } - real_dt_ns = sys_time_ns() - real_time_ns; - real_time_ns += real_dt_ns; - - struct host_event_list host_events = host_update_begin(scratch.arena, host); - - /* Read net 
messages */ - struct sim_decode_queue queue = ZI; { - for (struct host_event *event = host_events.first; event; event = event->next) { - struct host_channel_id channel_id = event->channel_id; - struct sim_client *client = sim_client_from_channel_id(store, channel_id); - switch (event->kind) { - case HOST_EVENT_KIND_CHANNEL_OPENED: - { - if (!client->valid) { - if (is_master) { - /* Create remote client */ - client = sim_client_alloc(store); - sim_client_set_channel_id(client, channel_id); - } else { - /* Create master client */ - if (!master_client->valid) { + __profscope(Sim update); + + real_dt_ns = sys_time_ns() - real_time_ns; + real_time_ns += real_dt_ns; + + struct host_event_list host_events = host_update_begin(scratch.arena, host); + + /* Read net messages */ + struct sim_decode_queue queue = ZI; + { + for (struct host_event *event = host_events.first; event; event = event->next) { + struct host_channel_id channel_id = event->channel_id; + struct sim_client *client = sim_client_from_channel_id(store, channel_id); + switch (event->kind) { + case HOST_EVENT_KIND_CHANNEL_OPENED: + { + if (!client->valid) { + if (is_master) { + /* Create remote client */ client = sim_client_alloc(store); sim_client_set_channel_id(client, channel_id); - master_client = client; - master_blended_client = sim_client_alloc(store); } else { - /* We already have a master client */ - ASSERT(false); + /* Create master client */ + if (!master_client->valid) { + client = sim_client_alloc(store); + sim_client_set_channel_id(client, channel_id); + master_client = client; + master_blended_client = sim_client_alloc(store); + } else { + /* We already have a master client */ + ASSERT(false); + } } } - } - } break; + } break; - case HOST_EVENT_KIND_MSG: - { - if (client->valid) { - struct bitbuff msg_bb = bitbuff_from_string(event->msg); - struct bitbuff_reader msg_br = br_from_bitbuff(&msg_bb); + case HOST_EVENT_KIND_MSG: + { + if (client->valid) { + struct bitbuff msg_bb = 
bitbuff_from_string(event->msg); + struct bitbuff_reader msg_br = br_from_bitbuff(&msg_bb); - u64 ack = br_read_uv(&msg_br); - u64 double_ack = br_read_uv(&msg_br); - if (ack > client->ack) { - client->ack = ack; - } - if (double_ack > client->double_ack) { - client->double_ack = double_ack; - } + u64 ack = br_read_uv(&msg_br); + u64 double_ack = br_read_uv(&msg_br); + if (ack > client->ack) { + client->ack = ack; + } + if (double_ack > client->double_ack) { + client->double_ack = double_ack; + } - /* Read & queue incoming snapshots for decoding */ - u64 tmp_encoded_len = br_read_uv(&msg_br); - while (tmp_encoded_len > 0) { - u8 *tmp_encoded_bytes = br_read_bytes_raw(&msg_br, tmp_encoded_len); - if (!tmp_encoded_bytes) break; + /* Read & queue incoming snapshots for decoding */ + u64 tmp_encoded_len = br_read_uv(&msg_br); + while (tmp_encoded_len > 0) { + u8 *tmp_encoded_bytes = br_read_bytes_raw(&msg_br, tmp_encoded_len); + if (!tmp_encoded_bytes) break; - struct bitbuff decoder_bb = bitbuff_from_string(STRING(tmp_encoded_len, tmp_encoded_bytes)); - struct bitbuff_reader decoder_br = br_from_bitbuff(&decoder_bb); - u64 base_tick = br_read_uv(&decoder_br); - u64 tick = br_read_uv(&decoder_br); + struct bitbuff decoder_bb = bitbuff_from_string(STRING(tmp_encoded_len, tmp_encoded_bytes)); + struct bitbuff_reader decoder_br = br_from_bitbuff(&decoder_bb); + u64 base_tick = br_read_uv(&decoder_br); + u64 tick = br_read_uv(&decoder_br); - struct string tmp_encoded = ZI; - tmp_encoded.len = br_num_bytes_left(&decoder_br); - tmp_encoded.text = br_read_bytes_raw(&decoder_br, tmp_encoded.len); - if (!tmp_encoded.text) tmp_encoded.len = 0; + struct string tmp_encoded = ZI; + tmp_encoded.len = br_num_bytes_left(&decoder_br); + tmp_encoded.text = br_read_bytes_raw(&decoder_br, tmp_encoded.len); + if (!tmp_encoded.text) tmp_encoded.len = 0; - struct sim_snapshot *base_ss = sim_snapshot_from_tick(client, base_tick); - if (base_ss->tick == base_tick) { - if (is_master) { - /* 
Queue incoming slave client snapshot for decoding */ - //b32 should_decode = tick == client->highest_received_tick + 1 || client->highest_received_tick == 0; - b32 should_decode = tick > client->highest_received_tick; - if (should_decode) { - struct sim_ss_decode_node *node = arena_push(scratch.arena, struct sim_ss_decode_node); - node->client = client; - node->tick = tick; - node->base_tick = base_tick; - node->tmp_encoded = tmp_encoded; - if (queue.last) { - queue.last->next = node; - } else { - queue.first = node; + struct sim_snapshot *base_ss = sim_snapshot_from_tick(client, base_tick); + if (base_ss->tick == base_tick) { + if (is_master) { + /* Queue incoming slave client snapshot for decoding */ + //b32 should_decode = tick == client->highest_received_tick + 1 || client->highest_received_tick == 0; + b32 should_decode = tick > client->highest_received_tick; + if (should_decode) { + struct sim_ss_decode_node *node = arena_push(scratch.arena, struct sim_ss_decode_node); + node->client = client; + node->tick = tick; + node->base_tick = base_tick; + node->tmp_encoded = tmp_encoded; + if (queue.last) { + queue.last->next = node; + } else { + queue.first = node; + } + queue.last = node; + if (tick > client->highest_received_tick) { + client->highest_received_tick = tick; + } } - queue.last = node; - if (tick > client->highest_received_tick) { - client->highest_received_tick = tick; + } else { + /* Decode incoming master client snapshots for decoding (only the newest one) */ + b32 should_decode = client == master_client && tick > client->highest_received_tick; + if (should_decode) { + struct sim_ss_decode_node *node = queue.first ? 
queue.first : arena_push(scratch.arena, struct sim_ss_decode_node); + node->client = client; + node->tick = tick; + node->base_tick = base_tick; + node->tmp_encoded = tmp_encoded; + queue.first = node; + queue.last = node; + if (tick > client->highest_received_tick) { + client->highest_received_tick = tick; + if (average_master_receive_dt_ns == 0) { + average_master_receive_dt_ns = NS_FROM_SECONDS(1) / SIM_TICKS_PER_SECOND; + } else { + average_master_receive_dt_ns -= average_master_receive_dt_ns / 50; + average_master_receive_dt_ns += (real_time_ns - last_tick_from_master_received_at_ns) / 50; + } + last_tick_from_master_received_at_ns = real_time_ns; + } } } } else { - /* Decode incoming master client snapshots for decoding (only the newest one) */ - b32 should_decode = client == master_client && tick > client->highest_received_tick; - if (should_decode) { - struct sim_ss_decode_node *node = queue.first ? queue.first : arena_push(scratch.arena, struct sim_ss_decode_node); - node->client = client; - node->tick = tick; - node->base_tick = base_tick; - node->tmp_encoded = tmp_encoded; - queue.first = node; - queue.last = node; - if (tick > client->highest_received_tick) { - client->highest_received_tick = tick; - if (average_master_receive_dt_ns == 0) { - average_master_receive_dt_ns = NS_FROM_SECONDS(1) / SIM_TICKS_PER_SECOND; - } else { - average_master_receive_dt_ns -= average_master_receive_dt_ns / 50; - average_master_receive_dt_ns += (real_time_ns - last_tick_from_master_received_at_ns) / 50; - } - last_tick_from_master_received_at_ns = real_time_ns; - } - } + /* We do not have the tick that the incoming delta is based from */ + ASSERT(false); } - } else { - /* We do not have the tick that the incoming delta is based from */ - ASSERT(false); + + tmp_encoded_len = br_read_uv(&msg_br); } - - tmp_encoded_len = br_read_uv(&msg_br); } - } - } break; + } break; - default: break; - } - } - } - - /* Decode incoming snapshots */ - for (struct sim_ss_decode_node *n = 
queue.first; n; n = n->next) { - struct sim_client *client = n->client; - u64 base_tick = n->base_tick; - u64 tick = n->tick; - struct sim_snapshot *base_ss = sim_snapshot_from_tick(client, base_tick); - if (base_ss->tick == base_tick) { - struct bitbuff bb = bitbuff_from_string(n->tmp_encoded); - struct bitbuff_reader br = br_from_bitbuff(&bb); - - /* Alloc & decode snapshot */ - struct sim_snapshot *ss = sim_snapshot_alloc(client, base_ss, tick); - sim_snapshot_decode(&br, ss); - - /* Assume all incoming ents want to be sync srcs */ - for (u64 i = 0; i < ss->num_ents_reserved; ++i) { - struct sim_ent *ent = &ss->ents[i]; - if (ent->valid && sim_ent_has_prop(ent, SEPROP_SYNC_DST)) { - sim_ent_disable_prop(ent, SEPROP_SYNC_DST); - sim_ent_enable_prop(ent, SEPROP_SYNC_SRC); + default: break; } } - } else { - /* We do not have the tick that the incoming delta is based from. - * This decode should never have been queued in the first place. */ - ASSERT(false); } - } - if (!is_master && !initialized_from_master) { - if (master_client->valid && master_client->last_tick > 0) { - initialized_from_master = true; - } else { + /* Decode incoming snapshots */ + for (struct sim_ss_decode_node *n = queue.first; n; n = n->next) { + struct sim_client *client = n->client; + u64 base_tick = n->base_tick; + u64 tick = n->tick; + struct sim_snapshot *base_ss = sim_snapshot_from_tick(client, base_tick); + if (base_ss->tick == base_tick) { + struct bitbuff bb = bitbuff_from_string(n->tmp_encoded); + struct bitbuff_reader br = br_from_bitbuff(&bb); + + /* Alloc & decode snapshot */ + struct sim_snapshot *ss = sim_snapshot_alloc(client, base_ss, tick); + sim_snapshot_decode(&br, ss); + + /* Assume all incoming ents want to be sync srcs */ + for (u64 i = 0; i < ss->num_ents_reserved; ++i) { + struct sim_ent *ent = &ss->ents[i]; + if (ent->valid && sim_ent_has_prop(ent, SEPROP_SYNC_DST)) { + sim_ent_disable_prop(ent, SEPROP_SYNC_DST); + sim_ent_enable_prop(ent, SEPROP_SYNC_SRC); + } + } + } 
else { + /* We do not have the tick that the incoming delta is based from. + * This decode should never have been queued in the first place. */ + ASSERT(false); + } + } + + if (!is_master && !initialized_from_master) { + if (master_client->valid && master_client->last_tick > 0) { + initialized_from_master = true; + } else { + goto skip_step; + } + } + + b32 should_step = !atomic_i32_eval(&G.user_paused); + if (atomic_i32_eval(&G.user_paused_steps) > 0) { + should_step = true; + atomic_i32_eval_add(&G.user_paused_steps, -1); + } + + if (!should_step) { goto skip_step; } - } - b32 should_step = !atomic_i32_eval(&G.user_paused); - if (atomic_i32_eval(&G.user_paused_steps) > 0) { - should_step = true; - atomic_i32_eval_add(&G.user_paused_steps, -1); - } - - if (!should_step) { - goto skip_step; - } - - /* Update networked clients */ - u64 oldest_client_ack = 0; - for (u64 i = 0; i < store->num_clients_reserved; ++i) { - struct sim_client *client = &store->clients[i]; - if (client->valid && client != local_client && client != publish_client && client != user_input_client && client != master_client) { - client->last_rtt_ns = host_get_channel_last_rtt_ns(host, client->channel_id); - /* Release unneeded received snapshots */ - /* TDOO: Cap how many client snapshots we're willing to retain */ - if (client->double_ack > 0) { - u64 keep_tick = min_u64(client->double_ack, local_client->last_tick); - if (keep_tick > 0) { - sim_snapshot_release_ticks_in_range(client, 0, keep_tick - 1); + /* Update networked clients */ + u64 oldest_client_ack = 0; + for (u64 i = 0; i < store->num_clients_reserved; ++i) { + struct sim_client *client = &store->clients[i]; + if (client->valid && client != local_client && client != publish_client && client != user_input_client && client != master_client) { + client->last_rtt_ns = host_get_channel_last_rtt_ns(host, client->channel_id); + /* Release unneeded received snapshots */ + /* TDOO: Cap how many client snapshots we're willing to retain */ + if 
(client->double_ack > 0) { + u64 keep_tick = min_u64(client->double_ack, local_client->last_tick); + if (keep_tick > 0) { + sim_snapshot_release_ticks_in_range(client, 0, keep_tick - 1); + } + } + if (client->ack < oldest_client_ack || oldest_client_ack == 0) { + oldest_client_ack = client->ack; } } - if (client->ack < oldest_client_ack || oldest_client_ack == 0) { - oldest_client_ack = client->ack; - } } - } - /* Release unneeded published snapshots */ - { - u64 keep_tick = oldest_client_ack; - if (keep_tick == 0 && publish_client->last_tick > 0) { - keep_tick = publish_client->last_tick - 1; - } - if (keep_tick > 0) { - --keep_tick; - } - sim_snapshot_release_ticks_in_range(publish_client, 0, keep_tick); - } - - /* Release old local snapshots */ - { - u64 keep_range = 50; - if (local_client->last_tick > keep_range) { - u64 keep_tick = local_client->last_tick - keep_range; - sim_snapshot_release_ticks_in_range(local_client, 0, keep_tick); - } - } - - /* Release unneeded user input snapshots */ - sim_snapshot_release_ticks_in_range(user_input_client, 0, local_client->first_tick - 1); - - - - - - - - - - if (is_master) { - /* Step master */ - u64 prev_tick = local_client->last_tick; - u64 next_tick = prev_tick + 1; - struct sim_step_ctx ctx = ZI; - ctx.is_master = is_master; - ctx.sim_dt_ns = step_dt_ns; - ctx.accel = &accel; - ctx.user_input_client = user_input_client; - ctx.master_client = master_client; - ctx.publish_client = publish_client; - struct sim_snapshot *prev_world = sim_snapshot_from_tick(local_client, prev_tick); - ctx.world = sim_snapshot_alloc(local_client, prev_world, next_tick); - generate_user_input_cmds(user_input_client, next_tick); - sim_step(&ctx); - } else if (master_client->valid) { - /* Step client */ - - /* TODO: Eventually determine master tick based on a delay to allow for jitter and also interpolation so we can lower snapshot publish frequency */ - - - b32 master_ss_is_blended = false; - struct sim_snapshot *master_ss = 
sim_snapshot_nil(); + /* Release unneeded published snapshots */ { - /* How along are we between master sim ticks (0 = start of tick, 1 = end of tick) */ - f64 tick_progress = 0; - i64 next_tick_expected_ns = last_tick_from_master_received_at_ns + average_master_receive_dt_ns; - if (next_tick_expected_ns > last_tick_from_master_received_at_ns) { - tick_progress = (f64)(real_time_ns - last_tick_from_master_received_at_ns) / (f64)(next_tick_expected_ns - last_tick_from_master_received_at_ns); + u64 keep_tick = oldest_client_ack; + if (keep_tick == 0 && publish_client->last_tick > 0) { + keep_tick = publish_client->last_tick - 1; } - - /* Predict master sim time based on average snapshot publish dt. */ - struct sim_snapshot *newest_snapshot = sim_snapshot_from_tick(master_client, master_client->last_tick); - i64 master_sim_predicted_time_ns = newest_snapshot->sim_time_ns + (newest_snapshot->sim_dt_ns * tick_progress); - - /* Determine blend time */ - i64 master_blend_time_target_ns = master_sim_predicted_time_ns - (SIM_CLIENT_INTERP_RATIO * average_master_receive_dt_ns); - if (average_master_receive_dt_ns > 0) { - master_blend_time_ns += real_dt_ns; + if (keep_tick > 0) { + --keep_tick; } + sim_snapshot_release_ticks_in_range(publish_client, 0, keep_tick); + } - i64 blend_time_target_diff_ns = master_blend_time_target_ns - master_blend_time_ns; - if (blend_time_target_diff_ns > NS_FROM_SECONDS(0.100) || blend_time_target_diff_ns < NS_FROM_SECONDS(-0.100)) { - /* Snap blend time if it gets too far from target blend time */ - master_blend_time_ns = master_blend_time_target_ns; + /* Release old local snapshots */ + { + u64 keep_range = 50; + if (local_client->last_tick > keep_range) { + u64 keep_tick = local_client->last_tick - keep_range; + sim_snapshot_release_ticks_in_range(local_client, 0, keep_tick); } - u64 master_blend_tick = master_blend_time_ns / newest_snapshot->sim_dt_ns; + } - /* Get snapshot nearest to master blend time */ - /* TODO: Blend */ - struct 
sim_snapshot *left_snapshot = sim_snapshot_nil(); - struct sim_snapshot *right_snapshot = newest_snapshot; + /* Release unneeded user input snapshots */ + sim_snapshot_release_ticks_in_range(user_input_client, 0, local_client->first_tick - 1); + + + + + + + + + + if (is_master) { + /* Step master */ + u64 prev_tick = local_client->last_tick; + u64 next_tick = prev_tick + 1; + struct sim_step_ctx ctx = ZI; + ctx.is_master = is_master; + ctx.sim_dt_ns = step_dt_ns; + ctx.accel = &accel; + ctx.user_input_client = user_input_client; + ctx.master_client = master_client; + ctx.publish_client = publish_client; + struct sim_snapshot *prev_world = sim_snapshot_from_tick(local_client, prev_tick); + ctx.world = sim_snapshot_alloc(local_client, prev_world, next_tick); + generate_user_input_cmds(user_input_client, next_tick); + sim_step(&ctx); + } else if (master_client->valid) { + /* Step client */ + + /* TODO: Eventually determine master tick based on a delay to allow for jitter and also interpolation so we can lower snapshot publish frequency */ + + + b32 master_ss_is_blended = false; + struct sim_snapshot *master_ss = sim_snapshot_nil(); { - struct sim_snapshot *ss = sim_snapshot_from_tick(master_client, master_client->first_tick); - while (ss->valid) { - u64 next_tick = ss->next_tick; - i64 ss_time_ns = ss->sim_time_ns; - if (ss_time_ns < master_blend_time_ns && ss_time_ns > left_snapshot->sim_time_ns) { - left_snapshot = ss; - } - if (ss_time_ns > master_blend_time_ns && ss_time_ns < right_snapshot->sim_time_ns) { - right_snapshot = ss; - } - ss = sim_snapshot_from_tick(master_client, next_tick); + /* How along are we between master sim ticks (0 = start of tick, 1 = end of tick) */ + f64 tick_progress = 0; + i64 next_tick_expected_ns = last_tick_from_master_received_at_ns + average_master_receive_dt_ns; + if (next_tick_expected_ns > last_tick_from_master_received_at_ns) { + tick_progress = (f64)(real_time_ns - last_tick_from_master_received_at_ns) / 
(f64)(next_tick_expected_ns - last_tick_from_master_received_at_ns); } - } - /* Create world from blended master snapshots */ - f64 blend = 0; - if (left_snapshot->valid && right_snapshot->valid && right_snapshot->tick > left_snapshot->tick) { - blend = (f64)(master_blend_tick - left_snapshot->tick) / (f64)(right_snapshot->tick - left_snapshot->tick); - f64 epsilon = 0.001; - if (blend < epsilon) { - master_ss_is_blended = false; - master_ss = left_snapshot; - } else if (blend > 1 - epsilon) { - master_ss_is_blended = false; - master_ss = right_snapshot; + /* Predict master sim time based on average snapshot publish dt. */ + struct sim_snapshot *newest_snapshot = sim_snapshot_from_tick(master_client, master_client->last_tick); + i64 master_sim_predicted_time_ns = newest_snapshot->sim_time_ns + (newest_snapshot->sim_dt_ns * tick_progress); + + /* Determine blend time */ + i64 master_blend_time_target_ns = master_sim_predicted_time_ns - (SIM_CLIENT_INTERP_RATIO * average_master_receive_dt_ns); + if (average_master_receive_dt_ns > 0) { + master_blend_time_ns += real_dt_ns; + } + + i64 blend_time_target_diff_ns = master_blend_time_target_ns - master_blend_time_ns; + if (blend_time_target_diff_ns > NS_FROM_SECONDS(0.100) || blend_time_target_diff_ns < NS_FROM_SECONDS(-0.100)) { + /* Snap blend time if it gets too far from target blend time */ + master_blend_time_ns = master_blend_time_target_ns; + } + u64 master_blend_tick = master_blend_time_ns / newest_snapshot->sim_dt_ns; + + /* Get snapshot nearest to master blend time */ + /* TODO: Blend */ + struct sim_snapshot *left_snapshot = sim_snapshot_nil(); + struct sim_snapshot *right_snapshot = newest_snapshot; + { + struct sim_snapshot *ss = sim_snapshot_from_tick(master_client, master_client->first_tick); + while (ss->valid) { + u64 next_tick = ss->next_tick; + i64 ss_time_ns = ss->sim_time_ns; + if (ss_time_ns < master_blend_time_ns && ss_time_ns > left_snapshot->sim_time_ns) { + left_snapshot = ss; + } + if 
(ss_time_ns > master_blend_time_ns && ss_time_ns < right_snapshot->sim_time_ns) { + right_snapshot = ss; + } + ss = sim_snapshot_from_tick(master_client, next_tick); + } + } + + /* Create world from blended master snapshots */ + f64 blend = 0; + if (left_snapshot->valid && right_snapshot->valid && right_snapshot->tick > left_snapshot->tick) { + blend = (f64)(master_blend_tick - left_snapshot->tick) / (f64)(right_snapshot->tick - left_snapshot->tick); + f64 epsilon = 0.001; + if (blend < epsilon) { + master_ss_is_blended = false; + master_ss = left_snapshot; + } else if (blend > 1 - epsilon) { + master_ss_is_blended = false; + master_ss = right_snapshot; + } else { + master_ss_is_blended = true; + master_ss = sim_snapshot_alloc_from_lerp(master_blended_client, left_snapshot, right_snapshot, blend); + + /* Release unneeded blended master snapshots */ + if (master_ss->tick > 0) { + sim_snapshot_release_ticks_in_range(master_blended_client, 0, master_ss->tick - 1); + sim_snapshot_release_ticks_in_range(master_blended_client, master_ss->tick + 1, U64_MAX); + } + } } else { - master_ss_is_blended = true; - master_ss = sim_snapshot_alloc_from_lerp(master_blended_client, left_snapshot, right_snapshot, blend); - - /* Release unneeded blended master snapshots */ - if (master_ss->tick > 0) { - sim_snapshot_release_ticks_in_range(master_blended_client, 0, master_ss->tick - 1); - sim_snapshot_release_ticks_in_range(master_blended_client, master_ss->tick + 1, U64_MAX); - } + master_ss_is_blended = false; + master_ss = left_snapshot->valid ? left_snapshot : right_snapshot; } - } else { - master_ss_is_blended = false; - master_ss = left_snapshot->valid ? 
left_snapshot : right_snapshot; - } - /* Release unneeded master snapshots */ - u64 keep_master_tick = min_u64(left_snapshot->tick, master_client->double_ack); - if (keep_master_tick > 0) { - sim_snapshot_release_ticks_in_range(master_client, 0, keep_master_tick - 1); - } + /* Release unneeded master snapshots */ + u64 keep_master_tick = min_u64(left_snapshot->tick, master_client->double_ack); + if (keep_master_tick > 0) { + sim_snapshot_release_ticks_in_range(master_client, 0, keep_master_tick - 1); + } #if 0 - DEBUGBREAKABLE; - logf_debug("*************************************************"); - logf_debug("local_client->last_tick: %F", FMT_UINT(local_client->last_tick)); - logf_debug("master_sim_predicted_time_ns: %F", FMT_SINT(master_sim_predicted_time_ns)); - logf_debug("tick_progress: %F", FMT_FLOAT(tick_progress)); - logf_debug("sim_publish_timescale: %F", FMT_FLOAT(sim_publish_timescale)); - logf_debug("last_tick_from_master_received_at_ns: %F", FMT_SINT(last_tick_from_master_received_at_ns)); - logf_debug("average_master_receive_dt_ns: %F", FMT_SINT(average_master_receive_dt_ns)); - logf_debug("next_tick_expected_ns: %F", FMT_SINT(next_tick_expected_ns)); - logf_debug("master_blend_time_target_ns: %F", FMT_SINT(master_blend_time_target_ns)); - logf_debug("blend_time_target_diff_ns: %F", FMT_SINT(blend_time_target_diff_ns)); - logf_debug("master_blend_time_ns: %F", FMT_SINT(master_blend_time_ns)); - logf_debug("left_snapshot->tick: %F", FMT_UINT(left_snapshot->tick)); - logf_debug("right_snapshot->tick: %F", FMT_UINT(right_snapshot->tick)); - logf_debug("master_ss->tick: %F", FMT_UINT(master_ss->tick)); + DEBUGBREAKABLE; + logf_debug("*************************************************"); + logf_debug("local_client->last_tick: %F", FMT_UINT(local_client->last_tick)); + logf_debug("master_sim_predicted_time_ns: %F", FMT_SINT(master_sim_predicted_time_ns)); + logf_debug("tick_progress: %F", FMT_FLOAT(tick_progress)); + logf_debug("sim_publish_timescale: %F", 
FMT_FLOAT(sim_publish_timescale)); + logf_debug("last_tick_from_master_received_at_ns: %F", FMT_SINT(last_tick_from_master_received_at_ns)); + logf_debug("average_master_receive_dt_ns: %F", FMT_SINT(average_master_receive_dt_ns)); + logf_debug("next_tick_expected_ns: %F", FMT_SINT(next_tick_expected_ns)); + logf_debug("master_blend_time_target_ns: %F", FMT_SINT(master_blend_time_target_ns)); + logf_debug("blend_time_target_diff_ns: %F", FMT_SINT(blend_time_target_diff_ns)); + logf_debug("master_blend_time_ns: %F", FMT_SINT(master_blend_time_ns)); + logf_debug("left_snapshot->tick: %F", FMT_UINT(left_snapshot->tick)); + logf_debug("right_snapshot->tick: %F", FMT_UINT(right_snapshot->tick)); + logf_debug("master_ss->tick: %F", FMT_UINT(master_ss->tick)); #endif - } - - if (master_ss->valid) { - struct sim_ent *master_player = sim_ent_find_first_match_one(master_ss, SEPROP_PLAYER_IS_MASTER); - - /* Update ent id from master */ - { - user_input_client->player_id = master_ss->local_player; - local_client->player_id = master_ss->local_player; } - /* Check for misprediction */ - u64 mispredicted_tick = 0; - if (!master_ss_is_blended) { - /* TODO: Actually check for misprediction rather than triggering mispredict any time a new master snapshot is received */ - mispredicted_tick = master_ss->tick; - } + if (master_ss->valid) { + struct sim_ent *master_player = sim_ent_find_first_match_one(master_ss, SEPROP_PLAYER_IS_MASTER); - - u64 step_base_tick = local_client->last_tick; - u64 step_end_tick = step_base_tick + 1; - if (mispredicted_tick > 0) { - step_base_tick = mispredicted_tick; - if (step_end_tick <= step_base_tick) { - step_end_tick = step_base_tick + 1; - } - } - - /* We want to simulate the ahead of the server to predict client input. - * How many ticks ahead we want to simulate is a balance between added latency and the server not receiving our inputs on time. 
- * We can take the server's ack minus the server's tick to determine how many cmds of ours the server has buffered. - * - * If this buffer gets too low (because we are lagging behind or the connection is unstable), meaning the server is not getting our input on time: - * - Shorten local compute rate to increase the rate at which we predict ahead & produce cmds, until the server's ack indicates a buffer size within desired range. - * - * If this buffer gets too large (because the client predicts too far ahead), meaning unneeded latency is being introduced: - * - Dilate local compute rate to decrease the rate at which we predict ahead & produce cmds until the server's ack indicates a buffer size within desired range. - */ - { - i64 cmds_ahead_on_master = (i64)master_client->ack - (i64)master_client->last_tick; - if (cmds_ahead_on_master < -3 || cmds_ahead_on_master > 10) { - /* Cmds are too far from master time, snap step end tick */ - i64 rtt_ns = master_client->last_rtt_ns; - f64 rtt_tick_ratio = (f64)(rtt_ns + (step_dt_ns - 1)) / (f64)step_dt_ns; - i64 num_predict_ticks = math_round_to_int64(rtt_tick_ratio) + 5; - step_end_tick = master_client->last_tick + num_predict_ticks; - compute_timescale = 1.1; - } else if (cmds_ahead_on_master > 2) { - /* Slow down simulation to dial back how far ahead we are predicting and bring local sim time closer to master sim time */ - compute_timescale = 1.1; - } else if (cmds_ahead_on_master < 1) { - /* Speed up simulation rate predict more ticks and give master more inputs to work with */ - compute_timescale = 0.9; - } else { - /* Server's cmd buffer is in a healthy range */ - compute_timescale = 1; - } - } - - /* Sync master with local base tick */ - struct sim_snapshot *base_ss = sim_snapshot_from_tick(local_client, step_base_tick); - if (mispredicted_tick) { - if (base_ss->valid) { - sim_snapshot_sync_ents(base_ss, master_ss, master_player->id, 0); - } else { - base_ss = sim_snapshot_alloc(local_client, master_ss, 
step_base_tick); - } - } - - /* Release any existing ticks that are about to be simulated */ - sim_snapshot_release_ticks_in_range(local_client, step_base_tick + 1, U64_MAX); - - /* Step */ - generate_user_input_cmds(user_input_client, step_end_tick); - { - struct sim_step_ctx ctx = ZI; - ctx.is_master = is_master; - ctx.sim_dt_ns = step_dt_ns; - ctx.accel = &accel; - ctx.user_input_client = user_input_client; - ctx.master_client = master_client; - ctx.publish_client = publish_client; - - u64 step_tick = step_base_tick + 1; - struct sim_snapshot *prev_ss = base_ss; - while (step_tick <= step_end_tick) { - ctx.world = sim_snapshot_alloc(local_client, prev_ss, step_tick); - if (!mispredicted_tick && step_tick == step_end_tick) { - sim_snapshot_sync_ents(ctx.world, master_ss, master_player->id, SIM_SYNC_FLAG_NOSYNC_PREDICTABLES); - } - sim_step(&ctx); - prev_ss = ctx.world; - ++step_tick; - } - } - } - } - - /* Publish snapshot to remote clients */ - for (u64 i = 0; i < store->num_clients_reserved; ++i) { - struct sim_client *client = &store->clients[i]; - if (client->valid && client != user_input_client && client != local_client && client != publish_client) { - struct bitbuff_writer msg_bw = bw_from_bitbuff(&msg_writer_bb); - - bw_write_uv(&msg_bw, client->highest_received_tick); /* ack */ - bw_write_uv(&msg_bw, client->ack); /* double ack */ - - struct sim_snapshot *base_ss = sim_snapshot_from_tick(publish_client, client->ack); - struct sim_snapshot *publish_ss; - if (client == master_client) { - /* If sending to master, start sending all snapshots since last ack */ - publish_ss = sim_snapshot_from_closest_tick_gte(publish_client, base_ss->tick + 1); - } else { - /* If sending to slave, only send latest snapshot */ - publish_ss = sim_snapshot_from_tick(publish_client, publish_client->last_tick); - } - - while (publish_ss->valid) { - struct bitbuff_writer snapshot_bw = bw_from_bitbuff(&snapshot_writer_bb); - struct string tmp_snapshot_encoded = ZI; + /* Update ent id 
from master */ { - bw_write_uv(&snapshot_bw, base_ss->tick); - bw_write_uv(&snapshot_bw, publish_ss->tick); - sim_snapshot_encode(&snapshot_bw, client, base_ss, publish_ss); - tmp_snapshot_encoded.len = bw_num_bytes_written(&snapshot_bw); - tmp_snapshot_encoded.text = bw_get_written_raw(&snapshot_bw); + user_input_client->player_id = master_ss->local_player; + local_client->player_id = master_ss->local_player; + } + + /* Check for misprediction */ + u64 mispredicted_tick = 0; + if (!master_ss_is_blended) { + /* TODO: Actually check for misprediction rather than triggering mispredict any time a new master snapshot is received */ + mispredicted_tick = master_ss->tick; + } + + + u64 step_base_tick = local_client->last_tick; + u64 step_end_tick = step_base_tick + 1; + if (mispredicted_tick > 0) { + step_base_tick = mispredicted_tick; + if (step_end_tick <= step_base_tick) { + step_end_tick = step_base_tick + 1; + } + } + + /* We want to simulate the ahead of the server to predict client input. + * How many ticks ahead we want to simulate is a balance between added latency and the server not receiving our inputs on time. + * We can take the server's ack minus the server's tick to determine how many cmds of ours the server has buffered. + * + * If this buffer gets too low (because we are lagging behind or the connection is unstable), meaning the server is not getting our input on time: + * - Shorten local compute rate to increase the rate at which we predict ahead & produce cmds, until the server's ack indicates a buffer size within desired range. + * + * If this buffer gets too large (because the client predicts too far ahead), meaning unneeded latency is being introduced: + * - Dilate local compute rate to decrease the rate at which we predict ahead & produce cmds until the server's ack indicates a buffer size within desired range. 
+ */ + { + i64 cmds_ahead_on_master = (i64)master_client->ack - (i64)master_client->last_tick; + if (cmds_ahead_on_master < -3 || cmds_ahead_on_master > 10) { + /* Cmds are too far from master time, snap step end tick */ + i64 rtt_ns = master_client->last_rtt_ns; + f64 rtt_tick_ratio = (f64)(rtt_ns + (step_dt_ns - 1)) / (f64)step_dt_ns; + i64 num_predict_ticks = math_round_to_int64(rtt_tick_ratio) + 5; + step_end_tick = master_client->last_tick + num_predict_ticks; + compute_timescale = 1.1; + } else if (cmds_ahead_on_master > 2) { + /* Slow down simulation to dial back how far ahead we are predicting and bring local sim time closer to master sim time */ + compute_timescale = 1.1; + } else if (cmds_ahead_on_master < 1) { + /* Speed up simulation rate predict more ticks and give master more inputs to work with */ + compute_timescale = 0.9; + } else { + /* Server's cmd buffer is in a healthy range */ + compute_timescale = 1; + } + } + + /* Sync master with local base tick */ + struct sim_snapshot *base_ss = sim_snapshot_from_tick(local_client, step_base_tick); + if (mispredicted_tick) { + if (base_ss->valid) { + sim_snapshot_sync_ents(base_ss, master_ss, master_player->id, 0); + } else { + base_ss = sim_snapshot_alloc(local_client, master_ss, step_base_tick); + } + } + + /* Release any existing ticks that are about to be simulated */ + sim_snapshot_release_ticks_in_range(local_client, step_base_tick + 1, U64_MAX); + + /* Step */ + generate_user_input_cmds(user_input_client, step_end_tick); + { + struct sim_step_ctx ctx = ZI; + ctx.is_master = is_master; + ctx.sim_dt_ns = step_dt_ns; + ctx.accel = &accel; + ctx.user_input_client = user_input_client; + ctx.master_client = master_client; + ctx.publish_client = publish_client; + + u64 step_tick = step_base_tick + 1; + struct sim_snapshot *prev_ss = base_ss; + while (step_tick <= step_end_tick) { + ctx.world = sim_snapshot_alloc(local_client, prev_ss, step_tick); + if (!mispredicted_tick && step_tick == step_end_tick) { + 
sim_snapshot_sync_ents(ctx.world, master_ss, master_player->id, SIM_SYNC_FLAG_NOSYNC_PREDICTABLES); + } + sim_step(&ctx); + prev_ss = ctx.world; + ++step_tick; + } } - bw_write_uv(&msg_bw, tmp_snapshot_encoded.len); - bw_write_bytes(&msg_bw, tmp_snapshot_encoded); - publish_ss = sim_snapshot_from_tick(publish_client, publish_ss->tick + 1); } - bw_write_uv(&msg_bw, 0); - - struct string encoded = ZI; - encoded.len = bw_num_bytes_written(&msg_bw); - encoded.text = bw_get_written_raw(&msg_bw); - host_queue_write(host, client->channel_id, encoded, 0); } - } - /* Copy local snapshot to user client */ - { - struct sim_snapshot *local_ss = sim_snapshot_from_tick(local_client, local_client->last_tick); - if (local_ss->valid) { - /* TODO: Double buffer */ - struct sys_lock lock = sys_mutex_lock_e(G.local_to_user_client_mutex); - sim_snapshot_alloc(G.local_to_user_client, local_ss, local_ss->tick); - i64 publish_ns = sys_time_ns(); - G.local_to_user_client_publish_dt_ns = publish_ns - last_publish_to_user_ns; - G.local_to_user_client_publish_time_ns = publish_ns; - last_publish_to_user_ns = publish_ns; - sim_snapshot_release_ticks_in_range(G.local_to_user_client, 0, local_ss->tick - 1); - sys_mutex_unlock(&lock); + /* Publish snapshot to remote clients */ + for (u64 i = 0; i < store->num_clients_reserved; ++i) { + struct sim_client *client = &store->clients[i]; + if (client->valid && client != user_input_client && client != local_client && client != publish_client) { + struct bitbuff_writer msg_bw = bw_from_bitbuff(&msg_writer_bb); + + bw_write_uv(&msg_bw, client->highest_received_tick); /* ack */ + bw_write_uv(&msg_bw, client->ack); /* double ack */ + + struct sim_snapshot *base_ss = sim_snapshot_from_tick(publish_client, client->ack); + struct sim_snapshot *publish_ss; + if (client == master_client) { + /* If sending to master, start sending all snapshots since last ack */ + publish_ss = sim_snapshot_from_closest_tick_gte(publish_client, base_ss->tick + 1); + } else { + /* 
If sending to slave, only send latest snapshot */ + publish_ss = sim_snapshot_from_tick(publish_client, publish_client->last_tick); + } + + while (publish_ss->valid) { + struct bitbuff_writer snapshot_bw = bw_from_bitbuff(&snapshot_writer_bb); + struct string tmp_snapshot_encoded = ZI; + { + bw_write_uv(&snapshot_bw, base_ss->tick); + bw_write_uv(&snapshot_bw, publish_ss->tick); + sim_snapshot_encode(&snapshot_bw, client, base_ss, publish_ss); + tmp_snapshot_encoded.len = bw_num_bytes_written(&snapshot_bw); + tmp_snapshot_encoded.text = bw_get_written_raw(&snapshot_bw); + } + bw_write_uv(&msg_bw, tmp_snapshot_encoded.len); + bw_write_bytes(&msg_bw, tmp_snapshot_encoded); + publish_ss = sim_snapshot_from_tick(publish_client, publish_ss->tick + 1); + } + bw_write_uv(&msg_bw, 0); + + struct string encoded = ZI; + encoded.len = bw_num_bytes_written(&msg_bw); + encoded.text = bw_get_written_raw(&msg_bw); + host_queue_write(host, client->channel_id, encoded, 0); + } } + + /* Copy local snapshot to user client */ + { + struct sim_snapshot *local_ss = sim_snapshot_from_tick(local_client, local_client->last_tick); + if (local_ss->valid) { + /* TODO: Double buffer */ + struct sys_lock lock = sys_mutex_lock_e(G.local_to_user_client_mutex); + sim_snapshot_alloc(G.local_to_user_client, local_ss, local_ss->tick); + i64 publish_ns = sys_time_ns(); + G.local_to_user_client_publish_dt_ns = publish_ns - last_publish_to_user_ns; + G.local_to_user_client_publish_time_ns = publish_ns; + last_publish_to_user_ns = publish_ns; + sim_snapshot_release_ticks_in_range(G.local_to_user_client, 0, local_ss->tick - 1); + sys_mutex_unlock(&lock); + } + } + +skip_step: + + /* Send host messages */ + host_update_end(host); + __profframe("Local sim"); + + scratch_end(scratch); } - - skip_step: - - /* Send host messages */ - host_update_end(host); - __profframe("Local sim"); - - scratch_end(scratch); } sim_client_store_release(store); diff --git a/src/user.h b/src/user.h index b5c49ea8..42ce727e 
100644 --- a/src/user.h +++ b/src/user.h @@ -2,7 +2,6 @@ #define USER_H struct sys_window; -struct work_startup_receipt; struct gp_startup_receipt; struct font_startup_receipt; struct sprite_startup_receipt; @@ -36,6 +35,7 @@ enum user_bind_kind { USER_BIND_KIND_DEBUG_WALLS, USER_BIND_KIND_DEBUG_FOLLOW, USER_BIND_KIND_DEBUG_DRAW, + USER_BIND_KIND_PROFILER, USER_BIND_KIND_DEBUG_CONSOLE, USER_BIND_KIND_DEBUG_CAMERA, USER_BIND_KIND_DEBUG_PAUSE, @@ -61,8 +61,7 @@ enum user_bind_kind { }; struct user_startup_receipt { i32 _; }; -struct user_startup_receipt user_startup(struct work_startup_receipt *work_sr, - struct gp_startup_receipt *gp_sr, +struct user_startup_receipt user_startup(struct gp_startup_receipt *gp_sr, struct font_startup_receipt *font_sr, struct sprite_startup_receipt *sprite_sr, struct draw_startup_receipt *draw_sr, diff --git a/src/util.h b/src/util.h index aad39548..51a25376 100644 --- a/src/util.h +++ b/src/util.h @@ -306,7 +306,6 @@ INLINE void sync_flag_wait(struct sync_flag *sf) INLINE void sleep_frame(i64 last_frame_time_ns, i64 target_dt_ns) { - __prof; if (last_frame_time_ns != 0 && target_dt_ns > 0) { i64 now_ns = sys_time_ns(); i64 last_frame_dt_ns = now_ns - last_frame_time_ns; diff --git a/src/work.c b/src/work.c deleted file mode 100644 index fb248144..00000000 --- a/src/work.c +++ /dev/null @@ -1,612 +0,0 @@ -#include "work.h" -#include "intrinsics.h" -#include "sys.h" -#include "arena.h" -#include "scratch.h" -#include "memory.h" -#include "string.h" -#include "log.h" -#include "thread_local.h" -#include "atomic.h" -#include "app.h" - -/* Terminology: - * - * Task: Single unit of stuff to be done (a function with a data pointer) - * - * Work: A group of tasks (doesn't have to be homogeneous) bundled together. - * Work is "complete" when all of its tasks are complete. - * - * Work Slate: A list of tasks used as a building-tool for constructing work. - * - * Worker: A thread that can do work. 
"work_startup" will create a certain - * amount of dedicated worker threads. Note that non-worker threads can - * also do work themselves (IE: callers of "work_wait") - */ - -struct worker { - struct sys_thread *thread; - struct worker *next; -}; - -struct work_task; - -struct work { - enum work_priority priority; - enum work_status status; - u32 workers; - - struct sys_condition_variable *condition_variable_finished; - - struct work *prev_scheduled; - struct work *next_scheduled; - struct work *next_free; - - struct work_task *task_head; /* Unstarted task head */ - u32 tasks_incomplete; - - u64 gen; -}; - -struct work_task { - void *data; - work_task_func *func; - struct work *work; - - struct work_task *next_in_work; - struct work_task *next_free; -}; - -/* ========================== * - * Global state - * ========================== */ - -GLOBAL struct { - struct arena *arena; - - b32 workers_shutdown; - struct sys_mutex *mutex; - struct sys_condition_variable *cv; - - u32 worker_count; - u32 idle_worker_count; - struct worker *worker_head; - - /* TODO: Make below pointers volatile? 
*/ - - struct work_task *free_task_head; - - struct work *free_work_head; - struct work *scheduled_work_head; - - /* Pointers to the last piece of work of each priority in the scheduled - * work list (used for O(1) insertion) */ - struct work *scheduled_work_priority_tails[NUM_WORK_PRIORITIES]; -} G = ZI, DEBUG_ALIAS(G, G_work); - -/* ========================== * - * Thread local state - * ========================== */ - -struct worker_ctx { - b32 is_worker; -}; - -GLOBAL THREAD_LOCAL_VAR_DEF(tl_worker_ctx, struct worker_ctx, NULL, NULL); - -/* ========================== * - * Startup - * ========================== */ - -INTERNAL APP_EXIT_CALLBACK_FUNC_DEF(work_shutdown); -INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(worker_thread_entry_point, thread_data); - -struct work_startup_receipt work_startup(u32 num_worker_threads) -{ - struct arena_temp scratch = scratch_begin_no_conflict(); - - if (num_worker_threads <= 0) { - sys_panic(LIT("Tried to start up worker pool with 0 threads")); - } - - G.arena = arena_alloc(GIGABYTE(64)); - G.mutex = sys_mutex_alloc(); - G.cv = sys_condition_variable_alloc(); - G.worker_count = num_worker_threads; - G.idle_worker_count = num_worker_threads; - app_register_exit_callback(&work_shutdown); - - /* Initialize threads */ - struct sys_lock lock = sys_mutex_lock_e(G.mutex); - { - struct worker *prev = NULL; - for (u32 i = 0; i < num_worker_threads; ++i) { - struct string thread_name = string_format(scratch.arena, - LIT("[P6] Worker %F"), - FMT_UINT(i)); - - struct worker *worker = arena_push(G.arena, struct worker); - worker->thread = sys_thread_alloc(&worker_thread_entry_point, NULL, thread_name); - if (prev) { - prev->next = worker; - } else { - G.worker_head = worker; - } - prev = worker; - } - } - sys_mutex_unlock(&lock); - - scratch_end(scratch); - - return (struct work_startup_receipt) { 0 }; -} - -INTERNAL APP_EXIT_CALLBACK_FUNC_DEF(work_shutdown) -{ - __prof; - - struct sys_lock lock = sys_mutex_lock_e(G.mutex); - { - 
G.workers_shutdown = true; - sys_condition_variable_broadcast(G.cv); - } - sys_mutex_unlock(&lock); - - for (struct worker *worker = G.worker_head; worker; worker = worker->next) { - sys_thread_wait_release(worker->thread); - } -} - -/* ========================== * - * Internal work / task allocation - * ========================== */ - -INTERNAL struct work *work_alloc_locked(struct sys_lock *lock) -{ - __prof; - sys_assert_locked_e(lock, G.mutex); - (UNUSED)lock; - - struct work *work = NULL; - - /* Allocate work */ - if (G.free_work_head) { - /* Reuse from free list */ - work = G.free_work_head; - G.free_work_head = work->next_free; - *work = (struct work) { - .condition_variable_finished = work->condition_variable_finished, - .gen = work->gen + 1 - }; - } else { - /* Make new */ - work = arena_push_no_zero(G.arena, struct work); - *work = (struct work) { - .condition_variable_finished = sys_condition_variable_alloc(), - .gen = 1 - }; - } - return work; -} - -INTERNAL void work_release_locked(struct sys_lock *lock, struct work *work) -{ - sys_assert_locked_e(lock, G.mutex); - (UNUSED)lock; - - work->next_free = G.free_work_head; - G.free_work_head = work; - ++work->gen; -} - -INTERNAL struct work_handle work_to_handle_locked(struct sys_lock *lock, struct work *work) -{ - sys_assert_locked_e(lock, G.mutex); - (UNUSED)lock; - - return (struct work_handle) { - .work = work, - .gen = work->gen - }; -} - -INTERNAL struct work_task *task_alloc_locked(struct sys_lock *lock) -{ - sys_assert_locked_e(lock, G.mutex); - (UNUSED)lock; - - struct work_task *task = NULL; - - /* Allocate task */ - if (G.free_task_head) { - /* Reuse from free list */ - task = G.free_task_head; - G.free_task_head = task->next_free; - *task = (struct work_task) { 0 }; - } else { - /* Make new */ - task = arena_push(G.arena, struct work_task); - } - - return task; -} - -INTERNAL void task_release_locked(struct sys_lock *lock, struct work_task *task) -{ - sys_assert_locked_e(lock, G.mutex); - 
(UNUSED)lock; - - task->next_free = G.free_task_head; - G.free_task_head = task; -} - -/* ========================== * - * Work scheduling / insertion - * ========================== */ - -INTERNAL void work_schedule_locked(struct sys_lock *lock, struct work *work) -{ - __prof; - sys_assert_locked_e(lock, G.mutex); - (UNUSED)lock; - - enum work_priority priority = work->priority; - - if (G.scheduled_work_head) { - struct work *head = G.scheduled_work_head; - - if (head->priority >= priority) { - /* Head is lower priority, insert work as new head */ - G.scheduled_work_head = work; - work->next_scheduled = head; - head->prev_scheduled = work; - } else { - /* Find higher priority */ - struct work *tail = NULL; - for (i32 i = priority; i >= 0; --i) { - tail = G.scheduled_work_priority_tails[i]; - if (tail) { - break; - } - } - /* Hook work */ - work->next_scheduled = tail->next_scheduled; - work->prev_scheduled = tail; - tail->next_scheduled = work; - } - } else { - G.scheduled_work_head = work; - } - - G.scheduled_work_priority_tails[priority] = work; - - sys_condition_variable_signal(G.cv, work->tasks_incomplete); -} - -INTERNAL void work_unschedule_locked(struct sys_lock *lock, struct work *work) -{ - __prof; - sys_assert_locked_e(lock, G.mutex); - (UNUSED)lock; - - struct work *prev = (struct work *)work->prev_scheduled; - struct work *next = (struct work *)work->next_scheduled; - - /* Remove from priority tails array */ - enum work_priority priority = work->priority; - struct work *priority_tail = G.scheduled_work_priority_tails[priority]; - if (priority_tail == work && (!prev || prev->priority == priority)) { - G.scheduled_work_priority_tails[priority] = prev; - } - - /* Unhook work */ - if (prev) { - prev->next_scheduled = next; - } - if (next) { - next->prev_scheduled = prev; - } - if (work == G.scheduled_work_head) { - G.scheduled_work_head = next; - } -} - -/* ========================== * - * Task dequeuing - * ========================== */ - -INTERNAL struct 
work_task *work_dequeue_task_locked(struct sys_lock *lock, struct work *work) -{ - __prof; - sys_assert_locked_e(lock, G.mutex); - - struct work_task *task = work->task_head; - if (task) { - work->task_head = task->next_in_work; - if (!work->task_head) { - /* Unschedule work if last task */ - work_unschedule_locked(lock, work); - } - } - return task; -} - -/* ========================== * - * Work doing - * ========================== */ - -/* NOTE: This function will release `work` if there are no more tasks once completed. - * Returns `true` if more tasks are still present in the work after completion. */ -INTERNAL b32 work_exec_single_task_maybe_release_locked(struct sys_lock *lock, struct work *work) -{ - __prof; - sys_assert_locked_e(lock, G.mutex); - - struct work_task *task = work_dequeue_task_locked(lock, work); - b32 more_tasks = work->task_head != NULL; - - if (task) { - work->status = WORK_STATUS_IN_PROGRESS; - - ++work->workers; - /* Do task (temporarily unlock) */ - { - sys_mutex_unlock(lock); - task->func(task->data); - *lock = sys_mutex_lock_e(G.mutex); - } - --work->workers; - --work->tasks_incomplete; - task_release_locked(lock, task); - - if (work->tasks_incomplete == 0) { - /* Signal finished */ - work->status = WORK_STATUS_DONE; - sys_condition_variable_broadcast(work->condition_variable_finished); - - /* Release */ - work_release_locked(lock, work); - } - } - - return more_tasks; -} - -INTERNAL void work_exec_remaining_tasks_maybe_release_locked(struct sys_lock *lock, struct work *work) -{ - __prof; - sys_assert_locked_e(lock, G.mutex); - - b32 more_tasks = true; - while (more_tasks) { - more_tasks = work_exec_single_task_maybe_release_locked(lock, work); - } -} - -/* ========================== * - * Work thread proc - * ========================== */ - -INTERNAL SYS_THREAD_ENTRY_POINT_FUNC_DEF(worker_thread_entry_point, thread_data) -{ - (UNUSED)thread_data; - - struct worker_ctx *ctx = thread_local_var_eval(&tl_worker_ctx); - *ctx = (struct 
worker_ctx) { - .is_worker = true - }; - - struct sys_lock lock = sys_mutex_lock_e(G.mutex); - { - while (!G.workers_shutdown) { - struct work *work = G.scheduled_work_head; - if (work) { - __profscope(work_pool_task); - --G.idle_worker_count; - work_exec_single_task_maybe_release_locked(&lock, work); - ++G.idle_worker_count; - } else { - sys_condition_variable_wait(G.cv, &lock); - } - } - } - sys_mutex_unlock(&lock); -} - -/* ========================== * - * Work pushing interface - * ========================== */ - -/* If `help` is true, then the calling thread will start picking up tasks immediately (before other workers can see it) */ -INTERNAL struct work_handle work_push_from_slate_locked(struct sys_lock *lock, struct work_slate *ws, b32 help, enum work_priority priority) -{ - __prof; - sys_assert_locked_e(lock, G.mutex); - - struct work *work = work_alloc_locked(lock); - struct work_handle wh = work_to_handle_locked(lock, work); - - work->priority = priority; - work->status = WORK_STATUS_IN_PROGRESS; - - work->task_head = ws->task_head; - work->tasks_incomplete = ws->num_tasks; - - work_schedule_locked(lock, work); - - if (help) { - work_exec_remaining_tasks_maybe_release_locked(lock, work); - } else { - /* When work is submitted from a worker thread, we want the worker to pick - * up the tasks itself when idle workers = 0 and work.workers = 0 - * (work.workers will always = 0 when work is first pushed). - * - * This is not ideal, however it is necessary to prevent - * a scenario in which all workers are waiting on child work to complete in - * a subtle way (IE: outside of work_wait). Since all workers are waiting, - * there would be no remaining workers to complete the child work, meaning - * there is a deadlock. - * - * By forcing workers to do their own child work in this scenario, we can - * guarantee that this does not occur. However it is not ideal since it - * creates situations in which work is not done asynchronously. 
- */ - struct worker_ctx *ctx = thread_local_var_eval(&tl_worker_ctx); - if (ctx->is_worker) { - b32 work_done = false; - while (!work_done && G.idle_worker_count == 0 && work->workers == 0) { - work_done = !work_exec_single_task_maybe_release_locked(lock, work); - } - } - } - - return wh; -} - -INTERNAL struct work_handle work_push_task_internal(work_task_func *func, void *data, b32 help, enum work_priority priority) -{ - struct work_handle handle; - struct sys_lock lock = sys_mutex_lock_e(G.mutex); - { - struct work_task *task = task_alloc_locked(&lock); - task->data = data; - task->func = func; - - struct work_slate ws = { - .task_head = task, - .task_tail = task, - .num_tasks = 1 - }; - handle = work_push_from_slate_locked(&lock, &ws, help, priority); - } - sys_mutex_unlock(&lock); - return handle; -} - -/* Push work that contains a single task */ -struct work_handle work_push_task(work_task_func *func, void *data, enum work_priority priority) -{ - __prof; - struct work_handle handle = work_push_task_internal(func, data, false, priority); - return handle; -} - -struct work_handle work_push_task_and_help(work_task_func *func, void *data, enum work_priority priority) -{ - __prof; - struct work_handle handle = work_push_task_internal(func, data, true, priority); - return handle; -} - -struct work_slate work_slate_begin(void) -{ - __prof; - struct work_slate ws = ZI; - return ws; -} - -void work_slate_push_task(struct work_slate *ws, work_task_func *func, void *data) -{ - __prof; - - struct work_task *task = NULL; - struct sys_lock lock = sys_mutex_lock_e(G.mutex); - { - task = task_alloc_locked(&lock); - } - sys_mutex_unlock(&lock); - - task->data = data; - task->func = func; - - if (ws->task_tail) { - ws->task_tail->next_in_work = task; - } else { - ws->task_head = task; - } - ws->task_tail = task; - ++ws->num_tasks; - -} - -/* Push work that contains multiple tasks (work slate) */ -struct work_handle work_slate_end(struct work_slate *ws, enum work_priority 
priority) -{ - __prof; - - struct work_handle handle; - struct sys_lock lock = sys_mutex_lock_e(G.mutex); - { - handle = work_push_from_slate_locked(&lock, ws, false, priority); - } - sys_mutex_unlock(&lock); - - return handle; -} - -struct work_handle work_slate_end_and_help(struct work_slate *ws, enum work_priority priority) -{ - __prof; - struct work_handle handle = ZI; - if (ws->num_tasks > 0) { - struct sys_lock lock = sys_mutex_lock_e(G.mutex); - handle = work_push_from_slate_locked(&lock, ws, true, priority); - sys_mutex_unlock(&lock); - } - return handle; -} - -/* ========================== * - * Work intervention interface - * ========================== */ - -INTERNAL struct work *work_from_handle_locked(struct sys_lock *lock, struct work_handle handle) -{ - sys_assert_locked_e(lock, G.mutex); - (UNUSED)lock; - - struct work *work = handle.work; - if (work && work->gen != handle.gen) { - work = NULL; - } - return work; -} - -/* Wait for all tasks in work to be completed. Will also pick up any unstarted - * tasks in the work since the caller will be idle while waiting anyway. 
*/ -void work_wait(struct work_handle handle) -{ - __prof; - struct sys_lock lock = sys_mutex_lock_e(G.mutex); - { - struct work *work = work_from_handle_locked(&lock, handle); - if (work) { - /* Help with tasks */ - work_exec_remaining_tasks_maybe_release_locked(&lock, work); - - /* Wait for work completion */ - work = work_from_handle_locked(&lock, handle); /* Re-checking work is sitll valid here in case work_exec caused work to release */ - if (work) { - while (work->status != WORK_STATUS_DONE) { - sys_condition_variable_wait(work->condition_variable_finished, &lock); - } - } - } - } - sys_mutex_unlock(&lock); -} - -/* Try to pick up any scheduled tasks */ -void work_help(struct work_handle handle) -{ - __prof; - struct sys_lock lock = sys_mutex_lock_e(G.mutex); - { - struct work *work = work_from_handle_locked(&lock, handle); - if (work) { - work_exec_remaining_tasks_maybe_release_locked(&lock, work); - } - } - sys_mutex_unlock(&lock); -} diff --git a/src/work.h b/src/work.h deleted file mode 100644 index 196ae0a4..00000000 --- a/src/work.h +++ /dev/null @@ -1,48 +0,0 @@ -#ifndef WORK_H -#define WORK_H - -enum work_status { - WORK_STATUS_DONE, - WORK_STATUS_SCHEDULED, - WORK_STATUS_IN_PROGRESS -}; - -enum work_priority { - WORK_PRIORITY_HIGH, - WORK_PRIORITY_NORMAL, - - NUM_WORK_PRIORITIES -}; - -#define WORK_TASK_FUNC_DEF(name, arg_name) void name(void *arg_name) -typedef WORK_TASK_FUNC_DEF(work_task_func, data); - -struct work; -struct work_task; - -struct work_handle { - struct work *work; - u64 gen; -}; - -struct work_slate { - struct work_task *task_head; - struct work_task *task_tail; - u32 num_tasks; -}; - -struct work_startup_receipt { i32 _; }; -struct work_startup_receipt work_startup(u32 num_worker_threads); - -struct work_slate work_slate_begin(void); -struct work_handle work_slate_end(struct work_slate *ws, enum work_priority priority); -struct work_handle work_slate_end_and_help(struct work_slate *ws, enum work_priority priority); - -struct 
work_handle work_push_task(work_task_func *func, void *data, enum work_priority priority); -struct work_handle work_push_task_and_help(work_task_func *func, void *data, enum work_priority priority); -void work_slate_push_task(struct work_slate *ws, work_task_func *func, void *data); - -void work_wait(struct work_handle handle); -void work_help(struct work_handle handle); - -#endif