diff --git a/src/base/base.h b/src/base/base.h
index e978df8d..5eb1883b 100644
--- a/src/base/base.h
+++ b/src/base/base.h
@@ -713,12 +713,6 @@ Struct(ComputeShader)   { Resource resource; };
 ////////////////////////////////
 //~ Fibers
 
-/* If virtual fibers are enabled, each fiber will get its own OS thread,
- * and fiber suspend/resume will be emulated using OS thread primitives.
- * This is slow but allows for easier debugging in tricky cases
- * since the debugger won't be confused by fiber context switching. */
-#define VirtualFibersEnabled 0
-
 # define MaxFibers 4096
 StaticAssert(MaxFibers < I16Max);  /* MaxFibers should fit in FiberId */
 
@@ -730,7 +724,6 @@ StaticAssert(MaxFibers < I16Max);  /* MaxFibers should fit in FiberId */
 # endif
 #endif
 
-
 ////////////////////////////////
 //~ Exit callback types
 
diff --git a/src/base/base_job.h b/src/base/base_job.h
index 81a9830d..8072c5ac 100644
--- a/src/base/base_job.h
+++ b/src/base/base_job.h
@@ -39,6 +39,29 @@ Enum(JobPool)
 
 typedef void JobFunc(void *, i32);
 
+Enum(JobFlag)
+{
+    JobFlag_None        = 0,
+
+    /* A dedicated job is a heavyweight job that will receive its own OS
+     * thread and will never yield. When the fiber running the job suspends
+     * itself, the dedicated thread will perform a blocking wait rather than
+     * yielding the thread to another fiber. This is mainly useful for
+     * long-running dispatcher-esque jobs that block on OS primitives, since
+     * occupying a worker thread (and thereby preventing non-blocking jobs
+     * from running on that worker) is unwanted.
+     *
+     * For example, Win32 window message processing is required by the OS to
+     * occur on the same thread that initially created the window, which means
+     * it actually must run inside a dedicated job to prevent message processing
+     * from yielding & resuming on another thread. The message processing loop
+     * can block until messages are received from the OS without having to
+     * occupy a job worker while it blocks, and can then wake yielding
+     * jobs onto job worker pools based on the messages it received.
+     */
+    JobFlag_Dedicated   = (1 << 0),
+};
+
 Struct(Job)
 {
     /* Internal */
@@ -51,6 +74,7 @@ Struct(Job)
     JobPool pool;
 
     /* Configurable between OpenJob & CloseJob */
+    JobFlag flags;
     i32 count;
     Fence *fence;
     void *sig;
@@ -72,10 +96,10 @@ void ResumeFibers(i16 fiber_ids_count, i16 *fiber_ids);  /* NOTE: Must only be c
 
 #define EmptySig { i32 _; }
 
-#define JobDecl(job, sigdef)                                                                                \
-    typedef struct job##_Sig sigdef job##_Sig;                                                              \
-    Struct(job##_Desc) { JobFunc *func; JobPool pool; u32 count; Fence *fence; job##_Sig sig; };     \
-    void job(job##_Sig *, i32);                                                                             \
+#define JobDecl(job, sigdef)                                                                                        \
+    typedef struct job##_Sig sigdef job##_Sig;                                                                      \
+    Struct(job##_Desc) { JobFunc *func; JobPool pool; u32 count; Fence *fence; JobFlag flags; job##_Sig sig; };     \
+    void job(job##_Sig *, i32);                                                                                     \
     StaticAssert(1)
 
 #define JobDef(job, sig_arg, id_arg) void job(job##_Sig *sig_arg, i32 id_arg)
@@ -103,6 +127,7 @@ do {
     Job *__job = OpenJob(__desc.func, __desc.pool);                                                     \
     __job->count = __desc.count;                                                                        \
     __job->fence = __desc.fence;                                                                        \
+    __job->flags = __desc.flags;                                                                        \
     __job->sig = PushStructNoZero(__job->arena, job_func##_Sig);                                        \
     CopyBytes(__job->sig, &__desc.sig, sizeof(__desc.sig));                                             \
     CloseJob(__job);                                                                                    \
@@ -110,20 +135,3 @@ do {
 
 Job *OpenJob(JobFunc *func, JobPool pool_kind);
 u32 CloseJob(Job *job);
-
-////////////////////////////////
-//~ @hookdecl Dedicated job operations
-
-/* A dedicated job is a heavy weight job that will not operate inside of any
- * job pool. As such, it receives its own dedicated thread, and never yields to
- * other fibers. Instead of yielding when the fiber suspends, it performs a blocking
- * wait that puts the OS thread to sleep. This is mainly useful for
- * implementing long-running blocking dispatcher-like jobs tasks for subsystems.
- *
- * For example, Win32 window message processing is required by the OS to occur
- * on the same thread that initially created the window, which means it
- * actually must run inside a dedicated job to prevent message processing from
- * yielding & resuming on another thread.
- */
-
-void RunDedicatedJob(JobFunc job_func);
diff --git a/src/base/base_win32/base_win32_job.c b/src/base/base_win32/base_win32_job.c
index a4d47870..39b7737d 100644
--- a/src/base/base_win32/base_win32_job.c
+++ b/src/base/base_win32/base_win32_job.c
@@ -95,6 +95,10 @@ void InitJobSystem(void)
 ////////////////////////////////
 //~ Win32 thread
 
+JobDef(W32_DummyJob, sig, id)  /* No-op job; CloseJob substitutes it when a job is closed with count == 0 */
+{
+}
+
 DWORD WINAPI W32_Win32ThreadProc(LPVOID vt)
 {
     /* Convert thread to fiber */
@@ -141,15 +145,7 @@ W32_Thread *W32_StartThread(W32_ThreadFunc *entry_point, void *thread_udata, Str
     t->thread_udata = thread_udata;
     t->profiler_group = profiler_group;
 
-    t->handle = CreateThread(
-        0,
-        W32_FiberStackSize,
-        W32_Win32ThreadProc,
-        t,
-        0,
-        0
-    );
-
+    t->handle = CreateThread(0, W32_FiberStackSize, W32_Win32ThreadProc, t, 0, 0);
     if (!t->handle)
     {
         Panic(Lit("Failed to create thread"));
@@ -278,7 +274,7 @@ W32_Fiber *W32_AcquireFiber(W32_JobPool *pool)
         {
             __profn("CreateFiber");
             fiber->pool = pool->kind;
-#if VirtualFibersEnabled
+#if VIRTUAL_FIBERS
             fiber->addr = CreateThread(0, W32_FiberStackSize, W32_VirtualFiberEntryPoint, (void *)(i64)fiber_id, 0, 0);
 #else
             fiber->addr = CreateFiber(W32_FiberStackSize, W32_FiberEntryPoint, (void *)(i64)fiber_id);
@@ -289,7 +285,7 @@ W32_Fiber *W32_AcquireFiber(W32_JobPool *pool)
             /* Fiber is not a part of a job pool, convert thread to fiber */
             __profn("ConvertThreadToFiber");
             fiber->addr = ConvertThreadToFiber((void *)(i64)fiber_id);
-#if VirtualFibersEnabled
+#if VIRTUAL_FIBERS
             fiber->addr = GetCurrentThread();
 #endif
         }
@@ -319,7 +315,7 @@ ForceInline W32_Fiber *W32_FiberFromId(i16 id)
 
 void W32_SwitchToFiber(W32_Fiber *target)
 {
-#if VirtualFibersEnabled
+#if VIRTUAL_FIBERS
     W32_Fiber *self = W32_FiberFromId(FiberId());
     Atomic8Set(&self->virtual_yield, 1);
     /* Signal virtual target */
@@ -351,9 +347,9 @@ void W32_FiberEntryPoint(void *_)
     W32_JobPool *pool = &W32_shared_job_state.job_pools[fiber->pool];
     JobPool pool_kind = fiber->pool;
     char *fiber_name_cstr = fiber->name_cstr;
+    __prof_fiber_enter(fiber_name_cstr, PROF_THREAD_GROUP_FIBERS - Mebi(pool_kind) + Kibi(1) + fiber->id);
     for (;;)
     {
-        __prof_fiber_enter(fiber_name_cstr, PROF_THREAD_GROUP_FIBERS - Mebi(pool_kind) + Kibi(1) + fiber->id);
         W32_Task *task = fiber->task;
         Job *job = task->job;
 
@@ -542,26 +538,12 @@ void SuspendFiber(void)
     __prof;
     i16 fiber_id = FiberId();
     W32_Fiber *fiber = W32_FiberFromId(FiberId());
-    i16 return_id = fiber->return_id;
     __prof_fiber_leave();
-    if (return_id > 0)
     {
-        /* Suspend task fiber (return control flow to parent/worker fiber) */
         Atomic8Set(&fiber->status, W32_FiberStatus_Suspending);
-        W32_Fiber *parent_fiber = W32_FiberFromId(return_id);
+        W32_Fiber *parent_fiber = W32_FiberFromId(fiber->return_id);
         W32_SwitchToFiber(parent_fiber);
     }
-    else
-    {
-        /* Suspend dedicated fiber (block thread) */
-        Atomic8Set(&fiber->status, W32_FiberStatus_Suspended);
-        i8 status = W32_FiberStatus_Suspended;
-        while (status != W32_FiberStatus_None)
-        {
-            WaitOnAddress(&fiber->status, &status, sizeof(status), INFINITE);
-            status = Atomic8Fetch(&fiber->status);
-        }
-    }
     __prof_fiber_enter(fiber->name_cstr, PROF_THREAD_GROUP_FIBERS - Mebi(fiber->pool) + Kibi(1) + fiber->id);
 }
 
@@ -587,21 +569,21 @@ void ResumeFibers(i16 fiber_ids_count, i16 *fiber_ids)
         /* Update fiber status */
         Atomic8Set(&fiber->status, W32_FiberStatus_None);
 
-        i16 return_id = fiber->return_id;
-        if (return_id > 0)
+        W32_Task *task = fiber->task;
+        // if (task->job->flags & JobFlag_Dedicated)
+        if (0)
+        {
+            /* TODO: Wake dedicated fiber right now */
+            WakeByAddressSingle(&fiber->status);
+        }
+        else
         {
             /* Group task based on pool */
-            W32_Task *task = fiber->task;
             JobPool pool_kind = fiber->pool;
             W32_TaskList *pool_tasks = &tasks_by_pool[pool_kind];
             QueuePush(pool_tasks->first, pool_tasks->last, task);
             ++pool_tasks->count;
         }
-        else
-        {
-            /* Wake dedicated fiber right now */
-            WakeByAddressSingle(&fiber->status);
-        }
     }
 
     /* Submit tasks */
@@ -689,105 +671,99 @@ u32 CloseJob(Job *job)
 {
     TempArena scratch = BeginScratchNoConflict();
 
-    W32_JobPool *pool = &W32_shared_job_state.job_pools[job->pool];
+    JobPool pool_kind = job->pool;
+    W32_JobPool *pool = &W32_shared_job_state.job_pools[pool_kind];
     u32 num_tasks = job->count;
 
-    if (num_tasks > 0)
+    if (num_tasks == 0)
     {
-        /* Allocate tasks from free list */
-        u32 num_tasks_allocated = 0;
-        W32_Task **tasks_array = PushStructsNoZero(scratch.arena, W32_Task *, num_tasks);
+        Assert(0);
+        job->func = W32_DummyJob;
+        num_tasks = 1;
+    }
+
+    /* Allocate tasks from free list */
+    u32 num_tasks_allocated = 0;
+    W32_Task **tasks_array = PushStructsNoZero(scratch.arena, W32_Task *, num_tasks);
+    {
+        LockTicketMutex(&pool->free_tasks_tm);
         {
-            LockTicketMutex(&pool->free_tasks_tm);
+            while (num_tasks_allocated < num_tasks)
             {
-                while (num_tasks_allocated < num_tasks)
+                W32_Task *task = pool->first_free_task;
+                if (task)
                 {
-                    W32_Task *task = pool->first_free_task;
-                    if (task)
-                    {
-                        tasks_array[num_tasks_allocated++] = task;
-                        StackPop(pool->first_free_task);
-                    }
-                    else
-                    {
-                        break;
-                    }
-                }
-            }
-            UnlockTicketMutex(&pool->free_tasks_tm);
-        }
-
-        /* Allocate new tasks from memory */
-        u32 remaining = num_tasks - num_tasks_allocated;
-        if (remaining > 0)
-        {
-            Arena *perm = PermArena();
-            PushAlign(perm, CachelineSize);
-            W32_Task *pushed_tasks = PushStructsNoZero(perm, W32_Task, remaining);
-            for (u32 i = 0; i < remaining; ++i)
-            {
-                tasks_array[num_tasks_allocated + i] = &pushed_tasks[i];
-            }
-            num_tasks_allocated += remaining;
-            PushAlign(perm, CachelineSize);
-        }
-
-        /* Generate task list */
-        W32_TaskList tasks = ZI;
-        for (u32 i = 0; i < num_tasks; ++i)
-        {
-            W32_Task *task = tasks_array[i];
-            ZeroStruct(task);
-            task->job = job;
-            task->task_id = tasks.count++;
-            QueuePush(tasks.first, tasks.last, task);
-        }
-
-        /* Push tasks to back of pool */
-        {
-            LockTicketMutex(&pool->tasks_tm);
-            {
-                if (pool->last_task)
-                {
-                    pool->last_task->next = tasks.first;
+                    tasks_array[num_tasks_allocated++] = task;
+                    StackPop(pool->first_free_task);
                 }
                 else
                 {
-                    pool->first_task = tasks.first;
+                    break;
                 }
-                pool->last_task = tasks.last;
-                Atomic64FetchAdd(&pool->tasks_count.v, num_tasks);
-            }
-            UnlockTicketMutex(&pool->tasks_tm);
-        }
-
-        /* Wake workers */
-        if (num_tasks >= W32_WakeAllWorkersThreshold)
-        {
-            WakeByAddressAll(&pool->tasks_count);
-        }
-        else
-        {
-            for (u32 i = 0; i < num_tasks; ++i)
-            {
-                WakeByAddressSingle(&pool->tasks_count);
             }
         }
+        UnlockTicketMutex(&pool->free_tasks_tm);
     }
-    else if (job->fence)
+
+    /* Allocate new tasks from memory */
+    u32 remaining = num_tasks - num_tasks_allocated;
+    if (remaining > 0)
     {
-        FetchAddFence(job->fence, 1);
+        Arena *perm = PermArena();
+        PushAlign(perm, CachelineSize);
+        W32_Task *pushed_tasks = PushStructsNoZero(perm, W32_Task, remaining);
+        for (u32 i = 0; i < remaining; ++i)
+        {
+            tasks_array[num_tasks_allocated + i] = &pushed_tasks[i];
+        }
+        num_tasks_allocated += remaining;
+        PushAlign(perm, CachelineSize);
+    }
+
+    /* FIXME: Handle dedicated jobs separately */
+
+    /* Generate task list */
+    W32_TaskList tasks = ZI;
+    for (u32 i = 0; i < num_tasks; ++i)
+    {
+        W32_Task *task = tasks_array[i];
+        ZeroStruct(task);
+        task->job = job;
+        task->task_id = tasks.count++;
+        QueuePush(tasks.first, tasks.last, task);
+    }
+
+    /* Push tasks to back of pool */
+    {
+        LockTicketMutex(&pool->tasks_tm);
+        {
+            if (pool->last_task)
+            {
+                pool->last_task->next = tasks.first;
+            }
+            else
+            {
+                pool->first_task = tasks.first;
+            }
+            pool->last_task = tasks.last;
+            Atomic64FetchAdd(&pool->tasks_count.v, num_tasks);
+        }
+        UnlockTicketMutex(&pool->tasks_tm);
+    }
+
+    /* Wake workers */
+    if (num_tasks >= W32_WakeAllWorkersThreshold)
+    {
+        WakeByAddressAll(&pool->tasks_count);
+    }
+    else
+    {
+        for (u32 i = 0; i < num_tasks; ++i)
+        {
+            WakeByAddressSingle(&pool->tasks_count);
+        }
     }
 
     EndScratch(scratch);
     return 1;
 }
-
-////////////////////////////////
-//~ @hookdef Dedicated job operations
-
-void RunDedicatedJob(JobFunc job_func)
-{
-    /* TODO: Implement */
-    Assert(0);
-}
diff --git a/src/base/base_win32/base_win32_job.h b/src/base/base_win32/base_win32_job.h
index 7e015dc5..e7d6a78e 100644
--- a/src/base/base_win32/base_win32_job.h
+++ b/src/base/base_win32/base_win32_job.h
@@ -178,6 +178,7 @@ DWORD WINAPI W32_Win32ThreadProc(LPVOID vt);
 W32_Thread *W32_StartThread(W32_ThreadFunc *entry_point, void *thread_udata, String thread_name, i32 profiler_group);
 b32 W32_TryEndThread(W32_Thread *thread, f32 timeout_seconds);
 void W32_WaitEndThread(W32_Thread *thread);
+JobDecl(W32_DummyJob, EmptySig);
 
 ////////////////////////////////
 //~ Fiber operations
diff --git a/src/config.h b/src/config.h
index 14a472c5..ad4e4af4 100644
--- a/src/config.h
+++ b/src/config.h
@@ -69,6 +69,14 @@
 
 #define FLOOD_DEBUG 0
 
+#define GPU_DEBUG 1  /* If enabled, GPU_D12_InitDevice turns on the D3D12/DXGI debug layers and break-on-error, and panics when the debug interfaces are unavailable */
+
+/* If virtual fibers are enabled, each fiber will get its own OS thread,
+ * and fiber suspend/resume will be emulated using OS thread primitives.
+ * This is slow but allows for easier debugging in tricky cases
+ * since the debugger won't be confused by fiber context switching. */
+#define VIRTUAL_FIBERS 0
+
 /* If enabled, bitbuffs will insert/verify magic numbers & length for each read & write */
 #define BITBUFF_DEBUG 0
 #define BITBUFF_TEST RtcIsEnabled
diff --git a/src/gpu/gpu.h b/src/gpu/gpu.h
index 8581d29a..b0e0e53e 100644
--- a/src/gpu/gpu.h
+++ b/src/gpu/gpu.h
@@ -259,15 +259,6 @@ Struct(GPU_Scissor)
     f32 bottom;
 };
 
-////////////////////////////////
-//~ Fence types
-
-Struct(GPU_Fence)
-{
-    u64 targets[GPU_NumQueues];
-    u32 num_targets;
-};
-
 ////////////////////////////////
 //~ Memory info types
 
@@ -281,6 +272,11 @@ Struct(GPU_MemoryInfo)
 
 void GPU_Startup(void);
 
+////////////////////////////////
+//~ @hookdecl Fence operations
+
+Fence *GPU_FenceFromQueue(GPU_QueueKind queue_kind);  /* Fence that tracks the queue's completed submissions; compare against the value returned by GPU_EndCommandList */
+
 ////////////////////////////////
 //~ @hookdecl Rasterizer helpers
 
@@ -299,8 +295,8 @@ Vec2I32 GPU_GetTextureSize(GPU_Resource *resource);
 ////////////////////////////////
 //~ @hookdecl Command list operations
 
-GPU_CommandList *GPU_BeginCommandList(void);
-u32 GPU_EndCommandList(GPU_CommandList *cl, Fence *fence);
+GPU_CommandList *GPU_BeginCommandList(GPU_QueueKind queue_kind);
+u64 GPU_EndCommandList(GPU_CommandList *cl);  /* Returns the value that the queue's fence will be set to once the command is completed */
 
 ////////////////////////////////
 //~ @hookdecl Profiling helpers
diff --git a/src/gpu/gpu_dx12/gpu_dx12.c b/src/gpu/gpu_dx12/gpu_dx12.c
index 4a2ad868..8b45eda1 100644
--- a/src/gpu/gpu_dx12/gpu_dx12.c
+++ b/src/gpu/gpu_dx12/gpu_dx12.c
@@ -1,7 +1,5 @@
 GPU_D12_SharedState GPU_D12_shared_state = ZI;
 
-
-
 ////////////////////////////////
 //~ Helpers
 
@@ -53,18 +51,66 @@ u64 GPU_D12_ReuseHashFromResourceDesc(GPU_ResourceDesc desc)
 
 void GPU_D12_Startup(void)
 {
+    /* Init device */
     GPU_D12_InitDevice();
+
+    /* Init queues (one GPU_D12_InitQueue task per queue kind; each task indexes descs by its id, so descs must hold GPU_NumQueues entries) */
+    {
+        GPU_D12_QueueDesc descs[] = {
+            {.kind = GPU_QueueKind_Direct,          .d3d_type = D3D12_COMMAND_LIST_TYPE_DIRECT,     .d3d_priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL,    .dbg_name = Lit("Direct queue") },
+            {.kind = GPU_QueueKind_Compute,         .d3d_type = D3D12_COMMAND_LIST_TYPE_COMPUTE,    .d3d_priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL,    .dbg_name = Lit("Compute queue") },
+            {.kind = GPU_QueueKind_Copy,            .d3d_type = D3D12_COMMAND_LIST_TYPE_COPY,       .d3d_priority = D3D12_COMMAND_QUEUE_PRIORITY_HIGH,      .dbg_name = Lit("Copy queue") },
+            {.kind = GPU_QueueKind_BackgroundCopy,  .d3d_type = D3D12_COMMAND_LIST_TYPE_COPY,       .d3d_priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL,    .dbg_name = Lit("Background copy queue") }
+        };
+        u32 job_count = 0; Fence job_fence = ZI;
+        job_count += RunJob(GPU_D12_InitQueue, .count = GPU_NumQueues, .sig.descs = descs, .fence = &job_fence);
+        YieldOnFence(&job_fence, job_count);  /* descs is a stack array referenced by the jobs, so wait for them before leaving this scope */
+    }
+
+    /* Start queue sync job (it dereferences the queues created above) */
+    RunJob(GPU_D12_StartQueueSync, .pool = JobPool_Hyper, .flags = JobFlag_Dedicated);
 }
 
 ////////////////////////////////
-//~ Device initialization
+//~ Initialization
+
+//- Device initialization
 
 void GPU_D12_InitDevice(void)
 {
     GPU_D12_SharedState *g = &GPU_D12_shared_state;
     TempArena scratch = BeginScratchNoConflict();
     HRESULT hr = 0;
+
+    /* Enable debug layer */
     u32 dxgi_factory_flags = 0;
+#if GPU_DEBUG
+    {
+        __profn("Enable debug layer");
+        ID3D12Debug *debug_controller0 = 0;
+        hr = D3D12GetDebugInterface(&IID_ID3D12Debug, (void **)&debug_controller0);
+        if (FAILED(hr))
+        {
+            Panic(Lit("Failed to create ID3D12Debug0"));
+        }
+
+        ID3D12Debug1 *debug_controller1 = 0;
+        hr = ID3D12Debug_QueryInterface(debug_controller0, &IID_ID3D12Debug1, (void **)&debug_controller1);
+        if (FAILED(hr))
+        {
+            Panic(Lit("Failed to create ID3D12Debug1"));
+        }
+
+        ID3D12Debug_EnableDebugLayer(debug_controller0);
+
+        /* FIXME: Enable this */
+        //ID3D12Debug1_SetEnableGPUBasedValidation(debug_controller1, 1);
+
+        ID3D12Debug_Release(debug_controller1);
+        ID3D12Debug_Release(debug_controller0);
+        dxgi_factory_flags |= DXGI_CREATE_FACTORY_DEBUG;
+    }
+#endif
 
     /* Create factory */
     {
@@ -131,9 +177,74 @@ void GPU_D12_InitDevice(void)
         g->device = device;
     }
 
+#if GPU_DEBUG
+    /* Enable D3D12 Debug break */
+    {
+        __profn("Enable d3d12 debug break");
+        ID3D12InfoQueue *info = 0;
+        hr = ID3D12Device_QueryInterface(g->device, &IID_ID3D12InfoQueue, (void **)&info);
+        if (FAILED(hr))
+        {
+            Panic(Lit("Failed to query ID3D12Device interface"));
+        }
+        ID3D12InfoQueue_SetBreakOnSeverity(info, D3D12_MESSAGE_SEVERITY_CORRUPTION, 1);
+        ID3D12InfoQueue_SetBreakOnSeverity(info, D3D12_MESSAGE_SEVERITY_ERROR, 1);
+        ID3D12InfoQueue_Release(info);
+    }
+
+    /* Enable DXGI Debug break */
+    {
+        __profn("Enable dxgi debug break");
+        IDXGIInfoQueue *dxgi_info = 0;
+        hr = DXGIGetDebugInterface1(0, &IID_IDXGIInfoQueue, (void **)&dxgi_info);
+        if (FAILED(hr))
+        {
+            Panic(Lit("Failed to get DXGI debug interface"));
+        }
+        IDXGIInfoQueue_SetBreakOnSeverity(dxgi_info, DXGI_DEBUG_ALL, DXGI_INFO_QUEUE_MESSAGE_SEVERITY_CORRUPTION, 1);
+        IDXGIInfoQueue_SetBreakOnSeverity(dxgi_info, DXGI_DEBUG_ALL, DXGI_INFO_QUEUE_MESSAGE_SEVERITY_ERROR, 1);
+        IDXGIInfoQueue_Release(dxgi_info);
+    }
+#endif
+
     EndScratch(scratch);
 }
 
+//- Queue initialization
+
+JobDef(GPU_D12_InitQueue, sig, id)
+{
+    /* Creates the D3D12 command queue and submit fence described by
+     * sig->descs[id], then stores the queue in the shared state. */
+    GPU_D12_SharedState *g = &GPU_D12_shared_state;
+    GPU_D12_QueueDesc desc = sig->descs[id];
+
+    /* Allocate the queue, aligned to CachelineSize before and after */
+    Arena *perm = PermArena();
+    PushAlign(perm, CachelineSize);
+    GPU_D12_Queue *queue = PushStruct(perm, GPU_D12_Queue);
+    PushAlign(perm, CachelineSize);
+    queue->desc = desc;
+
+    /* Command queue */
+    D3D12_COMMAND_QUEUE_DESC cq_desc = ZI;
+    cq_desc.Type = desc.d3d_type;
+    cq_desc.Priority = desc.d3d_priority;
+    if (FAILED(ID3D12Device_CreateCommandQueue(g->device, &cq_desc, &IID_ID3D12CommandQueue, (void **)&queue->cq)))
+    {
+        Panic(Lit("Failed to create command queue"));
+    }
+
+    /* Submit fence (initial value 0) */
+    if (FAILED(ID3D12Device_CreateFence(g->device, 0, 0, &IID_ID3D12Fence, (void **)&queue->submit_fence)))
+    {
+        Panic(Lit("Failed to create command queue fence"));
+    }
+
+    /* Publish under the queue's kind */
+    g->queues[desc.kind] = queue;
+}
+
 ////////////////////////////////
 //~ Pipeline operations
 
@@ -148,8 +259,8 @@ GPU_D12_Pipeline *GPU_D12_PipelineFromDesc(GPU_D12_PipelineDesc desc)
 
 GPU_D12_Queue *GPU_D12_QueueFromKind(GPU_QueueKind kind)
 {
-    /* TODO */
-    return 0;
+    GPU_D12_SharedState *g = &GPU_D12_shared_state;
+    return g->queues[kind];  /* 0 until GPU_D12_InitQueue has stored a queue for this kind; kind is not range-checked */
 }
 
 ////////////////////////////////
@@ -190,13 +301,13 @@ GPU_D12_RawCommandList *GPU_D12_BeginRawCommandList(GPU_QueueKind queue_kind)
         }
         cl->queue = queue;
 
-        HRESULT hr = ID3D12Device_CreateCommandAllocator(g->device, queue->desc.type, &IID_ID3D12CommandAllocator, (void **)&cl->ca);
+        HRESULT hr = ID3D12Device_CreateCommandAllocator(g->device, queue->desc.d3d_type, &IID_ID3D12CommandAllocator, (void **)&cl->ca);
         if (FAILED(hr))
         {
             Panic(Lit("Failed to create command allocator"));
         }
 
-        hr = ID3D12Device_CreateCommandList(g->device, 0, queue->desc.type, cl->ca, 0, &IID_ID3D12GraphicsCommandList, (void **)&cl->cl);
+        hr = ID3D12Device_CreateCommandList(g->device, 0, queue->desc.d3d_type, cl->ca, 0, &IID_ID3D12GraphicsCommandList, (void **)&cl->cl);
         if (FAILED(hr))
         {
             Panic(Lit("Failed to create command list"));
@@ -227,7 +338,7 @@ GPU_D12_RawCommandList *GPU_D12_BeginRawCommandList(GPU_QueueKind queue_kind)
     return cl;
 }
 
-void GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl)
+u64 GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl)
 {
     GPU_D12_Queue *queue = cl->queue;
 
@@ -243,11 +354,12 @@ void GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl)
     }
 
     /* Submit */
+    u64 target = 0;
     {
         __profn("Execute");
         Lock lock = LockE(&queue->submit_mutex);
         {
-            u64 target = ++queue->submit_fence_target;
+            target = ++queue->submit_fence_target;
             cl->submit_fence_target = target;
             /* Execute */
             ID3D12CommandQueue_ExecuteCommandLists(queue->cq, 1, (ID3D12CommandList **)&cl->cl);
@@ -257,6 +369,38 @@ void GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl)
         }
         Unlock(&lock);
     }
+
+    return target;
+}
+
+////////////////////////////////
+//~ Queue sync job
+
+JobDef(GPU_D12_StartQueueSync, _, __)
+{
+    /* Mirrors each queue's completed D3D12 submit fence value into queue->sync_fence. Loops forever. */
+    HANDLE queue_fences_events[GPU_NumQueues] = ZI;
+    u64 queue_fences_seen[GPU_NumQueues] = ZI;
+    for (i32 i = 0; i < countof(queue_fences_events); ++i)
+    {
+        queue_fences_events[i] = CreateEvent(0, 0, 1, 0);  /* Auto-reset, initially signaled; armed below for the first submission so the job cannot stall once the initial signal is consumed */
+        ID3D12Fence_SetEventOnCompletion(GPU_D12_QueueFromKind((GPU_QueueKind)i)->submit_fence, 1, queue_fences_events[i]);
+    }
+    for (;;)
+    {
+        WaitForMultipleObjects(countof(queue_fences_events), queue_fences_events, 0, INFINITE);
+        for (GPU_QueueKind queue_kind = 0; queue_kind < GPU_NumQueues; ++queue_kind)
+        {
+            GPU_D12_Queue *queue = GPU_D12_QueueFromKind(queue_kind);
+            u64 completed = ID3D12Fence_GetCompletedValue(queue->submit_fence);
+            if (completed > queue_fences_seen[queue_kind])
+            {
+                SetFence(&queue->sync_fence, completed);
+                queue_fences_seen[queue_kind] = completed;
+                ID3D12Fence_SetEventOnCompletion(queue->submit_fence, completed + 1, queue_fences_events[queue_kind]);
+            }
+        }
+    }
 }
 
 ////////////////////////////////
@@ -267,6 +411,15 @@ void GPU_Startup(void)
     GPU_D12_Startup();
 }
 
+////////////////////////////////
+//~ @hookdef Fence hooks
+
+Fence *GPU_FenceFromQueue(GPU_QueueKind queue_kind)
+{
+    /* sync_fence is the fence advanced by the GPU_D12_StartQueueSync job */
+    return &GPU_D12_QueueFromKind(queue_kind)->sync_fence;
+}
+
 ////////////////////////////////
 //~ @hookdef Rasterizer helper hooks
 
@@ -398,7 +551,7 @@ GPU_Resource *GPU_AcquireResource(GPU_ResourceDesc desc)
                 d3d_desc.Alignment = 0;
                 d3d_desc.Width = desc.texture.size.x;
                 d3d_desc.Height = desc.texture.size.y;
-                d3d_desc.DepthOrArraySize = desc.texture.size.y;
+                d3d_desc.DepthOrArraySize = desc.texture.size.z;
                 d3d_desc.MipLevels = 1;
                 d3d_desc.SampleDesc.Count = 1;
                 d3d_desc.SampleDesc.Quality = 0;
@@ -465,7 +618,7 @@ Vec2I32 GPU_GetTextureSize(GPU_Resource *resource)
 ////////////////////////////////
 //~ @hookdef Command list hooks
 
-GPU_CommandList *GPU_BeginCommandList(void)
+GPU_CommandList *GPU_BeginCommandList(GPU_QueueKind queue_kind)
 {
     GPU_D12_FiberState *f = GPU_D12_FiberStateFromId(FiberId());
     Arena *perm = PermArena();
@@ -479,23 +632,16 @@ GPU_CommandList *GPU_BeginCommandList(void)
     {
         cl = PushStruct(perm, GPU_D12_CommandList);
     }
+    cl->queue_kind = queue_kind;
     return (GPU_CommandList *)cl;
 }
 
-u32 GPU_EndCommandList(GPU_CommandList *gpu_cl, Fence *fence)
+u64 GPU_EndCommandList(GPU_CommandList *gpu_cl)
 {
     GPU_D12_FiberState *f = GPU_D12_FiberStateFromId(FiberId());
     GPU_D12_CommandList *cl = (GPU_D12_CommandList *)gpu_cl;
-
-    /* Determine queue kind */
-#if 0
-    GPU_QueueKind queue_kind = GPU_QueueKind_Direct;
-#else
-    GPU_QueueKind queue_kind = GPU_QueueKind_BackgroundCopy;
-    for (GPU_D12_Command *cmd = cl->first; cmd; cmd = cmd->next)
-    {
-    }
-#endif
+    GPU_QueueKind queue_kind = cl->queue_kind;
+    GPU_D12_Queue *queue = GPU_D12_QueueFromKind(queue_kind);
 
     /* Begin dx12 command list */
     GPU_D12_RawCommandList *dx12_cl = GPU_D12_BeginRawCommandList(queue_kind);
@@ -654,7 +800,7 @@ u32 GPU_EndCommandList(GPU_CommandList *gpu_cl, Fence *fence)
     }
 
     /* End dx12 command list */
-    GPU_D12_EndRawCommandList(dx12_cl);
+    u64 fence_target = GPU_D12_EndRawCommandList(dx12_cl);
 
     /* Free commands */
     if (cl->last)
@@ -666,7 +812,7 @@ u32 GPU_EndCommandList(GPU_CommandList *gpu_cl, Fence *fence)
     /* Free command list */
     StackPush(f->first_free_command_list, cl);
 
-    return 1;
+    return fence_target;
 }
 
 ////////////////////////////////
diff --git a/src/gpu/gpu_dx12/gpu_dx12.h b/src/gpu/gpu_dx12/gpu_dx12.h
index 88930ce9..55c429e9 100644
--- a/src/gpu/gpu_dx12/gpu_dx12.h
+++ b/src/gpu/gpu_dx12/gpu_dx12.h
@@ -53,8 +53,9 @@ Struct(GPU_D12_Resource)
 
 Struct(GPU_D12_QueueDesc)
 {
-    enum D3D12_COMMAND_LIST_TYPE type;
-    enum D3D12_COMMAND_QUEUE_PRIORITY priority;
+    GPU_QueueKind kind;                         /* Slot in GPU_D12_SharedState.queues where the created queue is stored */
+    D3D12_COMMAND_LIST_TYPE d3d_type;           /* Type used for the command queue and for its command allocators/lists */
+    D3D12_COMMAND_QUEUE_PRIORITY d3d_priority;  /* Priority passed to CreateCommandQueue */
     String dbg_name;
 };
 
@@ -68,6 +69,8 @@ Struct(GPU_D12_Queue)
     u64 submit_fence_target;
     struct GPU_D12_RawCommandList *first_submitted_cl;
     struct GPU_D12_RawCommandList *last_submitted_cl;
+
+    Fence sync_fence;
 };
 
 ////////////////////////////////
@@ -163,6 +166,8 @@ Struct(GPU_D12_CommandList)
     GPU_D12_Command *first;
     GPU_D12_Command *last;
     u64 count;
+
+    GPU_QueueKind queue_kind;
 };
 
 ////////////////////////////////
@@ -189,6 +194,9 @@ Struct(GPU_D12_SharedState)
 {
     GPU_D12_FiberState *fiber_states[MaxFibers];
 
+    /* Queues */
+    GPU_D12_Queue *queues[GPU_NumQueues];
+
     /* Resources */
     Mutex free_resources_mutex;
     GPU_D12_Resource *first_free_resource;
@@ -213,10 +221,14 @@ u64 GPU_D12_ReuseHashFromResourceDesc(GPU_ResourceDesc desc);
 void GPU_D12_Startup(void);
 
 ////////////////////////////////
-//~ Device initialization
+//~ Initialization
 
+//- Device initialization
 void GPU_D12_InitDevice(void);
 
+//- Queue initialization
+JobDecl(GPU_D12_InitQueue, { GPU_D12_QueueDesc *descs; });
+
 ////////////////////////////////
 //~ Pipeline operations
 
@@ -231,4 +243,9 @@ GPU_D12_Queue *GPU_D12_QueueFromKind(GPU_QueueKind kind);
 //~ Raw command list operations
 
 GPU_D12_RawCommandList *GPU_D12_BeginRawCommandList(GPU_QueueKind queue_kind);
-void GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl);
+u64 GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl);
+
+////////////////////////////////
+//~ Sync job
+
+JobDecl(GPU_D12_StartQueueSync, EmptySig);
diff --git a/src/platform/platform_win32/platform_win32.c b/src/platform/platform_win32/platform_win32.c
index 030d8880..d79638a3 100644
--- a/src/platform/platform_win32/platform_win32.c
+++ b/src/platform/platform_win32/platform_win32.c
@@ -101,7 +101,7 @@ void P_Startup(void)
     g->socks_arena = AcquireArena(Gibi(64));
 
     //- Init timer
-    RunJob(P_W32_UpdateTimer);
+    RunJob(P_W32_StartTimerSync, .pool = JobPool_Hyper, .flags = JobFlag_Dedicated);
 }
 
 ////////////////////////////////
@@ -179,11 +179,11 @@ P_W32_Window *P_W32_AcquireWindow(void)
     window->event_arenas[0] = AcquireArena(Gibi(64));
     window->event_arenas[1] = AcquireArena(Gibi(64));
 
-    /* Start window event thread */
-    /* NOTE: This thread must finish building for the window to actually be
+    /* Start window event job */
+    /* NOTE: This job must finish starting for the window to actually be
      * created and receive a HWND, because on Windows a the event proc must run on
      * the same thread that created the window. */
-    window->window_thread = W32_StartThread(&P_W32_WindowThreadEntryFunc, window, Lit("Window thread"), PROF_THREAD_GROUP_WINDOW);
+    RunJob(P_W32_StartWindowMsgProcessing, .pool = JobPool_Hyper, .flags = JobFlag_Dedicated, .sig.window = window);
     YieldOnFence(&window->ready_fence, 1);
 
     return window;
@@ -195,7 +195,7 @@ void P_W32_ReleaseWindow(P_W32_Window *window)
     Atomic32Set(&window->shutdown, 1);
     P_W32_SharedState *g = &P_W32_shared_state;
     P_W32_WakeWindow(window);
-    W32_WaitEndThread(window->window_thread);
+    YieldOnFence(&window->finished_fence, 1);
 
     Lock lock = LockE(&g->windows_mutex);
     {
@@ -389,11 +389,11 @@ void P_W32_UpdateWindowFromSettings(P_W32_Window *window, P_WindowSettings *sett
 }
 
 ////////////////////////////////
-//~ Win32 window thread
+//~ Win32 window message processing
 
-W32_ThreadDef(P_W32_WindowThreadEntryFunc, arg)
+JobDef(P_W32_StartWindowMsgProcessing, sig, id)
 {
-    P_W32_Window *window = (P_W32_Window *)arg;
+    P_W32_Window *window = sig->window;
 
     /* Win32 limitation: Window must be initialized on same thread that processes events */
     window->hwnd = P_W32_InitWindow(window);
@@ -419,6 +419,7 @@ W32_ThreadDef(P_W32_WindowThreadEntryFunc, arg)
 
     /* Destroy window hwnd */
     DestroyWindow(window->hwnd);
+    SetFence(&window->finished_fence, 1);
 }
 
 void P_W32_ProcessWindowEvent(P_W32_Window *window, P_WindowEvent event)
@@ -867,7 +868,7 @@ P_Address P_W32_PlatformAddressFromWin32Address(P_W32_Address ws_addr)
 ////////////////////////////////
 //~ Timer job
 
-JobDef(P_W32_UpdateTimer, _, __)
+JobDef(P_W32_StartTimerSync, _, __)
 {
     P_W32_SharedState *g = &P_W32_shared_state;
     SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL);
diff --git a/src/platform/platform_win32/platform_win32.h b/src/platform/platform_win32/platform_win32.h
index 4c05d1d3..f43a9d85 100644
--- a/src/platform/platform_win32/platform_win32.h
+++ b/src/platform/platform_win32/platform_win32.h
@@ -42,6 +42,7 @@ Struct(P_W32_Window)
 
     HWND hwnd;
     Fence ready_fence;
+    Fence finished_fence;
 
     u16 utf16_high_surrogate_last_input;
 
@@ -67,8 +68,6 @@ Struct(P_W32_Window)
     i32 current_event_arena_index;
     Arena *event_arenas[2];
 
-    W32_Thread *window_thread;
-
     Atomic32 shutdown;
     P_W32_Window *next_free;
 };
@@ -160,12 +159,16 @@ P_W32_Window *P_W32_AcquireWindow(void);
 void P_W32_ReleaseWindow(P_W32_Window *window);
 HWND P_W32_InitWindow(P_W32_Window *window);
 
-//- Window settings
+////////////////////////////////
+//~ Window settings
+
 void P_W32_UpdateWindowFromSystem(P_W32_Window *window);
 void P_W32_UpdateWindowFromSettings(P_W32_Window *window, P_WindowSettings *settings);
 
-//- Window thread
-W32_ThreadDef(P_W32_WindowThreadEntryFunc, arg);
+////////////////////////////////
+//~ Window message processing
+
+JobDecl(P_W32_StartWindowMsgProcessing, { P_W32_Window *window; });
 void P_W32_ProcessWindowEvent(P_W32_Window *window, P_WindowEvent event);
 void P_W32_WakeWindow(P_W32_Window *window);
 LRESULT CALLBACK P_W32_Win32WindowProc(HWND hwnd, UINT msg, WPARAM wparam, LPARAM lparam);
@@ -180,4 +183,4 @@ P_Address P_W32_PlatformAddressFromWin32Address(P_W32_Address ws_addr);
 ////////////////////////////////
 //~ Timer job
 
-JobDecl(P_W32_UpdateTimer, EmptySig);
+JobDecl(P_W32_StartTimerSync, EmptySig);
diff --git a/src/pp/pp.c b/src/pp/pp.c
index a7aba144..455df07f 100644
--- a/src/pp/pp.c
+++ b/src/pp/pp.c
@@ -406,13 +406,12 @@ GPU_Resource *AcquireGbuffer(GPU_Format format, Vec2I32 size)
 GPU_Resource *AcquireUploadBuffer(u32 element_count, u32 element_size, void *src)
 {
     __prof;
-    u64 size = element_size * element_count;
     GPU_ResourceDesc desc = ZI;
     desc.kind = GPU_ResourceKind_Buffer;
     desc.flags = GPU_ResourceFlag_None;
     desc.buffer.heap_kind = GPU_HeapKind_Upload;
-    desc.buffer.element_size = size;
     desc.buffer.element_count = element_count;
+    desc.buffer.element_capacity = element_count;
     desc.buffer.element_size = element_size;
     GPU_Resource *r = GPU_AcquireResource(desc);
     {
@@ -2151,14 +2150,20 @@ void UpdateUser(P_Window *window)
 
     {
         __profn("Render");
+        GPU_QueueKind gpu_render_queue = GPU_QueueKind_Direct;
         Rect ui_viewport = RectFromVec2(VEC2(0, 0), VEC2(g->ui_size.x, g->ui_size.y));
         Rect render_viewport = RectFromVec2(VEC2(0, 0), VEC2(g->render_size.x, g->render_size.y));
 
+        if (!g->gpu_render_fence)
+        {
+            g->gpu_render_fence = GPU_FenceFromQueue(gpu_render_queue);
+        }
+
         /* Acquire gbuffers */
         if (g->shade_target && !EqVec2I32(g->render_size, GPU_GetTextureSize(g->shade_target)))
         {
             __profn("Release render resources");
-            YieldOnFence(&g->gpu_render_fence, g->gpu_render_fence_target);
+            YieldOnFence(g->gpu_render_fence, g->gpu_render_fence_target);
             GPU_ReleaseResource(g->albedo,                  GPU_ReleaseFlag_None);
             GPU_ReleaseResource(g->emittance,               GPU_ReleaseFlag_None);
             GPU_ReleaseResource(g->emittance_flood_read,    GPU_ReleaseFlag_None);
@@ -2181,7 +2186,7 @@ void UpdateUser(P_Window *window)
         /* Acquire ui buffers */
         if (g->ui_target && !EqVec2I32(g->ui_size, GPU_GetTextureSize(g->ui_target)))
         {
-            YieldOnFence(&g->gpu_render_fence, g->gpu_render_fence_target);
+            YieldOnFence(g->gpu_render_fence, g->gpu_render_fence_target);
             GPU_ReleaseResource(g->ui_target, GPU_ReleaseFlag_None);
             g->ui_target = 0;
         }
@@ -2200,7 +2205,7 @@ void UpdateUser(P_Window *window)
         GPU_Resource *ui_shape_indices_buffer = AcquireUploadBufferFromArena(g->ui_shape_indices_count, g->ui_shape_indices_arena);
         GPU_Resource *grids_buffer = AcquireUploadBufferFromArena(g->grids_count, g->grids_arena);
 
-        GPU_CommandList *cl = GPU_BeginCommandList();
+        GPU_CommandList *cl = GPU_BeginCommandList(gpu_render_queue);
         {
             __profn("Run render");
             GPU_ProfN(cl, Lit("Run render"));
@@ -2427,7 +2432,7 @@ void UpdateUser(P_Window *window)
                               GPU_RasterizeMode_TriangleList);
             }
         }
-        g->gpu_render_fence_target += GPU_EndCommandList(cl, &g->gpu_render_fence);
+        g->gpu_render_fence_target = GPU_EndCommandList(cl);
 
         /* Release transfer buffers */
         {
@@ -2444,7 +2449,7 @@ void UpdateUser(P_Window *window)
                 {
                     DelayReleaseGpuResources_Sig *sig = PushStruct(job->arena, DelayReleaseGpuResources_Sig);
                     job->count = countof(release_resources);
-                    sig->begin_fence = &g->gpu_render_fence;
+                    sig->begin_fence = g->gpu_render_fence;
                     sig->begin_fence_target = g->gpu_render_fence_target;
                     sig->resources = PushStructsNoZero(job->arena, GPU_Resource *, job->count);
                     sig->flags = GPU_ReleaseFlag_Reuse;
diff --git a/src/pp/pp.h b/src/pp/pp.h
index 752cca1d..6fde0995 100644
--- a/src/pp/pp.h
+++ b/src/pp/pp.h
@@ -195,7 +195,7 @@ Struct(SharedUserState)
     u32 ui_shape_indices_count;
     u32 grids_count;
 
-    Fence gpu_render_fence;
+    Fence *gpu_render_fence;
     u64 gpu_render_fence_target;
 
     //- Bind state