gpu refactor progress

2025-09-16 22:40:56 -05:00 · 2025-09-16 22:40:56 -05:00 · 34294754c7
commit 34294754c7
parent 4d3a5b7c3e
12 changed files with 363 additions and 209 deletions
--- a/src/base/base.h
+++ b/src/base/base.h
@ -713,12 +713,6 @@ Struct(ComputeShader)   { Resource resource; };
 ////////////////////////////////
 //~ Fibers

-/* If virtual fibers are enabled, each fiber will get its own OS thread,
- * and fiber suspend/resume will be emulated using OS thread primitives.
- * This is slow but allows for easier debugging in tricky cases
- * since the debugger won't be confused by fiber context switching. */
-#define VirtualFibersEnabled 0
-
 # define MaxFibers 4096
 StaticAssert(MaxFibers < I16Max);  /* MaxFibers should fit in FiberId */

@ -730,7 +724,6 @@ StaticAssert(MaxFibers < I16Max);  /* MaxFibers should fit in FiberId */
 # endif
 #endif

-
 ////////////////////////////////
 //~ Exit callback types

--- a/src/base/base_job.h
+++ b/src/base/base_job.h
@ -39,6 +39,29 @@ Enum(JobPool)

 typedef void JobFunc(void *, i32);

+Enum(JobFlag)  /* Bit flags carried in Job.flags (copied from the job desc's .flags by the RunJob macro) */
+{
+    JobFlag_None        = 0,  /* No flags set */
+
+    /* A dedicated job is a heavyweight job that will receive its own OS
+     * thread and will never yield. When the fiber running the job suspends
+     * itself, the dedicated thread will perform a blocking wait rather than
+     * yielding the thread to another fiber. This is mainly useful for
+     * long-running dispatcher-esque jobs that block on OS primitives, since
+     * occupying a worker thread (and thereby preventing non-blocking jobs from
+     * running on that worker) is unwanted.
+     *
+     * For example, Win32 window message processing is required by the OS to
+     * occur on the same thread that initially created the window, which means
+     * it actually must run inside a dedicated job to prevent message processing
+     * from yielding & resuming on another thread. The message processing loop
+     * can block until messages are received from the OS without having to
+     * occupy a job worker while it blocks, and can then wake yielding
+     * jobs onto job worker pools based on the messages it received.
+     */
+    JobFlag_Dedicated   = (1 << 0),  /* NOTE(review): CloseJob and ResumeFibers still carry FIXME/TODO markers for this flag, so the behavior described above looks not yet implemented */
+};
+
 Struct(Job)
 {
    /* Internal */
@ -51,6 +74,7 @@ Struct(Job)
    JobPool pool;

    /* Configurable between OpenJob & CloseJob */
+    JobFlag flags;
    i32 count;
    Fence *fence;
    void *sig;
@ -72,10 +96,10 @@ void ResumeFibers(i16 fiber_ids_count, i16 *fiber_ids);  /* NOTE: Must only be c

 #define EmptySig { i32 _; }

-#define JobDecl(job, sigdef)                                                                                \
-    typedef struct job##_Sig sigdef job##_Sig;                                                              \
-    Struct(job##_Desc) { JobFunc *func; JobPool pool; u32 count; Fence *fence; job##_Sig sig; };     \
-    void job(job##_Sig *, i32);                                                                             \
+#define JobDecl(job, sigdef)                                                                                        \
+    typedef struct job##_Sig sigdef job##_Sig;                                                                      \
+    Struct(job##_Desc) { JobFunc *func; JobPool pool; u32 count; Fence *fence; JobFlag flags; job##_Sig sig; };     \
+    void job(job##_Sig *, i32);                                                                                     \
    StaticAssert(1)

 #define JobDef(job, sig_arg, id_arg) void job(job##_Sig *sig_arg, i32 id_arg)
@ -103,6 +127,7 @@ do {
    Job *__job = OpenJob(__desc.func, __desc.pool);                                                     \
    __job->count = __desc.count;                                                                        \
    __job->fence = __desc.fence;                                                                        \
+    __job->flags = __desc.flags;                                                                        \
    __job->sig = PushStructNoZero(__job->arena, job_func##_Sig);                                        \
    CopyBytes(__job->sig, &__desc.sig, sizeof(__desc.sig));                                             \
    CloseJob(__job);                                                                                    \
@ -110,20 +135,3 @@ do {

 Job *OpenJob(JobFunc *func, JobPool pool_kind);
 u32 CloseJob(Job *job);
-
-////////////////////////////////
-//~ @hookdecl Dedicated job operations
-
-/* A dedicated job is a heavy weight job that will not operate inside of any
- * job pool. As such, it receives its own dedicated thread, and never yields to
- * other fibers. Instead of yielding when the fiber suspends, it performs a blocking
- * wait that puts the OS thread to sleep. This is mainly useful for
- * implementing long-running blocking dispatcher-like jobs tasks for subsystems.
- *
- * For example, Win32 window message processing is required by the OS to occur
- * on the same thread that initially created the window, which means it
- * actually must run inside a dedicated job to prevent message processing from
- * yielding & resuming on another thread.
- */
-
-void RunDedicatedJob(JobFunc job_func);
--- a/src/base/base_win32/base_win32_job.c
+++ b/src/base/base_win32/base_win32_job.c
@ -95,6 +95,10 @@ void InitJobSystem(void)
 ////////////////////////////////
 //~ Win32 thread

+JobDef(W32_DummyJob, sig, id)  /* Intentionally empty job; CloseJob installs it as the job function when a job is closed with a count of 0 */
+{
+}
+
 DWORD WINAPI W32_Win32ThreadProc(LPVOID vt)
 {
    /* Convert thread to fiber */
@ -141,15 +145,7 @@ W32_Thread *W32_StartThread(W32_ThreadFunc *entry_point, void *thread_udata, Str
    t->thread_udata = thread_udata;
    t->profiler_group = profiler_group;

-    t->handle = CreateThread(
-        0,
-        W32_FiberStackSize,
-        W32_Win32ThreadProc,
-        t,
-        0,
-        0
-    );
-
+    t->handle = CreateThread(0, W32_FiberStackSize, W32_Win32ThreadProc, t, 0, 0);
    if (!t->handle)
    {
        Panic(Lit("Failed to create thread"));
@ -278,7 +274,7 @@ W32_Fiber *W32_AcquireFiber(W32_JobPool *pool)
        {
            __profn("CreateFiber");
            fiber->pool = pool->kind;
-#if VirtualFibersEnabled
+#if VIRTUAL_FIBERS
            fiber->addr = CreateThread(0, W32_FiberStackSize, W32_VirtualFiberEntryPoint, (void *)(i64)fiber_id, 0, 0);
 #else
            fiber->addr = CreateFiber(W32_FiberStackSize, W32_FiberEntryPoint, (void *)(i64)fiber_id);
@ -289,7 +285,7 @@ W32_Fiber *W32_AcquireFiber(W32_JobPool *pool)
            /* Fiber is not a part of a job pool, convert thread to fiber */
            __profn("ConvertThreadToFiber");
            fiber->addr = ConvertThreadToFiber((void *)(i64)fiber_id);
-#if VirtualFibersEnabled
+#if VIRTUAL_FIBERS
            fiber->addr = GetCurrentThread();
 #endif
        }
@ -319,7 +315,7 @@ ForceInline W32_Fiber *W32_FiberFromId(i16 id)

 void W32_SwitchToFiber(W32_Fiber *target)
 {
-#if VirtualFibersEnabled
+#if VIRTUAL_FIBERS
    W32_Fiber *self = W32_FiberFromId(FiberId());
    Atomic8Set(&self->virtual_yield, 1);
    /* Signal virtual target */
@ -351,9 +347,9 @@ void W32_FiberEntryPoint(void *_)
    W32_JobPool *pool = &W32_shared_job_state.job_pools[fiber->pool];
    JobPool pool_kind = fiber->pool;
    char *fiber_name_cstr = fiber->name_cstr;
+    __prof_fiber_enter(fiber_name_cstr, PROF_THREAD_GROUP_FIBERS - Mebi(pool_kind) + Kibi(1) + fiber->id);
    for (;;)
    {
-        __prof_fiber_enter(fiber_name_cstr, PROF_THREAD_GROUP_FIBERS - Mebi(pool_kind) + Kibi(1) + fiber->id);
        W32_Task *task = fiber->task;
        Job *job = task->job;

@ -542,26 +538,12 @@ void SuspendFiber(void)
    __prof;
    i16 fiber_id = FiberId();
    W32_Fiber *fiber = W32_FiberFromId(FiberId());
-    i16 return_id = fiber->return_id;
    __prof_fiber_leave();
-    if (return_id > 0)
    {
-        /* Suspend task fiber (return control flow to parent/worker fiber) */
        Atomic8Set(&fiber->status, W32_FiberStatus_Suspending);
-        W32_Fiber *parent_fiber = W32_FiberFromId(return_id);
+        W32_Fiber *parent_fiber = W32_FiberFromId(fiber->return_id);
        W32_SwitchToFiber(parent_fiber);
    }
-    else
-    {
-        /* Suspend dedicated fiber (block thread) */
-        Atomic8Set(&fiber->status, W32_FiberStatus_Suspended);
-        i8 status = W32_FiberStatus_Suspended;
-        while (status != W32_FiberStatus_None)
-        {
-            WaitOnAddress(&fiber->status, &status, sizeof(status), INFINITE);
-            status = Atomic8Fetch(&fiber->status);
-        }
-    }
    __prof_fiber_enter(fiber->name_cstr, PROF_THREAD_GROUP_FIBERS - Mebi(fiber->pool) + Kibi(1) + fiber->id);
 }

@ -587,21 +569,21 @@ void ResumeFibers(i16 fiber_ids_count, i16 *fiber_ids)
        /* Update fiber status */
        Atomic8Set(&fiber->status, W32_FiberStatus_None);

-        i16 return_id = fiber->return_id;
-        if (return_id > 0)
+        W32_Task *task = fiber->task;
+        // if (task->job->flags & JobFlag_Dedicated)
+        if (0)
+        {
+            /* TODO: Wake dedicated fiber right now */
+            WakeByAddressSingle(&fiber->status);
+        }
+        else
        {
            /* Group task based on pool */
-            W32_Task *task = fiber->task;
            JobPool pool_kind = fiber->pool;
            W32_TaskList *pool_tasks = &tasks_by_pool[pool_kind];
            QueuePush(pool_tasks->first, pool_tasks->last, task);
            ++pool_tasks->count;
        }
-        else
-        {
-            /* Wake dedicated fiber right now */
-            WakeByAddressSingle(&fiber->status);
-        }
    }

    /* Submit tasks */
@ -689,105 +671,99 @@ u32 CloseJob(Job *job)
 {
    TempArena scratch = BeginScratchNoConflict();

-    W32_JobPool *pool = &W32_shared_job_state.job_pools[job->pool];
+    JobPool pool_kind = job->pool;
+    W32_JobPool *pool = &W32_shared_job_state.job_pools[pool_kind];
    u32 num_tasks = job->count;

-    if (num_tasks > 0)
+    if (num_tasks == 0)
    {
-        /* Allocate tasks from free list */
-        u32 num_tasks_allocated = 0;
-        W32_Task **tasks_array = PushStructsNoZero(scratch.arena, W32_Task *, num_tasks);
+        Assert(0);
+        job->func = W32_DummyJob;
+        num_tasks = 1;
+    }
+
+    /* Allocate tasks from free list */
+    u32 num_tasks_allocated = 0;
+    W32_Task **tasks_array = PushStructsNoZero(scratch.arena, W32_Task *, num_tasks);
+    {
+        LockTicketMutex(&pool->free_tasks_tm);
        {
-            LockTicketMutex(&pool->free_tasks_tm);
+            while (num_tasks_allocated < num_tasks)
            {
-                while (num_tasks_allocated < num_tasks)
+                W32_Task *task = pool->first_free_task;
+                if (task)
                {
-                    W32_Task *task = pool->first_free_task;
-                    if (task)
-                    {
-                        tasks_array[num_tasks_allocated++] = task;
-                        StackPop(pool->first_free_task);
-                    }
-                    else
-                    {
-                        break;
-                    }
-                }
-            }
-            UnlockTicketMutex(&pool->free_tasks_tm);
-        }
-
-        /* Allocate new tasks from memory */
-        u32 remaining = num_tasks - num_tasks_allocated;
-        if (remaining > 0)
-        {
-            Arena *perm = PermArena();
-            PushAlign(perm, CachelineSize);
-            W32_Task *pushed_tasks = PushStructsNoZero(perm, W32_Task, remaining);
-            for (u32 i = 0; i < remaining; ++i)
-            {
-                tasks_array[num_tasks_allocated + i] = &pushed_tasks[i];
-            }
-            num_tasks_allocated += remaining;
-            PushAlign(perm, CachelineSize);
-        }
-
-        /* Generate task list */
-        W32_TaskList tasks = ZI;
-        for (u32 i = 0; i < num_tasks; ++i)
-        {
-            W32_Task *task = tasks_array[i];
-            ZeroStruct(task);
-            task->job = job;
-            task->task_id = tasks.count++;
-            QueuePush(tasks.first, tasks.last, task);
-        }
-
-        /* Push tasks to back of pool */
-        {
-            LockTicketMutex(&pool->tasks_tm);
-            {
-                if (pool->last_task)
-                {
-                    pool->last_task->next = tasks.first;
+                    tasks_array[num_tasks_allocated++] = task;
+                    StackPop(pool->first_free_task);
                }
                else
                {
-                    pool->first_task = tasks.first;
+                    break;
                }
-                pool->last_task = tasks.last;
-                Atomic64FetchAdd(&pool->tasks_count.v, num_tasks);
-            }
-            UnlockTicketMutex(&pool->tasks_tm);
-        }
-
-        /* Wake workers */
-        if (num_tasks >= W32_WakeAllWorkersThreshold)
-        {
-            WakeByAddressAll(&pool->tasks_count);
-        }
-        else
-        {
-            for (u32 i = 0; i < num_tasks; ++i)
-            {
-                WakeByAddressSingle(&pool->tasks_count);
            }
        }
+        UnlockTicketMutex(&pool->free_tasks_tm);
    }
-    else if (job->fence)
+
+    /* Allocate new tasks from memory */
+    u32 remaining = num_tasks - num_tasks_allocated;
+    if (remaining > 0)
    {
-        FetchAddFence(job->fence, 1);
+        Arena *perm = PermArena();
+        PushAlign(perm, CachelineSize);
+        W32_Task *pushed_tasks = PushStructsNoZero(perm, W32_Task, remaining);
+        for (u32 i = 0; i < remaining; ++i)
+        {
+            tasks_array[num_tasks_allocated + i] = &pushed_tasks[i];
+        }
+        num_tasks_allocated += remaining;
+        PushAlign(perm, CachelineSize);
+    }
+
+    /* FIXME: Handle dedicated jobs separately */
+
+    /* Generate task list */
+    W32_TaskList tasks = ZI;
+    for (u32 i = 0; i < num_tasks; ++i)
+    {
+        W32_Task *task = tasks_array[i];
+        ZeroStruct(task);
+        task->job = job;
+        task->task_id = tasks.count++;
+        QueuePush(tasks.first, tasks.last, task);
+    }
+
+    /* Push tasks to back of pool */
+    {
+        LockTicketMutex(&pool->tasks_tm);
+        {
+            if (pool->last_task)
+            {
+                pool->last_task->next = tasks.first;
+            }
+            else
+            {
+                pool->first_task = tasks.first;
+            }
+            pool->last_task = tasks.last;
+            Atomic64FetchAdd(&pool->tasks_count.v, num_tasks);
+        }
+        UnlockTicketMutex(&pool->tasks_tm);
+    }
+
+    /* Wake workers */
+    if (num_tasks >= W32_WakeAllWorkersThreshold)
+    {
+        WakeByAddressAll(&pool->tasks_count);
+    }
+    else
+    {
+        for (u32 i = 0; i < num_tasks; ++i)
+        {
+            WakeByAddressSingle(&pool->tasks_count);
+        }
    }

    EndScratch(scratch);
    return 1;
 }
-
-////////////////////////////////
-//~ @hookdef Dedicated job operations
-
-void RunDedicatedJob(JobFunc job_func)
-{
-    /* TODO: Implement */
-    Assert(0);
-}
--- a/src/base/base_win32/base_win32_job.h
+++ b/src/base/base_win32/base_win32_job.h
@ -178,6 +178,7 @@ DWORD WINAPI W32_Win32ThreadProc(LPVOID vt);
 W32_Thread *W32_StartThread(W32_ThreadFunc *entry_point, void *thread_udata, String thread_name, i32 profiler_group);
 b32 W32_TryEndThread(W32_Thread *thread, f32 timeout_seconds);
 void W32_WaitEndThread(W32_Thread *thread);
+JobDecl(W32_DummyJob, EmptySig);

 ////////////////////////////////
 //~ Fiber operations
--- a/src/config.h
+++ b/src/config.h
@ -69,6 +69,14 @@

 #define FLOOD_DEBUG 0

+#define GPU_DEBUG 1
+
+/* If virtual fibers are enabled, each fiber will get its own OS thread,
+ * and fiber suspend/resume will be emulated using OS thread primitives.
+ * This is slow but allows for easier debugging in tricky cases
+ * since the debugger won't be confused by fiber context switching. */
+#define VIRTUAL_FIBERS 0
+
 /* If enabled, bitbuffs will insert/verify magic numbers & length for each read & write */
 #define BITBUFF_DEBUG 0
 #define BITBUFF_TEST RtcIsEnabled
--- a/src/gpu/gpu.h
+++ b/src/gpu/gpu.h
@ -259,15 +259,6 @@ Struct(GPU_Scissor)
    f32 bottom;
 };

-////////////////////////////////
-//~ Fence types
-
-Struct(GPU_Fence)
-{
-    u64 targets[GPU_NumQueues];
-    u32 num_targets;
-};
-
 ////////////////////////////////
 //~ Memory info types

@ -281,6 +272,11 @@ Struct(GPU_MemoryInfo)

 void GPU_Startup(void);

+////////////////////////////////
+//~ @hookdecl Fence operations
+
+Fence *GPU_FenceFromQueue(GPU_QueueKind queue_kind);
+
 ////////////////////////////////
 //~ @hookdecl Rasterizer helpers

@ -299,8 +295,8 @@ Vec2I32 GPU_GetTextureSize(GPU_Resource *resource);
 ////////////////////////////////
 //~ @hookdecl Command list operations

-GPU_CommandList *GPU_BeginCommandList(void);
-u32 GPU_EndCommandList(GPU_CommandList *cl, Fence *fence);
+GPU_CommandList *GPU_BeginCommandList(GPU_QueueKind queue_kind);
+u64 GPU_EndCommandList(GPU_CommandList *cl);  /* Returns the value that the queue's fence will be set to once the command is completed */

 ////////////////////////////////
 //~ @hookdecl Profiling helpers
--- a/src/gpu/gpu_dx12/gpu_dx12.c
+++ b/src/gpu/gpu_dx12/gpu_dx12.c
@ -1,7 +1,5 @@
 GPU_D12_SharedState GPU_D12_shared_state = ZI;

-
-
 ////////////////////////////////
 //~ Helpers

@ -53,18 +51,66 @@ u64 GPU_D12_ReuseHashFromResourceDesc(GPU_ResourceDesc desc)

 void GPU_D12_Startup(void)  /* Brings up the D3D12 backend in order: device, command queues, then the queue sync job */
 {
+    /* Init device */
     GPU_D12_InitDevice();
+
+    /* Init queues */
+    {
+        GPU_D12_QueueDesc descs[] = {
+            {.kind = GPU_QueueKind_Direct,          .d3d_type = D3D12_COMMAND_LIST_TYPE_DIRECT,     .d3d_priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL,    .dbg_name = Lit("Direct queue") },
+            {.kind = GPU_QueueKind_Compute,         .d3d_type = D3D12_COMMAND_LIST_TYPE_COMPUTE,    .d3d_priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL,    .dbg_name = Lit("Compute queue") },
+            {.kind = GPU_QueueKind_Copy,            .d3d_type = D3D12_COMMAND_LIST_TYPE_COPY,       .d3d_priority = D3D12_COMMAND_QUEUE_PRIORITY_HIGH,      .dbg_name = Lit("Copy queue") },
+            {.kind = GPU_QueueKind_BackgroundCopy,  .d3d_type = D3D12_COMMAND_LIST_TYPE_COPY,       .d3d_priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL,    .dbg_name = Lit("Background copy queue") }
+        };
+        u32 job_count = 0; Fence job_fence = ZI;
+        job_count += RunJob(GPU_D12_InitQueue, .count = GPU_NumQueues, .sig.descs = descs, .fence = &job_fence);  /* Each task reads descs[id]; NOTE(review): .count is GPU_NumQueues while descs has 4 entries - confirm they always match */
+        YieldOnFence(&job_fence, job_count);  /* Presumably waits for the init jobs to finish; descs lives on this stack frame and is passed to the jobs by pointer */
+    }
+
+    /* Start queue sync job */
+    RunJob(GPU_D12_StartQueueSync, .pool = JobPool_Hyper, .flags = JobFlag_Dedicated);  /* Started only after the queues exist, since the sync job dereferences every queue */
 }

 ////////////////////////////////
-//~ Device initialization
+//~ Initialization
+
+//- Device initialization

 void GPU_D12_InitDevice(void)
 {
    GPU_D12_SharedState *g = &GPU_D12_shared_state;
    TempArena scratch = BeginScratchNoConflict();
    HRESULT hr = 0;
+
+    /* Enable debug layer */
    u32 dxgi_factory_flags = 0;
+#if GPU_DEBUG
+    {
+        __profn("Enable debug layer");
+        ID3D12Debug *debug_controller0 = 0;
+        hr = D3D12GetDebugInterface(&IID_ID3D12Debug, (void **)&debug_controller0);
+        if (FAILED(hr))
+        {
+            Panic(Lit("Failed to create ID3D12Debug0"));
+        }
+
+        ID3D12Debug1 *debug_controller1 = 0;
+        hr = ID3D12Debug_QueryInterface(debug_controller0, &IID_ID3D12Debug1, (void **)&debug_controller1);
+        if (FAILED(hr))
+        {
+            Panic(Lit("Failed to create ID3D12Debug1"));
+        }
+
+        ID3D12Debug_EnableDebugLayer(debug_controller0);
+
+        /* FIXME: Enable this */
+        //ID3D12Debug1_SetEnableGPUBasedValidation(debug_controller1, 1);
+
+        ID3D12Debug_Release(debug_controller1);
+        ID3D12Debug_Release(debug_controller0);
+        dxgi_factory_flags |= DXGI_CREATE_FACTORY_DEBUG;
+    }
+#endif

    /* Create factory */
    {
@ -131,9 +177,74 @@ void GPU_D12_InitDevice(void)
        g->device = device;
    }

+#if GPU_DEBUG
+    /* Enable D3D12 Debug break */
+    {
+        __profn("Enable d3d12 debug break");
+        ID3D12InfoQueue *info = 0;
+        hr = ID3D12Device_QueryInterface(g->device, &IID_ID3D12InfoQueue, (void **)&info);
+        if (FAILED(hr))
+        {
+            Panic(Lit("Failed to query ID3D12Device interface"));
+        }
+        ID3D12InfoQueue_SetBreakOnSeverity(info, D3D12_MESSAGE_SEVERITY_CORRUPTION, 1);
+        ID3D12InfoQueue_SetBreakOnSeverity(info, D3D12_MESSAGE_SEVERITY_ERROR, 1);
+        ID3D12InfoQueue_Release(info);
+    }
+
+    /* Enable DXGI Debug break */
+    {
+        __profn("Enable dxgi debug break");
+        IDXGIInfoQueue *dxgi_info = 0;
+        hr = DXGIGetDebugInterface1(0, &IID_IDXGIInfoQueue, (void **)&dxgi_info);
+        if (FAILED(hr))
+        {
+            Panic(Lit("Failed to get DXGI debug interface"));
+        }
+        IDXGIInfoQueue_SetBreakOnSeverity(dxgi_info, DXGI_DEBUG_ALL, DXGI_INFO_QUEUE_MESSAGE_SEVERITY_CORRUPTION, 1);
+        IDXGIInfoQueue_SetBreakOnSeverity(dxgi_info, DXGI_DEBUG_ALL, DXGI_INFO_QUEUE_MESSAGE_SEVERITY_ERROR, 1);
+        IDXGIInfoQueue_Release(dxgi_info);
+    }
+#endif
+
    EndScratch(scratch);
 }

+//- Queue initialization
+
+JobDef(GPU_D12_InitQueue, sig, id)  /* Creates the command queue and submit fence described by sig->descs[id] and registers the result in the shared state */
+{
+    GPU_D12_SharedState *g = &GPU_D12_shared_state;
+    GPU_D12_QueueDesc desc = sig->descs[id];  /* Each task handles the desc at its own task index */
+    Arena *perm = PermArena();
+    HRESULT hr = 0;
+
+    GPU_D12_Queue *queue = 0;
+    {
+        PushAlign(perm, CachelineSize);  /* Align to CachelineSize on both sides of the queue struct (presumably to avoid false sharing) */
+        queue = PushStruct(perm, GPU_D12_Queue);
+        PushAlign(perm, CachelineSize);
+    }
+    queue->desc = desc;  /* NOTE(review): desc.dbg_name is stored here but not applied to the D3D12 objects in this function */
+
+    D3D12_COMMAND_QUEUE_DESC d3d_desc = ZI;  /* Only Type and Priority are assigned below; other fields keep their ZI initializer */
+    d3d_desc.Type = desc.d3d_type;
+    d3d_desc.Priority = desc.d3d_priority;
+    hr = ID3D12Device_CreateCommandQueue(g->device, &d3d_desc, &IID_ID3D12CommandQueue, (void **)&queue->cq);
+    if (FAILED(hr))
+    {
+        Panic(Lit("Failed to create command queue"));
+    }
+
+    hr = ID3D12Device_CreateFence(g->device, 0, 0, &IID_ID3D12Fence, (void **)&queue->submit_fence);  /* Initial fence value 0, no flags */
+    if (FAILED(hr))
+    {
+        Panic(Lit("Failed to create command queue fence"));
+    }
+
+    g->queues[desc.kind] = queue;  /* Publish the queue in the slot that GPU_D12_QueueFromKind reads */
+}
+
 ////////////////////////////////
 //~ Pipeline operations

@ -148,8 +259,8 @@ GPU_D12_Pipeline *GPU_D12_PipelineFromDesc(GPU_D12_PipelineDesc desc)

 GPU_D12_Queue *GPU_D12_QueueFromKind(GPU_QueueKind kind)
 {
-    /* TODO */
-    return 0;
+    GPU_D12_SharedState *g = &GPU_D12_shared_state;
+    return g->queues[kind];
 }

 ////////////////////////////////
@ -190,13 +301,13 @@ GPU_D12_RawCommandList *GPU_D12_BeginRawCommandList(GPU_QueueKind queue_kind)
        }
        cl->queue = queue;

-        HRESULT hr = ID3D12Device_CreateCommandAllocator(g->device, queue->desc.type, &IID_ID3D12CommandAllocator, (void **)&cl->ca);
+        HRESULT hr = ID3D12Device_CreateCommandAllocator(g->device, queue->desc.d3d_type, &IID_ID3D12CommandAllocator, (void **)&cl->ca);
        if (FAILED(hr))
        {
            Panic(Lit("Failed to create command allocator"));
        }

-        hr = ID3D12Device_CreateCommandList(g->device, 0, queue->desc.type, cl->ca, 0, &IID_ID3D12GraphicsCommandList, (void **)&cl->cl);
+        hr = ID3D12Device_CreateCommandList(g->device, 0, queue->desc.d3d_type, cl->ca, 0, &IID_ID3D12GraphicsCommandList, (void **)&cl->cl);
        if (FAILED(hr))
        {
            Panic(Lit("Failed to create command list"));
@ -227,7 +338,7 @@ GPU_D12_RawCommandList *GPU_D12_BeginRawCommandList(GPU_QueueKind queue_kind)
    return cl;
 }

-void GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl)
+u64 GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl)
 {
    GPU_D12_Queue *queue = cl->queue;

@ -243,11 +354,12 @@ void GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl)
    }

    /* Submit */
+    u64 target = 0;
    {
        __profn("Execute");
        Lock lock = LockE(&queue->submit_mutex);
        {
-            u64 target = ++queue->submit_fence_target;
+            target = ++queue->submit_fence_target;
            cl->submit_fence_target = target;
            /* Execute */
            ID3D12CommandQueue_ExecuteCommandLists(queue->cq, 1, (ID3D12CommandList **)&cl->cl);
@ -257,6 +369,38 @@ void GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl)
        }
        Unlock(&lock);
    }
+
+    return target;
+}
+
+////////////////////////////////
+//~ Queue sync job
+
+JobDef(GPU_D12_StartQueueSync, _, __)  /* Never returns: forwards each queue's completed submit_fence value to its sync_fence */
+{
+    HANDLE queue_fences_events[GPU_NumQueues] = ZI;  /* One event per queue, indexed by GPU_QueueKind */
+    u64 queue_fences_seen[GPU_NumQueues] = ZI;  /* Last completed value already published per queue */
+    for (i32 i = 0; i < countof(queue_fences_events); ++i)
+    {
+        queue_fences_events[i] = CreateEvent(0, 0, 1, 0);  /* Auto-reset event, initially signaled */
+        ID3D12Fence_SetEventOnCompletion(GPU_D12_QueueFromKind((GPU_QueueKind)i)->submit_fence, 1, queue_fences_events[i]);  /* Arm the first completion (fences start at 0); the loop below only re-arms after progress, so without this the wait blocks forever once the initial signals are consumed */
+    }
+    for (;;)
+    {
+        WaitForMultipleObjects(countof(queue_fences_events), queue_fences_events, 0, INFINITE);  /* Wait-any: wake when at least one queue's event is signaled */
+        for (GPU_QueueKind queue_kind = 0; queue_kind < GPU_NumQueues; ++queue_kind)
+        {
+            GPU_D12_Queue *queue = GPU_D12_QueueFromKind(queue_kind);
+            u64 last_seen = queue_fences_seen[queue_kind];
+            u64 completed = ID3D12Fence_GetCompletedValue(queue->submit_fence);
+            if (completed > last_seen)
+            {
+                SetFence(&queue->sync_fence, completed);  /* Publish progress to the fence returned by GPU_FenceFromQueue */
+                queue_fences_seen[queue_kind] = completed;
+                ID3D12Fence_SetEventOnCompletion(queue->submit_fence, completed + 1, queue_fences_events[queue_kind]);  /* Re-arm for the next value */
+            }
+        }
+    }
+}

 ////////////////////////////////
@ -267,6 +411,15 @@ void GPU_Startup(void)
    GPU_D12_Startup();
 }

+////////////////////////////////
+//~ @hookdef Fence hooks
+
+Fence *GPU_FenceFromQueue(GPU_QueueKind queue_kind)  /* Returns the address of the sync_fence embedded in the queue registered for queue_kind */
+{
+    GPU_D12_Queue *queue = GPU_D12_QueueFromKind(queue_kind);
+    return &queue->sync_fence;  /* NOTE(review): no null check - assumes the queue for this kind has already been created */
+}
+
 ////////////////////////////////
 //~ @hookdef Rasterizer helper hooks

@ -398,7 +551,7 @@ GPU_Resource *GPU_AcquireResource(GPU_ResourceDesc desc)
                d3d_desc.Alignment = 0;
                d3d_desc.Width = desc.texture.size.x;
                d3d_desc.Height = desc.texture.size.y;
-                d3d_desc.DepthOrArraySize = desc.texture.size.y;
+                d3d_desc.DepthOrArraySize = desc.texture.size.z;
                d3d_desc.MipLevels = 1;
                d3d_desc.SampleDesc.Count = 1;
                d3d_desc.SampleDesc.Quality = 0;
@ -465,7 +618,7 @@ Vec2I32 GPU_GetTextureSize(GPU_Resource *resource)
 ////////////////////////////////
 //~ @hookdef Command list hooks

-GPU_CommandList *GPU_BeginCommandList(void)
+GPU_CommandList *GPU_BeginCommandList(GPU_QueueKind queue_kind)
 {
    GPU_D12_FiberState *f = GPU_D12_FiberStateFromId(FiberId());
    Arena *perm = PermArena();
@ -479,23 +632,16 @@ GPU_CommandList *GPU_BeginCommandList(void)
    {
        cl = PushStruct(perm, GPU_D12_CommandList);
    }
+    cl->queue_kind = queue_kind;
    return (GPU_CommandList *)cl;
 }

-u32 GPU_EndCommandList(GPU_CommandList *gpu_cl, Fence *fence)
+u64 GPU_EndCommandList(GPU_CommandList *gpu_cl)
 {
    GPU_D12_FiberState *f = GPU_D12_FiberStateFromId(FiberId());
    GPU_D12_CommandList *cl = (GPU_D12_CommandList *)gpu_cl;
-
-    /* Determine queue kind */
-#if 0
-    GPU_QueueKind queue_kind = GPU_QueueKind_Direct;
-#else
-    GPU_QueueKind queue_kind = GPU_QueueKind_BackgroundCopy;
-    for (GPU_D12_Command *cmd = cl->first; cmd; cmd = cmd->next)
-    {
-    }
-#endif
+    GPU_QueueKind queue_kind = cl->queue_kind;
+    GPU_D12_Queue *queue = GPU_D12_QueueFromKind(queue_kind);

    /* Begin dx12 command list */
    GPU_D12_RawCommandList *dx12_cl = GPU_D12_BeginRawCommandList(queue_kind);
@ -654,7 +800,7 @@ u32 GPU_EndCommandList(GPU_CommandList *gpu_cl, Fence *fence)
    }

    /* End dx12 command list */
-    GPU_D12_EndRawCommandList(dx12_cl);
+    u64 fence_target = GPU_D12_EndRawCommandList(dx12_cl);

    /* Free commands */
    if (cl->last)
@ -666,7 +812,7 @@ u32 GPU_EndCommandList(GPU_CommandList *gpu_cl, Fence *fence)
    /* Free command list */
    StackPush(f->first_free_command_list, cl);

-    return 1;
+    return fence_target;
 }

 ////////////////////////////////
--- a/src/gpu/gpu_dx12/gpu_dx12.h
+++ b/src/gpu/gpu_dx12/gpu_dx12.h
@ -53,8 +53,9 @@ Struct(GPU_D12_Resource)

 Struct(GPU_D12_QueueDesc)
 {
-    enum D3D12_COMMAND_LIST_TYPE type;
-    enum D3D12_COMMAND_QUEUE_PRIORITY priority;
+    GPU_QueueKind kind;
+    D3D12_COMMAND_LIST_TYPE d3d_type;
+    D3D12_COMMAND_QUEUE_PRIORITY d3d_priority;
    String dbg_name;
 };

@ -68,6 +69,8 @@ Struct(GPU_D12_Queue)
    u64 submit_fence_target;
    struct GPU_D12_RawCommandList *first_submitted_cl;
    struct GPU_D12_RawCommandList *last_submitted_cl;
+
+    Fence sync_fence;
 };

 ////////////////////////////////
@ -163,6 +166,8 @@ Struct(GPU_D12_CommandList)
    GPU_D12_Command *first;
    GPU_D12_Command *last;
    u64 count;
+
+    GPU_QueueKind queue_kind;
 };

 ////////////////////////////////
@ -189,6 +194,9 @@ Struct(GPU_D12_SharedState)
 {
    GPU_D12_FiberState *fiber_states[MaxFibers];

+    /* Queues */
+    GPU_D12_Queue *queues[GPU_NumQueues];
+
    /* Resources */
    Mutex free_resources_mutex;
    GPU_D12_Resource *first_free_resource;
@ -213,10 +221,14 @@ u64 GPU_D12_ReuseHashFromResourceDesc(GPU_ResourceDesc desc);
 void GPU_D12_Startup(void);

 ////////////////////////////////
-//~ Device initialization
+//~ Initialization

+//- Device initialization
 void GPU_D12_InitDevice(void);

+//- Queue initialization
+JobDecl(GPU_D12_InitQueue, { GPU_D12_QueueDesc *descs; });
+
 ////////////////////////////////
 //~ Pipeline operations

@ -231,4 +243,9 @@ GPU_D12_Queue *GPU_D12_QueueFromKind(GPU_QueueKind kind);
 //~ Raw command list operations

 GPU_D12_RawCommandList *GPU_D12_BeginRawCommandList(GPU_QueueKind queue_kind);
-void GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl);
+u64 GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl);
+
+////////////////////////////////
+//~ Sync job
+
+JobDecl(GPU_D12_StartQueueSync, EmptySig);
--- a/src/platform/platform_win32/platform_win32.c
+++ b/src/platform/platform_win32/platform_win32.c
@ -101,7 +101,7 @@ void P_Startup(void)
    g->socks_arena = AcquireArena(Gibi(64));

    //- Init timer
-    RunJob(P_W32_UpdateTimer);
+    RunJob(P_W32_StartTimerSync, .pool = JobPool_Hyper, .flags = JobFlag_Dedicated);
 }

 ////////////////////////////////
@ -179,11 +179,11 @@ P_W32_Window *P_W32_AcquireWindow(void)
    window->event_arenas[0] = AcquireArena(Gibi(64));
    window->event_arenas[1] = AcquireArena(Gibi(64));

-    /* Start window event thread */
-    /* NOTE: This thread must finish building for the window to actually be
+    /* Start window event job */
+    /* NOTE: This job must finish starting for the window to actually be
     * created and receive a HWND, because on Windows a the event proc must run on
     * the same thread that created the window. */
-    window->window_thread = W32_StartThread(&P_W32_WindowThreadEntryFunc, window, Lit("Window thread"), PROF_THREAD_GROUP_WINDOW);
+    RunJob(P_W32_StartWindowMsgProcessing, .pool = JobPool_Hyper, .flags = JobFlag_Dedicated, .sig.window = window);
    YieldOnFence(&window->ready_fence, 1);

    return window;
@ -195,7 +195,7 @@ void P_W32_ReleaseWindow(P_W32_Window *window)
    Atomic32Set(&window->shutdown, 1);
    P_W32_SharedState *g = &P_W32_shared_state;
    P_W32_WakeWindow(window);
-    W32_WaitEndThread(window->window_thread);
+    YieldOnFence(&window->finished_fence, 1);

    Lock lock = LockE(&g->windows_mutex);
    {
@ -389,11 +389,11 @@ void P_W32_UpdateWindowFromSettings(P_W32_Window *window, P_WindowSettings *sett
 }

 ////////////////////////////////
-//~ Win32 window thread
+//~ Win32 window message processing

-W32_ThreadDef(P_W32_WindowThreadEntryFunc, arg)
+JobDef(P_W32_StartWindowMsgProcessing, sig, id)
 {
-    P_W32_Window *window = (P_W32_Window *)arg;
+    P_W32_Window *window = sig->window;

    /* Win32 limitation: Window must be initialized on same thread that processes events */
    window->hwnd = P_W32_InitWindow(window);
@ -419,6 +419,7 @@ W32_ThreadDef(P_W32_WindowThreadEntryFunc, arg)

    /* Destroy window hwnd */
    DestroyWindow(window->hwnd);
+    SetFence(&window->finished_fence, 1);
 }

 void P_W32_ProcessWindowEvent(P_W32_Window *window, P_WindowEvent event)
@ -867,7 +868,7 @@ P_Address P_W32_PlatformAddressFromWin32Address(P_W32_Address ws_addr)
 ////////////////////////////////
 //~ Timer job

-JobDef(P_W32_UpdateTimer, _, __)
+JobDef(P_W32_StartTimerSync, _, __)
 {
    P_W32_SharedState *g = &P_W32_shared_state;
    SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL);
--- a/src/platform/platform_win32/platform_win32.h
+++ b/src/platform/platform_win32/platform_win32.h
@ -42,6 +42,7 @@ Struct(P_W32_Window)

    HWND hwnd;
    Fence ready_fence;
+    Fence finished_fence;

    u16 utf16_high_surrogate_last_input;

@ -67,8 +68,6 @@ Struct(P_W32_Window)
    i32 current_event_arena_index;
    Arena *event_arenas[2];

-    W32_Thread *window_thread;
-
    Atomic32 shutdown;
    P_W32_Window *next_free;
 };
@ -160,12 +159,16 @@ P_W32_Window *P_W32_AcquireWindow(void);
 void P_W32_ReleaseWindow(P_W32_Window *window);
 HWND P_W32_InitWindow(P_W32_Window *window);

-//- Window settings
+////////////////////////////////
+//~ Window settings
+
 void P_W32_UpdateWindowFromSystem(P_W32_Window *window);
 void P_W32_UpdateWindowFromSettings(P_W32_Window *window, P_WindowSettings *settings);

-//- Window thread
-W32_ThreadDef(P_W32_WindowThreadEntryFunc, arg);
+////////////////////////////////
+//~ Window message processing
+
+JobDecl(P_W32_StartWindowMsgProcessing, { P_W32_Window *window; });
 void P_W32_ProcessWindowEvent(P_W32_Window *window, P_WindowEvent event);
 void P_W32_WakeWindow(P_W32_Window *window);
 LRESULT CALLBACK P_W32_Win32WindowProc(HWND hwnd, UINT msg, WPARAM wparam, LPARAM lparam);
@ -180,4 +183,4 @@ P_Address P_W32_PlatformAddressFromWin32Address(P_W32_Address ws_addr);
 ////////////////////////////////
 //~ Timer job

-JobDecl(P_W32_UpdateTimer, EmptySig);
+JobDecl(P_W32_StartTimerSync, EmptySig);
--- a/src/pp/pp.c
+++ b/src/pp/pp.c
@ -406,13 +406,12 @@ GPU_Resource *AcquireGbuffer(GPU_Format format, Vec2I32 size)
 GPU_Resource *AcquireUploadBuffer(u32 element_count, u32 element_size, void *src)
 {
    __prof;
-    u64 size = element_size * element_count;
    GPU_ResourceDesc desc = ZI;
    desc.kind = GPU_ResourceKind_Buffer;
    desc.flags = GPU_ResourceFlag_None;
    desc.buffer.heap_kind = GPU_HeapKind_Upload;
-    desc.buffer.element_size = size;
    desc.buffer.element_count = element_count;
+    desc.buffer.element_capacity = element_count;
    desc.buffer.element_size = element_size;
    GPU_Resource *r = GPU_AcquireResource(desc);
    {
@ -2151,14 +2150,20 @@ void UpdateUser(P_Window *window)

    {
        __profn("Render");
+        GPU_QueueKind gpu_render_queue = GPU_QueueKind_Direct;
        Rect ui_viewport = RectFromVec2(VEC2(0, 0), VEC2(g->ui_size.x, g->ui_size.y));
        Rect render_viewport = RectFromVec2(VEC2(0, 0), VEC2(g->render_size.x, g->render_size.y));

+        if (!g->gpu_render_fence)
+        {
+            g->gpu_render_fence = GPU_FenceFromQueue(gpu_render_queue);
+        }
+
        /* Acquire gbuffers */
        if (g->shade_target && !EqVec2I32(g->render_size, GPU_GetTextureSize(g->shade_target)))
        {
            __profn("Release render resources");
-            YieldOnFence(&g->gpu_render_fence, g->gpu_render_fence_target);
+            YieldOnFence(g->gpu_render_fence, g->gpu_render_fence_target);
            GPU_ReleaseResource(g->albedo,                  GPU_ReleaseFlag_None);
            GPU_ReleaseResource(g->emittance,               GPU_ReleaseFlag_None);
            GPU_ReleaseResource(g->emittance_flood_read,    GPU_ReleaseFlag_None);
@ -2181,7 +2186,7 @@ void UpdateUser(P_Window *window)
        /* Acquire ui buffers */
        if (g->ui_target && !EqVec2I32(g->ui_size, GPU_GetTextureSize(g->ui_target)))
        {
-            YieldOnFence(&g->gpu_render_fence, g->gpu_render_fence_target);
+            YieldOnFence(g->gpu_render_fence, g->gpu_render_fence_target);
            GPU_ReleaseResource(g->ui_target, GPU_ReleaseFlag_None);
            g->ui_target = 0;
        }
@ -2200,7 +2205,7 @@ void UpdateUser(P_Window *window)
        GPU_Resource *ui_shape_indices_buffer = AcquireUploadBufferFromArena(g->ui_shape_indices_count, g->ui_shape_indices_arena);
        GPU_Resource *grids_buffer = AcquireUploadBufferFromArena(g->grids_count, g->grids_arena);

-        GPU_CommandList *cl = GPU_BeginCommandList();
+        GPU_CommandList *cl = GPU_BeginCommandList(gpu_render_queue);
        {
            __profn("Run render");
            GPU_ProfN(cl, Lit("Run render"));
@ -2427,7 +2432,7 @@ void UpdateUser(P_Window *window)
                              GPU_RasterizeMode_TriangleList);
            }
        }
-        g->gpu_render_fence_target += GPU_EndCommandList(cl, &g->gpu_render_fence);
+        g->gpu_render_fence_target = GPU_EndCommandList(cl);

        /* Release transfer buffers */
        {
@ -2444,7 +2449,7 @@ void UpdateUser(P_Window *window)
                {
                    DelayReleaseGpuResources_Sig *sig = PushStruct(job->arena, DelayReleaseGpuResources_Sig);
                    job->count = countof(release_resources);
-                    sig->begin_fence = &g->gpu_render_fence;
+                    sig->begin_fence = g->gpu_render_fence;
                    sig->begin_fence_target = g->gpu_render_fence_target;
                    sig->resources = PushStructsNoZero(job->arena, GPU_Resource *, job->count);
                    sig->flags = GPU_ReleaseFlag_Reuse;
--- a/src/pp/pp.h
+++ b/src/pp/pp.h
@ -195,7 +195,7 @@ Struct(SharedUserState)
    u32 ui_shape_indices_count;
    u32 grids_count;

-    Fence gpu_render_fence;
+    Fence *gpu_render_fence;
    u64 gpu_render_fence_target;

    //- Bind state