gpu refactor progress

This commit is contained in:
jacob 2025-09-16 22:40:56 -05:00
parent 4d3a5b7c3e
commit 34294754c7
12 changed files with 363 additions and 209 deletions

View File

@ -713,12 +713,6 @@ Struct(ComputeShader) { Resource resource; };
//////////////////////////////// ////////////////////////////////
//~ Fibers //~ Fibers
/* If virtual fibers are enabled, each fiber will get its own OS thread,
* and fiber suspend/resume will be emulated using OS thread primitives.
* This is slow but allows for easier debugging in tricky cases
* since the debugger won't be confused by fiber context switching. */
#define VirtualFibersEnabled 0
# define MaxFibers 4096 # define MaxFibers 4096
StaticAssert(MaxFibers < I16Max); /* MaxFibers should fit in FiberId */ StaticAssert(MaxFibers < I16Max); /* MaxFibers should fit in FiberId */
@ -730,7 +724,6 @@ StaticAssert(MaxFibers < I16Max); /* MaxFibers should fit in FiberId */
# endif # endif
#endif #endif
//////////////////////////////// ////////////////////////////////
//~ Exit callback types //~ Exit callback types

View File

@ -39,6 +39,29 @@ Enum(JobPool)
typedef void JobFunc(void *, i32); typedef void JobFunc(void *, i32);
Enum(JobFlag)
{
JobFlag_None = 0,
/* A dedicated job is a heavy weight job that will receive its own OS
* thread and will never yield. When the fiber running the job suspends
* itself, the dedicated thread will perform a blocking wait rather than
* yielding the thread to another fiber. This is mainly useful for long-running
* dispatcher-esque jobs that block on OS primitives, since occupying a
* worker thread (and thereby preventing non-blocking jobs from running on
* that worker) is unwanted.
*
* For example, Win32 window message processing is required by the OS to
* occur on the same thread that initially created the window, which means
* it actually must run inside a dedicated job to prevent message processing
* from yielding & resuming on another thread. The message processing loop
* can block until messages are received from the OS without having to
* occupy a job worker while it blocks, and can then wake yielding
* jobs onto job worker pools based on the messages it received.
*/
JobFlag_Dedicated = (1 << 0),
};
Struct(Job) Struct(Job)
{ {
/* Internal */ /* Internal */
@ -51,6 +74,7 @@ Struct(Job)
JobPool pool; JobPool pool;
/* Configurable between OpenJob & CloseJob */ /* Configurable between OpenJob & CloseJob */
JobFlag flags;
i32 count; i32 count;
Fence *fence; Fence *fence;
void *sig; void *sig;
@ -74,7 +98,7 @@ void ResumeFibers(i16 fiber_ids_count, i16 *fiber_ids); /* NOTE: Must only be c
#define JobDecl(job, sigdef) \ #define JobDecl(job, sigdef) \
typedef struct job##_Sig sigdef job##_Sig; \ typedef struct job##_Sig sigdef job##_Sig; \
Struct(job##_Desc) { JobFunc *func; JobPool pool; u32 count; Fence *fence; job##_Sig sig; }; \ Struct(job##_Desc) { JobFunc *func; JobPool pool; u32 count; Fence *fence; JobFlag flags; job##_Sig sig; }; \
void job(job##_Sig *, i32); \ void job(job##_Sig *, i32); \
StaticAssert(1) StaticAssert(1)
@ -103,6 +127,7 @@ do {
Job *__job = OpenJob(__desc.func, __desc.pool); \ Job *__job = OpenJob(__desc.func, __desc.pool); \
__job->count = __desc.count; \ __job->count = __desc.count; \
__job->fence = __desc.fence; \ __job->fence = __desc.fence; \
__job->flags = __desc.flags; \
__job->sig = PushStructNoZero(__job->arena, job_func##_Sig); \ __job->sig = PushStructNoZero(__job->arena, job_func##_Sig); \
CopyBytes(__job->sig, &__desc.sig, sizeof(__desc.sig)); \ CopyBytes(__job->sig, &__desc.sig, sizeof(__desc.sig)); \
CloseJob(__job); \ CloseJob(__job); \
@ -110,20 +135,3 @@ do {
Job *OpenJob(JobFunc *func, JobPool pool_kind); Job *OpenJob(JobFunc *func, JobPool pool_kind);
u32 CloseJob(Job *job); u32 CloseJob(Job *job);
////////////////////////////////
//~ @hookdecl Dedicated job operations
/* A dedicated job is a heavy weight job that will not operate inside of any
* job pool. As such, it receives its own dedicated thread, and never yields to
* other fibers. Instead of yielding when the fiber suspends, it performs a blocking
* wait that puts the OS thread to sleep. This is mainly useful for
* implementing long-running blocking dispatcher-like tasks for subsystems.
*
* For example, Win32 window message processing is required by the OS to occur
* on the same thread that initially created the window, which means it
* actually must run inside a dedicated job to prevent message processing from
* yielding & resuming on another thread.
*/
void RunDedicatedJob(JobFunc job_func);

View File

@ -95,6 +95,10 @@ void InitJobSystem(void)
//////////////////////////////// ////////////////////////////////
//~ Win32 thread //~ Win32 thread
JobDef(W32_DummyJob, sig, id)
{
}
DWORD WINAPI W32_Win32ThreadProc(LPVOID vt) DWORD WINAPI W32_Win32ThreadProc(LPVOID vt)
{ {
/* Convert thread to fiber */ /* Convert thread to fiber */
@ -141,15 +145,7 @@ W32_Thread *W32_StartThread(W32_ThreadFunc *entry_point, void *thread_udata, Str
t->thread_udata = thread_udata; t->thread_udata = thread_udata;
t->profiler_group = profiler_group; t->profiler_group = profiler_group;
t->handle = CreateThread( t->handle = CreateThread(0, W32_FiberStackSize, W32_Win32ThreadProc, t, 0, 0);
0,
W32_FiberStackSize,
W32_Win32ThreadProc,
t,
0,
0
);
if (!t->handle) if (!t->handle)
{ {
Panic(Lit("Failed to create thread")); Panic(Lit("Failed to create thread"));
@ -278,7 +274,7 @@ W32_Fiber *W32_AcquireFiber(W32_JobPool *pool)
{ {
__profn("CreateFiber"); __profn("CreateFiber");
fiber->pool = pool->kind; fiber->pool = pool->kind;
#if VirtualFibersEnabled #if VIRTUAL_FIBERS
fiber->addr = CreateThread(0, W32_FiberStackSize, W32_VirtualFiberEntryPoint, (void *)(i64)fiber_id, 0, 0); fiber->addr = CreateThread(0, W32_FiberStackSize, W32_VirtualFiberEntryPoint, (void *)(i64)fiber_id, 0, 0);
#else #else
fiber->addr = CreateFiber(W32_FiberStackSize, W32_FiberEntryPoint, (void *)(i64)fiber_id); fiber->addr = CreateFiber(W32_FiberStackSize, W32_FiberEntryPoint, (void *)(i64)fiber_id);
@ -289,7 +285,7 @@ W32_Fiber *W32_AcquireFiber(W32_JobPool *pool)
/* Fiber is not a part of a job pool, convert thread to fiber */ /* Fiber is not a part of a job pool, convert thread to fiber */
__profn("ConvertThreadToFiber"); __profn("ConvertThreadToFiber");
fiber->addr = ConvertThreadToFiber((void *)(i64)fiber_id); fiber->addr = ConvertThreadToFiber((void *)(i64)fiber_id);
#if VirtualFibersEnabled #if VIRTUAL_FIBERS
fiber->addr = GetCurrentThread(); fiber->addr = GetCurrentThread();
#endif #endif
} }
@ -319,7 +315,7 @@ ForceInline W32_Fiber *W32_FiberFromId(i16 id)
void W32_SwitchToFiber(W32_Fiber *target) void W32_SwitchToFiber(W32_Fiber *target)
{ {
#if VirtualFibersEnabled #if VIRTUAL_FIBERS
W32_Fiber *self = W32_FiberFromId(FiberId()); W32_Fiber *self = W32_FiberFromId(FiberId());
Atomic8Set(&self->virtual_yield, 1); Atomic8Set(&self->virtual_yield, 1);
/* Signal virtual target */ /* Signal virtual target */
@ -351,9 +347,9 @@ void W32_FiberEntryPoint(void *_)
W32_JobPool *pool = &W32_shared_job_state.job_pools[fiber->pool]; W32_JobPool *pool = &W32_shared_job_state.job_pools[fiber->pool];
JobPool pool_kind = fiber->pool; JobPool pool_kind = fiber->pool;
char *fiber_name_cstr = fiber->name_cstr; char *fiber_name_cstr = fiber->name_cstr;
__prof_fiber_enter(fiber_name_cstr, PROF_THREAD_GROUP_FIBERS - Mebi(pool_kind) + Kibi(1) + fiber->id);
for (;;) for (;;)
{ {
__prof_fiber_enter(fiber_name_cstr, PROF_THREAD_GROUP_FIBERS - Mebi(pool_kind) + Kibi(1) + fiber->id);
W32_Task *task = fiber->task; W32_Task *task = fiber->task;
Job *job = task->job; Job *job = task->job;
@ -542,26 +538,12 @@ void SuspendFiber(void)
__prof; __prof;
i16 fiber_id = FiberId(); i16 fiber_id = FiberId();
W32_Fiber *fiber = W32_FiberFromId(FiberId()); W32_Fiber *fiber = W32_FiberFromId(FiberId());
i16 return_id = fiber->return_id;
__prof_fiber_leave(); __prof_fiber_leave();
if (return_id > 0)
{ {
/* Suspend task fiber (return control flow to parent/worker fiber) */
Atomic8Set(&fiber->status, W32_FiberStatus_Suspending); Atomic8Set(&fiber->status, W32_FiberStatus_Suspending);
W32_Fiber *parent_fiber = W32_FiberFromId(return_id); W32_Fiber *parent_fiber = W32_FiberFromId(fiber->return_id);
W32_SwitchToFiber(parent_fiber); W32_SwitchToFiber(parent_fiber);
} }
else
{
/* Suspend dedicated fiber (block thread) */
Atomic8Set(&fiber->status, W32_FiberStatus_Suspended);
i8 status = W32_FiberStatus_Suspended;
while (status != W32_FiberStatus_None)
{
WaitOnAddress(&fiber->status, &status, sizeof(status), INFINITE);
status = Atomic8Fetch(&fiber->status);
}
}
__prof_fiber_enter(fiber->name_cstr, PROF_THREAD_GROUP_FIBERS - Mebi(fiber->pool) + Kibi(1) + fiber->id); __prof_fiber_enter(fiber->name_cstr, PROF_THREAD_GROUP_FIBERS - Mebi(fiber->pool) + Kibi(1) + fiber->id);
} }
@ -587,21 +569,21 @@ void ResumeFibers(i16 fiber_ids_count, i16 *fiber_ids)
/* Update fiber status */ /* Update fiber status */
Atomic8Set(&fiber->status, W32_FiberStatus_None); Atomic8Set(&fiber->status, W32_FiberStatus_None);
i16 return_id = fiber->return_id; W32_Task *task = fiber->task;
if (return_id > 0) // if (task->job->flags & JobFlag_Dedicated)
if (0)
{
/* TODO: Wake dedicated fiber right now */
WakeByAddressSingle(&fiber->status);
}
else
{ {
/* Group task based on pool */ /* Group task based on pool */
W32_Task *task = fiber->task;
JobPool pool_kind = fiber->pool; JobPool pool_kind = fiber->pool;
W32_TaskList *pool_tasks = &tasks_by_pool[pool_kind]; W32_TaskList *pool_tasks = &tasks_by_pool[pool_kind];
QueuePush(pool_tasks->first, pool_tasks->last, task); QueuePush(pool_tasks->first, pool_tasks->last, task);
++pool_tasks->count; ++pool_tasks->count;
} }
else
{
/* Wake dedicated fiber right now */
WakeByAddressSingle(&fiber->status);
}
} }
/* Submit tasks */ /* Submit tasks */
@ -689,11 +671,17 @@ u32 CloseJob(Job *job)
{ {
TempArena scratch = BeginScratchNoConflict(); TempArena scratch = BeginScratchNoConflict();
W32_JobPool *pool = &W32_shared_job_state.job_pools[job->pool]; JobPool pool_kind = job->pool;
W32_JobPool *pool = &W32_shared_job_state.job_pools[pool_kind];
u32 num_tasks = job->count; u32 num_tasks = job->count;
if (num_tasks > 0) if (num_tasks == 0)
{ {
Assert(0);
job->func = W32_DummyJob;
num_tasks = 1;
}
/* Allocate tasks from free list */ /* Allocate tasks from free list */
u32 num_tasks_allocated = 0; u32 num_tasks_allocated = 0;
W32_Task **tasks_array = PushStructsNoZero(scratch.arena, W32_Task *, num_tasks); W32_Task **tasks_array = PushStructsNoZero(scratch.arena, W32_Task *, num_tasks);
@ -732,6 +720,8 @@ u32 CloseJob(Job *job)
PushAlign(perm, CachelineSize); PushAlign(perm, CachelineSize);
} }
/* FIXME: Handle dedicated jobs separately */
/* Generate task list */ /* Generate task list */
W32_TaskList tasks = ZI; W32_TaskList tasks = ZI;
for (u32 i = 0; i < num_tasks; ++i) for (u32 i = 0; i < num_tasks; ++i)
@ -773,21 +763,7 @@ u32 CloseJob(Job *job)
WakeByAddressSingle(&pool->tasks_count); WakeByAddressSingle(&pool->tasks_count);
} }
} }
}
else if (job->fence)
{
FetchAddFence(job->fence, 1);
}
EndScratch(scratch); EndScratch(scratch);
return 1; return 1;
} }
////////////////////////////////
//~ @hookdef Dedicated job operations
void RunDedicatedJob(JobFunc job_func)
{
/* TODO: Implement */
Assert(0);
}

View File

@ -178,6 +178,7 @@ DWORD WINAPI W32_Win32ThreadProc(LPVOID vt);
W32_Thread *W32_StartThread(W32_ThreadFunc *entry_point, void *thread_udata, String thread_name, i32 profiler_group); W32_Thread *W32_StartThread(W32_ThreadFunc *entry_point, void *thread_udata, String thread_name, i32 profiler_group);
b32 W32_TryEndThread(W32_Thread *thread, f32 timeout_seconds); b32 W32_TryEndThread(W32_Thread *thread, f32 timeout_seconds);
void W32_WaitEndThread(W32_Thread *thread); void W32_WaitEndThread(W32_Thread *thread);
JobDecl(W32_DummyJob, EmptySig);
//////////////////////////////// ////////////////////////////////
//~ Fiber operations //~ Fiber operations

View File

@ -69,6 +69,14 @@
#define FLOOD_DEBUG 0 #define FLOOD_DEBUG 0
#define GPU_DEBUG 1
/* If virtual fibers are enabled, each fiber will get its own OS thread,
* and fiber suspend/resume will be emulated using OS thread primitives.
* This is slow but allows for easier debugging in tricky cases
* since the debugger won't be confused by fiber context switching. */
#define VIRTUAL_FIBERS 0
/* If enabled, bitbuffs will insert/verify magic numbers & length for each read & write */ /* If enabled, bitbuffs will insert/verify magic numbers & length for each read & write */
#define BITBUFF_DEBUG 0 #define BITBUFF_DEBUG 0
#define BITBUFF_TEST RtcIsEnabled #define BITBUFF_TEST RtcIsEnabled

View File

@ -259,15 +259,6 @@ Struct(GPU_Scissor)
f32 bottom; f32 bottom;
}; };
////////////////////////////////
//~ Fence types
Struct(GPU_Fence)
{
u64 targets[GPU_NumQueues];
u32 num_targets;
};
//////////////////////////////// ////////////////////////////////
//~ Memory info types //~ Memory info types
@ -281,6 +272,11 @@ Struct(GPU_MemoryInfo)
void GPU_Startup(void); void GPU_Startup(void);
////////////////////////////////
//~ @hookdecl Fence operations
Fence *GPU_FenceFromQueue(GPU_QueueKind queue_kind);
//////////////////////////////// ////////////////////////////////
//~ @hookdecl Rasterizer helpers //~ @hookdecl Rasterizer helpers
@ -299,8 +295,8 @@ Vec2I32 GPU_GetTextureSize(GPU_Resource *resource);
//////////////////////////////// ////////////////////////////////
//~ @hookdecl Command list operations //~ @hookdecl Command list operations
GPU_CommandList *GPU_BeginCommandList(void); GPU_CommandList *GPU_BeginCommandList(GPU_QueueKind queue_kind);
u32 GPU_EndCommandList(GPU_CommandList *cl, Fence *fence); u64 GPU_EndCommandList(GPU_CommandList *cl); /* Returns the value that the queue's fence will be set to once the command is completed */
//////////////////////////////// ////////////////////////////////
//~ @hookdecl Profiling helpers //~ @hookdecl Profiling helpers

View File

@ -1,7 +1,5 @@
GPU_D12_SharedState GPU_D12_shared_state = ZI; GPU_D12_SharedState GPU_D12_shared_state = ZI;
//////////////////////////////// ////////////////////////////////
//~ Helpers //~ Helpers
@ -53,18 +51,66 @@ u64 GPU_D12_ReuseHashFromResourceDesc(GPU_ResourceDesc desc)
void GPU_D12_Startup(void) void GPU_D12_Startup(void)
{ {
/* Init device */
GPU_D12_InitDevice(); GPU_D12_InitDevice();
/* Init queues */
{
GPU_D12_QueueDesc descs[] = {
{.kind = GPU_QueueKind_Direct, .d3d_type = D3D12_COMMAND_LIST_TYPE_DIRECT, .d3d_priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, .dbg_name = Lit("Direct queue") },
{.kind = GPU_QueueKind_Compute, .d3d_type = D3D12_COMMAND_LIST_TYPE_COMPUTE, .d3d_priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, .dbg_name = Lit("Compute queue") },
{.kind = GPU_QueueKind_Copy, .d3d_type = D3D12_COMMAND_LIST_TYPE_COPY, .d3d_priority = D3D12_COMMAND_QUEUE_PRIORITY_HIGH, .dbg_name = Lit("Copy queue") },
{.kind = GPU_QueueKind_BackgroundCopy, .d3d_type = D3D12_COMMAND_LIST_TYPE_COPY, .d3d_priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, .dbg_name = Lit("Background copy queue") }
};
u32 job_count = 0; Fence job_fence = ZI;
job_count += RunJob(GPU_D12_InitQueue, .count = GPU_NumQueues, .sig.descs = descs, .fence = &job_fence);
YieldOnFence(&job_fence, job_count);
}
/* Start queue sync job */
RunJob(GPU_D12_StartQueueSync, .pool = JobPool_Hyper, .flags = JobFlag_Dedicated);
} }
//////////////////////////////// ////////////////////////////////
//~ Device initialization //~ Initialization
//- Device initialization
void GPU_D12_InitDevice(void) void GPU_D12_InitDevice(void)
{ {
GPU_D12_SharedState *g = &GPU_D12_shared_state; GPU_D12_SharedState *g = &GPU_D12_shared_state;
TempArena scratch = BeginScratchNoConflict(); TempArena scratch = BeginScratchNoConflict();
HRESULT hr = 0; HRESULT hr = 0;
/* Enable debug layer */
u32 dxgi_factory_flags = 0; u32 dxgi_factory_flags = 0;
#if GPU_DEBUG
{
__profn("Enable debug layer");
ID3D12Debug *debug_controller0 = 0;
hr = D3D12GetDebugInterface(&IID_ID3D12Debug, (void **)&debug_controller0);
if (FAILED(hr))
{
Panic(Lit("Failed to create ID3D12Debug0"));
}
ID3D12Debug1 *debug_controller1 = 0;
hr = ID3D12Debug_QueryInterface(debug_controller0, &IID_ID3D12Debug1, (void **)&debug_controller1);
if (FAILED(hr))
{
Panic(Lit("Failed to create ID3D12Debug1"));
}
ID3D12Debug_EnableDebugLayer(debug_controller0);
/* FIXME: Enable this */
//ID3D12Debug1_SetEnableGPUBasedValidation(debug_controller1, 1);
ID3D12Debug_Release(debug_controller1);
ID3D12Debug_Release(debug_controller0);
dxgi_factory_flags |= DXGI_CREATE_FACTORY_DEBUG;
}
#endif
/* Create factory */ /* Create factory */
{ {
@ -131,9 +177,74 @@ void GPU_D12_InitDevice(void)
g->device = device; g->device = device;
} }
#if GPU_DEBUG
/* Enable D3D12 Debug break */
{
__profn("Enable d3d12 debug break");
ID3D12InfoQueue *info = 0;
hr = ID3D12Device_QueryInterface(g->device, &IID_ID3D12InfoQueue, (void **)&info);
if (FAILED(hr))
{
Panic(Lit("Failed to query ID3D12Device interface"));
}
ID3D12InfoQueue_SetBreakOnSeverity(info, D3D12_MESSAGE_SEVERITY_CORRUPTION, 1);
ID3D12InfoQueue_SetBreakOnSeverity(info, D3D12_MESSAGE_SEVERITY_ERROR, 1);
ID3D12InfoQueue_Release(info);
}
/* Enable DXGI Debug break */
{
__profn("Enable dxgi debug break");
IDXGIInfoQueue *dxgi_info = 0;
hr = DXGIGetDebugInterface1(0, &IID_IDXGIInfoQueue, (void **)&dxgi_info);
if (FAILED(hr))
{
Panic(Lit("Failed to get DXGI debug interface"));
}
IDXGIInfoQueue_SetBreakOnSeverity(dxgi_info, DXGI_DEBUG_ALL, DXGI_INFO_QUEUE_MESSAGE_SEVERITY_CORRUPTION, 1);
IDXGIInfoQueue_SetBreakOnSeverity(dxgi_info, DXGI_DEBUG_ALL, DXGI_INFO_QUEUE_MESSAGE_SEVERITY_ERROR, 1);
IDXGIInfoQueue_Release(dxgi_info);
}
#endif
EndScratch(scratch); EndScratch(scratch);
} }
//- Queue initialization
JobDef(GPU_D12_InitQueue, sig, id)
{
GPU_D12_SharedState *g = &GPU_D12_shared_state;
GPU_D12_QueueDesc desc = sig->descs[id];
Arena *perm = PermArena();
HRESULT hr = 0;
GPU_D12_Queue *queue = 0;
{
PushAlign(perm, CachelineSize);
queue = PushStruct(perm, GPU_D12_Queue);
PushAlign(perm, CachelineSize);
}
queue->desc = desc;
D3D12_COMMAND_QUEUE_DESC d3d_desc = ZI;
d3d_desc.Type = desc.d3d_type;
d3d_desc.Priority = desc.d3d_priority;
hr = ID3D12Device_CreateCommandQueue(g->device, &d3d_desc, &IID_ID3D12CommandQueue, (void **)&queue->cq);
if (FAILED(hr))
{
Panic(Lit("Failed to create command queue"));
}
hr = ID3D12Device_CreateFence(g->device, 0, 0, &IID_ID3D12Fence, (void **)&queue->submit_fence);
if (FAILED(hr))
{
Panic(Lit("Failed to create command queue fence"));
}
g->queues[desc.kind] = queue;
}
//////////////////////////////// ////////////////////////////////
//~ Pipeline operations //~ Pipeline operations
@ -148,8 +259,8 @@ GPU_D12_Pipeline *GPU_D12_PipelineFromDesc(GPU_D12_PipelineDesc desc)
GPU_D12_Queue *GPU_D12_QueueFromKind(GPU_QueueKind kind) GPU_D12_Queue *GPU_D12_QueueFromKind(GPU_QueueKind kind)
{ {
/* TODO */ GPU_D12_SharedState *g = &GPU_D12_shared_state;
return 0; return g->queues[kind];
} }
//////////////////////////////// ////////////////////////////////
@ -190,13 +301,13 @@ GPU_D12_RawCommandList *GPU_D12_BeginRawCommandList(GPU_QueueKind queue_kind)
} }
cl->queue = queue; cl->queue = queue;
HRESULT hr = ID3D12Device_CreateCommandAllocator(g->device, queue->desc.type, &IID_ID3D12CommandAllocator, (void **)&cl->ca); HRESULT hr = ID3D12Device_CreateCommandAllocator(g->device, queue->desc.d3d_type, &IID_ID3D12CommandAllocator, (void **)&cl->ca);
if (FAILED(hr)) if (FAILED(hr))
{ {
Panic(Lit("Failed to create command allocator")); Panic(Lit("Failed to create command allocator"));
} }
hr = ID3D12Device_CreateCommandList(g->device, 0, queue->desc.type, cl->ca, 0, &IID_ID3D12GraphicsCommandList, (void **)&cl->cl); hr = ID3D12Device_CreateCommandList(g->device, 0, queue->desc.d3d_type, cl->ca, 0, &IID_ID3D12GraphicsCommandList, (void **)&cl->cl);
if (FAILED(hr)) if (FAILED(hr))
{ {
Panic(Lit("Failed to create command list")); Panic(Lit("Failed to create command list"));
@ -227,7 +338,7 @@ GPU_D12_RawCommandList *GPU_D12_BeginRawCommandList(GPU_QueueKind queue_kind)
return cl; return cl;
} }
void GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl) u64 GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl)
{ {
GPU_D12_Queue *queue = cl->queue; GPU_D12_Queue *queue = cl->queue;
@ -243,11 +354,12 @@ void GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl)
} }
/* Submit */ /* Submit */
u64 target = 0;
{ {
__profn("Execute"); __profn("Execute");
Lock lock = LockE(&queue->submit_mutex); Lock lock = LockE(&queue->submit_mutex);
{ {
u64 target = ++queue->submit_fence_target; target = ++queue->submit_fence_target;
cl->submit_fence_target = target; cl->submit_fence_target = target;
/* Execute */ /* Execute */
ID3D12CommandQueue_ExecuteCommandLists(queue->cq, 1, (ID3D12CommandList **)&cl->cl); ID3D12CommandQueue_ExecuteCommandLists(queue->cq, 1, (ID3D12CommandList **)&cl->cl);
@ -257,6 +369,38 @@ void GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl)
} }
Unlock(&lock); Unlock(&lock);
} }
return target;
}
////////////////////////////////
//~ Queue sync job
JobDef(GPU_D12_StartQueueSync, _, __)
{
GPU_D12_SharedState *g = &GPU_D12_shared_state;
HANDLE queue_fences_events[GPU_NumQueues] = ZI;
u64 queue_fences_seen[GPU_NumQueues] = ZI;
for (i32 i = 0; i < countof(queue_fences_events); ++i)
{
queue_fences_events[i] = CreateEvent(0, 0, 1, 0);
}
for (;;)
{
WaitForMultipleObjects(countof(queue_fences_events), queue_fences_events, 0, INFINITE);
for (GPU_QueueKind queue_kind = 0; queue_kind < GPU_NumQueues; ++queue_kind)
{
GPU_D12_Queue *queue = GPU_D12_QueueFromKind(queue_kind);
u64 last_seen = queue_fences_seen[queue_kind];
u64 completed = ID3D12Fence_GetCompletedValue(queue->submit_fence);
if (completed > last_seen)
{
SetFence(&queue->sync_fence, completed);
queue_fences_seen[queue_kind] = completed;
ID3D12Fence_SetEventOnCompletion(queue->submit_fence, completed + 1, queue_fences_events[queue_kind]);
}
}
}
} }
//////////////////////////////// ////////////////////////////////
@ -267,6 +411,15 @@ void GPU_Startup(void)
GPU_D12_Startup(); GPU_D12_Startup();
} }
////////////////////////////////
//~ @hookdecl Fence hooks
Fence *GPU_FenceFromQueue(GPU_QueueKind queue_kind)
{
GPU_D12_Queue *queue = GPU_D12_QueueFromKind(queue_kind);
return &queue->sync_fence;
}
//////////////////////////////// ////////////////////////////////
//~ @hookdef Rasterizer helper hooks //~ @hookdef Rasterizer helper hooks
@ -398,7 +551,7 @@ GPU_Resource *GPU_AcquireResource(GPU_ResourceDesc desc)
d3d_desc.Alignment = 0; d3d_desc.Alignment = 0;
d3d_desc.Width = desc.texture.size.x; d3d_desc.Width = desc.texture.size.x;
d3d_desc.Height = desc.texture.size.y; d3d_desc.Height = desc.texture.size.y;
d3d_desc.DepthOrArraySize = desc.texture.size.y; d3d_desc.DepthOrArraySize = desc.texture.size.z;
d3d_desc.MipLevels = 1; d3d_desc.MipLevels = 1;
d3d_desc.SampleDesc.Count = 1; d3d_desc.SampleDesc.Count = 1;
d3d_desc.SampleDesc.Quality = 0; d3d_desc.SampleDesc.Quality = 0;
@ -465,7 +618,7 @@ Vec2I32 GPU_GetTextureSize(GPU_Resource *resource)
//////////////////////////////// ////////////////////////////////
//~ @hookdef Command list hooks //~ @hookdef Command list hooks
GPU_CommandList *GPU_BeginCommandList(void) GPU_CommandList *GPU_BeginCommandList(GPU_QueueKind queue_kind)
{ {
GPU_D12_FiberState *f = GPU_D12_FiberStateFromId(FiberId()); GPU_D12_FiberState *f = GPU_D12_FiberStateFromId(FiberId());
Arena *perm = PermArena(); Arena *perm = PermArena();
@ -479,23 +632,16 @@ GPU_CommandList *GPU_BeginCommandList(void)
{ {
cl = PushStruct(perm, GPU_D12_CommandList); cl = PushStruct(perm, GPU_D12_CommandList);
} }
cl->queue_kind = queue_kind;
return (GPU_CommandList *)cl; return (GPU_CommandList *)cl;
} }
u32 GPU_EndCommandList(GPU_CommandList *gpu_cl, Fence *fence) u64 GPU_EndCommandList(GPU_CommandList *gpu_cl)
{ {
GPU_D12_FiberState *f = GPU_D12_FiberStateFromId(FiberId()); GPU_D12_FiberState *f = GPU_D12_FiberStateFromId(FiberId());
GPU_D12_CommandList *cl = (GPU_D12_CommandList *)gpu_cl; GPU_D12_CommandList *cl = (GPU_D12_CommandList *)gpu_cl;
GPU_QueueKind queue_kind = cl->queue_kind;
/* Determine queue kind */ GPU_D12_Queue *queue = GPU_D12_QueueFromKind(queue_kind);
#if 0
GPU_QueueKind queue_kind = GPU_QueueKind_Direct;
#else
GPU_QueueKind queue_kind = GPU_QueueKind_BackgroundCopy;
for (GPU_D12_Command *cmd = cl->first; cmd; cmd = cmd->next)
{
}
#endif
/* Begin dx12 command list */ /* Begin dx12 command list */
GPU_D12_RawCommandList *dx12_cl = GPU_D12_BeginRawCommandList(queue_kind); GPU_D12_RawCommandList *dx12_cl = GPU_D12_BeginRawCommandList(queue_kind);
@ -654,7 +800,7 @@ u32 GPU_EndCommandList(GPU_CommandList *gpu_cl, Fence *fence)
} }
/* End dx12 command list */ /* End dx12 command list */
GPU_D12_EndRawCommandList(dx12_cl); u64 fence_target = GPU_D12_EndRawCommandList(dx12_cl);
/* Free commands */ /* Free commands */
if (cl->last) if (cl->last)
@ -666,7 +812,7 @@ u32 GPU_EndCommandList(GPU_CommandList *gpu_cl, Fence *fence)
/* Free command list */ /* Free command list */
StackPush(f->first_free_command_list, cl); StackPush(f->first_free_command_list, cl);
return 1; return fence_target;
} }
//////////////////////////////// ////////////////////////////////

View File

@ -53,8 +53,9 @@ Struct(GPU_D12_Resource)
Struct(GPU_D12_QueueDesc) Struct(GPU_D12_QueueDesc)
{ {
enum D3D12_COMMAND_LIST_TYPE type; GPU_QueueKind kind;
enum D3D12_COMMAND_QUEUE_PRIORITY priority; D3D12_COMMAND_LIST_TYPE d3d_type;
D3D12_COMMAND_QUEUE_PRIORITY d3d_priority;
String dbg_name; String dbg_name;
}; };
@ -68,6 +69,8 @@ Struct(GPU_D12_Queue)
u64 submit_fence_target; u64 submit_fence_target;
struct GPU_D12_RawCommandList *first_submitted_cl; struct GPU_D12_RawCommandList *first_submitted_cl;
struct GPU_D12_RawCommandList *last_submitted_cl; struct GPU_D12_RawCommandList *last_submitted_cl;
Fence sync_fence;
}; };
//////////////////////////////// ////////////////////////////////
@ -163,6 +166,8 @@ Struct(GPU_D12_CommandList)
GPU_D12_Command *first; GPU_D12_Command *first;
GPU_D12_Command *last; GPU_D12_Command *last;
u64 count; u64 count;
GPU_QueueKind queue_kind;
}; };
//////////////////////////////// ////////////////////////////////
@ -189,6 +194,9 @@ Struct(GPU_D12_SharedState)
{ {
GPU_D12_FiberState *fiber_states[MaxFibers]; GPU_D12_FiberState *fiber_states[MaxFibers];
/* Queues */
GPU_D12_Queue *queues[GPU_NumQueues];
/* Resources */ /* Resources */
Mutex free_resources_mutex; Mutex free_resources_mutex;
GPU_D12_Resource *first_free_resource; GPU_D12_Resource *first_free_resource;
@ -213,10 +221,14 @@ u64 GPU_D12_ReuseHashFromResourceDesc(GPU_ResourceDesc desc);
void GPU_D12_Startup(void); void GPU_D12_Startup(void);
//////////////////////////////// ////////////////////////////////
//~ Device initialization //~ Initialization
//- Device initialization
void GPU_D12_InitDevice(void); void GPU_D12_InitDevice(void);
//- Queue initialization
JobDecl(GPU_D12_InitQueue, { GPU_D12_QueueDesc *descs; });
//////////////////////////////// ////////////////////////////////
//~ Pipeline operations //~ Pipeline operations
@ -231,4 +243,9 @@ GPU_D12_Queue *GPU_D12_QueueFromKind(GPU_QueueKind kind);
//~ Raw command list operations //~ Raw command list operations
GPU_D12_RawCommandList *GPU_D12_BeginRawCommandList(GPU_QueueKind queue_kind); GPU_D12_RawCommandList *GPU_D12_BeginRawCommandList(GPU_QueueKind queue_kind);
void GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl); u64 GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl);
////////////////////////////////
//~ Sync job
JobDecl(GPU_D12_StartQueueSync, EmptySig);

View File

@ -101,7 +101,7 @@ void P_Startup(void)
g->socks_arena = AcquireArena(Gibi(64)); g->socks_arena = AcquireArena(Gibi(64));
//- Init timer //- Init timer
RunJob(P_W32_UpdateTimer); RunJob(P_W32_StartTimerSync, .pool = JobPool_Hyper, .flags = JobFlag_Dedicated);
} }
//////////////////////////////// ////////////////////////////////
@ -179,11 +179,11 @@ P_W32_Window *P_W32_AcquireWindow(void)
window->event_arenas[0] = AcquireArena(Gibi(64)); window->event_arenas[0] = AcquireArena(Gibi(64));
window->event_arenas[1] = AcquireArena(Gibi(64)); window->event_arenas[1] = AcquireArena(Gibi(64));
/* Start window event thread */ /* Start window event job */
/* NOTE: This thread must finish building for the window to actually be /* NOTE: This job must finish starting for the window to actually be
* created and receive a HWND, because on Windows the event proc must run on * created and receive a HWND, because on Windows the event proc must run on
* the same thread that created the window. */ * the same thread that created the window. */
window->window_thread = W32_StartThread(&P_W32_WindowThreadEntryFunc, window, Lit("Window thread"), PROF_THREAD_GROUP_WINDOW); RunJob(P_W32_StartWindowMsgProcessing, .pool = JobPool_Hyper, .flags = JobFlag_Dedicated, .sig.window = window);
YieldOnFence(&window->ready_fence, 1); YieldOnFence(&window->ready_fence, 1);
return window; return window;
@ -195,7 +195,7 @@ void P_W32_ReleaseWindow(P_W32_Window *window)
Atomic32Set(&window->shutdown, 1); Atomic32Set(&window->shutdown, 1);
P_W32_SharedState *g = &P_W32_shared_state; P_W32_SharedState *g = &P_W32_shared_state;
P_W32_WakeWindow(window); P_W32_WakeWindow(window);
W32_WaitEndThread(window->window_thread); YieldOnFence(&window->finished_fence, 1);
Lock lock = LockE(&g->windows_mutex); Lock lock = LockE(&g->windows_mutex);
{ {
@ -389,11 +389,11 @@ void P_W32_UpdateWindowFromSettings(P_W32_Window *window, P_WindowSettings *sett
} }
//////////////////////////////// ////////////////////////////////
//~ Win32 window thread //~ Win32 window message processing
W32_ThreadDef(P_W32_WindowThreadEntryFunc, arg) JobDef(P_W32_StartWindowMsgProcessing, sig, id)
{ {
P_W32_Window *window = (P_W32_Window *)arg; P_W32_Window *window = sig->window;
/* Win32 limitation: Window must be initialized on same thread that processes events */ /* Win32 limitation: Window must be initialized on same thread that processes events */
window->hwnd = P_W32_InitWindow(window); window->hwnd = P_W32_InitWindow(window);
@ -419,6 +419,7 @@ W32_ThreadDef(P_W32_WindowThreadEntryFunc, arg)
/* Destroy window hwnd */ /* Destroy window hwnd */
DestroyWindow(window->hwnd); DestroyWindow(window->hwnd);
SetFence(&window->finished_fence, 1);
} }
void P_W32_ProcessWindowEvent(P_W32_Window *window, P_WindowEvent event) void P_W32_ProcessWindowEvent(P_W32_Window *window, P_WindowEvent event)
@ -867,7 +868,7 @@ P_Address P_W32_PlatformAddressFromWin32Address(P_W32_Address ws_addr)
//////////////////////////////// ////////////////////////////////
//~ Timer job //~ Timer job
JobDef(P_W32_UpdateTimer, _, __) JobDef(P_W32_StartTimerSync, _, __)
{ {
P_W32_SharedState *g = &P_W32_shared_state; P_W32_SharedState *g = &P_W32_shared_state;
SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL); SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL);

View File

@ -42,6 +42,7 @@ Struct(P_W32_Window)
HWND hwnd; HWND hwnd;
Fence ready_fence; Fence ready_fence;
Fence finished_fence;
u16 utf16_high_surrogate_last_input; u16 utf16_high_surrogate_last_input;
@ -67,8 +68,6 @@ Struct(P_W32_Window)
i32 current_event_arena_index; i32 current_event_arena_index;
Arena *event_arenas[2]; Arena *event_arenas[2];
W32_Thread *window_thread;
Atomic32 shutdown; Atomic32 shutdown;
P_W32_Window *next_free; P_W32_Window *next_free;
}; };
@ -160,12 +159,16 @@ P_W32_Window *P_W32_AcquireWindow(void);
void P_W32_ReleaseWindow(P_W32_Window *window); void P_W32_ReleaseWindow(P_W32_Window *window);
HWND P_W32_InitWindow(P_W32_Window *window); HWND P_W32_InitWindow(P_W32_Window *window);
//- Window settings ////////////////////////////////
//~ Window settings
void P_W32_UpdateWindowFromSystem(P_W32_Window *window); void P_W32_UpdateWindowFromSystem(P_W32_Window *window);
void P_W32_UpdateWindowFromSettings(P_W32_Window *window, P_WindowSettings *settings); void P_W32_UpdateWindowFromSettings(P_W32_Window *window, P_WindowSettings *settings);
//- Window thread ////////////////////////////////
W32_ThreadDef(P_W32_WindowThreadEntryFunc, arg); //~ Window message processing
JobDecl(P_W32_StartWindowMsgProcessing, { P_W32_Window *window; });
void P_W32_ProcessWindowEvent(P_W32_Window *window, P_WindowEvent event); void P_W32_ProcessWindowEvent(P_W32_Window *window, P_WindowEvent event);
void P_W32_WakeWindow(P_W32_Window *window); void P_W32_WakeWindow(P_W32_Window *window);
LRESULT CALLBACK P_W32_Win32WindowProc(HWND hwnd, UINT msg, WPARAM wparam, LPARAM lparam); LRESULT CALLBACK P_W32_Win32WindowProc(HWND hwnd, UINT msg, WPARAM wparam, LPARAM lparam);
@ -180,4 +183,4 @@ P_Address P_W32_PlatformAddressFromWin32Address(P_W32_Address ws_addr);
//////////////////////////////// ////////////////////////////////
//~ Timer job //~ Timer job
JobDecl(P_W32_UpdateTimer, EmptySig); JobDecl(P_W32_StartTimerSync, EmptySig);

View File

@ -406,13 +406,12 @@ GPU_Resource *AcquireGbuffer(GPU_Format format, Vec2I32 size)
GPU_Resource *AcquireUploadBuffer(u32 element_count, u32 element_size, void *src) GPU_Resource *AcquireUploadBuffer(u32 element_count, u32 element_size, void *src)
{ {
__prof; __prof;
u64 size = element_size * element_count;
GPU_ResourceDesc desc = ZI; GPU_ResourceDesc desc = ZI;
desc.kind = GPU_ResourceKind_Buffer; desc.kind = GPU_ResourceKind_Buffer;
desc.flags = GPU_ResourceFlag_None; desc.flags = GPU_ResourceFlag_None;
desc.buffer.heap_kind = GPU_HeapKind_Upload; desc.buffer.heap_kind = GPU_HeapKind_Upload;
desc.buffer.element_size = size;
desc.buffer.element_count = element_count; desc.buffer.element_count = element_count;
desc.buffer.element_capacity = element_count;
desc.buffer.element_size = element_size; desc.buffer.element_size = element_size;
GPU_Resource *r = GPU_AcquireResource(desc); GPU_Resource *r = GPU_AcquireResource(desc);
{ {
@ -2151,14 +2150,20 @@ void UpdateUser(P_Window *window)
{ {
__profn("Render"); __profn("Render");
GPU_QueueKind gpu_render_queue = GPU_QueueKind_Direct;
Rect ui_viewport = RectFromVec2(VEC2(0, 0), VEC2(g->ui_size.x, g->ui_size.y)); Rect ui_viewport = RectFromVec2(VEC2(0, 0), VEC2(g->ui_size.x, g->ui_size.y));
Rect render_viewport = RectFromVec2(VEC2(0, 0), VEC2(g->render_size.x, g->render_size.y)); Rect render_viewport = RectFromVec2(VEC2(0, 0), VEC2(g->render_size.x, g->render_size.y));
if (!g->gpu_render_fence)
{
g->gpu_render_fence = GPU_FenceFromQueue(gpu_render_queue);
}
/* Acquire gbuffers */ /* Acquire gbuffers */
if (g->shade_target && !EqVec2I32(g->render_size, GPU_GetTextureSize(g->shade_target))) if (g->shade_target && !EqVec2I32(g->render_size, GPU_GetTextureSize(g->shade_target)))
{ {
__profn("Release render resources"); __profn("Release render resources");
YieldOnFence(&g->gpu_render_fence, g->gpu_render_fence_target); YieldOnFence(g->gpu_render_fence, g->gpu_render_fence_target);
GPU_ReleaseResource(g->albedo, GPU_ReleaseFlag_None); GPU_ReleaseResource(g->albedo, GPU_ReleaseFlag_None);
GPU_ReleaseResource(g->emittance, GPU_ReleaseFlag_None); GPU_ReleaseResource(g->emittance, GPU_ReleaseFlag_None);
GPU_ReleaseResource(g->emittance_flood_read, GPU_ReleaseFlag_None); GPU_ReleaseResource(g->emittance_flood_read, GPU_ReleaseFlag_None);
@ -2181,7 +2186,7 @@ void UpdateUser(P_Window *window)
/* Acquire ui buffers */ /* Acquire ui buffers */
if (g->ui_target && !EqVec2I32(g->ui_size, GPU_GetTextureSize(g->ui_target))) if (g->ui_target && !EqVec2I32(g->ui_size, GPU_GetTextureSize(g->ui_target)))
{ {
YieldOnFence(&g->gpu_render_fence, g->gpu_render_fence_target); YieldOnFence(g->gpu_render_fence, g->gpu_render_fence_target);
GPU_ReleaseResource(g->ui_target, GPU_ReleaseFlag_None); GPU_ReleaseResource(g->ui_target, GPU_ReleaseFlag_None);
g->ui_target = 0; g->ui_target = 0;
} }
@ -2200,7 +2205,7 @@ void UpdateUser(P_Window *window)
GPU_Resource *ui_shape_indices_buffer = AcquireUploadBufferFromArena(g->ui_shape_indices_count, g->ui_shape_indices_arena); GPU_Resource *ui_shape_indices_buffer = AcquireUploadBufferFromArena(g->ui_shape_indices_count, g->ui_shape_indices_arena);
GPU_Resource *grids_buffer = AcquireUploadBufferFromArena(g->grids_count, g->grids_arena); GPU_Resource *grids_buffer = AcquireUploadBufferFromArena(g->grids_count, g->grids_arena);
GPU_CommandList *cl = GPU_BeginCommandList(); GPU_CommandList *cl = GPU_BeginCommandList(gpu_render_queue);
{ {
__profn("Run render"); __profn("Run render");
GPU_ProfN(cl, Lit("Run render")); GPU_ProfN(cl, Lit("Run render"));
@ -2427,7 +2432,7 @@ void UpdateUser(P_Window *window)
GPU_RasterizeMode_TriangleList); GPU_RasterizeMode_TriangleList);
} }
} }
g->gpu_render_fence_target += GPU_EndCommandList(cl, &g->gpu_render_fence); g->gpu_render_fence_target = GPU_EndCommandList(cl);
/* Release transfer buffers */ /* Release transfer buffers */
{ {
@ -2444,7 +2449,7 @@ void UpdateUser(P_Window *window)
{ {
DelayReleaseGpuResources_Sig *sig = PushStruct(job->arena, DelayReleaseGpuResources_Sig); DelayReleaseGpuResources_Sig *sig = PushStruct(job->arena, DelayReleaseGpuResources_Sig);
job->count = countof(release_resources); job->count = countof(release_resources);
sig->begin_fence = &g->gpu_render_fence; sig->begin_fence = g->gpu_render_fence;
sig->begin_fence_target = g->gpu_render_fence_target; sig->begin_fence_target = g->gpu_render_fence_target;
sig->resources = PushStructsNoZero(job->arena, GPU_Resource *, job->count); sig->resources = PushStructsNoZero(job->arena, GPU_Resource *, job->count);
sig->flags = GPU_ReleaseFlag_Reuse; sig->flags = GPU_ReleaseFlag_Reuse;

View File

@ -195,7 +195,7 @@ Struct(SharedUserState)
u32 ui_shape_indices_count; u32 ui_shape_indices_count;
u32 grids_count; u32 grids_count;
Fence gpu_render_fence; Fence *gpu_render_fence;
u64 gpu_render_fence_target; u64 gpu_render_fence_target;
//- Bind state //- Bind state