diff --git a/src/base/base.h b/src/base/base.h index e978df8d..5eb1883b 100644 --- a/src/base/base.h +++ b/src/base/base.h @@ -713,12 +713,6 @@ Struct(ComputeShader) { Resource resource; }; //////////////////////////////// //~ Fibers -/* If virtual fibers are enabled, each fiber will get its own OS thread, - * and fiber suspend/resume will be emulated using OS thread primitives. - * This is slow but allows for easier debugging in tricky cases - * since the debugger won't be confused by fiber context switching. */ -#define VirtualFibersEnabled 0 - # define MaxFibers 4096 StaticAssert(MaxFibers < I16Max); /* MaxFibers should fit in FiberId */ @@ -730,7 +724,6 @@ StaticAssert(MaxFibers < I16Max); /* MaxFibers should fit in FiberId */ # endif #endif - //////////////////////////////// //~ Exit callback types diff --git a/src/base/base_job.h b/src/base/base_job.h index 81a9830d..8072c5ac 100644 --- a/src/base/base_job.h +++ b/src/base/base_job.h @@ -39,6 +39,29 @@ Enum(JobPool) typedef void JobFunc(void *, i32); +Enum(JobFlag) +{ + JobFlag_None = 0, + + /* A dedicated job is a heavy weight job that will receive its own OS + * thread and will never yield. When the fiber running the job suspends + * itself, the dedicated thread will perform a blocking wait rather than + * yielding the thread to another fiber. This is mainly useful for long-running + * dispatcher-esque jobs that block on OS primitives, since occupying a + * worker thread (and thereby preventing non-blocking jobs from running on + * that worker) is unwanted. + * + * For example, Win32 window message processing is required by the OS to + * occur on the same thread that initially created the window, which means + * it actually must run inside a dedicated job to prevent message processing + * from yielding & resuming on another thread. 
The message processing loop + * can block until messages are received from the OS without having to + * occupy a job worker while it blocks, and can then wake yielding + * jobs onto job worker pools based on the messages it received. + */ + JobFlag_Dedicated = (1 << 0), +}; + Struct(Job) { /* Internal */ @@ -51,6 +74,7 @@ Struct(Job) JobPool pool; /* Configurable between OpenJob & CloseJob */ + JobFlag flags; i32 count; Fence *fence; void *sig; @@ -72,10 +96,10 @@ void ResumeFibers(i16 fiber_ids_count, i16 *fiber_ids); /* NOTE: Must only be c #define EmptySig { i32 _; } -#define JobDecl(job, sigdef) \ - typedef struct job##_Sig sigdef job##_Sig; \ - Struct(job##_Desc) { JobFunc *func; JobPool pool; u32 count; Fence *fence; job##_Sig sig; }; \ - void job(job##_Sig *, i32); \ +#define JobDecl(job, sigdef) \ + typedef struct job##_Sig sigdef job##_Sig; \ + Struct(job##_Desc) { JobFunc *func; JobPool pool; u32 count; Fence *fence; JobFlag flags; job##_Sig sig; }; \ + void job(job##_Sig *, i32); \ StaticAssert(1) #define JobDef(job, sig_arg, id_arg) void job(job##_Sig *sig_arg, i32 id_arg) @@ -103,6 +127,7 @@ do { Job *__job = OpenJob(__desc.func, __desc.pool); \ __job->count = __desc.count; \ __job->fence = __desc.fence; \ + __job->flags = __desc.flags; \ __job->sig = PushStructNoZero(__job->arena, job_func##_Sig); \ CopyBytes(__job->sig, &__desc.sig, sizeof(__desc.sig)); \ CloseJob(__job); \ @@ -110,20 +135,3 @@ do { Job *OpenJob(JobFunc *func, JobPool pool_kind); u32 CloseJob(Job *job); - -//////////////////////////////// -//~ @hookdecl Dedicated job operations - -/* A dedicated job is a heavy weight job that will not operate inside of any - * job pool. As such, it receives its own dedicated thread, and never yields to - * other fibers. Instead of yielding when the fiber suspends, it performs a blocking - * wait that puts the OS thread to sleep. This is mainly useful for - * implementing long-running blocking dispatcher-like jobs tasks for subsystems. 
- * - * For example, Win32 window message processing is required by the OS to occur - * on the same thread that initially created the window, which means it - * actually must run inside a dedicated job to prevent message processing from - * yielding & resuming on another thread. - */ - -void RunDedicatedJob(JobFunc job_func); diff --git a/src/base/base_win32/base_win32_job.c b/src/base/base_win32/base_win32_job.c index a4d47870..39b7737d 100644 --- a/src/base/base_win32/base_win32_job.c +++ b/src/base/base_win32/base_win32_job.c @@ -95,6 +95,10 @@ void InitJobSystem(void) //////////////////////////////// //~ Win32 thread +JobDef(W32_DummyJob, sig, id) +{ +} + DWORD WINAPI W32_Win32ThreadProc(LPVOID vt) { /* Convert thread to fiber */ @@ -141,15 +145,7 @@ W32_Thread *W32_StartThread(W32_ThreadFunc *entry_point, void *thread_udata, Str t->thread_udata = thread_udata; t->profiler_group = profiler_group; - t->handle = CreateThread( - 0, - W32_FiberStackSize, - W32_Win32ThreadProc, - t, - 0, - 0 - ); - + t->handle = CreateThread(0, W32_FiberStackSize, W32_Win32ThreadProc, t, 0, 0); if (!t->handle) { Panic(Lit("Failed to create thread")); @@ -278,7 +274,7 @@ W32_Fiber *W32_AcquireFiber(W32_JobPool *pool) { __profn("CreateFiber"); fiber->pool = pool->kind; -#if VirtualFibersEnabled +#if VIRTUAL_FIBERS fiber->addr = CreateThread(0, W32_FiberStackSize, W32_VirtualFiberEntryPoint, (void *)(i64)fiber_id, 0, 0); #else fiber->addr = CreateFiber(W32_FiberStackSize, W32_FiberEntryPoint, (void *)(i64)fiber_id); @@ -289,7 +285,7 @@ W32_Fiber *W32_AcquireFiber(W32_JobPool *pool) /* Fiber is not a part of a job pool, convert thread to fiber */ __profn("ConvertThreadToFiber"); fiber->addr = ConvertThreadToFiber((void *)(i64)fiber_id); -#if VirtualFibersEnabled +#if VIRTUAL_FIBERS fiber->addr = GetCurrentThread(); #endif } @@ -319,7 +315,7 @@ ForceInline W32_Fiber *W32_FiberFromId(i16 id) void W32_SwitchToFiber(W32_Fiber *target) { -#if VirtualFibersEnabled +#if VIRTUAL_FIBERS 
W32_Fiber *self = W32_FiberFromId(FiberId()); Atomic8Set(&self->virtual_yield, 1); /* Signal virtual target */ @@ -351,9 +347,9 @@ void W32_FiberEntryPoint(void *_) W32_JobPool *pool = &W32_shared_job_state.job_pools[fiber->pool]; JobPool pool_kind = fiber->pool; char *fiber_name_cstr = fiber->name_cstr; + __prof_fiber_enter(fiber_name_cstr, PROF_THREAD_GROUP_FIBERS - Mebi(pool_kind) + Kibi(1) + fiber->id); for (;;) { - __prof_fiber_enter(fiber_name_cstr, PROF_THREAD_GROUP_FIBERS - Mebi(pool_kind) + Kibi(1) + fiber->id); W32_Task *task = fiber->task; Job *job = task->job; @@ -542,26 +538,12 @@ void SuspendFiber(void) __prof; i16 fiber_id = FiberId(); W32_Fiber *fiber = W32_FiberFromId(FiberId()); - i16 return_id = fiber->return_id; __prof_fiber_leave(); - if (return_id > 0) { - /* Suspend task fiber (return control flow to parent/worker fiber) */ Atomic8Set(&fiber->status, W32_FiberStatus_Suspending); - W32_Fiber *parent_fiber = W32_FiberFromId(return_id); + W32_Fiber *parent_fiber = W32_FiberFromId(fiber->return_id); W32_SwitchToFiber(parent_fiber); } - else - { - /* Suspend dedicated fiber (block thread) */ - Atomic8Set(&fiber->status, W32_FiberStatus_Suspended); - i8 status = W32_FiberStatus_Suspended; - while (status != W32_FiberStatus_None) - { - WaitOnAddress(&fiber->status, &status, sizeof(status), INFINITE); - status = Atomic8Fetch(&fiber->status); - } - } __prof_fiber_enter(fiber->name_cstr, PROF_THREAD_GROUP_FIBERS - Mebi(fiber->pool) + Kibi(1) + fiber->id); } @@ -587,21 +569,21 @@ void ResumeFibers(i16 fiber_ids_count, i16 *fiber_ids) /* Update fiber status */ Atomic8Set(&fiber->status, W32_FiberStatus_None); - i16 return_id = fiber->return_id; - if (return_id > 0) + W32_Task *task = fiber->task; + // if (task->job->flags & JobFlag_Dedicated) + if (0) + { + /* TODO: Wake dedicated fiber right now */ + WakeByAddressSingle(&fiber->status); + } + else { /* Group task based on pool */ - W32_Task *task = fiber->task; JobPool pool_kind = fiber->pool; 
W32_TaskList *pool_tasks = &tasks_by_pool[pool_kind]; QueuePush(pool_tasks->first, pool_tasks->last, task); ++pool_tasks->count; } - else - { - /* Wake dedicated fiber right now */ - WakeByAddressSingle(&fiber->status); - } } /* Submit tasks */ @@ -689,105 +671,99 @@ u32 CloseJob(Job *job) { TempArena scratch = BeginScratchNoConflict(); - W32_JobPool *pool = &W32_shared_job_state.job_pools[job->pool]; + JobPool pool_kind = job->pool; + W32_JobPool *pool = &W32_shared_job_state.job_pools[pool_kind]; u32 num_tasks = job->count; - if (num_tasks > 0) + if (num_tasks == 0) { - /* Allocate tasks from free list */ - u32 num_tasks_allocated = 0; - W32_Task **tasks_array = PushStructsNoZero(scratch.arena, W32_Task *, num_tasks); + Assert(0); + job->func = W32_DummyJob; + num_tasks = 1; + } + + /* Allocate tasks from free list */ + u32 num_tasks_allocated = 0; + W32_Task **tasks_array = PushStructsNoZero(scratch.arena, W32_Task *, num_tasks); + { + LockTicketMutex(&pool->free_tasks_tm); { - LockTicketMutex(&pool->free_tasks_tm); + while (num_tasks_allocated < num_tasks) { - while (num_tasks_allocated < num_tasks) + W32_Task *task = pool->first_free_task; + if (task) { - W32_Task *task = pool->first_free_task; - if (task) - { - tasks_array[num_tasks_allocated++] = task; - StackPop(pool->first_free_task); - } - else - { - break; - } - } - } - UnlockTicketMutex(&pool->free_tasks_tm); - } - - /* Allocate new tasks from memory */ - u32 remaining = num_tasks - num_tasks_allocated; - if (remaining > 0) - { - Arena *perm = PermArena(); - PushAlign(perm, CachelineSize); - W32_Task *pushed_tasks = PushStructsNoZero(perm, W32_Task, remaining); - for (u32 i = 0; i < remaining; ++i) - { - tasks_array[num_tasks_allocated + i] = &pushed_tasks[i]; - } - num_tasks_allocated += remaining; - PushAlign(perm, CachelineSize); - } - - /* Generate task list */ - W32_TaskList tasks = ZI; - for (u32 i = 0; i < num_tasks; ++i) - { - W32_Task *task = tasks_array[i]; - ZeroStruct(task); - task->job = 
job; - task->task_id = tasks.count++; - QueuePush(tasks.first, tasks.last, task); - } - - /* Push tasks to back of pool */ - { - LockTicketMutex(&pool->tasks_tm); - { - if (pool->last_task) - { - pool->last_task->next = tasks.first; + tasks_array[num_tasks_allocated++] = task; + StackPop(pool->first_free_task); } else { - pool->first_task = tasks.first; + break; } - pool->last_task = tasks.last; - Atomic64FetchAdd(&pool->tasks_count.v, num_tasks); - } - UnlockTicketMutex(&pool->tasks_tm); - } - - /* Wake workers */ - if (num_tasks >= W32_WakeAllWorkersThreshold) - { - WakeByAddressAll(&pool->tasks_count); - } - else - { - for (u32 i = 0; i < num_tasks; ++i) - { - WakeByAddressSingle(&pool->tasks_count); } } + UnlockTicketMutex(&pool->free_tasks_tm); } - else if (job->fence) + + /* Allocate new tasks from memory */ + u32 remaining = num_tasks - num_tasks_allocated; + if (remaining > 0) { - FetchAddFence(job->fence, 1); + Arena *perm = PermArena(); + PushAlign(perm, CachelineSize); + W32_Task *pushed_tasks = PushStructsNoZero(perm, W32_Task, remaining); + for (u32 i = 0; i < remaining; ++i) + { + tasks_array[num_tasks_allocated + i] = &pushed_tasks[i]; + } + num_tasks_allocated += remaining; + PushAlign(perm, CachelineSize); + } + + /* FIXME: Handle dedicated jobs separately */ + + /* Generate task list */ + W32_TaskList tasks = ZI; + for (u32 i = 0; i < num_tasks; ++i) + { + W32_Task *task = tasks_array[i]; + ZeroStruct(task); + task->job = job; + task->task_id = tasks.count++; + QueuePush(tasks.first, tasks.last, task); + } + + /* Push tasks to back of pool */ + { + LockTicketMutex(&pool->tasks_tm); + { + if (pool->last_task) + { + pool->last_task->next = tasks.first; + } + else + { + pool->first_task = tasks.first; + } + pool->last_task = tasks.last; + Atomic64FetchAdd(&pool->tasks_count.v, num_tasks); + } + UnlockTicketMutex(&pool->tasks_tm); + } + + /* Wake workers */ + if (num_tasks >= W32_WakeAllWorkersThreshold) + { + WakeByAddressAll(&pool->tasks_count); + } 
+ else + { + for (u32 i = 0; i < num_tasks; ++i) + { + WakeByAddressSingle(&pool->tasks_count); + } } EndScratch(scratch); return 1; } - -//////////////////////////////// -//~ @hookdef Dedicated job operations - -void RunDedicatedJob(JobFunc job_func) -{ - /* TODO: Implement */ - Assert(0); -} diff --git a/src/base/base_win32/base_win32_job.h b/src/base/base_win32/base_win32_job.h index 7e015dc5..e7d6a78e 100644 --- a/src/base/base_win32/base_win32_job.h +++ b/src/base/base_win32/base_win32_job.h @@ -178,6 +178,7 @@ DWORD WINAPI W32_Win32ThreadProc(LPVOID vt); W32_Thread *W32_StartThread(W32_ThreadFunc *entry_point, void *thread_udata, String thread_name, i32 profiler_group); b32 W32_TryEndThread(W32_Thread *thread, f32 timeout_seconds); void W32_WaitEndThread(W32_Thread *thread); +JobDecl(W32_DummyJob, EmptySig); //////////////////////////////// //~ Fiber operations diff --git a/src/config.h b/src/config.h index 14a472c5..ad4e4af4 100644 --- a/src/config.h +++ b/src/config.h @@ -69,6 +69,14 @@ #define FLOOD_DEBUG 0 +#define GPU_DEBUG 1 + +/* If virtual fibers are enabled, each fiber will get its own OS thread, + * and fiber suspend/resume will be emulated using OS thread primitives. + * This is slow but allows for easier debugging in tricky cases + * since the debugger won't be confused by fiber context switching. 
*/ +#define VIRTUAL_FIBERS 0 + /* If enabled, bitbuffs will insert/verify magic numbers & length for each read & write */ #define BITBUFF_DEBUG 0 #define BITBUFF_TEST RtcIsEnabled diff --git a/src/gpu/gpu.h b/src/gpu/gpu.h index 8581d29a..b0e0e53e 100644 --- a/src/gpu/gpu.h +++ b/src/gpu/gpu.h @@ -259,15 +259,6 @@ Struct(GPU_Scissor) f32 bottom; }; -//////////////////////////////// -//~ Fence types - -Struct(GPU_Fence) -{ - u64 targets[GPU_NumQueues]; - u32 num_targets; -}; - //////////////////////////////// //~ Memory info types @@ -281,6 +272,11 @@ Struct(GPU_MemoryInfo) void GPU_Startup(void); +//////////////////////////////// +//~ @hookdecl Fence operations + +Fence *GPU_FenceFromQueue(GPU_QueueKind queue_kind); + //////////////////////////////// //~ @hookdecl Rasterizer helpers @@ -299,8 +295,8 @@ Vec2I32 GPU_GetTextureSize(GPU_Resource *resource); //////////////////////////////// //~ @hookdecl Command list operations -GPU_CommandList *GPU_BeginCommandList(void); -u32 GPU_EndCommandList(GPU_CommandList *cl, Fence *fence); +GPU_CommandList *GPU_BeginCommandList(GPU_QueueKind queue_kind); +u64 GPU_EndCommandList(GPU_CommandList *cl); /* Returns the value that the queue's fence will be set to once the command is completed */ //////////////////////////////// //~ @hookdecl Profiling helpers diff --git a/src/gpu/gpu_dx12/gpu_dx12.c b/src/gpu/gpu_dx12/gpu_dx12.c index 4a2ad868..8b45eda1 100644 --- a/src/gpu/gpu_dx12/gpu_dx12.c +++ b/src/gpu/gpu_dx12/gpu_dx12.c @@ -1,7 +1,5 @@ GPU_D12_SharedState GPU_D12_shared_state = ZI; - - //////////////////////////////// //~ Helpers @@ -53,18 +51,66 @@ u64 GPU_D12_ReuseHashFromResourceDesc(GPU_ResourceDesc desc) void GPU_D12_Startup(void) { + /* Init device */ GPU_D12_InitDevice(); + + /* Init queues */ + { + GPU_D12_QueueDesc descs[] = { + {.kind = GPU_QueueKind_Direct, .d3d_type = D3D12_COMMAND_LIST_TYPE_DIRECT, .d3d_priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, .dbg_name = Lit("Direct queue") }, + {.kind = 
GPU_QueueKind_Compute, .d3d_type = D3D12_COMMAND_LIST_TYPE_COMPUTE, .d3d_priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, .dbg_name = Lit("Compute queue") }, + {.kind = GPU_QueueKind_Copy, .d3d_type = D3D12_COMMAND_LIST_TYPE_COPY, .d3d_priority = D3D12_COMMAND_QUEUE_PRIORITY_HIGH, .dbg_name = Lit("Copy queue") }, + {.kind = GPU_QueueKind_BackgroundCopy, .d3d_type = D3D12_COMMAND_LIST_TYPE_COPY, .d3d_priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, .dbg_name = Lit("Background copy queue") } + }; + u32 job_count = 0; Fence job_fence = ZI; + job_count += RunJob(GPU_D12_InitQueue, .count = GPU_NumQueues, .sig.descs = descs, .fence = &job_fence); + YieldOnFence(&job_fence, job_count); + } + + /* Start queue sync job */ + RunJob(GPU_D12_StartQueueSync, .pool = JobPool_Hyper, .flags = JobFlag_Dedicated); } //////////////////////////////// -//~ Device initialization +//~ Initialization + +//- Device initialization void GPU_D12_InitDevice(void) { GPU_D12_SharedState *g = &GPU_D12_shared_state; TempArena scratch = BeginScratchNoConflict(); HRESULT hr = 0; + + /* Enable debug layer */ u32 dxgi_factory_flags = 0; +#if GPU_DEBUG + { + __profn("Enable debug layer"); + ID3D12Debug *debug_controller0 = 0; + hr = D3D12GetDebugInterface(&IID_ID3D12Debug, (void **)&debug_controller0); + if (FAILED(hr)) + { + Panic(Lit("Failed to create ID3D12Debug0")); + } + + ID3D12Debug1 *debug_controller1 = 0; + hr = ID3D12Debug_QueryInterface(debug_controller0, &IID_ID3D12Debug1, (void **)&debug_controller1); + if (FAILED(hr)) + { + Panic(Lit("Failed to create ID3D12Debug1")); + } + + ID3D12Debug_EnableDebugLayer(debug_controller0); + + /* FIXME: Enable this */ + //ID3D12Debug1_SetEnableGPUBasedValidation(debug_controller1, 1); + + ID3D12Debug_Release(debug_controller1); + ID3D12Debug_Release(debug_controller0); + dxgi_factory_flags |= DXGI_CREATE_FACTORY_DEBUG; + } +#endif /* Create factory */ { @@ -131,9 +177,74 @@ void GPU_D12_InitDevice(void) g->device = device; } +#if GPU_DEBUG + /* Enable 
D3D12 Debug break */ + { + __profn("Enable d3d12 debug break"); + ID3D12InfoQueue *info = 0; + hr = ID3D12Device_QueryInterface(g->device, &IID_ID3D12InfoQueue, (void **)&info); + if (FAILED(hr)) + { + Panic(Lit("Failed to query ID3D12Device interface")); + } + ID3D12InfoQueue_SetBreakOnSeverity(info, D3D12_MESSAGE_SEVERITY_CORRUPTION, 1); + ID3D12InfoQueue_SetBreakOnSeverity(info, D3D12_MESSAGE_SEVERITY_ERROR, 1); + ID3D12InfoQueue_Release(info); + } + + /* Enable DXGI Debug break */ + { + __profn("Enable dxgi debug break"); + IDXGIInfoQueue *dxgi_info = 0; + hr = DXGIGetDebugInterface1(0, &IID_IDXGIInfoQueue, (void **)&dxgi_info); + if (FAILED(hr)) + { + Panic(Lit("Failed to get DXGI debug interface")); + } + IDXGIInfoQueue_SetBreakOnSeverity(dxgi_info, DXGI_DEBUG_ALL, DXGI_INFO_QUEUE_MESSAGE_SEVERITY_CORRUPTION, 1); + IDXGIInfoQueue_SetBreakOnSeverity(dxgi_info, DXGI_DEBUG_ALL, DXGI_INFO_QUEUE_MESSAGE_SEVERITY_ERROR, 1); + IDXGIInfoQueue_Release(dxgi_info); + } +#endif + EndScratch(scratch); } +//- Queue initialization + +JobDef(GPU_D12_InitQueue, sig, id) +{ + GPU_D12_SharedState *g = &GPU_D12_shared_state; + GPU_D12_QueueDesc desc = sig->descs[id]; + Arena *perm = PermArena(); + HRESULT hr = 0; + + GPU_D12_Queue *queue = 0; + { + PushAlign(perm, CachelineSize); + queue = PushStruct(perm, GPU_D12_Queue); + PushAlign(perm, CachelineSize); + } + queue->desc = desc; + + D3D12_COMMAND_QUEUE_DESC d3d_desc = ZI; + d3d_desc.Type = desc.d3d_type; + d3d_desc.Priority = desc.d3d_priority; + hr = ID3D12Device_CreateCommandQueue(g->device, &d3d_desc, &IID_ID3D12CommandQueue, (void **)&queue->cq); + if (FAILED(hr)) + { + Panic(Lit("Failed to create command queue")); + } + + hr = ID3D12Device_CreateFence(g->device, 0, 0, &IID_ID3D12Fence, (void **)&queue->submit_fence); + if (FAILED(hr)) + { + Panic(Lit("Failed to create command queue fence")); + } + + g->queues[desc.kind] = queue; +} + //////////////////////////////// //~ Pipeline operations @@ -148,8 +259,8 @@ 
GPU_D12_Pipeline *GPU_D12_PipelineFromDesc(GPU_D12_PipelineDesc desc) GPU_D12_Queue *GPU_D12_QueueFromKind(GPU_QueueKind kind) { - /* TODO */ - return 0; + GPU_D12_SharedState *g = &GPU_D12_shared_state; + return g->queues[kind]; } //////////////////////////////// @@ -190,13 +301,13 @@ GPU_D12_RawCommandList *GPU_D12_BeginRawCommandList(GPU_QueueKind queue_kind) } cl->queue = queue; - HRESULT hr = ID3D12Device_CreateCommandAllocator(g->device, queue->desc.type, &IID_ID3D12CommandAllocator, (void **)&cl->ca); + HRESULT hr = ID3D12Device_CreateCommandAllocator(g->device, queue->desc.d3d_type, &IID_ID3D12CommandAllocator, (void **)&cl->ca); if (FAILED(hr)) { Panic(Lit("Failed to create command allocator")); } - hr = ID3D12Device_CreateCommandList(g->device, 0, queue->desc.type, cl->ca, 0, &IID_ID3D12GraphicsCommandList, (void **)&cl->cl); + hr = ID3D12Device_CreateCommandList(g->device, 0, queue->desc.d3d_type, cl->ca, 0, &IID_ID3D12GraphicsCommandList, (void **)&cl->cl); if (FAILED(hr)) { Panic(Lit("Failed to create command list")); @@ -227,7 +338,7 @@ GPU_D12_RawCommandList *GPU_D12_BeginRawCommandList(GPU_QueueKind queue_kind) return cl; } -void GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl) +u64 GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl) { GPU_D12_Queue *queue = cl->queue; @@ -243,11 +354,12 @@ void GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl) } /* Submit */ + u64 target = 0; { __profn("Execute"); Lock lock = LockE(&queue->submit_mutex); { - u64 target = ++queue->submit_fence_target; + target = ++queue->submit_fence_target; cl->submit_fence_target = target; /* Execute */ ID3D12CommandQueue_ExecuteCommandLists(queue->cq, 1, (ID3D12CommandList **)&cl->cl); @@ -257,6 +369,38 @@ void GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl) } Unlock(&lock); } + + return target; +} + +//////////////////////////////// +//~ Queue sync job + +JobDef(GPU_D12_StartQueueSync, _, __) +{ + GPU_D12_SharedState *g = &GPU_D12_shared_state; + HANDLE 
queue_fences_events[GPU_NumQueues] = ZI; + u64 queue_fences_seen[GPU_NumQueues] = ZI; + for (i32 i = 0; i < countof(queue_fences_events); ++i) + { + queue_fences_events[i] = CreateEvent(0, 0, 1, 0); + } + for (;;) + { + WaitForMultipleObjects(countof(queue_fences_events), queue_fences_events, 0, INFINITE); + for (GPU_QueueKind queue_kind = 0; queue_kind < GPU_NumQueues; ++queue_kind) + { + GPU_D12_Queue *queue = GPU_D12_QueueFromKind(queue_kind); + u64 last_seen = queue_fences_seen[queue_kind]; + u64 completed = ID3D12Fence_GetCompletedValue(queue->submit_fence); + if (completed > last_seen) + { + SetFence(&queue->sync_fence, completed); + queue_fences_seen[queue_kind] = completed; + ID3D12Fence_SetEventOnCompletion(queue->submit_fence, completed + 1, queue_fences_events[queue_kind]); + } + } + } } //////////////////////////////// @@ -267,6 +411,15 @@ void GPU_Startup(void) GPU_D12_Startup(); } +//////////////////////////////// +//~ @hookdef Fence hooks + +Fence *GPU_FenceFromQueue(GPU_QueueKind queue_kind) +{ + GPU_D12_Queue *queue = GPU_D12_QueueFromKind(queue_kind); + return &queue->sync_fence; +} + //////////////////////////////// //~ @hookdef Rasterizer helper hooks @@ -398,7 +551,7 @@ GPU_Resource *GPU_AcquireResource(GPU_ResourceDesc desc) d3d_desc.Alignment = 0; d3d_desc.Width = desc.texture.size.x; d3d_desc.Height = desc.texture.size.y; - d3d_desc.DepthOrArraySize = desc.texture.size.y; + d3d_desc.DepthOrArraySize = desc.texture.size.z; d3d_desc.MipLevels = 1; d3d_desc.SampleDesc.Count = 1; d3d_desc.SampleDesc.Quality = 0; @@ -465,7 +618,7 @@ Vec2I32 GPU_GetTextureSize(GPU_Resource *resource) //////////////////////////////// //~ @hookdef Command list hooks -GPU_CommandList *GPU_BeginCommandList(void) +GPU_CommandList *GPU_BeginCommandList(GPU_QueueKind queue_kind) { GPU_D12_FiberState *f = GPU_D12_FiberStateFromId(FiberId()); Arena *perm = PermArena(); @@ -479,23 +632,16 @@ GPU_CommandList *GPU_BeginCommandList(void) { cl = PushStruct(perm, 
GPU_D12_CommandList); } + cl->queue_kind = queue_kind; return (GPU_CommandList *)cl; } -u32 GPU_EndCommandList(GPU_CommandList *gpu_cl, Fence *fence) +u64 GPU_EndCommandList(GPU_CommandList *gpu_cl) { GPU_D12_FiberState *f = GPU_D12_FiberStateFromId(FiberId()); GPU_D12_CommandList *cl = (GPU_D12_CommandList *)gpu_cl; - - /* Determine queue kind */ -#if 0 - GPU_QueueKind queue_kind = GPU_QueueKind_Direct; -#else - GPU_QueueKind queue_kind = GPU_QueueKind_BackgroundCopy; - for (GPU_D12_Command *cmd = cl->first; cmd; cmd = cmd->next) - { - } -#endif + GPU_QueueKind queue_kind = cl->queue_kind; + GPU_D12_Queue *queue = GPU_D12_QueueFromKind(queue_kind); /* Begin dx12 command list */ GPU_D12_RawCommandList *dx12_cl = GPU_D12_BeginRawCommandList(queue_kind); @@ -654,7 +800,7 @@ u32 GPU_EndCommandList(GPU_CommandList *gpu_cl, Fence *fence) } /* End dx12 command list */ - GPU_D12_EndRawCommandList(dx12_cl); + u64 fence_target = GPU_D12_EndRawCommandList(dx12_cl); /* Free commands */ if (cl->last) @@ -666,7 +812,7 @@ u32 GPU_EndCommandList(GPU_CommandList *gpu_cl, Fence *fence) /* Free command list */ StackPush(f->first_free_command_list, cl); - return 1; + return fence_target; } //////////////////////////////// diff --git a/src/gpu/gpu_dx12/gpu_dx12.h b/src/gpu/gpu_dx12/gpu_dx12.h index 88930ce9..55c429e9 100644 --- a/src/gpu/gpu_dx12/gpu_dx12.h +++ b/src/gpu/gpu_dx12/gpu_dx12.h @@ -53,8 +53,9 @@ Struct(GPU_D12_Resource) Struct(GPU_D12_QueueDesc) { - enum D3D12_COMMAND_LIST_TYPE type; - enum D3D12_COMMAND_QUEUE_PRIORITY priority; + GPU_QueueKind kind; + D3D12_COMMAND_LIST_TYPE d3d_type; + D3D12_COMMAND_QUEUE_PRIORITY d3d_priority; String dbg_name; }; @@ -68,6 +69,8 @@ Struct(GPU_D12_Queue) u64 submit_fence_target; struct GPU_D12_RawCommandList *first_submitted_cl; struct GPU_D12_RawCommandList *last_submitted_cl; + + Fence sync_fence; }; //////////////////////////////// @@ -163,6 +166,8 @@ Struct(GPU_D12_CommandList) GPU_D12_Command *first; GPU_D12_Command *last; u64 
count; + + GPU_QueueKind queue_kind; }; //////////////////////////////// @@ -189,6 +194,9 @@ Struct(GPU_D12_SharedState) { GPU_D12_FiberState *fiber_states[MaxFibers]; + /* Queues */ + GPU_D12_Queue *queues[GPU_NumQueues]; + /* Resources */ Mutex free_resources_mutex; GPU_D12_Resource *first_free_resource; @@ -213,10 +221,14 @@ u64 GPU_D12_ReuseHashFromResourceDesc(GPU_ResourceDesc desc); void GPU_D12_Startup(void); //////////////////////////////// -//~ Device initialization +//~ Initialization +//- Device initialization void GPU_D12_InitDevice(void); +//- Queue initialization +JobDecl(GPU_D12_InitQueue, { GPU_D12_QueueDesc *descs; }); + //////////////////////////////// //~ Pipeline operations @@ -231,4 +243,9 @@ GPU_D12_Queue *GPU_D12_QueueFromKind(GPU_QueueKind kind); //~ Raw command list operations GPU_D12_RawCommandList *GPU_D12_BeginRawCommandList(GPU_QueueKind queue_kind); -void GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl); +u64 GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl); + +//////////////////////////////// +//~ Sync job + +JobDecl(GPU_D12_StartQueueSync, EmptySig); diff --git a/src/platform/platform_win32/platform_win32.c b/src/platform/platform_win32/platform_win32.c index 030d8880..d79638a3 100644 --- a/src/platform/platform_win32/platform_win32.c +++ b/src/platform/platform_win32/platform_win32.c @@ -101,7 +101,7 @@ void P_Startup(void) g->socks_arena = AcquireArena(Gibi(64)); //- Init timer - RunJob(P_W32_UpdateTimer); + RunJob(P_W32_StartTimerSync, .pool = JobPool_Hyper, .flags = JobFlag_Dedicated); } //////////////////////////////// @@ -179,11 +179,11 @@ P_W32_Window *P_W32_AcquireWindow(void) window->event_arenas[0] = AcquireArena(Gibi(64)); window->event_arenas[1] = AcquireArena(Gibi(64)); - /* Start window event thread */ - /* NOTE: This thread must finish building for the window to actually be + /* Start window event job */ + /* NOTE: This job must finish starting for the window to actually be * created and receive a HWND, 
because on Windows a the event proc must run on * the same thread that created the window. */ - window->window_thread = W32_StartThread(&P_W32_WindowThreadEntryFunc, window, Lit("Window thread"), PROF_THREAD_GROUP_WINDOW); + RunJob(P_W32_StartWindowMsgProcessing, .pool = JobPool_Hyper, .flags = JobFlag_Dedicated, .sig.window = window); YieldOnFence(&window->ready_fence, 1); return window; @@ -195,7 +195,7 @@ void P_W32_ReleaseWindow(P_W32_Window *window) Atomic32Set(&window->shutdown, 1); P_W32_SharedState *g = &P_W32_shared_state; P_W32_WakeWindow(window); - W32_WaitEndThread(window->window_thread); + YieldOnFence(&window->finished_fence, 1); Lock lock = LockE(&g->windows_mutex); { @@ -389,11 +389,11 @@ void P_W32_UpdateWindowFromSettings(P_W32_Window *window, P_WindowSettings *sett } //////////////////////////////// -//~ Win32 window thread +//~ Win32 window message processing -W32_ThreadDef(P_W32_WindowThreadEntryFunc, arg) +JobDef(P_W32_StartWindowMsgProcessing, sig, id) { - P_W32_Window *window = (P_W32_Window *)arg; + P_W32_Window *window = sig->window; /* Win32 limitation: Window must be initialized on same thread that processes events */ window->hwnd = P_W32_InitWindow(window); @@ -419,6 +419,7 @@ W32_ThreadDef(P_W32_WindowThreadEntryFunc, arg) /* Destroy window hwnd */ DestroyWindow(window->hwnd); + SetFence(&window->finished_fence, 1); } void P_W32_ProcessWindowEvent(P_W32_Window *window, P_WindowEvent event) @@ -867,7 +868,7 @@ P_Address P_W32_PlatformAddressFromWin32Address(P_W32_Address ws_addr) //////////////////////////////// //~ Timer job -JobDef(P_W32_UpdateTimer, _, __) +JobDef(P_W32_StartTimerSync, _, __) { P_W32_SharedState *g = &P_W32_shared_state; SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL); diff --git a/src/platform/platform_win32/platform_win32.h b/src/platform/platform_win32/platform_win32.h index 4c05d1d3..f43a9d85 100644 --- a/src/platform/platform_win32/platform_win32.h +++ 
b/src/platform/platform_win32/platform_win32.h @@ -42,6 +42,7 @@ Struct(P_W32_Window) HWND hwnd; Fence ready_fence; + Fence finished_fence; u16 utf16_high_surrogate_last_input; @@ -67,8 +68,6 @@ Struct(P_W32_Window) i32 current_event_arena_index; Arena *event_arenas[2]; - W32_Thread *window_thread; - Atomic32 shutdown; P_W32_Window *next_free; }; @@ -160,12 +159,16 @@ P_W32_Window *P_W32_AcquireWindow(void); void P_W32_ReleaseWindow(P_W32_Window *window); HWND P_W32_InitWindow(P_W32_Window *window); -//- Window settings +//////////////////////////////// +//~ Window settings + void P_W32_UpdateWindowFromSystem(P_W32_Window *window); void P_W32_UpdateWindowFromSettings(P_W32_Window *window, P_WindowSettings *settings); -//- Window thread -W32_ThreadDef(P_W32_WindowThreadEntryFunc, arg); +//////////////////////////////// +//~ Window message processing + +JobDecl(P_W32_StartWindowMsgProcessing, { P_W32_Window *window; }); void P_W32_ProcessWindowEvent(P_W32_Window *window, P_WindowEvent event); void P_W32_WakeWindow(P_W32_Window *window); LRESULT CALLBACK P_W32_Win32WindowProc(HWND hwnd, UINT msg, WPARAM wparam, LPARAM lparam); @@ -180,4 +183,4 @@ P_Address P_W32_PlatformAddressFromWin32Address(P_W32_Address ws_addr); //////////////////////////////// //~ Timer job -JobDecl(P_W32_UpdateTimer, EmptySig); +JobDecl(P_W32_StartTimerSync, EmptySig); diff --git a/src/pp/pp.c b/src/pp/pp.c index a7aba144..455df07f 100644 --- a/src/pp/pp.c +++ b/src/pp/pp.c @@ -406,13 +406,12 @@ GPU_Resource *AcquireGbuffer(GPU_Format format, Vec2I32 size) GPU_Resource *AcquireUploadBuffer(u32 element_count, u32 element_size, void *src) { __prof; - u64 size = element_size * element_count; GPU_ResourceDesc desc = ZI; desc.kind = GPU_ResourceKind_Buffer; desc.flags = GPU_ResourceFlag_None; desc.buffer.heap_kind = GPU_HeapKind_Upload; - desc.buffer.element_size = size; desc.buffer.element_count = element_count; + desc.buffer.element_capacity = element_count; desc.buffer.element_size = 
element_size; GPU_Resource *r = GPU_AcquireResource(desc); { @@ -2151,14 +2150,20 @@ void UpdateUser(P_Window *window) { __profn("Render"); + GPU_QueueKind gpu_render_queue = GPU_QueueKind_Direct; Rect ui_viewport = RectFromVec2(VEC2(0, 0), VEC2(g->ui_size.x, g->ui_size.y)); Rect render_viewport = RectFromVec2(VEC2(0, 0), VEC2(g->render_size.x, g->render_size.y)); + if (!g->gpu_render_fence) + { + g->gpu_render_fence = GPU_FenceFromQueue(gpu_render_queue); + } + /* Acquire gbuffers */ if (g->shade_target && !EqVec2I32(g->render_size, GPU_GetTextureSize(g->shade_target))) { __profn("Release render resources"); - YieldOnFence(&g->gpu_render_fence, g->gpu_render_fence_target); + YieldOnFence(g->gpu_render_fence, g->gpu_render_fence_target); GPU_ReleaseResource(g->albedo, GPU_ReleaseFlag_None); GPU_ReleaseResource(g->emittance, GPU_ReleaseFlag_None); GPU_ReleaseResource(g->emittance_flood_read, GPU_ReleaseFlag_None); @@ -2181,7 +2186,7 @@ void UpdateUser(P_Window *window) /* Acquire ui buffers */ if (g->ui_target && !EqVec2I32(g->ui_size, GPU_GetTextureSize(g->ui_target))) { - YieldOnFence(&g->gpu_render_fence, g->gpu_render_fence_target); + YieldOnFence(g->gpu_render_fence, g->gpu_render_fence_target); GPU_ReleaseResource(g->ui_target, GPU_ReleaseFlag_None); g->ui_target = 0; } @@ -2200,7 +2205,7 @@ void UpdateUser(P_Window *window) GPU_Resource *ui_shape_indices_buffer = AcquireUploadBufferFromArena(g->ui_shape_indices_count, g->ui_shape_indices_arena); GPU_Resource *grids_buffer = AcquireUploadBufferFromArena(g->grids_count, g->grids_arena); - GPU_CommandList *cl = GPU_BeginCommandList(); + GPU_CommandList *cl = GPU_BeginCommandList(gpu_render_queue); { __profn("Run render"); GPU_ProfN(cl, Lit("Run render")); @@ -2427,7 +2432,7 @@ void UpdateUser(P_Window *window) GPU_RasterizeMode_TriangleList); } } - g->gpu_render_fence_target += GPU_EndCommandList(cl, &g->gpu_render_fence); + g->gpu_render_fence_target = GPU_EndCommandList(cl); /* Release transfer buffers */ { @@ 
-2444,7 +2449,7 @@ void UpdateUser(P_Window *window) { DelayReleaseGpuResources_Sig *sig = PushStruct(job->arena, DelayReleaseGpuResources_Sig); job->count = countof(release_resources); - sig->begin_fence = &g->gpu_render_fence; + sig->begin_fence = g->gpu_render_fence; sig->begin_fence_target = g->gpu_render_fence_target; sig->resources = PushStructsNoZero(job->arena, GPU_Resource *, job->count); sig->flags = GPU_ReleaseFlag_Reuse; diff --git a/src/pp/pp.h b/src/pp/pp.h index 752cca1d..6fde0995 100644 --- a/src/pp/pp.h +++ b/src/pp/pp.h @@ -195,7 +195,7 @@ Struct(SharedUserState) u32 ui_shape_indices_count; u32 grids_count; - Fence gpu_render_fence; + Fence *gpu_render_fence; u64 gpu_render_fence_target; //- Bind state