gpu refactor progress

This commit is contained in:
jacob 2025-09-16 22:40:56 -05:00
parent 4d3a5b7c3e
commit 34294754c7
12 changed files with 363 additions and 209 deletions

View File

@ -713,12 +713,6 @@ Struct(ComputeShader) { Resource resource; };
//////////////////////////////// ////////////////////////////////
//~ Fibers //~ Fibers
/* If virtual fibers are enabled, each fiber will get its own OS thread,
* and fiber suspend/resume will be emulated using OS thread primitives.
* This is slow but allows for easier debugging in tricky cases
* since the debugger won't be confused by fiber context switching. */
#define VirtualFibersEnabled 0
# define MaxFibers 4096 # define MaxFibers 4096
StaticAssert(MaxFibers < I16Max); /* MaxFibers should fit in FiberId */ StaticAssert(MaxFibers < I16Max); /* MaxFibers should fit in FiberId */
@ -730,7 +724,6 @@ StaticAssert(MaxFibers < I16Max); /* MaxFibers should fit in FiberId */
# endif # endif
#endif #endif
//////////////////////////////// ////////////////////////////////
//~ Exit callback types //~ Exit callback types

View File

@ -39,6 +39,29 @@ Enum(JobPool)
typedef void JobFunc(void *, i32); typedef void JobFunc(void *, i32);
Enum(JobFlag)
{
JobFlag_None = 0,
/* A dedicated job is a heavy weight job that will receive its own OS
* thread and will never yield. When the fiber running the job suspends
* itself, the dedicated thread will perform a blocking wait rather than
* yielding the thread to another fiber. This is mainly useful for long-running
* dispatcher-esque jobs that block on OS primitives, since occupying a
* worker thread (and thereby preventing non-blocking jobs from running on
* that worker) is unwanted.
*
* For example, Win32 window message processing is required by the OS to
* occur on the same thread that initially created the window, which means
* it actually must run inside a dedicated job to prevent message processing
* from yielding & resuming on another thread. The message processing loop
* can block until messages are received from the OS without having to
* occupy a job worker while it blocks, and can then wake yielding
* jobs onto job worker pools based on the messages it received.
*/
JobFlag_Dedicated = (1 << 0),
};
Struct(Job) Struct(Job)
{ {
/* Internal */ /* Internal */
@ -51,6 +74,7 @@ Struct(Job)
JobPool pool; JobPool pool;
/* Configurable between OpenJob & CloseJob */ /* Configurable between OpenJob & CloseJob */
JobFlag flags;
i32 count; i32 count;
Fence *fence; Fence *fence;
void *sig; void *sig;
@ -74,7 +98,7 @@ void ResumeFibers(i16 fiber_ids_count, i16 *fiber_ids); /* NOTE: Must only be c
#define JobDecl(job, sigdef) \ #define JobDecl(job, sigdef) \
typedef struct job##_Sig sigdef job##_Sig; \ typedef struct job##_Sig sigdef job##_Sig; \
Struct(job##_Desc) { JobFunc *func; JobPool pool; u32 count; Fence *fence; job##_Sig sig; }; \ Struct(job##_Desc) { JobFunc *func; JobPool pool; u32 count; Fence *fence; JobFlag flags; job##_Sig sig; }; \
void job(job##_Sig *, i32); \ void job(job##_Sig *, i32); \
StaticAssert(1) StaticAssert(1)
@ -103,6 +127,7 @@ do {
Job *__job = OpenJob(__desc.func, __desc.pool); \ Job *__job = OpenJob(__desc.func, __desc.pool); \
__job->count = __desc.count; \ __job->count = __desc.count; \
__job->fence = __desc.fence; \ __job->fence = __desc.fence; \
__job->flags = __desc.flags; \
__job->sig = PushStructNoZero(__job->arena, job_func##_Sig); \ __job->sig = PushStructNoZero(__job->arena, job_func##_Sig); \
CopyBytes(__job->sig, &__desc.sig, sizeof(__desc.sig)); \ CopyBytes(__job->sig, &__desc.sig, sizeof(__desc.sig)); \
CloseJob(__job); \ CloseJob(__job); \
@ -110,20 +135,3 @@ do {
Job *OpenJob(JobFunc *func, JobPool pool_kind); Job *OpenJob(JobFunc *func, JobPool pool_kind);
u32 CloseJob(Job *job); u32 CloseJob(Job *job);
////////////////////////////////
//~ @hookdecl Dedicated job operations
/* A dedicated job is a heavy weight job that will not operate inside of any
* job pool. As such, it receives its own dedicated thread, and never yields to
* other fibers. Instead of yielding when the fiber suspends, it performs a blocking
* wait that puts the OS thread to sleep. This is mainly useful for
* implementing long-running blocking dispatcher-like tasks for subsystems.
*
* For example, Win32 window message processing is required by the OS to occur
* on the same thread that initially created the window, which means it
* actually must run inside a dedicated job to prevent message processing from
* yielding & resuming on another thread.
*/
void RunDedicatedJob(JobFunc job_func);

View File

@ -95,6 +95,10 @@ void InitJobSystem(void)
//////////////////////////////// ////////////////////////////////
//~ Win32 thread //~ Win32 thread
JobDef(W32_DummyJob, sig, id)
{
}
DWORD WINAPI W32_Win32ThreadProc(LPVOID vt) DWORD WINAPI W32_Win32ThreadProc(LPVOID vt)
{ {
/* Convert thread to fiber */ /* Convert thread to fiber */
@ -141,15 +145,7 @@ W32_Thread *W32_StartThread(W32_ThreadFunc *entry_point, void *thread_udata, Str
t->thread_udata = thread_udata; t->thread_udata = thread_udata;
t->profiler_group = profiler_group; t->profiler_group = profiler_group;
t->handle = CreateThread( t->handle = CreateThread(0, W32_FiberStackSize, W32_Win32ThreadProc, t, 0, 0);
0,
W32_FiberStackSize,
W32_Win32ThreadProc,
t,
0,
0
);
if (!t->handle) if (!t->handle)
{ {
Panic(Lit("Failed to create thread")); Panic(Lit("Failed to create thread"));
@ -278,7 +274,7 @@ W32_Fiber *W32_AcquireFiber(W32_JobPool *pool)
{ {
__profn("CreateFiber"); __profn("CreateFiber");
fiber->pool = pool->kind; fiber->pool = pool->kind;
#if VirtualFibersEnabled #if VIRTUAL_FIBERS
fiber->addr = CreateThread(0, W32_FiberStackSize, W32_VirtualFiberEntryPoint, (void *)(i64)fiber_id, 0, 0); fiber->addr = CreateThread(0, W32_FiberStackSize, W32_VirtualFiberEntryPoint, (void *)(i64)fiber_id, 0, 0);
#else #else
fiber->addr = CreateFiber(W32_FiberStackSize, W32_FiberEntryPoint, (void *)(i64)fiber_id); fiber->addr = CreateFiber(W32_FiberStackSize, W32_FiberEntryPoint, (void *)(i64)fiber_id);
@ -289,7 +285,7 @@ W32_Fiber *W32_AcquireFiber(W32_JobPool *pool)
/* Fiber is not a part of a job pool, convert thread to fiber */ /* Fiber is not a part of a job pool, convert thread to fiber */
__profn("ConvertThreadToFiber"); __profn("ConvertThreadToFiber");
fiber->addr = ConvertThreadToFiber((void *)(i64)fiber_id); fiber->addr = ConvertThreadToFiber((void *)(i64)fiber_id);
#if VirtualFibersEnabled #if VIRTUAL_FIBERS
fiber->addr = GetCurrentThread(); fiber->addr = GetCurrentThread();
#endif #endif
} }
@ -319,7 +315,7 @@ ForceInline W32_Fiber *W32_FiberFromId(i16 id)
void W32_SwitchToFiber(W32_Fiber *target) void W32_SwitchToFiber(W32_Fiber *target)
{ {
#if VirtualFibersEnabled #if VIRTUAL_FIBERS
W32_Fiber *self = W32_FiberFromId(FiberId()); W32_Fiber *self = W32_FiberFromId(FiberId());
Atomic8Set(&self->virtual_yield, 1); Atomic8Set(&self->virtual_yield, 1);
/* Signal virtual target */ /* Signal virtual target */
@ -351,9 +347,9 @@ void W32_FiberEntryPoint(void *_)
W32_JobPool *pool = &W32_shared_job_state.job_pools[fiber->pool]; W32_JobPool *pool = &W32_shared_job_state.job_pools[fiber->pool];
JobPool pool_kind = fiber->pool; JobPool pool_kind = fiber->pool;
char *fiber_name_cstr = fiber->name_cstr; char *fiber_name_cstr = fiber->name_cstr;
__prof_fiber_enter(fiber_name_cstr, PROF_THREAD_GROUP_FIBERS - Mebi(pool_kind) + Kibi(1) + fiber->id);
for (;;) for (;;)
{ {
__prof_fiber_enter(fiber_name_cstr, PROF_THREAD_GROUP_FIBERS - Mebi(pool_kind) + Kibi(1) + fiber->id);
W32_Task *task = fiber->task; W32_Task *task = fiber->task;
Job *job = task->job; Job *job = task->job;
@ -542,26 +538,12 @@ void SuspendFiber(void)
__prof; __prof;
i16 fiber_id = FiberId(); i16 fiber_id = FiberId();
W32_Fiber *fiber = W32_FiberFromId(FiberId()); W32_Fiber *fiber = W32_FiberFromId(FiberId());
i16 return_id = fiber->return_id;
__prof_fiber_leave(); __prof_fiber_leave();
if (return_id > 0)
{ {
/* Suspend task fiber (return control flow to parent/worker fiber) */
Atomic8Set(&fiber->status, W32_FiberStatus_Suspending); Atomic8Set(&fiber->status, W32_FiberStatus_Suspending);
W32_Fiber *parent_fiber = W32_FiberFromId(return_id); W32_Fiber *parent_fiber = W32_FiberFromId(fiber->return_id);
W32_SwitchToFiber(parent_fiber); W32_SwitchToFiber(parent_fiber);
} }
else
{
/* Suspend dedicated fiber (block thread) */
Atomic8Set(&fiber->status, W32_FiberStatus_Suspended);
i8 status = W32_FiberStatus_Suspended;
while (status != W32_FiberStatus_None)
{
WaitOnAddress(&fiber->status, &status, sizeof(status), INFINITE);
status = Atomic8Fetch(&fiber->status);
}
}
__prof_fiber_enter(fiber->name_cstr, PROF_THREAD_GROUP_FIBERS - Mebi(fiber->pool) + Kibi(1) + fiber->id); __prof_fiber_enter(fiber->name_cstr, PROF_THREAD_GROUP_FIBERS - Mebi(fiber->pool) + Kibi(1) + fiber->id);
} }
@ -587,21 +569,21 @@ void ResumeFibers(i16 fiber_ids_count, i16 *fiber_ids)
/* Update fiber status */ /* Update fiber status */
Atomic8Set(&fiber->status, W32_FiberStatus_None); Atomic8Set(&fiber->status, W32_FiberStatus_None);
i16 return_id = fiber->return_id; W32_Task *task = fiber->task;
if (return_id > 0) // if (task->job->flags & JobFlag_Dedicated)
if (0)
{
/* TODO: Wake dedicated fiber right now */
WakeByAddressSingle(&fiber->status);
}
else
{ {
/* Group task based on pool */ /* Group task based on pool */
W32_Task *task = fiber->task;
JobPool pool_kind = fiber->pool; JobPool pool_kind = fiber->pool;
W32_TaskList *pool_tasks = &tasks_by_pool[pool_kind]; W32_TaskList *pool_tasks = &tasks_by_pool[pool_kind];
QueuePush(pool_tasks->first, pool_tasks->last, task); QueuePush(pool_tasks->first, pool_tasks->last, task);
++pool_tasks->count; ++pool_tasks->count;
} }
else
{
/* Wake dedicated fiber right now */
WakeByAddressSingle(&fiber->status);
}
} }
/* Submit tasks */ /* Submit tasks */
@ -689,11 +671,17 @@ u32 CloseJob(Job *job)
{ {
TempArena scratch = BeginScratchNoConflict(); TempArena scratch = BeginScratchNoConflict();
W32_JobPool *pool = &W32_shared_job_state.job_pools[job->pool]; JobPool pool_kind = job->pool;
W32_JobPool *pool = &W32_shared_job_state.job_pools[pool_kind];
u32 num_tasks = job->count; u32 num_tasks = job->count;
if (num_tasks > 0) if (num_tasks == 0)
{ {
Assert(0);
job->func = W32_DummyJob;
num_tasks = 1;
}
/* Allocate tasks from free list */ /* Allocate tasks from free list */
u32 num_tasks_allocated = 0; u32 num_tasks_allocated = 0;
W32_Task **tasks_array = PushStructsNoZero(scratch.arena, W32_Task *, num_tasks); W32_Task **tasks_array = PushStructsNoZero(scratch.arena, W32_Task *, num_tasks);
@ -732,6 +720,8 @@ u32 CloseJob(Job *job)
PushAlign(perm, CachelineSize); PushAlign(perm, CachelineSize);
} }
/* FIXME: Handle dedicated jobs separately */
/* Generate task list */ /* Generate task list */
W32_TaskList tasks = ZI; W32_TaskList tasks = ZI;
for (u32 i = 0; i < num_tasks; ++i) for (u32 i = 0; i < num_tasks; ++i)
@ -773,21 +763,7 @@ u32 CloseJob(Job *job)
WakeByAddressSingle(&pool->tasks_count); WakeByAddressSingle(&pool->tasks_count);
} }
} }
}
else if (job->fence)
{
FetchAddFence(job->fence, 1);
}
EndScratch(scratch); EndScratch(scratch);
return 1; return 1;
} }
////////////////////////////////
//~ @hookdef Dedicated job operations
void RunDedicatedJob(JobFunc job_func)
{
/* TODO: Implement */
Assert(0);
}

View File

@ -178,6 +178,7 @@ DWORD WINAPI W32_Win32ThreadProc(LPVOID vt);
W32_Thread *W32_StartThread(W32_ThreadFunc *entry_point, void *thread_udata, String thread_name, i32 profiler_group); W32_Thread *W32_StartThread(W32_ThreadFunc *entry_point, void *thread_udata, String thread_name, i32 profiler_group);
b32 W32_TryEndThread(W32_Thread *thread, f32 timeout_seconds); b32 W32_TryEndThread(W32_Thread *thread, f32 timeout_seconds);
void W32_WaitEndThread(W32_Thread *thread); void W32_WaitEndThread(W32_Thread *thread);
JobDecl(W32_DummyJob, EmptySig);
//////////////////////////////// ////////////////////////////////
//~ Fiber operations //~ Fiber operations

View File

@ -69,6 +69,14 @@
#define FLOOD_DEBUG 0 #define FLOOD_DEBUG 0
#define GPU_DEBUG 1
/* If virtual fibers are enabled, each fiber will get its own OS thread,
* and fiber suspend/resume will be emulated using OS thread primitives.
* This is slow but allows for easier debugging in tricky cases
* since the debugger won't be confused by fiber context switching. */
#define VIRTUAL_FIBERS 0
/* If enabled, bitbuffs will insert/verify magic numbers & length for each read & write */ /* If enabled, bitbuffs will insert/verify magic numbers & length for each read & write */
#define BITBUFF_DEBUG 0 #define BITBUFF_DEBUG 0
#define BITBUFF_TEST RtcIsEnabled #define BITBUFF_TEST RtcIsEnabled

View File

@ -259,15 +259,6 @@ Struct(GPU_Scissor)
f32 bottom; f32 bottom;
}; };
////////////////////////////////
//~ Fence types
Struct(GPU_Fence)
{
u64 targets[GPU_NumQueues];
u32 num_targets;
};
//////////////////////////////// ////////////////////////////////
//~ Memory info types //~ Memory info types
@ -281,6 +272,11 @@ Struct(GPU_MemoryInfo)
void GPU_Startup(void); void GPU_Startup(void);
////////////////////////////////
//~ @hookdecl Fence operations
Fence *GPU_FenceFromQueue(GPU_QueueKind queue_kind);
//////////////////////////////// ////////////////////////////////
//~ @hookdecl Rasterizer helpers //~ @hookdecl Rasterizer helpers
@ -299,8 +295,8 @@ Vec2I32 GPU_GetTextureSize(GPU_Resource *resource);
//////////////////////////////// ////////////////////////////////
//~ @hookdecl Command list operations //~ @hookdecl Command list operations
GPU_CommandList *GPU_BeginCommandList(void); GPU_CommandList *GPU_BeginCommandList(GPU_QueueKind queue_kind);
u32 GPU_EndCommandList(GPU_CommandList *cl, Fence *fence); u64 GPU_EndCommandList(GPU_CommandList *cl); /* Returns the value that the queue's fence will be set to once the command is completed */
//////////////////////////////// ////////////////////////////////
//~ @hookdecl Profiling helpers //~ @hookdecl Profiling helpers

View File

@ -1,7 +1,5 @@
GPU_D12_SharedState GPU_D12_shared_state = ZI; GPU_D12_SharedState GPU_D12_shared_state = ZI;
//////////////////////////////// ////////////////////////////////
//~ Helpers //~ Helpers
@ -53,18 +51,66 @@ u64 GPU_D12_ReuseHashFromResourceDesc(GPU_ResourceDesc desc)
void GPU_D12_Startup(void) void GPU_D12_Startup(void)
{ {
/* Init device */
GPU_D12_InitDevice(); GPU_D12_InitDevice();
/* Init queues */
{
GPU_D12_QueueDesc descs[] = {
{.kind = GPU_QueueKind_Direct, .d3d_type = D3D12_COMMAND_LIST_TYPE_DIRECT, .d3d_priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, .dbg_name = Lit("Direct queue") },
{.kind = GPU_QueueKind_Compute, .d3d_type = D3D12_COMMAND_LIST_TYPE_COMPUTE, .d3d_priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, .dbg_name = Lit("Compute queue") },
{.kind = GPU_QueueKind_Copy, .d3d_type = D3D12_COMMAND_LIST_TYPE_COPY, .d3d_priority = D3D12_COMMAND_QUEUE_PRIORITY_HIGH, .dbg_name = Lit("Copy queue") },
{.kind = GPU_QueueKind_BackgroundCopy, .d3d_type = D3D12_COMMAND_LIST_TYPE_COPY, .d3d_priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, .dbg_name = Lit("Background copy queue") }
};
u32 job_count = 0; Fence job_fence = ZI;
job_count += RunJob(GPU_D12_InitQueue, .count = GPU_NumQueues, .sig.descs = descs, .fence = &job_fence);
YieldOnFence(&job_fence, job_count);
}
/* Start queue sync job */
RunJob(GPU_D12_StartQueueSync, .pool = JobPool_Hyper, .flags = JobFlag_Dedicated);
} }
//////////////////////////////// ////////////////////////////////
//~ Device initialization //~ Initialization
//- Device initialization
void GPU_D12_InitDevice(void) void GPU_D12_InitDevice(void)
{ {
GPU_D12_SharedState *g = &GPU_D12_shared_state; GPU_D12_SharedState *g = &GPU_D12_shared_state;
TempArena scratch = BeginScratchNoConflict(); TempArena scratch = BeginScratchNoConflict();
HRESULT hr = 0; HRESULT hr = 0;
/* Enable debug layer */
u32 dxgi_factory_flags = 0; u32 dxgi_factory_flags = 0;
#if GPU_DEBUG
{
__profn("Enable debug layer");
ID3D12Debug *debug_controller0 = 0;
hr = D3D12GetDebugInterface(&IID_ID3D12Debug, (void **)&debug_controller0);
if (FAILED(hr))
{
Panic(Lit("Failed to create ID3D12Debug0"));
}
ID3D12Debug1 *debug_controller1 = 0;
hr = ID3D12Debug_QueryInterface(debug_controller0, &IID_ID3D12Debug1, (void **)&debug_controller1);
if (FAILED(hr))
{
Panic(Lit("Failed to create ID3D12Debug1"));
}
ID3D12Debug_EnableDebugLayer(debug_controller0);
/* FIXME: Enable this */
//ID3D12Debug1_SetEnableGPUBasedValidation(debug_controller1, 1);
ID3D12Debug_Release(debug_controller1);
ID3D12Debug_Release(debug_controller0);
dxgi_factory_flags |= DXGI_CREATE_FACTORY_DEBUG;
}
#endif
/* Create factory */ /* Create factory */
{ {
@ -131,9 +177,74 @@ void GPU_D12_InitDevice(void)
g->device = device; g->device = device;
} }
#if GPU_DEBUG
/* Enable D3D12 Debug break */
{
__profn("Enable d3d12 debug break");
ID3D12InfoQueue *info = 0;
hr = ID3D12Device_QueryInterface(g->device, &IID_ID3D12InfoQueue, (void **)&info);
if (FAILED(hr))
{
Panic(Lit("Failed to query ID3D12Device interface"));
}
ID3D12InfoQueue_SetBreakOnSeverity(info, D3D12_MESSAGE_SEVERITY_CORRUPTION, 1);
ID3D12InfoQueue_SetBreakOnSeverity(info, D3D12_MESSAGE_SEVERITY_ERROR, 1);
ID3D12InfoQueue_Release(info);
}
/* Enable DXGI Debug break */
{
__profn("Enable dxgi debug break");
IDXGIInfoQueue *dxgi_info = 0;
hr = DXGIGetDebugInterface1(0, &IID_IDXGIInfoQueue, (void **)&dxgi_info);
if (FAILED(hr))
{
Panic(Lit("Failed to get DXGI debug interface"));
}
IDXGIInfoQueue_SetBreakOnSeverity(dxgi_info, DXGI_DEBUG_ALL, DXGI_INFO_QUEUE_MESSAGE_SEVERITY_CORRUPTION, 1);
IDXGIInfoQueue_SetBreakOnSeverity(dxgi_info, DXGI_DEBUG_ALL, DXGI_INFO_QUEUE_MESSAGE_SEVERITY_ERROR, 1);
IDXGIInfoQueue_Release(dxgi_info);
}
#endif
EndScratch(scratch); EndScratch(scratch);
} }
//- Queue initialization
JobDef(GPU_D12_InitQueue, sig, id)
{
GPU_D12_SharedState *g = &GPU_D12_shared_state;
GPU_D12_QueueDesc desc = sig->descs[id];
Arena *perm = PermArena();
HRESULT hr = 0;
GPU_D12_Queue *queue = 0;
{
PushAlign(perm, CachelineSize);
queue = PushStruct(perm, GPU_D12_Queue);
PushAlign(perm, CachelineSize);
}
queue->desc = desc;
D3D12_COMMAND_QUEUE_DESC d3d_desc = ZI;
d3d_desc.Type = desc.d3d_type;
d3d_desc.Priority = desc.d3d_priority;
hr = ID3D12Device_CreateCommandQueue(g->device, &d3d_desc, &IID_ID3D12CommandQueue, (void **)&queue->cq);
if (FAILED(hr))
{
Panic(Lit("Failed to create command queue"));
}
hr = ID3D12Device_CreateFence(g->device, 0, 0, &IID_ID3D12Fence, (void **)&queue->submit_fence);
if (FAILED(hr))
{
Panic(Lit("Failed to create command queue fence"));
}
g->queues[desc.kind] = queue;
}
//////////////////////////////// ////////////////////////////////
//~ Pipeline operations //~ Pipeline operations
@ -148,8 +259,8 @@ GPU_D12_Pipeline *GPU_D12_PipelineFromDesc(GPU_D12_PipelineDesc desc)
GPU_D12_Queue *GPU_D12_QueueFromKind(GPU_QueueKind kind) GPU_D12_Queue *GPU_D12_QueueFromKind(GPU_QueueKind kind)
{ {
/* TODO */ GPU_D12_SharedState *g = &GPU_D12_shared_state;
return 0; return g->queues[kind];
} }
//////////////////////////////// ////////////////////////////////
@ -190,13 +301,13 @@ GPU_D12_RawCommandList *GPU_D12_BeginRawCommandList(GPU_QueueKind queue_kind)
} }
cl->queue = queue; cl->queue = queue;
HRESULT hr = ID3D12Device_CreateCommandAllocator(g->device, queue->desc.type, &IID_ID3D12CommandAllocator, (void **)&cl->ca); HRESULT hr = ID3D12Device_CreateCommandAllocator(g->device, queue->desc.d3d_type, &IID_ID3D12CommandAllocator, (void **)&cl->ca);
if (FAILED(hr)) if (FAILED(hr))
{ {
Panic(Lit("Failed to create command allocator")); Panic(Lit("Failed to create command allocator"));
} }
hr = ID3D12Device_CreateCommandList(g->device, 0, queue->desc.type, cl->ca, 0, &IID_ID3D12GraphicsCommandList, (void **)&cl->cl); hr = ID3D12Device_CreateCommandList(g->device, 0, queue->desc.d3d_type, cl->ca, 0, &IID_ID3D12GraphicsCommandList, (void **)&cl->cl);
if (FAILED(hr)) if (FAILED(hr))
{ {
Panic(Lit("Failed to create command list")); Panic(Lit("Failed to create command list"));
@ -227,7 +338,7 @@ GPU_D12_RawCommandList *GPU_D12_BeginRawCommandList(GPU_QueueKind queue_kind)
return cl; return cl;
} }
void GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl) u64 GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl)
{ {
GPU_D12_Queue *queue = cl->queue; GPU_D12_Queue *queue = cl->queue;
@ -243,11 +354,12 @@ void GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl)
} }
/* Submit */ /* Submit */
u64 target = 0;
{ {
__profn("Execute"); __profn("Execute");
Lock lock = LockE(&queue->submit_mutex); Lock lock = LockE(&queue->submit_mutex);
{ {
u64 target = ++queue->submit_fence_target; target = ++queue->submit_fence_target;
cl->submit_fence_target = target; cl->submit_fence_target = target;
/* Execute */ /* Execute */
ID3D12CommandQueue_ExecuteCommandLists(queue->cq, 1, (ID3D12CommandList **)&cl->cl); ID3D12CommandQueue_ExecuteCommandLists(queue->cq, 1, (ID3D12CommandList **)&cl->cl);
@ -257,6 +369,38 @@ void GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl)
} }
Unlock(&lock); Unlock(&lock);
} }
return target;
}
////////////////////////////////
//~ Queue sync job
JobDef(GPU_D12_StartQueueSync, _, __)
{
GPU_D12_SharedState *g = &GPU_D12_shared_state;
HANDLE queue_fences_events[GPU_NumQueues] = ZI;
u64 queue_fences_seen[GPU_NumQueues] = ZI;
for (i32 i = 0; i < countof(queue_fences_events); ++i)
{
queue_fences_events[i] = CreateEvent(0, 0, 1, 0);
}
for (;;)
{
WaitForMultipleObjects(countof(queue_fences_events), queue_fences_events, 0, INFINITE);
for (GPU_QueueKind queue_kind = 0; queue_kind < GPU_NumQueues; ++queue_kind)
{
GPU_D12_Queue *queue = GPU_D12_QueueFromKind(queue_kind);
u64 last_seen = queue_fences_seen[queue_kind];
u64 completed = ID3D12Fence_GetCompletedValue(queue->submit_fence);
if (completed > last_seen)
{
SetFence(&queue->sync_fence, completed);
queue_fences_seen[queue_kind] = completed;
ID3D12Fence_SetEventOnCompletion(queue->submit_fence, completed + 1, queue_fences_events[queue_kind]);
}
}
}
} }
//////////////////////////////// ////////////////////////////////
@ -267,6 +411,15 @@ void GPU_Startup(void)
GPU_D12_Startup(); GPU_D12_Startup();
} }
////////////////////////////////
//~ @hookdecl Fence hooks
Fence *GPU_FenceFromQueue(GPU_QueueKind queue_kind)
{
GPU_D12_Queue *queue = GPU_D12_QueueFromKind(queue_kind);
return &queue->sync_fence;
}
//////////////////////////////// ////////////////////////////////
//~ @hookdef Rasterizer helper hooks //~ @hookdef Rasterizer helper hooks
@ -398,7 +551,7 @@ GPU_Resource *GPU_AcquireResource(GPU_ResourceDesc desc)
d3d_desc.Alignment = 0; d3d_desc.Alignment = 0;
d3d_desc.Width = desc.texture.size.x; d3d_desc.Width = desc.texture.size.x;
d3d_desc.Height = desc.texture.size.y; d3d_desc.Height = desc.texture.size.y;
d3d_desc.DepthOrArraySize = desc.texture.size.y; d3d_desc.DepthOrArraySize = desc.texture.size.z;
d3d_desc.MipLevels = 1; d3d_desc.MipLevels = 1;
d3d_desc.SampleDesc.Count = 1; d3d_desc.SampleDesc.Count = 1;
d3d_desc.SampleDesc.Quality = 0; d3d_desc.SampleDesc.Quality = 0;
@ -465,7 +618,7 @@ Vec2I32 GPU_GetTextureSize(GPU_Resource *resource)
//////////////////////////////// ////////////////////////////////
//~ @hookdef Command list hooks //~ @hookdef Command list hooks
GPU_CommandList *GPU_BeginCommandList(void) GPU_CommandList *GPU_BeginCommandList(GPU_QueueKind queue_kind)
{ {
GPU_D12_FiberState *f = GPU_D12_FiberStateFromId(FiberId()); GPU_D12_FiberState *f = GPU_D12_FiberStateFromId(FiberId());
Arena *perm = PermArena(); Arena *perm = PermArena();
@ -479,23 +632,16 @@ GPU_CommandList *GPU_BeginCommandList(void)
{ {
cl = PushStruct(perm, GPU_D12_CommandList); cl = PushStruct(perm, GPU_D12_CommandList);
} }
cl->queue_kind = queue_kind;
return (GPU_CommandList *)cl; return (GPU_CommandList *)cl;
} }
u32 GPU_EndCommandList(GPU_CommandList *gpu_cl, Fence *fence) u64 GPU_EndCommandList(GPU_CommandList *gpu_cl)
{ {
GPU_D12_FiberState *f = GPU_D12_FiberStateFromId(FiberId()); GPU_D12_FiberState *f = GPU_D12_FiberStateFromId(FiberId());
GPU_D12_CommandList *cl = (GPU_D12_CommandList *)gpu_cl; GPU_D12_CommandList *cl = (GPU_D12_CommandList *)gpu_cl;
GPU_QueueKind queue_kind = cl->queue_kind;
/* Determine queue kind */ GPU_D12_Queue *queue = GPU_D12_QueueFromKind(queue_kind);
#if 0
GPU_QueueKind queue_kind = GPU_QueueKind_Direct;
#else
GPU_QueueKind queue_kind = GPU_QueueKind_BackgroundCopy;
for (GPU_D12_Command *cmd = cl->first; cmd; cmd = cmd->next)
{
}
#endif
/* Begin dx12 command list */ /* Begin dx12 command list */
GPU_D12_RawCommandList *dx12_cl = GPU_D12_BeginRawCommandList(queue_kind); GPU_D12_RawCommandList *dx12_cl = GPU_D12_BeginRawCommandList(queue_kind);
@ -654,7 +800,7 @@ u32 GPU_EndCommandList(GPU_CommandList *gpu_cl, Fence *fence)
} }
/* End dx12 command list */ /* End dx12 command list */
GPU_D12_EndRawCommandList(dx12_cl); u64 fence_target = GPU_D12_EndRawCommandList(dx12_cl);
/* Free commands */ /* Free commands */
if (cl->last) if (cl->last)
@ -666,7 +812,7 @@ u32 GPU_EndCommandList(GPU_CommandList *gpu_cl, Fence *fence)
/* Free command list */ /* Free command list */
StackPush(f->first_free_command_list, cl); StackPush(f->first_free_command_list, cl);
return 1; return fence_target;
} }
//////////////////////////////// ////////////////////////////////

View File

@ -53,8 +53,9 @@ Struct(GPU_D12_Resource)
Struct(GPU_D12_QueueDesc) Struct(GPU_D12_QueueDesc)
{ {
enum D3D12_COMMAND_LIST_TYPE type; GPU_QueueKind kind;
enum D3D12_COMMAND_QUEUE_PRIORITY priority; D3D12_COMMAND_LIST_TYPE d3d_type;
D3D12_COMMAND_QUEUE_PRIORITY d3d_priority;
String dbg_name; String dbg_name;
}; };
@ -68,6 +69,8 @@ Struct(GPU_D12_Queue)
u64 submit_fence_target; u64 submit_fence_target;
struct GPU_D12_RawCommandList *first_submitted_cl; struct GPU_D12_RawCommandList *first_submitted_cl;
struct GPU_D12_RawCommandList *last_submitted_cl; struct GPU_D12_RawCommandList *last_submitted_cl;
Fence sync_fence;
}; };
//////////////////////////////// ////////////////////////////////
@ -163,6 +166,8 @@ Struct(GPU_D12_CommandList)
GPU_D12_Command *first; GPU_D12_Command *first;
GPU_D12_Command *last; GPU_D12_Command *last;
u64 count; u64 count;
GPU_QueueKind queue_kind;
}; };
//////////////////////////////// ////////////////////////////////
@ -189,6 +194,9 @@ Struct(GPU_D12_SharedState)
{ {
GPU_D12_FiberState *fiber_states[MaxFibers]; GPU_D12_FiberState *fiber_states[MaxFibers];
/* Queues */
GPU_D12_Queue *queues[GPU_NumQueues];
/* Resources */ /* Resources */
Mutex free_resources_mutex; Mutex free_resources_mutex;
GPU_D12_Resource *first_free_resource; GPU_D12_Resource *first_free_resource;
@ -213,10 +221,14 @@ u64 GPU_D12_ReuseHashFromResourceDesc(GPU_ResourceDesc desc);
void GPU_D12_Startup(void); void GPU_D12_Startup(void);
//////////////////////////////// ////////////////////////////////
//~ Device initialization //~ Initialization
//- Device initialization
void GPU_D12_InitDevice(void); void GPU_D12_InitDevice(void);
//- Queue initialization
JobDecl(GPU_D12_InitQueue, { GPU_D12_QueueDesc *descs; });
//////////////////////////////// ////////////////////////////////
//~ Pipeline operations //~ Pipeline operations
@ -231,4 +243,9 @@ GPU_D12_Queue *GPU_D12_QueueFromKind(GPU_QueueKind kind);
//~ Raw command list operations //~ Raw command list operations
GPU_D12_RawCommandList *GPU_D12_BeginRawCommandList(GPU_QueueKind queue_kind); GPU_D12_RawCommandList *GPU_D12_BeginRawCommandList(GPU_QueueKind queue_kind);
void GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl); u64 GPU_D12_EndRawCommandList(GPU_D12_RawCommandList *cl);
////////////////////////////////
//~ Sync job
JobDecl(GPU_D12_StartQueueSync, EmptySig);

View File

@ -101,7 +101,7 @@ void P_Startup(void)
g->socks_arena = AcquireArena(Gibi(64)); g->socks_arena = AcquireArena(Gibi(64));
//- Init timer //- Init timer
RunJob(P_W32_UpdateTimer); RunJob(P_W32_StartTimerSync, .pool = JobPool_Hyper, .flags = JobFlag_Dedicated);
} }
//////////////////////////////// ////////////////////////////////
@ -179,11 +179,11 @@ P_W32_Window *P_W32_AcquireWindow(void)
window->event_arenas[0] = AcquireArena(Gibi(64)); window->event_arenas[0] = AcquireArena(Gibi(64));
window->event_arenas[1] = AcquireArena(Gibi(64)); window->event_arenas[1] = AcquireArena(Gibi(64));
/* Start window event thread */ /* Start window event job */
/* NOTE: This thread must finish building for the window to actually be /* NOTE: This job must finish starting for the window to actually be
* created and receive a HWND, because on Windows the event proc must run on * created and receive a HWND, because on Windows the event proc must run on
* the same thread that created the window. */ * the same thread that created the window. */
window->window_thread = W32_StartThread(&P_W32_WindowThreadEntryFunc, window, Lit("Window thread"), PROF_THREAD_GROUP_WINDOW); RunJob(P_W32_StartWindowMsgProcessing, .pool = JobPool_Hyper, .flags = JobFlag_Dedicated, .sig.window = window);
YieldOnFence(&window->ready_fence, 1); YieldOnFence(&window->ready_fence, 1);
return window; return window;
@ -195,7 +195,7 @@ void P_W32_ReleaseWindow(P_W32_Window *window)
Atomic32Set(&window->shutdown, 1); Atomic32Set(&window->shutdown, 1);
P_W32_SharedState *g = &P_W32_shared_state; P_W32_SharedState *g = &P_W32_shared_state;
P_W32_WakeWindow(window); P_W32_WakeWindow(window);
W32_WaitEndThread(window->window_thread); YieldOnFence(&window->finished_fence, 1);
Lock lock = LockE(&g->windows_mutex); Lock lock = LockE(&g->windows_mutex);
{ {
@ -389,11 +389,11 @@ void P_W32_UpdateWindowFromSettings(P_W32_Window *window, P_WindowSettings *sett
} }
//////////////////////////////// ////////////////////////////////
//~ Win32 window thread //~ Win32 window message processing
W32_ThreadDef(P_W32_WindowThreadEntryFunc, arg) JobDef(P_W32_StartWindowMsgProcessing, sig, id)
{ {
P_W32_Window *window = (P_W32_Window *)arg; P_W32_Window *window = sig->window;
/* Win32 limitation: Window must be initialized on same thread that processes events */ /* Win32 limitation: Window must be initialized on same thread that processes events */
window->hwnd = P_W32_InitWindow(window); window->hwnd = P_W32_InitWindow(window);
@ -419,6 +419,7 @@ W32_ThreadDef(P_W32_WindowThreadEntryFunc, arg)
/* Destroy window hwnd */ /* Destroy window hwnd */
DestroyWindow(window->hwnd); DestroyWindow(window->hwnd);
SetFence(&window->finished_fence, 1);
} }
void P_W32_ProcessWindowEvent(P_W32_Window *window, P_WindowEvent event) void P_W32_ProcessWindowEvent(P_W32_Window *window, P_WindowEvent event)
@ -867,7 +868,7 @@ P_Address P_W32_PlatformAddressFromWin32Address(P_W32_Address ws_addr)
//////////////////////////////// ////////////////////////////////
//~ Timer job //~ Timer job
JobDef(P_W32_UpdateTimer, _, __) JobDef(P_W32_StartTimerSync, _, __)
{ {
P_W32_SharedState *g = &P_W32_shared_state; P_W32_SharedState *g = &P_W32_shared_state;
SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL); SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL);

View File

@ -42,6 +42,7 @@ Struct(P_W32_Window)
HWND hwnd; HWND hwnd;
Fence ready_fence; Fence ready_fence;
Fence finished_fence;
u16 utf16_high_surrogate_last_input; u16 utf16_high_surrogate_last_input;
@ -67,8 +68,6 @@ Struct(P_W32_Window)
i32 current_event_arena_index; i32 current_event_arena_index;
Arena *event_arenas[2]; Arena *event_arenas[2];
W32_Thread *window_thread;
Atomic32 shutdown; Atomic32 shutdown;
P_W32_Window *next_free; P_W32_Window *next_free;
}; };
@ -160,12 +159,16 @@ P_W32_Window *P_W32_AcquireWindow(void);
void P_W32_ReleaseWindow(P_W32_Window *window); void P_W32_ReleaseWindow(P_W32_Window *window);
HWND P_W32_InitWindow(P_W32_Window *window); HWND P_W32_InitWindow(P_W32_Window *window);
//- Window settings ////////////////////////////////
//~ Window settings
void P_W32_UpdateWindowFromSystem(P_W32_Window *window); void P_W32_UpdateWindowFromSystem(P_W32_Window *window);
void P_W32_UpdateWindowFromSettings(P_W32_Window *window, P_WindowSettings *settings); void P_W32_UpdateWindowFromSettings(P_W32_Window *window, P_WindowSettings *settings);
//- Window thread ////////////////////////////////
W32_ThreadDef(P_W32_WindowThreadEntryFunc, arg); //~ Window message processing
JobDecl(P_W32_StartWindowMsgProcessing, { P_W32_Window *window; });
void P_W32_ProcessWindowEvent(P_W32_Window *window, P_WindowEvent event); void P_W32_ProcessWindowEvent(P_W32_Window *window, P_WindowEvent event);
void P_W32_WakeWindow(P_W32_Window *window); void P_W32_WakeWindow(P_W32_Window *window);
LRESULT CALLBACK P_W32_Win32WindowProc(HWND hwnd, UINT msg, WPARAM wparam, LPARAM lparam); LRESULT CALLBACK P_W32_Win32WindowProc(HWND hwnd, UINT msg, WPARAM wparam, LPARAM lparam);
@ -180,4 +183,4 @@ P_Address P_W32_PlatformAddressFromWin32Address(P_W32_Address ws_addr);
//////////////////////////////// ////////////////////////////////
//~ Timer job //~ Timer job
JobDecl(P_W32_UpdateTimer, EmptySig); JobDecl(P_W32_StartTimerSync, EmptySig);

View File

@ -406,13 +406,12 @@ GPU_Resource *AcquireGbuffer(GPU_Format format, Vec2I32 size)
GPU_Resource *AcquireUploadBuffer(u32 element_count, u32 element_size, void *src) GPU_Resource *AcquireUploadBuffer(u32 element_count, u32 element_size, void *src)
{ {
__prof; __prof;
u64 size = element_size * element_count;
GPU_ResourceDesc desc = ZI; GPU_ResourceDesc desc = ZI;
desc.kind = GPU_ResourceKind_Buffer; desc.kind = GPU_ResourceKind_Buffer;
desc.flags = GPU_ResourceFlag_None; desc.flags = GPU_ResourceFlag_None;
desc.buffer.heap_kind = GPU_HeapKind_Upload; desc.buffer.heap_kind = GPU_HeapKind_Upload;
desc.buffer.element_size = size;
desc.buffer.element_count = element_count; desc.buffer.element_count = element_count;
desc.buffer.element_capacity = element_count;
desc.buffer.element_size = element_size; desc.buffer.element_size = element_size;
GPU_Resource *r = GPU_AcquireResource(desc); GPU_Resource *r = GPU_AcquireResource(desc);
{ {
@ -2151,14 +2150,20 @@ void UpdateUser(P_Window *window)
{ {
__profn("Render"); __profn("Render");
GPU_QueueKind gpu_render_queue = GPU_QueueKind_Direct;
Rect ui_viewport = RectFromVec2(VEC2(0, 0), VEC2(g->ui_size.x, g->ui_size.y)); Rect ui_viewport = RectFromVec2(VEC2(0, 0), VEC2(g->ui_size.x, g->ui_size.y));
Rect render_viewport = RectFromVec2(VEC2(0, 0), VEC2(g->render_size.x, g->render_size.y)); Rect render_viewport = RectFromVec2(VEC2(0, 0), VEC2(g->render_size.x, g->render_size.y));
if (!g->gpu_render_fence)
{
g->gpu_render_fence = GPU_FenceFromQueue(gpu_render_queue);
}
/* Acquire gbuffers */ /* Acquire gbuffers */
if (g->shade_target && !EqVec2I32(g->render_size, GPU_GetTextureSize(g->shade_target))) if (g->shade_target && !EqVec2I32(g->render_size, GPU_GetTextureSize(g->shade_target)))
{ {
__profn("Release render resources"); __profn("Release render resources");
YieldOnFence(&g->gpu_render_fence, g->gpu_render_fence_target); YieldOnFence(g->gpu_render_fence, g->gpu_render_fence_target);
GPU_ReleaseResource(g->albedo, GPU_ReleaseFlag_None); GPU_ReleaseResource(g->albedo, GPU_ReleaseFlag_None);
GPU_ReleaseResource(g->emittance, GPU_ReleaseFlag_None); GPU_ReleaseResource(g->emittance, GPU_ReleaseFlag_None);
GPU_ReleaseResource(g->emittance_flood_read, GPU_ReleaseFlag_None); GPU_ReleaseResource(g->emittance_flood_read, GPU_ReleaseFlag_None);
@ -2181,7 +2186,7 @@ void UpdateUser(P_Window *window)
/* Acquire ui buffers */ /* Acquire ui buffers */
if (g->ui_target && !EqVec2I32(g->ui_size, GPU_GetTextureSize(g->ui_target))) if (g->ui_target && !EqVec2I32(g->ui_size, GPU_GetTextureSize(g->ui_target)))
{ {
YieldOnFence(&g->gpu_render_fence, g->gpu_render_fence_target); YieldOnFence(g->gpu_render_fence, g->gpu_render_fence_target);
GPU_ReleaseResource(g->ui_target, GPU_ReleaseFlag_None); GPU_ReleaseResource(g->ui_target, GPU_ReleaseFlag_None);
g->ui_target = 0; g->ui_target = 0;
} }
@ -2200,7 +2205,7 @@ void UpdateUser(P_Window *window)
GPU_Resource *ui_shape_indices_buffer = AcquireUploadBufferFromArena(g->ui_shape_indices_count, g->ui_shape_indices_arena); GPU_Resource *ui_shape_indices_buffer = AcquireUploadBufferFromArena(g->ui_shape_indices_count, g->ui_shape_indices_arena);
GPU_Resource *grids_buffer = AcquireUploadBufferFromArena(g->grids_count, g->grids_arena); GPU_Resource *grids_buffer = AcquireUploadBufferFromArena(g->grids_count, g->grids_arena);
GPU_CommandList *cl = GPU_BeginCommandList(); GPU_CommandList *cl = GPU_BeginCommandList(gpu_render_queue);
{ {
__profn("Run render"); __profn("Run render");
GPU_ProfN(cl, Lit("Run render")); GPU_ProfN(cl, Lit("Run render"));
@ -2427,7 +2432,7 @@ void UpdateUser(P_Window *window)
GPU_RasterizeMode_TriangleList); GPU_RasterizeMode_TriangleList);
} }
} }
g->gpu_render_fence_target += GPU_EndCommandList(cl, &g->gpu_render_fence); g->gpu_render_fence_target = GPU_EndCommandList(cl);
/* Release transfer buffers */ /* Release transfer buffers */
{ {
@ -2444,7 +2449,7 @@ void UpdateUser(P_Window *window)
{ {
DelayReleaseGpuResources_Sig *sig = PushStruct(job->arena, DelayReleaseGpuResources_Sig); DelayReleaseGpuResources_Sig *sig = PushStruct(job->arena, DelayReleaseGpuResources_Sig);
job->count = countof(release_resources); job->count = countof(release_resources);
sig->begin_fence = &g->gpu_render_fence; sig->begin_fence = g->gpu_render_fence;
sig->begin_fence_target = g->gpu_render_fence_target; sig->begin_fence_target = g->gpu_render_fence_target;
sig->resources = PushStructsNoZero(job->arena, GPU_Resource *, job->count); sig->resources = PushStructsNoZero(job->arena, GPU_Resource *, job->count);
sig->flags = GPU_ReleaseFlag_Reuse; sig->flags = GPU_ReleaseFlag_Reuse;

View File

@ -195,7 +195,7 @@ Struct(SharedUserState)
u32 ui_shape_indices_count; u32 ui_shape_indices_count;
u32 grids_count; u32 grids_count;
Fence gpu_render_fence; Fence *gpu_render_fence;
u64 gpu_render_fence_target; u64 gpu_render_fence_target;
//- Bind state //- Bind state