gpu queue synchronization

This commit is contained in:
jacob 2025-12-10 14:06:27 -06:00
parent 3da749ef51
commit bc17e94758
4 changed files with 204 additions and 137 deletions

View File

@ -21,10 +21,6 @@ void G_BootstrapCommon(void)
g->quad_indices = G_IdxBuff16(quad_indices);
}
/* TODO: Init debug print queues */
{
}
/* Init point sampler */
{
G_ResourceHandle pt_sampler = G_PushSampler(gpu_perm, (G_SamplerResourceDesc) { .filter = G_Filter_MinMagMipPoint });
@ -55,7 +51,8 @@ void G_BootstrapCommon(void)
}
G_CommitCommandList(cl);
G_SyncOtherQueues(G_QueueKind_Direct);
/* Barrier all queues until direct queue finishes initializing resources */
G_Sync(G_QueueMask_Direct, G_QueueMask_All);
}
////////////////////////////////////////////////////////////

View File

@ -18,17 +18,43 @@ Struct(G_SwapchainHandle) { u64 v; };
Enum(G_QueueKind)
{
#if G_IsMultiQueueEnabled
G_QueueKind_Direct = 0,
#if G_IsMultiQueueEnabled
G_QueueKind_AsyncCompute = 1,
G_QueueKind_AsyncCopy = 2,
G_NumQueues = 3
#else
G_QueueKind_Direct = 0,
G_QueueKind_AsyncCompute = 0,
G_QueueKind_AsyncCopy = 0,
G_NumQueues = 1
G_QueueKind_AsyncCompute = G_QueueKind_Direct,
G_QueueKind_AsyncCopy = G_QueueKind_Direct,
#endif
G_NumQueues
};
/*
 * Bit flags selecting a set of queues.
 * With multi-queue enabled each queue owns one bit; otherwise the async
 * compute/copy flags alias the direct queue's bit.
 */
Enum(G_QueueMask)
{
G_QueueMask_None = 0,
G_QueueMask_Direct = (1 << 0),
#if G_IsMultiQueueEnabled
G_QueueMask_AsyncCompute = (1 << 1),
G_QueueMask_AsyncCopy = (1 << 2),
#else
/* Single-queue build: async work is selected through the direct queue's bit */
G_QueueMask_AsyncCompute = G_QueueMask_Direct,
G_QueueMask_AsyncCopy = G_QueueMask_Direct,
#endif
/* Lowest `G_NumQueues` bits set */
G_QueueMask_All = (0xFFFFFFFF >> (32 - G_NumQueues))
};
/* Converts a queue kind into its single-bit queue mask. The argument is parenthesized so compound expressions shift as a whole. */
#define G_MaskFromQueue(queue_kind) (1 << (queue_kind))
/*
 * One completion value per queue.
 * G_SyncEx only considers an entry when it is greater than zero.
 */
Struct(G_QueueCompletions)
{
i64 v[G_NumQueues]; /* Array of completions indexed by queue kind */
};
/*
 * Describes a queue barrier consumed by G_SyncEx.
 * All waiters (the queues in `wait_queues` and, optionally, the cpu) will wait
 * until the specified queues reach their value in the `completions` array.
 */
Struct(G_QueueBarrierDesc)
{
G_QueueCompletions completions; /* Completions that waiters should wait for */
G_QueueMask wait_queues; /* Mask of queues that will wait for completions */
b32 wait_cpu; /* If set, the calling cpu thread also waits for the completions */
};
////////////////////////////////////////////////////////////
@ -163,7 +189,7 @@ Enum(G_Format)
};
////////////////////////////////////////////////////////////
//~ Barrier types
//~ Memory sync types
Enum(G_Stage)
{
@ -278,7 +304,7 @@ Enum(G_Layout)
* - Necessary resource flushes will occur based on `access_prev` & `access_next`
* - Texture layout will transition based on `layout` (if specified)
*/
Struct(G_BarrierDesc)
Struct(G_MemoryBarrierDesc)
{
G_ResourceHandle resource;
b32 is_global;
@ -459,25 +485,6 @@ Struct(G_IndexBufferDesc)
u32 index_count;
};
////////////////////////////////////////////////////////////
//~ Synchronization types
Enum(G_FenceOpKind)
{
G_FenceOpKind_Set,
G_FenceOpKind_Add,
};
Struct(G_FenceOp)
{
G_FenceOpKind kind;
Fence *fence;
i64 v;
};
#define G_SetFence(_fence, _v) ((G_FenceOp) { .kind = G_FenceOpKind_Set, .fence = (_fence), .v = (_v) })
#define G_AddFence(_fence, _v) ((G_FenceOp) { .kind = G_FenceOpKind_Add, .fence = (_fence), .v = (_v) })
////////////////////////////////////////////////////////////
//~ Statistic types
@ -641,9 +648,7 @@ u32 G_PushRef(G_ArenaHandle arena, G_ResourceHandle resource, G_RefDesc desc);
//- Command list
G_CommandListHandle G_PrepareCommandList(G_QueueKind queue);
void G_CommitCommandListEx(G_CommandListHandle cl, u64 fence_ops_count, G_FenceOp *fence_ops);
#define G_CommitCommandList(cl) G_CommitCommandListEx((cl), 0, 0)
i64 G_CommitCommandList(G_CommandListHandle cl);
//- Arena
@ -671,12 +676,12 @@ void G_SetConstant_(G_CommandListHandle cl, i32 slot, void *src_32bit, u32 size)
G_SetConstant_((cl), (name), &__src, sizeof(__src)); \
} while (0)
//- Barrier
//- Memory sync
void G_Sync(G_CommandListHandle cl, G_BarrierDesc desc);
void G_MemorySyncEx(G_CommandListHandle cl, G_MemoryBarrierDesc desc);
#define G_MemorySync(_cl, _resource, _sync_prev, _access_prev, _sync_next, _access_next) \
G_Sync((_cl), (G_BarrierDesc) { \
G_MemorySyncEx((_cl), (G_MemoryBarrierDesc) { \
.resource = (_resource), \
.sync_prev = _sync_prev, \
.access_prev = _access_prev, \
@ -685,7 +690,7 @@ void G_Sync(G_CommandListHandle cl, G_BarrierDesc desc);
})
#define G_MemoryLayoutSync(_cl, _resource, _sync_prev, _access_prev, _sync_next, _access_next, _layout) \
G_Sync((_cl), (G_BarrierDesc) { \
G_MemorySyncEx((_cl), (G_MemoryBarrierDesc) { \
.resource = (_resource), \
.sync_prev = _sync_prev, \
.access_prev = _access_prev, \
@ -695,7 +700,7 @@ void G_Sync(G_CommandListHandle cl, G_BarrierDesc desc);
})
#define G_GlobalMemorySync(_cl, _sync_prev, _access_prev, _sync_next, _access_next) \
G_Sync((_cl), (G_BarrierDesc) { \
G_MemorySync((_cl), (G_MemoryBarrierDesc) { \
.is_global = 1, \
.sync_prev = _sync_prev, \
.access_prev = _access_prev, \
@ -730,13 +735,23 @@ void G_Rasterize(G_CommandListHandle cl,
void G_ClearRenderTarget(G_CommandListHandle cl, G_ResourceHandle render_target, Vec4 color);
////////////////////////////////////////////////////////////
//~ @hookdecl Queue synchronization
//~ @hookdecl Synchronization
/* `waiter_queue` will block until `completion_queue` completes all submitted commands */
void G_SyncQueue(G_QueueKind completion_queue, G_QueueKind waiter_queue);
i64 G_CompletionValueFromQueue(G_QueueKind queue_kind);
i64 G_CompletionTargetFromQueue(G_QueueKind queue_kind);
G_QueueCompletions G_CompletionValuesFromQueues(G_QueueMask queue_mask);
G_QueueCompletions G_CompletionTargetsFromQueues(G_QueueMask queue_mask);
/* All queues will block until `completion_queue` completes all submitted commands */
void G_SyncOtherQueues(G_QueueKind completion_queue);
void G_SyncEx(G_QueueBarrierDesc desc);
/*
 * Builds a G_QueueBarrierDesc whose `completions` are the current completion
 * targets of every queue in `completion_mask` and passes it to G_SyncEx.
 * Additional arguments are appended to the initializer list, e.g.
 * `.wait_queues = mask` or `.wait_cpu = 1`.
 */
#define G_Sync(completion_mask, ...) \
  G_SyncEx((G_QueueBarrierDesc) { \
    .completions = G_CompletionTargetsFromQueues(completion_mask), \
    __VA_ARGS__ \
  })

/* Queues in `wait_mask` wait until the queues in `completion_mask` reach their current targets */
#define G_SyncGpu(completion_mask, wait_mask) G_Sync((completion_mask), .wait_queues = (wait_mask))

/*
 * The calling cpu thread waits until the queues in `completion_mask` reach their current targets.
 * Deliberately has no trailing semicolon so the macro expands to a single
 * expression and composes with `if`/`else` like a regular call.
 */
#define G_SyncCpu(completion_mask) G_Sync((completion_mask), .wait_cpu = 1)
////////////////////////////////////////////////////////////
//~ @hookdecl Statistics

View File

@ -1,4 +1,5 @@
G_D12_SharedState G_D12_shared_state = ZI;
ThreadLocal G_D12_ThreadLocalState G_D12_tl = ZI;
////////////////////////////////////////////////////////////
//~ @hookimpl Bootstrap
@ -293,21 +294,20 @@ void G_Bootstrap(void)
/* Create debug print buffers */
if (GPU_SHADER_PRINT)
{
u64 print_buffer_size = Mebi(64);
for (G_QueueKind kind = 0; kind < G_NumQueues; ++kind)
{
G_D12_Queue *queue = G_D12_QueueFromKind(kind);
if (kind != G_QueueKind_AsyncCopy)
{
/* TODO: Don't create this in host memory. Just double buffer & do an async copy. */
G_ArenaHandle gpu_perm = G_PermArena();
queue->debug_print_buffer = G_PushBuffer(
queue->print_buffer_size = Mebi(64);
queue->print_buffer = G_PushBuffer(
gpu_perm,
u8,
print_buffer_size,
.flags = G_ResourceFlag_AllowShaderReadWrite | G_ResourceFlag_HostMemory
queue->print_buffer_size,
.flags = G_ResourceFlag_AllowShaderReadWrite
);
queue->debug_print_buffer_ref = G_PushRWByteAddressBufferRef(gpu_perm, queue->debug_print_buffer);
queue->print_buffer_ref = G_PushRWByteAddressBufferRef(gpu_perm, queue->print_buffer);
}
}
}
@ -692,7 +692,7 @@ G_D12_RawCommandList *G_D12_PrepareRawCommandList(G_QueueKind queue_kind)
return cl;
}
void G_D12_CommitRawCommandList(G_D12_RawCommandList *cl)
i64 G_D12_CommitRawCommandList(G_D12_RawCommandList *cl)
{
G_D12_Queue *queue = cl->queue;
@ -707,21 +707,23 @@ void G_D12_CommitRawCommandList(G_D12_RawCommandList *cl)
}
/* Commit */
i64 completion = 0;
{
Lock lock = LockE(&queue->commit_mutex);
{
u64 target = ++queue->commit_fence_target;
cl->commit_fence_target = target;
completion = ++queue->commit_fence_target;
cl->commit_fence_target = completion;
/* Execute */
ID3D12CommandQueue_ExecuteCommandLists(queue->d3d_queue, 1, (ID3D12CommandList **)&cl->d3d_cl);
ID3D12CommandQueue_Signal(queue->d3d_queue, queue->commit_fence, target);
ID3D12CommandQueue_Signal(queue->d3d_queue, queue->commit_fence, completion);
/* Append */
SllQueuePush(queue->first_committed_cl, queue->last_committed_cl, cl);
}
Unlock(&lock);
}
return completion;
}
////////////////////////////////////////////////////////////
@ -1596,7 +1598,7 @@ G_CommandListHandle G_PrepareCommandList(G_QueueKind queue)
return G_D12_MakeHandle(G_CommandListHandle, cl);
}
void G_CommitCommandListEx(G_CommandListHandle cl_handle, u64 fence_ops_count, G_FenceOp *fence_ops)
i64 G_CommitCommandList(G_CommandListHandle cl_handle)
{
G_D12_SharedState *g = &G_D12_shared_state;
G_D12_CmdList *cl = G_D12_CmdListFromHandle(cl_handle);
@ -1618,13 +1620,13 @@ void G_CommitCommandListEx(G_CommandListHandle cl_handle, u64 fence_ops_count, G
u64 slotted_constants[G_NumConstants];
u64 bound_compute_constants[G_NumConstants];
u64 bound_graphics_constants[G_NumConstants];
for (i32 i = 0; i < countof(slotted_constants); ++i) { slotted_constants[i] = 0; } /* Zero initialze all constant slots */
for (i32 i = 0; i < countof(slotted_constants); ++i) { slotted_constants[i] = 0; } /* Zero-initialize all slots */
for (i32 i = 0; i < countof(bound_compute_constants); ++i) { bound_compute_constants[i] = U64Max; }
for (i32 i = 0; i < countof(bound_graphics_constants); ++i) { bound_graphics_constants[i] = U64Max; }
if (!G_IsRefNil(queue->debug_print_buffer_ref))
if (!G_IsRefNil(queue->print_buffer_ref))
{
slotted_constants[G_ShaderConst_DebugBufferRef] = queue->debug_print_buffer_ref.v;
slotted_constants[G_ShaderConst_DebugBufferRef] = queue->print_buffer_ref.v;
}
/* Rasterizer state */
@ -1759,7 +1761,7 @@ void G_CommitCommandListEx(G_CommandListHandle cl_handle, u64 fence_ops_count, G
G_D12_Cmd *barrier_cmd = &cmds[barrier_cmd_idx];
if (barrier_cmd->kind == G_D12_CmdKind_Barrier)
{
G_BarrierDesc desc = barrier_cmd->barrier.desc;
G_MemoryBarrierDesc desc = barrier_cmd->barrier.desc;
G_D12_Resource *resource = G_D12_ResourceFromHandle(desc.resource);
D3D12_BARRIER_TYPE barrier_type = resource->is_texture ? D3D12_BARRIER_TYPE_TEXTURE : D3D12_BARRIER_TYPE_BUFFER;
@ -2196,7 +2198,7 @@ void G_CommitCommandListEx(G_CommandListHandle cl_handle, u64 fence_ops_count, G
}
/* End dx12 command list */
G_D12_CommitRawCommandList(rcl);
i64 completion = G_D12_CommitRawCommandList(rcl);
/* Free command list */
{
@ -2209,6 +2211,7 @@ void G_CommitCommandListEx(G_CommandListHandle cl_handle, u64 fence_ops_count, G
}
EndScratch(scratch);
return completion;
}
//- Arena
@ -2406,9 +2409,9 @@ void G_SetConstant_(G_CommandListHandle cl_handle, i32 slot, void *src_32bit, u3
CopyBytes(&cmd->constant.value, src_32bit, MinU32(size, 4));
}
//- Barrier
//- Memory sync
void G_Sync(G_CommandListHandle cl_handle, G_BarrierDesc desc)
void G_MemorySyncEx(G_CommandListHandle cl_handle, G_MemoryBarrierDesc desc)
{
G_D12_CmdList *cl = G_D12_CmdListFromHandle(cl_handle);
G_D12_Cmd *cmd = G_D12_PushCmd(cl);
@ -2464,78 +2467,114 @@ void G_ClearRenderTarget(G_CommandListHandle cl_handle, G_ResourceHandle resourc
}
////////////////////////////////////////////////////////////
//~ @hookimpl Queue synchronization
//~ @hookimpl Synchronization
void G_SyncQueue(G_QueueKind completion_queue_kind, G_QueueKind waiter_queue_kind)
i64 G_CompletionValueFromQueue(G_QueueKind queue_kind)
{
if (completion_queue_kind != waiter_queue_kind)
{
G_D12_Queue *completion_queue = G_D12_QueueFromKind(completion_queue_kind);
G_D12_Queue *waiter_queue = G_D12_QueueFromKind(waiter_queue_kind);
ID3D12Fence *d3d_fence = completion_queue->commit_fence;
u64 fence_target = 0;
{
Lock lock = LockS(&completion_queue->commit_mutex);
fence_target = completion_queue->commit_fence_target;
Unlock(&lock);
}
if (ID3D12Fence_GetCompletedValue(d3d_fence) < fence_target)
{
ID3D12CommandQueue_Wait(waiter_queue->d3d_queue, d3d_fence, fence_target);
}
}
G_D12_Queue *queue = G_D12_QueueFromKind(queue_kind);
return ID3D12Fence_GetCompletedValue(queue->commit_fence);
}
void G_SyncOtherQueues(G_QueueKind completion_queue_kind)
i64 G_CompletionTargetFromQueue(G_QueueKind queue_kind)
{
if (G_IsMultiQueueEnabled)
G_D12_Queue *queue = G_D12_QueueFromKind(queue_kind);
i64 target = 0;
{
Lock lock = LockS(&queue->commit_mutex);
target = queue->commit_fence_target;
Unlock(&lock);
}
return target;
}
/*
 * Returns the currently completed value of every queue selected in
 * `queue_mask`, indexed by queue kind. Entries for unselected queues keep
 * their `ZI` initial value.
 */
G_QueueCompletions G_CompletionValuesFromQueues(G_QueueMask queue_mask)
{
  G_QueueCompletions completions = ZI;
  for (G_QueueKind queue_kind = 0; queue_kind < G_NumQueues; ++queue_kind)
  {
    if (queue_mask & (1 << queue_kind))
    {
      /* Query the completed value here, not the submit target (that is G_CompletionTargetsFromQueues) */
      completions.v[queue_kind] = G_CompletionValueFromQueue(queue_kind);
    }
  }
  return completions;
}
/*
 * Collects the completion target of each queue selected in `queue_mask`
 * into a table indexed by queue kind. Unselected slots keep their `ZI`
 * initial value.
 */
G_QueueCompletions G_CompletionTargetsFromQueues(G_QueueMask queue_mask)
{
  G_QueueCompletions targets = ZI;
  for (G_QueueKind kind = 0; kind < G_NumQueues; ++kind)
  {
    /* Skip queues that are not part of the mask */
    if ((queue_mask & (1 << kind)) == 0)
    {
      continue;
    }
    targets.v[kind] = G_CompletionTargetFromQueue(kind);
  }
  return targets;
}
void G_SyncEx(G_QueueBarrierDesc desc)
{
G_D12_SharedState *g = &G_D12_shared_state;
u64 fences_count = 0;
ID3D12Fence *fences[G_NumQueues] = ZI;
i64 fence_targets[G_NumQueues] = ZI;
/* Grab fences */
for (G_QueueKind completion_queue_kind = 0; completion_queue_kind < G_NumQueues; ++ completion_queue_kind)
{
G_D12_Queue *completion_queue = G_D12_QueueFromKind(completion_queue_kind);
ID3D12Fence *d3d_fence = completion_queue->commit_fence;
u64 fence_target = 0;
i64 target = desc.completions.v[completion_queue_kind];
if (target > 0)
{
Lock lock = LockS(&completion_queue->commit_mutex);
fence_target = completion_queue->commit_fence_target;
Unlock(&lock);
}
if (ID3D12Fence_GetCompletedValue(d3d_fence) < fence_target)
{
for (G_QueueKind waiter_queue_kind = 0; waiter_queue_kind < G_NumQueues; ++waiter_queue_kind)
i64 fence_value = ID3D12Fence_GetCompletedValue(completion_queue->commit_fence);
if (fence_value < target)
{
if (waiter_queue_kind != completion_queue_kind)
fences[fences_count] = completion_queue->commit_fence;
fence_targets[fences_count] = target;
fences_count += 1;
}
}
}
/* Sync Queues */
for (G_QueueKind waiter_queue_kind = 0; waiter_queue_kind < G_NumQueues; ++ waiter_queue_kind)
{
if (desc.wait_queues & (1 << waiter_queue_kind))
{
G_D12_Queue *waiter_queue = G_D12_QueueFromKind(waiter_queue_kind);
for (u64 fence_idx = 0; fence_idx < fences_count; ++fence_idx)
{
ID3D12Fence *fence = fences[fence_idx];
if (waiter_queue->commit_fence != fence)
{
G_D12_Queue *waiter_queue = G_D12_QueueFromKind(waiter_queue_kind);
ID3D12CommandQueue_Wait(waiter_queue->d3d_queue, d3d_fence, fence_target);
i64 target = fence_targets[fence_idx];
ID3D12CommandQueue_Wait(waiter_queue->d3d_queue, fence, target);
}
}
}
}
/* Sync Cpu */
if (desc.wait_cpu && fences_count > 0)
{
if (G_D12_tl.sync_event == 0)
{
G_D12_tl.sync_event = CreateEvent(0, 0, 0, 0);
}
ID3D12Device1_SetEventOnMultipleFenceCompletion(
g->device,
fences,
(u64 *)fence_targets,
fences_count,
D3D12_MULTIPLE_FENCE_WAIT_FLAG_ALL,
G_D12_tl.sync_event
);
WaitForSingleObject(G_D12_tl.sync_event, INFINITE);
}
}
////////////////////////////////////////////////////////////
//~ @hookimpl Map
// G_Mapped G_Map(G_Resource *gpu_r)
// {
// G_Mapped result = ZI;
// result.resource = gpu_r;
// G_D12_Resource *r = (G_D12_Resource *)gpu_r;
// D3D12_RANGE read_range = ZI;
// HRESULT hr = ID3D12Resource_Map(r->d3d_resource, 0, &read_range, &result.mem);
// if (FAILED(hr) || !result.mem)
// {
// /* TODO: Don't panic */
// Panic(Lit("Failed to map command buffer resource"));
// }
// return result;
// }
// void G_Unmap(G_Mapped m)
// {
// G_D12_Resource *r = (G_D12_Resource *)m.resource;
// ID3D12Resource_Unmap(r->d3d_resource, 0, 0);
// }
////////////////////////////////////////////////////////////
//~ @hookimpl Statistics
@ -2786,26 +2825,36 @@ void G_D12_WorkerEntry(WaveLaneCtx *lane)
G_QueueKind queue_kind = (G_QueueKind)lane->wave->udata;
G_D12_Queue *queue = G_D12_QueueFromKind(queue_kind);
for (;;)
// if (queue->print_buffer_size > 0)
if (queue_kind == G_QueueKind_Direct)
{
/* FIXME: Remove this */
G_ArenaHandle gpu_perm = G_PermArena();
G_ResourceHandle readback_buff = G_PushBuffer(
gpu_perm,
u8,
queue->print_buffer_size,
.flags = G_ResourceFlag_HostMemory
);
Sleep(500);
G_ResourceHandle debug_print_buff = queue->debug_print_buffer;
G_D12_Resource *resource = G_D12_ResourceFromHandle(debug_print_buff);
if (!G_IsResourceNil(debug_print_buff))
for (;;)
{
u8 *base = G_StructFromResource(debug_print_buff, u8);
u32 size = *((u32 *)base);
String text = STRING(size, base + 4);
/* FIXME: Remove this */
if (queue_kind == G_QueueKind_Direct)
Sleep(500);
G_CommandListHandle cl = G_PrepareCommandList(queue_kind);
{
DEBUGBREAKABLE;
G_CopyBufferToBuffer(cl, readback_buff, 0, queue->print_buffer, RNGU64(0, queue->print_buffer_size));
}
}
i64 completion = G_CommitCommandList(cl);
G_SyncCpu(G_MaskFromQueue(queue_kind));
u32 size = *G_StructFromResource(readback_buff, u32);
u8 *text = G_StructFromResource(readback_buff, u8) + 4;
String s = STRING(size, text);
DEBUGBREAKABLE;
}
}
}

View File

@ -220,8 +220,9 @@ Struct(G_D12_Queue)
u64 commit_fence_target;
/* Global resources */
G_ResourceHandle debug_print_buffer;
G_RWByteAddressBufferRef debug_print_buffer_ref;
u64 print_buffer_size;
G_ResourceHandle print_buffer;
G_RWByteAddressBufferRef print_buffer_ref;
/* Raw command lists */
struct G_D12_RawCommandList *first_committed_cl;
@ -283,7 +284,7 @@ Struct(G_D12_Cmd)
struct
{
G_BarrierDesc desc;
G_MemoryBarrierDesc desc;
/* Post-batch data */
b32 is_end_of_batch;
@ -413,6 +414,11 @@ Struct(G_D12_SharedState)
ID3D12Device10 *device;
} extern G_D12_shared_state;
/* Per-thread state of the D3D12 backend */
Struct(G_D12_ThreadLocalState)
{
HANDLE sync_event; /* Created on first use by G_SyncEx; the thread blocks on it while waiting for fence completion */
} extern ThreadLocal G_D12_tl;
////////////////////////////////////////////////////////////
//~ Helpers
@ -453,7 +459,7 @@ G_D12_Descriptor *G_D12_PushDescriptor(G_D12_Arena *gpu_arena, G_D12_DescriptorH
//~ Raw command list
G_D12_RawCommandList *G_D12_PrepareRawCommandList(G_QueueKind queue_kind);
void G_D12_CommitRawCommandList(G_D12_RawCommandList *cl);
i64 G_D12_CommitRawCommandList(G_D12_RawCommandList *cl);
////////////////////////////////////////////////////////////
//~ Command helpers