diff --git a/src/gpu/gpu_common.c b/src/gpu/gpu_common.c index 26804625..a2b9cd35 100644 --- a/src/gpu/gpu_common.c +++ b/src/gpu/gpu_common.c @@ -21,10 +21,6 @@ void G_BootstrapCommon(void) g->quad_indices = G_IdxBuff16(quad_indices); } - /* TODO: Init debug print queues */ - { - } - /* Init point sampler */ { G_ResourceHandle pt_sampler = G_PushSampler(gpu_perm, (G_SamplerResourceDesc) { .filter = G_Filter_MinMagMipPoint }); @@ -55,7 +51,8 @@ void G_BootstrapCommon(void) } G_CommitCommandList(cl); - G_SyncOtherQueues(G_QueueKind_Direct); + /* Barrier all queues until direct queue finishes initializing resources */ + G_Sync(G_QueueMask_Direct, G_QueueMask_All); } //////////////////////////////////////////////////////////// diff --git a/src/gpu/gpu_core.h b/src/gpu/gpu_core.h index c419079d..dfa7edd9 100644 --- a/src/gpu/gpu_core.h +++ b/src/gpu/gpu_core.h @@ -18,17 +18,43 @@ Struct(G_SwapchainHandle) { u64 v; }; Enum(G_QueueKind) { -#if G_IsMultiQueueEnabled G_QueueKind_Direct = 0, +#if G_IsMultiQueueEnabled G_QueueKind_AsyncCompute = 1, G_QueueKind_AsyncCopy = 2, - G_NumQueues = 3 #else - G_QueueKind_Direct = 0, - G_QueueKind_AsyncCompute = 0, - G_QueueKind_AsyncCopy = 0, - G_NumQueues = 1 + G_QueueKind_AsyncCompute = G_QueueKind_Direct, + G_QueueKind_AsyncCopy = G_QueueKind_Direct, #endif + G_NumQueues +}; + +Enum(G_QueueMask) +{ + G_QueueMask_None = 0, + G_QueueMask_Direct = (1 << 0), +#if G_IsMultiQueueEnabled + G_QueueMask_AsyncCompute = (1 << 1), + G_QueueMask_AsyncCopy = (1 << 2), +#else + G_QueueMask_AsyncCompute = G_QueueMask_Direct, + G_QueueMask_AsyncCopy = G_QueueMask_Direct, +#endif + G_QueueMask_All = (0xFFFFFFFF >> (32 - G_NumQueues)) +}; +#define G_MaskFromQueue(queue_kind) (1 << queue_kind) + +Struct(G_QueueCompletions) +{ + i64 v[G_NumQueues]; /* Array of completions indexed by queue kind */ +}; + +/* All waiters will wait until specified queues reach their value in the `completions` array */ +Struct(G_QueueBarrierDesc) +{ + G_QueueCompletions 
completions; /* Completions that waiters should wait for */ + G_QueueMask wait_queues; /* Mask of queues that will wait for completions */ + b32 wait_cpu; /* Will the cpu wait for completion */ }; //////////////////////////////////////////////////////////// @@ -163,7 +189,7 @@ Enum(G_Format) }; //////////////////////////////////////////////////////////// -//~ Barrier types +//~ Memory sync types Enum(G_Stage) { @@ -278,7 +304,7 @@ Enum(G_Layout) * - Necessary resource flushes will occur based on `access_prev` & `access_next` * - Texture layout will transition based on `layout` (if specified) */ -Struct(G_BarrierDesc) +Struct(G_MemoryBarrierDesc) { G_ResourceHandle resource; b32 is_global; @@ -459,25 +485,6 @@ Struct(G_IndexBufferDesc) u32 index_count; }; -//////////////////////////////////////////////////////////// -//~ Synchronization types - -Enum(G_FenceOpKind) -{ - G_FenceOpKind_Set, - G_FenceOpKind_Add, -}; - -Struct(G_FenceOp) -{ - G_FenceOpKind kind; - Fence *fence; - i64 v; -}; - -#define G_SetFence(_fence, _v) ((G_FenceOp) { .kind = G_FenceOpKind_Set, .fence = (_fence), .v = (_v) }) -#define G_AddFence(_fence, _v) ((G_FenceOp) { .kind = G_FenceOpKind_Add, .fence = (_fence), .v = (_v) }) - //////////////////////////////////////////////////////////// //~ Statistic types @@ -641,9 +648,7 @@ u32 G_PushRef(G_ArenaHandle arena, G_ResourceHandle resource, G_RefDesc desc); //- Command list G_CommandListHandle G_PrepareCommandList(G_QueueKind queue); -void G_CommitCommandListEx(G_CommandListHandle cl, u64 fence_ops_count, G_FenceOp *fence_ops); - -#define G_CommitCommandList(cl) G_CommitCommandListEx((cl), 0, 0) +i64 G_CommitCommandList(G_CommandListHandle cl); //- Arena @@ -671,12 +676,12 @@ void G_SetConstant_(G_CommandListHandle cl, i32 slot, void *src_32bit, u32 size) G_SetConstant_((cl), (name), &__src, sizeof(__src)); \ } while (0) -//- Barrier +//- Memory sync -void G_Sync(G_CommandListHandle cl, G_BarrierDesc desc); +void G_MemorySyncEx(G_CommandListHandle 
cl, G_MemoryBarrierDesc desc); #define G_MemorySync(_cl, _resource, _sync_prev, _access_prev, _sync_next, _access_next) \ - G_Sync((_cl), (G_BarrierDesc) { \ + G_MemorySyncEx((_cl), (G_MemoryBarrierDesc) { \ .resource = (_resource), \ .sync_prev = _sync_prev, \ .access_prev = _access_prev, \ @@ -685,7 +690,7 @@ void G_Sync(G_CommandListHandle cl, G_BarrierDesc desc); }) #define G_MemoryLayoutSync(_cl, _resource, _sync_prev, _access_prev, _sync_next, _access_next, _layout) \ - G_Sync((_cl), (G_BarrierDesc) { \ + G_MemorySyncEx((_cl), (G_MemoryBarrierDesc) { \ .resource = (_resource), \ .sync_prev = _sync_prev, \ .access_prev = _access_prev, \ @@ -695,7 +700,7 @@ void G_Sync(G_CommandListHandle cl, G_BarrierDesc desc); }) #define G_GlobalMemorySync(_cl, _sync_prev, _access_prev, _sync_next, _access_next) \ - G_Sync((_cl), (G_BarrierDesc) { \ + G_MemorySyncEx((_cl), (G_MemoryBarrierDesc) { \ .is_global = 1, \ .sync_prev = _sync_prev, \ .access_prev = _access_prev, \ @@ -730,13 +735,23 @@ void G_Rasterize(G_CommandListHandle cl, void G_ClearRenderTarget(G_CommandListHandle cl, G_ResourceHandle render_target, Vec4 color); //////////////////////////////////////////////////////////// -//~ @hookdecl Queue synchronization +//~ @hookdecl Synchronization -/* `waiter_queue` will block until `completion_queue` completes all submitted commands */ -void G_SyncQueue(G_QueueKind completion_queue, G_QueueKind waiter_queue); +i64 G_CompletionValueFromQueue(G_QueueKind queue_kind); +i64 G_CompletionTargetFromQueue(G_QueueKind queue_kind); +G_QueueCompletions G_CompletionValuesFromQueues(G_QueueMask queue_mask); +G_QueueCompletions G_CompletionTargetsFromQueues(G_QueueMask queue_mask); -/* All queues will block until `completion_queue` completes all submitted commands */ -void G_SyncOtherQueues(G_QueueKind completion_queue); +void G_SyncEx(G_QueueBarrierDesc desc); + +#define G_Sync(completion_mask, ...)
\ + G_SyncEx((G_QueueBarrierDesc) { \ + .completions = G_CompletionTargetsFromQueues(completion_mask), \ + __VA_ARGS__ \ + }) + +#define G_SyncGpu(completion_mask, wait_mask) G_Sync((completion_mask), .wait_queues = (wait_mask)) +#define G_SyncCpu(completion_mask) G_Sync((completion_mask), .wait_cpu = 1) //////////////////////////////////////////////////////////// //~ @hookdecl Statistics diff --git a/src/gpu/gpu_dx12/gpu_dx12_core.c b/src/gpu/gpu_dx12/gpu_dx12_core.c index aed0842e..589d7791 100644 --- a/src/gpu/gpu_dx12/gpu_dx12_core.c +++ b/src/gpu/gpu_dx12/gpu_dx12_core.c @@ -1,4 +1,5 @@ G_D12_SharedState G_D12_shared_state = ZI; +ThreadLocal G_D12_ThreadLocalState G_D12_tl = ZI; //////////////////////////////////////////////////////////// //~ @hookimpl Bootstrap @@ -293,21 +294,20 @@ void G_Bootstrap(void) /* Create debug print buffers */ if (GPU_SHADER_PRINT) { - u64 print_buffer_size = Mebi(64); for (G_QueueKind kind = 0; kind < G_NumQueues; ++kind) { G_D12_Queue *queue = G_D12_QueueFromKind(kind); if (kind != G_QueueKind_AsyncCopy) { - /* TODO: Don't create this in host memory. Just double buffer & do an async copy.
*/ G_ArenaHandle gpu_perm = G_PermArena(); - queue->debug_print_buffer = G_PushBuffer( + queue->print_buffer_size = Mebi(64); + queue->print_buffer = G_PushBuffer( gpu_perm, u8, - print_buffer_size, - .flags = G_ResourceFlag_AllowShaderReadWrite | G_ResourceFlag_HostMemory + queue->print_buffer_size, + .flags = G_ResourceFlag_AllowShaderReadWrite ); - queue->debug_print_buffer_ref = G_PushRWByteAddressBufferRef(gpu_perm, queue->debug_print_buffer); + queue->print_buffer_ref = G_PushRWByteAddressBufferRef(gpu_perm, queue->print_buffer); } } } @@ -692,7 +692,7 @@ G_D12_RawCommandList *G_D12_PrepareRawCommandList(G_QueueKind queue_kind) return cl; } -void G_D12_CommitRawCommandList(G_D12_RawCommandList *cl) +i64 G_D12_CommitRawCommandList(G_D12_RawCommandList *cl) { G_D12_Queue *queue = cl->queue; @@ -707,21 +707,23 @@ void G_D12_CommitRawCommandList(G_D12_RawCommandList *cl) } /* Commit */ + i64 completion = 0; { Lock lock = LockE(&queue->commit_mutex); { - u64 target = ++queue->commit_fence_target; - cl->commit_fence_target = target; + completion = ++queue->commit_fence_target; + cl->commit_fence_target = completion; /* Execute */ ID3D12CommandQueue_ExecuteCommandLists(queue->d3d_queue, 1, (ID3D12CommandList **)&cl->d3d_cl); - ID3D12CommandQueue_Signal(queue->d3d_queue, queue->commit_fence, target); + ID3D12CommandQueue_Signal(queue->d3d_queue, queue->commit_fence, completion); /* Append */ SllQueuePush(queue->first_committed_cl, queue->last_committed_cl, cl); } Unlock(&lock); } + return completion; } //////////////////////////////////////////////////////////// @@ -1596,7 +1598,7 @@ G_CommandListHandle G_PrepareCommandList(G_QueueKind queue) return G_D12_MakeHandle(G_CommandListHandle, cl); } -void G_CommitCommandListEx(G_CommandListHandle cl_handle, u64 fence_ops_count, G_FenceOp *fence_ops) +i64 G_CommitCommandList(G_CommandListHandle cl_handle) { G_D12_SharedState *g = &G_D12_shared_state; G_D12_CmdList *cl = G_D12_CmdListFromHandle(cl_handle); @@ -1618,13 
+1620,13 @@ void G_CommitCommandListEx(G_CommandListHandle cl_handle, u64 fence_ops_count, G u64 slotted_constants[G_NumConstants]; u64 bound_compute_constants[G_NumConstants]; u64 bound_graphics_constants[G_NumConstants]; - for (i32 i = 0; i < countof(slotted_constants); ++i) { slotted_constants[i] = 0; } /* Zero initialze all constant slots */ + for (i32 i = 0; i < countof(slotted_constants); ++i) { slotted_constants[i] = 0; } /* Zero-initialize all slots */ for (i32 i = 0; i < countof(bound_compute_constants); ++i) { bound_compute_constants[i] = U64Max; } for (i32 i = 0; i < countof(bound_graphics_constants); ++i) { bound_graphics_constants[i] = U64Max; } - if (!G_IsRefNil(queue->debug_print_buffer_ref)) + if (!G_IsRefNil(queue->print_buffer_ref)) { - slotted_constants[G_ShaderConst_DebugBufferRef] = queue->debug_print_buffer_ref.v; + slotted_constants[G_ShaderConst_DebugBufferRef] = queue->print_buffer_ref.v; } /* Rasterizer state */ @@ -1759,7 +1761,7 @@ void G_CommitCommandListEx(G_CommandListHandle cl_handle, u64 fence_ops_count, G G_D12_Cmd *barrier_cmd = &cmds[barrier_cmd_idx]; if (barrier_cmd->kind == G_D12_CmdKind_Barrier) { - G_BarrierDesc desc = barrier_cmd->barrier.desc; + G_MemoryBarrierDesc desc = barrier_cmd->barrier.desc; G_D12_Resource *resource = G_D12_ResourceFromHandle(desc.resource); D3D12_BARRIER_TYPE barrier_type = resource->is_texture ? 
D3D12_BARRIER_TYPE_TEXTURE : D3D12_BARRIER_TYPE_BUFFER; @@ -2196,7 +2198,7 @@ void G_CommitCommandListEx(G_CommandListHandle cl_handle, u64 fence_ops_count, G } /* End dx12 command list */ - G_D12_CommitRawCommandList(rcl); + i64 completion = G_D12_CommitRawCommandList(rcl); /* Free command list */ { @@ -2209,6 +2211,7 @@ void G_CommitCommandListEx(G_CommandListHandle cl_handle, u64 fence_ops_count, G } EndScratch(scratch); + return completion; } //- Arena @@ -2406,9 +2409,9 @@ void G_SetConstant_(G_CommandListHandle cl_handle, i32 slot, void *src_32bit, u3 CopyBytes(&cmd->constant.value, src_32bit, MinU32(size, 4)); } -//- Barrier +//- Memory sync -void G_Sync(G_CommandListHandle cl_handle, G_BarrierDesc desc) +void G_MemorySyncEx(G_CommandListHandle cl_handle, G_MemoryBarrierDesc desc) { G_D12_CmdList *cl = G_D12_CmdListFromHandle(cl_handle); G_D12_Cmd *cmd = G_D12_PushCmd(cl); @@ -2464,78 +2467,114 @@ void G_ClearRenderTarget(G_CommandListHandle cl_handle, G_ResourceHandle resourc } //////////////////////////////////////////////////////////// -//~ @hookimpl Queue synchronization +//~ @hookimpl Synchronization -void G_SyncQueue(G_QueueKind completion_queue_kind, G_QueueKind waiter_queue_kind) +i64 G_CompletionValueFromQueue(G_QueueKind queue_kind) { - if (completion_queue_kind != waiter_queue_kind) - { - G_D12_Queue *completion_queue = G_D12_QueueFromKind(completion_queue_kind); - G_D12_Queue *waiter_queue = G_D12_QueueFromKind(waiter_queue_kind); - ID3D12Fence *d3d_fence = completion_queue->commit_fence; - u64 fence_target = 0; - { - Lock lock = LockS(&completion_queue->commit_mutex); - fence_target = completion_queue->commit_fence_target; - Unlock(&lock); - } - if (ID3D12Fence_GetCompletedValue(d3d_fence) < fence_target) - { - ID3D12CommandQueue_Wait(waiter_queue->d3d_queue, d3d_fence, fence_target); - } - } + G_D12_Queue *queue = G_D12_QueueFromKind(queue_kind); + return ID3D12Fence_GetCompletedValue(queue->commit_fence); } -void G_SyncOtherQueues(G_QueueKind 
completion_queue_kind) +i64 G_CompletionTargetFromQueue(G_QueueKind queue_kind) { - if (G_IsMultiQueueEnabled) + G_D12_Queue *queue = G_D12_QueueFromKind(queue_kind); + i64 target = 0; + { + Lock lock = LockS(&queue->commit_mutex); + target = queue->commit_fence_target; + Unlock(&lock); + } + return target; +} + +G_QueueCompletions G_CompletionValuesFromQueues(G_QueueMask queue_mask) +{ + G_QueueCompletions completions = ZI; + for (G_QueueKind queue_kind = 0; queue_kind < G_NumQueues; ++queue_kind) + { + if (queue_mask & (1 << queue_kind)) + { + completions.v[queue_kind] = G_CompletionValueFromQueue(queue_kind); + } + } + return completions; +} + +G_QueueCompletions G_CompletionTargetsFromQueues(G_QueueMask queue_mask) +{ + G_QueueCompletions completions = ZI; + for (G_QueueKind queue_kind = 0; queue_kind < G_NumQueues; ++queue_kind) + { + if (queue_mask & (1 << queue_kind)) + { + completions.v[queue_kind] = G_CompletionTargetFromQueue(queue_kind); + } + } + return completions; +} + +void G_SyncEx(G_QueueBarrierDesc desc) +{ + G_D12_SharedState *g = &G_D12_shared_state; + + u64 fences_count = 0; + ID3D12Fence *fences[G_NumQueues] = ZI; + i64 fence_targets[G_NumQueues] = ZI; + + /* Grab fences */ + for (G_QueueKind completion_queue_kind = 0; completion_queue_kind < G_NumQueues; ++ completion_queue_kind) { G_D12_Queue *completion_queue = G_D12_QueueFromKind(completion_queue_kind); - ID3D12Fence *d3d_fence = completion_queue->commit_fence; - u64 fence_target = 0; + i64 target = desc.completions.v[completion_queue_kind]; + if (target > 0) { - Lock lock = LockS(&completion_queue->commit_mutex); - fence_target = completion_queue->commit_fence_target; - Unlock(&lock); - } - if (ID3D12Fence_GetCompletedValue(d3d_fence) < fence_target) - { - for (G_QueueKind waiter_queue_kind = 0; waiter_queue_kind < G_NumQueues; ++waiter_queue_kind) + i64 fence_value = ID3D12Fence_GetCompletedValue(completion_queue->commit_fence); + if (fence_value < target) { - if (waiter_queue_kind != 
completion_queue_kind) + fences[fences_count] = completion_queue->commit_fence; + fence_targets[fences_count] = target; + fences_count += 1; + } + } + } + + /* Sync Queues */ + for (G_QueueKind waiter_queue_kind = 0; waiter_queue_kind < G_NumQueues; ++ waiter_queue_kind) + { + if (desc.wait_queues & (1 << waiter_queue_kind)) + { + G_D12_Queue *waiter_queue = G_D12_QueueFromKind(waiter_queue_kind); + for (u64 fence_idx = 0; fence_idx < fences_count; ++fence_idx) + { + ID3D12Fence *fence = fences[fence_idx]; + if (waiter_queue->commit_fence != fence) { - G_D12_Queue *waiter_queue = G_D12_QueueFromKind(waiter_queue_kind); - ID3D12CommandQueue_Wait(waiter_queue->d3d_queue, d3d_fence, fence_target); + i64 target = fence_targets[fence_idx]; + ID3D12CommandQueue_Wait(waiter_queue->d3d_queue, fence, target); } } } } + + /* Sync Cpu */ + if (desc.wait_cpu && fences_count > 0) + { + if (G_D12_tl.sync_event == 0) + { + G_D12_tl.sync_event = CreateEvent(0, 0, 0, 0); + } + ID3D12Device1_SetEventOnMultipleFenceCompletion( + g->device, + fences, + (u64 *)fence_targets, + fences_count, + D3D12_MULTIPLE_FENCE_WAIT_FLAG_ALL, + G_D12_tl.sync_event + ); + WaitForSingleObject(G_D12_tl.sync_event, INFINITE); + } } -//////////////////////////////////////////////////////////// -//~ @hookimpl Map - -// G_Mapped G_Map(G_Resource *gpu_r) -// { -// G_Mapped result = ZI; -// result.resource = gpu_r; -// G_D12_Resource *r = (G_D12_Resource *)gpu_r; -// D3D12_RANGE read_range = ZI; -// HRESULT hr = ID3D12Resource_Map(r->d3d_resource, 0, &read_range, &result.mem); -// if (FAILED(hr) || !result.mem) -// { -// /* TODO: Don't panic */ -// Panic(Lit("Failed to map command buffer resource")); -// } -// return result; -// } - -// void G_Unmap(G_Mapped m) -// { -// G_D12_Resource *r = (G_D12_Resource *)m.resource; -// ID3D12Resource_Unmap(r->d3d_resource, 0, 0); -// } - //////////////////////////////////////////////////////////// //~ @hookimpl Statistics @@ -2786,26 +2825,36 @@ void 
G_D12_WorkerEntry(WaveLaneCtx *lane) G_QueueKind queue_kind = (G_QueueKind)lane->wave->udata; G_D12_Queue *queue = G_D12_QueueFromKind(queue_kind); - for (;;) + // if (queue->print_buffer_size > 0) + if (queue_kind == G_QueueKind_Direct) { - /* FIXME: Remove this */ + G_ArenaHandle gpu_perm = G_PermArena(); + G_ResourceHandle readback_buff = G_PushBuffer( + gpu_perm, + u8, + queue->print_buffer_size, + .flags = G_ResourceFlag_HostMemory + ); - Sleep(500); - - G_ResourceHandle debug_print_buff = queue->debug_print_buffer; - G_D12_Resource *resource = G_D12_ResourceFromHandle(debug_print_buff); - - if (!G_IsResourceNil(debug_print_buff)) + for (;;) { - u8 *base = G_StructFromResource(debug_print_buff, u8); - u32 size = *((u32 *)base); - String text = STRING(size, base + 4); + /* FIXME: Remove this */ - if (queue_kind == G_QueueKind_Direct) + Sleep(500); + + G_CommandListHandle cl = G_PrepareCommandList(queue_kind); { - DEBUGBREAKABLE; + G_CopyBufferToBuffer(cl, readback_buff, 0, queue->print_buffer, RNGU64(0, queue->print_buffer_size)); } - } + i64 completion = G_CommitCommandList(cl); + G_SyncCpu(G_MaskFromQueue(queue_kind)); + u32 size = *G_StructFromResource(readback_buff, u32); + u8 *text = G_StructFromResource(readback_buff, u8) + 4; + + String s = STRING(size, text); + + DEBUGBREAKABLE; + } } } diff --git a/src/gpu/gpu_dx12/gpu_dx12_core.h b/src/gpu/gpu_dx12/gpu_dx12_core.h index cbfba434..37b5db76 100644 --- a/src/gpu/gpu_dx12/gpu_dx12_core.h +++ b/src/gpu/gpu_dx12/gpu_dx12_core.h @@ -220,8 +220,9 @@ Struct(G_D12_Queue) u64 commit_fence_target; /* Global resources */ - G_ResourceHandle debug_print_buffer; - G_RWByteAddressBufferRef debug_print_buffer_ref; + u64 print_buffer_size; + G_ResourceHandle print_buffer; + G_RWByteAddressBufferRef print_buffer_ref; /* Raw command lists */ struct G_D12_RawCommandList *first_committed_cl; @@ -283,7 +284,7 @@ Struct(G_D12_Cmd) struct { - G_BarrierDesc desc; + G_MemoryBarrierDesc desc; /* Post-batch data */ b32 
is_end_of_batch; @@ -413,6 +414,11 @@ Struct(G_D12_SharedState) ID3D12Device10 *device; } extern G_D12_shared_state; +Struct(G_D12_ThreadLocalState) +{ + HANDLE sync_event; +} extern ThreadLocal G_D12_tl; + //////////////////////////////////////////////////////////// //~ Helpers @@ -453,7 +459,7 @@ G_D12_Descriptor *G_D12_PushDescriptor(G_D12_Arena *gpu_arena, G_D12_DescriptorH //~ Raw command list G_D12_RawCommandList *G_D12_PrepareRawCommandList(G_QueueKind queue_kind); -void G_D12_CommitRawCommandList(G_D12_RawCommandList *cl); +i64 G_D12_CommitRawCommandList(G_D12_RawCommandList *cl); //////////////////////////////////////////////////////////// //~ Command helpers