From f911e98c982953b27aa00e68fb30a3405258ba5c Mon Sep 17 00:00:00 2001 From: jacob Date: Wed, 10 Dec 2025 17:23:51 -0600 Subject: [PATCH] reset print buffer size in collection worker --- src/gpu/gpu_common.c | 2 +- src/gpu/gpu_common.h | 2 +- src/gpu/gpu_core.h | 54 ++++++++--------- src/gpu/gpu_dx12/gpu_dx12_core.c | 99 ++++++++++++++++++-------------- src/gpu/gpu_dx12/gpu_dx12_core.h | 4 +- src/gpu/gpu_shader_core.cgh | 33 +++++++---- src/ui/ui_shaders.g | 2 +- 7 files changed, 108 insertions(+), 88 deletions(-) diff --git a/src/gpu/gpu_common.c b/src/gpu/gpu_common.c index a2b9cd35..1f66e367 100644 --- a/src/gpu/gpu_common.c +++ b/src/gpu/gpu_common.c @@ -71,7 +71,7 @@ G_ArenaHandle G_PermArena(void) return perm; } -//- Cpu -> Gpu copy +//- Cpu -> Gpu upload G_ResourceHandle G_PushBufferFromString_(G_ArenaHandle gpu_arena, G_CommandListHandle cl, String src, G_BufferResourceDesc desc) { diff --git a/src/gpu/gpu_common.h b/src/gpu/gpu_common.h index 33cd588e..76d3136e 100644 --- a/src/gpu/gpu_common.h +++ b/src/gpu/gpu_common.h @@ -23,7 +23,7 @@ void G_BootstrapCommon(void); G_ArenaHandle G_PermArena(void); -//- Cpu -> Gpu copy +//- Cpu -> Gpu upload G_ResourceHandle G_PushBufferFromString_(G_ArenaHandle gpu_arena, G_CommandListHandle cl, String src, G_BufferResourceDesc desc); #define G_PushBufferFromString(_arena, _cl, _src, ...) \ diff --git a/src/gpu/gpu_core.h b/src/gpu/gpu_core.h index dfa7edd9..afb3dd91 100644 --- a/src/gpu/gpu_core.h +++ b/src/gpu/gpu_core.h @@ -299,7 +299,7 @@ Enum(G_Layout) G_Layout_ComputeQueue_CopyRead, /* D3D12_BARRIER_LAYOUT_COMPUTE_QUEUE_COPY_SOURCE */ }; -/* Barrier will execute after previous stages specified by `sync_prev`, and before next stages specified by `sync_next`. +/* Barrier will execute after previous stages specified by `stage_prev`, and before next stages specified by `stage_next`. * When barrier executes: * - Necessary resource flushes will occur based on `access_prev` & `access_next` * - Texture layout will transition based on `layout` (if specified) @@ -308,8 +308,8 @@ Struct(G_MemoryBarrierDesc) { G_ResourceHandle resource; b32 is_global; - G_Stage sync_prev; - G_Stage sync_next; + G_Stage stage_prev; + G_Stage stage_next; G_Access access_prev; G_Access access_next; G_Layout layout; @@ -324,8 +324,8 @@ Enum(G_ResourceFlag) G_ResourceFlag_AllowShaderReadWrite = (1 << 0), G_ResourceFlag_AllowRenderTarget = (1 << 1), G_ResourceFlag_AllowDepthStencil = (1 << 2), - G_ResourceFlag_HostMemory = (1 << 3), - G_ResourceFlag_WriteCombinedHostMemory = (1 << 4), + G_ResourceFlag_HostMemory = (1 << 3), /* Resource will automatically be mapped into the cpu's address space */ + G_ResourceFlag_WriteCombined = (1 << 4), /* Writes into the mapped resource will be combined. Fast for linear memcpy, slow for everything else */ }; //////////////////////////////////////////////////////////// @@ -680,32 +680,32 @@ void G_SetConstant_(G_CommandListHandle cl, i32 slot, void *src_32bit, u32 size) void G_MemorySyncEx(G_CommandListHandle cl, G_MemoryBarrierDesc desc); -#define G_MemorySync(_cl, _resource, _sync_prev, _access_prev, _sync_next, _access_next) \ - G_MemorySyncEx((_cl), (G_MemoryBarrierDesc) { \ - .resource = (_resource), \ - .sync_prev = _sync_prev, \ - .access_prev = _access_prev, \ - .sync_next = _sync_next, \ - .access_next = _access_next, \ +#define G_MemorySync(_cl, _resource, _stage_prev, _access_prev, _stage_next, _access_next) \ + G_MemorySyncEx((_cl), (G_MemoryBarrierDesc) { \ + .resource = (_resource), \ + .stage_prev = _stage_prev, \ + .access_prev = _access_prev, \ + .stage_next = _stage_next, \ + .access_next = _access_next, \ }) -#define G_MemoryLayoutSync(_cl, _resource, _sync_prev, _access_prev, _sync_next, _access_next, _layout) \ - G_MemorySyncEx((_cl), (G_MemoryBarrierDesc) { \ - .resource = (_resource), \ - .sync_prev = _sync_prev, \ - .access_prev = _access_prev, \ - .sync_next = _sync_next, \ - .access_next = _access_next, \ - .layout = _layout, \ +#define G_MemoryLayoutSync(_cl, _resource, _stage_prev, _access_prev, _stage_next, _access_next, _layout) \ + G_MemorySyncEx((_cl), (G_MemoryBarrierDesc) { \ + .resource = (_resource), \ + .stage_prev = _stage_prev, \ + .access_prev = _access_prev, \ + .stage_next = _stage_next, \ + .access_next = _access_next, \ + .layout = _layout, \ }) -#define G_GlobalMemorySync(_cl, _sync_prev, _access_prev, _sync_next, _access_next) \ - G_MemorySync((_cl), (G_MemoryBarrierDesc) { \ - .is_global = 1, \ - .sync_prev = _sync_prev, \ - .access_prev = _access_prev, \ - .sync_next = _sync_next, \ - .access_next = _access_next, \ +#define G_GlobalMemorySync(_cl, _stage_prev, _access_prev, _stage_next, _access_next) \ + G_MemorySync((_cl), (G_MemoryBarrierDesc) { \ + .is_global = 1, \ + .stage_prev = _stage_prev, \ + .access_prev = _access_prev, \ + .stage_next = _stage_next, \ + .access_next = _access_next, \ }) #define G_DumbMemorySync(cl, resource) \ diff --git a/src/gpu/gpu_dx12/gpu_dx12_core.c b/src/gpu/gpu_dx12/gpu_dx12_core.c index 589d7791..b03a248a 100644 --- a/src/gpu/gpu_dx12/gpu_dx12_core.c +++ b/src/gpu/gpu_dx12/gpu_dx12_core.c @@ -316,14 +316,16 @@ void G_Bootstrap(void) ////////////////////////////// //- Start workers - for (G_QueueKind kind = 0; kind < G_NumQueues; ++kind) - { - String name = ZI; - if (kind == G_QueueKind_Direct) name = Lit("Direct queue worker"); - if (kind == G_QueueKind_AsyncCompute) name = Lit("Compute queue worker"); - if (kind == G_QueueKind_AsyncCopy) name = Lit("Copy queue worker"); - DispatchWave(name, 1, G_D12_WorkerEntry, (void *)(u64)kind); - } + // for (G_QueueKind kind = 0; kind < G_NumQueues; ++kind) + // { + // String name = ZI; + // if (kind == G_QueueKind_Direct) name = Lit("Gpu direct queue worker"); + // if (kind == G_QueueKind_AsyncCompute) name = Lit("Gpu compute queue worker"); + // if (kind == G_QueueKind_AsyncCopy) name = Lit("Gpu copy queue worker"); + // DispatchWave(name, 1, G_D12_WorkerEntry, (void *)(u64)kind); + // } + + DispatchWave(Lit("Gpu collection worker"), 1, G_D12_CollectionWorkerEntry, 0); EndScratch(scratch); } @@ -949,7 +951,7 @@ G_ResourceHandle G_PushBufferEx(G_ArenaHandle arena_handle, G_BufferResourceDesc if (desc.flags & G_ResourceFlag_HostMemory) { heap_kind = G_D12_ResourceHeapKind_Cpu; - if (desc.flags & G_ResourceFlag_WriteCombinedHostMemory) + if (desc.flags & G_ResourceFlag_WriteCombined) { heap_kind = G_D12_ResourceHeapKind_CpuWriteCombined; } @@ -1028,7 +1030,7 @@ G_ResourceHandle G_PushTextureEx(G_ArenaHandle arena_handle, G_TextureResourceDe if (desc.flags & G_ResourceFlag_HostMemory) { heap_kind = G_D12_ResourceHeapKind_Cpu; - if (desc.flags & G_ResourceFlag_WriteCombinedHostMemory) + if (desc.flags & G_ResourceFlag_WriteCombined) { heap_kind = G_D12_ResourceHeapKind_CpuWriteCombined; } @@ -1626,7 +1628,7 @@ i64 G_CommitCommandList(G_CommandListHandle cl_handle) if (!G_IsRefNil(queue->print_buffer_ref)) { - slotted_constants[G_ShaderConst_DebugBufferRef] = queue->print_buffer_ref.v; + slotted_constants[G_ShaderConst_PrintBufferRef] = queue->print_buffer_ref.v; } /* Rasterizer state */ @@ -1655,10 +1657,12 @@ i64 G_CommitCommandList(G_CommandListHandle cl_handle) { Lock lock = LockE(&g->free_cmd_chunks_mutex); { - for (G_D12_CmdChunk *chunk = cl->first_cmd_chunk; chunk; chunk = chunk->next) + G_D12_CmdChunk *chunk = cl->first_cmd_chunk; + while (chunk) { - chunk->next = g->first_free_cmd_chunk; + G_D12_CmdChunk *next = chunk->next; g->first_free_cmd_chunk = chunk; + chunk = next; } } Unlock(&lock); @@ -1729,6 +1733,7 @@ i64 G_CommitCommandList(G_CommandListHandle cl_handle) } break; //- Constant + case G_D12_CmdKind_Constant: { i32 slot = cmd->constant.slot; @@ -1766,8 +1771,8 @@ i64 G_CommitCommandList(G_CommandListHandle cl_handle) D3D12_BARRIER_TYPE barrier_type = resource->is_texture ? D3D12_BARRIER_TYPE_TEXTURE : D3D12_BARRIER_TYPE_BUFFER; /* Translate gpu barrier kind -> d3d barrier fields */ - D3D12_BARRIER_SYNC sync_before = G_D12_BarrierSyncFromStages(desc.sync_prev); - D3D12_BARRIER_SYNC sync_after = G_D12_BarrierSyncFromStages(desc.sync_next); + D3D12_BARRIER_SYNC sync_before = G_D12_BarrierSyncFromStages(desc.stage_prev); + D3D12_BARRIER_SYNC sync_after = G_D12_BarrierSyncFromStages(desc.stage_next); D3D12_BARRIER_ACCESS access_before = G_D12_BarrierAccessFromAccesses(desc.access_prev); D3D12_BARRIER_ACCESS access_after = G_D12_BarrierAccessFromAccesses(desc.access_next); D3D12_BARRIER_LAYOUT layout_before = resource->texture_layout; @@ -2818,43 +2823,49 @@ void G_CommitBackbuffer(G_ResourceHandle backbuffer_handle, i32 vsync) } //////////////////////////////////////////////////////////// -//~ Workers +//~ Collection worker -void G_D12_WorkerEntry(WaveLaneCtx *lane) +void G_D12_CollectionWorkerEntry(WaveLaneCtx *lane) { - G_QueueKind queue_kind = (G_QueueKind)lane->wave->udata; + G_QueueKind queue_kind = G_QueueKind_Direct; G_D12_Queue *queue = G_D12_QueueFromKind(queue_kind); // if (queue->print_buffer_size > 0) - if (queue_kind == G_QueueKind_Direct) + + G_ArenaHandle gpu_perm = G_PermArena(); + G_ResourceHandle readback_buff = G_PushBuffer( + gpu_perm, + u8, + queue->print_buffer_size, + .flags = G_ResourceFlag_HostMemory + ); + + u32 zero = 0; + for (;;) { - G_ArenaHandle gpu_perm = G_PermArena(); - G_ResourceHandle readback_buff = G_PushBuffer( - gpu_perm, - u8, - queue->print_buffer_size, - .flags = G_ResourceFlag_HostMemory - ); + /* FIXME: Remove this */ - for (;;) + Sleep(100); + + G_CommandListHandle cl = G_PrepareCommandList(queue_kind); { - /* FIXME: Remove this */ - - Sleep(500); - - G_CommandListHandle cl = G_PrepareCommandList(queue_kind); - { - G_CopyBufferToBuffer(cl, readback_buff, 0, queue->print_buffer, RNGU64(0, queue->print_buffer_size)); - } - i64 completion = G_CommitCommandList(cl); - - G_SyncCpu(G_MaskFromQueue(queue_kind)); - u32 size = *G_StructFromResource(readback_buff, u32); - u8 *text = G_StructFromResource(readback_buff, u8) + 4; - - String s = STRING(size, text); - - DEBUGBREAKABLE; + /* Copy print buffer to readback buffer */ + G_CopyBufferToBuffer(cl, readback_buff, 0, queue->print_buffer, RNGU64(0, queue->print_buffer_size)); + /* Reset size to 0 */ + G_MemorySync(cl, queue->print_buffer, + G_Stage_Copy, G_Access_CopyRead, + G_Stage_Copy, G_Access_CopyWrite + ); + G_CopyCpuToBuffer(cl, queue->print_buffer, 0, &zero, RNGU64(0, 4)); } + i64 completion = G_CommitCommandList(cl); + + G_SyncCpu(G_MaskFromQueue(queue_kind)); + u32 size = *G_StructFromResource(readback_buff, u32); + u8 *text = G_StructFromResource(readback_buff, u8) + 4; + + String s = STRING(size, text); + + DEBUGBREAKABLE; } } diff --git a/src/gpu/gpu_dx12/gpu_dx12_core.h b/src/gpu/gpu_dx12/gpu_dx12_core.h index 37b5db76..bedf2caf 100644 --- a/src/gpu/gpu_dx12/gpu_dx12_core.h +++ b/src/gpu/gpu_dx12/gpu_dx12_core.h @@ -469,6 +469,6 @@ G_D12_Cmd *G_D12_PushConstCmd(G_D12_CmdList *cl, i32 slot, void *v); G_D12_StagingRegionNode *G_D12_PushStagingRegion(G_D12_CmdList *cl, u64 size); //////////////////////////////////////////////////////////// -//~ Workers +//~ Collection worker -void G_D12_WorkerEntry(WaveLaneCtx *lane); +void G_D12_CollectionWorkerEntry(WaveLaneCtx *lane); diff --git a/src/gpu/gpu_shader_core.cgh b/src/gpu/gpu_shader_core.cgh index d03d4ec8..d8bdc9b1 100644 --- a/src/gpu/gpu_shader_core.cgh +++ b/src/gpu/gpu_shader_core.cgh @@ -102,7 +102,7 @@ Struct(G_SamplerStateRef) { u32 v; }; StaticAssert(G_NumGeneralPurposeConstants == 8); StaticAssert(G_NumReservedConstants == 1); -G_ForceDeclConstant(G_RWByteAddressBufferRef, G_ShaderConst_DebugBufferRef, 8); +G_ForceDeclConstant(G_RWByteAddressBufferRef, G_ShaderConst_PrintBufferRef, 8); //////////////////////////////////////////////////////////// //~ Debug printf @@ -110,24 +110,32 @@ G_ForceDeclConstant(G_RWByteAddressBufferRef, G_ShaderConst_DebugBufferRef, 8) /* This technique is based on MJP's article: https://therealmjp.github.io/posts/hlsl-printf/ */ #if IsLanguageG && GPU_SHADER_PRINT - Struct(G_DebugBuffer) + Struct(G_TempPrintBuffer) { u32 data_u32[256]; u32 byte_pos; }; - void G_PushDebugChar(inout G_DebugBuffer buff, u32 c) + void G_PushPrintChar(inout G_TempPrintBuffer buff, u32 c) { /* TODO: Overflow check */ u32 u32_arr_pos = buff.byte_pos / 4; u32 idx_in_u32 = buff.byte_pos & 0x03; - buff.data_u32[u32_arr_pos] |= (c & 0xFF) << (idx_in_u32 * 8); + if (idx_in_u32 == 0) + { + /* Since buff is not zero initialized, we set the byte on first write here */ + buff.data_u32[u32_arr_pos] = c & 0xFF; + } + else + { + buff.data_u32[u32_arr_pos] |= (c & 0xFF) << (idx_in_u32 * 8); + } buff.byte_pos += 1; } - void G_CommitDebugBuffer(G_DebugBuffer buff) + void G_CommitPrint(G_TempPrintBuffer buff) { - RWByteAddressBuffer rw = G_Dereference(G_ShaderConst_DebugBufferRef); + RWByteAddressBuffer rw = G_Dereference(G_ShaderConst_PrintBufferRef); u32 u32s_count = (buff.byte_pos + 3) / 4; u32 alloc_size = u32s_count * 4; @@ -145,17 +153,18 @@ G_ForceDeclConstant(G_RWByteAddressBufferRef, G_ShaderConst_DebugBufferRef, 8) } } - #define G_DebugPrint(fmt) do { \ - G_DebugBuffer __dbg; \ - __dbg.byte_pos = 0; \ + #define G_Print(fmt) do { \ + G_TempPrintBuffer __tmp; \ + __tmp.byte_pos = 0; \ u32 __pos = 0; \ while (U32FromChar(fmt[__pos]) != 0) \ { \ - G_PushDebugChar(__dbg, U32FromChar(fmt[__pos])); \ + G_PushPrintChar(__tmp, U32FromChar(fmt[__pos])); \ ++__pos; \ } \ - G_CommitDebugBuffer(__dbg); \ + G_PushPrintChar(__tmp, 0); \ + G_CommitPrint(__tmp); \ } while (0) #else - #define G_DebugPrint(fmt) + #define G_Print(fmt) #endif diff --git a/src/ui/ui_shaders.g b/src/ui/ui_shaders.g index 690a9161..5b64e281 100644 --- a/src/ui/ui_shaders.g +++ b/src/ui/ui_shaders.g @@ -145,7 +145,7 @@ PixelShader(UI_BlitPS, UI_BlitPSOutput, UI_BlitPSInput input) Vec2 uv = input.src_uv; Vec4 result = tex.Sample(sampler, uv); - G_DebugPrint("Hello there"); + G_Print("Hello there!"); UI_BlitPSOutput output; output.SV_Target0 = result;