From bc76a511e6e271def37023aa148e8f1a0f4c2bba Mon Sep 17 00:00:00 2001
From: jacob
Date: Wed, 10 Dec 2025 20:21:08 -0600
Subject: [PATCH] shader printf arg parsing on cpu

---
Review notes (free-form text between '---' and the first 'diff --git' line is
not part of the applied change):
 - config.h: GPU_SHADER_PRINT_BUFFER_SIZE no longer carries a trailing ';'.
 - gpu_dx12_core.c: the arg loop used '<=' and overran both the FmtArg array
   and the serialized record; arg payloads are now read byte-wise because they
   are not 4-byte aligned inside the record.
 - gpu_shader_core.cgh: G_CommitPrint stored at 'base + pos' although 'pos'
   already includes 'base'; G_Print now zeroes args_count before committing;
   the disabled G_PrintF stub accepts variadic arguments.
 - Fixes only rewrite added lines in place, so hunk sizes and the diffstat
   below are unchanged.

 src/base/base_shader.gh          |   6 +-
 src/config.h                     |   2 +
 src/gpu/gpu_dx12/gpu_dx12_core.c |  91 ++++++++++++++++++++--
 src/gpu/gpu_shader_core.cgh      | 127 ++++++++++++++++++++++++-------
 src/ui/ui_shaders.g              |   5 +-
 5 files changed, 193 insertions(+), 38 deletions(-)

diff --git a/src/base/base_shader.gh b/src/base/base_shader.gh
index 75ac7674..a6710a15 100644
--- a/src/base/base_shader.gh
+++ b/src/base/base_shader.gh
@@ -24,7 +24,11 @@ typedef float4x4 Mat4x4;
 ////////////////////////////////////////////////////////////
 //~ Countof
 
-template<typename T, u32 N> u32 countof(T arr[N]) { return N; }
+template<typename T, u32 N>
+u32 countof(T arr[N])
+{
+  return N;
+}
 
 ////////////////////////////////////////////////////////////
 //~ Color helpers
diff --git a/src/config.h b/src/config.h
index d0951d8e..a0720b78 100644
--- a/src/config.h
+++ b/src/config.h
@@ -73,6 +73,8 @@
 #define GPU_DEBUG_VALIDATION 1
 
 #define GPU_SHADER_PRINT 1
+#define GPU_SHADER_PRINT_BUFFER_SIZE Kibi(1) /* No trailing ';' so the macro stays usable inside expressions */
+#define GPU_SHADER_PRINT_LOG 1
 
 /* If enabled, bitbuffs will insert/verify magic numbers & length for each read & write */
 #define BITBUFF_DEBUG 0
diff --git a/src/gpu/gpu_dx12/gpu_dx12_core.c b/src/gpu/gpu_dx12/gpu_dx12_core.c
index b03a248a..ef073019 100644
--- a/src/gpu/gpu_dx12/gpu_dx12_core.c
+++ b/src/gpu/gpu_dx12/gpu_dx12_core.c
@@ -300,7 +300,7 @@ void G_Bootstrap(void)
       if (kind != G_QueueKind_AsyncCopy)
       {
         G_ArenaHandle gpu_perm = G_PermArena();
-        queue->print_buffer_size = Mebi(64);
+        queue->print_buffer_size = GPU_SHADER_PRINT_BUFFER_SIZE;
         queue->print_buffer = G_PushBuffer(
           gpu_perm,
           u8,
@@ -2840,7 +2840,6 @@ void G_D12_CollectionWorkerEntry(WaveLaneCtx *lane)
     .flags = G_ResourceFlag_HostMemory
   );
 
-  u32 zero = 0;
   for (;;)
   {
     /* FIXME: Remove this */
@@ -2851,20 +2850,96 @@ void G_D12_CollectionWorkerEntry(WaveLaneCtx *lane)
     {
       /* Copy print buffer to readback buffer */
       G_CopyBufferToBuffer(cl, readback_buff, 0, queue->print_buffer, RNGU64(0, queue->print_buffer_size));
-      /* Reset size to 0 */
+      /* Reset the three leading u32 counters (attempted bytes, prints, overflows) to 0 */
       G_MemorySync(cl, queue->print_buffer,
         G_Stage_Copy, G_Access_CopyRead,
         G_Stage_Copy, G_Access_CopyWrite
       );
-      G_CopyCpuToBuffer(cl, queue->print_buffer, 0, &zero, RNGU64(0, 4));
+      u8 zero[12] = ZI;
+      G_CopyCpuToBuffer(cl, queue->print_buffer, 0, zero, RNGU64(0, sizeof(zero)));
     }
-    i64 completion = G_CommitCommandList(cl);
+    G_CommitCommandList(cl);
 
     G_SyncCpu(G_MaskFromQueue(queue_kind));
-    u32 size = *G_StructFromResource(readback_buff, u32);
-    u8 *text = G_StructFromResource(readback_buff, u8) + 4;
+    u32 attempted_print_bytes_count = *(G_StructFromResource(readback_buff, u32) + 0);
+    u32 prints_count = *(G_StructFromResource(readback_buff, u32) + 1);
+    u32 overflows_count = *(G_StructFromResource(readback_buff, u32) + 2);
+    u8 *start = G_StructFromResource(readback_buff, u8) + 12;
 
-    String s = STRING(size, text);
+    /* Deserialize records: u32 header (low 16 bits = fmt chars, high 16 bits = args), fmt chars, 5 bytes per arg (kind + LE u32), padded to 4 bytes */
+    if (GPU_SHADER_PRINT_LOG)
+    {
+      /* FIXME: Remove this */
+      TempArena scratch = BeginScratchNoConflict();
+      u8 *at = start;
+      {
+        for (u32 print_num = 1; print_num <= prints_count; ++print_num)
+        {
+          u32 chars_count = 0;
+          u32 args_count = 0;
+          {
+            u32 header = *(u32 *)at;
+            chars_count = (header & 0x0000FFFF) >> 0;
+            args_count = (header & 0xFFFF0000) >> 16;
+            at += 4;
+          }
+
+          String fmt = ZI;
+          {
+            fmt.len = chars_count;
+            fmt.text = at;
+            at += chars_count;
+          }
+
+          FmtArg *args = 0;
+          {
+            if (args_count > 0)
+            {
+              args = PushStructs(scratch.arena, FmtArg, args_count);
+              for (u32 arg_idx = 0; arg_idx < args_count; ++arg_idx) /* '<': exactly args_count slots were pushed and serialized */
+              {
+                G_FmtArgKind gpu_kind = (G_FmtArgKind)(*at);
+                at += 1;
+                u32 gpu_data = (u32)at[0] | ((u32)at[1] << 8) | ((u32)at[2] << 16) | ((u32)at[3] << 24); /* Payload is unaligned; bytes were pushed low-to-high */
+                at += 4;
+
+                FmtArg *dst = &args[arg_idx];
+                switch (gpu_kind)
+                {
+                  case G_FmtArgKind_U32:
+                  {
+                    dst->kind = FmtArgKind_Uint;
+                    dst->value.uint = gpu_data;
+                  } break;
+                  case G_FmtArgKind_I32:
+                  {
+                    dst->kind = FmtArgKind_Sint;
+                    dst->value.sint = (i32)gpu_data;
+                  } break;
+                  case G_FmtArgKind_F32:
+                  {
+                    dst->kind = FmtArgKind_Float;
+                    dst->value.f = *(f32 *)&gpu_data;
+                  } break;
+                }
+              }
+            }
+          }
+
+          // String final_str = ZI;
+          // if (args_count > 0)
+          // {
+          // }
+          // else
+          // {
+          //   final_str = PushString(scratch.arena, fmt);
+          // }
+
+          at = (u8 *)AlignU64((u64)at, 4);
+        }
+      }
+      EndScratch(scratch);
+    }
 
     DEBUGBREAKABLE;
   }
diff --git a/src/gpu/gpu_shader_core.cgh b/src/gpu/gpu_shader_core.cgh
index d8bdc9b1..6f9c1f1d 100644
--- a/src/gpu/gpu_shader_core.cgh
+++ b/src/gpu/gpu_shader_core.cgh
@@ -109,62 +109,133 @@ G_ForceDeclConstant(G_RWByteAddressBufferRef, G_ShaderConst_PrintBufferRef, 8)
 
 /* This technique is based on MJP's article: https://therealmjp.github.io/posts/hlsl-printf/ */
 
+Enum(G_FmtArgKind)
+{
+  G_FmtArgKind_None,
+  G_FmtArgKind_U32,
+  G_FmtArgKind_I32,
+  G_FmtArgKind_F32,
+};
+
+Struct(G_FmtArg)
+{
+  G_FmtArgKind kind;
+  u32 v;
+};
+
 #if IsLanguageG && GPU_SHADER_PRINT
+  G_FmtArg G_Fmt(u32 v) { G_FmtArg result; result.kind = G_FmtArgKind_U32; result.v = v; return result; }
+  G_FmtArg G_Fmt(i32 v) { G_FmtArg result; result.kind = G_FmtArgKind_I32; result.v = v; return result; }
+  G_FmtArg G_Fmt(f32 v) { G_FmtArg result; result.kind = G_FmtArgKind_F32; result.v = asuint(v); return result; }
+
   Struct(G_TempPrintBuffer)
   {
-    u32 data_u32[256];
-    u32 byte_pos;
+    u32 char_chunks[256];
+    u32 char_pos;
+    u32 fmt_size;
+    u32 args_count;
   };
 
-  void G_PushPrintChar(inout G_TempPrintBuffer buff, u32 c)
+  void G_PushPrintChar(inout G_TempPrintBuffer buff, u32 v)
   {
     /* TODO: Overflow check */
-    u32 u32_arr_pos = buff.byte_pos / 4;
-    u32 idx_in_u32 = buff.byte_pos & 0x03;
+    u32 u32_arr_pos = buff.char_pos / 4;
+    u32 idx_in_u32 = buff.char_pos & 0x03;
     if (idx_in_u32 == 0)
     {
-      /* Since buff is not zero initialized, we set the byte on first write here */
-      buff.data_u32[u32_arr_pos] = c & 0xFF;
+      /* Since buff is not zero initialized, we set the chunk on first write here */
+      buff.char_chunks[u32_arr_pos] = v & 0xFF;
     }
     else
     {
-      buff.data_u32[u32_arr_pos] |= (c & 0xFF) << (idx_in_u32 * 8);
+      buff.char_chunks[u32_arr_pos] |= (v & 0xFF) << (idx_in_u32 * 8);
     }
-    buff.byte_pos += 1;
+    buff.char_pos += 1;
   }
 
   void G_CommitPrint(G_TempPrintBuffer buff)
   {
     RWByteAddressBuffer rw = G_Dereference(G_ShaderConst_PrintBufferRef);
-    u32 u32s_count = (buff.byte_pos + 3) / 4;
-    u32 alloc_size = u32s_count * 4;
+
+    u32 chunks_count = (buff.char_pos + 3) / 4;
+
+    u32 alloc_size = 0;
+    alloc_size += 4; /* Header */
+    alloc_size += chunks_count * 4; /* Chunks */
 
     u32 base;
-    rw.InterlockedAdd(0, alloc_size, base);
-    base += 4; /* Account for counter at beginning of buff */
+    rw.InterlockedAdd(0, alloc_size, base); /* Write to base counter */
 
-    if ((base + alloc_size) < countof(rw))
+    u32 pos = base; /* Absolute byte offset of this record once the three counters are skipped */
+    pos += 4; /* Offset for base counter */
+    pos += 4; /* Offset for success counter */
+    pos += 4; /* Offset for overflow counter */
+
+    if ((pos + alloc_size) < countof(rw))
     {
-      for (u32 u32_idx = 0; u32_idx < u32s_count; ++u32_idx)
+      /* Increment success counter */
+      rw.InterlockedAdd(4, 1);
+
+      /* Store header */
       {
-        u32 data = buff.data_u32[u32_idx];
-        rw.Store(base + (u32_idx * 4), data);
+        u32 header = 0;
+        header |= (buff.fmt_size << 0) & 0x0000FFFF;
+        header |= (buff.args_count << 16) & 0xFFFF0000;
+        rw.Store(pos, header); /* pos already includes base */
+        pos += 4;
       }
+      /* Store chunks */
+      for (u32 chunk_idx = 0; chunk_idx < chunks_count; ++chunk_idx)
+      {
+        u32 chunk = buff.char_chunks[chunk_idx];
+        rw.Store(pos, chunk); /* pos already includes base */
+        pos += 4;
+      }
+    }
+    else
+    {
+      /* Increment overflow counter */
+      rw.InterlockedAdd(8, 1);
     }
   }
 
-  #define G_Print(fmt) do { \
-    G_TempPrintBuffer __tmp; \
-    __tmp.byte_pos = 0; \
-    u32 __pos = 0; \
-    while (U32FromChar(fmt[__pos]) != 0) \
-    { \
-      G_PushPrintChar(__tmp, U32FromChar(fmt[__pos])); \
-      ++__pos; \
-    } \
-    G_PushPrintChar(__tmp, 0); \
-    G_CommitPrint(__tmp); \
+  #define G_Print(fmt) do { \
+    G_TempPrintBuffer __tmp; \
+    __tmp.char_pos = 0; \
+    u32 __pos = 0; \
+    while (U32FromChar(fmt[__pos]) != 0) \
+    { \
+      G_PushPrintChar(__tmp, U32FromChar(fmt[__pos])); \
+      ++__pos; \
+    } \
+    __tmp.fmt_size = __tmp.char_pos; __tmp.args_count = 0; \
+    G_CommitPrint(__tmp); \
   } while (0)
+
+  #define G_PrintF(fmt, ...) do { \
+    G_TempPrintBuffer __tmp; \
+    __tmp.char_pos = 0; \
+    u32 __pos = 0; \
+    while (U32FromChar(fmt[__pos]) != 0) \
+    { \
+      G_PushPrintChar(__tmp, U32FromChar(fmt[__pos])); \
+      ++__pos; \
+    } \
+    G_FmtArg __args[] = { __VA_ARGS__ }; \
+    __tmp.fmt_size = __tmp.char_pos; \
+    __tmp.args_count = countof(__args); \
+    for (u32 __arg_idx = 0; __arg_idx < countof(__args); ++__arg_idx) \
+    { \
+      G_PushPrintChar(__tmp, __args[__arg_idx].kind); \
+      G_PushPrintChar(__tmp, __args[__arg_idx].v >> 0); \
+      G_PushPrintChar(__tmp, __args[__arg_idx].v >> 8); \
+      G_PushPrintChar(__tmp, __args[__arg_idx].v >> 16); \
+      G_PushPrintChar(__tmp, __args[__arg_idx].v >> 24); \
+    } \
+    G_CommitPrint(__tmp); \
+  } while (0)
+
 #else
   #define G_Print(fmt)
+  #define G_PrintF(fmt, ...)
 #endif
diff --git a/src/ui/ui_shaders.g b/src/ui/ui_shaders.g
index 5b64e281..43d9247c 100644
--- a/src/ui/ui_shaders.g
+++ b/src/ui/ui_shaders.g
@@ -145,7 +145,10 @@ PixelShader(UI_BlitPS, UI_BlitPSOutput, UI_BlitPSInput input)
   Vec2 uv = input.src_uv;
   Vec4 result = tex.Sample(sampler, uv);
 
-  G_Print("Hello there!");
+  // G_Print("Hello there!");
+  G_PrintF("Hello there: \"%F\"", G_Fmt(3.123));
+
+
 
   UI_BlitPSOutput output;
   output.SV_Target0 = result;