From 4453d18d519bbaabbf5e3ea384e1ad37eb9a41bf Mon Sep 17 00:00:00 2001
From: jacob
Date: Thu, 11 Dec 2025 10:24:30 -0600
Subject: [PATCH] check for overflow in shader printf

---
 src/config.h                     |   2 +-
 src/gpu/gpu_dx12/gpu_dx12_core.c |  33 ++++++---
 src/gpu/gpu_shader_core.cgh      | 121 ++++++++++++++++---------------
 src/ui/ui_shaders.g              |  11 +--
 4 files changed, 96 insertions(+), 71 deletions(-)

diff --git a/src/config.h b/src/config.h
index a0720b78..9bb8ceb4 100644
--- a/src/config.h
+++ b/src/config.h
@@ -70,7 +70,7 @@
 #define FLOOD_DEBUG 0
 
 #define GPU_DEBUG 1
-#define GPU_DEBUG_VALIDATION 1
+#define GPU_DEBUG_VALIDATION 0
 #define GPU_SHADER_PRINT 1
 #define GPU_SHADER_PRINT_BUFFER_SIZE Kibi(1);
 
diff --git a/src/gpu/gpu_dx12/gpu_dx12_core.c b/src/gpu/gpu_dx12/gpu_dx12_core.c
index c2ae3a52..a6912cff 100644
--- a/src/gpu/gpu_dx12/gpu_dx12_core.c
+++ b/src/gpu/gpu_dx12/gpu_dx12_core.c
@@ -2872,14 +2872,24 @@ void G_D12_CollectionWorkerEntry(WaveLaneCtx *lane)
 
             G_SyncCpu(G_MaskFromQueue(queue_kind));
 
-            u32 attempted_print_bytes_count = *(G_StructFromResource(readback_buff, u32) + 0);
-            u32 prints_count = *(G_StructFromResource(readback_buff, u32) + 1);
-            u32 overflows_count = *(G_StructFromResource(readback_buff, u32) + 2);
+            u32 attempted_print_bytes_count = *(G_StructFromResource(readback_buff, u32) + 0); /* The number of bytes shaders attempted to write */
+            u32 prints_count = *(G_StructFromResource(readback_buff, u32) + 1); /* The number of shader prints that are in the buffer */
+            u32 overflows_count = *(G_StructFromResource(readback_buff, u32) + 2); /* The number of shader prints that could not fit in the buffer */
             u8 *start = G_StructFromResource(readback_buff, u8) + 12;
 
             /* Deserialize */
             if (GPU_SHADER_PRINT_LOG)
             {
+                if (prints_count > 0)
+                {
+                    LogDebugF(
+                        "Forwarding logs collected from GPU - Resident prints: %F, Total attempted prints: %F, Total attempted bytes: %F",
+                        FmtUint(prints_count),
+                        FmtUint(prints_count + overflows_count),
+                        FmtUint(attempted_print_bytes_count)
+                    );
+                }
+
                 /* FIXME: Remove this */
                 TempArena scratch = BeginScratchNoConflict();
                 u8 *at = start;
@@ -2888,10 +2898,12 @@ void G_D12_CollectionWorkerEntry(WaveLaneCtx *lane)
                 {
                     u32 chars_count = 0;
                     u32 args_count = 0;
+                    b32 internal_overflow = 0; /* Set when the shader flagged this print as not fitting its temporary buffer */
                     {
                         u32 header = *(u32 *)at;
-                        chars_count = (header & 0x0000FFFF) >> 0;
-                        args_count = (header & 0xFFFF0000) >> 16;
+                        chars_count = (header & 0x0000FFFF) >> 0; /* Bits 0-15 */
+                        args_count = (header & 0x7FFF0000) >> 16; /* Bits 16-30 */
+                        internal_overflow = (header & 0x80000000) >> 31; /* Bit 31 only, so the mask does not overlap args_count */
                         at += 4;
                     }
 
@@ -2940,11 +2952,16 @@ void G_D12_CollectionWorkerEntry(WaveLaneCtx *lane)
                         }
                     }
 
-                    String final_str = FormatString(scratch.arena, fmt, args);
-                    if (GPU_SHADER_PRINT_LOG)
+                    String final_str = ZI;
+                    if (internal_overflow)
                     {
-                        LogDebug(final_str);
+                        final_str = Lit("[Shader PrintF is too large]");
                     }
+                    else
+                    {
+                        final_str = FormatString(scratch.arena, fmt, args);
+                    }
+                    LogDebug(final_str);
 
                     at = (u8 *)AlignU64((u64)at, 4);
                 }
diff --git a/src/gpu/gpu_shader_core.cgh b/src/gpu/gpu_shader_core.cgh
index 125828f3..81de7a12 100644
--- a/src/gpu/gpu_shader_core.cgh
+++ b/src/gpu/gpu_shader_core.cgh
@@ -107,7 +107,7 @@ G_ForceDeclConstant(G_RWByteAddressBufferRef, G_ShaderConst_PrintBufferRef, 8)
 ////////////////////////////////////////////////////////////
 //~ Debug printf
 
-/* This technique is based on MJP's article: https://therealmjp.github.io/posts/hlsl-printf/ */
+/* This technique is based on MJP's article - https://therealmjp.github.io/posts/hlsl-printf/ */
 
 Enum(G_FmtArgKind)
 {
@@ -115,6 +115,8 @@ Enum(G_FmtArgKind)
     G_FmtArgKind_U32,
     G_FmtArgKind_I32,
     G_FmtArgKind_F32,
+
+    G_FmtArgKind_End, /* Terminator appended by G_PrintF; excluded from args_count and never serialized */
 };
 
 Struct(G_FmtArg)
@@ -124,41 +126,56 @@ Struct(G_FmtArg)
 };
 
 #if IsLanguageG && GPU_SHADER_PRINT
-    G_FmtArg G_Fmt(u32 v) { G_FmtArg result; result.kind = G_FmtArgKind_U32; result.v = v; return result; }
-    G_FmtArg G_Fmt(i32 v) { G_FmtArg result; result.kind = G_FmtArgKind_I32; result.v = v; return result; }
-    G_FmtArg G_Fmt(f32 v) { G_FmtArg result; result.kind = G_FmtArgKind_F32; result.v = asuint(v); return result; }
+    G_FmtArg G_Fmt(u32 v) { G_FmtArg result; result.kind = G_FmtArgKind_U32; result.v = v; return result; }
+    G_FmtArg G_Fmt(i32 v) { G_FmtArg result; result.kind = G_FmtArgKind_I32; result.v = v; return result; }
+    G_FmtArg G_Fmt(f32 v) { G_FmtArg result; result.kind = G_FmtArgKind_F32; result.v = asuint(v); return result; }
+    G_FmtArg G_FmtEnd(void) { G_FmtArg result; result.kind = G_FmtArgKind_End; result.v = 0; return result; }
 
     Struct(G_TempPrintBuffer)
     {
-        u32 char_chunks[256];
-        u32 char_pos;
-        u32 fmt_size;
+        /* NOTE: The larger the array size, the longer the compilation time */
+        u32 byte_chunks[64];
+        u32 bytes_count; /* Bytes pushed so far (format chars followed by serialized args) */
+        u32 chars_count; /* Bytes that belong to the format string */
         u32 args_count;
+        b32 overflowed; /* Set once a pushed byte no longer fits in byte_chunks */
     };
 
-    void G_PushPrintChar(inout G_TempPrintBuffer buff, u32 v)
+    void G_PushPrintByte(inout G_TempPrintBuffer buff, u32 v)
     {
-        /* TODO: Overflow check */
-        u32 u32_arr_pos = buff.char_pos / 4;
-        u32 idx_in_u32 = buff.char_pos & 0x03;
-        if (idx_in_u32 == 0)
+        u32 chunk_idx = buff.bytes_count / 4;
+        if (chunk_idx < countof(buff.byte_chunks))
         {
-            /* Since buff is not zero initialized, we set the chunk on first write here */
-            buff.char_chunks[u32_arr_pos] = v & 0xFF;
+            u32 byte_idx_in_chunk = buff.bytes_count & 0x03;
+            if (byte_idx_in_chunk == 0)
+            {
+                /* Since buff is not zero initialized, we set the chunk on first write here */
+                buff.byte_chunks[chunk_idx] = v & 0xFF;
+            }
+            else
+            {
+                buff.byte_chunks[chunk_idx] |= (v & 0xFF) << (byte_idx_in_chunk * 8);
+            }
+            buff.bytes_count += 1;
         }
         else
         {
-            buff.char_chunks[u32_arr_pos] |= (v & 0xFF) << (idx_in_u32 * 8);
+            buff.overflowed = 1;
         }
-        buff.char_pos += 1;
     }
 
 
     void G_CommitPrint(G_TempPrintBuffer buff)
     {
         RWByteAddressBuffer rw = G_Dereference(G_ShaderConst_PrintBufferRef);
 
-        u32 chunks_count = (buff.char_pos + 3) / 4;
+        if (buff.overflowed)
+        {
+            buff.bytes_count = 0; /* Drop the partial payload; only the header carrying the overflow bit is written */
+            buff.chars_count = 0;
+            buff.args_count = 0;
+        }
+        u32 chunks_count = (buff.bytes_count + 3) / 4;
         u32 alloc_size = 0;
         alloc_size += 4; /* Header */
         alloc_size += chunks_count * 4; /* Chunks */
@@ -166,23 +183,22 @@ Struct(G_FmtArg)
         /* Atomic fetch + add to base counter */
         u32 base;
         rw.InterlockedAdd(0, alloc_size, base);
-
         base += 4; /* Offset for allocation counter */
         base += 4; /* Offset for success counter */
         base += 4; /* Offset for overflow counter */
 
         if ((base + alloc_size) < countof(rw))
         {
-            u32 pos = 0;
-
             /* Increment success counter */
             rw.InterlockedAdd(4, 1);
 
+            u32 pos = 0;
             /* Write header */
             {
                 u32 header = 0;
-                header |= (buff.fmt_size << 0) & 0x0000FFFF;
-                header |= (buff.args_count << 16) & 0xFFFF0000;
+                header |= (buff.chars_count << 0) & 0x0000FFFF; /* Bits 0-15 */
+                header |= (buff.args_count << 16) & 0x7FFF0000; /* Bits 16-30 */
+                header |= (buff.overflowed << 31) & 0x80000000; /* Bit 31 only, so the mask does not overlap args_count */
                 rw.Store(base + pos, header);
                 pos += 4;
             }
@@ -190,7 +206,7 @@ Struct(G_FmtArg)
             /* Write chunks */
             for (u32 chunk_idx = 0; chunk_idx < chunks_count; ++chunk_idx)
             {
-                u32 chunk = buff.char_chunks[chunk_idx];
+                u32 chunk = buff.byte_chunks[chunk_idx];
                 rw.Store(base + pos, chunk);
                 pos += 4;
             }
@@ -202,43 +218,32 @@ Struct(G_FmtArg)
         }
     }
 
-    #define G_Print(fmt) do { \
-        G_TempPrintBuffer __tmp; \
-        __tmp.char_pos = 0; \
-        u32 __pos = 0; \
-        while (U32FromChar(fmt[__pos]) != 0) \
-        { \
-            G_PushPrintChar(__tmp, U32FromChar(fmt[__pos])); \
-            ++__pos; \
-        } \
-        __tmp.fmt_size = __tmp.char_pos; \
-        G_CommitPrint(__tmp); \
+    #define G_PrintF_(fmt, ...) do { \
+        G_TempPrintBuffer __tmp; \
+        __tmp.bytes_count = 0; \
+        __tmp.overflowed = 0; \
+        u32 __char_idx = 0; \
+        while (U32FromChar(fmt[__char_idx]) != 0) \
+        { \
+            G_PushPrintByte(__tmp, U32FromChar(fmt[__char_idx])); \
+            ++__char_idx; \
+        } \
+        G_FmtArg __args[] = { __VA_ARGS__ }; \
+        __tmp.chars_count = __tmp.bytes_count; \
+        __tmp.args_count = (countof(__args) - 1); \
+        for (u32 __arg_idx = 0; __arg_idx < __tmp.args_count; ++__arg_idx) \
+        { \
+            G_PushPrintByte(__tmp, __args[__arg_idx].kind); \
+            G_PushPrintByte(__tmp, __args[__arg_idx].v >> 0); \
+            G_PushPrintByte(__tmp, __args[__arg_idx].v >> 8); \
+            G_PushPrintByte(__tmp, __args[__arg_idx].v >> 16); \
+            G_PushPrintByte(__tmp, __args[__arg_idx].v >> 24); \
+        } \
+        G_CommitPrint(__tmp); \
     } while (0)
 
-    #define G_PrintF(fmt, ...) do { \
-        G_TempPrintBuffer __tmp; \
-        __tmp.char_pos = 0; \
-        u32 __pos = 0; \
-        while (U32FromChar(fmt[__pos]) != 0) \
-        { \
-            G_PushPrintChar(__tmp, U32FromChar(fmt[__pos])); \
-            ++__pos; \
-        } \
-        G_FmtArg __args[] = { __VA_ARGS__ }; \
-        __tmp.fmt_size = __tmp.char_pos; \
-        __tmp.args_count = countof(__args); \
-        for (u32 __arg_idx = 0; __arg_idx < countof(__args); ++__arg_idx) \
-        { \
-            G_PushPrintChar(__tmp, __args[__arg_idx].kind); \
-            G_PushPrintChar(__tmp, __args[__arg_idx].v >> 0); \
-            G_PushPrintChar(__tmp, __args[__arg_idx].v >> 8); \
-            G_PushPrintChar(__tmp, __args[__arg_idx].v >> 16); \
-            G_PushPrintChar(__tmp, __args[__arg_idx].v >> 24); \
-        } \
-        G_CommitPrint(__tmp); \
-    } while (0)
+    #define G_PrintF(fmt, ...) G_PrintF_(fmt, ##__VA_ARGS__, G_FmtEnd())
 #else
-    #define G_Print(fmt)
-    #define G_PrintF(fmt)
+    #define G_PrintF(fmt, ...) /* Variadic so calls that pass format args still preprocess when shader printing is disabled */
 #endif
 
diff --git a/src/ui/ui_shaders.g b/src/ui/ui_shaders.g
index 3099f02b..4c60fbc1 100644
--- a/src/ui/ui_shaders.g
+++ b/src/ui/ui_shaders.g
@@ -145,10 +145,13 @@ PixelShader(UI_BlitPS, UI_BlitPSOutput, UI_BlitPSInput input)
     Vec2 uv = input.src_uv;
     Vec4 result = tex.Sample(sampler, uv);
 
-    // G_Print("Hello there!");
-    G_PrintF("Bla: (%F, %F)", G_Fmt(uv.x), G_Fmt(uv.y));
-
-
+    G_PrintF(
+        "Hello there! (%F, %F), (%F, %F)",
+        G_Fmt(input.SV_Position.x),
+        G_Fmt(input.SV_Position.y),
+        G_Fmt(uv.x),
+        G_Fmt(uv.y)
+    );
 
     UI_BlitPSOutput output;
     output.SV_Target0 = result;