check for overflow in shader printf

2025-12-11 10:24:30 -06:00 · 2025-12-11 10:24:30 -06:00 · 4453d18d51
commit 4453d18d51
parent 84fbaaf7cd
4 changed files with 95 additions and 70 deletions
--- a/src/config.h
+++ b/src/config.h
@ -70,7 +70,7 @@
 #define FLOOD_DEBUG 0
 #define GPU_DEBUG 1
-#define GPU_DEBUG_VALIDATION 1
+#define GPU_DEBUG_VALIDATION 0
 #define GPU_SHADER_PRINT 1
 /* No trailing ';' here: the macro must stay usable inside expressions and argument lists */
 #define GPU_SHADER_PRINT_BUFFER_SIZE Kibi(1)
--- a/src/gpu/gpu_dx12/gpu_dx12_core.c
+++ b/src/gpu/gpu_dx12/gpu_dx12_core.c
@ -2872,14 +2872,24 @@ void G_D12_CollectionWorkerEntry(WaveLaneCtx *lane)
        G_SyncCpu(G_MaskFromQueue(queue_kind));
-        u32 attempted_print_bytes_count = *(G_StructFromResource(readback_buff, u32) + 0);
+        u32 attempted_print_bytes_count = *(G_StructFromResource(readback_buff, u32) + 0);  /* The number of bytes shaders attempted to write */
-        u32 prints_count                = *(G_StructFromResource(readback_buff, u32) + 1);
+        u32 prints_count                = *(G_StructFromResource(readback_buff, u32) + 1);  /* The number of shader prints that are in the buffer */
-        u32 overflows_count             = *(G_StructFromResource(readback_buff, u32) + 2);
+        u32 overflows_count             = *(G_StructFromResource(readback_buff, u32) + 2);  /* The number of shader prints that could not fit in the buffer */
        u8 *start                       = G_StructFromResource(readback_buff, u8) + 12;
        /* Deserialize */
        if (GPU_SHADER_PRINT_LOG)
        {
            if (prints_count > 0)
            {
                LogDebugF(
                    "Forwarding logs collected from GPU - Resident prints: %F, Total attempted prints: %F, Total attempted bytes: %F",
                    FmtUint(prints_count),
                    FmtUint(prints_count + overflows_count),
                    FmtUint(attempted_print_bytes_count)
                );
            }
            /* FIXME: Remove this */
            TempArena scratch = BeginScratchNoConflict();
            u8 *at = start;
@ -2888,10 +2898,12 @@ void G_D12_CollectionWorkerEntry(WaveLaneCtx *lane)
                {
                    u32 chars_count = 0;
                    u32 args_count = 0;
                    b32 internal_overflow = 0;
                    {
                        u32 header = *(u32 *)at;
-                        chars_count = (header & 0x0000FFFF) >> 0;
+                        chars_count         = (header & 0x0000FFFF) >> 0;
-                        args_count  = (header & 0xFFFF0000) >> 16;
+                        args_count          = (header & 0x7FFF0000) >> 16;
                        internal_overflow   = (header & 0xF0000000) >> 31;
                        at += 4;
                    }
@ -2940,11 +2952,16 @@ void G_D12_CollectionWorkerEntry(WaveLaneCtx *lane)
                        }
                    }
-                    String final_str = FormatString(scratch.arena, fmt, args);
+                    String final_str = ZI;
-                    if (GPU_SHADER_PRINT_LOG)
+                    if (internal_overflow)
                    {
-                        LogDebug(final_str);
+                        final_str = Lit("[Shader PrintF is too large]");
                    }
                    else
                    {
                        final_str = FormatString(scratch.arena, fmt, args);
                    }
                    LogDebug(final_str);
                    at = (u8 *)AlignU64((u64)at, 4);
                }
--- a/src/gpu/gpu_shader_core.cgh
+++ b/src/gpu/gpu_shader_core.cgh
@ -107,7 +107,7 @@ G_ForceDeclConstant(G_RWByteAddressBufferRef, G_ShaderConst_PrintBufferRef,   8)
 ////////////////////////////////////////////////////////////
 //~ Debug printf
-/* This technique is based on MJP's article: https://therealmjp.github.io/posts/hlsl-printf/ */
+/* This technique is based on MJP's article - https://therealmjp.github.io/posts/hlsl-printf/ */
 Enum(G_FmtArgKind)
 {
@ -115,6 +115,8 @@ Enum(G_FmtArgKind)
    G_FmtArgKind_U32,
    G_FmtArgKind_I32,
    G_FmtArgKind_F32,
    G_FmtArgKind_End,
 };
 Struct(G_FmtArg)
@ -124,41 +126,56 @@ Struct(G_FmtArg)
 };
 #if IsLanguageG && GPU_SHADER_PRINT
-    G_FmtArg G_Fmt(u32 v) { G_FmtArg result;    result.kind = G_FmtArgKind_U32;    result.v = v;          return result; }
+    G_FmtArg G_Fmt(u32 v)       { G_FmtArg result;    result.kind = G_FmtArgKind_U32;    result.v = v;          return result; }
-    G_FmtArg G_Fmt(i32 v) { G_FmtArg result;    result.kind = G_FmtArgKind_I32;    result.v = v;          return result; }
+    G_FmtArg G_Fmt(i32 v)       { G_FmtArg result;    result.kind = G_FmtArgKind_I32;    result.v = v;          return result; }
-    G_FmtArg G_Fmt(f32 v) { G_FmtArg result;    result.kind = G_FmtArgKind_F32;    result.v = asuint(v);  return result; }
+    G_FmtArg G_Fmt(f32 v)       { G_FmtArg result;    result.kind = G_FmtArgKind_F32;    result.v = asuint(v);  return result; }
    G_FmtArg G_FmtEnd(void)     { G_FmtArg result;    result.kind = G_FmtArgKind_End;    result.v = 0;          return result; }
    Struct(G_TempPrintBuffer)
    {
-        u32 char_chunks[256];
+        /* NOTE: The larger the array size, the longer the compilation time */
-        u32 char_pos;
+        u32 byte_chunks[64];
-        u32 fmt_size;
+        u32 bytes_count;
        u32 chars_count;
        u32 args_count;
        b32 overflowed;
    };
-    void G_PushPrintChar(inout G_TempPrintBuffer buff, u32 v)
+    void G_PushPrintByte(inout G_TempPrintBuffer buff, u32 v)
    {
-        /* TODO: Overflow check */
+        u32 chunk_idx = buff.bytes_count / 4;
-        u32 u32_arr_pos = buff.char_pos / 4;
+        if (chunk_idx < countof(buff.byte_chunks))
        u32 idx_in_u32 = buff.char_pos & 0x03;
        if (idx_in_u32 == 0)
        {
-            /* Since buff is not zero initialized, we set the chunk on first write here */
+            u32 byte_idx_in_chunk = buff.bytes_count & 0x03;
-            buff.char_chunks[u32_arr_pos] = v & 0xFF;
+            if (byte_idx_in_chunk == 0)
            {
                /* Since buff is not zero initialized, we set the chunk on first write here */
                buff.byte_chunks[chunk_idx] = v & 0xFF;
            }
            else
            {
                buff.byte_chunks[chunk_idx] |= (v & 0xFF) << (byte_idx_in_chunk * 8);
            }
            buff.bytes_count += 1;
        }
        else
        {
-            buff.char_chunks[u32_arr_pos] |= (v & 0xFF) << (idx_in_u32 * 8);
+            buff.overflowed = 1;
        }
        buff.char_pos += 1;
    }
    void G_CommitPrint(G_TempPrintBuffer buff)
    {
        RWByteAddressBuffer rw = G_Dereference(G_ShaderConst_PrintBufferRef);
-        u32 chunks_count = (buff.char_pos + 3) / 4;
+        if (buff.overflowed)
        {
            buff.bytes_count = 0;
            buff.chars_count = 0;
            buff.args_count = 0;
        }
        u32 chunks_count = (buff.bytes_count + 3) / 4;
        u32 alloc_size = 0;
        alloc_size += 4;                 /* Header */
        alloc_size += chunks_count * 4;  /* Chunks */
@ -166,23 +183,22 @@ Struct(G_FmtArg)
        /* Atomic fetch + add to base counter */
        u32 base;
        rw.InterlockedAdd(0, alloc_size, base);
        base += 4;  /* Offset for allocation counter */
        base += 4;  /* Offset for success counter */
        base += 4;  /* Offset for overflow counter */
        if ((base + alloc_size) < countof(rw))
        {
            u32 pos = 0;
            /* Increment success counter */
            rw.InterlockedAdd(4, 1);
            u32 pos = 0;
            /* Write header */
            {
                u32 header = 0;
-                header |= (buff.fmt_size   <<  0) & 0x0000FFFF;
+                header |= (buff.chars_count <<  0) & 0x0000FFFF;
-                header |= (buff.args_count << 16) & 0xFFFF0000;
+                header |= (buff.args_count  << 16) & 0x7FFF0000;
                header |= (buff.overflowed  << 31) & 0xF0000000;
                rw.Store(base + pos, header);
                pos += 4;
            }
@ -190,7 +206,7 @@ Struct(G_FmtArg)
            /* Write chunks */
            for (u32 chunk_idx = 0; chunk_idx < chunks_count; ++chunk_idx)
            {
-                u32 chunk = buff.char_chunks[chunk_idx];
+                u32 chunk = buff.byte_chunks[chunk_idx];
                rw.Store(base + pos, chunk);
                pos += 4;
            }
@ -202,43 +218,32 @@ Struct(G_FmtArg)
        }
    }
-    #define G_Print(fmt) do {                                                     \
+    #define G_PrintF_(fmt, ...) do {                                        \
-        G_TempPrintBuffer __tmp;                                                  \
+        G_TempPrintBuffer __tmp;                                            \
-        __tmp.char_pos = 0;                                                       \
+        __tmp.bytes_count = 0;                                              \
-        u32 __pos = 0;                                                            \
+        __tmp.overflowed = 0;                                               \
-        while (U32FromChar(fmt[__pos]) != 0)                                      \
+        u32 __char_idx = 0;                                                 \
-        {                                                                         \
+        while (U32FromChar(fmt[__char_idx]) != 0)                           \
-            G_PushPrintChar(__tmp, U32FromChar(fmt[__pos]));                      \
+        {                                                                   \
-            ++__pos;                                                              \
+            G_PushPrintByte(__tmp, U32FromChar(fmt[__char_idx]));           \
-        }                                                                         \
+            ++__char_idx;                                                   \
-        __tmp.fmt_size = __tmp.char_pos;                                          \
+        }                                                                   \
-        G_CommitPrint(__tmp);                                                     \
+        G_FmtArg __args[] = { __VA_ARGS__ };                                \
        __tmp.chars_count = __tmp.bytes_count;                              \
        __tmp.args_count = (countof(__args) - 1);                           \
        for (u32 __arg_idx = 0; __arg_idx < __tmp.args_count; ++__arg_idx)  \
        {                                                                   \
            G_PushPrintByte(__tmp, __args[__arg_idx].kind);                 \
            G_PushPrintByte(__tmp, __args[__arg_idx].v >>  0);              \
            G_PushPrintByte(__tmp, __args[__arg_idx].v >>  8);              \
            G_PushPrintByte(__tmp, __args[__arg_idx].v >> 16);              \
            G_PushPrintByte(__tmp, __args[__arg_idx].v >> 24);              \
        }                                                                   \
        G_CommitPrint(__tmp);                                               \
    } while (0)
-    #define G_PrintF(fmt, ...) do {                                               \
+    #define G_PrintF(fmt, ...) G_PrintF_(fmt, ##__VA_ARGS__, G_FmtEnd())
        G_TempPrintBuffer __tmp;                                                  \
        __tmp.char_pos = 0;                                                       \
        u32 __pos = 0;                                                            \
        while (U32FromChar(fmt[__pos]) != 0)                                      \
        {                                                                         \
            G_PushPrintChar(__tmp, U32FromChar(fmt[__pos]));                      \
            ++__pos;                                                              \
        }                                                                         \
        G_FmtArg __args[] = { __VA_ARGS__ };                                      \
        __tmp.fmt_size = __tmp.char_pos;                                          \
        __tmp.args_count = countof(__args);                                       \
        for (u32 __arg_idx = 0; __arg_idx < countof(__args); ++__arg_idx)         \
        {                                                                         \
            G_PushPrintChar(__tmp, __args[__arg_idx].kind);                       \
            G_PushPrintChar(__tmp, __args[__arg_idx].v >>  0);                    \
            G_PushPrintChar(__tmp, __args[__arg_idx].v >>  8);                    \
            G_PushPrintChar(__tmp, __args[__arg_idx].v >> 16);                    \
            G_PushPrintChar(__tmp, __args[__arg_idx].v >> 24);                    \
        }                                                                         \
        G_CommitPrint(__tmp);                                                     \
    } while (0)
 #else
    /*
     * No-op print stubs used when the `IsLanguageG && GPU_SHADER_PRINT` condition is false.
     * G_PrintF is variadic (matching the enabled definition) so call sites that pass
     * G_Fmt(...) arguments still preprocess to nothing instead of failing to compile.
     */
    #define G_Print(fmt)
    #define G_PrintF(fmt, ...)
 #endif
--- a/src/ui/ui_shaders.g
+++ b/src/ui/ui_shaders.g
@ -145,10 +145,13 @@ PixelShader(UI_BlitPS, UI_BlitPSOutput, UI_BlitPSInput input)
    Vec2 uv = input.src_uv;
    Vec4 result = tex.Sample(sampler, uv);
-    // G_Print("Hello there!");
+    G_PrintF(
-    G_PrintF("Bla: (%F, %F)", G_Fmt(uv.x), G_Fmt(uv.y));
+        "Hello there! (%F, %F), (%F, %F)",
-
+        G_Fmt(input.SV_Position.x),
-
+        G_Fmt(input.SV_Position.y),
        G_Fmt(uv.x),
        G_Fmt(uv.y)
    );
    UI_BlitPSOutput output;
    output.SV_Target0 = result;