shader printf arg parsing on cpu

2025-12-10 20:21:08 -06:00 · 2025-12-10 20:21:08 -06:00 · bc76a511e6
commit bc76a511e6
parent f911e98c98
5 changed files with 193 additions and 38 deletions
--- a/src/base/base_shader.gh
+++ b/src/base/base_shader.gh
@ -24,7 +24,11 @@ typedef float4x4 Mat4x4;
 ////////////////////////////////////////////////////////////
 //~ Countof

-template<typename T, u32 N> u32         countof(T arr[N])                   { return N; }
+template<typename T, u32 N>
+u32 countof(T arr[N])
+{
+    return N;
+}

 ////////////////////////////////////////////////////////////
 //~ Color helpers
--- a/src/config.h
+++ b/src/config.h
@ -73,6 +73,8 @@
 #define GPU_DEBUG_VALIDATION 1

 #define GPU_SHADER_PRINT 1
+#define GPU_SHADER_PRINT_BUFFER_SIZE Kibi(1) /* No trailing ';' so the macro can be used inside expressions */
+#define GPU_SHADER_PRINT_LOG 1

 /* If enabled, bitbuffs will insert/verify magic numbers & length for each read & write */
 #define BITBUFF_DEBUG 0
--- a/src/gpu/gpu_dx12/gpu_dx12_core.c
+++ b/src/gpu/gpu_dx12/gpu_dx12_core.c
@ -300,7 +300,7 @@ void G_Bootstrap(void)
                if (kind != G_QueueKind_AsyncCopy)
                {
                    G_ArenaHandle gpu_perm = G_PermArena();
-                    queue->print_buffer_size = Mebi(64);
+                    queue->print_buffer_size = GPU_SHADER_PRINT_BUFFER_SIZE;
                    queue->print_buffer = G_PushBuffer(
                        gpu_perm,
                        u8,
@ -2840,7 +2840,6 @@ void G_D12_CollectionWorkerEntry(WaveLaneCtx *lane)
        .flags = G_ResourceFlag_HostMemory
    );

-    u32 zero = 0;
    for (;;)
    {
        /* FIXME: Remove this */
@ -2851,20 +2850,96 @@ void G_D12_CollectionWorkerEntry(WaveLaneCtx *lane)
        {
            /* Copy print buffer to readback buffer */
            G_CopyBufferToBuffer(cl, readback_buff, 0, queue->print_buffer, RNGU64(0, queue->print_buffer_size));
-            /* Reset size to 0 */
+            /* Reset counters to 0 */
            G_MemorySync(cl, queue->print_buffer,
                G_Stage_Copy,           G_Access_CopyRead,
                G_Stage_Copy,           G_Access_CopyWrite
            );
-            G_CopyCpuToBuffer(cl, queue->print_buffer, 0, &zero, RNGU64(0, 4));
+            u8 zero[12] = ZI;
+            G_CopyCpuToBuffer(cl, queue->print_buffer, 0, zero, RNGU64(0, sizeof(zero)));
        }
-        i64 completion = G_CommitCommandList(cl);
+        G_CommitCommandList(cl);

        G_SyncCpu(G_MaskFromQueue(queue_kind));
-        u32 size = *G_StructFromResource(readback_buff, u32);
-        u8 *text = G_StructFromResource(readback_buff, u8) + 4;
+        u32 attempted_print_bytes_count = *(G_StructFromResource(readback_buff, u32) + 0);
+        u32 prints_count                = *(G_StructFromResource(readback_buff, u32) + 1);
+        u32 overflows_count             = *(G_StructFromResource(readback_buff, u32) + 2);
+        u8 *start                       = G_StructFromResource(readback_buff, u8) + 12;

-        String s = STRING(size, text);
+        /* Deserialize */
+        if (GPU_SHADER_PRINT_LOG)
+        {
+            /* FIXME: Remove this */
+            TempArena scratch = BeginScratchNoConflict();
+            u8 *at = start;
+            {
+                for (u32 print_num = 1; print_num <= prints_count; ++print_num)
+                {
+                    u32 chars_count = 0;
+                    u32 args_count = 0;
+                    {
+                        u32 header = *(u32 *)at;
+                        chars_count = (header & 0x0000FFFF) >> 0;
+                        args_count  = (header & 0xFFFF0000) >> 16;
+                        at += 4;
+                    }
+
+                    String fmt = ZI;
+                    {
+                        fmt.len = chars_count;
+                        fmt.text = at;
+                        at += chars_count;
+                    }
+
+                    FmtArg *args = 0;
+                    {
+                        if (args_count > 0)
+                        {
+                            args = PushStructs(scratch.arena, FmtArg, args_count);
+                            for (u32 arg_idx = 0; arg_idx < args_count; ++arg_idx) /* '<': args holds exactly args_count entries */
+                            {
+                                G_FmtArgKind gpu_kind = (G_FmtArgKind)(*at); /* 1-byte kind tag written by the shader */
+                                at += 1;
+                                u32 gpu_data = (u32)at[0] | ((u32)at[1] << 8) | ((u32)at[2] << 16) | ((u32)at[3] << 24); /* Not 4-byte aligned here; shader pushes low byte first */
+                                at += 4;
+
+                                FmtArg *dst = &args[arg_idx];
+                                switch (gpu_kind)
+                                {
+                                    case G_FmtArgKind_U32:
+                                    {
+                                        dst->kind = FmtArgKind_Uint;
+                                        dst->value.uint = gpu_data;
+                                    } break;
+                                    case G_FmtArgKind_I32:
+                                    {
+                                        dst->kind = FmtArgKind_Sint;
+                                        dst->value.sint = (i32)gpu_data;
+                                    } break;
+                                    case G_FmtArgKind_F32:
+                                    {
+                                        dst->kind = FmtArgKind_Float;
+                                        dst->value.f = *(f32 *)&gpu_data; /* Reinterpret the raw bits produced by asuint() on the GPU */
+                                    } break;
+                                }
+                            }
+                        }
+                    }
+
+                    // String final_str = ZI;
+                    // if (args_count > 0)
+                    // {
+                    // }
+                    // else
+                    // {
+                    //     final_str = PushString(scratch.arena, fmt);
+                    // }
+
+                    at = (u8 *)AlignU64((u64)at, 4);
+                }
+            }
+            EndScratch(scratch);
+        }

        DEBUGBREAKABLE;
    }
--- a/src/gpu/gpu_shader_core.cgh
+++ b/src/gpu/gpu_shader_core.cgh
@ -109,62 +109,133 @@ G_ForceDeclConstant(G_RWByteAddressBufferRef, G_ShaderConst_PrintBufferRef,   8)

 /* This technique is based on MJP's article: https://therealmjp.github.io/posts/hlsl-printf/ */

+Enum(G_FmtArgKind)
+{
+    G_FmtArgKind_None,
+    G_FmtArgKind_U32,
+    G_FmtArgKind_I32,
+    G_FmtArgKind_F32,
+};
+
+Struct(G_FmtArg)
+{
+    G_FmtArgKind kind;
+    u32 v;
+};
+
 #if IsLanguageG && GPU_SHADER_PRINT
+    G_FmtArg G_Fmt(u32 v) { G_FmtArg result;    result.kind = G_FmtArgKind_U32;    result.v = v;          return result; }
+    G_FmtArg G_Fmt(i32 v) { G_FmtArg result;    result.kind = G_FmtArgKind_I32;    result.v = v;          return result; }
+    G_FmtArg G_Fmt(f32 v) { G_FmtArg result;    result.kind = G_FmtArgKind_F32;    result.v = asuint(v);  return result; }
+
    Struct(G_TempPrintBuffer)
    {
-        u32 data_u32[256];
-        u32 byte_pos;
+        u32 char_chunks[256];
+        u32 char_pos;
+        u32 fmt_size;
+        u32 args_count;
    };

-    void G_PushPrintChar(inout G_TempPrintBuffer buff, u32 c)
+    void G_PushPrintChar(inout G_TempPrintBuffer buff, u32 v)
    {
        /* TODO: Overflow check */
-        u32 u32_arr_pos = buff.byte_pos / 4;
-        u32 idx_in_u32 = buff.byte_pos & 0x03;
+        u32 u32_arr_pos = buff.char_pos / 4;
+        u32 idx_in_u32 = buff.char_pos & 0x03;
        if (idx_in_u32 == 0)
        {
-            /* Since buff is not zero initialized, we set the byte on first write here */
-            buff.data_u32[u32_arr_pos]  = c & 0xFF;
+            /* Since buff is not zero initialized, we set the chunk on first write here */
+            buff.char_chunks[u32_arr_pos] = v & 0xFF;
        }
        else
        {
-            buff.data_u32[u32_arr_pos] |= (c & 0xFF) << (idx_in_u32 * 8);
+            buff.char_chunks[u32_arr_pos] |= (v & 0xFF) << (idx_in_u32 * 8);
        }
-        buff.byte_pos += 1;
+        buff.char_pos += 1;
    }

    void G_CommitPrint(G_TempPrintBuffer buff)
    {
        RWByteAddressBuffer rw = G_Dereference(G_ShaderConst_PrintBufferRef);
-        u32 u32s_count = (buff.byte_pos + 3) / 4;
-        u32 alloc_size = u32s_count * 4;
+
+        u32 chunks_count = (buff.char_pos + 3) / 4;
+
+        u32 alloc_size = 0;
+        alloc_size += 4;                /* Header */
+        alloc_size += chunks_count * 4; /* Chunks */

        u32 base;
-        rw.InterlockedAdd(0, alloc_size, base);
-        base += 4;  /* Account for counter at beginning of buff */
+        rw.InterlockedAdd(0, alloc_size, base);  /* Reserve alloc_size bytes; base receives the previous counter value */

-        if ((base + alloc_size) < countof(rw))
+        u32 pos = base;  /* Absolute write offset: reserved base plus the three 4-byte counters below */
+        pos += 4;  /* Offset for base counter */
+        pos += 4;  /* Offset for success counter */
+        pos += 4;  /* Offset for overflow counter */
+
+        if ((pos + alloc_size) < countof(rw))
        {
-            for (u32 u32_idx = 0; u32_idx < u32s_count; ++u32_idx)
+            /* Increment success counter */
+            rw.InterlockedAdd(4, 1);
+
+            /* Store header: low 16 bits = fmt char count, high 16 bits = args count */
            {
-                u32 data = buff.data_u32[u32_idx];
-                rw.Store(base + (u32_idx * 4), data);
+                u32 header = 0;
+                header |= (buff.fmt_size   <<  0) & 0x0000FFFF;
+                header |= (buff.args_count << 16) & 0xFFFF0000;
+                rw.Store(pos, header);  /* pos already includes base; adding base again would misplace the write */
+                pos += 4;
            }
+            /* Store chunks (packed fmt chars followed by any serialized args) */
+            for (u32 chunk_idx = 0; chunk_idx < chunks_count; ++chunk_idx)
+            {
+                u32 chunk = buff.char_chunks[chunk_idx];
+                rw.Store(pos, chunk);  /* pos already includes base */
+                pos += 4;
+            }
+        }
+        else
+        {
+            /* Increment overflow counter */
+            rw.InterlockedAdd(8, 1);
        }
    }

-    #define G_Print(fmt) do {                                   \
-        G_TempPrintBuffer __tmp;                                \
-        __tmp.byte_pos = 0;                                     \
-        u32 __pos = 0;                                          \
-        while (U32FromChar(fmt[__pos]) != 0)                    \
-        {                                                       \
-            G_PushPrintChar(__tmp, U32FromChar(fmt[__pos]));    \
-            ++__pos;                                            \
-        }                                                       \
-        G_PushPrintChar(__tmp, 0);                              \
-        G_CommitPrint(__tmp);                                   \
+    #define G_Print(fmt) do {                                                     \
+        G_TempPrintBuffer __tmp;                                                  \
+        __tmp.char_pos = 0;                                                       \
+        u32 __pos = 0;                                                            \
+        while (U32FromChar(fmt[__pos]) != 0)                                      \
+        {                                                                         \
+            G_PushPrintChar(__tmp, U32FromChar(fmt[__pos]));                      \
+            ++__pos;                                                              \
+        }                                                                         \
+        __tmp.fmt_size = __tmp.char_pos; __tmp.args_count = 0; /* no args */      \
+        G_CommitPrint(__tmp);                                                     \
    } while (0)
+
+    #define G_PrintF(fmt, ...) do {                                               \
+        G_TempPrintBuffer __tmp;                                                  \
+        __tmp.char_pos = 0;                                                       \
+        u32 __pos = 0;                                                            \
+        while (U32FromChar(fmt[__pos]) != 0)                                      \
+        {                                                                         \
+            G_PushPrintChar(__tmp, U32FromChar(fmt[__pos]));                      \
+            ++__pos;                                                              \
+        }                                                                         \
+        G_FmtArg __args[] = { __VA_ARGS__ };                                      \
+        __tmp.fmt_size = __tmp.char_pos;                                          \
+        __tmp.args_count = countof(__args);                                       \
+        for (u32 __arg_idx = 0; __arg_idx < countof(__args); ++__arg_idx)         \
+        {                                                                         \
+            G_PushPrintChar(__tmp, __args[__arg_idx].kind);                       \
+            G_PushPrintChar(__tmp, __args[__arg_idx].v >>  0);                    \
+            G_PushPrintChar(__tmp, __args[__arg_idx].v >>  8);                    \
+            G_PushPrintChar(__tmp, __args[__arg_idx].v >> 16);                    \
+            G_PushPrintChar(__tmp, __args[__arg_idx].v >> 24);                    \
+        }                                                                         \
+        G_CommitPrint(__tmp);                                                     \
+    } while (0)
+
 #else
    #define G_Print(fmt)
+    #define G_PrintF(fmt, ...) /* Variadic like the enabled version so call sites with args still compile */
 #endif
--- a/src/ui/ui_shaders.g
+++ b/src/ui/ui_shaders.g
@ -145,7 +145,10 @@ PixelShader(UI_BlitPS, UI_BlitPSOutput, UI_BlitPSInput input)
    Vec2 uv = input.src_uv;
    Vec4 result = tex.Sample(sampler, uv);

-    G_Print("Hello there!");
+    // G_Print("Hello there!");
+    G_PrintF("Hello there: \"%F\"", G_Fmt(3.123));
+
+

    UI_BlitPSOutput output;
    output.SV_Target0 = result;