From bc76a511e6e271def37023aa148e8f1a0f4c2bba Mon Sep 17 00:00:00 2001
From: jacob <jacob@cagori.com>
Date: Wed, 10 Dec 2025 20:21:08 -0600
Subject: [PATCH] shader printf arg parsing on cpu

---
 src/base/base_shader.gh          |   6 +-
 src/config.h                     |   2 +
 src/gpu/gpu_dx12/gpu_dx12_core.c |  91 ++++++++++++++++++++--
 src/gpu/gpu_shader_core.cgh      | 127 ++++++++++++++++++++++++-------
 src/ui/ui_shaders.g              |   5 +-
 5 files changed, 193 insertions(+), 38 deletions(-)
diff --git a/src/base/base_shader.gh b/src/base/base_shader.gh
index 75ac7674..a6710a15 100644
--- a/src/base/base_shader.gh
+++ b/src/base/base_shader.gh
@@ -24,7 +24,11 @@ typedef float4x4 Mat4x4;
 ////////////////////////////////////////////////////////////
 //~ Countof
 
-template<typename T, u32 N> u32         countof(T arr[N])                   { return N; }
+template<typename T, u32 N>  /* N is the array length taken from the parameter's array type */
+u32 countof(T arr[N])        /* Returns the element count of a fixed-size array */
+{
+    return N;
+}
 
 ////////////////////////////////////////////////////////////
 //~ Color helpers
diff --git a/src/config.h b/src/config.h
index d0951d8e..a0720b78 100644
--- a/src/config.h
+++ b/src/config.h
@@ -73,6 +73,8 @@
 #define GPU_DEBUG_VALIDATION 1
 
 #define GPU_SHADER_PRINT 1
+#define GPU_SHADER_PRINT_BUFFER_SIZE Kibi(1) /* Size in bytes of each queue's shader print buffer */
+#define GPU_SHADER_PRINT_LOG 1
 
 /* If enabled, bitbuffs will insert/verify magic numbers & length for each read & write */
 #define BITBUFF_DEBUG 0
diff --git a/src/gpu/gpu_dx12/gpu_dx12_core.c b/src/gpu/gpu_dx12/gpu_dx12_core.c
index b03a248a..ef073019 100644
--- a/src/gpu/gpu_dx12/gpu_dx12_core.c
+++ b/src/gpu/gpu_dx12/gpu_dx12_core.c
@@ -300,7 +300,7 @@ void G_Bootstrap(void)
                 if (kind != G_QueueKind_AsyncCopy)
                 {
                     G_ArenaHandle gpu_perm = G_PermArena();
-                    queue->print_buffer_size = Mebi(64);
+                    queue->print_buffer_size = GPU_SHADER_PRINT_BUFFER_SIZE;
                     queue->print_buffer = G_PushBuffer(
                         gpu_perm,
                         u8,
@@ -2840,7 +2840,6 @@ void G_D12_CollectionWorkerEntry(WaveLaneCtx *lane)
         .flags = G_ResourceFlag_HostMemory
     );
 
-    u32 zero = 0;
     for (;;)
     {
         /* FIXME: Remove this */
@@ -2851,20 +2850,96 @@ void G_D12_CollectionWorkerEntry(WaveLaneCtx *lane)
         {
             /* Copy print buffer to readback buffer */
             G_CopyBufferToBuffer(cl, readback_buff, 0, queue->print_buffer, RNGU64(0, queue->print_buffer_size));
-            /* Reset size to 0 */
+            /* Reset counters to 0 */
             G_MemorySync(cl, queue->print_buffer,
                 G_Stage_Copy,           G_Access_CopyRead,
                 G_Stage_Copy,           G_Access_CopyWrite
             );
-            G_CopyCpuToBuffer(cl, queue->print_buffer, 0, &zero, RNGU64(0, 4));
+            u8 zero[12] = ZI;
+            G_CopyCpuToBuffer(cl, queue->print_buffer, 0, zero, RNGU64(0, sizeof(zero)));
         }
-        i64 completion = G_CommitCommandList(cl);
+        G_CommitCommandList(cl);
 
         G_SyncCpu(G_MaskFromQueue(queue_kind));
-        u32 size = *G_StructFromResource(readback_buff, u32);
-        u8 *text = G_StructFromResource(readback_buff, u8) + 4;
+        u32 attempted_print_bytes_count = *(G_StructFromResource(readback_buff, u32) + 0);
+        u32 prints_count                = *(G_StructFromResource(readback_buff, u32) + 1);
+        u32 overflows_count             = *(G_StructFromResource(readback_buff, u32) + 2);
+        u8 *start                       = G_StructFromResource(readback_buff, u8) + 12;
 
-        String s = STRING(size, text);
+        /* Deserialize: walk prints_count records laid out as [u32 header][fmt chars][5-byte args], each record padded to 4 bytes */
+        if (GPU_SHADER_PRINT_LOG)
+        {
+            /* FIXME: Remove this */
+            TempArena scratch = BeginScratchNoConflict();
+            u8 *at = start;
+            {
+                for (u32 print_num = 1; print_num <= prints_count; ++print_num)
+                {
+                    u32 chars_count = 0;
+                    u32 args_count = 0;
+                    {
+                        u32 header = *(u32 *)at;  /* Low 16 bits: format length, high 16 bits: arg count */
+                        chars_count = (header & 0x0000FFFF) >> 0;
+                        args_count  = (header & 0xFFFF0000) >> 16;
+                        at += 4;
+                    }
+
+                    String fmt = ZI;
+                    {
+                        fmt.len = chars_count;
+                        fmt.text = at;  /* Points into the readback data; not copied */
+                        at += chars_count;
+                    }
+
+                    FmtArg *args = 0;
+                    {
+                        if (args_count > 0)
+                        {
+                            args = PushStructs(scratch.arena, FmtArg, args_count);
+                            for (u32 arg_idx = 0; arg_idx < args_count; ++arg_idx)  /* Exactly args_count entries were allocated */
+                            {
+                                /* Each arg is 1 kind byte + 4 value bytes (low byte first); the value is generally not 4-byte aligned */
+                                G_FmtArgKind gpu_kind = (G_FmtArgKind)at[0];
+                                u32 gpu_data = (u32)at[1] | ((u32)at[2] << 8) | ((u32)at[3] << 16) | ((u32)at[4] << 24);
+                                at += 5;
+
+                                FmtArg *dst = &args[arg_idx];
+                                switch (gpu_kind)
+                                {
+                                    case G_FmtArgKind_U32:
+                                    {
+                                        dst->kind = FmtArgKind_Uint;
+                                        dst->value.uint = gpu_data;
+                                    } break;
+                                    case G_FmtArgKind_I32:
+                                    {
+                                        dst->kind = FmtArgKind_Sint;
+                                        dst->value.sint = (i32)gpu_data;
+                                    } break;
+                                    case G_FmtArgKind_F32:
+                                    {
+                                        dst->kind = FmtArgKind_Float;
+                                        dst->value.f = *(f32 *)&gpu_data;  /* Reinterpret the bits as a float */
+                                    } break;
+                                }
+                            }
+                        }
+                    }
+
+                    // String final_str = ZI;
+                    // if (args_count > 0)
+                    // {
+                    // }
+                    // else
+                    // {
+                    //     final_str = PushString(scratch.arena, fmt);
+                    // }
+
+                    at = (u8 *)AlignU64((u64)at, 4);  /* Skip padding up to the next record's header */
+                }
+            }
+            EndScratch(scratch);
+        }
 
         DEBUGBREAKABLE;
     }
diff --git a/src/gpu/gpu_shader_core.cgh b/src/gpu/gpu_shader_core.cgh
index d8bdc9b1..6f9c1f1d 100644
--- a/src/gpu/gpu_shader_core.cgh
+++ b/src/gpu/gpu_shader_core.cgh
@@ -109,62 +109,133 @@ G_ForceDeclConstant(G_RWByteAddressBufferRef, G_ShaderConst_PrintBufferRef,   8)
 
 /* This technique is based on MJP's article: https://therealmjp.github.io/posts/hlsl-printf/ */
 
+Enum(G_FmtArgKind)  /* Type tag for the value carried by a G_FmtArg */
+{
+    G_FmtArgKind_None,
+    G_FmtArgKind_U32,
+    G_FmtArgKind_I32,
+    G_FmtArgKind_F32,
+};
+
+Struct(G_FmtArg)  /* One tagged 32-bit print argument */
+{
+    G_FmtArgKind kind;
+    u32 v;  /* Raw 32 bits of the value; interpret according to kind */
+};
+
 #if IsLanguageG && GPU_SHADER_PRINT
+    G_FmtArg G_Fmt(u32 v) { G_FmtArg arg;    arg.kind = G_FmtArgKind_U32;    arg.v = v;          return arg; }  /* Unsigned: stored unchanged */
+    G_FmtArg G_Fmt(i32 v) { G_FmtArg arg;    arg.kind = G_FmtArgKind_I32;    arg.v = v;          return arg; }  /* Signed: implicitly converted into the u32 field */
+    G_FmtArg G_Fmt(f32 v) { G_FmtArg arg;    arg.kind = G_FmtArgKind_F32;    arg.v = asuint(v);  return arg; }  /* Float: bit pattern via asuint */
+
     Struct(G_TempPrintBuffer)
     {
-        u32 data_u32[256];
-        u32 byte_pos;
+        u32 char_chunks[256];  /* Bytes packed four per u32, lowest byte first */
+        u32 char_pos;          /* Count of bytes pushed so far */
+        u32 fmt_size;          /* Byte length of the format text (captured before any args are pushed) */
+        u32 args_count;        /* Number of args serialized after the format text */
     };
 
-    void G_PushPrintChar(inout G_TempPrintBuffer buff, u32 c)
+    void G_PushPrintChar(inout G_TempPrintBuffer buff, u32 v)  /* Appends the low 8 bits of v to buff */
     {
         /* TODO: Overflow check */
-        u32 u32_arr_pos = buff.byte_pos / 4;
-        u32 idx_in_u32 = buff.byte_pos & 0x03;
+        u32 u32_arr_pos = buff.char_pos / 4;    /* Which u32 chunk receives this byte */
+        u32 idx_in_u32 = buff.char_pos & 0x03;  /* Byte slot (0..3) within that chunk */
         if (idx_in_u32 == 0)
         {
-            /* Since buff is not zero initialized, we set the byte on first write here */
-            buff.data_u32[u32_arr_pos]  = c & 0xFF;
+            /* Since buff is not zero initialized, we set the chunk on first write here */
+            buff.char_chunks[u32_arr_pos] = v & 0xFF;
         }
         else
         {
-            buff.data_u32[u32_arr_pos] |= (c & 0xFF) << (idx_in_u32 * 8);
+            buff.char_chunks[u32_arr_pos] |= (v & 0xFF) << (idx_in_u32 * 8);  /* Slot 0 already cleared the upper bytes */
         }
-        buff.byte_pos += 1;
+        buff.char_pos += 1;
     }
 
     void G_CommitPrint(G_TempPrintBuffer buff)
     {
         RWByteAddressBuffer rw = G_Dereference(G_ShaderConst_PrintBufferRef);
-        u32 u32s_count = (buff.byte_pos + 3) / 4;
-        u32 alloc_size = u32s_count * 4;
+        /* Appends buff to the shared print buffer as one record: [u32 header][packed char chunks] */
+        u32 chunks_count = (buff.char_pos + 3) / 4;  /* Round the byte count up to whole u32s */
+
+        u32 alloc_size = 0;
+        alloc_size += 4;                /* Header */
+        alloc_size += chunks_count * 4; /* Chunks */
 
         u32 base;
-        rw.InterlockedAdd(0, alloc_size, base);
-        base += 4;  /* Account for counter at beginning of buff */
+        rw.InterlockedAdd(0, alloc_size, base);  /* Reserve alloc_size bytes; base receives the counter's previous value */
 
-        if ((base + alloc_size) < countof(rw))
+        u32 pos = base;  /* Absolute byte offset into rw; already includes base */
+        pos += 4;  /* Offset for base counter */
+        pos += 4;  /* Offset for success counter */
+        pos += 4;  /* Offset for overflow counter */
+
+        if ((pos + alloc_size) < countof(rw))
         {
-            for (u32 u32_idx = 0; u32_idx < u32s_count; ++u32_idx)
+            /* Increment success counter */
+            rw.InterlockedAdd(4, 1);
+
+            /* Store header */
             {
-                u32 data = buff.data_u32[u32_idx];
-                rw.Store(base + (u32_idx * 4), data);
+                u32 header = 0;
+                header |= (buff.fmt_size   <<  0) & 0x0000FFFF;
+                header |= (buff.args_count << 16) & 0xFFFF0000;
+                rw.Store(pos, header);  /* pos is absolute, so base must not be added again */
+                pos += 4;
             }
+            /* Store chunks */
+            for (u32 chunk_idx = 0; chunk_idx < chunks_count; ++chunk_idx)
+            {
+                u32 chunk = buff.char_chunks[chunk_idx];
+                rw.Store(pos, chunk);
+                pos += 4;
+            }
+        }
+        else
+        {
+            /* Increment overflow counter */
+            rw.InterlockedAdd(8, 1);
         }
     }
 
-    #define G_Print(fmt) do {                                   \
-        G_TempPrintBuffer __tmp;                                \
-        __tmp.byte_pos = 0;                                     \
-        u32 __pos = 0;                                          \
-        while (U32FromChar(fmt[__pos]) != 0)                    \
-        {                                                       \
-            G_PushPrintChar(__tmp, U32FromChar(fmt[__pos]));    \
-            ++__pos;                                            \
-        }                                                       \
-        G_PushPrintChar(__tmp, 0);                              \
-        G_CommitPrint(__tmp);                                   \
+    #define G_Print(fmt) do {                                                     \
+        G_TempPrintBuffer __tmp;                                                  \
+        __tmp.char_pos = 0; __tmp.args_count = 0;  /* Read by G_CommitPrint */    \
+        u32 __pos = 0;                                                            \
+        while (U32FromChar(fmt[__pos]) != 0)                                      \
+        {                                                                         \
+            G_PushPrintChar(__tmp, U32FromChar(fmt[__pos]));                      \
+            ++__pos;                                                              \
+        }                                                                         \
+        __tmp.fmt_size = __tmp.char_pos;  /* NUL is not stored */                 \
+        G_CommitPrint(__tmp);                                                     \
     } while (0)
+
+    #define G_PrintF(fmt, ...) do {  /* Like G_Print, plus typed args */          \
+        G_TempPrintBuffer __tmp;                                                  \
+        __tmp.char_pos = 0;                                                       \
+        u32 __pos = 0;                                                            \
+        while (U32FromChar(fmt[__pos]) != 0)                                      \
+        {                                                                         \
+            G_PushPrintChar(__tmp, U32FromChar(fmt[__pos]));                      \
+            ++__pos;                                                              \
+        }                                                                         \
+        G_FmtArg __args[] = { __VA_ARGS__ };  /* Values wrapped by G_Fmt() */     \
+        __tmp.fmt_size = __tmp.char_pos;  /* Format bytes only, no args */        \
+        __tmp.args_count = countof(__args);                                       \
+        for (u32 __arg_idx = 0; __arg_idx < countof(__args); ++__arg_idx)         \
+        {  /* 5 bytes per arg: kind, then v low byte first */                     \
+            G_PushPrintChar(__tmp, __args[__arg_idx].kind);                       \
+            G_PushPrintChar(__tmp, __args[__arg_idx].v >>  0);                    \
+            G_PushPrintChar(__tmp, __args[__arg_idx].v >>  8);                    \
+            G_PushPrintChar(__tmp, __args[__arg_idx].v >> 16);                    \
+            G_PushPrintChar(__tmp, __args[__arg_idx].v >> 24);                    \
+        }                                                                         \
+        G_CommitPrint(__tmp);                                                     \
+    } while (0)
+
 #else
     #define G_Print(fmt)
+    #define G_PrintF(fmt, ...) /* Variadic like the enabled version so call sites still compile when printing is off */
 #endif
diff --git a/src/ui/ui_shaders.g b/src/ui/ui_shaders.g
index 5b64e281..43d9247c 100644
--- a/src/ui/ui_shaders.g
+++ b/src/ui/ui_shaders.g
@@ -145,7 +145,10 @@ PixelShader(UI_BlitPS, UI_BlitPSOutput, UI_BlitPSInput input)
     Vec2 uv = input.src_uv;
     Vec4 result = tex.Sample(sampler, uv);
 
-    G_Print("Hello there!");
+    // G_Print("Hello there!");
+    G_PrintF("Hello there: \"%F\"", G_Fmt(3.123));
+
+
 
     UI_BlitPSOutput output;
     output.SV_Target0 = result;