shader printf arg parsing on cpu

This commit is contained in:
jacob 2025-12-10 20:21:08 -06:00
parent f911e98c98
commit bc76a511e6
5 changed files with 193 additions and 38 deletions

View File

@ -24,7 +24,11 @@ typedef float4x4 Mat4x4;
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
//~ Countof //~ Countof
/* Yields the compile-time element count N of a fixed-size array argument */
template<typename T, u32 N>
u32 countof(T arr[N])
{
  return N;
}
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
//~ Color helpers //~ Color helpers

View File

@ -73,6 +73,8 @@
#define GPU_DEBUG_VALIDATION 1 #define GPU_DEBUG_VALIDATION 1
#define GPU_SHADER_PRINT 1 #define GPU_SHADER_PRINT 1
/* Size assigned to each queue's shader print buffer (no trailing ';' so the macro is usable inside expressions) */
#define GPU_SHADER_PRINT_BUFFER_SIZE Kibi(1)
/* If enabled, the collection worker deserializes the shader prints it reads back */
#define GPU_SHADER_PRINT_LOG 1
/* If enabled, bitbuffs will insert/verify magic numbers & length for each read & write */ /* If enabled, bitbuffs will insert/verify magic numbers & length for each read & write */
#define BITBUFF_DEBUG 0 #define BITBUFF_DEBUG 0

View File

@ -300,7 +300,7 @@ void G_Bootstrap(void)
if (kind != G_QueueKind_AsyncCopy) if (kind != G_QueueKind_AsyncCopy)
{ {
G_ArenaHandle gpu_perm = G_PermArena(); G_ArenaHandle gpu_perm = G_PermArena();
queue->print_buffer_size = Mebi(64); queue->print_buffer_size = GPU_SHADER_PRINT_BUFFER_SIZE;
queue->print_buffer = G_PushBuffer( queue->print_buffer = G_PushBuffer(
gpu_perm, gpu_perm,
u8, u8,
@ -2840,7 +2840,6 @@ void G_D12_CollectionWorkerEntry(WaveLaneCtx *lane)
.flags = G_ResourceFlag_HostMemory .flags = G_ResourceFlag_HostMemory
); );
u32 zero = 0;
for (;;) for (;;)
{ {
/* FIXME: Remove this */ /* FIXME: Remove this */
@ -2851,20 +2850,96 @@ void G_D12_CollectionWorkerEntry(WaveLaneCtx *lane)
{ {
/* Copy print buffer to readback buffer */ /* Copy print buffer to readback buffer */
G_CopyBufferToBuffer(cl, readback_buff, 0, queue->print_buffer, RNGU64(0, queue->print_buffer_size)); G_CopyBufferToBuffer(cl, readback_buff, 0, queue->print_buffer, RNGU64(0, queue->print_buffer_size));
/* Reset size to 0 */ /* Reset counters to 0 */
G_MemorySync(cl, queue->print_buffer, G_MemorySync(cl, queue->print_buffer,
G_Stage_Copy, G_Access_CopyRead, G_Stage_Copy, G_Access_CopyRead,
G_Stage_Copy, G_Access_CopyWrite G_Stage_Copy, G_Access_CopyWrite
); );
G_CopyCpuToBuffer(cl, queue->print_buffer, 0, &zero, RNGU64(0, 4)); u8 zero[12] = ZI;
G_CopyCpuToBuffer(cl, queue->print_buffer, 0, zero, RNGU64(0, sizeof(zero)));
} }
i64 completion = G_CommitCommandList(cl); G_CommitCommandList(cl);
G_SyncCpu(G_MaskFromQueue(queue_kind)); G_SyncCpu(G_MaskFromQueue(queue_kind));
u32 size = *G_StructFromResource(readback_buff, u32); u32 attempted_print_bytes_count = *(G_StructFromResource(readback_buff, u32) + 0);
u8 *text = G_StructFromResource(readback_buff, u8) + 4; u32 prints_count = *(G_StructFromResource(readback_buff, u32) + 1);
u32 overflows_count = *(G_StructFromResource(readback_buff, u32) + 2);
u8 *start = G_StructFromResource(readback_buff, u8) + 12;
String s = STRING(size, text); /* Deserialize */
/*
 * Walk the print records that follow the three u32 counters in the readback
 * buffer. Each record is:
 *   u32 header   - low 16 bits: format char count, high 16 bits: arg count
 *   u8  chars[]  - format text (not null terminated)
 *   per arg      - 1 kind byte (G_FmtArgKind) + 4 payload bytes, lowest byte first
 *   padding up to the next 4-byte boundary
 */
if (GPU_SHADER_PRINT_LOG)
{
  /* FIXME: Remove this */
  TempArena scratch = BeginScratchNoConflict();
  u8 *at = start;
  {
    for (u32 print_num = 1; print_num <= prints_count; ++print_num)
    {
      u32 chars_count = 0;
      u32 args_count = 0;
      {
        u32 header = *(u32 *)at;
        chars_count = (header & 0x0000FFFF) >> 0;
        args_count = (header & 0xFFFF0000) >> 16;
        at += 4;
      }

      /* The format string points straight into the readback memory */
      String fmt = ZI;
      {
        fmt.len = chars_count;
        fmt.text = at;
        at += chars_count;
      }

      FmtArg *args = 0;
      {
        if (args_count > 0)
        {
          args = PushStructs(scratch.arena, FmtArg, args_count);
          /* Strictly less-than: exactly args_count entries were allocated and serialized */
          for (u32 arg_idx = 0; arg_idx < args_count; ++arg_idx)
          {
            G_FmtArgKind gpu_kind = (G_FmtArgKind)(*at);
            at += 1;

            /* The payload follows an arbitrary number of chars plus the kind byte,
             * so it is generally not 4-byte aligned; assemble it from bytes */
            u32 gpu_data = ((u32)at[0] << 0) |
                           ((u32)at[1] << 8) |
                           ((u32)at[2] << 16) |
                           ((u32)at[3] << 24);
            at += 4;

            FmtArg *dst = &args[arg_idx];
            switch (gpu_kind)
            {
              case G_FmtArgKind_U32:
              {
                dst->kind = FmtArgKind_Uint;
                dst->value.uint = gpu_data;
              } break;

              case G_FmtArgKind_I32:
              {
                dst->kind = FmtArgKind_Sint;
                dst->value.sint = (i32)gpu_data;
              } break;

              case G_FmtArgKind_F32:
              {
                /* Payload holds the float's raw bits */
                dst->kind = FmtArgKind_Float;
                dst->value.f = *(f32 *)&gpu_data;
              } break;
            }
          }
        }
      }

      // String final_str = ZI;
      // if (args_count > 0)
      // {
      // }
      // else
      // {
      // final_str = PushString(scratch.arena, fmt);
      // }

      /* Records begin on 4-byte boundaries */
      at = (u8 *)AlignU64((u64)at, 4);
    }
  }
  EndScratch(scratch);
}
DEBUGBREAKABLE; DEBUGBREAKABLE;
} }

View File

@ -109,62 +109,133 @@ G_ForceDeclConstant(G_RWByteAddressBufferRef, G_ShaderConst_PrintBufferRef, 8)
/* This technique is based on MJP's article: https://therealmjp.github.io/posts/hlsl-printf/ */ /* This technique is based on MJP's article: https://therealmjp.github.io/posts/hlsl-printf/ */
/*
 * Tag describing how the 32-bit payload of a shader print argument should be
 * interpreted. The shader pushes this value as a single byte and the CPU casts
 * that byte back to the enum, so the numeric values are spelled out.
 */
Enum(G_FmtArgKind)
{
  G_FmtArgKind_None = 0,
  G_FmtArgKind_U32 = 1,
  G_FmtArgKind_I32 = 2,
  G_FmtArgKind_F32 = 3,
};
/* One shader print argument: a kind tag plus the argument's raw 32-bit payload */
Struct(G_FmtArg)
{
G_FmtArgKind kind; /* How `v` should be interpreted */
u32 v; /* Value bits (G_Fmt stores floats via asuint) */
};
#if IsLanguageG && GPU_SHADER_PRINT #if IsLanguageG && GPU_SHADER_PRINT
/*
 * G_Fmt overloads: wrap a scalar into a tagged G_FmtArg so it can be passed
 * to G_PrintF. The payload always ends up in the 32-bit `v` field.
 */
G_FmtArg G_Fmt(u32 v)
{
  G_FmtArg arg;
  arg.v = v;
  arg.kind = G_FmtArgKind_U32;
  return arg;
}

G_FmtArg G_Fmt(i32 v)
{
  G_FmtArg arg;
  arg.v = v;
  arg.kind = G_FmtArgKind_I32;
  return arg;
}

G_FmtArg G_Fmt(f32 v)
{
  /* Floats travel as their raw bit pattern */
  G_FmtArg arg;
  arg.v = asuint(v);
  arg.kind = G_FmtArgKind_F32;
  return arg;
}
/* Staging area a print is built in before G_CommitPrint copies it out */
Struct(G_TempPrintBuffer)
{
  u32 char_chunks[256]; /* Pushed bytes, packed four per u32 with the first byte in the low bits */
  u32 char_pos;         /* Number of bytes pushed so far */
  u32 fmt_size;         /* Stored in the low 16 bits of the committed header */
  u32 args_count;       /* Stored in the high 16 bits of the committed header */
};
/*
 * Appends the low 8 bits of `v` to `buff`, packing four bytes into each
 * u32 chunk (earlier bytes occupy lower bits), then advances char_pos.
 */
void G_PushPrintChar(inout G_TempPrintBuffer buff, u32 v)
{
  /* TODO: Overflow check */
  u32 byte_value = v & 0xFF;
  u32 chunk_idx = buff.char_pos >> 2;
  u32 lane = buff.char_pos & 0x03;
  if (lane != 0)
  {
    buff.char_chunks[chunk_idx] |= byte_value << (lane * 8);
  }
  else
  {
    /* buff is not zero initialized, so the first byte of a chunk overwrites it */
    buff.char_chunks[chunk_idx] = byte_value;
  }
  buff.char_pos += 1;
}
/*
 * Copies a staged print into the shared print buffer.
 *
 * Print buffer layout (byte offsets):
 *   0  u32 running total of bytes requested by all prints (bumped even on overflow)
 *   4  u32 number of prints that were stored
 *   8  u32 number of prints that did not fit
 *   12 print records, back to back: u32 header followed by the packed chunks
 *
 * Header: low 16 bits = buff.fmt_size, high 16 bits = buff.args_count.
 */
void G_CommitPrint(G_TempPrintBuffer buff)
{
  RWByteAddressBuffer rw = G_Dereference(G_ShaderConst_PrintBufferRef);

  u32 chunks_count = (buff.char_pos + 3) / 4;
  u32 alloc_size = 0;
  alloc_size += 4; /* Header */
  alloc_size += chunks_count * 4; /* Chunks */

  /* Reserve space; `base` receives the total requested before this print */
  u32 base;
  rw.InterlockedAdd(0, alloc_size, base); /* Write to base counter */

  /* `pos` is the absolute byte offset of this record inside the print buffer */
  u32 pos = base;
  pos += 4; /* Offset for base counter */
  pos += 4; /* Offset for success counter */
  pos += 4; /* Offset for overflow counter */

  if ((pos + alloc_size) < countof(rw))
  {
    /* Increment success counter */
    rw.InterlockedAdd(4, 1);

    /* Store header */
    {
      u32 header = 0;
      header |= (buff.fmt_size << 0) & 0x0000FFFF;
      header |= (buff.args_count << 16) & 0xFFFF0000;
      /* `pos` already includes `base`; adding it again would misplace every record after the first */
      rw.Store(pos, header);
      pos += 4;
    }

    /* Store chunks */
    for (u32 chunk_idx = 0; chunk_idx < chunks_count; ++chunk_idx)
    {
      u32 chunk = buff.char_chunks[chunk_idx];
      rw.Store(pos, chunk);
      pos += 4;
    }
  }
  else
  {
    /* Increment overflow counter */
    rw.InterlockedAdd(8, 1);
  }
}
/*
 * G_Print(fmt): pushes every character of `fmt` (up to, not including, the
 * terminating 0) into a temp buffer and commits it as a print with no args.
 * args_count is set to 0 explicitly because the temp buffer is not zero
 * initialized and the value is written into the committed header.
 */
#define G_Print(fmt) do { \
  G_TempPrintBuffer __tmp; \
  __tmp.char_pos = 0; \
  __tmp.args_count = 0; \
  u32 __pos = 0; \
  while (U32FromChar(fmt[__pos]) != 0) \
  { \
    G_PushPrintChar(__tmp, U32FromChar(fmt[__pos])); \
    ++__pos; \
  } \
  __tmp.fmt_size = __tmp.char_pos; \
  G_CommitPrint(__tmp); \
} while (0)
/*
 * G_PrintF(fmt, ...): print with arguments. The variadic arguments initialize
 * a G_FmtArg array. The characters of `fmt` are pushed first and their count
 * recorded as fmt_size; then each argument is pushed as 5 bytes: its kind,
 * followed by the 4 bytes of its payload from least to most significant.
 * The result is handed to G_CommitPrint.
 * NOTE(review): presumably needs at least one argument, since the array is
 * initialized directly from __VA_ARGS__ - confirm.
 */
#define G_PrintF(fmt, ...) do { \
G_TempPrintBuffer __tmp; \
__tmp.char_pos = 0; \
u32 __pos = 0; \
while (U32FromChar(fmt[__pos]) != 0) \
{ \
G_PushPrintChar(__tmp, U32FromChar(fmt[__pos])); \
++__pos; \
} \
G_FmtArg __args[] = { __VA_ARGS__ }; \
__tmp.fmt_size = __tmp.char_pos; \
__tmp.args_count = countof(__args); \
for (u32 __arg_idx = 0; __arg_idx < countof(__args); ++__arg_idx) \
{ \
G_PushPrintChar(__tmp, __args[__arg_idx].kind); \
G_PushPrintChar(__tmp, __args[__arg_idx].v >> 0); \
G_PushPrintChar(__tmp, __args[__arg_idx].v >> 8); \
G_PushPrintChar(__tmp, __args[__arg_idx].v >> 16); \
G_PushPrintChar(__tmp, __args[__arg_idx].v >> 24); \
} \
G_CommitPrint(__tmp); \
} while (0)
#else #else
#define G_Print(fmt) #define G_Print(fmt)
/* Variadic like the enabled version, so call sites with arguments expand to nothing */
#define G_PrintF(fmt, ...)
#endif #endif

View File

@ -145,7 +145,10 @@ PixelShader(UI_BlitPS, UI_BlitPSOutput, UI_BlitPSInput input)
Vec2 uv = input.src_uv; Vec2 uv = input.src_uv;
Vec4 result = tex.Sample(sampler, uv); Vec4 result = tex.Sample(sampler, uv);
G_Print("Hello there!"); // G_Print("Hello there!");
G_PrintF("Hello there: \"%F\"", G_Fmt(3.123));
UI_BlitPSOutput output; UI_BlitPSOutput output;
output.SV_Target0 = result; output.SV_Target0 = result;