shader printf arg parsing on cpu

This commit is contained in:
jacob 2025-12-10 20:21:08 -06:00
parent f911e98c98
commit bc76a511e6
5 changed files with 193 additions and 38 deletions

View File

@ -24,7 +24,11 @@ typedef float4x4 Mat4x4;
////////////////////////////////////////////////////////////
//~ Countof
/* Returns N, the element count the array parameter was declared with. */
/* The result comes from the template argument alone; the array contents are never read. */
/* NOTE(review): the one-line form and the multi-line form below are the same definition */
/* (old and new formatting shown by this diff) - confirm only one of them is kept. */
template<typename T, u32 N> u32 countof(T arr[N]) { return N; }
template<typename T, u32 N>
u32 countof(T arr[N])
{
return N;
}
////////////////////////////////////////////////////////////
//~ Color helpers

View File

@ -73,6 +73,8 @@
#define GPU_DEBUG_VALIDATION 1
/* If enabled, the shader-side print helpers (G_Fmt / G_Print / G_PrintF) are compiled in */
#define GPU_SHADER_PRINT 1
/* Size assigned to each queue's print buffer. Deliberately has no trailing ';' so the macro stays usable inside expressions */
#define GPU_SHADER_PRINT_BUFFER_SIZE Kibi(1)
/* If enabled, the collection worker deserializes the shader prints it reads back */
#define GPU_SHADER_PRINT_LOG 1
/* If enabled, bitbuffs will insert/verify magic numbers & length for each read & write */
#define BITBUFF_DEBUG 0

View File

@ -300,7 +300,7 @@ void G_Bootstrap(void)
if (kind != G_QueueKind_AsyncCopy)
{
G_ArenaHandle gpu_perm = G_PermArena();
queue->print_buffer_size = Mebi(64);
queue->print_buffer_size = GPU_SHADER_PRINT_BUFFER_SIZE;
queue->print_buffer = G_PushBuffer(
gpu_perm,
u8,
@ -2840,7 +2840,6 @@ void G_D12_CollectionWorkerEntry(WaveLaneCtx *lane)
.flags = G_ResourceFlag_HostMemory
);
u32 zero = 0;
for (;;)
{
/* FIXME: Remove this */
@ -2851,20 +2850,96 @@ void G_D12_CollectionWorkerEntry(WaveLaneCtx *lane)
{
/* Copy print buffer to readback buffer */
G_CopyBufferToBuffer(cl, readback_buff, 0, queue->print_buffer, RNGU64(0, queue->print_buffer_size));
/* Reset size to 0 */
/* Reset counters to 0 */
G_MemorySync(cl, queue->print_buffer,
G_Stage_Copy, G_Access_CopyRead,
G_Stage_Copy, G_Access_CopyWrite
);
G_CopyCpuToBuffer(cl, queue->print_buffer, 0, &zero, RNGU64(0, 4));
u8 zero[12] = ZI;
G_CopyCpuToBuffer(cl, queue->print_buffer, 0, zero, RNGU64(0, sizeof(zero)));
}
i64 completion = G_CommitCommandList(cl);
G_CommitCommandList(cl);
G_SyncCpu(G_MaskFromQueue(queue_kind));
u32 size = *G_StructFromResource(readback_buff, u32);
u8 *text = G_StructFromResource(readback_buff, u8) + 4;
u32 attempted_print_bytes_count = *(G_StructFromResource(readback_buff, u32) + 0);
u32 prints_count = *(G_StructFromResource(readback_buff, u32) + 1);
u32 overflows_count = *(G_StructFromResource(readback_buff, u32) + 2);
u8 *start = G_StructFromResource(readback_buff, u8) + 12;
String s = STRING(size, text);
/* Deserialize the records read back from the shader print buffer.
 *
 * Record layout, as consumed below:
 *   u32 header - low 16 bits: fmt char count, high 16 bits: arg count
 *   u8 fmt[]   - fmt chars
 *   per arg    - 1 kind byte (G_FmtArgKind) followed by 4 value bytes, least significant byte first
 *   padding    - up to the next 4 byte boundary
 */
if (GPU_SHADER_PRINT_LOG)
{
  /* FIXME: Remove this */
  TempArena scratch = BeginScratchNoConflict();
  u8 *at = start;
  {
    for (u32 print_num = 1; print_num <= prints_count; ++print_num)
    {
      u32 chars_count = 0;
      u32 args_count = 0;
      {
        u32 header = *(u32 *)at;
        chars_count = (header & 0x0000FFFF) >> 0;
        args_count = (header & 0xFFFF0000) >> 16;
        at += 4;
      }

      String fmt = ZI;
      {
        fmt.len = chars_count;
        fmt.text = at;
        at += chars_count;
      }

      FmtArg *args = 0;
      {
        if (args_count > 0)
        {
          args = PushStructs(scratch.arena, FmtArg, args_count);
          /* Strictly less than: exactly args_count slots were pushed and exactly
           * args_count args were serialized. Going one further would write past
           * the array and eat 5 bytes of the next record. */
          for (u32 arg_idx = 0; arg_idx < args_count; ++arg_idx)
          {
            G_FmtArgKind gpu_kind = (G_FmtArgKind)(*at);
            at += 1;

            /* The value follows the 1-byte kind, so it is generally not 4 byte
             * aligned: assemble it from bytes rather than loading through a u32 pointer */
            u32 gpu_data =
              ((u32)at[0] << 0) |
              ((u32)at[1] << 8) |
              ((u32)at[2] << 16) |
              ((u32)at[3] << 24);
            at += 4;

            FmtArg *dst = &args[arg_idx];
            switch (gpu_kind)
            {
              case G_FmtArgKind_U32:
              {
                dst->kind = FmtArgKind_Uint;
                dst->value.uint = gpu_data;
              } break;

              case G_FmtArgKind_I32:
              {
                dst->kind = FmtArgKind_Sint;
                dst->value.sint = (i32)gpu_data;
              } break;

              case G_FmtArgKind_F32:
              {
                /* Value bytes hold the raw bits of the float */
                dst->kind = FmtArgKind_Float;
                dst->value.f = *(f32 *)&gpu_data;
              } break;

              /* Any other kind leaves dst untouched */
              default: break;
            }
          }
        }
      }

      // String final_str = ZI;
      // if (args_count > 0)
      // {
      // }
      // else
      // {
      //   final_str = PushString(scratch.arena, fmt);
      // }

      /* Records start on 4 byte boundaries */
      at = (u8 *)AlignU64((u64)at, 4);
    }
  }
  EndScratch(scratch);
}
DEBUGBREAKABLE;
}

View File

@ -109,62 +109,133 @@ G_ForceDeclConstant(G_RWByteAddressBufferRef, G_ShaderConst_PrintBufferRef, 8)
/* This technique is based on MJP's article: https://therealmjp.github.io/posts/hlsl-printf/ */
/* Tags the type of the value carried by a G_FmtArg. */
/* G_PrintF serializes the kind as a single byte; the CPU-side parser maps */
/* U32 / I32 / F32 to FmtArgKind_Uint / FmtArgKind_Sint / FmtArgKind_Float. */
Enum(G_FmtArgKind)
{
G_FmtArgKind_None,
/* Set by G_Fmt(u32) */
G_FmtArgKind_U32,
/* Set by G_Fmt(i32) */
G_FmtArgKind_I32,
/* Set by G_Fmt(f32) */
G_FmtArgKind_F32,
};
/* One shader print argument: a type tag plus the value stored in a u32. Built with G_Fmt. */
Struct(G_FmtArg)
{
/* Which G_Fmt overload produced the value */
G_FmtArgKind kind;
/* The value as 32 bits; G_Fmt(f32) stores the float through asuint */
u32 v;
};
#if IsLanguageG && GPU_SHADER_PRINT
/* Wraps an unsigned 32 bit value as a print argument */
G_FmtArg G_Fmt(u32 v)
{
  G_FmtArg arg;
  arg.v = v;
  arg.kind = G_FmtArgKind_U32;
  return arg;
}

/* Wraps a signed 32 bit value as a print argument */
G_FmtArg G_Fmt(i32 v)
{
  G_FmtArg arg;
  arg.v = v;
  arg.kind = G_FmtArgKind_I32;
  return arg;
}

/* Wraps a 32 bit float as a print argument, keeping its raw bits */
G_FmtArg G_Fmt(f32 v)
{
  G_FmtArg arg;
  arg.v = asuint(v);
  arg.kind = G_FmtArgKind_F32;
  return arg;
}
/* Per-print staging buffer filled by G_PushPrintChar and flushed by G_CommitPrint. */
/* NOTE(review): data_u32 / byte_pos look like the pre-rename names of char_chunks / char_pos */
/* (removed side of this diff) - confirm only the renamed pair is kept. */
Struct(G_TempPrintBuffer)
{
u32 data_u32[256];
u32 byte_pos;
/* Pushed chars, packed 4 per u32 */
u32 char_chunks[256];
/* Number of chars pushed so far */
u32 char_pos;
/* Value of char_pos right after the format string was pushed; goes into the low half of the record header */
u32 fmt_size;
/* Number of serialized args; goes into the high half of the record header */
u32 args_count;
};
/* Appends the low 8 bits of the given value to buff. */
/* Chars are packed 4 per u32: char number i lands in chunk i / 4 at bit offset (i & 3) * 8. */
/* There is no capacity check yet (see the TODO below). */
/* NOTE(review): the lines using `c`, `data_u32` and `byte_pos` are the pre-rename versions of */
/* the lines that follow them in this diff - confirm only the renamed ones are kept. */
void G_PushPrintChar(inout G_TempPrintBuffer buff, u32 c)
void G_PushPrintChar(inout G_TempPrintBuffer buff, u32 v)
{
/* TODO: Overflow check */
u32 u32_arr_pos = buff.byte_pos / 4;
u32 idx_in_u32 = buff.byte_pos & 0x03;
u32 u32_arr_pos = buff.char_pos / 4;
u32 idx_in_u32 = buff.char_pos & 0x03;
if (idx_in_u32 == 0)
{
/* Since buff is not zero initialized, we set the byte on first write here */
buff.data_u32[u32_arr_pos] = c & 0xFF;
/* Since buff is not zero initialized, we set the chunk on first write here */
buff.char_chunks[u32_arr_pos] = v & 0xFF;
}
else
{
buff.data_u32[u32_arr_pos] |= (c & 0xFF) << (idx_in_u32 * 8);
buff.char_chunks[u32_arr_pos] |= (v & 0xFF) << (idx_in_u32 * 8);
}
buff.byte_pos += 1;
buff.char_pos += 1;
}
/* Appends the contents of buff as one record to the shader print buffer.
 *
 * Print buffer layout (byte offsets):
 *   0  - base counter: running sum of the record sizes requested by G_CommitPrint calls
 *   4  - success counter: number of records actually stored
 *   8  - overflow counter: number of records dropped because they did not fit
 *   12 - records, each a u32 header followed by the packed char chunks
 *
 * Record header: low 16 bits = buff.fmt_size, high 16 bits = buff.args_count.
 */
void G_CommitPrint(G_TempPrintBuffer buff)
{
  RWByteAddressBuffer rw = G_Dereference(G_ShaderConst_PrintBufferRef);

  u32 chunks_count = (buff.char_pos + 3) / 4;
  u32 alloc_size = 0;
  alloc_size += 4; /* Header */
  alloc_size += chunks_count * 4; /* Chunks */

  /* Reserve alloc_size bytes; base receives the counter value from before the add */
  u32 base;
  rw.InterlockedAdd(0, alloc_size, base); /* Write to base counter */

  /* pos is the absolute byte offset of this record inside the print buffer */
  u32 pos = base;
  pos += 4; /* Offset for base counter */
  pos += 4; /* Offset for success counter */
  pos += 4; /* Offset for overflow counter */

  if ((pos + alloc_size) < countof(rw))
  {
    /* Increment success counter */
    rw.InterlockedAdd(4, 1);

    /* Store header */
    {
      u32 header = 0;
      header |= (buff.fmt_size << 0) & 0x0000FFFF;
      header |= (buff.args_count << 16) & 0xFFFF0000;
      /* pos already contains base, so it must not be added a second time */
      rw.Store(pos, header);
      pos += 4;
    }

    /* Store chunks */
    for (u32 chunk_idx = 0; chunk_idx < chunks_count; ++chunk_idx)
    {
      u32 chunk = buff.char_chunks[chunk_idx];
      rw.Store(pos, chunk);
      pos += 4;
    }
  }
  else
  {
    /* Increment overflow counter */
    rw.InterlockedAdd(8, 1);
  }
}
/* Prints a plain string from a shader: packs the chars of fmt into a temp buffer */
/* and commits it as one record carrying zero args. */
/* args_count is set explicitly because the temp buffer is not zero initialized and */
/* G_CommitPrint packs that field into the record header. */
#define G_Print(fmt) do {                                  \
  G_TempPrintBuffer __tmp;                                 \
  __tmp.char_pos = 0;                                      \
  __tmp.args_count = 0;                                    \
  u32 __pos = 0;                                           \
  while (U32FromChar(fmt[__pos]) != 0)                     \
  {                                                        \
    G_PushPrintChar(__tmp, U32FromChar(fmt[__pos]));       \
    ++__pos;                                               \
  }                                                        \
  __tmp.fmt_size = __tmp.char_pos;                         \
  G_CommitPrint(__tmp);                                    \
} while (0)
/* Prints a format string plus arguments from a shader. */
/* Every variadic argument must already be a G_FmtArg (build them with G_Fmt). */
/* The chars of fmt are pushed first and their count is recorded in fmt_size. */
/* Each argument is then appended as 5 chars: its kind, followed by the 4 bytes of its */
/* value, least significant byte first. The result is committed as one record. */
#define G_PrintF(fmt, ...) do { \
G_TempPrintBuffer __tmp; \
__tmp.char_pos = 0; \
u32 __pos = 0; \
while (U32FromChar(fmt[__pos]) != 0) \
{ \
G_PushPrintChar(__tmp, U32FromChar(fmt[__pos])); \
++__pos; \
} \
G_FmtArg __args[] = { __VA_ARGS__ }; \
__tmp.fmt_size = __tmp.char_pos; \
__tmp.args_count = countof(__args); \
for (u32 __arg_idx = 0; __arg_idx < countof(__args); ++__arg_idx) \
{ \
G_PushPrintChar(__tmp, __args[__arg_idx].kind); \
G_PushPrintChar(__tmp, __args[__arg_idx].v >> 0); \
G_PushPrintChar(__tmp, __args[__arg_idx].v >> 8); \
G_PushPrintChar(__tmp, __args[__arg_idx].v >> 16); \
G_PushPrintChar(__tmp, __args[__arg_idx].v >> 24); \
} \
G_CommitPrint(__tmp); \
} while (0)
#else
/* Shader printing disabled: both macros expand to nothing. */
/* G_PrintF stays variadic so call sites that pass arguments still compile. */
#define G_Print(fmt)
#define G_PrintF(fmt, ...)
#endif

View File

@ -145,7 +145,10 @@ PixelShader(UI_BlitPS, UI_BlitPSOutput, UI_BlitPSInput input)
Vec2 uv = input.src_uv;
Vec4 result = tex.Sample(sampler, uv);
G_Print("Hello there!");
// G_Print("Hello there!");
G_PrintF("Hello there: \"%F\"", G_Fmt(3.123));
UI_BlitPSOutput output;
output.SV_Target0 = result;