shader printf arg parsing on cpu
This commit is contained in:
parent
f911e98c98
commit
bc76a511e6
@ -24,7 +24,11 @@ typedef float4x4 Mat4x4;
|
||||
////////////////////////////////////////////////////////////
|
||||
//~ Countof
|
||||
|
||||
template<typename T, u32 N> u32 countof(T arr[N]) { return N; }
|
||||
template<typename T, u32 N>
|
||||
u32 countof(T arr[N])
|
||||
{
|
||||
return N;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
//~ Color helpers
|
||||
|
||||
@ -73,6 +73,8 @@
|
||||
#define GPU_DEBUG_VALIDATION 1
|
||||
|
||||
#define GPU_SHADER_PRINT 1
|
||||
#define GPU_SHADER_PRINT_BUFFER_SIZE Kibi(1);
|
||||
#define GPU_SHADER_PRINT_LOG 1
|
||||
|
||||
/* If enabled, bitbuffs will insert/verify magic numbers & length for each read & write */
|
||||
#define BITBUFF_DEBUG 0
|
||||
|
||||
@ -300,7 +300,7 @@ void G_Bootstrap(void)
|
||||
if (kind != G_QueueKind_AsyncCopy)
|
||||
{
|
||||
G_ArenaHandle gpu_perm = G_PermArena();
|
||||
queue->print_buffer_size = Mebi(64);
|
||||
queue->print_buffer_size = GPU_SHADER_PRINT_BUFFER_SIZE;
|
||||
queue->print_buffer = G_PushBuffer(
|
||||
gpu_perm,
|
||||
u8,
|
||||
@ -2840,7 +2840,6 @@ void G_D12_CollectionWorkerEntry(WaveLaneCtx *lane)
|
||||
.flags = G_ResourceFlag_HostMemory
|
||||
);
|
||||
|
||||
u32 zero = 0;
|
||||
for (;;)
|
||||
{
|
||||
/* FIXME: Remove this */
|
||||
@ -2851,20 +2850,96 @@ void G_D12_CollectionWorkerEntry(WaveLaneCtx *lane)
|
||||
{
|
||||
/* Copy print buffer to readback buffer */
|
||||
G_CopyBufferToBuffer(cl, readback_buff, 0, queue->print_buffer, RNGU64(0, queue->print_buffer_size));
|
||||
/* Reset size to 0 */
|
||||
/* Reset counters to 0 */
|
||||
G_MemorySync(cl, queue->print_buffer,
|
||||
G_Stage_Copy, G_Access_CopyRead,
|
||||
G_Stage_Copy, G_Access_CopyWrite
|
||||
);
|
||||
G_CopyCpuToBuffer(cl, queue->print_buffer, 0, &zero, RNGU64(0, 4));
|
||||
u8 zero[12] = ZI;
|
||||
G_CopyCpuToBuffer(cl, queue->print_buffer, 0, zero, RNGU64(0, sizeof(zero)));
|
||||
}
|
||||
i64 completion = G_CommitCommandList(cl);
|
||||
G_CommitCommandList(cl);
|
||||
|
||||
G_SyncCpu(G_MaskFromQueue(queue_kind));
|
||||
u32 size = *G_StructFromResource(readback_buff, u32);
|
||||
u8 *text = G_StructFromResource(readback_buff, u8) + 4;
|
||||
u32 attempted_print_bytes_count = *(G_StructFromResource(readback_buff, u32) + 0);
|
||||
u32 prints_count = *(G_StructFromResource(readback_buff, u32) + 1);
|
||||
u32 overflows_count = *(G_StructFromResource(readback_buff, u32) + 2);
|
||||
u8 *start = G_StructFromResource(readback_buff, u8) + 12;
|
||||
|
||||
String s = STRING(size, text);
|
||||
/* Deserialize */
|
||||
if (GPU_SHADER_PRINT_LOG)
|
||||
{
|
||||
/* FIXME: Remove this */
|
||||
TempArena scratch = BeginScratchNoConflict();
|
||||
u8 *at = start;
|
||||
{
|
||||
for (u32 print_num = 1; print_num <= prints_count; ++print_num)
|
||||
{
|
||||
u32 chars_count = 0;
|
||||
u32 args_count = 0;
|
||||
{
|
||||
u32 header = *(u32 *)at;
|
||||
chars_count = (header & 0x0000FFFF) >> 0;
|
||||
args_count = (header & 0xFFFF0000) >> 16;
|
||||
at += 4;
|
||||
}
|
||||
|
||||
String fmt = ZI;
|
||||
{
|
||||
fmt.len = chars_count;
|
||||
fmt.text = at;
|
||||
at += chars_count;
|
||||
}
|
||||
|
||||
FmtArg *args = 0;
|
||||
{
|
||||
if (args_count > 0)
|
||||
{
|
||||
args = PushStructs(scratch.arena, FmtArg, args_count);
|
||||
for (u32 arg_idx = 0; arg_idx <= args_count; ++arg_idx)
|
||||
{
|
||||
G_FmtArgKind gpu_kind = (G_FmtArgKind)(*at);
|
||||
at += 1;
|
||||
u32 gpu_data = *(u32 *)at;
|
||||
at += 4;
|
||||
|
||||
FmtArg *dst = &args[arg_idx];
|
||||
switch (gpu_kind)
|
||||
{
|
||||
case G_FmtArgKind_U32:
|
||||
{
|
||||
dst->kind = FmtArgKind_Uint;
|
||||
dst->value.uint = gpu_data;
|
||||
} break;
|
||||
case G_FmtArgKind_I32:
|
||||
{
|
||||
dst->kind = FmtArgKind_Sint;
|
||||
dst->value.sint = (i32)gpu_data;
|
||||
} break;
|
||||
case G_FmtArgKind_F32:
|
||||
{
|
||||
dst->kind = FmtArgKind_Float;
|
||||
dst->value.f = *(f32 *)&gpu_data;
|
||||
} break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// String final_str = ZI;
|
||||
// if (args_count > 0)
|
||||
// {
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
// final_str = PushString(scratch.arena, fmt);
|
||||
// }
|
||||
|
||||
at = (u8 *)AlignU64((u64)at, 4);
|
||||
}
|
||||
}
|
||||
EndScratch(scratch);
|
||||
}
|
||||
|
||||
DEBUGBREAKABLE;
|
||||
}
|
||||
|
||||
@ -109,62 +109,133 @@ G_ForceDeclConstant(G_RWByteAddressBufferRef, G_ShaderConst_PrintBufferRef, 8)
|
||||
|
||||
/* This technique is based on MJP's article: https://therealmjp.github.io/posts/hlsl-printf/ */
|
||||
|
||||
/* Tag stored in front of each shader print argument so the CPU-side reader knows how to interpret its 32-bit payload.
 * G_PrintF serializes the tag through G_PushPrintChar, which keeps only the low 8 bits, so every value must fit in one byte. */
Enum(G_FmtArgKind)
|
||||
{
|
||||
G_FmtArgKind_None, /* Not produced by any G_Fmt overload; the CPU-side switch has no case for it */
|
||||
G_FmtArgKind_U32, /* Payload is an unsigned 32-bit integer */
|
||||
G_FmtArgKind_I32, /* Payload is a signed 32-bit integer */
|
||||
G_FmtArgKind_F32, /* Payload is the bit pattern of a 32-bit float */
|
||||
};
|
||||
|
||||
/* One argument for G_PrintF: a kind tag plus the argument's raw 32 bits */
Struct(G_FmtArg)
|
||||
{
|
||||
G_FmtArgKind kind; /* Selects how `v` is interpreted */
|
||||
u32 v; /* Raw payload bits; G_Fmt stores floats here via asuint */
|
||||
};
|
||||
|
||||
#if IsLanguageG && GPU_SHADER_PRINT
|
||||
/* Wrap a scalar as a G_FmtArg. The overload chosen by the argument type sets the kind tag; the value is stored in the u32 payload `v` */
G_FmtArg G_Fmt(u32 v) { G_FmtArg result; result.kind = G_FmtArgKind_U32; result.v = v; return result; }
|
||||
/* Signed overload: the i32 is assigned into the u32 payload; the CPU-side reader casts it back to i32 */
G_FmtArg G_Fmt(i32 v) { G_FmtArg result; result.kind = G_FmtArgKind_I32; result.v = v; return result; }
|
||||
/* Float overload: asuint stores the float's bit pattern rather than converting its value */
G_FmtArg G_Fmt(f32 v) { G_FmtArg result; result.kind = G_FmtArgKind_F32; result.v = asuint(v); return result; }
|
||||
|
||||
Struct(G_TempPrintBuffer)
|
||||
{
|
||||
u32 data_u32[256];
|
||||
u32 byte_pos;
|
||||
u32 char_chunks[256];
|
||||
u32 char_pos;
|
||||
u32 fmt_size;
|
||||
u32 args_count;
|
||||
};
|
||||
|
||||
void G_PushPrintChar(inout G_TempPrintBuffer buff, u32 c)
|
||||
void G_PushPrintChar(inout G_TempPrintBuffer buff, u32 v)
|
||||
{
|
||||
/* TODO: Overflow check */
|
||||
u32 u32_arr_pos = buff.byte_pos / 4;
|
||||
u32 idx_in_u32 = buff.byte_pos & 0x03;
|
||||
u32 u32_arr_pos = buff.char_pos / 4;
|
||||
u32 idx_in_u32 = buff.char_pos & 0x03;
|
||||
if (idx_in_u32 == 0)
|
||||
{
|
||||
/* Since buff is not zero initialized, we set the byte on first write here */
|
||||
buff.data_u32[u32_arr_pos] = c & 0xFF;
|
||||
/* Since buff is not zero initialized, we set the chunk on first write here */
|
||||
buff.char_chunks[u32_arr_pos] = v & 0xFF;
|
||||
}
|
||||
else
|
||||
{
|
||||
buff.data_u32[u32_arr_pos] |= (c & 0xFF) << (idx_in_u32 * 8);
|
||||
buff.char_chunks[u32_arr_pos] |= (v & 0xFF) << (idx_in_u32 * 8);
|
||||
}
|
||||
buff.byte_pos += 1;
|
||||
buff.char_pos += 1;
|
||||
}
|
||||
|
||||
void G_CommitPrint(G_TempPrintBuffer buff)
|
||||
{
|
||||
RWByteAddressBuffer rw = G_Dereference(G_ShaderConst_PrintBufferRef);
|
||||
u32 u32s_count = (buff.byte_pos + 3) / 4;
|
||||
u32 alloc_size = u32s_count * 4;
|
||||
|
||||
u32 chunks_count = (buff.char_pos + 3) / 4;
|
||||
|
||||
u32 alloc_size = 0;
|
||||
alloc_size += 4; /* Header */
|
||||
alloc_size += chunks_count * 4; /* Chunks */
|
||||
|
||||
u32 base;
|
||||
rw.InterlockedAdd(0, alloc_size, base);
|
||||
base += 4; /* Account for counter at beginning of buff */
|
||||
rw.InterlockedAdd(0, alloc_size, base); /* Write to base counter */
|
||||
|
||||
if ((base + alloc_size) < countof(rw))
|
||||
u32 pos = base;
|
||||
pos += 4; /* Offset for base counter */
|
||||
pos += 4; /* Offset for success counter */
|
||||
pos += 4; /* Offset for overflow counter */
|
||||
|
||||
if ((pos + alloc_size) < countof(rw))
|
||||
{
|
||||
for (u32 u32_idx = 0; u32_idx < u32s_count; ++u32_idx)
|
||||
/* Increment success counter */
|
||||
rw.InterlockedAdd(4, 1);
|
||||
|
||||
/* Store header */
|
||||
{
|
||||
u32 data = buff.data_u32[u32_idx];
|
||||
rw.Store(base + (u32_idx * 4), data);
|
||||
u32 header = 0;
|
||||
header |= (buff.fmt_size << 0) & 0x0000FFFF;
|
||||
header |= (buff.args_count << 16) & 0xFFFF0000;
|
||||
rw.Store(pos, header); /* pos was seeded from base and is already an absolute byte offset; adding base again misplaces every record after the first */
|
||||
pos += 4;
|
||||
}
|
||||
/* Store chunks */
|
||||
for (u32 chunk_idx = 0; chunk_idx < chunks_count; ++chunk_idx)
|
||||
{
|
||||
u32 chunk = buff.char_chunks[chunk_idx];
|
||||
rw.Store(pos, chunk); /* Absolute offset, same as the header store: chunks must directly follow the header */
|
||||
pos += 4;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Increment overflow counter */
|
||||
rw.InterlockedAdd(8, 1);
|
||||
}
|
||||
}
|
||||
|
||||
#define G_Print(fmt) do { \
|
||||
G_TempPrintBuffer __tmp; \
|
||||
__tmp.byte_pos = 0; \
|
||||
u32 __pos = 0; \
|
||||
while (U32FromChar(fmt[__pos]) != 0) \
|
||||
{ \
|
||||
G_PushPrintChar(__tmp, U32FromChar(fmt[__pos])); \
|
||||
++__pos; \
|
||||
} \
|
||||
G_PushPrintChar(__tmp, 0); \
|
||||
G_CommitPrint(__tmp); \
|
||||
/* Emits a print record that contains only the characters of fmt (no arguments) */
#define G_Print(fmt) do { \
|
||||
G_TempPrintBuffer __tmp; \
|
||||
__tmp.char_pos = 0; \
|
||||
__tmp.args_count = 0; /* G_CommitPrint packs args_count into the record header, so it must not be left uninitialized */ \
|
||||
u32 __pos = 0; \
|
||||
while (U32FromChar(fmt[__pos]) != 0) \
|
||||
{ \
|
||||
G_PushPrintChar(__tmp, U32FromChar(fmt[__pos])); \
|
||||
++__pos; \
|
||||
} \
|
||||
__tmp.fmt_size = __tmp.char_pos; \
|
||||
G_CommitPrint(__tmp); \
|
||||
} while (0)
|
||||
|
||||
#define G_PrintF(fmt, ...) do { \
|
||||
G_TempPrintBuffer __tmp; \
|
||||
__tmp.char_pos = 0; \
|
||||
u32 __pos = 0; \
|
||||
while (U32FromChar(fmt[__pos]) != 0) \
|
||||
{ \
|
||||
G_PushPrintChar(__tmp, U32FromChar(fmt[__pos])); \
|
||||
++__pos; \
|
||||
} \
|
||||
G_FmtArg __args[] = { __VA_ARGS__ }; \
|
||||
__tmp.fmt_size = __tmp.char_pos; \
|
||||
__tmp.args_count = countof(__args); \
|
||||
for (u32 __arg_idx = 0; __arg_idx < countof(__args); ++__arg_idx) \
|
||||
{ \
|
||||
G_PushPrintChar(__tmp, __args[__arg_idx].kind); \
|
||||
G_PushPrintChar(__tmp, __args[__arg_idx].v >> 0); \
|
||||
G_PushPrintChar(__tmp, __args[__arg_idx].v >> 8); \
|
||||
G_PushPrintChar(__tmp, __args[__arg_idx].v >> 16); \
|
||||
G_PushPrintChar(__tmp, __args[__arg_idx].v >> 24); \
|
||||
} \
|
||||
G_CommitPrint(__tmp); \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
#define G_Print(fmt)
|
||||
#define G_PrintF(fmt)
|
||||
#endif
|
||||
|
||||
@ -145,7 +145,10 @@ PixelShader(UI_BlitPS, UI_BlitPSOutput, UI_BlitPSInput input)
|
||||
Vec2 uv = input.src_uv;
|
||||
Vec4 result = tex.Sample(sampler, uv);
|
||||
|
||||
G_Print("Hello there!");
|
||||
// G_Print("Hello there!");
|
||||
G_PrintF("Hello there: \"%F\"", G_Fmt(3.123));
|
||||
|
||||
|
||||
|
||||
UI_BlitPSOutput output;
|
||||
output.SV_Target0 = result;
|
||||
|
||||
Loading…
Reference in New Issue
Block a user