////////////////////////////////////////////////////////////
//~ Ref types

// Kinds of GPU object that a ref can name.
Enum(G_RefKind)
{
  G_RefKind_StructuredBuffer,
  G_RefKind_ByteAddressBuffer,
  G_RefKind_Texture1D,
  G_RefKind_Texture2D,
  G_RefKind_Texture3D,
  G_RefKind_SamplerState,
};

// Typed wrappers around a single u32. On the GPU the value is used as an index
// into ResourceDescriptorHeap / SamplerDescriptorHeap (see "Resource dereference").
Struct(G_StructuredBufferRef)  { u32 v; };
Struct(G_ByteAddressBufferRef) { u32 v; };
Struct(G_Texture1DRef)         { u32 v; };
Struct(G_Texture2DRef)         { u32 v; };
Struct(G_Texture3DRef)         { u32 v; };
Struct(G_SamplerStateRef)      { u32 v; };

// A ref whose value is 0 is considered nil.
#define G_IsRefNil(r) ((r).v == 0)

////////////////////////////////////////////////////////////
//~ Constant types

//
// D3D12 exposes 64 root constants and Vulkan exposes 32 push constants.
// Supposedly AMD hardware will start spilling constants once more than
// 12 are in use - https://gpuopen.com/learn/rdna-performance-guide/
//
#define G_NumGeneralPurposeConstants (24) // Constants available for any usage
#define G_NumReservedConstants (4)        // Constants reserved for internal usage by the GPU layer
#define G_NumConstants (G_NumGeneralPurposeConstants + G_NumReservedConstants)

// G_DeclConstant(type, name, slot) declares a shader constant.
//  - CPU: `name` becomes an enum constant equal to `slot`, and a wrapper struct
//    `name##__shaderconstanttype` records the constant's type. The type must fit
//    in 4 bytes and the slot must be a general purpose slot.
//  - GPU: `name` becomes a cbuffer bound to register b<slot> holding one `type`.
// G_ForceDeclConstant skips the CPU-side checks so that reserved slots can be declared.
#if IsCpu
  #define G_ForceDeclConstant(type, name, slot) \
    enum { name = slot }; \
    Struct(name##__shaderconstanttype) { type v; }
  #define G_DeclConstant(type, name, slot) \
    StaticAssert(sizeof(type) <= 4); \
    StaticAssert(slot < G_NumGeneralPurposeConstants); \
    G_ForceDeclConstant(type, name, slot)
#else
  #define G_ForceDeclConstant(type, name, slot) cbuffer name : register(b##slot) { type name; }
  #define G_DeclConstant(type, name, slot) G_ForceDeclConstant(type, name, slot)
#endif

////////////////////////////////////////////////////////////
//~ Reserved constants

// The constants declared below assume this configuration is accurate for slot usage
StaticAssert(G_NumGeneralPurposeConstants == 24);
StaticAssert(G_NumReservedConstants >= 3);

G_ForceDeclConstant(G_ByteAddressBufferRef, G_ShaderConst_PrintBufferRef, 24);
G_ForceDeclConstant(b32,                    G_ShaderConst_TweakB32,       25);
G_ForceDeclConstant(f32,                    G_ShaderConst_TweakF32,       26);

#if IsGpu
  #define G_TweakBool G_ShaderConst_TweakB32
  #define G_TweakFloat G_ShaderConst_TweakF32
#endif

////////////////////////////////////////////////////////////
//~ Basic samplers

Enum(G_BasicSamplerKind)
{
  G_BasicSamplerKind_PointClamp,
  G_BasicSamplerKind_PointWrap,
  G_BasicSamplerKind_PointMirror,
  G_BasicSamplerKind_BilinearClamp,
  G_BasicSamplerKind_BilinearWrap,
  G_BasicSamplerKind_BilinearMirror,
  G_BasicSamplerKind_TrilinearClamp,
  G_BasicSamplerKind_TrilinearWrap,
  G_BasicSamplerKind_TrilinearMirror,
  G_BasicSamplerKind_COUNT
};

////////////////////////////////////////////////////////////
//~ Resource dereference

#if IsGpu
  // NOTE: Uniform dereferencing is faster than Non-Uniform on AMD hardware

  // The read-only view of a resource is fetched from heap index `r.v`, the
  // writable (RW) view from heap index `r.v + 1`.
  // NOTE(review): this presumably relies on the GPU layer allocating both
  // descriptors in adjacent heap slots - confirm against the descriptor allocator.

  //- Scalar/Uniform dereference
  SamplerState                                G_SDeref(G_SamplerStateRef r)        { return SamplerDescriptorHeap[r.v]; }
  template<typename T> StructuredBuffer<T>    G_SDeref(G_StructuredBufferRef r)    { return ResourceDescriptorHeap[r.v]; }
  ByteAddressBuffer                           G_SDeref(G_ByteAddressBufferRef r)   { return ResourceDescriptorHeap[r.v]; }
  template<typename T> Texture1D<T>           G_SDeref(G_Texture1DRef r)           { return ResourceDescriptorHeap[r.v]; }
  template<typename T> Texture2D<T>           G_SDeref(G_Texture2DRef r)           { return ResourceDescriptorHeap[r.v]; }
  template<typename T> Texture3D<T>           G_SDeref(G_Texture3DRef r)           { return ResourceDescriptorHeap[r.v]; }
  template<typename T> RWStructuredBuffer<T>  G_SDerefRW(G_StructuredBufferRef r)  { return ResourceDescriptorHeap[r.v + 1]; }
  RWByteAddressBuffer                         G_SDerefRW(G_ByteAddressBufferRef r) { return ResourceDescriptorHeap[r.v + 1]; }
  template<typename T> RWTexture1D<T>         G_SDerefRW(G_Texture1DRef r)         { return ResourceDescriptorHeap[r.v + 1]; }
  template<typename T> RWTexture2D<T>         G_SDerefRW(G_Texture2DRef r)         { return ResourceDescriptorHeap[r.v + 1]; }
  template<typename T> RWTexture3D<T>         G_SDerefRW(G_Texture3DRef r)         { return ResourceDescriptorHeap[r.v + 1]; }

  //- Vector/Non-Uniform dereference
  SamplerState                                G_VDeref(G_SamplerStateRef r)        { return SamplerDescriptorHeap[NonUniformResourceIndex(r.v)]; }
  template<typename T> StructuredBuffer<T>    G_VDeref(G_StructuredBufferRef r)    { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v)]; }
  ByteAddressBuffer                           G_VDeref(G_ByteAddressBufferRef r)   { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v)]; }
  template<typename T> Texture1D<T>           G_VDeref(G_Texture1DRef r)           { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v)]; }
  template<typename T> Texture2D<T>           G_VDeref(G_Texture2DRef r)           { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v)]; }
  template<typename T> Texture3D<T>           G_VDeref(G_Texture3DRef r)           { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v)]; }
  template<typename T> RWStructuredBuffer<T>  G_VDerefRW(G_StructuredBufferRef r)  { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 1)]; }
  RWByteAddressBuffer                         G_VDerefRW(G_ByteAddressBufferRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 1)]; }
  template<typename T> RWTexture1D<T>         G_VDerefRW(G_Texture1DRef r)         { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 1)]; }
  template<typename T> RWTexture2D<T>         G_VDerefRW(G_Texture2DRef r)         { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 1)]; }
  template<typename T> RWTexture3D<T>         G_VDerefRW(G_Texture3DRef r)         { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 1)]; }
#endif

////////////////////////////////////////////////////////////
//~ Resource countof

#define G_MaxMips 16
#define G_MaxRenderTargets 8

#if IsGpu
  // countof(resource) returns whatever the resource's GetDimensions() reports:
  // the element count for structured buffers (the stride is queried but discarded),
  // the single reported dimension for byte address buffers and 1D textures,
  // and the per-axis dimensions for 2D/3D textures.

  // StructuredBuffer::GetDimensions takes (numStructs, stride), same as the RW variant
  template<typename T> u32 countof(StructuredBuffer<T> obj)   { u32 result; u32 stride; obj.GetDimensions(result, stride); return result; }
  template<typename T> u32 countof(RWStructuredBuffer<T> obj) { u32 result; u32 stride; obj.GetDimensions(result, stride); return result; }

  u32 countof(ByteAddressBuffer obj)   { u32 result; obj.GetDimensions(result); return result; }
  u32 countof(RWByteAddressBuffer obj) { u32 result; obj.GetDimensions(result); return result; }

  template<typename T> u32 countof(Texture1D<T> obj)   { u32 result; obj.GetDimensions(result); return result; }
  template<typename T> u32 countof(RWTexture1D<T> obj) { u32 result; obj.GetDimensions(result); return result; }

  template<typename T> Vec2U32 countof(Texture2D<T> obj)   { Vec2U32 result; obj.GetDimensions(result.x, result.y); return result; }
  template<typename T> Vec2U32 countof(RWTexture2D<T> obj) { Vec2U32 result; obj.GetDimensions(result.x, result.y); return result; }

  template<typename T> Vec3U32 countof(Texture3D<T> obj)   { Vec3U32 result; obj.GetDimensions(result.x, result.y, result.z); return result; }
  template<typename T> Vec3U32 countof(RWTexture3D<T> obj) { Vec3U32 result; obj.GetDimensions(result.x, result.y, result.z); return result; }
#endif

////////////////////////////////////////////////////////////
//~ Debug printf

// This technique is based on MJP's article - https://therealmjp.github.io/posts/hlsl-printf/

// Kind tag written in front of every serialized print argument.
// The BEGINSIZE<N> entries are markers, not real kinds: a kind that compares
// greater than BEGINSIZE<N> carries at least N 32-bit components, which is how
// G_PrintF_ decides how many components to serialize.
Enum(G_FmtArgKind)
{
  G_FmtArgKind_None,
  G_FmtArgKind_End,

  G_FmtArgKind_BEGINSIZE1,

  G_FmtArgKind_Uint,
  G_FmtArgKind_Sint,
  G_FmtArgKind_Float,

  G_FmtArgKind_BEGINSIZE2,

  G_FmtArgKind_Uint2,
  G_FmtArgKind_Sint2,
  G_FmtArgKind_Float2,

  G_FmtArgKind_BEGINSIZE3,

  G_FmtArgKind_Uint3,
  G_FmtArgKind_Sint3,
  G_FmtArgKind_Float3,

  G_FmtArgKind_BEGINSIZE4,

  G_FmtArgKind_Uint4,
  G_FmtArgKind_Sint4,
  G_FmtArgKind_Float4,
};

// A single print argument: its kind plus up to four raw 32-bit components.
Struct(G_FmtArg)
{
  G_FmtArgKind kind;
  Vec4U32 v;
};

#if IsGpu && GPU_SHADER_PRINT
  //- Argument constructors
  // Only the components implied by `kind` are written; the rest of `v` is left unset.
  // Float values are stored as their raw bit pattern (asuint).
  G_FmtArg G_Fmt(u32 v)     { G_FmtArg result; result.kind = G_FmtArgKind_Uint;   result.v.x = v;            return result; }
  G_FmtArg G_Fmt(Vec2U32 v) { G_FmtArg result; result.kind = G_FmtArgKind_Uint2;  result.v.xy = v.xy;        return result; }
  G_FmtArg G_Fmt(Vec3U32 v) { G_FmtArg result; result.kind = G_FmtArgKind_Uint3;  result.v.xyz = v.xyz;      return result; }
  G_FmtArg G_Fmt(Vec4U32 v) { G_FmtArg result; result.kind = G_FmtArgKind_Uint4;  result.v.xyzw = v.xyzw;    return result; }

  G_FmtArg G_Fmt(i32 v)     { G_FmtArg result; result.kind = G_FmtArgKind_Sint;   result.v.x = v;            return result; }
  G_FmtArg G_Fmt(Vec2I32 v) { G_FmtArg result; result.kind = G_FmtArgKind_Sint2;  result.v.xy = v.xy;        return result; }
  G_FmtArg G_Fmt(Vec3I32 v) { G_FmtArg result; result.kind = G_FmtArgKind_Sint3;  result.v.xyz = v.xyz;      return result; }
  G_FmtArg G_Fmt(Vec4I32 v) { G_FmtArg result; result.kind = G_FmtArgKind_Sint4;  result.v.xyzw = v.xyzw;    return result; }

  G_FmtArg G_Fmt(f32 v)     { G_FmtArg result; result.kind = G_FmtArgKind_Float;  result.v.x = asuint(v);          return result; }
  G_FmtArg G_Fmt(Vec2 v)    { G_FmtArg result; result.kind = G_FmtArgKind_Float2; result.v.xy = asuint(v.xy);      return result; }
  G_FmtArg G_Fmt(Vec3 v)    { G_FmtArg result; result.kind = G_FmtArgKind_Float3; result.v.xyz = asuint(v.xyz);    return result; }
  G_FmtArg G_Fmt(Vec4 v)    { G_FmtArg result; result.kind = G_FmtArgKind_Float4; result.v.xyzw = asuint(v.xyzw);  return result; }

  // Sentinel appended by G_PrintF so that the argument array is never empty.
  G_FmtArg G_FmtEnd(void)   { G_FmtArg result; result.kind = G_FmtArgKind_End; return result; }

  // Per-invocation staging buffer that a print is serialized into before being
  // committed to the shared print buffer. Bytes are packed four per u32 chunk.
  Struct(G_TempPrintBuffer)
  {
    // NOTE: The larger the array size, the longer the compilation time
    u32 byte_chunks[64];
    u32 bytes_count;
    u32 chars_count;
    u32 args_count;
    b32 overflowed;
  };

  // Appends the low 8 bits of `v` to `buff`. Byte N lands in chunk N / 4 at bit
  // offset (N % 4) * 8. Once every chunk is full, the byte is dropped and
  // `buff.overflowed` is set instead.
  void G_PushPrintByte(inout G_TempPrintBuffer buff, u32 v)
  {
    u32 chunk_idx = buff.bytes_count / 4;
    if (chunk_idx < countof(buff.byte_chunks))
    {
      u32 byte_idx_in_chunk = buff.bytes_count & 0x03;
      if (byte_idx_in_chunk == 0)
      {
        // Since buff is not zero initialized, we set the chunk on first write here
        buff.byte_chunks[chunk_idx] = v & 0xFF;
      }
      else
      {
        buff.byte_chunks[chunk_idx] |= (v & 0xFF) << (byte_idx_in_chunk * 8);
      }
      buff.bytes_count += 1;
    }
    else
    {
      buff.overflowed = 1;
    }
  }

  // Copies a staged print into the buffer referenced by G_ShaderConst_PrintBufferRef.
  //
  // Layout of that buffer as accessed here (byte offsets):
  //    0: allocation counter - bytes requested so far, bumped on every commit
  //    4: success counter    - prints that fit and were written
  //    8: overflow counter   - prints dropped because the buffer was full
  //   12: print records, each a 4-byte header followed by the packed byte chunks
  //
  // Header bits: [0,16) chars_count, [16,31) args_count, [31] overflowed.
  // A print that overflowed its staging buffer is committed as a header-only
  // record with zeroed counts and the overflow bit set.
  void G_CommitPrint(G_TempPrintBuffer buff)
  {
    RWByteAddressBuffer rw = G_SDerefRW(G_ShaderConst_PrintBufferRef);

    if (buff.overflowed)
    {
      buff.bytes_count = 0;
      buff.chars_count = 0;
      buff.args_count = 0;
    }

    u32 chunks_count = (buff.bytes_count + 3) / 4;
    u32 alloc_size = 0;
    alloc_size += 4;                // Header
    alloc_size += chunks_count * 4; // Chunks

    // Atomic fetch + add to base counter
    u32 base;
    rw.InterlockedAdd(0, alloc_size, base);
    base += 4; // Offset for allocation counter
    base += 4; // Offset for success counter
    base += 4; // Offset for overflow counter

    if ((base + alloc_size) < countof(rw))
    {
      // Increment success counter
      rw.InterlockedAdd(4, 1);
      u32 pos = 0;

      // Write header
      {
        u32 header = 0;
        header |= (buff.chars_count << 0) & 0x0000FFFF;
        header |= (buff.args_count << 16) & 0x7FFF0000;
        // Only bit 31 belongs to the overflow flag; the mask must not overlap args_count
        header |= (buff.overflowed << 31) & 0x80000000;
        rw.Store(base + pos, header);
        pos += 4;
      }

      // Write chunks
      for (u32 chunk_idx = 0; chunk_idx < chunks_count; ++chunk_idx)
      {
        u32 chunk = buff.byte_chunks[chunk_idx];
        rw.Store(base + pos, chunk);
        pos += 4;
      }
    }
    else
    {
      // Increment overflow counter
      rw.InterlockedAdd(8, 1);
    }
  }

  // Serializes `fmt` and its arguments into a temporary buffer, then commits it.
  //  - The characters of `fmt` are pushed first, up to (not including) the terminator.
  //  - Each argument follows as one kind byte plus 4 bytes per component, lowest byte first.
  //  - The variadic list is expected to end with G_FmtEnd(); that final entry is
  //    excluded from args_count and never serialized.
  #define G_PrintF_(fmt, ...) do { \
    G_TempPrintBuffer __tmp; \
    __tmp.bytes_count = 0; \
    __tmp.overflowed = 0; \
    u32 __char_idx = 0; \
    while (U32FromChar(fmt[__char_idx]) != 0) \
    { \
      G_PushPrintByte(__tmp, U32FromChar(fmt[__char_idx])); \
      ++__char_idx; \
    } \
    G_FmtArg __args[] = { __VA_ARGS__ }; \
    __tmp.chars_count = __char_idx; \
    __tmp.args_count = (countof(__args) - 1); \
    for (u32 __arg_idx = 0; __arg_idx < __tmp.args_count; ++__arg_idx) \
    { \
      G_FmtArg __arg = __args[__arg_idx]; \
      G_PushPrintByte(__tmp, __arg.kind); \
      if (__arg.kind > G_FmtArgKind_BEGINSIZE1) \
      { \
        G_PushPrintByte(__tmp, __arg.v.x >> 0); \
        G_PushPrintByte(__tmp, __arg.v.x >> 8); \
        G_PushPrintByte(__tmp, __arg.v.x >> 16); \
        G_PushPrintByte(__tmp, __arg.v.x >> 24); \
      } \
      if (__arg.kind > G_FmtArgKind_BEGINSIZE2) \
      { \
        G_PushPrintByte(__tmp, __arg.v.y >> 0); \
        G_PushPrintByte(__tmp, __arg.v.y >> 8); \
        G_PushPrintByte(__tmp, __arg.v.y >> 16); \
        G_PushPrintByte(__tmp, __arg.v.y >> 24); \
      } \
      if (__arg.kind > G_FmtArgKind_BEGINSIZE3) \
      { \
        G_PushPrintByte(__tmp, __arg.v.z >> 0); \
        G_PushPrintByte(__tmp, __arg.v.z >> 8); \
        G_PushPrintByte(__tmp, __arg.v.z >> 16); \
        G_PushPrintByte(__tmp, __arg.v.z >> 24); \
      } \
      if (__arg.kind > G_FmtArgKind_BEGINSIZE4) \
      { \
        G_PushPrintByte(__tmp, __arg.v.w >> 0); \
        G_PushPrintByte(__tmp, __arg.v.w >> 8); \
        G_PushPrintByte(__tmp, __arg.v.w >> 16); \
        G_PushPrintByte(__tmp, __arg.v.w >> 24); \
      } \
    } \
    G_CommitPrint(__tmp); \
  } while (0)

  // Usage: G_PrintF("text", G_Fmt(a), G_Fmt(b), ...);
  #define G_PrintF(fmt, ...) G_PrintF_(fmt, ##__VA_ARGS__, G_FmtEnd())
#else
  // Printing disabled: expand to nothing. Must stay variadic so that call sites
  // which pass G_Fmt(...) arguments still compile; the arguments are discarded
  // unexpanded, so G_Fmt does not need to exist in this configuration.
  #define G_PrintF(fmt, ...)
#endif