////////////////////////////////////////////////////////////
//~ Gpu memory reference types

typedef u32 G_BaseDescriptorIndex;
Struct(G_BufferRef)  { G_BaseDescriptorIndex v; };
Struct(G_TextureRef) { G_BaseDescriptorIndex v; };
Struct(G_SamplerRef) { G_BaseDescriptorIndex v; };

#define G_MakeBufferRef(_v)  ((G_BufferRef)  { .v = (_v) })
#define G_MakeTextureRef(_v) ((G_TextureRef) { .v = (_v) })
#define G_MakeSamplerRef(_v) ((G_SamplerRef) { .v = (_v) })

#define G_NilBufferRef  G_MakeBufferRef(0)
#define G_NilTextureRef G_MakeTextureRef(0)
#define G_NilSamplerRef G_MakeSamplerRef(0)
#define G_IsRefNil(r) ((r).v == 0)

////////////////////////////////////////////////////////////
//~ Register types
//
// D3D12 exposes up to 64 DWORDs of root constants and Vulkan guarantees at
// least 32 DWORDs (128 bytes) of push constants. Supposedly AMD hardware
// starts spilling constants to memory once more than 12 are in use:
// https://gpuopen.com/learn/rdna-performance-guide/
//
#define G_NumGeneralPurposeRegisters (24) // Registers available for any usage
#define G_NumReservedRegisters       (4)  // Registers reserved for internal usage by the GPU layer
#define G_NumRegisters (G_NumGeneralPurposeRegisters + G_NumReservedRegisters)

#if IsCpu
#define G_ForceDeclRegister(type, name, slot) \
    enum { name = slot }; \
    Struct(CAT(__ShaderRegisterType__,name)) { type v; }
#define G_DeclRegister(type, name, slot) \
    StaticAssert(sizeof(type) <= 4); \
    StaticAssert(slot < G_NumGeneralPurposeRegisters); \
    G_ForceDeclRegister(type, name, slot)
#else
#define G_ForceDeclRegister(type, name, slot) cbuffer name : register(CAT(b,slot)) { type name; }
#define G_DeclRegister(type, name, slot) G_ForceDeclRegister(type, name, slot)
#endif

////////////////////////////////////////////////////////////
//~ Reserved registers

// The reserved slot numbers below assume the exact configuration asserted here
StaticAssert(G_NumGeneralPurposeRegisters == 24);
StaticAssert(G_NumReservedRegisters >= 3);

G_ForceDeclRegister(G_BufferRef, G_ShaderReg_PrintBuffer, 24);
G_ForceDeclRegister(b32,         G_ShaderReg_TweakB32,    25);
G_ForceDeclRegister(f32,         G_ShaderReg_TweakF32,    26);

#if IsGpu
#define G_TweakBool  G_ShaderReg_TweakB32
#define G_TweakFloat G_ShaderReg_TweakF32
#endif
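
// NOTE: Illustrative usage sketch only - `G_ShaderReg_Example` is a
// hypothetical name, not part of this layer. Declaring a register once in a
// shared header makes it visible to both sides:
//
//   G_DeclRegister(u32, G_ShaderReg_Example, 0);
//
// On the CPU, `G_ShaderReg_Example` evaluates to the slot index (0) for use
// when uploading root/push constants; on the GPU, the same line declares
// cbuffer slot b0, and the value is read directly through the register name.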
////////////////////////////////////////////////////////////
//~ Basic samplers

Enum(G_BasicSamplerKind)
{
    G_BasicSamplerKind_PointClamp,
    G_BasicSamplerKind_PointWrap,
    G_BasicSamplerKind_PointMirror,
    G_BasicSamplerKind_BilinearClamp,
    G_BasicSamplerKind_BilinearWrap,
    G_BasicSamplerKind_BilinearMirror,
    G_BasicSamplerKind_TrilinearClamp,
    G_BasicSamplerKind_TrilinearWrap,
    G_BasicSamplerKind_TrilinearMirror,
    G_BasicSamplerKind_COUNT
};

////////////////////////////////////////////////////////////
//~ Basic noise

#define G_BasicNoiseDims VEC3I32(128, 128, 64)

////////////////////////////////////////////////////////////
//~ Index buffers

#define G_IB(_count, _buffer, ...) \
    ((G_IndexBufferDesc) { .count = (_count), .buffer = (_buffer), __VA_ARGS__ })
Struct(G_IndexBufferDesc) { u32 count; G_BufferRef buffer; };

////////////////////////////////////////////////////////////
//~ Resource dereference

#if IsGpu

// TODO: Add explicit uniform-dereference variations, since on AMD hardware
// non-uniform indexing is slower and older driver versions have some related
// shader-compilation bugs

// Descriptors for a resource sit contiguously after its base index: buffers
// get {structured SRV, structured UAV, raw SRV, raw UAV} and textures get an
// {SRV, UAV} pair per mip level. (The raw-SRV offset below assumes this
// layout; the original had both raw derefs at +3, which looks like a typo.)
template<typename T> struct G_DerefImpl;
template<> struct G_DerefImpl< SamplerState >
{ static SamplerState Deref(G_SamplerRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v)]; } };
template<typename T> struct G_DerefImpl< StructuredBuffer<T> >
{ static StructuredBuffer<T> Deref(G_BufferRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 0)]; } };
template<typename T> struct G_DerefImpl< RWStructuredBuffer<T> >
{ static RWStructuredBuffer<T> Deref(G_BufferRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 1)]; } };
template<> struct G_DerefImpl< ByteAddressBuffer >
{ static ByteAddressBuffer Deref(G_BufferRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 2)]; } };
template<> struct G_DerefImpl< RWByteAddressBuffer >
{ static RWByteAddressBuffer Deref(G_BufferRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 3)]; } };
template<typename T> struct G_DerefImpl< Texture1D<T> >
{ static Texture1D<T> Deref(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 0)]; } };
template<typename T> struct G_DerefImpl< Texture2D<T> >
{ static Texture2D<T> Deref(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 0)]; } };
template<typename T> struct G_DerefImpl< Texture3D<T> >
{ static Texture3D<T> Deref(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 0)]; } };
template<typename T> struct G_DerefImpl< RWTexture1D<T> >
{ static RWTexture1D<T> Deref(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 1)]; } };
template<typename T> struct G_DerefImpl< RWTexture2D<T> >
{ static RWTexture2D<T> Deref(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 1)]; } };
template<typename T> struct G_DerefImpl< RWTexture3D<T> >
{ static RWTexture3D<T> Deref(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 1)]; } };

#define G_Deref(ref, type, ...) \
    (G_DerefImpl< type >::Deref((ref), ##__VA_ARGS__))

#endif
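
// NOTE: Illustrative usage sketch - `vtx_ref` and `tex_ref` are hypothetical
// handles. The type argument selects which descriptor the handle resolves to:
//
//   StructuredBuffer<Vec4> verts = G_Deref(vtx_ref, StructuredBuffer<Vec4>);
//   RWTexture2D<f32> dst = G_Deref(tex_ref, RWTexture2D<f32>, 1); // mip 1 UAV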
////////////////////////////////////////////////////////////
//~ Texture dimensions

#define G_MaxMips 16
#define G_MaxRenderTargets 8

#if IsGpu
template<typename T> u32 G_Count1D(Texture1D<T> obj)   { u32 result; obj.GetDimensions(result); return result; }
template<typename T> u32 G_Count1D(RWTexture1D<T> obj) { u32 result; obj.GetDimensions(result); return result; }
template<typename T> Vec2U32 G_Count2D(Texture2D<T> obj)   { Vec2U32 result; obj.GetDimensions(result.x, result.y); return result; }
template<typename T> Vec2U32 G_Count2D(RWTexture2D<T> obj) { Vec2U32 result; obj.GetDimensions(result.x, result.y); return result; }
template<typename T> Vec3U32 G_Count3D(Texture3D<T> obj)   { Vec3U32 result; obj.GetDimensions(result.x, result.y, result.z); return result; }
template<typename T> Vec3U32 G_Count3D(RWTexture3D<T> obj) { Vec3U32 result; obj.GetDimensions(result.x, result.y, result.z); return result; }
#endif
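
// NOTE: Illustrative usage sketch - `dst_ref` and `id` are hypothetical
// locals. The helpers replace GetDimensions boilerplate, e.g. in a
// compute-shader bounds check:
//
//   RWTexture2D<Vec4> dst = G_Deref(dst_ref, RWTexture2D<Vec4>);
//   Vec2U32 dims = G_Count2D(dst);
//   if (id.x >= dims.x || id.y >= dims.y) { return; }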
////////////////////////////////////////////////////////////
//~ Shader printf

Enum(G_FmtArgKind)
{
    G_FmtArgKind_None,
    G_FmtArgKind_End,
    G_FmtArgKind_BEGINSIZE1,
    G_FmtArgKind_Uint,
    G_FmtArgKind_Sint,
    G_FmtArgKind_Float,
    G_FmtArgKind_BEGINSIZE2,
    G_FmtArgKind_Uint2,
    G_FmtArgKind_Sint2,
    G_FmtArgKind_Float2,
    G_FmtArgKind_BEGINSIZE3,
    G_FmtArgKind_Uint3,
    G_FmtArgKind_Sint3,
    G_FmtArgKind_Float3,
    G_FmtArgKind_BEGINSIZE4,
    G_FmtArgKind_Uint4,
    G_FmtArgKind_Sint4,
    G_FmtArgKind_Float4,
};

Struct(G_FmtArg) { G_FmtArgKind kind; Vec4U32 v; };

#if IsGpu && GPU_SHADER_PRINT

G_FmtArg G_Fmt(u32 v)     { G_FmtArg result; result.kind = G_FmtArgKind_Uint;   result.v.x = v; return result; }
G_FmtArg G_Fmt(Vec2U32 v) { G_FmtArg result; result.kind = G_FmtArgKind_Uint2;  result.v.xy = v.xy; return result; }
G_FmtArg G_Fmt(Vec3U32 v) { G_FmtArg result; result.kind = G_FmtArgKind_Uint3;  result.v.xyz = v.xyz; return result; }
G_FmtArg G_Fmt(Vec4U32 v) { G_FmtArg result; result.kind = G_FmtArgKind_Uint4;  result.v.xyzw = v.xyzw; return result; }
G_FmtArg G_Fmt(i32 v)     { G_FmtArg result; result.kind = G_FmtArgKind_Sint;   result.v.x = v; return result; }
G_FmtArg G_Fmt(Vec2I32 v) { G_FmtArg result; result.kind = G_FmtArgKind_Sint2;  result.v.xy = v.xy; return result; }
G_FmtArg G_Fmt(Vec3I32 v) { G_FmtArg result; result.kind = G_FmtArgKind_Sint3;  result.v.xyz = v.xyz; return result; }
G_FmtArg G_Fmt(Vec4I32 v) { G_FmtArg result; result.kind = G_FmtArgKind_Sint4;  result.v.xyzw = v.xyzw; return result; }
G_FmtArg G_Fmt(f32 v)     { G_FmtArg result; result.kind = G_FmtArgKind_Float;  result.v.x = asuint(v); return result; }
G_FmtArg G_Fmt(Vec2 v)    { G_FmtArg result; result.kind = G_FmtArgKind_Float2; result.v.xy = asuint(v.xy); return result; }
G_FmtArg G_Fmt(Vec3 v)    { G_FmtArg result; result.kind = G_FmtArgKind_Float3; result.v.xyz = asuint(v.xyz); return result; }
G_FmtArg G_Fmt(Vec4 v)    { G_FmtArg result; result.kind = G_FmtArgKind_Float4; result.v.xyzw = asuint(v.xyzw); return result; }
G_FmtArg G_FmtEnd(void)   { G_FmtArg result; result.kind = G_FmtArgKind_End; return result; }

Struct(G_TempPrintBuffer)
{
    // NOTE: Large array sizes can increase shader compilation time
    u32 byte_chunks[64];
    u32 bytes_count;
    u32 chars_count;
    u32 args_count;
    b32 overflowed;
};

void G_PushPrintByte(inout G_TempPrintBuffer buff, u32 v)
{
    u32 chunk_idx = buff.bytes_count / 4;
    if (chunk_idx < countof(buff.byte_chunks))
    {
        u32 byte_idx_in_chunk = buff.bytes_count & 0x03;
        if (byte_idx_in_chunk == 0)
        {
            // Since buff is not zero-initialized, the first byte written to a
            // chunk overwrites it instead of OR-ing into garbage
            buff.byte_chunks[chunk_idx] = v & 0xFF;
        }
        else
        {
            buff.byte_chunks[chunk_idx] |= (v & 0xFF) << (byte_idx_in_chunk * 8);
        }
        buff.bytes_count += 1;
    }
    else
    {
        buff.overflowed = 1;
    }
}

void G_CommitPrint(G_TempPrintBuffer buff)
{
    RWByteAddressBuffer rw = G_Deref(G_ShaderReg_PrintBuffer, RWByteAddressBuffer);
    if (buff.overflowed)
    {
        buff.bytes_count = 0;
        buff.chars_count = 0;
        buff.args_count = 0;
    }

    u32 chunks_count = (buff.bytes_count + 3) / 4;
    u32 alloc_size = 0;
    alloc_size += 4;                // Header
    alloc_size += chunks_count * 4; // Chunks

    // Atomically fetch-and-add the allocation counter to claim space
    u32 base;
    rw.InterlockedAdd(0, alloc_size, base);
    base += 4; // Offset for allocation counter
    base += 4; // Offset for success counter
    base += 4; // Offset for overflow counter

    if ((base + alloc_size) < GPU_SHADER_PRINT_BUFFER_SIZE)
    {
        // Increment success counter
        rw.InterlockedAdd(4, 1);

        u32 pos = 0;

        // Write header: [0:15] chars_count, [16:30] args_count, [31] overflowed
        {
            u32 header = 0;
            header |= (buff.chars_count << 0) & 0x0000FFFF;
            header |= (buff.args_count << 16) & 0x7FFF0000;
            header |= (buff.overflowed << 31) & 0x80000000;
            rw.Store(base + pos, header);
            pos += 4;
        }

        // Write chunks
        for (u32 chunk_idx = 0; chunk_idx < chunks_count; ++chunk_idx)
        {
            u32 chunk = buff.byte_chunks[chunk_idx];
            rw.Store(base + pos, chunk);
            pos += 4;
        }
    }
    else
    {
        // Increment overflow counter
        rw.InterlockedAdd(8, 1);
    }
}

#define G_PrintF_(fmt, ...) \
do { \
    G_TempPrintBuffer __tmp; \
    __tmp.bytes_count = 0; \
    __tmp.overflowed = 0; \
    u32 __char_idx = 0; \
    while (U32FromChar(fmt[__char_idx]) != 0) \
    { \
        G_PushPrintByte(__tmp, U32FromChar(fmt[__char_idx])); \
        ++__char_idx; \
    } \
    G_FmtArg __args[] = { __VA_ARGS__ }; \
    __tmp.chars_count = __char_idx; \
    __tmp.args_count = (countof(__args) - 1); \
    for (u32 __arg_idx = 0; __arg_idx < __tmp.args_count; ++__arg_idx) \
    { \
        G_FmtArg __arg = __args[__arg_idx]; \
        G_PushPrintByte(__tmp, __arg.kind); \
        if (__arg.kind > G_FmtArgKind_BEGINSIZE1) \
        { \
            G_PushPrintByte(__tmp, __arg.v.x >> 0); \
            G_PushPrintByte(__tmp, __arg.v.x >> 8); \
            G_PushPrintByte(__tmp, __arg.v.x >> 16); \
            G_PushPrintByte(__tmp, __arg.v.x >> 24); \
        } \
        if (__arg.kind > G_FmtArgKind_BEGINSIZE2) \
        { \
            G_PushPrintByte(__tmp, __arg.v.y >> 0); \
            G_PushPrintByte(__tmp, __arg.v.y >> 8); \
            G_PushPrintByte(__tmp, __arg.v.y >> 16); \
            G_PushPrintByte(__tmp, __arg.v.y >> 24); \
        } \
        if (__arg.kind > G_FmtArgKind_BEGINSIZE3) \
        { \
            G_PushPrintByte(__tmp, __arg.v.z >> 0); \
            G_PushPrintByte(__tmp, __arg.v.z >> 8); \
            G_PushPrintByte(__tmp, __arg.v.z >> 16); \
            G_PushPrintByte(__tmp, __arg.v.z >> 24); \
        } \
        if (__arg.kind > G_FmtArgKind_BEGINSIZE4) \
        { \
            G_PushPrintByte(__tmp, __arg.v.w >> 0); \
            G_PushPrintByte(__tmp, __arg.v.w >> 8); \
            G_PushPrintByte(__tmp, __arg.v.w >> 16); \
            G_PushPrintByte(__tmp, __arg.v.w >> 24); \
        } \
    } \
    G_CommitPrint(__tmp); \
} while (0)

#define G_PrintF(fmt, ...) G_PrintF_(fmt, ##__VA_ARGS__, G_FmtEnd())
#else
// Accept (and discard) the argument list so call sites still compile when
// shader printing is disabled
#define G_PrintF(fmt, ...)
#endif
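
// NOTE: Illustrative usage sketch - `tid` and `depth` are hypothetical shader
// locals. Each argument is wrapped in G_Fmt so its kind tag and raw bits are
// pushed alongside the format string:
//
//   G_PrintF("tid=%u depth=%f", G_Fmt(tid.x), G_Fmt(depth));
//
// The host side is expected to read back the print buffer (allocation,
// success, and overflow counters in the first 12 bytes, then the packed
// records), decode each record's header and argument bytes, and pair the
// arguments with the format string under whatever specifier convention the
// host-side reader implements.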