power_play/src/gpu/gpu_shared.cgh
2026-03-10 18:03:06 -05:00

370 lines
19 KiB
C

////////////////////////////////////////////////////////////
//~ Gpu memory reference types
// Index of the first descriptor that belongs to a resource. The GPU-side buffer
// and texture deref helpers add a per-view offset to this value when indexing
// the resource descriptor heap; samplers index the sampler heap with it directly.
typedef u32 G_BaseDescriptorIndex;
// Typed wrappers around a base descriptor index. Each resource kind gets its own
// struct type, so buffer, texture and sampler references are not interchangeable.
Struct(G_BufferRef) { G_BaseDescriptorIndex v; };
Struct(G_TextureRef) { G_BaseDescriptorIndex v; };
Struct(G_SamplerRef) { G_BaseDescriptorIndex v; };
// Build a reference of the given kind from a raw descriptor index
#define G_MakeBufferRef(_v) ((G_BufferRef) { .v = (_v) })
#define G_MakeTextureRef(_v) ((G_TextureRef) { .v = (_v) })
#define G_MakeSamplerRef(_v) ((G_SamplerRef) { .v = (_v) })
// Index 0 is the nil value for every reference kind
#define G_NilBufferRef G_MakeBufferRef(0)
#define G_NilTextureRef G_MakeTextureRef(0)
#define G_NilSamplerRef G_MakeSamplerRef(0)
// True when a reference of any kind holds the nil index
#define G_IsRefNil(r) ((r).v == 0)
////////////////////////////////////////////////////////////
//~ Constant types
//
// D3D12 allows up to 64 DWORDs of root constants (the root signature size limit),
// while Vulkan only guarantees 128 bytes (32 DWORDs) of push constants.
// Supposedly AMD hardware will start spilling constants once more than
// 12 are in use - https://gpuopen.com/learn/rdna-performance-guide/
//
// Slot budget: general purpose slots come first, reserved slots follow them
#define G_NumGeneralPurposeConstants (24) // Constants available for any usage
#define G_NumReservedConstants (4) // Constants reserved for internal usage by the GPU layer
#define G_NumConstants (G_NumGeneralPurposeConstants + G_NumReservedConstants)
#if IsCpu
// CPU side: `name` becomes an enum constant equal to its slot, and a companion
// struct named __ShaderConstantType_<name> (assuming CAT pastes tokens) records
// the constant's type. No slot range check is performed by this macro.
#define G_ForceDeclConstant(type, name, slot) \
enum { name = slot }; \
Struct(CAT(__ShaderConstantType_,name)) { type v; }
// Checked declaration: the type must fit in 4 bytes and the slot must lie in
// the general purpose range before the declaration is forwarded.
#define G_DeclConstant(type, name, slot) \
StaticAssert(sizeof(type) <= 4); \
StaticAssert(slot < G_NumGeneralPurposeConstants); \
G_ForceDeclConstant(type, name, slot)
#else
// GPU side: each constant is a cbuffer bound to register b<slot> that holds a
// single member named `name`. Neither macro performs checks on this side.
#define G_ForceDeclConstant(type, name, slot) cbuffer name : register(CAT(b,slot)) { type name; }
#define G_DeclConstant(type, name, slot) G_ForceDeclConstant(type, name, slot)
#endif
////////////////////////////////////////////////////////////
//~ Reserved constants
// The constants declared below assume this configuration is accurate for slot usage
// The hard-coded slots 24..26 below are the first three slots after the general
// purpose range, so the range must be exactly 24 wide and at least three
// reserved slots must exist.
StaticAssert(G_NumGeneralPurposeConstants == 24);
StaticAssert(G_NumReservedConstants >= 3);
// Declared with G_ForceDeclConstant because these slots lie outside the general
// purpose range that G_DeclConstant asserts on.
// Buffer that G_CommitPrint writes shader print output into
G_ForceDeclConstant(G_BufferRef, G_ShaderConst_PrintBuffer, 24);
// Tweak values exposed to shaders through the aliases below
// NOTE(review): how the CPU side populates them is not visible in this section
G_ForceDeclConstant(b32, G_ShaderConst_TweakB32, 25);
G_ForceDeclConstant(f32, G_ShaderConst_TweakF32, 26);
#if IsGpu
// Shorter shader-side names for the tweak constants
#define G_TweakBool G_ShaderConst_TweakB32
#define G_TweakFloat G_ShaderConst_TweakF32
#endif
////////////////////////////////////////////////////////////
//~ Basic samplers
// Identifiers for the basic samplers. Judging by the names, each entry pairs a
// filter (Point / Bilinear / Trilinear) with an address mode (Clamp / Wrap /
// Mirror); the sampler descriptions themselves are not defined in this section.
Enum(G_BasicSamplerKind)
{
G_BasicSamplerKind_PointClamp,
G_BasicSamplerKind_PointWrap,
G_BasicSamplerKind_PointMirror,
G_BasicSamplerKind_BilinearClamp,
G_BasicSamplerKind_BilinearWrap,
G_BasicSamplerKind_BilinearMirror,
G_BasicSamplerKind_TrilinearClamp,
G_BasicSamplerKind_TrilinearWrap,
G_BasicSamplerKind_TrilinearMirror,
// Last entry: equals the number of kinds above, assuming Enum() numbers
// enumerators from zero like a plain enum
G_BasicSamplerKind_COUNT
};
////////////////////////////////////////////////////////////
//~ Resource dereference
#if IsGpu
// NOTE: Uniform dereferencing is faster than Non-Uniform on AMD hardware
//- Scalar/Uniform dereference
// Turns a reference into a resource object by indexing the descriptor heaps
// with a plain (uniform) index.
//
// Descriptor offsets relative to the reference's base index:
//   Buffers:  +0 StructuredBuffer, +1 RWStructuredBuffer, +2 ByteAddressBuffer, +3 RWByteAddressBuffer
//   Textures: +(mip * 2) + 0 read-only view of `mip`, +(mip * 2) + 1 read-write view of `mip`
//   Samplers: the base index itself, looked up in SamplerDescriptorHeap
//
// The dimensionality in the function name (1D/2D/3D) and the template argument
// select the returned resource type; the same descriptor slot is read either way.
template<typename T> StructuredBuffer<T> G_SDeref(G_BufferRef r) { return ResourceDescriptorHeap[r.v + 0]; }
template<typename T> RWStructuredBuffer<T> G_SDerefRW(G_BufferRef r) { return ResourceDescriptorHeap[r.v + 1]; }
ByteAddressBuffer G_SDerefRaw(G_BufferRef r) { return ResourceDescriptorHeap[r.v + 2]; }
RWByteAddressBuffer G_SDerefRawRW(G_BufferRef r) { return ResourceDescriptorHeap[r.v + 3]; }
template<typename T> Texture1D<T> G_SDeref1D(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[r.v + (mip * 2) + 0]; }
template<typename T> Texture2D<T> G_SDeref2D(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[r.v + (mip * 2) + 0]; }
template<typename T> Texture3D<T> G_SDeref3D(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[r.v + (mip * 2) + 0]; }
template<typename T> RWTexture1D<T> G_SDerefRW1D(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[r.v + (mip * 2) + 1]; }
template<typename T> RWTexture2D<T> G_SDerefRW2D(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[r.v + (mip * 2) + 1]; }
template<typename T> RWTexture3D<T> G_SDerefRW3D(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[r.v + (mip * 2) + 1]; }
SamplerState G_SDeref(G_SamplerRef r) { return SamplerDescriptorHeap[r.v]; }
//- Vector/Non-Uniform dereference
// Same lookups and descriptor offsets as the scalar variants, but every heap
// index is wrapped in NonUniformResourceIndex(). Use these when the reference
// (or mip) is not guaranteed to be the same for all threads executing together.
//   Buffers:  +0 StructuredBuffer, +1 RWStructuredBuffer, +2 ByteAddressBuffer, +3 RWByteAddressBuffer
//   Textures: +(mip * 2) + 0 read-only view of `mip`, +(mip * 2) + 1 read-write view of `mip`
//   Samplers: the base index itself, looked up in SamplerDescriptorHeap
template<typename T> StructuredBuffer<T> G_VDeref(G_BufferRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 0)]; }
template<typename T> RWStructuredBuffer<T> G_VDerefRW(G_BufferRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 1)]; }
ByteAddressBuffer G_VDerefRaw(G_BufferRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 2)]; }
RWByteAddressBuffer G_VDerefRawRW(G_BufferRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 3)]; }
template<typename T> Texture1D<T> G_VDeref1D(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 0)]; }
template<typename T> Texture2D<T> G_VDeref2D(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 0)]; }
template<typename T> Texture3D<T> G_VDeref3D(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 0)]; }
template<typename T> RWTexture1D<T> G_VDerefRW1D(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 1)]; }
template<typename T> RWTexture2D<T> G_VDerefRW2D(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 1)]; }
template<typename T> RWTexture3D<T> G_VDerefRW3D(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 1)]; }
SamplerState G_VDeref(G_SamplerRef r) { return SamplerDescriptorHeap[NonUniformResourceIndex(r.v)]; }
// //- Scalar/Uniform dereference
// ByteAddressBuffer G_SDerefRaw(G_BufferRef r) { return ResourceDescriptorHeap[r.v + 2]; }
// RWByteAddressBuffer G_SDerefRawRW(G_BufferRef r) { return ResourceDescriptorHeap[r.v + 3]; }
// template<typename T> StructuredBuffer<T> G_SDeref(G_BufferRef r) { return ResourceDescriptorHeap[r.v + 0]; }
// template<typename T> RWStructuredBuffer<T> G_SDerefRW(G_BufferRef r) { return ResourceDescriptorHeap[r.v + 1]; }
// template<typename T> Texture1D<T> G_SDeref(G_Texture1DRef r) { return ResourceDescriptorHeap[r.v + 0]; }
// template<typename T> Texture2D<T> G_SDeref(G_Texture2DRef r) { return ResourceDescriptorHeap[r.v + 0]; }
// template<typename T> Texture3D<T> G_SDeref(G_Texture3DRef r) { return ResourceDescriptorHeap[r.v + 0]; }
// template<typename T> RWTexture1D<T> G_SDerefRW(G_Texture1DRef r) { return ResourceDescriptorHeap[r.v + 1]; }
// template<typename T> RWTexture2D<T> G_SDerefRW(G_Texture2DRef r) { return ResourceDescriptorHeap[r.v + 1]; }
// template<typename T> RWTexture3D<T> G_SDerefRW(G_Texture3DRef r) { return ResourceDescriptorHeap[r.v + 1]; }
// SamplerState G_SDeref(G_SamplerStateRef r) { return SamplerDescriptorHeap[r.v]; }
// //- Vector/Non-Uniform dereference
// ByteAddressBuffer G_VDerefRaw(G_BufferRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 2)]; }
// RWByteAddressBuffer G_VDerefRawRW(G_BufferRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 3)]; }
// template<typename T> StructuredBuffer<T> G_VDeref(G_BufferRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 0)]; }
// template<typename T> RWStructuredBuffer<T> G_VDerefRW(G_BufferRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 1)]; }
// template<typename T> Texture1D<T> G_VDeref(G_Texture1DRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 0)]; }
// template<typename T> Texture2D<T> G_VDeref(G_Texture2DRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 0)]; }
// template<typename T> Texture3D<T> G_VDeref(G_Texture3DRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 0)]; }
// template<typename T> RWTexture1D<T> G_VDerefRW(G_Texture1DRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 1)]; }
// template<typename T> RWTexture2D<T> G_VDerefRW(G_Texture2DRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 1)]; }
// template<typename T> RWTexture3D<T> G_VDerefRW(G_Texture3DRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 1)]; }
// SamplerState G_VDeref(G_SamplerStateRef r) { return SamplerDescriptorHeap[NonUniformResourceIndex(r.v)]; }
#endif
////////////////////////////////////////////////////////////
//~ Resource countof
// Fixed limits shared by the CPU and GPU sides
// NOTE(review): presumably the maximum mip levels per texture and the maximum
// simultaneously bound render targets - their users are outside this section
#define G_MaxMips 16
#define G_MaxRenderTargets 8
#if IsGpu
// countof() overloads for GPU resources, each implemented with the resource's
// GetDimensions() query:
//   - Structured buffers: number of elements (the element stride is queried but discarded)
//   - Byte address buffers: the single dimension reported by GetDimensions()
//   - 1D textures: width
//   - 2D / 3D textures: width/height(/depth) packed into a Vec2U32 / Vec3U32
//
// Both structured buffer overloads pass the stride out-parameter because
// GetDimensions() on structured buffers takes (numStructs, stride).
template<typename T> u32 countof(StructuredBuffer<T> obj) { u32 result; u32 stride; obj.GetDimensions(result, stride); return result; }
template<typename T> u32 countof(RWStructuredBuffer<T> obj) { u32 result; u32 stride; obj.GetDimensions(result, stride); return result; }
u32 countof(ByteAddressBuffer obj) { u32 result; obj.GetDimensions(result); return result; }
u32 countof(RWByteAddressBuffer obj) { u32 result; obj.GetDimensions(result); return result; }
template<typename T> u32 countof(Texture1D<T> obj) { u32 result; obj.GetDimensions(result); return result; }
template<typename T> u32 countof(RWTexture1D<T> obj) { u32 result; obj.GetDimensions(result); return result; }
template<typename T> Vec2U32 countof(Texture2D<T> obj) { Vec2U32 result; obj.GetDimensions(result.x, result.y); return result; }
template<typename T> Vec2U32 countof(RWTexture2D<T> obj) { Vec2U32 result; obj.GetDimensions(result.x, result.y); return result; }
template<typename T> Vec3U32 countof(Texture3D<T> obj) { Vec3U32 result; obj.GetDimensions(result.x, result.y, result.z); return result; }
template<typename T> Vec3U32 countof(RWTexture3D<T> obj) { Vec3U32 result; obj.GetDimensions(result.x, result.y, result.z); return result; }
#endif
////////////////////////////////////////////////////////////
//~ Debug printf
// This technique is based on MJP's article - https://therealmjp.github.io/posts/hlsl-printf/
// Type tag for a print argument.
//
// The declaration order is significant: the BEGINSIZE<n> entries are markers,
// not real kinds. G_PrintF_ tests `kind > G_FmtArgKind_BEGINSIZE<n>` to decide
// whether the n-th 32-bit component of the argument gets serialized, so every
// kind must stay after the marker matching its component count and before the
// next marker.
Enum(G_FmtArgKind)
{
G_FmtArgKind_None,
// Kind of the argument produced by G_FmtEnd()
G_FmtArgKind_End,
// Kinds below carry 1 component
G_FmtArgKind_BEGINSIZE1,
G_FmtArgKind_Uint,
G_FmtArgKind_Sint,
G_FmtArgKind_Float,
// Kinds below carry 2 components
G_FmtArgKind_BEGINSIZE2,
G_FmtArgKind_Uint2,
G_FmtArgKind_Sint2,
G_FmtArgKind_Float2,
// Kinds below carry 3 components
G_FmtArgKind_BEGINSIZE3,
G_FmtArgKind_Uint3,
G_FmtArgKind_Sint3,
G_FmtArgKind_Float3,
// Kinds below carry 4 components
G_FmtArgKind_BEGINSIZE4,
G_FmtArgKind_Uint4,
G_FmtArgKind_Sint4,
G_FmtArgKind_Float4,
};
// A single tagged print argument
Struct(G_FmtArg)
{
// Tag describing the type and component count of the payload
G_FmtArgKind kind;
// Payload stored as raw 32-bit values. The G_Fmt() overloads only fill the
// components their kind uses, and store floats as bit patterns via asuint().
Vec4U32 v;
};
#if IsGpu && GPU_SHADER_PRINT
// G_Fmt() wraps a value into a tagged print argument. Only the payload
// components used by the value's kind are written; the rest are left unset.
//- Unsigned integers
G_FmtArg G_Fmt(u32 v) { G_FmtArg arg; arg.v.x = v; arg.kind = G_FmtArgKind_Uint; return arg; }
G_FmtArg G_Fmt(Vec2U32 v) { G_FmtArg arg; arg.v.xy = v.xy; arg.kind = G_FmtArgKind_Uint2; return arg; }
G_FmtArg G_Fmt(Vec3U32 v) { G_FmtArg arg; arg.v.xyz = v.xyz; arg.kind = G_FmtArgKind_Uint3; return arg; }
G_FmtArg G_Fmt(Vec4U32 v) { G_FmtArg arg; arg.v.xyzw = v.xyzw; arg.kind = G_FmtArgKind_Uint4; return arg; }
//- Signed integers (stored through the implicit conversion to the unsigned payload)
G_FmtArg G_Fmt(i32 v) { G_FmtArg arg; arg.v.x = v; arg.kind = G_FmtArgKind_Sint; return arg; }
G_FmtArg G_Fmt(Vec2I32 v) { G_FmtArg arg; arg.v.xy = v.xy; arg.kind = G_FmtArgKind_Sint2; return arg; }
G_FmtArg G_Fmt(Vec3I32 v) { G_FmtArg arg; arg.v.xyz = v.xyz; arg.kind = G_FmtArgKind_Sint3; return arg; }
G_FmtArg G_Fmt(Vec4I32 v) { G_FmtArg arg; arg.v.xyzw = v.xyzw; arg.kind = G_FmtArgKind_Sint4; return arg; }
//- Floats (stored as their bit patterns)
G_FmtArg G_Fmt(f32 v) { G_FmtArg arg; arg.v.x = asuint(v); arg.kind = G_FmtArgKind_Float; return arg; }
G_FmtArg G_Fmt(Vec2 v) { G_FmtArg arg; arg.v.xy = asuint(v.xy); arg.kind = G_FmtArgKind_Float2; return arg; }
G_FmtArg G_Fmt(Vec3 v) { G_FmtArg arg; arg.v.xyz = asuint(v.xyz); arg.kind = G_FmtArgKind_Float3; return arg; }
G_FmtArg G_Fmt(Vec4 v) { G_FmtArg arg; arg.v.xyzw = asuint(v.xyzw); arg.kind = G_FmtArgKind_Float4; return arg; }
//- Terminator appended by G_PrintF after the user supplied arguments; carries no payload
G_FmtArg G_FmtEnd(void) { G_FmtArg arg; arg.kind = G_FmtArgKind_End; return arg; }
// Per-invocation staging buffer in which a print message is assembled before
// G_CommitPrint copies it into the shared print buffer.
Struct(G_TempPrintBuffer)
{
// NOTE: The larger the array size, the longer the compilation time
// Message bytes packed four per u32, lowest byte first (64 chunks = 256 bytes)
u32 byte_chunks[64];
// Number of bytes pushed so far
u32 bytes_count;
// Number of format string characters (set by G_PrintF_)
u32 chars_count;
// Number of arguments, excluding the end marker (set by G_PrintF_)
u32 args_count;
// Set to 1 by G_PushPrintByte once a byte did not fit
b32 overflowed;
};
// Appends the low 8 bits of `v` to the temporary print buffer. Bytes are packed
// four per chunk, lowest byte first. When the buffer is already full the byte
// is dropped and `buff.overflowed` is raised instead.
void G_PushPrintByte(inout G_TempPrintBuffer buff, u32 v)
{
  u32 slot = buff.bytes_count / 4;
  if (slot >= countof(buff.byte_chunks))
  {
    // Out of space: remember that the message got truncated
    buff.overflowed = 1;
  }
  else
  {
    u32 byte_value = v & 0xFF;
    u32 lane = buff.bytes_count & 0x03;
    if (lane != 0)
    {
      // Chunk already started: merge the byte into its lane
      buff.byte_chunks[slot] |= byte_value << (lane * 8);
    }
    else
    {
      // First byte of a chunk overwrites it, since buff is never zero initialized
      buff.byte_chunks[slot] = byte_value;
    }
    buff.bytes_count += 1;
  }
}
// Copies an assembled message from the temporary print buffer into the shared
// print buffer referenced by G_ShaderConst_PrintBuffer.
//
// Shared buffer layout as used here (byte offsets):
//    0: allocation counter - bytes requested so far; bumped atomically by every
//       call, including calls whose message ends up not fitting
//    4: success counter    - number of messages that were written
//    8: overflow counter   - number of messages dropped for lack of space
//   12: message data; each message is a 4 byte header followed by its chunks
//
// Header bits: [0,16) chars_count, [16,31) args_count, [31] set when the
// temporary buffer overflowed while the message was being built.
//
// `buff` is taken by value, so the resets below do not affect the caller.
void G_CommitPrint(G_TempPrintBuffer buff)
{
  RWByteAddressBuffer rw = G_SDerefRawRW(G_ShaderConst_PrintBuffer);
  if (buff.overflowed)
  {
    // A truncated message is not worth emitting: drop its contents so only a
    // header carrying the overflow flag gets written
    buff.bytes_count = 0;
    buff.chars_count = 0;
    buff.args_count = 0;
  }
  u32 chunks_count = (buff.bytes_count + 3) / 4;
  u32 alloc_size = 0;
  alloc_size += 4; // Header
  alloc_size += chunks_count * 4; // Chunks
  // Atomic fetch + add to base counter
  u32 base;
  rw.InterlockedAdd(0, alloc_size, base);
  base += 4; // Offset for allocation counter
  base += 4; // Offset for success counter
  base += 4; // Offset for overflow counter
  if ((base + alloc_size) < countof(rw))
  {
    // Increment success counter
    rw.InterlockedAdd(4, 1);
    u32 pos = 0;
    // Write header
    {
      u32 header = 0;
      header |= (buff.chars_count << 0) & 0x0000FFFF;
      header |= (buff.args_count << 16) & 0x7FFF0000;
      // The flag occupies bit 31 only, so its mask must not overlap args_count
      header |= (buff.overflowed << 31) & 0x80000000;
      rw.Store(base + pos, header);
      pos += 4;
    }
    // Write chunks
    for (u32 chunk_idx = 0; chunk_idx < chunks_count; ++chunk_idx)
    {
      u32 chunk = buff.byte_chunks[chunk_idx];
      rw.Store(base + pos, chunk);
      pos += 4;
    }
  }
  else
  {
    // Increment overflow counter
    rw.InterlockedAdd(8, 1);
  }
}
// G_PrintF_(fmt, args..., G_FmtEnd()) serializes one message into a temporary
// buffer and commits it with G_CommitPrint:
//   1. Every character of `fmt` up to (not including) the terminating zero is
//      pushed as one byte; the character count becomes chars_count.
//   2. The variadic arguments are collected into a G_FmtArg array. The last
//      element is expected to be the end marker, so args_count is the array
//      length minus one and the marker itself is never serialized.
//   3. For each argument the kind is pushed as one byte, followed by each used
//      32-bit component as four bytes, least significant byte first. The
//      `kind > BEGINSIZE<n>` tests select how many components are used.
// Pushes that do not fit set the overflow flag, which G_CommitPrint handles.
// No comments are placed inside the macro body because of the line continuations.
#define G_PrintF_(fmt, ...) do { \
G_TempPrintBuffer __tmp; \
__tmp.bytes_count = 0; \
__tmp.overflowed = 0; \
u32 __char_idx = 0; \
while (U32FromChar(fmt[__char_idx]) != 0) \
{ \
G_PushPrintByte(__tmp, U32FromChar(fmt[__char_idx])); \
++__char_idx; \
} \
G_FmtArg __args[] = { __VA_ARGS__ }; \
__tmp.chars_count = __char_idx; \
__tmp.args_count = (countof(__args) - 1); \
for (u32 __arg_idx = 0; __arg_idx < __tmp.args_count; ++__arg_idx) \
{ \
G_FmtArg __arg = __args[__arg_idx]; \
G_PushPrintByte(__tmp, __arg.kind); \
if (__arg.kind > G_FmtArgKind_BEGINSIZE1) \
{ \
G_PushPrintByte(__tmp, __arg.v.x >> 0); \
G_PushPrintByte(__tmp, __arg.v.x >> 8); \
G_PushPrintByte(__tmp, __arg.v.x >> 16); \
G_PushPrintByte(__tmp, __arg.v.x >> 24); \
} \
if (__arg.kind > G_FmtArgKind_BEGINSIZE2) \
{ \
G_PushPrintByte(__tmp, __arg.v.y >> 0); \
G_PushPrintByte(__tmp, __arg.v.y >> 8); \
G_PushPrintByte(__tmp, __arg.v.y >> 16); \
G_PushPrintByte(__tmp, __arg.v.y >> 24); \
} \
if (__arg.kind > G_FmtArgKind_BEGINSIZE3) \
{ \
G_PushPrintByte(__tmp, __arg.v.z >> 0); \
G_PushPrintByte(__tmp, __arg.v.z >> 8); \
G_PushPrintByte(__tmp, __arg.v.z >> 16); \
G_PushPrintByte(__tmp, __arg.v.z >> 24); \
} \
if (__arg.kind > G_FmtArgKind_BEGINSIZE4) \
{ \
G_PushPrintByte(__tmp, __arg.v.w >> 0); \
G_PushPrintByte(__tmp, __arg.v.w >> 8); \
G_PushPrintByte(__tmp, __arg.v.w >> 16); \
G_PushPrintByte(__tmp, __arg.v.w >> 24); \
} \
} \
G_CommitPrint(__tmp); \
} while (0)
// Public entry point: forwards the format string and arguments and appends the
// end marker. Relies on `##__VA_ARGS__` dropping the preceding comma when no
// arguments are given.
#define G_PrintF(fmt, ...) G_PrintF_(fmt, ##__VA_ARGS__, G_FmtEnd())
#else
#define G_PrintF(fmt)
#endif