// (extraction artifact removed: duplicated file-listing metadata "332 lines, 15 KiB, C")
////////////////////////////////////////////////////////////
|
|
//~ Gpu memory reference types
|
|
|
|
// A raw descriptor-heap index; index 0 is reserved as the nil/invalid reference.
typedef u32 G_BaseDescriptorIndex;

// Type-safe wrappers around a descriptor index so buffer, texture, and sampler
// references cannot be accidentally interchanged at call sites.
Struct(G_BufferRef) { G_BaseDescriptorIndex v; };
Struct(G_TextureRef) { G_BaseDescriptorIndex v; };
Struct(G_SamplerRef) { G_BaseDescriptorIndex v; };

// Construct a typed reference from a raw descriptor index.
#define G_MakeBufferRef(_v) ((G_BufferRef) { .v = (_v) })
#define G_MakeTextureRef(_v) ((G_TextureRef) { .v = (_v) })
#define G_MakeSamplerRef(_v) ((G_SamplerRef) { .v = (_v) })

// Nil references use index 0; test with G_IsRefNil below.
#define G_NilBufferRef G_MakeBufferRef(0)
#define G_NilTextureRef G_MakeTextureRef(0)
#define G_NilSamplerRef G_MakeSamplerRef(0)

// True when the reference is nil (index 0); works for any of the three ref types.
#define G_IsRefNil(r) ((r).v == 0)
|
|
|
|
////////////////////////////////////////////////////////////
|
|
//~ Register types
|
|
|
|
//
// D3D12 exposes 64 root constants and Vulkan exposes 32 push constants.
// Supposedly AMD hardware will start spilling constants once more than
// 12 are in use - https://gpuopen.com/learn/rdna-performance-guide/
//
#define G_NumGeneralPurposeRegisters (24) // Registers available for any usage
#define G_NumReservedRegisters (4) // Registers reserved for internal usage by the GPU layer
#define G_NumRegisters (G_NumGeneralPurposeRegisters + G_NumReservedRegisters)

#if IsCpu
// CPU side: a register declaration becomes an enum constant carrying the slot
// index, plus a marker struct that records the register's value type.
// G_ForceDeclRegister performs no validation (used for reserved slots below).
#define G_ForceDeclRegister(type, name, slot) \
enum { name = slot }; \
Struct(CAT(__ShaderRegisterType__,name)) { type v; }
// Checked variant: the value must fit in a single 32-bit register and the
// slot must lie within the general-purpose range.
#define G_DeclRegister(type, name, slot) \
StaticAssert(sizeof(type) <= 4); \
StaticAssert(slot < G_NumGeneralPurposeRegisters); \
G_ForceDeclRegister(type, name, slot)
#else
// GPU side: a register declaration becomes a cbuffer bound at register b<slot>.
#define G_ForceDeclRegister(type, name, slot) cbuffer name : register(CAT(b,slot)) { type name; }
#define G_DeclRegister(type, name, slot) G_ForceDeclRegister(type, name, slot)
#endif
|
|
|
|
////////////////////////////////////////////////////////////
|
|
//~ Reserved registers
|
|
|
|
// The registers declared below assume this configuration is accurate for slot usage
StaticAssert(G_NumGeneralPurposeRegisters == 24);
StaticAssert(G_NumReservedRegisters >= 3);

// Reserved slots sit directly after the general-purpose range (slots 24..26).
// G_ForceDeclRegister is used because G_DeclRegister rejects slots >= 24.
G_ForceDeclRegister(G_BufferRef, G_ShaderReg_PrintBuffer, 24); // destination buffer for shader printf (see G_CommitPrint)
G_ForceDeclRegister(b32, G_ShaderReg_TweakB32, 25); // tweak value, aliased as G_TweakBool on the GPU
G_ForceDeclRegister(f32, G_ShaderReg_TweakF32, 26); // tweak value, aliased as G_TweakFloat on the GPU

#if IsGpu
// Shader-side convenience aliases for the tweak registers.
#define G_TweakBool G_ShaderReg_TweakB32
#define G_TweakFloat G_ShaderReg_TweakF32
#endif
|
|
|
|
////////////////////////////////////////////////////////////
|
|
//~ Basic samplers
|
|
|
|
// Fixed set of commonly-used sampler configurations: a filter mode
// (point / bilinear / trilinear) crossed with an addressing mode
// (clamp / wrap / mirror).
Enum(G_BasicSamplerKind)
{
G_BasicSamplerKind_PointClamp,
G_BasicSamplerKind_PointWrap,
G_BasicSamplerKind_PointMirror,
G_BasicSamplerKind_BilinearClamp,
G_BasicSamplerKind_BilinearWrap,
G_BasicSamplerKind_BilinearMirror,
G_BasicSamplerKind_TrilinearClamp,
G_BasicSamplerKind_TrilinearWrap,
G_BasicSamplerKind_TrilinearMirror,

G_BasicSamplerKind_COUNT
};
|
|
|
|
////////////////////////////////////////////////////////////
|
|
//~ Basic noise
|
|
|
|
// Dimensions of the built-in 3D noise texture (presumably a shared resource
// created by the GPU layer — confirm against the texture-creation code).
#define G_BasicNoiseDims VEC3I32(128, 128, 64)
|
|
|
|
////////////////////////////////////////////////////////////
|
|
//~ Index buffers
|
|
|
|
// Convenience constructor for G_IndexBufferDesc; trailing __VA_ARGS__ permit
// extra designated initializers if fields are added later.
#define G_IB(_count, _buffer, ...) ((G_IndexBufferDesc) { .count = (_count), .buffer = (_buffer), __VA_ARGS__ })

// Describes an index buffer: how many indices it holds and which buffer they live in.
Struct(G_IndexBufferDesc)
{
u32 count;
G_BufferRef buffer;
};
|
|
|
|
////////////////////////////////////////////////////////////
|
|
//~ Resource dereference
|
|
|
|
#if IsGpu
|
|
// TODO: Add explicit uniform-dereference variations, since on AMD hardware
|
|
// non-uniform is slower and there are some related shader-compilation bugs
|
|
// in older driver versions
|
|
|
|
// Maps an HLSL resource type to its slot within a resource's block of
// descriptor-heap entries (fetched via SM6.6 ResourceDescriptorHeap):
//   buffers:  base + 0 = StructuredBuffer (SRV), base + 1 = RWStructuredBuffer (UAV),
//             base + 3 = RWByteAddressBuffer
//   textures: base + mip*2 + 0 = SRV, base + mip*2 + 1 = UAV (one pair per mip)
// All lookups go through NonUniformResourceIndex — see the TODO above about
// adding explicit uniform variants.
template<typename R> struct G_DerefImpl;
template<> struct G_DerefImpl< SamplerState > { static SamplerState Deref(G_SamplerRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v)]; } };
template<typename T> struct G_DerefImpl< StructuredBuffer<T> > { static StructuredBuffer<T> Deref(G_BufferRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 0)]; } };
template<typename T> struct G_DerefImpl< RWStructuredBuffer<T> > { static RWStructuredBuffer<T> Deref(G_BufferRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 1)]; } };
// NOTE(review): ByteAddressBuffer and RWByteAddressBuffer both resolve to
// offset +3. Given the +0 (SRV) / +1 (UAV) pattern used for structured
// buffers, the read-only ByteAddressBuffer looks like it should be +2 —
// verify against the CPU-side descriptor allocation before relying on it.
template<> struct G_DerefImpl< ByteAddressBuffer > { static ByteAddressBuffer Deref(G_BufferRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 3)]; } };
template<> struct G_DerefImpl< RWByteAddressBuffer > { static RWByteAddressBuffer Deref(G_BufferRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 3)]; } };
template<typename T> struct G_DerefImpl< Texture1D<T> > { static Texture1D<T> Deref(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 0)]; } };
template<typename T> struct G_DerefImpl< Texture2D<T> > { static Texture2D<T> Deref(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 0)]; } };
template<typename T> struct G_DerefImpl< Texture3D<T> > { static Texture3D<T> Deref(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 0)]; } };
template<typename T> struct G_DerefImpl< RWTexture1D<T> > { static RWTexture1D<T> Deref(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 1)]; } };
template<typename T> struct G_DerefImpl< RWTexture2D<T> > { static RWTexture2D<T> Deref(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 1)]; } };
template<typename T> struct G_DerefImpl< RWTexture3D<T> > { static RWTexture3D<T> Deref(G_TextureRef r, u32 mip=0) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 1)]; } };

// Usage: G_Deref(ref, Texture2D<float4>) or G_Deref(ref, RWTexture2D<float4>, mip).
#define G_Deref(ref, type, ...) (G_DerefImpl< type >::Deref((ref), ##__VA_ARGS__))
|
|
#endif
|
|
|
|
////////////////////////////////////////////////////////////
|
|
//~ Texture dimensions
|
|
|
|
#define G_MaxMips 16
#define G_MaxRenderTargets 8

#if IsGpu
// Value-returning wrappers around HLSL's out-parameter GetDimensions(),
// for both read-only and read-write texture resources.
template<typename T> u32 G_Count1D(Texture1D<T> tex) { u32 dims; tex.GetDimensions(dims); return dims; }
template<typename T> u32 G_Count1D(RWTexture1D<T> tex) { u32 dims; tex.GetDimensions(dims); return dims; }
template<typename T> Vec2U32 G_Count2D(Texture2D<T> tex) { Vec2U32 dims; tex.GetDimensions(dims.x, dims.y); return dims; }
template<typename T> Vec2U32 G_Count2D(RWTexture2D<T> tex) { Vec2U32 dims; tex.GetDimensions(dims.x, dims.y); return dims; }
template<typename T> Vec3U32 G_Count3D(Texture3D<T> tex) { Vec3U32 dims; tex.GetDimensions(dims.x, dims.y, dims.z); return dims; }
template<typename T> Vec3U32 G_Count3D(RWTexture3D<T> tex) { Vec3U32 dims; tex.GetDimensions(dims.x, dims.y, dims.z); return dims; }
#endif
|
|
|
|
////////////////////////////////////////////////////////////
|
|
//~ Shader printf
|
|
|
|
// Tag identifying the type and component count of a packed printf argument.
// The BEGINSIZE* values are not real kinds — they partition the enum so the
// component count of a kind K can be derived by comparison: K > BEGINSIZEn
// means K carries at least n components (see the G_PrintF_ macro).
Enum(G_FmtArgKind)
{
G_FmtArgKind_None,
G_FmtArgKind_End, // sentinel appended by G_PrintF to terminate the arg list

G_FmtArgKind_BEGINSIZE1,

G_FmtArgKind_Uint,
G_FmtArgKind_Sint,
G_FmtArgKind_Float,

G_FmtArgKind_BEGINSIZE2,

G_FmtArgKind_Uint2,
G_FmtArgKind_Sint2,
G_FmtArgKind_Float2,

G_FmtArgKind_BEGINSIZE3,

G_FmtArgKind_Uint3,
G_FmtArgKind_Sint3,
G_FmtArgKind_Float3,

G_FmtArgKind_BEGINSIZE4,

G_FmtArgKind_Uint4,
G_FmtArgKind_Sint4,
G_FmtArgKind_Float4,
};

// One packed printf argument: a kind tag plus up to four 32-bit components
// (floats are stored bit-cast via asuint, see the G_Fmt overloads).
Struct(G_FmtArg)
{
G_FmtArgKind kind;
Vec4U32 v;
};
|
|
|
|
#if IsGpu && GPU_SHADER_PRINT
|
|
// Pack a printable value into a G_FmtArg. The kind tag records the element
// type and component count; payload components go into v. Floats are
// bit-cast with asuint so the reader can reinterpret the exact bits.
G_FmtArg G_Fmt(u32 x)     { G_FmtArg arg; arg.kind = G_FmtArgKind_Uint;   arg.v.x = x; return arg; }
G_FmtArg G_Fmt(Vec2U32 x) { G_FmtArg arg; arg.kind = G_FmtArgKind_Uint2;  arg.v.xy = x.xy; return arg; }
G_FmtArg G_Fmt(Vec3U32 x) { G_FmtArg arg; arg.kind = G_FmtArgKind_Uint3;  arg.v.xyz = x.xyz; return arg; }
G_FmtArg G_Fmt(Vec4U32 x) { G_FmtArg arg; arg.kind = G_FmtArgKind_Uint4;  arg.v.xyzw = x.xyzw; return arg; }

G_FmtArg G_Fmt(i32 x)     { G_FmtArg arg; arg.kind = G_FmtArgKind_Sint;   arg.v.x = x; return arg; }
G_FmtArg G_Fmt(Vec2I32 x) { G_FmtArg arg; arg.kind = G_FmtArgKind_Sint2;  arg.v.xy = x.xy; return arg; }
G_FmtArg G_Fmt(Vec3I32 x) { G_FmtArg arg; arg.kind = G_FmtArgKind_Sint3;  arg.v.xyz = x.xyz; return arg; }
G_FmtArg G_Fmt(Vec4I32 x) { G_FmtArg arg; arg.kind = G_FmtArgKind_Sint4;  arg.v.xyzw = x.xyzw; return arg; }

G_FmtArg G_Fmt(f32 x)     { G_FmtArg arg; arg.kind = G_FmtArgKind_Float;  arg.v.x = asuint(x); return arg; }
G_FmtArg G_Fmt(Vec2 x)    { G_FmtArg arg; arg.kind = G_FmtArgKind_Float2; arg.v.xy = asuint(x.xy); return arg; }
G_FmtArg G_Fmt(Vec3 x)    { G_FmtArg arg; arg.kind = G_FmtArgKind_Float3; arg.v.xyz = asuint(x.xyz); return arg; }
G_FmtArg G_Fmt(Vec4 x)    { G_FmtArg arg; arg.kind = G_FmtArgKind_Float4; arg.v.xyzw = asuint(x.xyzw); return arg; }

// List terminator appended automatically by G_PrintF.
G_FmtArg G_FmtEnd(void)   { G_FmtArg arg; arg.kind = G_FmtArgKind_End; return arg; }
|
|
|
|
// Shader-local staging buffer for one print record: filled byte-by-byte via
// G_PushPrintByte, then flushed to the global print buffer by G_CommitPrint.
Struct(G_TempPrintBuffer)
{
// NOTE: Large array sizes can increase shader compilation time
u32 byte_chunks[64]; // payload bytes packed 4-per-u32, low byte first (256 bytes max)
u32 bytes_count; // total bytes pushed into byte_chunks
u32 chars_count; // number of format-string characters in the payload
u32 args_count; // number of serialized arguments (excludes the End sentinel)
b32 overflowed; // set when a push would exceed byte_chunks capacity
};
|
|
|
|
// Append the low 8 bits of v to the staging buffer, packing four bytes into
// each u32 chunk (low byte first). On capacity exhaustion the byte is dropped
// and the overflow flag is raised; bytes_count is not advanced.
void G_PushPrintByte(inout G_TempPrintBuffer buff, u32 v)
{
    u32 chunk_idx = buff.bytes_count / 4;
    if (chunk_idx >= countof(buff.byte_chunks))
    {
        buff.overflowed = 1;
        return;
    }

    u32 shift = (buff.bytes_count & 0x03) * 8;
    u32 byte = v & 0xFF;
    if (shift == 0)
    {
        // buff is not zero-initialized, so the first byte of a chunk must
        // assign rather than OR to clear out any stale bits.
        buff.byte_chunks[chunk_idx] = byte;
    }
    else
    {
        buff.byte_chunks[chunk_idx] |= byte << shift;
    }
    buff.bytes_count += 1;
}
|
|
|
|
// Flush one staged print record into the global shader-print buffer referenced
// by G_ShaderReg_PrintBuffer.
//
// Buffer layout (byte offsets): [0] allocation byte counter, [4] success
// counter, [8] overflow counter, then variable-size records. Each record is a
// 4-byte header — chars_count in bits 0-15, args_count in bits 16-30, the
// overflow flag in bit 31 — followed by the packed payload chunks.
void G_CommitPrint(G_TempPrintBuffer buff)
{
    RWByteAddressBuffer rw = G_Deref(G_ShaderReg_PrintBuffer, RWByteAddressBuffer);

    // If the staging buffer overflowed, drop its payload and emit a
    // header-only record with the overflow bit set so the reader can still
    // report the truncation.
    if (buff.overflowed)
    {
        buff.bytes_count = 0;
        buff.chars_count = 0;
        buff.args_count = 0;
    }

    u32 chunks_count = (buff.bytes_count + 3) / 4;
    u32 alloc_size = 0;
    alloc_size += 4; // Header
    alloc_size += chunks_count * 4; // Chunks

    // Atomic fetch + add to base counter; base receives the pre-add value so
    // each record claims a unique region.
    u32 base;
    rw.InterlockedAdd(0, alloc_size, base);
    base += 4; // Offset for allocation counter
    base += 4; // Offset for success counter
    base += 4; // Offset for overflow counter

    if ((base + alloc_size) < GPU_SHADER_PRINT_BUFFER_SIZE)
    {
        // Increment success counter
        rw.InterlockedAdd(4, 1);
        u32 pos = 0;

        // Write header
        {
            u32 header = 0;
            header |= (buff.chars_count << 0) & 0x0000FFFF;
            header |= (buff.args_count << 16) & 0x7FFF0000;
            // FIX: mask was 0xF0000000, which overlaps the args_count field
            // (bits 28-30). The overflow flag occupies bit 31 only.
            header |= (buff.overflowed << 31) & 0x80000000;
            rw.Store(base + pos, header);
            pos += 4;
        }

        // Write chunks
        for (u32 chunk_idx = 0; chunk_idx < chunks_count; ++chunk_idx)
        {
            u32 chunk = buff.byte_chunks[chunk_idx];
            rw.Store(base + pos, chunk);
            pos += 4;
        }
    }
    else
    {
        // Record did not fit in the print buffer: discard it and bump the
        // overflow counter instead.
        rw.InterlockedAdd(8, 1);
    }
}
|
|
|
|
// G_PrintF_(fmt, ...): serialize a format string plus packed G_Fmt(...) args
// into a G_TempPrintBuffer and commit it to the shader print buffer.
//
// Record payload: the raw format-string bytes, then for each argument a
// 1-byte kind tag followed by 4 bytes per vector component. The component
// count is derived by comparing the kind against the BEGINSIZE* markers
// (kind > BEGINSIZEn means component n is present), so the size-1/2/3/4
// blocks below cascade rather than being mutually exclusive.
//
// Callers must append a G_FmtEnd() sentinel; args_count deliberately excludes
// it via (countof(__args) - 1) — G_PrintF below always supplies it, which
// also keeps __args non-empty when no user arguments are given.
// (Comments are kept outside the macro: a // comment before a line
// continuation would swallow the rest of the spliced definition.)
#define G_PrintF_(fmt, ...) \
do { \
G_TempPrintBuffer __tmp; \
__tmp.bytes_count = 0; \
__tmp.overflowed = 0; \
u32 __char_idx = 0; \
while (U32FromChar(fmt[__char_idx]) != 0) \
{ \
G_PushPrintByte(__tmp, U32FromChar(fmt[__char_idx])); \
++__char_idx; \
} \
G_FmtArg __args[] = { __VA_ARGS__ }; \
__tmp.chars_count = __char_idx; \
__tmp.args_count = (countof(__args) - 1); \
for (u32 __arg_idx = 0; __arg_idx < __tmp.args_count; ++__arg_idx) \
{ \
G_FmtArg __arg = __args[__arg_idx]; \
G_PushPrintByte(__tmp, __arg.kind); \
if (__arg.kind > G_FmtArgKind_BEGINSIZE1) \
{ \
G_PushPrintByte(__tmp, __arg.v.x >> 0); \
G_PushPrintByte(__tmp, __arg.v.x >> 8); \
G_PushPrintByte(__tmp, __arg.v.x >> 16); \
G_PushPrintByte(__tmp, __arg.v.x >> 24); \
} \
if (__arg.kind > G_FmtArgKind_BEGINSIZE2) \
{ \
G_PushPrintByte(__tmp, __arg.v.y >> 0); \
G_PushPrintByte(__tmp, __arg.v.y >> 8); \
G_PushPrintByte(__tmp, __arg.v.y >> 16); \
G_PushPrintByte(__tmp, __arg.v.y >> 24); \
} \
if (__arg.kind > G_FmtArgKind_BEGINSIZE3) \
{ \
G_PushPrintByte(__tmp, __arg.v.z >> 0); \
G_PushPrintByte(__tmp, __arg.v.z >> 8); \
G_PushPrintByte(__tmp, __arg.v.z >> 16); \
G_PushPrintByte(__tmp, __arg.v.z >> 24); \
} \
if (__arg.kind > G_FmtArgKind_BEGINSIZE4) \
{ \
G_PushPrintByte(__tmp, __arg.v.w >> 0); \
G_PushPrintByte(__tmp, __arg.v.w >> 8); \
G_PushPrintByte(__tmp, __arg.v.w >> 16); \
G_PushPrintByte(__tmp, __arg.v.w >> 24); \
} \
} \
G_CommitPrint(__tmp); \
} while (0)

// Public entry point; appends the End sentinel automatically.
#define G_PrintF(fmt, ...) G_PrintF_(fmt, ##__VA_ARGS__, G_FmtEnd())
|
|
|
|
#else
|
|
// No-op stub when shader printing is unavailable. Must be variadic so call
// sites written with arguments — e.g. G_PrintF("x: ", G_Fmt(x)) — still
// compile when printing is disabled; the previous single-parameter form
// rejected any call that passed arguments.
#define G_PrintF(fmt, ...)
|
|
#endif
|