power_play/src/gpu/gpu_shared.cgh

////////////////////////////////////////////////////////////
//~ Gpu memory reference types

// Base index of a resource within a descriptor heap. The dereference helpers
// in this file index the heap with this value, plus a per-view offset for
// buffers and textures.
typedef u32 G_BaseDescriptorIndex;

// Reference types. Each one wraps a base descriptor index in a distinct struct
// type per resource class (buffer, texture, sampler).
Struct(G_BufferRef)     { G_BaseDescriptorIndex v; };
Struct(G_TextureRef)    { G_BaseDescriptorIndex v; };
Struct(G_SamplerRef)    { G_BaseDescriptorIndex v; };

// Build a reference from a raw base descriptor index
#define G_MakeBufferRef(_v)  ((G_BufferRef)  { .v = (_v) })
#define G_MakeTextureRef(_v) ((G_TextureRef) { .v = (_v) })
#define G_MakeSamplerRef(_v) ((G_SamplerRef) { .v = (_v) })

// Nil references use index 0
#define G_NilBufferRef  G_MakeBufferRef(0)
#define G_NilTextureRef G_MakeTextureRef(0)
#define G_NilSamplerRef G_MakeSamplerRef(0)

// Non-zero when `r` (any of the reference types above) holds the nil index
#define G_IsRefNil(r) ((r).v == 0)

////////////////////////////////////////////////////////////
//~ Constant types

//
// D3D12 exposes 64 root constants and Vulkan exposes 32 push constants.
// Supposedly AMD hardware will start spilling constants once more than
// 12 are in use - https://gpuopen.com/learn/rdna-performance-guide/
//
// Slots are split into a general purpose range starting at slot 0, followed
// by a reserved range. With the values below that is 24 + 4 = 28 slots total.
//
#define G_NumGeneralPurposeConstants    (24)  // Constants available for any usage
#define G_NumReservedConstants          (4)   // Constants reserved for internal usage by the GPU layer
#define G_NumConstants                  (G_NumGeneralPurposeConstants + G_NumReservedConstants)

#if IsCpu
  // CPU side: `name` becomes an enum constant whose value is the slot index,
  // and a wrapper struct named __ShaderConstantType_<name> records the
  // constant's type. No size or range checks are performed.
  #define G_ForceDeclConstant(type, name, slot)                   \
    enum { name = slot };                                         \
    Struct(CAT(__ShaderConstantType_,name)) { type v; }
  // Checked declaration: the type must fit in 4 bytes and the slot must lie
  // inside the general purpose range.
  #define G_DeclConstant(type, name, slot)                        \
    StaticAssert(sizeof(type) <= 4);                              \
    StaticAssert(slot < G_NumGeneralPurposeConstants);            \
    G_ForceDeclConstant(type, name, slot)
#else
  // Non-CPU side: the constant is declared as a cbuffer bound to register
  // b<slot>, holding a single member that is also called `name`.
  #define G_ForceDeclConstant(type, name, slot) cbuffer name : register(CAT(b,slot)) { type name; }
  // No compile-time checks are applied on this side
  #define G_DeclConstant(type, name, slot) G_ForceDeclConstant(type, name, slot)
#endif

////////////////////////////////////////////////////////////
//~ Reserved constants

// The constants declared below assume this configuration is accurate for slot usage
StaticAssert(G_NumGeneralPurposeConstants == 24);
StaticAssert(G_NumReservedConstants >= 3);

// Reserved slots start directly after the general purpose range (24..26 are in use).
// G_ShaderConst_PrintBuffer is the buffer that G_CommitPrint writes shader prints into.
// NOTE(review): the tweak constants look like debug values supplied by the CPU - confirm against the GPU layer.
G_ForceDeclConstant(G_BufferRef,            G_ShaderConst_PrintBuffer,      24);
G_ForceDeclConstant(b32,                    G_ShaderConst_TweakB32,         25);
G_ForceDeclConstant(f32,                    G_ShaderConst_TweakF32,         26);

#if IsGpu
  // Shorter shader-side aliases for the tweak constants
  #define G_TweakBool  G_ShaderConst_TweakB32
  #define G_TweakFloat G_ShaderConst_TweakF32
#endif

////////////////////////////////////////////////////////////
//~ Basic samplers

// Enumerates the basic sampler configurations. Enumerators are named
// <filter><addressing>: Point/Bilinear/Trilinear combined with Clamp/Wrap/Mirror.
// NOTE(review): how a kind maps to a G_SamplerRef is not defined in this file.
Enum(G_BasicSamplerKind)
{
  G_BasicSamplerKind_PointClamp,
  G_BasicSamplerKind_PointWrap,
  G_BasicSamplerKind_PointMirror,
  G_BasicSamplerKind_BilinearClamp,
  G_BasicSamplerKind_BilinearWrap,
  G_BasicSamplerKind_BilinearMirror,
  G_BasicSamplerKind_TrilinearClamp,
  G_BasicSamplerKind_TrilinearWrap,
  G_BasicSamplerKind_TrilinearMirror,

  // Number of sampler kinds; must remain the last enumerator
  G_BasicSamplerKind_COUNT
};

////////////////////////////////////////////////////////////
//~ Resource dereference

#if IsGpu
  // NOTE: Uniform dereferencing is faster than Non-Uniform on AMD hardware


  //- Scalar/Uniform dereference
  //
  // The heap is indexed with the reference's base index plus a per-view offset:
  //   Buffers:  +0 StructuredBuffer, +1 RWStructuredBuffer, +2 ByteAddressBuffer, +3 RWByteAddressBuffer
  //   Textures: two slots per mip, (mip * 2) + 0 for the read-only view and (mip * 2) + 1 for the RW view
  //   Samplers: SamplerDescriptorHeap is indexed by the reference's index directly
  //
  // The index is used as-is, without NonUniformResourceIndex.
  template<typename T> StructuredBuffer<T>   G_SDeref(G_BufferRef r)                  { return ResourceDescriptorHeap[r.v + 0]; }
  template<typename T> RWStructuredBuffer<T> G_SDerefRW(G_BufferRef r)                { return ResourceDescriptorHeap[r.v + 1]; }
  ByteAddressBuffer                          G_SDerefRaw(G_BufferRef r)               { return ResourceDescriptorHeap[r.v + 2]; }
  RWByteAddressBuffer                        G_SDerefRawRW(G_BufferRef r)             { return ResourceDescriptorHeap[r.v + 3]; }
  template<typename T> Texture1D<T>          G_SDeref1D(G_TextureRef r, u32 mip=0)    { return ResourceDescriptorHeap[r.v + (mip * 2) + 0]; }
  template<typename T> Texture2D<T>          G_SDeref2D(G_TextureRef r, u32 mip=0)    { return ResourceDescriptorHeap[r.v + (mip * 2) + 0]; }
  template<typename T> Texture3D<T>          G_SDeref3D(G_TextureRef r, u32 mip=0)    { return ResourceDescriptorHeap[r.v + (mip * 2) + 0]; }
  template<typename T> RWTexture1D<T>        G_SDerefRW1D(G_TextureRef r, u32 mip=0)  { return ResourceDescriptorHeap[r.v + (mip * 2) + 1]; }
  template<typename T> RWTexture2D<T>        G_SDerefRW2D(G_TextureRef r, u32 mip=0)  { return ResourceDescriptorHeap[r.v + (mip * 2) + 1]; }
  template<typename T> RWTexture3D<T>        G_SDerefRW3D(G_TextureRef r, u32 mip=0)  { return ResourceDescriptorHeap[r.v + (mip * 2) + 1]; }
  SamplerState                               G_SDeref(G_SamplerRef r)                 { return SamplerDescriptorHeap[r.v]; }

  //- Vector/Non-Uniform dereference
  //
  // Same slot layout as the scalar helpers (buffers: +0..+3, textures:
  // (mip * 2) + 0/1, samplers: direct index), but the computed heap index is
  // wrapped in NonUniformResourceIndex.
  template<typename T> StructuredBuffer<T>   G_VDeref(G_BufferRef r)                  { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 0)]; }
  template<typename T> RWStructuredBuffer<T> G_VDerefRW(G_BufferRef r)                { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 1)]; }
  ByteAddressBuffer                          G_VDerefRaw(G_BufferRef r)               { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 2)]; }
  RWByteAddressBuffer                        G_VDerefRawRW(G_BufferRef r)             { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 3)]; }
  template<typename T> Texture1D<T>          G_VDeref1D(G_TextureRef r, u32 mip=0)    { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 0)]; }
  template<typename T> Texture2D<T>          G_VDeref2D(G_TextureRef r, u32 mip=0)    { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 0)]; }
  template<typename T> Texture3D<T>          G_VDeref3D(G_TextureRef r, u32 mip=0)    { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 0)]; }
  template<typename T> RWTexture1D<T>        G_VDerefRW1D(G_TextureRef r, u32 mip=0)  { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 1)]; }
  template<typename T> RWTexture2D<T>        G_VDerefRW2D(G_TextureRef r, u32 mip=0)  { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 1)]; }
  template<typename T> RWTexture3D<T>        G_VDerefRW3D(G_TextureRef r, u32 mip=0)  { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + (mip * 2) + 1)]; }
  SamplerState                               G_VDeref(G_SamplerRef r)                 { return SamplerDescriptorHeap[NonUniformResourceIndex(r.v)]; }


  // //- Scalar/Uniform dereference
  // ByteAddressBuffer                          G_SDerefRaw(G_BufferRef r)            { return ResourceDescriptorHeap[r.v + 2]; }
  // RWByteAddressBuffer                        G_SDerefRawRW(G_BufferRef r)          { return ResourceDescriptorHeap[r.v + 3]; }
  // template<typename T> StructuredBuffer<T>   G_SDeref(G_BufferRef r)               { return ResourceDescriptorHeap[r.v + 0]; }
  // template<typename T> RWStructuredBuffer<T> G_SDerefRW(G_BufferRef r)             { return ResourceDescriptorHeap[r.v + 1]; }
  // template<typename T> Texture1D<T>          G_SDeref(G_Texture1DRef r)            { return ResourceDescriptorHeap[r.v + 0]; }
  // template<typename T> Texture2D<T>          G_SDeref(G_Texture2DRef r)            { return ResourceDescriptorHeap[r.v + 0]; }
  // template<typename T> Texture3D<T>          G_SDeref(G_Texture3DRef r)            { return ResourceDescriptorHeap[r.v + 0]; }
  // template<typename T> RWTexture1D<T>        G_SDerefRW(G_Texture1DRef r)          { return ResourceDescriptorHeap[r.v + 1]; }
  // template<typename T> RWTexture2D<T>        G_SDerefRW(G_Texture2DRef r)          { return ResourceDescriptorHeap[r.v + 1]; }
  // template<typename T> RWTexture3D<T>        G_SDerefRW(G_Texture3DRef r)          { return ResourceDescriptorHeap[r.v + 1]; }
  // SamplerState                               G_SDeref(G_SamplerStateRef r)         { return SamplerDescriptorHeap[r.v]; }

  // //- Vector/Non-Uniform dereference
  // ByteAddressBuffer                          G_VDerefRaw(G_BufferRef r)            { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 2)]; }
  // RWByteAddressBuffer                        G_VDerefRawRW(G_BufferRef r)          { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 3)]; }
  // template<typename T> StructuredBuffer<T>   G_VDeref(G_BufferRef r)               { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 0)]; }
  // template<typename T> RWStructuredBuffer<T> G_VDerefRW(G_BufferRef r)             { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 1)]; }
  // template<typename T> Texture1D<T>          G_VDeref(G_Texture1DRef r)            { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 0)]; }
  // template<typename T> Texture2D<T>          G_VDeref(G_Texture2DRef r)            { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 0)]; }
  // template<typename T> Texture3D<T>          G_VDeref(G_Texture3DRef r)            { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 0)]; }
  // template<typename T> RWTexture1D<T>        G_VDerefRW(G_Texture1DRef r)          { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 1)]; }
  // template<typename T> RWTexture2D<T>        G_VDerefRW(G_Texture2DRef r)          { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 1)]; }
  // template<typename T> RWTexture3D<T>        G_VDerefRW(G_Texture3DRef r)          { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 1)]; }
  // SamplerState                               G_VDeref(G_SamplerStateRef r)         { return SamplerDescriptorHeap[NonUniformResourceIndex(r.v)]; }
#endif

////////////////////////////////////////////////////////////
//~ Resource countof

// NOTE(review): presumably the upper bounds on mip levels per texture and on
// simultaneously bound render targets - neither is used in this file, confirm against the GPU layer.
#define G_MaxMips 16
#define G_MaxRenderTargets 8

#if IsGpu
  // Element count of a read-only structured buffer. GetDimensions on a structured buffer
  // reports both the element count and the stride; the stride is queried and discarded.
  template<typename T> u32      countof(StructuredBuffer<T> obj)    { u32 result; u32 stride; obj.GetDimensions(result, stride); return result; }
  // countof overloads for the remaining resource types, each forwarding to GetDimensions:
  // structured buffers return the element count, raw buffers return the single dimension
  // GetDimensions reports, and textures return one extent per axis.
  template<typename T> u32      countof(RWStructuredBuffer<T> obj)  { u32 elem_count; u32 elem_stride; obj.GetDimensions(elem_count, elem_stride); return elem_count; }
  u32                           countof(ByteAddressBuffer obj)      { u32 size; obj.GetDimensions(size); return size; }
  u32                           countof(RWByteAddressBuffer obj)    { u32 size; obj.GetDimensions(size); return size; }
  template<typename T> u32      countof(Texture1D<T> obj)           { u32 width; obj.GetDimensions(width); return width; }
  template<typename T> u32      countof(RWTexture1D<T> obj)         { u32 width; obj.GetDimensions(width); return width; }
  template<typename T> Vec2U32  countof(Texture2D<T> obj)           { Vec2U32 dims; obj.GetDimensions(dims.x, dims.y); return dims; }
  template<typename T> Vec2U32  countof(RWTexture2D<T> obj)         { Vec2U32 dims; obj.GetDimensions(dims.x, dims.y); return dims; }
  template<typename T> Vec3U32  countof(Texture3D<T> obj)           { Vec3U32 dims; obj.GetDimensions(dims.x, dims.y, dims.z); return dims; }
  template<typename T> Vec3U32  countof(RWTexture3D<T> obj)         { Vec3U32 dims; obj.GetDimensions(dims.x, dims.y, dims.z); return dims; }
#endif

////////////////////////////////////////////////////////////
//~ Debug printf

// This technique is based on MJP's article - https://therealmjp.github.io/posts/hlsl-printf/

// Type tag of a print argument. G_PrintF_ writes the tag into the print stream
// as a single byte ahead of the argument's data.
//
// The BEGINSIZEn enumerators are markers rather than real kinds: G_PrintF_
// serializes the n-th component of an argument when `kind > BEGINSIZEn`, so the
// kinds must stay grouped by component count in ascending order.
Enum(G_FmtArgKind)
{
  G_FmtArgKind_None,
  G_FmtArgKind_End,   // Sentinel produced by G_FmtEnd(), carries no data

  G_FmtArgKind_BEGINSIZE1,

  // One component
  G_FmtArgKind_Uint,
  G_FmtArgKind_Sint,
  G_FmtArgKind_Float,

  G_FmtArgKind_BEGINSIZE2,

  // Two components
  G_FmtArgKind_Uint2,
  G_FmtArgKind_Sint2,
  G_FmtArgKind_Float2,

  G_FmtArgKind_BEGINSIZE3,

  // Three components
  G_FmtArgKind_Uint3,
  G_FmtArgKind_Sint3,
  G_FmtArgKind_Float3,

  G_FmtArgKind_BEGINSIZE4,

  // Four components
  G_FmtArgKind_Uint4,
  G_FmtArgKind_Sint4,
  G_FmtArgKind_Float4,
};

// A single print argument: a type tag plus up to four 32-bit lanes of data
Struct(G_FmtArg)
{
  G_FmtArgKind kind;  // Selects how many lanes of `v` are used and how they are interpreted
  Vec4U32 v;          // Raw lanes; the G_Fmt helpers store floats as their bit patterns (asuint)
};

#if IsGpu && GPU_SHADER_PRINT
  //- Print argument constructors
  //
  // Each overload tags the argument with its kind and stores the value in the leading
  // lanes of `v`. Floats are stored as their bit patterns via asuint. Lanes beyond the
  // argument's component count are left unwritten.

  //- Unsigned integers
  G_FmtArg G_Fmt(u32 v)       { G_FmtArg arg;    arg.kind = G_FmtArgKind_Uint;      arg.v.x = v;              return arg; }
  G_FmtArg G_Fmt(Vec2U32 v)   { G_FmtArg arg;    arg.kind = G_FmtArgKind_Uint2;     arg.v.xy = v;             return arg; }
  G_FmtArg G_Fmt(Vec3U32 v)   { G_FmtArg arg;    arg.kind = G_FmtArgKind_Uint3;     arg.v.xyz = v;            return arg; }
  G_FmtArg G_Fmt(Vec4U32 v)   { G_FmtArg arg;    arg.kind = G_FmtArgKind_Uint4;     arg.v = v;                return arg; }

  //- Signed integers
  G_FmtArg G_Fmt(i32 v)       { G_FmtArg arg;    arg.kind = G_FmtArgKind_Sint;      arg.v.x = v;              return arg; }
  G_FmtArg G_Fmt(Vec2I32 v)   { G_FmtArg arg;    arg.kind = G_FmtArgKind_Sint2;     arg.v.xy = v;             return arg; }
  G_FmtArg G_Fmt(Vec3I32 v)   { G_FmtArg arg;    arg.kind = G_FmtArgKind_Sint3;     arg.v.xyz = v;            return arg; }
  G_FmtArg G_Fmt(Vec4I32 v)   { G_FmtArg arg;    arg.kind = G_FmtArgKind_Sint4;     arg.v = v;                return arg; }

  //- Floats
  G_FmtArg G_Fmt(f32 v)       { G_FmtArg arg;    arg.kind = G_FmtArgKind_Float;     arg.v.x = asuint(v);      return arg; }
  G_FmtArg G_Fmt(Vec2 v)      { G_FmtArg arg;    arg.kind = G_FmtArgKind_Float2;    arg.v.xy = asuint(v);     return arg; }
  G_FmtArg G_Fmt(Vec3 v)      { G_FmtArg arg;    arg.kind = G_FmtArgKind_Float3;    arg.v.xyz = asuint(v);    return arg; }
  G_FmtArg G_Fmt(Vec4 v)      { G_FmtArg arg;    arg.kind = G_FmtArgKind_Float4;    arg.v = asuint(v);        return arg; }

  //- Sentinel that G_PrintF appends after the user's arguments
  G_FmtArg G_FmtEnd(void)     { G_FmtArg arg;    arg.kind = G_FmtArgKind_End;    return arg; }

  // Per-invocation staging area that a print is serialized into before
  // G_CommitPrint copies it to the shared print buffer.
  Struct(G_TempPrintBuffer)
  {
    // NOTE: The larger the array size, the longer the compilation time
    u32 byte_chunks[64];  // Packed payload, 4 bytes per chunk (256 bytes of capacity)
    u32 bytes_count;      // Number of payload bytes pushed so far
    u32 chars_count;      // Number of format string characters at the start of the payload
    u32 args_count;       // Number of serialized arguments following the characters
    b32 overflowed;       // Set once a byte is pushed after the payload is full
  };

  // Appends the low 8 bits of `v` to the temp print buffer. Bytes are packed four
  // per u32 chunk, with the first byte of a chunk in its lowest 8 bits. When every
  // chunk is already full, nothing is written and `overflowed` is set instead.
  void G_PushPrintByte(inout G_TempPrintBuffer buff, u32 v)
  {
    u32 write_pos = buff.bytes_count;
    u32 dst_chunk = write_pos / 4;
    if (dst_chunk >= countof(buff.byte_chunks))
    {
      buff.overflowed = 1;
    }
    else
    {
      u32 lane = write_pos & 0x03;
      u32 packed = (v & 0xFF) << (lane * 8);
      if (lane != 0)
      {
        buff.byte_chunks[dst_chunk] |= packed;
      }
      else
      {
        // The chunk array is never zero initialized, so the first byte of a chunk overwrites it rather than ORing into it
        buff.byte_chunks[dst_chunk] = packed;
      }
      buff.bytes_count = write_pos + 1;
    }
  }

  // Publishes a finished temp print buffer to the print buffer referenced by
  // G_ShaderConst_PrintBuffer.
  //
  // Print buffer layout (byte offsets):
  //    0: allocation counter - bytes requested by every print, whether or not it fit
  //    4: success counter    - number of prints that were written
  //    8: overflow counter   - number of prints rejected because they did not fit
  //   12: packed entries, each a u32 header followed by the print's byte chunks
  //
  // Entry header bits: [0,16) chars_count, [16,31) args_count, [31] overflowed
  //
  // `buff` is taken by value, so the resets below only affect this function's copy.
  void G_CommitPrint(G_TempPrintBuffer buff)
  {
    RWByteAddressBuffer rw = G_SDerefRawRW(G_ShaderConst_PrintBuffer);

    // A print that overflowed its temp buffer is committed as a header-only entry
    if (buff.overflowed)
    {
      buff.bytes_count = 0;
      buff.chars_count = 0;
      buff.args_count = 0;
    }

    u32 chunks_count = (buff.bytes_count + 3) / 4;
    u32 alloc_size = 0;
    alloc_size += 4;                 // Header
    alloc_size += chunks_count * 4;  // Chunks

    // Atomic fetch + add to base counter
    u32 base;
    rw.InterlockedAdd(0, alloc_size, base);
    base += 4;  // Offset for allocation counter
    base += 4;  // Offset for success counter
    base += 4;  // Offset for overflow counter

    // NOTE(review): the strict `<` also rejects an entry that would end exactly at the end of the buffer
    if ((base + alloc_size) < countof(rw))
    {
      // Increment success counter
      rw.InterlockedAdd(4, 1);
      u32 pos = 0;

      // Write header
      {
        u32 header = 0;
        header |= (buff.chars_count <<  0) & 0x0000FFFF;
        header |= (buff.args_count  << 16) & 0x7FFF0000;
        header |= (buff.overflowed  << 31) & 0x80000000;  // The flag occupies bit 31 only
        rw.Store(base + pos, header);
        pos += 4;
      }

      // Write chunks
      for (u32 chunk_idx = 0; chunk_idx < chunks_count; ++chunk_idx)
      {
        u32 chunk = buff.byte_chunks[chunk_idx];
        rw.Store(base + pos, chunk);
        pos += 4;
      }
    }
    else
    {
      // Increment overflow counter
      rw.InterlockedAdd(8, 1);
    }
  }

  // Serializes one print into a local G_TempPrintBuffer and commits it.
  //
  // Payload layout:
  //   1. Every character of `fmt` up to (not including) the first zero character, one byte each
  //   2. For each argument: one kind byte, then 4 bytes per component with the lowest byte first
  //      (1 to 4 components depending on which BEGINSIZEn marker the kind is above)
  //
  // The final entry of the argument list is excluded from args_count and is never
  // serialized; G_PrintF always passes G_FmtEnd() in that position.
  //
  // NOTE: Comments cannot be placed inside the macro body because of the line continuations.
  #define G_PrintF_(fmt, ...) do {                                        \
    G_TempPrintBuffer __tmp;                                              \
    __tmp.bytes_count = 0;                                                \
    __tmp.overflowed = 0;                                                 \
    u32 __char_idx = 0;                                                   \
    while (U32FromChar(fmt[__char_idx]) != 0)                             \
    {                                                                     \
      G_PushPrintByte(__tmp, U32FromChar(fmt[__char_idx]));               \
      ++__char_idx;                                                       \
    }                                                                     \
    G_FmtArg __args[] = { __VA_ARGS__ };                                  \
    __tmp.chars_count = __char_idx;                                       \
    __tmp.args_count = (countof(__args) - 1);                             \
    for (u32 __arg_idx = 0; __arg_idx < __tmp.args_count; ++__arg_idx)    \
    {                                                                     \
      G_FmtArg __arg = __args[__arg_idx];                                 \
      G_PushPrintByte(__tmp, __arg.kind);                                 \
      if (__arg.kind > G_FmtArgKind_BEGINSIZE1)                           \
      {                                                                   \
        G_PushPrintByte(__tmp, __arg.v.x >>  0);                          \
        G_PushPrintByte(__tmp, __arg.v.x >>  8);                          \
        G_PushPrintByte(__tmp, __arg.v.x >> 16);                          \
        G_PushPrintByte(__tmp, __arg.v.x >> 24);                          \
      }                                                                   \
      if (__arg.kind > G_FmtArgKind_BEGINSIZE2)                           \
      {                                                                   \
        G_PushPrintByte(__tmp, __arg.v.y >>  0);                          \
        G_PushPrintByte(__tmp, __arg.v.y >>  8);                          \
        G_PushPrintByte(__tmp, __arg.v.y >> 16);                          \
        G_PushPrintByte(__tmp, __arg.v.y >> 24);                          \
      }                                                                   \
      if (__arg.kind > G_FmtArgKind_BEGINSIZE3)                           \
      {                                                                   \
        G_PushPrintByte(__tmp, __arg.v.z >>  0);                          \
        G_PushPrintByte(__tmp, __arg.v.z >>  8);                          \
        G_PushPrintByte(__tmp, __arg.v.z >> 16);                          \
        G_PushPrintByte(__tmp, __arg.v.z >> 24);                          \
      }                                                                   \
      if (__arg.kind > G_FmtArgKind_BEGINSIZE4)                           \
      {                                                                   \
        G_PushPrintByte(__tmp, __arg.v.w >>  0);                          \
        G_PushPrintByte(__tmp, __arg.v.w >>  8);                          \
        G_PushPrintByte(__tmp, __arg.v.w >> 16);                          \
        G_PushPrintByte(__tmp, __arg.v.w >> 24);                          \
      }                                                                   \
    }                                                                     \
    G_CommitPrint(__tmp);                                                 \
  } while (0)

  // User-facing print: G_PrintF("fmt", G_Fmt(a), G_Fmt(b), ...). G_FmtEnd() is appended
  // so the argument array is never empty, and `##__VA_ARGS__` drops the preceding comma
  // when no arguments are passed.
  #define G_PrintF(fmt, ...) G_PrintF_(fmt, ##__VA_ARGS__, G_FmtEnd())

#else
  // Printing is compiled out: accept the same variadic call shape as the enabled macro
  // and expand to nothing, so call sites that pass arguments still compile. The arguments
  // are discarded without being evaluated.
  #define G_PrintF(fmt, ...)
#endif