specify thread count instead of group count when dispatching compute shaders

2026-03-03 18:25:29 -06:00 · 2026-03-03 18:25:29 -06:00 · 38196a8eb7
commit 38196a8eb7
parent 35eb4e4839
10 changed files with 72 additions and 55 deletions
--- a/src/base/base.cgh
+++ b/src/base/base.cgh
@ -252,8 +252,8 @@
 #endif

 //- Preprocessor concatenation
-#define Cat1(a, b) a ## b
-#define Cat(a, b) Cat1(a, b)
+#define CAT1(a, b) a ## b
+#define CAT(a, b) CAT1(a, b)

 //- Preprocessor stringization
 #define Stringize1(x) #x
@ -461,10 +461,10 @@
 #define IsFixedArray(a) (IsIndexable(a) && (((void *)&a) == ((void *)a)))

 //- struct region
-#define BeginFieldRegion(name) i8 __begfieldreg__##name
-#define EndFieldRegion(name) i8 __endfieldreg__##name
-#define CopyFieldRegion(dst, src, r) CopyBytes(&dst->__begfieldreg__##r, &src->__begfieldreg__##r, (u8 *)&dst->__endfieldreg__##r - (u8 *)&dst->__begfieldreg__##r)
-#define ZeroFieldRegion(dst, src, r) ZeroBytes(&dst->__begfieldreg__##r, &src->__begfieldreg__##r, (u8 *)&dst->__endfieldreg__##r - (u8 *)&dst->__begfieldreg__##r)
+#define BeginFieldRegion(name) i8 CAT(__begfieldreg__, name)
+#define EndFieldRegion(name)   i8 CAT(__endfieldreg__, name)
+#define CopyFieldRegion(dst, src, r) CopyBytes(&dst->CAT(__begfieldreg__, r), &src->CAT(__begfieldreg__, r), (u8 *)&dst->CAT(__endfieldreg__, r) - (u8 *)&dst->CAT(__begfieldreg__, r))
+#define ZeroFieldRegion(dst, src, r) ZeroBytes(&dst->CAT(__begfieldreg__, r), &src->CAT(__begfieldreg__, r), (u8 *)&dst->CAT(__endfieldreg__, r) - (u8 *)&dst->CAT(__begfieldreg__, r))

 //- Packed
 #if IsCompilerMsvc
@ -736,28 +736,28 @@ Struct(VertexShaderDesc)    { ResourceKey resource; u32 x, y, z; };
 Struct(PixelShaderDesc)     { ResourceKey resource; u32 x, y, z; };
 Struct(ComputeShaderDesc)   { ResourceKey resource; u32 x, y, z; };

-#define GetGroupSize(name)  VEC3U32(name##__GroupSize_X, name##__GroupSize_Y, name##__GroupSize_Z)
+#define GroupSize(name) VEC3U32(CAT(name, __GroupSize_X), CAT(name, __GroupSize_Y), CAT(name, __GroupSize_Z))

 #if IsGpu
  #define Semantic(name) name : name
  #define VertexShader(name, return_type)         return_type name(u32 Semantic(SV_InstanceID), u32 Semantic(SV_VertexID))
  #define PixelShader(name, return_type, ...)     return_type name(__VA_ARGS__)
-  #define ComputeShader(name)                                                           \
-    [numthreads(name##__GroupSize_X, name##__GroupSize_Y, name##__GroupSize_Z)]          \
-    void name(                                                                          \
-      u32 Semantic(SV_GroupIndex),                                                      \
-      Vec3U32 Semantic(SV_GroupID),                                                     \
-      Vec3U32 Semantic(SV_GroupThreadID),                                               \
-      Vec3U32 Semantic(SV_DispatchThreadID)                                             \
+  #define ComputeShader(name)                                                                           \
+    [numthreads(CAT(name, __GroupSize_X), CAT(name, __GroupSize_Y), CAT(name, __GroupSize_Z))]          \
+    void name(                                                                                          \
+      u32 Semantic(SV_GroupIndex),                                                                      \
+      Vec3U32 Semantic(SV_GroupID),                                                                     \
+      Vec3U32 Semantic(SV_GroupThreadID),                                                               \
+      Vec3U32 Semantic(SV_DispatchThreadID)                                                             \
    )
 #endif

 #if IsCpu
-  #define DeclComputeShader(name, resource_hash, x, y, z)   enum { name##__GroupSize_X = x, name##__GroupSize_Y = y, name##__GroupSize_Z = z }; static ComputeShaderDesc name = { resource_hash, x, y, z }
+  #define DeclComputeShader(name, resource_hash, x, y, z)   enum { CAT(name, __GroupSize_X) = x, CAT(name, __GroupSize_Y) = y, CAT(name, __GroupSize_Z) = z }; static ComputeShaderDesc name = { resource_hash, x, y, z }
  #define DeclVertexShader(name, resource_hash)             static VertexShaderDesc name = { resource_hash, 1, 1, 1 }
  #define DeclPixelShader(name, resource_hash)              static PixelShaderDesc name = { resource_hash, 1, 1, 1 }
-#else
-  #define DeclComputeShader(name, resource_hash, x, y, z)   enum { name##__GroupSize_X = x, name##__GroupSize_Y = y, name##__GroupSize_Z = z };
+#elif IsGpu
+  #define DeclComputeShader(name, resource_hash, x, y, z)   enum { CAT(name, __GroupSize_X) = x, CAT(name, __GroupSize_Y) = y, CAT(name, __GroupSize_Z) = z };
  #define DeclVertexShader(name, resource_hash)
  #define DeclPixelShader(name, resource_hash)
 #endif
--- a/src/gpu/gpu_common.c
+++ b/src/gpu/gpu_common.c
@ -215,6 +215,17 @@ Vec3I32 G_DimsFromMip3D(Vec3I32 mip0_dims, i32 mip)
  return result;
 }

+//- Thread count
+
+// Returns the number of groups to dispatch on each axis so that at least
+// `threads` threads run, given the per-group thread counts stored in `cs`.
+// Each axis is rounded up, so a partially filled trailing group is still
+// dispatched. An axis whose thread count is <= 0 yields 0 groups.
+//
+// The math is done in i32 on purpose: `cs.x/y/z` are u32, and mixing them
+// directly with `threads` would promote the expression to unsigned, turning a
+// negative thread count into a very large group count. A group size of 0
+// (e.g. a zero-initialized desc) is treated as 1 instead of dividing by zero.
+Vec3I32 G_GroupCountFromThreadCount(ComputeShaderDesc cs, Vec3I32 threads)
+{
+  i32 group_size_x = cs.x > 0 ? (i32)cs.x : 1;
+  i32 group_size_y = cs.y > 0 ? (i32)cs.y : 1;
+  i32 group_size_z = cs.z > 0 ? (i32)cs.z : 1;
+
+  // (t - 1) / size + 1 is ceil(t / size) for t > 0, without the overflow
+  // that t + size - 1 can hit near the top of the i32 range.
+  return VEC3I32(
+    threads.x > 0 ? (threads.x - 1) / group_size_x + 1 : 0,
+    threads.y > 0 ? (threads.y - 1) / group_size_y + 1 : 0,
+    threads.z > 0 ? (threads.z - 1) / group_size_z + 1 : 0
+  );
+}
+
 //- Viewport / scissor

 Rng3 G_ViewportFromTexture(G_ResourceHandle texture)
--- a/src/gpu/gpu_common.h
+++ b/src/gpu/gpu_common.h
@ -39,6 +39,9 @@ i32 G_DimsFromMip1D(i32 mip0_dims, i32 mip);
 Vec2I32 G_DimsFromMip2D(Vec2I32 mip0_dims, i32 mip);
 Vec3I32 G_DimsFromMip3D(Vec3I32 mip0_dims, i32 mip);

+//- Thread count
+Vec3I32 G_GroupCountFromThreadCount(ComputeShaderDesc cs, Vec3I32 threads);
+
 //- Viewport / scissor
 Rng3 G_ViewportFromTexture(G_ResourceHandle texture);
 Rng2 G_ScissorFromTexture(G_ResourceHandle texture);
--- a/src/gpu/gpu_core.h
+++ b/src/gpu/gpu_core.h
@ -679,7 +679,7 @@ void G_CopyTextureToBuffer(G_CommandListHandle cl, G_ResourceHandle dst, Vec3I32
 void G_SetConstantEx(G_CommandListHandle cl, i32 slot, void *src_32bit, u32 size);

 #define G_SetConstant(cl, name, value) do {                    \
-    name##__shaderconstanttype __src;                          \
+    CAT(name, __shaderconstanttype) __src;                     \
    __src.v = value;                                           \
    G_SetConstantEx((cl), (name), &__src, sizeof(__src));      \
  } while (0)
@ -733,7 +733,11 @@ void G_MemorySyncEx(G_CommandListHandle cl, G_MemoryBarrierDesc desc);

 //- Compute

-void G_Compute(G_CommandListHandle cl, ComputeShaderDesc cs, Vec3I32 groups);
+void G_ComputeEx(G_CommandListHandle cl, ComputeShaderDesc cs, Vec3I32 threads);
+
+#define G_Compute(cl, cs, threads)    G_ComputeEx((cl), (cs), VEC3I32((threads), 1, 1))
+#define G_Compute2D(cl, cs, threads)  G_ComputeEx((cl), (cs), VEC3I32((threads).x, (threads).y, 1))
+#define G_Compute3D(cl, cs, threads)  G_ComputeEx((cl), (cs), VEC3I32((threads).x, (threads).y, (threads).z))

 //- Rasterize

--- a/src/gpu/gpu_dx12/gpu_dx12_core.c
+++ b/src/gpu/gpu_dx12/gpu_dx12_core.c
@ -884,7 +884,7 @@ G_D12_Pipeline *G_D12_PipelineFromDesc(G_D12_PipelineDesc desc)
          raster_desc.DepthBias = D3D12_DEFAULT_DEPTH_BIAS;
          raster_desc.DepthBiasClamp = D3D12_DEFAULT_DEPTH_BIAS_CLAMP;
          raster_desc.SlopeScaledDepthBias = D3D12_DEFAULT_SLOPE_SCALED_DEPTH_BIAS;
-          raster_desc.DepthClipEnable = 1;
+          raster_desc.DepthClipEnable = 0;
          raster_desc.MultisampleEnable = 0;
          raster_desc.AntialiasedLineEnable = 0;
          raster_desc.ForcedSampleCount = 0;
@ -3240,15 +3240,15 @@ void G_MemorySyncEx(G_CommandListHandle cl_handle, G_MemoryBarrierDesc desc)

 //- Compute

-void G_Compute(G_CommandListHandle cl_handle, ComputeShaderDesc cs, Vec3I32 groups)
+void G_ComputeEx(G_CommandListHandle cl_handle, ComputeShaderDesc cs, Vec3I32 threads)
 {
-  if (groups.x > 0 && groups.y > 0 && groups.z > 0)
+  if (threads.x > 0 && threads.y > 0 && threads.z > 0)
  {
    G_D12_CmdList *cl = G_D12_CmdListFromHandle(cl_handle);
    G_D12_Cmd *cmd = G_D12_PushCmd(cl);
    cmd->kind = G_D12_CmdKind_Compute;
    cmd->compute.cs = cs;
-    cmd->compute.groups = groups;
+    cmd->compute.groups = G_GroupCountFromThreadCount(cs, threads);
  }
 }

--- a/src/meta/meta.c
+++ b/src/meta/meta.c
@ -1087,13 +1087,22 @@ void M_BuildEntryPoint(WaveLaneCtx *lane)
            e->kind == ShaderEntryKind_PS ? Lit("ps_6_6") :
            Lit("cs_6_6")
          );
+
+          StringList local_defs = Zi;
+          {
+            PushStringToList(perm, &local_defs, StringF(perm, "-DShaderName=%F", FmtString(shader_name)));
+            PushStringToList(perm, &local_defs, StringF(perm, "-DShaderDef_%F=1", FmtString(shader_name)));
+            PushStringToList(perm, &local_defs, StringF(perm, "-DShaderTarget=%F", FmtString(target)));
+          }
+
          String cmd = StringF(
            perm,
-            "dxc.exe -T %F -E %F -Fo %F %F %F %F %F",
+            "dxc.exe -T %F -E %F -Fo %F %F %F %F %F %F",
            FmtString(target),
            FmtString(e->name),
            FmtString(out_file),
            FmtString(gpu_out_file),
+            FmtString(StringFromList(perm, local_defs, Lit(" "))),
            FmtString(StringFromList(perm, cp.defs, Lit(" "))),
            FmtString(StringFromList(perm, cp.flags_dxc, Lit(" "))),
            FmtString(StringFromList(perm, cp.warnings_dxc, Lit(" ")))
@ -1229,19 +1238,12 @@ void M_BuildEntryPoint(WaveLaneCtx *lane)
        if (output.len > 0)
        {
          String msg = output;
-          if (!StringContains(msg, Lit("In file")))
-          {
-            // If error message is missing "In file" then it may have
-            // failed to even find the entry point, meaning we should
-            // include the name of the shader in the error message for
-            // clarification.
-            msg = StringF(
-              perm,
-              "Error compiling shader \"%F\"\n%F",
-              FmtString(gpu_obj->name),
-              FmtString(output)
-            );
-          }
+          msg = StringF(
+            perm,
+            "%F\n%F",
+            FmtString(gpu_obj->name),
+            FmtString(output)
+          );
          if (obj_errored)
          {
            if (error_gpu_obj_outputs.count == 0)
--- a/src/pp/pp_vis/pp_vis_core.c
+++ b/src/pp/pp_vis/pp_vis_core.c
@ -5281,15 +5281,15 @@ void V_TickForever(WaveLaneCtx *lane)

      {
        // Prepare shade
-        G_Compute(cl, V_PrepareShadeCS, V_ThreadGroupSizeFromTexSize(frame->shade_dims));
+        G_Compute2D(cl, V_PrepareShadeCS, frame->shade_dims);

        // Prepare cells
-        G_Compute(cl, V_PrepareCellsCS, V_ThreadGroupSizeFromTexSize(cells_dims));
+        G_Compute2D(cl, V_PrepareCellsCS, cells_dims);

        // Clear particles
        if (frame->should_clear_particles)
        {
-          G_Compute(cl, V_ClearParticlesCS, V_ThreadGroupSizeFromBufferSize(V_ParticlesCap));
+          G_Compute(cl, V_ClearParticlesCS, V_ParticlesCap);
          V.particle_seq = 0;
        }

@ -5304,7 +5304,7 @@ void V_TickForever(WaveLaneCtx *lane)
            Vec2I32 down_dims = G_DimsFromMip2D(G_Count2D(backdrop_target), mip_idx);

            G_SetConstant(cl, V_GpuConst_MipIdx, mip_idx);
-            G_Compute(cl, V_BackdropDownCS, V_ThreadGroupSizeFromTexSize(down_dims));
+            G_Compute2D(cl, V_BackdropDownCS, down_dims);

            G_DumbGlobalMemorySync(cl);
          }
@ -5315,7 +5315,7 @@ void V_TickForever(WaveLaneCtx *lane)
            Vec2I32 up_dims = G_DimsFromMip2D(G_Count2D(backdrop_target), mip_idx);

            G_SetConstant(cl, V_GpuConst_MipIdx, mip_idx);
-            G_Compute(cl, V_BackdropUpCS, V_ThreadGroupSizeFromTexSize(up_dims));
+            G_Compute2D(cl, V_BackdropUpCS, up_dims);

            G_DumbGlobalMemorySync(cl);
         }
@ -5341,7 +5341,7 @@ void V_TickForever(WaveLaneCtx *lane)
        );

        // Emit particles
-        G_Compute(cl, V_EmitParticlesCS, V_ThreadGroupSizeFromBufferSize(frame->emitters_count));
+        G_Compute(cl, V_EmitParticlesCS, frame->emitters_count);

        // Sync particles, occluders, & albedo
        G_DumbGlobalMemorySync(cl);
@ -5353,7 +5353,7 @@ void V_TickForever(WaveLaneCtx *lane)

      {
        // Simulate particles
-        G_Compute(cl, V_SimParticlesCS, V_ThreadGroupSizeFromBufferSize(V_ParticlesCap));
+        G_Compute(cl, V_SimParticlesCS, V_ParticlesCap);

        // Sync cells
        G_DumbGlobalMemorySync(cl);
@ -5366,7 +5366,7 @@ void V_TickForever(WaveLaneCtx *lane)

      if (0)
      {
-        G_Compute(cl, V_ShadeCS, V_ThreadGroupSizeFromTexSize(frame->shade_dims));
+        G_Compute2D(cl, V_ShadeCS, frame->shade_dims);

        G_DumbGlobalMemorySync(cl);
      }
@ -5375,7 +5375,7 @@ void V_TickForever(WaveLaneCtx *lane)
      //- Composite pass

      {
-        G_Compute(cl, V_CompositeCS, V_ThreadGroupSizeFromTexSize(frame->screen_dims));
+        G_Compute2D(cl, V_CompositeCS, frame->screen_dims);

        // Sync screen tex
        G_DumbGlobalMemorySync(cl);
@ -5398,7 +5398,7 @@ void V_TickForever(WaveLaneCtx *lane)
          Vec2I32 down_dims = G_DimsFromMip2D(G_Count2D(screen_target), mip_idx);

          G_SetConstant(cl, V_GpuConst_MipIdx, mip_idx);
-          G_Compute(cl, V_BloomDownCS, V_ThreadGroupSizeFromTexSize(down_dims));
+          G_Compute2D(cl, V_BloomDownCS, down_dims);

          G_DumbGlobalMemorySync(cl);
        }
@ -5409,7 +5409,7 @@ void V_TickForever(WaveLaneCtx *lane)
          Vec2I32 up_dims = G_DimsFromMip2D(G_Count2D(screen_target), mip_idx);

          G_SetConstant(cl, V_GpuConst_MipIdx, mip_idx);
-          G_Compute(cl, V_BloomUpCS, V_ThreadGroupSizeFromTexSize(up_dims));
+          G_Compute2D(cl, V_BloomUpCS, up_dims);

          G_DumbGlobalMemorySync(cl);
       }
@ -5419,7 +5419,7 @@ void V_TickForever(WaveLaneCtx *lane)
      //- Finalization pass

      {
-        G_Compute(cl, V_FinalizeCS, V_ThreadGroupSizeFromTexSize(frame->screen_dims));
+        G_Compute2D(cl, V_FinalizeCS, frame->screen_dims);

        G_DumbGlobalMemorySync(cl);
      }
--- a/src/pp/pp_vis/pp_vis_shared.cgh
+++ b/src/pp/pp_vis/pp_vis_shared.cgh
@ -377,7 +377,4 @@ Struct(V_SharedFrame)
 ////////////////////////////////////////////////////////////
 //~ Helpers

-#define V_ThreadGroupSizeFromBufferSize(buffer_size) VEC3I32((((buffer_size) + 255) / 256), 1, 1)
-#define V_ThreadGroupSizeFromTexSize(tex_size) VEC3I32(((tex_size).x + 15) / 16, ((tex_size).y + 15) / 16, 1)
-
 V_ParticleDesc V_DescFromParticleKind(V_ParticleKind kind);
--- a/src/proto/proto.c
+++ b/src/proto/proto.c
@ -39,7 +39,7 @@ void PT_RunForever(WaveLaneCtx *lane)

          // Test pass
          {
-            G_Compute(cl, PT_TestCS, VEC3I32((final_target_size.x + 7) / 8, (final_target_size.y + 7) / 8, 1));
+            G_Compute2D(cl, PT_TestCS, final_target_size);
          }
          G_DumbMemorySync(cl, final_target_res);

--- a/src/proto/proto_gpu.gh
+++ b/src/proto/proto_gpu.gh
@ -27,5 +27,5 @@ Struct(PT_BlitPSOutput)
 ComputeShader(PT_TestCS);

 //- Blit
-DeclVertexShader(PT_BlitVS, PT_BlitPSInput);
-DeclPixelShader(PT_BlitPS, PT_BlitPSOutput, PT_BlitPSInput input);
+VertexShader(PT_BlitVS, PT_BlitPSInput);
+PixelShader(PT_BlitPS, PT_BlitPSOutput, PT_BlitPSInput input);