256 threads per vis compute group

This commit is contained in:
jacob 2026-02-19 16:42:03 -06:00
parent 3b7b324369
commit 8a87ec2f6b
7 changed files with 48 additions and 38 deletions

View File

@@ -17,6 +17,7 @@
#define SIM_CLIENT_INTERP_RATIO 2.0 #define SIM_CLIENT_INTERP_RATIO 2.0
#define GPU_NAMES IsRtcEnabled
#define GPU_DEBUG 0 #define GPU_DEBUG 0
#define GPU_DEBUG_VALIDATION 0 #define GPU_DEBUG_VALIDATION 0

View File

@@ -153,9 +153,9 @@ void G_Bootstrap(void)
{ {
G_D12_CommandQueueDesc descs[] = { G_D12_CommandQueueDesc descs[] = {
{ .type = D3D12_COMMAND_LIST_TYPE_DIRECT, .priority = D3D12_COMMAND_QUEUE_PRIORITY_HIGH }, { .type = D3D12_COMMAND_LIST_TYPE_DIRECT, .priority = D3D12_COMMAND_QUEUE_PRIORITY_HIGH, .name = Lit("Direct Queue") },
{ .type = D3D12_COMMAND_LIST_TYPE_COMPUTE, .priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL }, { .type = D3D12_COMMAND_LIST_TYPE_COMPUTE, .priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, .name = Lit("Compute Queue") },
{ .type = D3D12_COMMAND_LIST_TYPE_COPY, .priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL }, { .type = D3D12_COMMAND_LIST_TYPE_COPY, .priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, .name = Lit("Copy Queue") },
}; };
for (u32 i = 0; i < MinU32(countof(descs), countof(G_D12.queues)); ++i) for (u32 i = 0; i < MinU32(countof(descs), countof(G_D12.queues)); ++i)
{ {
@@ -167,6 +167,7 @@ void G_Bootstrap(void)
if (SUCCEEDED(hr)) if (SUCCEEDED(hr))
{ {
hr = ID3D12Device_CreateFence(G_D12.device, 0, 0, &IID_ID3D12Fence, (void **)&queue->commit_fence); hr = ID3D12Device_CreateFence(G_D12.device, 0, 0, &IID_ID3D12Fence, (void **)&queue->commit_fence);
G_D12_SetObjectName((ID3D12Object *)queue->d3d_queue, desc.name);
} }
if (FAILED(hr)) if (FAILED(hr))
{ {
@@ -179,22 +180,25 @@ void G_Bootstrap(void)
//- Initialize descriptor heaps //- Initialize descriptor heaps
{ {
Struct(Dx12HeapDesc) { D3D12_DESCRIPTOR_HEAP_TYPE type; D3D12_DESCRIPTOR_HEAP_FLAGS flags; u64 max; }; Struct(Dx12HeapDesc) { D3D12_DESCRIPTOR_HEAP_TYPE type; D3D12_DESCRIPTOR_HEAP_FLAGS flags; u64 max; String name; };
Dx12HeapDesc descs[G_D12_DescriptorHeapKind_COUNT] = { Dx12HeapDesc descs[G_D12_DescriptorHeapKind_COUNT] = {
[G_D12_DescriptorHeapKind_CbvSrvUav] = { [G_D12_DescriptorHeapKind_CbvSrvUav] = {
.type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV, .type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV,
.flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE, .flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE,
.max = G_D12_MaxCbvSrvUavDescriptors, .max = G_D12_MaxCbvSrvUavDescriptors,
.name = Lit("Primary Resource Descriptor Heap"),
}, },
[G_D12_DescriptorHeapKind_Rtv] = { [G_D12_DescriptorHeapKind_Rtv] = {
.type = D3D12_DESCRIPTOR_HEAP_TYPE_RTV, .type = D3D12_DESCRIPTOR_HEAP_TYPE_RTV,
.flags = D3D12_DESCRIPTOR_HEAP_FLAG_NONE, .flags = D3D12_DESCRIPTOR_HEAP_FLAG_NONE,
.max = G_D12_MaxRtvDescriptors, .max = G_D12_MaxRtvDescriptors,
.name = Lit("Primary RTV Descriptor Heap"),
}, },
[G_D12_DescriptorHeapKind_Sampler] = { [G_D12_DescriptorHeapKind_Sampler] = {
.type = D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER, .type = D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER,
.flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE, .flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE,
.max = G_D12_MaxSamplerDescriptors, .max = G_D12_MaxSamplerDescriptors,
.name = Lit("Primary Sampler Descriptor Heap"),
}, },
}; };
for (G_D12_DescriptorHeapKind kind = 0; kind < countof(descs); ++kind) for (G_D12_DescriptorHeapKind kind = 0; kind < countof(descs); ++kind)
@@ -309,13 +313,15 @@ void G_Bootstrap(void)
gpu_perm, cl, gpu_perm, cl,
u8, u8,
queue->print_buffer_size, queue->print_buffer_size,
.flags = G_ResourceFlag_AllowShaderReadWrite .flags = G_ResourceFlag_AllowShaderReadWrite,
.name = Lit("Debug print gpu buffer"),
); );
queue->print_readback_buffer = G_PushBuffer( queue->print_readback_buffer = G_PushBuffer(
gpu_perm, cl, gpu_perm, cl,
u8, u8,
queue->print_buffer_size, queue->print_buffer_size,
.flags = G_ResourceFlag_HostMemory .flags = G_ResourceFlag_HostMemory,
.name = Lit("Debug print readback buffer")
); );
queue->print_buffer_ref = G_PushRWByteAddressBufferRef(gpu_perm, queue->print_buffer); queue->print_buffer_ref = G_PushRWByteAddressBufferRef(gpu_perm, queue->print_buffer);
} }
@@ -536,7 +542,7 @@ void G_D12_SetObjectName(ID3D12Object *object, String name)
TempArena scratch = BeginScratchNoConflict(); TempArena scratch = BeginScratchNoConflict();
{ {
wchar_t *name_wstr = WstrFromString(scratch.arena, name); wchar_t *name_wstr = WstrFromString(scratch.arena, name);
ID3D12Resource_SetName(object, name_wstr); ID3D12Object_SetName(object, name_wstr);
} }
EndScratch(scratch); EndScratch(scratch);
} }
@@ -618,8 +624,8 @@ G_D12_Pipeline *G_D12_PipelineFromDesc(G_D12_PipelineDesc desc)
pipeline_name = StringF( pipeline_name = StringF(
scratch.arena, scratch.arena,
"%F %F", "%F %F",
FmtHandle(desc.cs.resource.v), FmtString(NameFromResource(desc.cs.resource)),
FmtString(NameFromResource(desc.cs.resource)) FmtHandle(desc.cs.resource.v)
); );
} }
else else
@@ -627,10 +633,10 @@ G_D12_Pipeline *G_D12_PipelineFromDesc(G_D12_PipelineDesc desc)
pipeline_name = StringF( pipeline_name = StringF(
scratch.arena, scratch.arena,
"%F %F - %F %F", "%F %F - %F %F",
FmtHandle(desc.vs.resource.v),
FmtString(NameFromResource(desc.vs.resource)), FmtString(NameFromResource(desc.vs.resource)),
FmtHandle(desc.ps.resource.v), FmtHandle(desc.vs.resource.v),
FmtString(NameFromResource(desc.ps.resource)) FmtString(NameFromResource(desc.ps.resource)),
FmtHandle(desc.ps.resource.v)
); );
} }
@@ -791,7 +797,7 @@ G_D12_Pipeline *G_D12_PipelineFromDesc(G_D12_PipelineDesc desc)
if (ok) if (ok)
{ {
if (GPU_DEBUG) if (GPU_NAMES)
{ {
G_D12_SetObjectName((ID3D12Object *)pso, pipeline_name); G_D12_SetObjectName((ID3D12Object *)pso, pipeline_name);
} }
@@ -1204,7 +1210,7 @@ G_ResourceHandle G_PushResource(G_ArenaHandle arena_handle, G_CommandListHandle
ZeroStruct(release); ZeroStruct(release);
SllQueuePush(cl->releases.first, cl->releases.last, release); SllQueuePush(cl->releases.first, cl->releases.last, release);
release->d3d_resource = resource->d3d_resource; release->d3d_resource = resource->d3d_resource;
if (GPU_DEBUG) if (GPU_NAMES)
{ {
StaticAssert(countof(release->name_text) == countof(resource->name_text)); StaticAssert(countof(release->name_text) == countof(resource->name_text));
release->name_len = resource->name_len; release->name_len = resource->name_len;
@@ -1315,7 +1321,7 @@ G_ResourceHandle G_PushResource(G_ArenaHandle arena_handle, G_CommandListHandle
{ {
resource->name_len = new_name.len; resource->name_len = new_name.len;
CopyBytes(resource->name_text, new_name.text, new_name.len); CopyBytes(resource->name_text, new_name.text, new_name.len);
if (GPU_DEBUG) if (GPU_NAMES)
{ {
G_D12_SetObjectName((ID3D12Object *)resource->d3d_resource, new_name); G_D12_SetObjectName((ID3D12Object *)resource->d3d_resource, new_name);
} }

View File

@@ -222,6 +222,7 @@ Struct(G_D12_CommandQueueDesc)
{ {
D3D12_COMMAND_LIST_TYPE type; D3D12_COMMAND_LIST_TYPE type;
D3D12_COMMAND_QUEUE_PRIORITY priority; D3D12_COMMAND_QUEUE_PRIORITY priority;
String name;
}; };
Struct(G_D12_Queue) Struct(G_D12_Queue)

View File

@@ -5210,6 +5210,8 @@ void V_TickForever(WaveLaneCtx *lane)
{ {
G_Compute(frame->cl, V_FinalizeCS, V_ThreadGroupSizeFromTexSize(frame->screen_dims)); G_Compute(frame->cl, V_FinalizeCS, V_ThreadGroupSizeFromTexSize(frame->screen_dims));
G_DumbGlobalMemorySync(frame->cl);
} }
////////////////////////////// //////////////////////////////

View File

@@ -56,7 +56,7 @@ Vec4 V_ColorFromParticle(V_ParticleDesc desc, u32 particle_idx, u32 density)
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
//~ Prepare frame //~ Prepare frame
ComputeShader2D(V_PrepareShadeCS, 8, 8) ComputeShader2D(V_PrepareShadeCS, 16, 16)
{ {
V_SharedFrame frame = G_Dereference<V_SharedFrame>(V_GpuConst_Frame)[0]; V_SharedFrame frame = G_Dereference<V_SharedFrame>(V_GpuConst_Frame)[0];
RWTexture2D<Vec4> shade = G_Dereference<Vec4>(frame.shade_rw); RWTexture2D<Vec4> shade = G_Dereference<Vec4>(frame.shade_rw);
@@ -69,7 +69,7 @@ ComputeShader2D(V_PrepareShadeCS, 8, 8)
} }
//- Prepare cells //- Prepare cells
ComputeShader2D(V_PrepareCellsCS, 8, 8) ComputeShader2D(V_PrepareCellsCS, 16, 16)
{ {
V_SharedFrame frame = G_Dereference<V_SharedFrame>(V_GpuConst_Frame)[0]; V_SharedFrame frame = G_Dereference<V_SharedFrame>(V_GpuConst_Frame)[0];
Texture2D<P_TileKind> tiles = G_Dereference<P_TileKind>(frame.tiles); Texture2D<P_TileKind> tiles = G_Dereference<P_TileKind>(frame.tiles);
@@ -158,7 +158,7 @@ ComputeShader2D(V_PrepareCellsCS, 8, 8)
} }
//- Clear particles //- Clear particles
ComputeShader(V_ClearParticlesCS, 64) ComputeShader(V_ClearParticlesCS, 256)
{ {
V_SharedFrame frame = G_Dereference<V_SharedFrame>(V_GpuConst_Frame)[0]; V_SharedFrame frame = G_Dereference<V_SharedFrame>(V_GpuConst_Frame)[0];
RWStructuredBuffer<V_Particle> particles = G_Dereference<V_Particle>(frame.particles); RWStructuredBuffer<V_Particle> particles = G_Dereference<V_Particle>(frame.particles);
@@ -236,7 +236,7 @@ PixelShader(V_QuadPS, V_QuadPSOutput, V_QuadPSInput input)
////////////////////////////// //////////////////////////////
//- Particle emitter shader //- Particle emitter shader
ComputeShader(V_EmitParticlesCS, 64) ComputeShader(V_EmitParticlesCS, 256)
{ {
V_SharedFrame frame = G_Dereference<V_SharedFrame>(V_GpuConst_Frame)[0]; V_SharedFrame frame = G_Dereference<V_SharedFrame>(V_GpuConst_Frame)[0];
StructuredBuffer<V_Emitter> emitters = G_Dereference<V_Emitter>(frame.emitters); StructuredBuffer<V_Emitter> emitters = G_Dereference<V_Emitter>(frame.emitters);
@@ -267,7 +267,7 @@ ComputeShader(V_EmitParticlesCS, 64)
////////////////////////////// //////////////////////////////
//- Particle sim shader //- Particle sim shader
ComputeShader(V_SimParticlesCS, 64) ComputeShader(V_SimParticlesCS, 256)
{ {
V_SharedFrame frame = G_Dereference<V_SharedFrame>(V_GpuConst_Frame)[0]; V_SharedFrame frame = G_Dereference<V_SharedFrame>(V_GpuConst_Frame)[0];
Texture2D<P_TileKind> tiles = G_Dereference<P_TileKind>(frame.tiles); Texture2D<P_TileKind> tiles = G_Dereference<P_TileKind>(frame.tiles);
@@ -544,7 +544,7 @@ ComputeShader(V_SimParticlesCS, 64)
// TODO: Remove this // TODO: Remove this
ComputeShader2D(V_ShadeCS, 8, 8) ComputeShader2D(V_ShadeCS, 16, 16)
{ {
V_SharedFrame frame = G_Dereference<V_SharedFrame>(V_GpuConst_Frame)[0]; V_SharedFrame frame = G_Dereference<V_SharedFrame>(V_GpuConst_Frame)[0];
SamplerState sampler = G_Dereference(frame.basic_samplers[G_BasicSamplerKind_PointClamp]); SamplerState sampler = G_Dereference(frame.basic_samplers[G_BasicSamplerKind_PointClamp]);
@@ -580,7 +580,7 @@ ComputeShader2D(V_ShadeCS, 8, 8)
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
//~ Composite //~ Composite
ComputeShader2D(V_CompositeCS, 8, 8) ComputeShader2D(V_CompositeCS, 16, 16)
{ {
V_SharedFrame frame = G_Dereference<V_SharedFrame>(V_GpuConst_Frame)[0]; V_SharedFrame frame = G_Dereference<V_SharedFrame>(V_GpuConst_Frame)[0];
// Texture2D<Vec4> shade_tex = G_Dereference<Vec4>(frame.shade_ro); // Texture2D<Vec4> shade_tex = G_Dereference<Vec4>(frame.shade_ro);
@@ -959,7 +959,7 @@ ComputeShader2D(V_CompositeCS, 8, 8)
////////////////////////////// //////////////////////////////
//- Downsample //- Downsample
ComputeShader2D(V_BloomDownCS, 8, 8) ComputeShader2D(V_BloomDownCS, 16, 16)
{ {
i32 mips_count = V_GpuConst_MipsCount; i32 mips_count = V_GpuConst_MipsCount;
i32 mip_idx = V_GpuConst_MipIdx; i32 mip_idx = V_GpuConst_MipIdx;
@@ -1035,7 +1035,7 @@ ComputeShader2D(V_BloomDownCS, 8, 8)
////////////////////////////// //////////////////////////////
//- Upsample //- Upsample
ComputeShader2D(V_BloomUpCS, 8, 8) ComputeShader2D(V_BloomUpCS, 16, 16)
{ {
i32 mips_count = V_GpuConst_MipsCount; i32 mips_count = V_GpuConst_MipsCount;
i32 mip_idx = V_GpuConst_MipIdx; i32 mip_idx = V_GpuConst_MipIdx;
@@ -1103,7 +1103,7 @@ ComputeShader2D(V_BloomUpCS, 8, 8)
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
//~ Finalize //~ Finalize
ComputeShader2D(V_FinalizeCS, 8, 8) ComputeShader2D(V_FinalizeCS, 16, 16)
{ {
V_SharedFrame frame = G_Dereference<V_SharedFrame>(V_GpuConst_Frame)[0]; V_SharedFrame frame = G_Dereference<V_SharedFrame>(V_GpuConst_Frame)[0];
SamplerState bilinear_sampler = G_Dereference(frame.basic_samplers[G_BasicSamplerKind_BilinearClamp]); SamplerState bilinear_sampler = G_Dereference(frame.basic_samplers[G_BasicSamplerKind_BilinearClamp]);

View File

@@ -51,29 +51,29 @@ Vec4 V_ColorFromParticle(V_ParticleDesc desc, u32 particle_idx, u32 density);
//~ Shaders //~ Shaders
//- Utility shaders //- Utility shaders
ComputeShader2D(V_PrepareCellsCS, 8, 8); ComputeShader2D(V_PrepareCellsCS, 16, 16);
ComputeShader(V_ClearParticlesCS, 64); ComputeShader(V_ClearParticlesCS, 256);
//- Quads //- Quads
VertexShader(V_QuadVS, V_QuadPSInput); VertexShader(V_QuadVS, V_QuadPSInput);
PixelShader(V_QuadPS, V_QuadPSOutput, V_QuadPSInput input); PixelShader(V_QuadPS, V_QuadPSOutput, V_QuadPSInput input);
//- Particle simulation //- Particle simulation
ComputeShader(V_EmitParticlesCS, 64); ComputeShader(V_EmitParticlesCS, 256);
ComputeShader(V_SimParticlesCS, 64); ComputeShader(V_SimParticlesCS, 256);
//- Shade //- Shade
ComputeShader2D(V_ShadeCS, 8, 8); ComputeShader2D(V_ShadeCS, 16, 16);
//- Composite //- Composite
ComputeShader2D(V_CompositeCS, 8, 8); ComputeShader2D(V_CompositeCS, 16, 16);
//- Bloom //- Bloom
ComputeShader2D(V_BloomDownCS, 8, 8); ComputeShader2D(V_BloomDownCS, 16, 16);
ComputeShader2D(V_BloomUpCS, 8, 8); ComputeShader2D(V_BloomUpCS, 16, 16);
//- Finalize //- Finalize
ComputeShader2D(V_FinalizeCS, 8, 8); ComputeShader2D(V_FinalizeCS, 16, 16);
//- Debug shapes //- Debug shapes
VertexShader(V_DVertVS, V_DVertPSInput); VertexShader(V_DVertVS, V_DVertPSInput);

View File

@@ -373,7 +373,7 @@ Struct(V_SharedFrame)
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
//~ Helpers //~ Helpers
#define V_ThreadGroupSizeFromBufferSize(buffer_size) VEC3I32((((buffer_size) + 63) / 64), 1, 1) #define V_ThreadGroupSizeFromBufferSize(buffer_size) VEC3I32((((buffer_size) + 255) / 256), 1, 1)
#define V_ThreadGroupSizeFromTexSize(tex_size) VEC3I32(((tex_size).x + 7) / 8, ((tex_size).y + 7) / 8, 1) #define V_ThreadGroupSizeFromTexSize(tex_size) VEC3I32(((tex_size).x + 15) / 16, ((tex_size).y + 15) / 16, 1)
V_ParticleDesc V_DescFromParticleKind(V_ParticleKind kind); V_ParticleDesc V_DescFromParticleKind(V_ParticleKind kind);