From f52d07d3bc90de2bae50ab364a33866c803aa954 Mon Sep 17 00:00:00 2001 From: jacob Date: Tue, 3 Mar 2026 00:33:54 -0600 Subject: [PATCH] move UI rect fetch to vertex shader --- src/gpu/gpu_shared.cgh | 50 +++++++++++++++++++------------------ src/meta/meta.c | 1 + src/pp/pp_vis/pp_vis_gpu.g | 9 +++---- src/pp/pp_vis/pp_vis_gpu.gh | 2 +- src/ui/ui_gpu.g | 12 +++------ src/ui/ui_gpu.gh | 2 +- 6 files changed, 37 insertions(+), 39 deletions(-) diff --git a/src/gpu/gpu_shared.cgh b/src/gpu/gpu_shared.cgh index 374212ff..c88f6d80 100644 --- a/src/gpu/gpu_shared.cgh +++ b/src/gpu/gpu_shared.cgh @@ -83,31 +83,33 @@ Enum(G_BasicSamplerKind) //~ Resource dereference #if IsGpu - //- Scalar/Uniform dereference - SamplerState G_SDeref(G_SamplerStateRef r) { u32 idx = r.v; return SamplerDescriptorHeap[idx]; } - template StructuredBuffer G_SDeref(G_StructuredBufferRef r) { u32 idx = r.v; return ResourceDescriptorHeap[idx]; } - ByteAddressBuffer G_SDeref(G_ByteAddressBufferRef r) { u32 idx = r.v; return ResourceDescriptorHeap[idx]; } - template Texture1D G_SDeref(G_Texture1DRef r) { u32 idx = r.v; return ResourceDescriptorHeap[idx]; } - template Texture2D G_SDeref(G_Texture2DRef r) { u32 idx = r.v; return ResourceDescriptorHeap[idx]; } - template Texture3D G_SDeref(G_Texture3DRef r) { u32 idx = r.v; return ResourceDescriptorHeap[idx]; } - template RWStructuredBuffer G_SDerefRW(G_StructuredBufferRef r) { u32 idx = r.v + 1; return ResourceDescriptorHeap[idx]; } - RWByteAddressBuffer G_SDerefRW(G_ByteAddressBufferRef r) { u32 idx = r.v + 1; return ResourceDescriptorHeap[idx]; } - template RWTexture1D G_SDerefRW(G_Texture1DRef r) { u32 idx = r.v + 1; return ResourceDescriptorHeap[idx]; } - template RWTexture2D G_SDerefRW(G_Texture2DRef r) { u32 idx = r.v + 1; return ResourceDescriptorHeap[idx]; } - template RWTexture3D G_SDerefRW(G_Texture3DRef r) { u32 idx = r.v + 1; return ResourceDescriptorHeap[idx]; } + // NOTE: Uniform dereferencing is faster than Non-Uniform on AMD hardware - 
//- Vector/Non-Uniform dereference (slower on AMD) - SamplerState G_VDeref(G_SamplerStateRef r) { u32 idx = r.v; return SamplerDescriptorHeap[NonUniformResourceIndex(idx)]; } - template StructuredBuffer G_VDeref(G_StructuredBufferRef r) { u32 idx = r.v; return ResourceDescriptorHeap[NonUniformResourceIndex(idx)]; } - ByteAddressBuffer G_VDeref(G_ByteAddressBufferRef r) { u32 idx = r.v; return ResourceDescriptorHeap[NonUniformResourceIndex(idx)]; } - template Texture1D G_VDeref(G_Texture1DRef r) { u32 idx = r.v; return ResourceDescriptorHeap[NonUniformResourceIndex(idx)]; } - template Texture2D G_VDeref(G_Texture2DRef r) { u32 idx = r.v; return ResourceDescriptorHeap[NonUniformResourceIndex(idx)]; } - template Texture3D G_VDeref(G_Texture3DRef r) { u32 idx = r.v; return ResourceDescriptorHeap[NonUniformResourceIndex(idx)]; } - template RWStructuredBuffer G_VDerefRW(G_StructuredBufferRef r) { u32 idx = r.v + 1; return ResourceDescriptorHeap[NonUniformResourceIndex(idx)]; } - RWByteAddressBuffer G_VDerefRW(G_ByteAddressBufferRef r) { u32 idx = r.v + 1; return ResourceDescriptorHeap[NonUniformResourceIndex(idx)]; } - template RWTexture1D G_VDerefRW(G_Texture1DRef r) { u32 idx = r.v + 1; return ResourceDescriptorHeap[NonUniformResourceIndex(idx)]; } - template RWTexture2D G_VDerefRW(G_Texture2DRef r) { u32 idx = r.v + 1; return ResourceDescriptorHeap[NonUniformResourceIndex(idx)]; } - template RWTexture3D G_VDerefRW(G_Texture3DRef r) { u32 idx = r.v + 1; return ResourceDescriptorHeap[NonUniformResourceIndex(idx)]; } + //- Scalar/Uniform dereference + SamplerState G_SDeref(G_SamplerStateRef r) { return SamplerDescriptorHeap[r.v]; } + template StructuredBuffer G_SDeref(G_StructuredBufferRef r) { return ResourceDescriptorHeap[r.v]; } + ByteAddressBuffer G_SDeref(G_ByteAddressBufferRef r) { return ResourceDescriptorHeap[r.v]; } + template Texture1D G_SDeref(G_Texture1DRef r) { return ResourceDescriptorHeap[r.v]; } + template Texture2D G_SDeref(G_Texture2DRef r) { return 
ResourceDescriptorHeap[r.v]; } + template Texture3D G_SDeref(G_Texture3DRef r) { return ResourceDescriptorHeap[r.v]; } + template RWStructuredBuffer G_SDerefRW(G_StructuredBufferRef r) { return ResourceDescriptorHeap[r.v + 1]; } + RWByteAddressBuffer G_SDerefRW(G_ByteAddressBufferRef r) { return ResourceDescriptorHeap[r.v + 1]; } + template RWTexture1D G_SDerefRW(G_Texture1DRef r) { return ResourceDescriptorHeap[r.v + 1]; } + template RWTexture2D G_SDerefRW(G_Texture2DRef r) { return ResourceDescriptorHeap[r.v + 1]; } + template RWTexture3D G_SDerefRW(G_Texture3DRef r) { return ResourceDescriptorHeap[r.v + 1]; } + + //- Vector/Non-Uniform dereference + SamplerState G_VDeref(G_SamplerStateRef r) { return SamplerDescriptorHeap[NonUniformResourceIndex(r.v)]; } + template StructuredBuffer G_VDeref(G_StructuredBufferRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v)]; } + ByteAddressBuffer G_VDeref(G_ByteAddressBufferRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v)]; } + template Texture1D G_VDeref(G_Texture1DRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v)]; } + template Texture2D G_VDeref(G_Texture2DRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v)]; } + template Texture3D G_VDeref(G_Texture3DRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v)]; } + template RWStructuredBuffer G_VDerefRW(G_StructuredBufferRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 1)]; } + RWByteAddressBuffer G_VDerefRW(G_ByteAddressBufferRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 1)]; } + template RWTexture1D G_VDerefRW(G_Texture1DRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 1)]; } + template RWTexture2D G_VDerefRW(G_Texture2DRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 1)]; } + template RWTexture3D G_VDerefRW(G_Texture3DRef r) { return ResourceDescriptorHeap[NonUniformResourceIndex(r.v + 1)]; } #endif 
//////////////////////////////////////////////////////////// diff --git a/src/meta/meta.c b/src/meta/meta.c index 2cbae886..5b68a5c4 100644 --- a/src/meta/meta.c +++ b/src/meta/meta.c @@ -563,6 +563,7 @@ void M_BuildEntryPoint(WaveLaneCtx *lane) PushStringToList(perm, &cp.warnings_dxc, Lit("-Wno-unused-local-typedef")); PushStringToList(perm, &cp.warnings_dxc, Lit("-Wno-conversion")); PushStringToList(perm, &cp.warnings_dxc, Lit("-Wno-switch")); + // PushStringToList(perm, &cp.warnings_dxc, Lit("-Wno-inline-asm")); // Disables false-positive "Gradient operations are not affected by wave-sensitive data or control flow." } } diff --git a/src/pp/pp_vis/pp_vis_gpu.g b/src/pp/pp_vis/pp_vis_gpu.g index 16c81621..a3743bf6 100644 --- a/src/pp/pp_vis/pp_vis_gpu.g +++ b/src/pp/pp_vis/pp_vis_gpu.g @@ -192,7 +192,7 @@ ImplComputeShader2D(V_BackdropDownCS) { bd_up = G_SDeref(frame.backdrop_mips[mip_idx - 1]); } - RWTexture2D bd_down = G_VDerefRW(frame.backdrop_mips[mip_idx]); + RWTexture2D bd_down = G_SDerefRW(frame.backdrop_mips[mip_idx]); Vec2 down_dims = countof(bd_down); @@ -316,9 +316,9 @@ ImplVertexShader(V_QuadVS, V_QuadPSInput) V_QuadPSInput result; result.sv_position = Vec4(NdcFromPos(screen_pos, frame.screen_dims).xy, 0, 1); - result.quad_idx = SV_InstanceID; result.world_pos = world_pos; result.samp_uv = samp_uv; + result.quad = quad; return result; } @@ -328,11 +328,10 @@ ImplVertexShader(V_QuadVS, V_QuadPSInput) ImplPixelShader(V_QuadPS, V_QuadPSOutput, V_QuadPSInput input) { V_SharedFrame frame = G_SDeref(V_GpuConst_Frame)[0]; - StructuredBuffer quads = G_SDeref(frame.quads); SamplerState sampler = G_SDeref(frame.basic_samplers[G_BasicSamplerKind_PointClamp]); RWTexture2D occluders = G_SDerefRW(frame.occluders); - V_Quad quad = quads[input.quad_idx]; + V_Quad quad = input.quad; Texture2D tex = G_VDeref(quad.tex); Vec2 world_pos = input.world_pos; @@ -1173,7 +1172,7 @@ ImplComputeShader2D(V_BloomDownCS) } } - if (IsInside(bloom_pos, down_dims)) + if (all(bloom_pos 
< down_dims)) { bloom_down[bloom_pos] = result; } diff --git a/src/pp/pp_vis/pp_vis_gpu.gh b/src/pp/pp_vis/pp_vis_gpu.gh index db690ca8..d1cfb891 100644 --- a/src/pp/pp_vis/pp_vis_gpu.gh +++ b/src/pp/pp_vis/pp_vis_gpu.gh @@ -4,9 +4,9 @@ Struct(V_QuadPSInput) { Semantic(Vec4, sv_position); - Semantic(nointerpolation u32, quad_idx); Semantic(Vec2, world_pos); Semantic(Vec2, samp_uv); + Semantic(nointerpolation V_Quad, quad); }; Struct(V_QuadPSOutput) diff --git a/src/ui/ui_gpu.g b/src/ui/ui_gpu.g index 2ad2bfa7..d0fb196f 100644 --- a/src/ui/ui_gpu.g +++ b/src/ui/ui_gpu.g @@ -8,8 +8,8 @@ ImplVertexShader(UI_DRectVS, UI_DRectPSInput) { UI_GpuParams params = G_SDeref(UI_GpuConst_Params)[0]; StructuredBuffer rects = G_SDeref(params.rects); - UI_GpuRect rect = rects[SV_InstanceID]; + UI_GpuRect rect = rects[SV_InstanceID]; Vec2 rect_uv = RectUvFromIdx(SV_VertexID); Vec2 tex_uv = lerp(rect.tex_slice_uv.p0, rect.tex_slice_uv.p1, rect_uv); Vec2 target_pos = lerp(rect.bounds.p0, rect.bounds.p1, rect_uv); @@ -17,15 +17,13 @@ ImplVertexShader(UI_DRectVS, UI_DRectPSInput) UI_DRectPSInput result; { result.sv_position = Vec4(NdcFromPos(target_pos, Vec2(params.target_size).xy), 0, 1); - result.rect_idx = SV_InstanceID; - result.base_background_premul = Premul(rect.background_lin); result.base_border_premul = Premul(rect.border_lin); result.tint_premul = Premul(rect.tint_lin); result.debug_premul = Premul(rect.debug_lin); - result.rect_uv = rect_uv; result.tex_uv = tex_uv; + result.rect = rect; } return result; } @@ -36,13 +34,11 @@ ImplVertexShader(UI_DRectVS, UI_DRectPSInput) ImplPixelShader(UI_DRectPS, UI_DRectPSOutput, UI_DRectPSInput input) { UI_GpuParams params = G_SDeref(UI_GpuConst_Params)[0]; - StructuredBuffer rects = G_SDeref(params.rects); SamplerState sampler = G_SDeref(params.sampler); - UI_GpuRect rect = rects[input.rect_idx]; - - Vec2 p = input.sv_position.xy; + UI_GpuRect rect = input.rect; Vec2 rect_uv = input.rect_uv; + Vec2 p = input.sv_position.xy; Vec2 p0 = 
rect.bounds.p0; Vec2 p1 = rect.bounds.p1; diff --git a/src/ui/ui_gpu.gh b/src/ui/ui_gpu.gh index 6c3a201d..93ee8fc5 100644 --- a/src/ui/ui_gpu.gh +++ b/src/ui/ui_gpu.gh @@ -4,13 +4,13 @@ Struct(UI_DRectPSInput) { Semantic(Vec4, sv_position); - Semantic(nointerpolation u32, rect_idx); Semantic(Vec4, base_background_premul); Semantic(Vec4, base_border_premul); Semantic(Vec4, tint_premul); Semantic(Vec4, debug_premul); Semantic(Vec2, rect_uv); Semantic(Vec2, tex_uv); + Semantic(nointerpolation UI_GpuRect, rect); }; Struct(UI_DRectPSOutput)