From 83a41fc289ee4bc412f881a1f5044316a9b2f188 Mon Sep 17 00:00:00 2001 From: jacob Date: Sun, 15 Feb 2026 04:11:08 -0600 Subject: [PATCH] turn composite pass into compute shader --- src/gpu/gpu_dx12/gpu_dx12_core.c | 32 ++++++++------- src/pp/pp_vis/pp_vis.lay | 3 +- src/pp/pp_vis/pp_vis_core.c | 56 +++++++++++++++----------- src/pp/pp_vis/pp_vis_gpu.g | 47 +++++++++++---------- src/pp/pp_vis/pp_vis_gpu.gh | 1 + src/pp/pp_vis/pp_vis_shared.cgh | 15 +++++-- src/proto/proto_shaders.g | 2 +- src/window/window_win32/window_win32.c | 12 ++++-- 8 files changed, 97 insertions(+), 71 deletions(-) diff --git a/src/gpu/gpu_dx12/gpu_dx12_core.c b/src/gpu/gpu_dx12/gpu_dx12_core.c index ec502e12..428bab17 100644 --- a/src/gpu/gpu_dx12/gpu_dx12_core.c +++ b/src/gpu/gpu_dx12/gpu_dx12_core.c @@ -61,12 +61,11 @@ void G_Bootstrap(void) String error = Lit("Could not initialize GPU device."); String first_gpu_name = Zi; u32 adapter_index = 0; - b32 skip = 0; // For iGPU testing - for (;;) + b32 done = 0; + i32 skips = 0; // For iGPU testing + while (!done) { - { - hr = IDXGIFactory6_EnumAdapterByGpuPreference(G_D12.factory, adapter_index, DXGI_GPU_PREFERENCE_HIGH_PERFORMANCE, &IID_IDXGIAdapter3, (void **)&adapter); - } + hr = IDXGIFactory6_EnumAdapterByGpuPreference(G_D12.factory, adapter_index, DXGI_GPU_PREFERENCE_HIGH_PERFORMANCE, &IID_IDXGIAdapter3, (void **)&adapter); if (SUCCEEDED(hr)) { DXGI_ADAPTER_DESC1 desc; @@ -80,24 +79,29 @@ void G_Bootstrap(void) // - HighestShaderModel >= D3D_SHADER_MODEL_6_6 // - ResourceBindingTier >= D3D12_RESOURCE_BINDING_TIER_3 // - EnhancedBarriersSupported == 1 + // - AtomicInt64OnDescriptorHeapResourceSupported == 1 hr = D3D12CreateDevice((IUnknown *)adapter, D3D_FEATURE_LEVEL_12_0, &IID_ID3D12Device10, (void **)&device); } - if (SUCCEEDED(hr) && !skip) + if (SUCCEEDED(hr) && skips <= 0) { - break; + done = 1; + } + else + { + skips -= 1; + adapter_index += 1; + ID3D12Device_Release(device); + IDXGIAdapter3_Release(adapter); + adapter = 0; + device = 0; } - skip = 0; - ID3D12Device_Release(device); - IDXGIAdapter3_Release(adapter); - adapter = 0; - device = 0; - ++adapter_index; } else { - break; + done = 1; } } + if (!device) { if (first_gpu_name.len > 0) diff --git a/src/pp/pp_vis/pp_vis.lay b/src/pp/pp_vis/pp_vis.lay index 35f236c5..ccf71049 100644 --- a/src/pp/pp_vis/pp_vis.lay +++ b/src/pp/pp_vis/pp_vis.lay @@ -23,8 +23,7 @@ @ComputeShader V_EmitParticlesCS @ComputeShader V_SimParticlesCS @ComputeShader V_ShadeCS -@VertexShader V_CompositeVS -@PixelShader V_CompositePS +@ComputeShader V_CompositeCS @VertexShader V_DVertVS @PixelShader V_DVertPS diff --git a/src/pp/pp_vis/pp_vis_core.c b/src/pp/pp_vis/pp_vis_core.c index e63e96f9..dfb0dc29 100644 --- a/src/pp/pp_vis/pp_vis_core.c +++ b/src/pp/pp_vis/pp_vis_core.c @@ -2566,7 +2566,8 @@ void V_TickForever(WaveLaneCtx *lane) { V_Emitter emitter = Zi; - emitter.kind = V_ParticleKind_Blood; + emitter.kind = V_ParticleKind_BloodTrail; + // emitter.kind = V_ParticleKind_BloodDebris; f32 angle = AngleFromVec2(frame->look); // f32 angle = 0; @@ -2585,19 +2586,34 @@ void V_TickForever(WaveLaneCtx *lane) emitter.speed.max = speed + speed_spread * 0.5; emitter.angle.min = angle - angle_spread * 0.5; emitter.angle.max = angle + angle_spread * 0.5; + emitter.count = Kibi(32) * frame->dt; + V_PushParticles(emitter); + } - // emitter.falloff.min = emitter.falloff.max = 0; + { + V_Emitter emitter = Zi; - // emitter.count = CeilF32(Kibi(64) * frame->dt); - // emitter.count = Mebi(16); - // emitter.count = Mebi(2); - // emitter.count = Kibi(32); - // emitter.count = Kibi(8); - emitter.count = 128; - // emitter.count = 128; - // emitter.count = 32; - // emitter.count = 1; + // emitter.kind = V_ParticleKind_BloodTrail; + emitter.kind = V_ParticleKind_BloodDebris; + f32 angle = AngleFromVec2(frame->look); + // f32 angle = 0; + f32 angle_spread = Tau * 0.25; + // f32 angle_spread = Tau; + // f32 angle_spread = 0; + + // f32 speed = 5; + f32 speed = 10; + // f32 speed = 50; + // f32 speed = 100; + f32 speed_spread = speed * 2; + + emitter.pos.p0 = emitter.pos.p1 = frame->world_cursor; + emitter.speed.min = speed - speed_spread * 0.5; + emitter.speed.max = speed + speed_spread * 0.5; + emitter.angle.min = angle - angle_spread * 0.5; + emitter.angle.max = angle + angle_spread * 0.5; + emitter.count = Kibi(32) * frame->dt; V_PushParticles(emitter); } } @@ -4829,11 +4845,12 @@ void V_TickForever(WaveLaneCtx *lane) frame->gpu_arena, frame->cl, G_Format_R16G16B16A16_Float, frame->screen_dims, - G_Layout_DirectQueue_RenderTargetWrite, - .flags = G_ResourceFlag_AllowRenderTarget, + G_Layout_DirectQueue_ShaderReadWrite, + .flags = G_ResourceFlag_AllowShaderReadWrite | G_ResourceFlag_AllowRenderTarget, .name = StringF(frame->arena, "Screen target [%F]", FmtSint(frame->tick)) ); frame->screen_ro = G_PushTexture2DRef(frame->gpu_arena, screen_target); + frame->screen_rw = G_PushRWTexture2DRef(frame->gpu_arena, screen_target); Rng3 screen_viewport = RNG3(VEC3(0, 0, 0), VEC3(frame->screen_dims.x, frame->screen_dims.y, 1)); Rng2 screen_scissor = RNG2(VEC2(screen_viewport.p0.x, screen_viewport.p0.y), VEC2(screen_viewport.p1.x, screen_viewport.p1.y)); @@ -4855,8 +4872,6 @@ void V_TickForever(WaveLaneCtx *lane) frame->shade_dims, G_Layout_DirectQueue_ShaderReadWrite, .flags = G_ResourceFlag_AllowShaderReadWrite, - // FIXME: Remove this - // .flags = G_ResourceFlag_AllowShaderReadWrite | G_ResourceFlag_ForceNoReuse, .name = StringF(frame->arena, "Shade target [%F]", FmtSint(frame->tick)) ); frame->shade_ro = G_PushTexture2DRef(frame->gpu_arena, shade_target); @@ -5015,14 +5030,9 @@ void V_TickForever(WaveLaneCtx *lane) if (!disable_vis_draw) { - G_Rasterize( - frame->cl, - V_CompositeVS, V_CompositePS, - 1, G_QuadIndices(), - 1, &G_Rt(screen_target, G_BlendMode_CompositeStraightAlpha), - screen_viewport, screen_scissor, - G_RasterMode_TriangleList - ); + G_Compute(frame->cl, V_CompositeCS, V_ThreadGroupSizeFromTexSize(frame->screen_dims)); + + G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_RenderTargetWrite); } ////////////////////////////// diff --git a/src/pp/pp_vis/pp_vis_gpu.g b/src/pp/pp_vis/pp_vis_gpu.g index 103045fa..6e02d89e 100644 --- a/src/pp/pp_vis/pp_vis_gpu.g +++ b/src/pp/pp_vis/pp_vis_gpu.g @@ -44,7 +44,7 @@ Vec4 V_ColorFromParticle(V_ParticleKind particle_kind, u32 particle_idx, u32 den // f32 t = smoothstep(0, 2, (f32)density); result.a = lerp(0, 0.85, t); } - else if (particle_kind == V_ParticleKind_Blood) + else if (particle_kind == V_ParticleKind_BloodTrail || particle_kind == V_ParticleKind_BloodDebris) { // f32 t = (f32)density / 5; // t = pow(t, 2); @@ -53,13 +53,18 @@ Vec4 V_ColorFromParticle(V_ParticleKind particle_kind, u32 particle_idx, u32 den f32 t = (f32)density / 5; // t = smoothstep(-10, 10, t); - t = smoothstep(-5, 5, t); + // t = smoothstep(-5, 5, t); + t = smoothstep(0, 50, t); // result.rgb *= 1.0 - (t * 0.9); - result.a = t; + + // result.a = t; + result.a += (1.0 - result.a) * (t); } } - result.rgb += (rand_color - 0.5) * 0.025; + result.rgb = saturate(result.rgb + (rand_color - 0.5) * 0.05); + // result.a += (rand_alpha - 0.5) * 0.025; + // result.a *= rand_alpha; // Apply dryness result.rgb *= 1.0 - (dryness * 0.75); @@ -284,6 +289,9 @@ ComputeShader(V_SimParticlesCS, 64) f32 rand_speed = Norm16(seed0 >> 32); f32 rand_falloff = Norm16(seed0 >> 48); + u64 seed1 = MixU64(seed0); + f32 rand_density = Norm16(seed1 >> 0); + ////////////////////////////// //- Init @@ -454,8 +462,10 @@ ComputeShader(V_SimParticlesCS, 64) u32 stains_count = floor(particle.stain_accum); if (stains_count > 0) { + // TODO: Fixed point + u32 density = round(stains_count * rand_density); InterlockedMax(stain_cells[cell_pos], packed); - InterlockedAdd(stain_densities[cell_pos], stains_count); + InterlockedAdd(stain_densities[cell_pos], density); drynesses[cell_pos] = 0; particle.stain_accum -= stains_count; } @@ -553,25 +563,12 @@ ComputeShader2D(V_ShadeCS, 8, 8) //////////////////////////////////////////////////////////// //~ Composite -////////////////////////////// -//- Vertex shader - -VertexShader(V_CompositeVS, V_CompositePSInput) -{ - Vec2 uv = RectUvFromIdx(SV_VertexID); - V_CompositePSInput result; - result.sv_position = Vec4(NdcFromUv(uv).xy, 0, 1); - return result; -} - -////////////////////////////// -//- Pixel shader - -PixelShader(V_CompositePS, V_CompositePSOutput, V_CompositePSInput input) +ComputeShader2D(V_CompositeCS, 8, 8) { V_SharedFrame frame = G_Dereference(V_ShaderConst_Frame)[0]; // Texture2D shade_tex = G_Dereference(frame.shade_ro); Texture2D albedo_tex = G_Dereference(frame.albedo_ro); + RWTexture2D screen_tex = G_Dereference(frame.screen_rw); RWTexture2D stain_cells = G_Dereference(frame.stain_cells); RWTexture2D ground_cells = G_Dereference(frame.ground_cells); RWTexture2D stain_densities = G_Dereference(frame.stain_densities); @@ -583,7 +580,7 @@ PixelShader(V_CompositePS, V_CompositePSOutput, V_CompositePSInput input) SamplerState clamp_sampler = G_Dereference(frame.pt_clamp_sampler); RWStructuredBuffer particles = G_Dereference(frame.particles); - Vec2 screen_pos = input.sv_position.xy; + Vec2 screen_pos = SV_DispatchThreadID.xy + 0.5; Vec2 world_pos = mul(frame.af.screen_to_world, Vec3(screen_pos, 1)); Vec2 tile_pos = mul(frame.af.world_to_tile, Vec3(world_pos, 1)); Vec2 cell_pos = mul(frame.af.world_to_cell, Vec3(world_pos, 1)); @@ -593,6 +590,7 @@ PixelShader(V_CompositePS, V_CompositePSOutput, V_CompositePSInput input) Vec2 world_bounds_screen_p0 = mul(frame.af.world_to_screen, Vec3(-half_world_dims.xy, 1)); Vec2 world_bounds_screen_p1 = mul(frame.af.world_to_screen, Vec3(half_world_dims.xy, 1)); b32 is_in_world = all(cell_pos >= 0) && all(cell_pos < countof(ground_cells)); + b32 is_in_screen = all(screen_pos >= 0) && all(screen_pos < countof(screen_tex)); P_TileKind tile = tiles[tile_pos]; P_TileKind equipped_tile = frame.equipped_tile; @@ -929,9 +927,10 @@ PixelShader(V_CompositePS, V_CompositePSOutput, V_CompositePSInput input) result = Unpremul(result); - V_CompositePSOutput output; - output.sv_target0 = result; - return output; + if (is_in_screen) + { + screen_tex[screen_pos] = result; + } } //////////////////////////////////////////////////////////// diff --git a/src/pp/pp_vis/pp_vis_gpu.gh b/src/pp/pp_vis/pp_vis_gpu.gh index 37b5ca9a..2262ede5 100644 --- a/src/pp/pp_vis/pp_vis_gpu.gh +++ b/src/pp/pp_vis/pp_vis_gpu.gh @@ -68,6 +68,7 @@ ComputeShader2D(V_ShadeCS, 8, 8); //- Composite VertexShader(V_CompositeVS, V_CompositePSInput); PixelShader(V_CompositePS, V_CompositePSOutput, V_CompositePSInput input); +ComputeShader2D(V_CompositeCS, 8, 8); //- Debug shapes VertexShader(V_DVertVS, V_DVertPSInput); diff --git a/src/pp/pp_vis/pp_vis_shared.cgh b/src/pp/pp_vis/pp_vis_shared.cgh index 64bc0452..4d400712 100644 --- a/src/pp/pp_vis/pp_vis_shared.cgh +++ b/src/pp/pp_vis/pp_vis_shared.cgh @@ -137,6 +137,7 @@ Struct(V_SharedFrame) G_Texture2DRef tiles; G_Texture2DRef screen_ro; + G_RWTexture2DRef screen_rw; G_Texture2DRef shade_ro; G_RWTexture2DRef shade_rw; G_Texture2DRef albedo_ro; @@ -198,10 +199,16 @@ Enum(V_ParticleFlag) \ /* Ground particles */ \ X( \ - /* Name */ Blood, \ - /* Flags */ V_ParticleFlag_None | V_ParticleFlag_NoReflect, \ + /* Name */ BloodTrail, \ + /* Flags */ V_ParticleFlag_NoReflect, \ /* Stain rate, pen chance */ 500, 0.25, \ - /* Base color */ 0.5, 0.1, 0.1, 1 \ + /* Base color */ 0.5, 0.1, 0.1, 0.1 \ + ) \ + X( \ + /* Name */ BloodDebris, \ + /* Flags */ V_ParticleFlag_Ground | V_ParticleFlag_PruneWhenStill | V_ParticleFlag_StainWhenPruned, \ + /* Stain rate, pen chance */ 1, 0, \ + /* Base color */ 0.5, 0.1, 0.1, 0.8 \ ) \ X( \ /* Name */ Debris, \ @@ -231,7 +238,7 @@ Enum(V_ParticleFlag) /* Stain rate, pen chance */ 0, 0, \ /* Base color */ 1, 1, 0, 1 \ ) \ -/* -------------------------------------------------------------------------- */ +/* ----------------------------------------------------------------------------------------------------------------------------------- */ Enum(V_ParticleKind) { diff --git a/src/proto/proto_shaders.g b/src/proto/proto_shaders.g index 7e271939..93714896 100644 --- a/src/proto/proto_shaders.g +++ b/src/proto/proto_shaders.g @@ -39,7 +39,7 @@ Struct(PT_BlitPSOutput) VertexShader(PT_BlitVS, PT_BlitPSInput) { - Vec2 uv = RectUvFromVertexId(SV_VertexID); + Vec2 uv = RectUvFromIdx(SV_VertexID); PT_BlitPSInput result; result.sv_position = Vec4(NdcFromUv(uv).xy, 0, 1); result.src_uv = uv; diff --git a/src/window/window_win32/window_win32.c b/src/window/window_win32/window_win32.c index a4ef973c..7d36fbb9 100644 --- a/src/window/window_win32/window_win32.c +++ b/src/window/window_win32/window_win32.c @@ -1,5 +1,10 @@ WND_W32_Ctx WND_W32 = Zi; +//////////////////////////////////////////////////////////// +//~ Win32 libs + +#pragma comment(lib, "gdi32") + //////////////////////////////////////////////////////////// //~ @hookimpl Bootstrap @@ -130,6 +135,7 @@ void WND_W32_ProcessMessagesForever(WaveLaneCtx *lane) { WND_W32_Window *window = &WND_W32.window; window->w2u_events_arena = AcquireArena(Gibi(64)); + Atomic64Set(&window->desired_cursor, (i64)WND_W32.cursors[WND_CursorKind_Default]); //- Initialize hwnd { @@ -231,7 +237,7 @@ LRESULT CALLBACK WND_W32_WindowProc(HWND hwnd, UINT msg, WPARAM wparam, LPARAM l if ((HWND)wparam == hwnd && LOWORD(lparam) == HTCLIENT) { HCURSOR desired_cursor = (HCURSOR)Atomic64Fetch(&window->desired_cursor); - b32 desired_cursor_hidden = !Atomic64Fetch(&window->desired_cursor_hidden); + b32 desired_cursor_hidden = !!Atomic64Fetch(&window->desired_cursor_hidden); if (desired_cursor != window->active_cursor) { SetCursor(desired_cursor); @@ -241,13 +247,13 @@ LRESULT CALLBACK WND_W32_WindowProc(HWND hwnd, UINT msg, WPARAM wparam, LPARAM l { if (desired_cursor_hidden) { - while (ShowCursor(1) < 0) + while (ShowCursor(0) >= 0) { } } else { - while (ShowCursor(0) >= 0) + while (ShowCursor(1) < 0) { } }