From e9bad68135f7ea039d9e5e11d2292c7c73cae3ff Mon Sep 17 00:00:00 2001 From: jacob Date: Wed, 18 Feb 2026 14:21:16 -0600 Subject: [PATCH] fix bloom shimmer. use common layouts for vis textures --- src/gpu/gpu_common.c | 52 +- src/gpu/gpu_common.h | 6 +- src/gpu/gpu_core.h | 10 +- src/pp/pp_vis/pp_vis.lay | 2 +- src/pp/pp_vis/pp_vis_core.c | 102 ++-- src/pp/pp_vis/pp_vis_gpu.g | 224 ++++---- src/pp/pp_vis/pp_vis_gpu.gh | 5 +- src/pp/pp_vis/pp_vis_shared.cg | 20 +- src/pp/pp_vis/pp_vis_shared.cgh | 20 +- tatus | 926 ++++++++++++++++++++++++++++++++ 10 files changed, 1163 insertions(+), 204 deletions(-) create mode 100644 tatus diff --git a/src/gpu/gpu_common.c b/src/gpu/gpu_common.c index a9686d87..43835793 100644 --- a/src/gpu/gpu_common.c +++ b/src/gpu/gpu_common.c @@ -25,7 +25,7 @@ void G_BootstrapCommon(void) gpu_perm, cl, G_Format_R8G8B8A8_Uint, VEC2I32(8, 8), - G_Layout_AnyQueue_ShaderRead_CopyRead_CopyWrite_Present, + G_Layout_Simultaneous, .flags = G_ResourceFlag_ZeroMemory ); G.blank_tex = G_PushTexture2DRef(gpu_perm, blank_tex); @@ -44,7 +44,7 @@ void G_BootstrapCommon(void) gpu_perm, cl, G_Format_R16_Uint, noise_dims, - G_Layout_AnyQueue_ShaderRead_CopyRead_CopyWrite_Present + G_Layout_Simultaneous ); G_CopyCpuToTexture( cl, @@ -143,30 +143,54 @@ G_ResourceHandle G_PushBufferFromCpuCopy_(G_ArenaHandle gpu_arena, G_CommandList //- Mip -i32 G_DimsFromMip1D(i32 texture_dims, i32 mip) +i32 G_DimsFromMip1D(i32 mip0_dims, i32 mip) { - mip = ClampI32(mip, 0, 31); + mip = ClampI32(mip, -31, 31); i32 result = 0; - result = MaxI32(result >> mip, 1); + if (mip >= 0) + { + result = MaxI32(mip0_dims >> mip, 1); + } + else + { + result = MaxI32(mip0_dims << -mip, 1); + } return result; } -Vec2I32 G_DimsFromMip2D(Vec2I32 texture_dims, i32 mip) +Vec2I32 G_DimsFromMip2D(Vec2I32 mip0_dims, i32 mip) { - mip = ClampI32(mip, 0, 31); + mip = ClampI32(mip, -31, 31); Vec2I32 result = Zi; - result.x = MaxI32(texture_dims.x >> mip, 1); - result.y = MaxI32(texture_dims.y >> mip, 1); + if
(mip >= 0) + { + result.x = MaxI32(mip0_dims.x >> mip, 1); + result.y = MaxI32(mip0_dims.y >> mip, 1); + } + else + { + result.x = MaxI32(mip0_dims.x << -mip, 1); + result.y = MaxI32(mip0_dims.y << -mip, 1); + } return result; } -Vec3I32 G_DimsFromMip3D(Vec3I32 texture_dims, i32 mip) +Vec3I32 G_DimsFromMip3D(Vec3I32 mip0_dims, i32 mip) { - mip = ClampI32(mip, 0, 31); + mip = ClampI32(mip, -31, 31); Vec3I32 result = Zi; - result.x = MaxI32(texture_dims.x >> mip, 1); - result.y = MaxI32(texture_dims.y >> mip, 1); - result.z = MaxI32(texture_dims.z >> mip, 1); + if (mip >= 0) + { + result.x = MaxI32(mip0_dims.x >> mip, 1); + result.y = MaxI32(mip0_dims.y >> mip, 1); + result.z = MaxI32(mip0_dims.z >> mip, 1); + } + else + { + result.x = MaxI32(mip0_dims.x << -mip, 1); + result.y = MaxI32(mip0_dims.y << -mip, 1); + result.z = MaxI32(mip0_dims.z << -mip, 1); + } return result; } diff --git a/src/gpu/gpu_common.h b/src/gpu/gpu_common.h index eb3ee6d2..03927040 100644 --- a/src/gpu/gpu_common.h +++ b/src/gpu/gpu_common.h @@ -35,9 +35,9 @@ G_ResourceHandle G_PushBufferFromCpuCopy_(G_ArenaHandle gpu_arena, G_CommandList G_PushBufferFromCpuCopy_((_arena), (_cl), (_src), (G_BufferDesc) { .size = (_src).len, __VA_ARGS__ }) //- Mip -i32 G_DimsFromMip1D(i32 texture_dims, i32 mip); -Vec2I32 G_DimsFromMip2D(Vec2I32 texture_dims, i32 mip); -Vec3I32 G_DimsFromMip3D(Vec3I32 texture_dims, i32 mip); +i32 G_DimsFromMip1D(i32 mip0_dims, i32 mip); +Vec2I32 G_DimsFromMip2D(Vec2I32 mip0_dims, i32 mip); +Vec3I32 G_DimsFromMip3D(Vec3I32 mip0_dims, i32 mip); //- Viewport / scissor Rng3 G_ViewportFromTexture(G_ResourceHandle texture); diff --git a/src/gpu/gpu_core.h b/src/gpu/gpu_core.h index 7e1b329a..bed18c93 100644 --- a/src/gpu/gpu_core.h +++ b/src/gpu/gpu_core.h @@ -242,18 +242,16 @@ Enum(G_Access) G_Access_IndexBuffer = (1 << 8), G_Access_IndirectArgument = (1 << 9), - G_Access_All = 0xFFFFFFFF + G_Access_All = 0xFFFFFFFF // Represents all accesses relevant to the specified sync stage }; 
Enum(G_Layout) { G_Layout_NoChange, - // "Simultaneous" allows a resource to be used on any queue with any access - // type, as long as there is only one writer at a time, and the writer is not - // writing to any texels currently being read. - // Resources cannot transition to/from this layout. They must be created - // with it and are locked to it. + // Simultaneous layout allows a resource to be used on any queue with any + // access type (except depth-stencil). Resources cannot transition to/from + // this layout, they must be created with it. G_Layout_Simultaneous, // D3D12_BARRIER_LAYOUT_COMMON + D3D12_RESOURCE_FLAG_ALLOW_SIMULTANEOUS_ACCESS G_Layout_Undefined, // D3D12_BARRIER_LAYOUT_UNDEFINED diff --git a/src/pp/pp_vis/pp_vis.lay b/src/pp/pp_vis/pp_vis.lay index f72dc528..2d916376 100644 --- a/src/pp/pp_vis/pp_vis.lay +++ b/src/pp/pp_vis/pp_vis.lay @@ -26,7 +26,7 @@ @ComputeShader V_CompositeCS @ComputeShader V_BloomDownCS @ComputeShader V_BloomUpCS -@ComputeShader V_PostProcessCS +@ComputeShader V_FinalizeCS @VertexShader V_DVertVS @PixelShader V_DVertPS diff --git a/src/pp/pp_vis/pp_vis_core.c b/src/pp/pp_vis/pp_vis_core.c index f2f5e6b5..338036ba 100644 --- a/src/pp/pp_vis/pp_vis_core.c +++ b/src/pp/pp_vis/pp_vis_core.c @@ -416,7 +416,7 @@ void V_TickForever(WaveLaneCtx *lane) gpu_perm, cl, G_Format_R8_Uint, tiles_dims, - G_Layout_DirectQueue_ShaderRead, + G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, .flags = G_ResourceFlag_ZeroMemory, .name = Lit("Tiles") ); @@ -441,7 +441,7 @@ void V_TickForever(WaveLaneCtx *lane) gpu_perm, cl, G_Format_R32_Uint, cells_dims, - G_Layout_DirectQueue_ShaderReadWrite, + G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, .name = StringF(perm, "Particle cells - layer %F", FmtSint(layer)) ); @@ -454,7 +454,7 @@ void V_TickForever(WaveLaneCtx *lane) gpu_perm, cl, G_Format_R32_Uint, cells_dims, - 
G_Layout_DirectQueue_ShaderReadWrite, + G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, .name = StringF(perm, "Particle densities - layer %F", FmtSint(layer)) ); @@ -469,7 +469,7 @@ void V_TickForever(WaveLaneCtx *lane) gpu_perm, cl, G_Format_R16G16B16A16_Float, cells_dims, - G_Layout_DirectQueue_ShaderReadWrite, + G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, .name = Lit("Stains") ); @@ -481,7 +481,7 @@ void V_TickForever(WaveLaneCtx *lane) gpu_perm, cl, G_Format_R16G16B16A16_Float, cells_dims, - G_Layout_DirectQueue_ShaderReadWrite, + G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, .name = Lit("Dry stains") ); @@ -493,7 +493,7 @@ void V_TickForever(WaveLaneCtx *lane) gpu_perm, cl, G_Format_R32_Float, cells_dims, - G_Layout_DirectQueue_ShaderReadWrite, + G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, .name = Lit("Drynesses") ); @@ -505,7 +505,7 @@ void V_TickForever(WaveLaneCtx *lane) gpu_perm, cl, G_Format_R32_Uint, cells_dims, - G_Layout_DirectQueue_ShaderReadWrite, + G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, .name = Lit("Occluders cells") ); @@ -614,6 +614,8 @@ void V_TickForever(WaveLaneCtx *lane) frame->dt = SecondsFromNs(frame->dt_ns); frame->rand = prev_frame->rand; + frame->should_tone_map = TweakBool("Tone mapping enabled", 1); + if (P_IsEntKeyNil(V.player_key)) { TrueRand(StringFromStruct(&V.player_key)); @@ -4918,18 +4920,17 @@ void V_TickForever(WaveLaneCtx *lane) frame->tile_descs[tile_kind] = tile_desc; } } + // Upload tiles if (frame->tiles_dirty) { // LogDebugF("Uploading tiles 
to gpu"); - G_DumbMemoryLayoutSync(frame->cl, gpu_tiles_res, G_Layout_DirectQueue_CopyWrite); G_CopyCpuToTexture( frame->cl, gpu_tiles_res, VEC3I32(0, 0, 0), local_world->tiles, VEC3I32(tiles_dims.x, tiles_dims.y, 1), RNG3I32(VEC3I32(0, 0, 0), VEC3I32(tiles_dims.x, tiles_dims.y, 1)) ); - G_DumbMemoryLayoutSync(frame->cl, gpu_tiles_res, G_Layout_DirectQueue_ShaderRead); } // Screen texture @@ -4937,7 +4938,7 @@ void V_TickForever(WaveLaneCtx *lane) frame->gpu_arena, frame->cl, G_Format_R16G16B16A16_Float, frame->screen_dims, - G_Layout_DirectQueue_ShaderReadWrite, + G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, .flags = G_ResourceFlag_AllowShaderReadWrite | G_ResourceFlag_AllowRenderTarget, .name = StringF(frame->arena, "Screen target [%F]", FmtSint(frame->tick)) ); @@ -4951,11 +4952,10 @@ void V_TickForever(WaveLaneCtx *lane) frame->gpu_arena, frame->cl, G_Format_R16G16B16A16_Float, G_DimsFromMip2D(G_Count2D(screen_target), 1), - G_Layout_DirectQueue_ShaderReadWrite, + G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, .flags = G_ResourceFlag_AllowShaderReadWrite | G_ResourceFlag_AllowRenderTarget, .name = StringF(frame->arena, "Bloom target [%F]", FmtSint(frame->tick)), - // .max_mips = 4 - .max_mips = 8 + .max_mips = 64 ); for (i32 mip_idx = 0; mip_idx < G_CountMips(bloom_target); ++mip_idx) { @@ -4979,7 +4979,7 @@ void V_TickForever(WaveLaneCtx *lane) frame->gpu_arena, frame->cl, G_Format_R16G16B16A16_Float, frame->shade_dims, - G_Layout_DirectQueue_ShaderReadWrite, + G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, .flags = G_ResourceFlag_AllowShaderReadWrite, .name = StringF(frame->arena, "Shade target [%F]", FmtSint(frame->tick)) ); @@ -5091,6 +5091,9 @@ void V_TickForever(WaveLaneCtx *lane) // Sync particles & occluders G_DumbGlobalMemorySync(frame->cl); + + // Transition albedo + G_DumbMemoryLayoutSync(frame->cl, albedo_target, G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite); } 
////////////////////////////// @@ -5113,83 +5116,63 @@ void V_TickForever(WaveLaneCtx *lane) G_Compute(frame->cl, V_ShadeCS, V_ThreadGroupSizeFromTexSize(frame->shade_dims)); } - ////////////////////////////// - //- Transition G-buffers to readonly - - { - G_DumbMemoryLayoutSync(frame->cl, albedo_target, G_Layout_DirectQueue_ShaderRead); - G_DumbMemoryLayoutSync(frame->cl, shade_target, G_Layout_DirectQueue_ShaderRead); - } - ////////////////////////////// //- Composite pass { G_Compute(frame->cl, V_CompositeCS, V_ThreadGroupSizeFromTexSize(frame->screen_dims)); - G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_ShaderRead); + // Sync screen tex + G_DumbGlobalMemorySync(frame->cl); } ////////////////////////////// //- Bloom passes { - i32 mips_count = G_CountMips(bloom_target); + i32 mips_count = G_CountMips(bloom_target) + 1; + G_SetConstant(frame->cl, V_GpuConst_MipsCount, mips_count); + + // NOTE: Because bloom mip chain starts at half screen size, mip_idx 0 + // actually represents the screen texture, while mip_idx N (N >= 1) represents + // mip N - 1 of the bloom mip chain //- Downsample + blur passes - for (i32 mip_idx = 0; mip_idx < mips_count; ++mip_idx) + for (i32 mip_idx = 1; mip_idx < mips_count; ++mip_idx) { - Vec2I32 dims = G_DimsFromMip2D(G_Count2D(bloom_target), mip_idx); - if (mip_idx == 0) - { - // Init bloom pyramid from screen target on first pass (prefilter) - gpu_flags |= V_GpuFlag_InitBloom; - G_SetConstant(frame->cl, V_GpuConst_Flags, gpu_flags); - G_SetConstant(frame->cl, V_GpuConst_BloomRead, frame->screen_ro); - } - else - { - G_DumbMemoryLayoutSync(frame->cl, bloom_target, G_Layout_DirectQueue_ShaderRead, .mips = RNGI32(mip_idx - 1, mip_idx - 1)); - G_SetConstant(frame->cl, V_GpuConst_BloomRead, frame->bloom_mips_ro[mip_idx - 1]); - } - G_SetConstant(frame->cl, V_GpuConst_BloomWrite, frame->bloom_mips_rw[mip_idx]); - { - G_Compute(frame->cl, V_BloomDownCS, V_ThreadGroupSizeFromTexSize(dims)); - } - gpu_flags &=
~V_GpuFlag_InitBloom; - G_SetConstant(frame->cl, V_GpuConst_Flags, gpu_flags); + Vec2I32 down_dims = G_DimsFromMip2D(G_Count2D(screen_target), mip_idx); + + G_SetConstant(frame->cl, V_GpuConst_MipIdx, mip_idx); + G_Compute(frame->cl, V_BloomDownCS, V_ThreadGroupSizeFromTexSize(down_dims)); + + G_DumbGlobalMemorySync(frame->cl); } //- Upsample passes for (i32 mip_idx = mips_count - 2; mip_idx >= 0; --mip_idx) { - Vec2I32 dims = G_DimsFromMip2D(G_Count2D(bloom_target), mip_idx); + Vec2I32 up_dims = G_DimsFromMip2D(G_Count2D(screen_target), mip_idx); - G_DumbMemoryLayoutSync(frame->cl, bloom_target, G_Layout_DirectQueue_ShaderReadWrite, .mips = RNGI32(mip_idx, mip_idx)); - G_DumbMemoryLayoutSync(frame->cl, bloom_target, G_Layout_DirectQueue_ShaderRead, .mips = RNGI32(mip_idx + 1, mip_idx + 1)); + G_SetConstant(frame->cl, V_GpuConst_MipIdx, mip_idx); + G_Compute(frame->cl, V_BloomUpCS, V_ThreadGroupSizeFromTexSize(up_dims)); - G_SetConstant(frame->cl, V_GpuConst_BloomRead, frame->bloom_mips_ro[mip_idx + 1]); - G_SetConstant(frame->cl, V_GpuConst_BloomWrite, frame->bloom_mips_rw[mip_idx]); - - G_Compute(frame->cl, V_BloomUpCS, V_ThreadGroupSizeFromTexSize(dims)); - } + G_DumbGlobalMemorySync(frame->cl); + } } ////////////////////////////// - //- Post process pass + //- Finalization pass { - G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_ShaderReadWrite); - G_DumbMemoryLayoutSync(frame->cl, bloom_target, G_Layout_DirectQueue_ShaderRead, .mips = RNGI32(0, 0)); - G_Compute(frame->cl, V_PostProcessCS, V_ThreadGroupSizeFromTexSize(frame->screen_dims)); + G_Compute(frame->cl, V_FinalizeCS, V_ThreadGroupSizeFromTexSize(frame->screen_dims)); } ////////////////////////////// //- Debug shapes pass - G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_RenderTargetWrite); - { + G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_RenderTargetWrite); + G_Rasterize( frame->cl, V_DVertVS, V_DVertPS, @@ -5198,12 +5181,13 @@ void 
V_TickForever(WaveLaneCtx *lane) screen_viewport, screen_scissor, G_RasterMode_TriangleList ); + + G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite); } ////////////////////////////// //- Finalize screen target - G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_ShaderRead); { Rng2 uv = Zi; uv.p0 = Vec2FromVec(screen_viewport.p0); diff --git a/src/pp/pp_vis/pp_vis_gpu.g b/src/pp/pp_vis/pp_vis_gpu.g index f8a254de..3ff31b54 100644 --- a/src/pp/pp_vis/pp_vis_gpu.g +++ b/src/pp/pp_vis/pp_vis_gpu.g @@ -53,13 +53,6 @@ Vec4 V_ColorFromParticle(V_ParticleDesc desc, u32 particle_idx, u32 density) return result; } -// ACES approximation by Krzysztof Narkowicz -// https://knarkowicz.wordpress.com/2016/01/06/aces-filmic-tone-mapping-curve/ -Vec3 V_ToneMap(Vec3 v) -{ - return saturate((v * (2.51f * v + 0.03f)) / (v * (2.43f * v + 0.59f) + 0.14f)); -} - //////////////////////////////////////////////////////////// //~ Prepare frame @@ -142,11 +135,11 @@ ComputeShader2D(V_PrepareCellsCS, 8, 8) } else if (over_stain.a > 0) { - Vec4 stain = dry_stains[cell_pos]; Vec4 dry_stain = max(dry_stains[cell_pos], 0); + Vec4 stain = dry_stain; - stain = BlendPremul(over_stain, stain); dry_stain = BlendPremul(over_dry_stain, dry_stain); + stain = BlendPremul(over_stain, stain); stains[cell_pos] = stain; dry_stains[cell_pos] = dry_stain; @@ -483,7 +476,7 @@ ComputeShader(V_SimParticlesCS, 64) particle.prev_occluder = occluder; } - if (!AnyBit(desc.flags, V_ParticleFlag_NoPruneWhenStill) && dot(particle.velocity, particle.velocity) < 0.0001) + if (dot(particle.velocity, particle.velocity) < (desc.prune_speed_threshold * desc.prune_speed_threshold)) { prune = 1; } @@ -723,7 +716,6 @@ ComputeShader2D(V_CompositeCS, 8, 8) Vec4 ground_particle_color = 0; Vec4 air_particle_color = 0; - for (V_ParticleLayer layer = (V_ParticleLayer)0; layer < V_ParticleLayer_COUNT; layer += (V_ParticleLayer)1) { RWTexture2D cells = 
G_Dereference(frame.particle_cells[layer]); @@ -752,9 +744,9 @@ ComputeShader2D(V_CompositeCS, 8, 8) // Darken wall particles / stains if (tile == P_TileKind_Wall) { - ground_particle_color *= 0.25; - air_particle_color *= 0.25; - stain_color *= 0.25; + ground_particle_color *= 0.5; + air_particle_color *= 0.5; + stain_color *= 0.5; } ////////////////////////////// @@ -972,57 +964,74 @@ ComputeShader2D(V_CompositeCS, 8, 8) //////////////////////////////////////////////////////////// //~ Bloom +////////////////////////////// +//- Downsample + ComputeShader2D(V_BloomDownCS, 8, 8) { - V_SharedFrame frame = G_Dereference(V_GpuConst_Frame)[0]; - Texture2D bloom_up = G_Dereference(V_GpuConst_BloomRead); - RWTexture2D bloom_down = G_Dereference(V_GpuConst_BloomWrite); - SamplerState sampler = G_Dereference(frame.basic_samplers[G_BasicSamplerKind_BilinearClamp]); + i32 mips_count = V_GpuConst_MipsCount; + i32 mip_idx = V_GpuConst_MipIdx; + + V_SharedFrame frame = G_Dereference(V_GpuConst_Frame)[0]; + SamplerState sampler = G_Dereference(frame.basic_samplers[G_BasicSamplerKind_BilinearClamp]); + RWTexture2D bloom_down = G_Dereference(frame.bloom_mips_rw[mip_idx - 1]); + + Texture2D bloom_up; + b32 is_first_pass = mip_idx == 1; + if (is_first_pass) + { + bloom_up = G_Dereference(frame.screen_ro); + } + else + { + bloom_up = G_Dereference(frame.bloom_mips_ro[mip_idx - 2]); + } - Vec2 up_dims = countof(bloom_up); Vec2 down_dims = countof(bloom_down); Vec2 bloom_pos = SV_DispatchThreadID + 0.5; Vec2 bloom_uv = bloom_pos / down_dims; Vec2 off_uv = 0.5 / down_dims; - b32 is_first_pass = !!(V_GpuConst_Flags & V_GpuFlag_InitBloom); - Struct(SampleDesc) { Vec2 uv; f32 weight; }; - SampleDesc samples[] = { - { bloom_uv + Vec2(0, 0), 0.5 }, - { bloom_uv + Vec2(-off_uv.x, -off_uv.y), 0.125 }, - { bloom_uv + Vec2(off_uv.x, -off_uv.y), 0.125 }, - { bloom_uv + Vec2(off_uv.x, off_uv.y), 0.125 }, - { bloom_uv + Vec2(-off_uv.x, off_uv.y), 0.125 }, - }; + f32 threshold = 0.25; + f32 knee = 
0.75; Vec4 result = 0; - for (u32 sample_idx = 0; sample_idx < countof(samples); ++sample_idx) { - SampleDesc desc = samples[sample_idx]; - Vec4 src = bloom_up.SampleLevel(sampler, desc.uv, 0); - - f32 knee_weight = 1; - if (is_first_pass) + // 5-tap sample + Struct(SampleDesc) { Vec2 uv; f32 weight; }; + SampleDesc samples[] = { + { bloom_uv + Vec2(0, 0), 0.5 }, + { bloom_uv + Vec2(-off_uv.x, -off_uv.y), 0.125 }, + { bloom_uv + Vec2(off_uv.x, -off_uv.y), 0.125 }, + { bloom_uv + Vec2(off_uv.x, off_uv.y), 0.125 }, + { bloom_uv + Vec2(-off_uv.x, off_uv.y), 0.125 }, + }; + for (u32 sample_idx = 0; sample_idx < countof(samples); ++sample_idx) { - f32 luminance = LuminanceFromColor(src); - f32 max_rgb = max(max(src.r, src.g), src.b); // So that we can get bloom on colors with high rgb, not just high luminance - f32 bright = max(luminance, (max_rgb - 1.0) * 0.5); - if (bright > 0) - { - f32 threshold = 1.0; - f32 knee = 0.5; - f32 over_threshold = max(bright - threshold, 0.0); - f32 ramp = saturate(over_threshold / knee); - knee_weight = (over_threshold * ramp * ramp) / bright; - } - else - { - knee_weight = 0; - } - } + SampleDesc desc = samples[sample_idx]; + Vec4 src = bloom_up.SampleLevel(sampler, desc.uv, 0); - result += src * desc.weight * knee_weight; + f32 knee_weight = 1; + if (is_first_pass) + { + f32 luminance = LuminanceFromColor(src); + f32 max_rgb = max(max(src.r, src.g), src.b); // So that we can get bloom on colors with high rgb, not just high luminance + f32 bright = max(luminance, (max_rgb - 1.0) * 0.5); + if (bright > 0) + { + f32 over_threshold = max(bright - threshold, 0.0); + f32 ramp = saturate(over_threshold / knee); + knee_weight = (over_threshold * ramp * ramp) / bright; + } + else + { + knee_weight = 0; + } + } + + result += src * desc.weight * knee_weight; + } } if (IsInside(bloom_pos, down_dims)) @@ -1031,52 +1040,78 @@ ComputeShader2D(V_BloomDownCS, 8, 8) } } +////////////////////////////// +//- Upsample + ComputeShader2D(V_BloomUpCS, 8, 8) 
{ - V_SharedFrame frame = G_Dereference(V_GpuConst_Frame)[0]; - Texture2D bloom_down = G_Dereference(V_GpuConst_BloomRead); - RWTexture2D bloom_up = G_Dereference(V_GpuConst_BloomWrite); - SamplerState sampler = G_Dereference(frame.basic_samplers[G_BasicSamplerKind_BilinearClamp]); + i32 mips_count = V_GpuConst_MipsCount; + i32 mip_idx = V_GpuConst_MipIdx; + + V_SharedFrame frame = G_Dereference(V_GpuConst_Frame)[0]; + SamplerState sampler = G_Dereference(frame.basic_samplers[G_BasicSamplerKind_BilinearClamp]); + Texture2D bloom_down = G_Dereference(frame.bloom_mips_ro[mip_idx]); + + b32 is_last_pass = mip_idx == 0; + RWTexture2D bloom_up; + if (is_last_pass) + { + bloom_up = G_Dereference(frame.screen_rw); + } + else + { + bloom_up = G_Dereference(frame.bloom_mips_rw[mip_idx - 1]); + } - Vec2 up_dims = countof(bloom_up); Vec2 down_dims = countof(bloom_down); + Vec2 up_dims = countof(bloom_up); Vec2 bloom_pos = SV_DispatchThreadID + 0.5; Vec2 bloom_uv = bloom_pos / up_dims; - Vec2 off_uv = 1 / up_dims; + Vec2 off_inner_uv = 1 / down_dims; + Vec2 off_outer_uv = off_inner_uv * 2; + // 13-tap sample Vec4 result = 0; { // Center - result += bloom_down.SampleLevel(sampler, bloom_uv, 0) * 4; - // Edges + result += bloom_down.SampleLevel(sampler, bloom_uv, 0) * 9.0f / 41.0f; + + // Outer Edges result += ( - bloom_down.SampleLevel(sampler, bloom_uv + Vec2(0, -off_uv.y), 0) + - bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv.x, 0), 0) + - bloom_down.SampleLevel(sampler, bloom_uv + Vec2(0, off_uv.y), 0) + - bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv.x, 0), 0) - ) * 2; - // Corners + bloom_down.SampleLevel(sampler, bloom_uv + Vec2(0, -off_outer_uv.y), 0) + + bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_outer_uv.x, 0), 0) + + bloom_down.SampleLevel(sampler, bloom_uv + Vec2(0, off_outer_uv.y), 0) + + bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_outer_uv.x, 0), 0) + ) * 3.0f / 41.0f; + + // Inner corners result += ( - 
bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv.x, -off_uv.y), 0) + - bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv.x, -off_uv.y), 0) + - bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv.x, off_uv.y), 0) + - bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv.x, off_uv.y), 0) - ); - // Normalize - result /= 16; + bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_inner_uv.x, -off_inner_uv.y), 0) + + bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_inner_uv.x, -off_inner_uv.y), 0) + + bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_inner_uv.x, off_inner_uv.y), 0) + + bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_inner_uv.x, off_inner_uv.y), 0) + ) * 4.0f / 41.0f; + + // Outer corners + result += ( + bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_outer_uv.x, -off_outer_uv.y), 0) + + bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_outer_uv.x, -off_outer_uv.y), 0) + + bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_outer_uv.x, off_outer_uv.y), 0) + + bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_outer_uv.x, off_outer_uv.y), 0) + ) * 1.0f / 41.0f; } if (IsInside(bloom_pos, up_dims)) { - bloom_up[bloom_pos] += result; + bloom_up[bloom_pos] += result * 0.75; } } //////////////////////////////////////////////////////////// -//~ Post process +//~ Finalize -ComputeShader2D(V_PostProcessCS, 8, 8) +ComputeShader2D(V_FinalizeCS, 8, 8) { V_SharedFrame frame = G_Dereference(V_GpuConst_Frame)[0]; SamplerState bilinear_sampler = G_Dereference(frame.basic_samplers[G_BasicSamplerKind_BilinearClamp]); @@ -1084,42 +1119,21 @@ ComputeShader2D(V_PostProcessCS, 8, 8) RWTexture2D screen_tex = G_Dereference(frame.screen_rw); Vec2 screen_pos = SV_DispatchThreadID + 0.5; - Vec2 screen_uv = screen_pos / frame.screen_dims; b32 is_in_screen = IsInside(screen_pos, frame.screen_dims); - - ////////////////////////////// - //- Original - - Vec4 original = 0; if (is_in_screen) { - original = screen_tex[screen_pos]; - original.rgb *= 
original.a; - } + Vec4 result = screen_tex[screen_pos]; + //- Tone map + if (frame.should_tone_map) + { + // ACES approximation by Krzysztof Narkowicz + // https://knarkowicz.wordpress.com/2016/01/06/aces-filmic-tone-mapping-curve/ + result.rgb = saturate((result.rgb * (2.51f * result.rgb + 0.03f)) / (result.rgb * (2.43f * result.rgb + 0.59f) + 0.14f)); + } - ////////////////////////////// - //- Bloom + result = Unpremul(result); - Vec4 bloom = 0; - if (is_in_screen) - { - bloom = bloom_tex.SampleLevel(bilinear_sampler, screen_uv, 0); - // bloom.rgb *= bloom.a; - } - - ////////////////////////////// - //- Compose - - Vec4 result = Vec4(0, 0, 0, 1); - result = BlendPremul(original, result); - result += bloom; - // result.rgb = V_ToneMap(result); - - result = Unpremul(result); - - if (is_in_screen) - { screen_tex[screen_pos] = result; } } diff --git a/src/pp/pp_vis/pp_vis_gpu.gh b/src/pp/pp_vis/pp_vis_gpu.gh index a47a2335..f176f2f8 100644 --- a/src/pp/pp_vis/pp_vis_gpu.gh +++ b/src/pp/pp_vis/pp_vis_gpu.gh @@ -46,7 +46,6 @@ Struct(V_DVertPSOutput) f32 V_RandFromPos(Vec3 pos); Vec4 V_ColorFromParticle(V_ParticleDesc desc, u32 particle_idx, u32 density); -Vec3 V_ToneMap(Vec3 v); //////////////////////////////////////////////////////////// //~ Shaders @@ -73,8 +72,8 @@ ComputeShader2D(V_CompositeCS, 8, 8); ComputeShader2D(V_BloomDownCS, 8, 8); ComputeShader2D(V_BloomUpCS, 8, 8); -//- Post process -ComputeShader2D(V_PostProcessCS, 8, 8); +//- Finalize +ComputeShader2D(V_FinalizeCS, 8, 8); //- Debug shapes VertexShader(V_DVertVS, V_DVertPSInput); diff --git a/src/pp/pp_vis/pp_vis_shared.cg b/src/pp/pp_vis/pp_vis_shared.cg index 2419a6f2..72f6ae8d 100644 --- a/src/pp/pp_vis/pp_vis_shared.cg +++ b/src/pp/pp_vis/pp_vis_shared.cg @@ -11,37 +11,42 @@ V_ParticleDesc V_DescFromParticleKind(V_ParticleKind kind) V_ParticleDesc result; { PERSIST Readonly V_ParticleFlag flags[V_ParticleKind_COUNT] = { - #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, 
dry_factor) flags, + #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) flags, V_ParticlesXList(X) #undef X }; PERSIST Readonly V_ParticleLayer layers[V_ParticleKind_COUNT] = { - #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) layer, + #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) layer, V_ParticlesXList(X) #undef X }; PERSIST Readonly f32 stain_rates[V_ParticleKind_COUNT] = { - #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) stain_rate, + #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) stain_rate, V_ParticlesXList(X) #undef X }; PERSIST Readonly f32 pen_rates[V_ParticleKind_COUNT] = { - #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) pen_rate, + #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) pen_rate, V_ParticlesXList(X) #undef X }; PERSIST Readonly f32 lifetimes[V_ParticleKind_COUNT] = { - #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) lifetime, + #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) lifetime, + V_ParticlesXList(X) + #undef X + }; + PERSIST Readonly f32 prune_speed_thresholds[V_ParticleKind_COUNT] = { + #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) prune_speed_threshold, V_ParticlesXList(X) #undef X }; PERSIST Readonly Vec4 base_colors[V_ParticleKind_COUNT] = { - #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) base_color, + #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) base_color, V_ParticlesXList(X) #undef X }; PERSIST Readonly Vec4 
dry_factor[V_ParticleKind_COUNT] = { - #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) dry_factor, + #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) dry_factor, V_ParticlesXList(X) #undef X }; @@ -51,6 +56,7 @@ V_ParticleDesc V_DescFromParticleKind(V_ParticleKind kind) result.stain_rate = stain_rates[kind]; result.pen_rate = pen_rates[kind]; result.lifetime = lifetimes[kind]; + result.prune_speed_threshold = prune_speed_thresholds[kind]; result.base_color = LinearFromSrgb(base_colors[kind]); result.dry_factor = LinearFromSrgb(dry_factor[kind]); } diff --git a/src/pp/pp_vis/pp_vis_shared.cgh b/src/pp/pp_vis/pp_vis_shared.cgh index 16ca6419..71d88ea5 100644 --- a/src/pp/pp_vis/pp_vis_shared.cgh +++ b/src/pp/pp_vis/pp_vis_shared.cgh @@ -9,14 +9,13 @@ Enum(V_GpuFlag) { V_GpuFlag_None = 0, - V_GpuFlag_InitBloom = (1 << 0), }; G_DeclConstant(V_GpuFlag, V_GpuConst_Flags, 0); G_DeclConstant(G_StructuredBufferRef, V_GpuConst_Frame, 1); G_DeclConstant(G_Texture3DRef, V_GpuConst_NoiseTex, 2); -G_DeclConstant(G_Texture2DRef, V_GpuConst_BloomRead, 3); -G_DeclConstant(G_RWTexture2DRef, V_GpuConst_BloomWrite, 4); +G_DeclConstant(i32, V_GpuConst_MipsCount, 3); +G_DeclConstant(i32, V_GpuConst_MipIdx, 4); //////////////////////////////////////////////////////////// //~ Particle types @@ -29,7 +28,6 @@ G_DeclConstant(G_RWTexture2DRef, V_GpuConst_BloomWrite, 4); Enum(V_ParticleFlag) { V_ParticleFlag_None = 0, - V_ParticleFlag_NoPruneWhenStill = (1 << 0), V_ParticleFlag_StainWhenPruned = (1 << 1), V_ParticleFlag_NoReflect = (1 << 2), V_ParticleFlag_OnlyCollideWithWalls = (1 << 3), @@ -53,6 +51,7 @@ Enum(V_ParticleLayer) /* Layer */ V_ParticleLayer_Ground, \ /* Stain rate, pen chance */ 30, 0, \ /* Lifetime */ Inf, \ + /* Prune speed threshold */ 0.01, \ /* Base color */ CompVec4(0, 0, 0, 0), \ /* Dry color factor */ CompVec4(1, 1, 1, 1) \ ) \ @@ -64,8 +63,9 @@ Enum(V_ParticleLayer) /* 
Layer */ V_ParticleLayer_Ground, \ /* Stain rate, pen chance */ 100, 0.25, \ /* Lifetime */ Inf, \ - /* Base color */ CompVec4(0.5, 0.1, 0.1, 0.05), \ - /* Dry color factor */ CompVec4(0.5, 0.5, 0.5, 1) \ + /* Prune speed threshold */ 0.5, \ + /* Base color */ CompVec4(0.6, 0.1, 0.1, 0.05), \ + /* Dry color factor */ CompVec4(0.4, 0.4, 0.4, 1) \ ) \ X( \ /* Name */ BloodDebris, \ @@ -73,6 +73,7 @@ Enum(V_ParticleLayer) /* Layer */ V_ParticleLayer_Mid, \ /* Stain rate, pen chance */ 30, 0, \ /* Lifetime */ Inf, \ + /* Prune speed threshold */ 0.01, \ /* Base color */ CompVec4(0.5, 0.1, 0.1, 0.8), \ /* Dry color factor */ CompVec4(1, 1, 1, 1) \ ) \ @@ -82,6 +83,7 @@ Enum(V_ParticleLayer) /* Layer */ V_ParticleLayer_Mid, \ /* Stain rate, pen chance */ 0, 0, \ /* Lifetime */ Inf, \ + /* Prune speed threshold */ 0.01, \ /* Base color */ CompVec4(0.4, 0.3, 0.2, 1), \ /* Dry color factor */ CompVec4(1, 1, 1, 1) \ ) \ @@ -91,6 +93,7 @@ Enum(V_ParticleLayer) /* Layer */ V_ParticleLayer_Mid, \ /* Stain rate, pen chance */ 0, 0, \ /* Lifetime */ Inf, \ + /* Prune speed threshold */ 0.1, \ /* Base color */ CompVec4(2, 0.5, 0, 1), \ /* Dry color factor */ CompVec4(0.2, 0.1, 0.0, 1) \ ) \ @@ -102,6 +105,7 @@ Enum(V_ParticleLayer) /* Layer */ V_ParticleLayer_Mid, \ /* Stain rate, pen chance */ 0, 0, \ /* Lifetime */ 0.075, \ + /* Prune speed threshold */ 0.01, \ /* Base color */ CompVec4(0.8, 0.6, 0.2, 0.25), \ /* Dry color factor */ CompVec4(1, 1, 1, 1) \ ) \ @@ -111,6 +115,7 @@ Enum(V_ParticleLayer) /* Layer */ V_ParticleLayer_Air, \ /* Stain rate, pen chance */ 0, 0, \ /* Lifetime */ Inf, \ + /* Prune speed threshold */ 0.01, \ /* Base color */ CompVec4(0.25, 0.25, 0.25, 0.75), \ /* Dry color factor */ CompVec4(1, 1, 1, 1) \ ) \ @@ -122,6 +127,7 @@ Enum(V_ParticleLayer) /* Layer */ V_ParticleLayer_Mid, \ /* Stain rate, pen chance */ 0, 0, \ /* Lifetime */ Inf, \ + /* Prune speed threshold */ 0.01, \ /* Base color */ CompVec4(1, 1, 0, 1), \ /* Dry color factor */ CompVec4(1, 1, 
1, 1) \ ) \ @@ -168,6 +174,7 @@ Struct(V_ParticleDesc) f32 stain_rate; f32 pen_rate; f32 lifetime; + f32 prune_speed_threshold; Vec4 base_color; Vec4 dry_factor; }; @@ -264,6 +271,7 @@ Struct(V_SharedFrame) b32 tiles_dirty; b32 should_clear_particles; + b32 should_tone_map; b32 is_looking; b32 is_moving; diff --git a/tatus b/tatus new file mode 100644 index 00000000..1be2e11b --- /dev/null +++ b/tatus @@ -0,0 +1,926 @@ +diff --git a/src/gpu/gpu_common.c b/src/gpu/gpu_common.c +index a9686d87..43835793 100644 +--- a/src/gpu/gpu_common.c ++++ b/src/gpu/gpu_common.c +@@ -25,7 +25,7 @@ void G_BootstrapCommon(void) + gpu_perm, cl, + G_Format_R8G8B8A8_Uint, + VEC2I32(8, 8), +- G_Layout_AnyQueue_ShaderRead_CopyRead_CopyWrite_Present, ++ G_Layout_Simultaneous, + .flags = G_ResourceFlag_ZeroMemory + ); + G.blank_tex = G_PushTexture2DRef(gpu_perm, blank_tex); +@@ -44,7 +44,7 @@ void G_BootstrapCommon(void) + gpu_perm, cl, + G_Format_R16_Uint, + noise_dims, +- G_Layout_AnyQueue_ShaderRead_CopyRead_CopyWrite_Present ++ G_Layout_Simultaneous + ); + G_CopyCpuToTexture( + cl, +@@ -143,30 +143,54 @@ G_ResourceHandle G_PushBufferFromCpuCopy_(G_ArenaHandle gpu_arena, G_CommandList +  + //- Mip +  +-i32 G_DimsFromMip1D(i32 texture_dims, i32 mip) ++i32 G_DimsFromMip1D(i32 mip0_dims, i32 mip) + { +- mip = ClampI32(mip, 0, 31); ++ mip = ClampI32(mip, -31, 31); + i32 result = 0; +- result = MaxI32(result >> mip, 1); ++ if (mip >= 0) ++ { ++ result = MaxI32(result >> mip, 1); ++ } ++ else ++ { ++ result = MaxI32(result << -mip, 1); ++ } + return result; + } +  +-Vec2I32 G_DimsFromMip2D(Vec2I32 texture_dims, i32 mip) ++Vec2I32 G_DimsFromMip2D(Vec2I32 mip0_dims, i32 mip) + { +- mip = ClampI32(mip, 0, 31); ++ mip = ClampI32(mip, -31, 31); + Vec2I32 result = Zi; +- result.x = MaxI32(texture_dims.x >> mip, 1); +- result.y = MaxI32(texture_dims.y >> mip, 1); ++ if (mip >= 0) ++ { ++ result.x = MaxI32(mip0_dims.x >> mip, 1); ++ result.y = MaxI32(mip0_dims.y >> mip, 1); ++ } ++ else ++ { ++ 
result.x = MaxI32(mip0_dims.x << -mip, 1); ++ result.y = MaxI32(mip0_dims.y << -mip, 1); ++ } + return result; + } +  +-Vec3I32 G_DimsFromMip3D(Vec3I32 texture_dims, i32 mip) ++Vec3I32 G_DimsFromMip3D(Vec3I32 mip0_dims, i32 mip) + { +- mip = ClampI32(mip, 0, 31); ++ mip = ClampI32(mip, -31, 31); + Vec3I32 result = Zi; +- result.x = MaxI32(texture_dims.x >> mip, 1); +- result.y = MaxI32(texture_dims.y >> mip, 1); +- result.z = MaxI32(texture_dims.z >> mip, 1); ++ if (mip >= 0) ++ { ++ result.x = MaxI32(mip0_dims.x >> mip, 1); ++ result.y = MaxI32(mip0_dims.y >> mip, 1); ++ result.z = MaxI32(mip0_dims.z >> mip, 1); ++ } ++ else ++ { ++ result.x = MaxI32(mip0_dims.x << -mip, 1); ++ result.y = MaxI32(mip0_dims.y << -mip, 1); ++ result.z = MaxI32(mip0_dims.z << -mip, 1); ++ } + return result; + } +  +diff --git a/src/gpu/gpu_common.h b/src/gpu/gpu_common.h +index eb3ee6d2..03927040 100644 +--- a/src/gpu/gpu_common.h ++++ b/src/gpu/gpu_common.h +@@ -35,9 +35,9 @@ G_ResourceHandle G_PushBufferFromCpuCopy_(G_ArenaHandle gpu_arena, G_CommandList + G_PushBufferFromCpuCopy_((_arena), (_cl), (_src), (G_BufferDesc) { .size = (_src).len, __VA_ARGS__ }) +  + //- Mip +-i32 G_DimsFromMip1D(i32 texture_dims, i32 mip); +-Vec2I32 G_DimsFromMip2D(Vec2I32 texture_dims, i32 mip); +-Vec3I32 G_DimsFromMip3D(Vec3I32 texture_dims, i32 mip); ++i32 G_DimsFromMip1D(i32 mip0_dims, i32 mip); ++Vec2I32 G_DimsFromMip2D(Vec2I32 mip0_dims, i32 mip); ++Vec3I32 G_DimsFromMip3D(Vec3I32 mip0_dims, i32 mip); +  + //- Viewport / scissor + Rng3 G_ViewportFromTexture(G_ResourceHandle texture); +diff --git a/src/gpu/gpu_core.h b/src/gpu/gpu_core.h +index 7e1b329a..bed18c93 100644 +--- a/src/gpu/gpu_core.h ++++ b/src/gpu/gpu_core.h +@@ -242,18 +242,16 @@ Enum(G_Access) + G_Access_IndexBuffer = (1 << 8), + G_Access_IndirectArgument = (1 << 9), +  +- G_Access_All = 0xFFFFFFFF ++ G_Access_All = 0xFFFFFFFF // Represents all accesses relevant to the specified sync stage + }; +  + Enum(G_Layout) + { + 
G_Layout_NoChange, +  +- // "Simultaneous" allows a resource to be used on any queue with any access +- // type, as long as there is only one writer at a time, and the writer is not +- // writing to any texels currently being read. +- // Resources cannot transition to/from this layout. They must be created +- // with it and are locked to it. ++ // Simultaneous layout allows a resource to be used on any queue with any ++ // access type (except depth-stencil). Resources cannot transition to/from ++ // this layout, they must be created with it. + G_Layout_Simultaneous, // D3D12_BARRIER_LAYOUT_COMMON + D3D12_RESOURCE_FLAG_ALLOW_SIMULTANEOUS_ACCESS +  + G_Layout_Undefined, // D3D12_BARRIER_LAYOUT_UNDEFINED +diff --git a/src/pp/pp_vis/pp_vis.lay b/src/pp/pp_vis/pp_vis.lay +index f72dc528..2d916376 100644 +--- a/src/pp/pp_vis/pp_vis.lay ++++ b/src/pp/pp_vis/pp_vis.lay +@@ -26,7 +26,7 @@ + @ComputeShader V_CompositeCS + @ComputeShader V_BloomDownCS + @ComputeShader V_BloomUpCS +-@ComputeShader V_PostProcessCS ++@ComputeShader V_FinalizeCS + @VertexShader V_DVertVS + @PixelShader V_DVertPS +  +diff --git a/src/pp/pp_vis/pp_vis_core.c b/src/pp/pp_vis/pp_vis_core.c +index f2f5e6b5..338036ba 100644 +--- a/src/pp/pp_vis/pp_vis_core.c ++++ b/src/pp/pp_vis/pp_vis_core.c +@@ -416,7 +416,7 @@ void V_TickForever(WaveLaneCtx *lane) + gpu_perm, cl, + G_Format_R8_Uint, + tiles_dims, +- G_Layout_DirectQueue_ShaderRead, ++ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, + .flags = G_ResourceFlag_ZeroMemory, + .name = Lit("Tiles") + ); +@@ -441,7 +441,7 @@ void V_TickForever(WaveLaneCtx *lane) + gpu_perm, cl, + G_Format_R32_Uint, + cells_dims, +- G_Layout_DirectQueue_ShaderReadWrite, ++ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, + .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, + .name = StringF(perm, "Particle cells - layer %F", FmtSint(layer)) + ); +@@ -454,7 +454,7 @@ void V_TickForever(WaveLaneCtx *lane) + 
gpu_perm, cl, + G_Format_R32_Uint, + cells_dims, +- G_Layout_DirectQueue_ShaderReadWrite, ++ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, + .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, + .name = StringF(perm, "Particle densities - layer %F", FmtSint(layer)) + ); +@@ -469,7 +469,7 @@ void V_TickForever(WaveLaneCtx *lane) + gpu_perm, cl, + G_Format_R16G16B16A16_Float, + cells_dims, +- G_Layout_DirectQueue_ShaderReadWrite, ++ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, + .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, + .name = Lit("Stains") + ); +@@ -481,7 +481,7 @@ void V_TickForever(WaveLaneCtx *lane) + gpu_perm, cl, + G_Format_R16G16B16A16_Float, + cells_dims, +- G_Layout_DirectQueue_ShaderReadWrite, ++ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, + .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, + .name = Lit("Dry stains") + ); +@@ -493,7 +493,7 @@ void V_TickForever(WaveLaneCtx *lane) + gpu_perm, cl, + G_Format_R32_Float, + cells_dims, +- G_Layout_DirectQueue_ShaderReadWrite, ++ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, + .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, + .name = Lit("Drynesses") + ); +@@ -505,7 +505,7 @@ void V_TickForever(WaveLaneCtx *lane) + gpu_perm, cl, + G_Format_R32_Uint, + cells_dims, +- G_Layout_DirectQueue_ShaderReadWrite, ++ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, + .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, + .name = Lit("Occluders cells") + ); +@@ -614,6 +614,8 @@ void V_TickForever(WaveLaneCtx *lane) + frame->dt = SecondsFromNs(frame->dt_ns); + frame->rand = prev_frame->rand; +  ++ frame->should_tone_map = TweakBool("Tone mapping enabled", 1); ++ + if (P_IsEntKeyNil(V.player_key)) + { + TrueRand(StringFromStruct(&V.player_key)); +@@ -4918,18 +4920,17 @@ void 
V_TickForever(WaveLaneCtx *lane) + frame->tile_descs[tile_kind] = tile_desc; + } + } ++ + // Upload tiles + if (frame->tiles_dirty) + { + // LogDebugF("Uploading tiles to gpu"); +- G_DumbMemoryLayoutSync(frame->cl, gpu_tiles_res, G_Layout_DirectQueue_CopyWrite); + G_CopyCpuToTexture( + frame->cl, + gpu_tiles_res, VEC3I32(0, 0, 0), + local_world->tiles, VEC3I32(tiles_dims.x, tiles_dims.y, 1), + RNG3I32(VEC3I32(0, 0, 0), VEC3I32(tiles_dims.x, tiles_dims.y, 1)) + ); +- G_DumbMemoryLayoutSync(frame->cl, gpu_tiles_res, G_Layout_DirectQueue_ShaderRead); + } +  + // Screen texture +@@ -4937,7 +4938,7 @@ void V_TickForever(WaveLaneCtx *lane) + frame->gpu_arena, frame->cl, + G_Format_R16G16B16A16_Float, + frame->screen_dims, +- G_Layout_DirectQueue_ShaderReadWrite, ++ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, + .flags = G_ResourceFlag_AllowShaderReadWrite | G_ResourceFlag_AllowRenderTarget, + .name = StringF(frame->arena, "Screen target [%F]", FmtSint(frame->tick)) + ); +@@ -4951,11 +4952,10 @@ void V_TickForever(WaveLaneCtx *lane) + frame->gpu_arena, frame->cl, + G_Format_R16G16B16A16_Float, + G_DimsFromMip2D(G_Count2D(screen_target), 1), +- G_Layout_DirectQueue_ShaderReadWrite, ++ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, + .flags = G_ResourceFlag_AllowShaderReadWrite | G_ResourceFlag_AllowRenderTarget, + .name = StringF(frame->arena, "Bloom target [%F]", FmtSint(frame->tick)), +- // .max_mips = 4 +- .max_mips = 8 ++ .max_mips = 64 + ); + for (i32 mip_idx = 0; mip_idx < G_CountMips(bloom_target); ++mip_idx) + { +@@ -4979,7 +4979,7 @@ void V_TickForever(WaveLaneCtx *lane) + frame->gpu_arena, frame->cl, + G_Format_R16G16B16A16_Float, + frame->shade_dims, +- G_Layout_DirectQueue_ShaderReadWrite, ++ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, + .flags = G_ResourceFlag_AllowShaderReadWrite, + .name = StringF(frame->arena, "Shade target [%F]", FmtSint(frame->tick)) + ); +@@ -5091,6 +5091,9 @@ void 
V_TickForever(WaveLaneCtx *lane) +  + // Sync particles & occluders + G_DumbGlobalMemorySync(frame->cl); ++ ++ // Transition albedo ++ G_DumbMemoryLayoutSync(frame->cl, albedo_target, G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite); + } +  + ////////////////////////////// +@@ -5113,83 +5116,63 @@ void V_TickForever(WaveLaneCtx *lane) + G_Compute(frame->cl, V_ShadeCS, V_ThreadGroupSizeFromTexSize(frame->shade_dims)); + } +  +- ////////////////////////////// +- //- Transition G-buffers to readonly +- +- { +- G_DumbMemoryLayoutSync(frame->cl, albedo_target, G_Layout_DirectQueue_ShaderRead); +- G_DumbMemoryLayoutSync(frame->cl, shade_target, G_Layout_DirectQueue_ShaderRead); +- } +- + ////////////////////////////// + //- Composite pass +  + { + G_Compute(frame->cl, V_CompositeCS, V_ThreadGroupSizeFromTexSize(frame->screen_dims)); +  +- G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_ShaderRead); ++ // Sync screen tex ++ G_DumbGlobalMemorySync(frame->cl); + } +  + ////////////////////////////// + //- Bloom passes +  + { +- i32 mips_count = G_CountMips(bloom_target); ++ i32 mips_count = G_CountMips(bloom_target) + 1; ++ G_SetConstant(frame->cl, V_GpuConst_MipsCount, mips_count); ++ ++ // NOTE: Because bloom mip chain starts at half screen size, mip_idx 0 ++ // actually represents the screen texture, while mip_idx - 1 represents ++ // the first mip index in the bloom mip chain +  + //- Downsample + blur passes +- for (i32 mip_idx = 0; mip_idx < mips_count; ++mip_idx) ++ for (i32 mip_idx = 1; mip_idx < mips_count; ++mip_idx) + { +- Vec2I32 dims = G_DimsFromMip2D(G_Count2D(bloom_target), mip_idx); +- if (mip_idx == 0) +- { +- // Init bloom pyramid from screen target on first pass (prefilter) +- gpu_flags |= V_GpuFlag_InitBloom; +- G_SetConstant(frame->cl, V_GpuConst_Flags, gpu_flags); +- G_SetConstant(frame->cl, V_GpuConst_BloomRead, frame->screen_ro); +- } +- else +- { +- G_DumbMemoryLayoutSync(frame->cl, bloom_target, 
G_Layout_DirectQueue_ShaderRead, .mips = RNGI32(mip_idx - 1, mip_idx - 1)); +- G_SetConstant(frame->cl, V_GpuConst_BloomRead, frame->bloom_mips_ro[mip_idx - 1]); +- } +- G_SetConstant(frame->cl, V_GpuConst_BloomWrite, frame->bloom_mips_rw[mip_idx]); +- { +- G_Compute(frame->cl, V_BloomDownCS, V_ThreadGroupSizeFromTexSize(dims)); +- } +- gpu_flags &= ~V_GpuFlag_InitBloom; +- G_SetConstant(frame->cl, V_GpuConst_Flags, gpu_flags); ++ Vec2I32 down_dims = G_DimsFromMip2D(G_Count2D(screen_target), mip_idx); ++ ++ G_SetConstant(frame->cl, V_GpuConst_MipIdx, mip_idx); ++ G_Compute(frame->cl, V_BloomDownCS, V_ThreadGroupSizeFromTexSize(down_dims)); ++ ++ G_DumbGlobalMemorySync(frame->cl); + } +  + //- Upsample passes + for (i32 mip_idx = mips_count - 2; mip_idx >= 0; --mip_idx) + { +- Vec2I32 dims = G_DimsFromMip2D(G_Count2D(bloom_target), mip_idx); +- +- G_DumbMemoryLayoutSync(frame->cl, bloom_target, G_Layout_DirectQueue_ShaderReadWrite, .mips = RNGI32(mip_idx, mip_idx)); +- G_DumbMemoryLayoutSync(frame->cl, bloom_target, G_Layout_DirectQueue_ShaderRead, .mips = RNGI32(mip_idx + 1, mip_idx + 1)); ++ Vec2I32 up_dims = G_DimsFromMip2D(G_Count2D(screen_target), mip_idx); +  +- G_SetConstant(frame->cl, V_GpuConst_BloomRead, frame->bloom_mips_ro[mip_idx + 1]); +- G_SetConstant(frame->cl, V_GpuConst_BloomWrite, frame->bloom_mips_rw[mip_idx]); ++ G_SetConstant(frame->cl, V_GpuConst_MipIdx, mip_idx); ++ G_Compute(frame->cl, V_BloomUpCS, V_ThreadGroupSizeFromTexSize(up_dims)); +  +- G_Compute(frame->cl, V_BloomUpCS, V_ThreadGroupSizeFromTexSize(dims)); +- } ++ G_DumbGlobalMemorySync(frame->cl); ++ } + } +  + ////////////////////////////// +- //- Post process pass ++ //- Finalization pass +  + { +- G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_ShaderReadWrite); +- G_DumbMemoryLayoutSync(frame->cl, bloom_target, G_Layout_DirectQueue_ShaderRead, .mips = RNGI32(0, 0)); +- G_Compute(frame->cl, V_PostProcessCS, V_ThreadGroupSizeFromTexSize(frame->screen_dims)); 
++ G_Compute(frame->cl, V_FinalizeCS, V_ThreadGroupSizeFromTexSize(frame->screen_dims)); + } +  + ////////////////////////////// + //- Debug shapes pass +  +- G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_RenderTargetWrite); +- + { ++ G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_RenderTargetWrite); ++ + G_Rasterize( + frame->cl, + V_DVertVS, V_DVertPS, +@@ -5198,12 +5181,13 @@ void V_TickForever(WaveLaneCtx *lane) + screen_viewport, screen_scissor, + G_RasterMode_TriangleList + ); ++ ++ G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite); + } +  + ////////////////////////////// + //- Finalize screen target +  +- G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_ShaderRead); + { + Rng2 uv = Zi; + uv.p0 = Vec2FromVec(screen_viewport.p0); +diff --git a/src/pp/pp_vis/pp_vis_gpu.g b/src/pp/pp_vis/pp_vis_gpu.g +index f8a254de..c0a9e47d 100644 +--- a/src/pp/pp_vis/pp_vis_gpu.g ++++ b/src/pp/pp_vis/pp_vis_gpu.g +@@ -53,13 +53,6 @@ Vec4 V_ColorFromParticle(V_ParticleDesc desc, u32 particle_idx, u32 density) + return result; + } +  +-// ACES approximation by Krzysztof Narkowicz +-// https://knarkowicz.wordpress.com/2016/01/06/aces-filmic-tone-mapping-curve/ +-Vec3 V_ToneMap(Vec3 v) +-{ +- return saturate((v * (2.51f * v + 0.03f)) / (v * (2.43f * v + 0.59f) + 0.14f)); +-} +- + //////////////////////////////////////////////////////////// + //~ Prepare frame +  +@@ -142,11 +135,11 @@ ComputeShader2D(V_PrepareCellsCS, 8, 8) + } + else if (over_stain.a > 0) + { +- Vec4 stain = dry_stains[cell_pos]; + Vec4 dry_stain = max(dry_stains[cell_pos], 0); ++ Vec4 stain = dry_stain; +  +- stain = BlendPremul(over_stain, stain); + dry_stain = BlendPremul(over_dry_stain, dry_stain); ++ stain = BlendPremul(over_stain, stain); +  + stains[cell_pos] = stain; + dry_stains[cell_pos] = dry_stain; +@@ -483,7 +476,7 @@ ComputeShader(V_SimParticlesCS, 64) + 
particle.prev_occluder = occluder; + } +  +- if (!AnyBit(desc.flags, V_ParticleFlag_NoPruneWhenStill) && dot(particle.velocity, particle.velocity) < 0.0001) ++ if (dot(particle.velocity, particle.velocity) < (desc.prune_speed_threshold * desc.prune_speed_threshold)) + { + prune = 1; + } +@@ -723,7 +716,6 @@ ComputeShader2D(V_CompositeCS, 8, 8) + Vec4 ground_particle_color = 0; + Vec4 air_particle_color = 0; +  +- + for (V_ParticleLayer layer = (V_ParticleLayer)0; layer < V_ParticleLayer_COUNT; layer += (V_ParticleLayer)1) + { + RWTexture2D cells = G_Dereference(frame.particle_cells[layer]); +@@ -752,9 +744,9 @@ ComputeShader2D(V_CompositeCS, 8, 8) + // Darken wall particles / stains + if (tile == P_TileKind_Wall) + { +- ground_particle_color *= 0.25; +- air_particle_color *= 0.25; +- stain_color *= 0.25; ++ ground_particle_color *= 0.5; ++ air_particle_color *= 0.5; ++ stain_color *= 0.5; + } +  + ////////////////////////////// +@@ -972,57 +964,73 @@ ComputeShader2D(V_CompositeCS, 8, 8) + //////////////////////////////////////////////////////////// + //~ Bloom +  ++////////////////////////////// ++//- Downsample ++ + ComputeShader2D(V_BloomDownCS, 8, 8) + { ++ i32 mips_count = V_GpuConst_MipsCount; ++ i32 mip_idx = V_GpuConst_MipIdx; ++ + V_SharedFrame frame = G_Dereference(V_GpuConst_Frame)[0]; +- Texture2D bloom_up = G_Dereference(V_GpuConst_BloomRead); +- RWTexture2D bloom_down = G_Dereference(V_GpuConst_BloomWrite); + SamplerState sampler = G_Dereference(frame.basic_samplers[G_BasicSamplerKind_BilinearClamp]); ++ RWTexture2D bloom_down = G_Dereference(frame.bloom_mips_rw[mip_idx - 1]); ++ ++ Texture2D bloom_up; ++ b32 is_first_pass = mip_idx == 1; ++ if (is_first_pass) ++ { ++ bloom_up = G_Dereference(frame.screen_ro); ++ } ++ else ++ { ++ bloom_up = G_Dereference(frame.bloom_mips_ro[mip_idx - 2]); ++ } +  +- Vec2 up_dims = countof(bloom_up); + Vec2 down_dims = countof(bloom_down); +  + Vec2 bloom_pos = SV_DispatchThreadID + 0.5; + Vec2 bloom_uv = bloom_pos / 
down_dims; + Vec2 off_uv = 0.5 / down_dims; +- b32 is_first_pass = !!(V_GpuConst_Flags & V_GpuFlag_InitBloom); +  +- Struct(SampleDesc) { Vec2 uv; f32 weight; }; +- SampleDesc samples[] = { +- { bloom_uv + Vec2(0, 0), 0.5 }, +- { bloom_uv + Vec2(-off_uv.x, -off_uv.y), 0.125 }, +- { bloom_uv + Vec2(off_uv.x, -off_uv.y), 0.125 }, +- { bloom_uv + Vec2(off_uv.x, off_uv.y), 0.125 }, +- { bloom_uv + Vec2(-off_uv.x, off_uv.y), 0.125 }, +- }; ++ f32 threshold = 0.25; ++ f32 knee = 0.75; +  + Vec4 result = 0; +- for (u32 sample_idx = 0; sample_idx < countof(samples); ++sample_idx) + { +- SampleDesc desc = samples[sample_idx]; +- Vec4 src = bloom_up.SampleLevel(sampler, desc.uv, 0); +- +- f32 knee_weight = 1; +- if (is_first_pass) ++ Struct(SampleDesc) { Vec2 uv; f32 weight; }; ++ SampleDesc samples[] = { ++ { bloom_uv + Vec2(0, 0), 0.5 }, ++ { bloom_uv + Vec2(-off_uv.x, -off_uv.y), 0.125 }, ++ { bloom_uv + Vec2(off_uv.x, -off_uv.y), 0.125 }, ++ { bloom_uv + Vec2(off_uv.x, off_uv.y), 0.125 }, ++ { bloom_uv + Vec2(-off_uv.x, off_uv.y), 0.125 }, ++ }; ++ for (u32 sample_idx = 0; sample_idx < countof(samples); ++sample_idx) + { +- f32 luminance = LuminanceFromColor(src); +- f32 max_rgb = max(max(src.r, src.g), src.b); // So that we can get bloom on colors with high rgb, not just high luminance +- f32 bright = max(luminance, (max_rgb - 1.0) * 0.5); +- if (bright > 0) +- { +- f32 threshold = 1.0; +- f32 knee = 0.5; +- f32 over_threshold = max(bright - threshold, 0.0); +- f32 ramp = saturate(over_threshold / knee); +- knee_weight = (over_threshold * ramp * ramp) / bright; +- } +- else ++ SampleDesc desc = samples[sample_idx]; ++ Vec4 src = bloom_up.SampleLevel(sampler, desc.uv, 0); ++ ++ f32 knee_weight = 1; ++ if (is_first_pass) + { +- knee_weight = 0; ++ f32 luminance = LuminanceFromColor(src); ++ f32 max_rgb = max(max(src.r, src.g), src.b); // So that we can get bloom on colors with high rgb, not just high luminance ++ f32 bright = max(luminance, (max_rgb - 1.0) * 0.5); ++ if 
(bright > 0) ++ { ++ f32 over_threshold = max(bright - threshold, 0.0); ++ f32 ramp = saturate(over_threshold / knee); ++ knee_weight = (over_threshold * ramp * ramp) / bright; ++ } ++ else ++ { ++ knee_weight = 0; ++ } + } +- } +  +- result += src * desc.weight * knee_weight; ++ result += src * desc.weight * knee_weight; ++ } + } +  + if (IsInside(bloom_pos, down_dims)) +@@ -1031,52 +1039,77 @@ ComputeShader2D(V_BloomDownCS, 8, 8) + } + } +  ++////////////////////////////// ++//- Upsample ++ + ComputeShader2D(V_BloomUpCS, 8, 8) + { ++ i32 mips_count = V_GpuConst_MipsCount; ++ i32 mip_idx = V_GpuConst_MipIdx; ++ + V_SharedFrame frame = G_Dereference(V_GpuConst_Frame)[0]; +- Texture2D bloom_down = G_Dereference(V_GpuConst_BloomRead); +- RWTexture2D bloom_up = G_Dereference(V_GpuConst_BloomWrite); + SamplerState sampler = G_Dereference(frame.basic_samplers[G_BasicSamplerKind_BilinearClamp]); ++ Texture2D bloom_down = G_Dereference(frame.bloom_mips_ro[mip_idx]); ++ ++ b32 is_last_pass = mip_idx == 0; ++ RWTexture2D bloom_up; ++ if (is_last_pass) ++ { ++ bloom_up = G_Dereference(frame.screen_rw); ++ } ++ else ++ { ++ bloom_up = G_Dereference(frame.bloom_mips_rw[mip_idx - 1]); ++ } +  +- Vec2 up_dims = countof(bloom_up); + Vec2 down_dims = countof(bloom_down); ++ Vec2 up_dims = countof(bloom_up); +  + Vec2 bloom_pos = SV_DispatchThreadID + 0.5; + Vec2 bloom_uv = bloom_pos / up_dims; +- Vec2 off_uv = 1 / up_dims; ++ Vec2 off_uv0 = 1 / down_dims; ++ Vec2 off_uv1 = off_uv0 * 2; +  + Vec4 result = 0; + { + // Center +- result += bloom_down.SampleLevel(sampler, bloom_uv, 0) * 4; +- // Edges ++ result += bloom_down.SampleLevel(sampler, bloom_uv, 0) * 9.0f / 41.0f; ++ ++ // Outer Edges + result += ( +- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(0, -off_uv.y), 0) + +- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv.x, 0), 0) + +- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(0, off_uv.y), 0) + +- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv.x, 0), 0) +- 
) * 2; +- // Corners ++ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(0, -off_uv1.y), 0) + ++ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv1.x, 0), 0) + ++ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(0, off_uv1.y), 0) + ++ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv1.x, 0), 0) ++ ) * 3.0f / 41.0f; ++ ++ // Inner corners ++ result += ( ++ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv0.x, -off_uv0.y), 0) + ++ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv0.x, -off_uv0.y), 0) + ++ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv0.x, off_uv0.y), 0) + ++ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv0.x, off_uv0.y), 0) ++ ) * 4.0f / 41.0f; ++ ++ // Outer corners + result += ( +- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv.x, -off_uv.y), 0) + +- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv.x, -off_uv.y), 0) + +- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv.x, off_uv.y), 0) + +- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv.x, off_uv.y), 0) +- ); +- // Normalize +- result /= 16; ++ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv1.x, -off_uv1.y), 0) + ++ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv1.x, -off_uv1.y), 0) + ++ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv1.x, off_uv1.y), 0) + ++ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv1.x, off_uv1.y), 0) ++ ) * 1.0f / 41.0f; + } +  + if (IsInside(bloom_pos, up_dims)) + { +- bloom_up[bloom_pos] += result; ++ bloom_up[bloom_pos] += result * 0.75; + } + } +  + //////////////////////////////////////////////////////////// +-//~ Post process ++//~ Finalize +  +-ComputeShader2D(V_PostProcessCS, 8, 8) ++ComputeShader2D(V_FinalizeCS, 8, 8) + { + V_SharedFrame frame = G_Dereference(V_GpuConst_Frame)[0]; + SamplerState bilinear_sampler = G_Dereference(frame.basic_samplers[G_BasicSamplerKind_BilinearClamp]); +@@ -1084,42 +1117,21 @@ ComputeShader2D(V_PostProcessCS, 8, 8) + RWTexture2D screen_tex = 
G_Dereference(frame.screen_rw); +  + Vec2 screen_pos = SV_DispatchThreadID + 0.5; +- Vec2 screen_uv = screen_pos / frame.screen_dims; + b32 is_in_screen = IsInside(screen_pos, frame.screen_dims); +- +- ////////////////////////////// +- //- Original +- +- Vec4 original = 0; + if (is_in_screen) + { +- original = screen_tex[screen_pos]; +- original.rgb *= original.a; +- } ++ Vec4 result = screen_tex[screen_pos]; +  ++ //- Tone map ++ if (frame.should_tone_map) ++ { ++ // ACES approximation by Krzysztof Narkowicz ++ // https://knarkowicz.wordpress.com/2016/01/06/aces-filmic-tone-mapping-curve/ ++ result.rgb = saturate((result.rgb * (2.51f * result.rgb + 0.03f)) / (result.rgb * (2.43f * result.rgb + 0.59f) + 0.14f)); ++ } +  +- ////////////////////////////// +- //- Bloom +- +- Vec4 bloom = 0; +- if (is_in_screen) +- { +- bloom = bloom_tex.SampleLevel(bilinear_sampler, screen_uv, 0); +- // bloom.rgb *= bloom.a; +- } +- +- ////////////////////////////// +- //- Compose +- +- Vec4 result = Vec4(0, 0, 0, 1); +- result = BlendPremul(original, result); +- result += bloom; +- // result.rgb = V_ToneMap(result); ++ result = Unpremul(result); +  +- result = Unpremul(result); +- +- if (is_in_screen) +- { + screen_tex[screen_pos] = result; + } + } +diff --git a/src/pp/pp_vis/pp_vis_gpu.gh b/src/pp/pp_vis/pp_vis_gpu.gh +index a47a2335..f176f2f8 100644 +--- a/src/pp/pp_vis/pp_vis_gpu.gh ++++ b/src/pp/pp_vis/pp_vis_gpu.gh +@@ -46,7 +46,6 @@ Struct(V_DVertPSOutput) +  + f32 V_RandFromPos(Vec3 pos); + Vec4 V_ColorFromParticle(V_ParticleDesc desc, u32 particle_idx, u32 density); +-Vec3 V_ToneMap(Vec3 v); +  + //////////////////////////////////////////////////////////// + //~ Shaders +@@ -73,8 +72,8 @@ ComputeShader2D(V_CompositeCS, 8, 8); + ComputeShader2D(V_BloomDownCS, 8, 8); + ComputeShader2D(V_BloomUpCS, 8, 8); +  +-//- Post process +-ComputeShader2D(V_PostProcessCS, 8, 8); ++//- Finalize ++ComputeShader2D(V_FinalizeCS, 8, 8); +  + //- Debug shapes + VertexShader(V_DVertVS, 
V_DVertPSInput); +diff --git a/src/pp/pp_vis/pp_vis_shared.cg b/src/pp/pp_vis/pp_vis_shared.cg +index 2419a6f2..72f6ae8d 100644 +--- a/src/pp/pp_vis/pp_vis_shared.cg ++++ b/src/pp/pp_vis/pp_vis_shared.cg +@@ -11,37 +11,42 @@ V_ParticleDesc V_DescFromParticleKind(V_ParticleKind kind) + V_ParticleDesc result; + { + PERSIST Readonly V_ParticleFlag flags[V_ParticleKind_COUNT] = { +- #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) flags, ++ #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) flags, + V_ParticlesXList(X) + #undef X + }; + PERSIST Readonly V_ParticleLayer layers[V_ParticleKind_COUNT] = { +- #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) layer, ++ #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) layer, + V_ParticlesXList(X) + #undef X + }; + PERSIST Readonly f32 stain_rates[V_ParticleKind_COUNT] = { +- #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) stain_rate, ++ #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) stain_rate, + V_ParticlesXList(X) + #undef X + }; + PERSIST Readonly f32 pen_rates[V_ParticleKind_COUNT] = { +- #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) pen_rate, ++ #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) pen_rate, + V_ParticlesXList(X) + #undef X + }; + PERSIST Readonly f32 lifetimes[V_ParticleKind_COUNT] = { +- #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) lifetime, ++ #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) lifetime, ++ V_ParticlesXList(X) ++ #undef X ++ }; ++ PERSIST Readonly f32 prune_speed_thresholds[V_ParticleKind_COUNT] = { ++ #define 
X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) prune_speed_threshold, + V_ParticlesXList(X) + #undef X + }; + PERSIST Readonly Vec4 base_colors[V_ParticleKind_COUNT] = { +- #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) base_color, ++ #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) base_color, + V_ParticlesXList(X) + #undef X + }; + PERSIST Readonly Vec4 dry_factor[V_ParticleKind_COUNT] = { +- #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) dry_factor, ++ #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) dry_factor, + V_ParticlesXList(X) + #undef X + }; +@@ -51,6 +56,7 @@ V_ParticleDesc V_DescFromParticleKind(V_ParticleKind kind) + result.stain_rate = stain_rates[kind]; + result.pen_rate = pen_rates[kind]; + result.lifetime = lifetimes[kind]; ++ result.prune_speed_threshold = prune_speed_thresholds[kind]; + result.base_color = LinearFromSrgb(base_colors[kind]); + result.dry_factor = LinearFromSrgb(dry_factor[kind]); + } +diff --git a/src/pp/pp_vis/pp_vis_shared.cgh b/src/pp/pp_vis/pp_vis_shared.cgh +index 16ca6419..71d88ea5 100644 +--- a/src/pp/pp_vis/pp_vis_shared.cgh ++++ b/src/pp/pp_vis/pp_vis_shared.cgh +@@ -9,14 +9,13 @@ + Enum(V_GpuFlag) + { + V_GpuFlag_None = 0, +- V_GpuFlag_InitBloom = (1 << 0), + }; +  + G_DeclConstant(V_GpuFlag, V_GpuConst_Flags, 0); + G_DeclConstant(G_StructuredBufferRef, V_GpuConst_Frame, 1); + G_DeclConstant(G_Texture3DRef, V_GpuConst_NoiseTex, 2); +-G_DeclConstant(G_Texture2DRef, V_GpuConst_BloomRead, 3); +-G_DeclConstant(G_RWTexture2DRef, V_GpuConst_BloomWrite, 4); ++G_DeclConstant(i32, V_GpuConst_MipsCount, 3); ++G_DeclConstant(i32, V_GpuConst_MipIdx, 4); +  + //////////////////////////////////////////////////////////// + //~ Particle types +@@ -29,7 +28,6 @@ 
G_DeclConstant(G_RWTexture2DRef, V_GpuConst_BloomWrite, 4); + Enum(V_ParticleFlag) + { + V_ParticleFlag_None = 0, +- V_ParticleFlag_NoPruneWhenStill = (1 << 0), + V_ParticleFlag_StainWhenPruned = (1 << 1), + V_ParticleFlag_NoReflect = (1 << 2), + V_ParticleFlag_OnlyCollideWithWalls = (1 << 3), +@@ -53,6 +51,7 @@ Enum(V_ParticleLayer) + /* Layer */ V_ParticleLayer_Ground, \ + /* Stain rate, pen chance */ 30, 0, \ + /* Lifetime */ Inf, \ ++ /* Prune speed threshold */ 0.01, \ + /* Base color */ CompVec4(0, 0, 0, 0), \ + /* Dry color factor */ CompVec4(1, 1, 1, 1) \ + ) \ +@@ -64,8 +63,9 @@ Enum(V_ParticleLayer) + /* Layer */ V_ParticleLayer_Ground, \ + /* Stain rate, pen chance */ 100, 0.25, \ + /* Lifetime */ Inf, \ +- /* Base color */ CompVec4(0.5, 0.1, 0.1, 0.05), \ +- /* Dry color factor */ CompVec4(0.5, 0.5, 0.5, 1) \ ++ /* Prune speed threshold */ 0.5, \ ++ /* Base color */ CompVec4(0.6, 0.1, 0.1, 0.05), \ ++ /* Dry color factor */ CompVec4(0.4, 0.4, 0.4, 1) \ + ) \ + X( \ + /* Name */ BloodDebris, \ +@@ -73,6 +73,7 @@ Enum(V_ParticleLayer) + /* Layer */ V_ParticleLayer_Mid, \ + /* Stain rate, pen chance */ 30, 0, \ + /* Lifetime */ Inf, \ ++ /* Prune speed threshold */ 0.01, \ + /* Base color */ CompVec4(0.5, 0.1, 0.1, 0.8), \ + /* Dry color factor */ CompVec4(1, 1, 1, 1) \ + ) \ +@@ -82,6 +83,7 @@ Enum(V_ParticleLayer) + /* Layer */ V_ParticleLayer_Mid, \ + /* Stain rate, pen chance */ 0, 0, \ + /* Lifetime */ Inf, \ ++ /* Prune speed threshold */ 0.01, \ + /* Base color */ CompVec4(0.4, 0.3, 0.2, 1), \ + /* Dry color factor */ CompVec4(1, 1, 1, 1) \ + ) \ +@@ -91,6 +93,7 @@ Enum(V_ParticleLayer) + /* Layer */ V_ParticleLayer_Mid, \ + /* Stain rate, pen chance */ 0, 0, \ + /* Lifetime */ Inf, \ ++ /* Prune speed threshold */ 0.1, \ + /* Base color */ CompVec4(2, 0.5, 0, 1), \ + /* Dry color factor */ CompVec4(0.2, 0.1, 0.0, 1) \ + ) \ +@@ -102,6 +105,7 @@ Enum(V_ParticleLayer) + /* Layer */ V_ParticleLayer_Mid, \ + /* Stain rate, pen chance */ 0, 0, \ + /* 
Lifetime */ 0.075, \ ++ /* Prune speed threshold */ 0.01, \ + /* Base color */ CompVec4(0.8, 0.6, 0.2, 0.25), \ + /* Dry color factor */ CompVec4(1, 1, 1, 1) \ + ) \ +@@ -111,6 +115,7 @@ Enum(V_ParticleLayer) + /* Layer */ V_ParticleLayer_Air, \ + /* Stain rate, pen chance */ 0, 0, \ + /* Lifetime */ Inf, \ ++ /* Prune speed threshold */ 0.01, \ + /* Base color */ CompVec4(0.25, 0.25, 0.25, 0.75), \ + /* Dry color factor */ CompVec4(1, 1, 1, 1) \ + ) \ +@@ -122,6 +127,7 @@ Enum(V_ParticleLayer) + /* Layer */ V_ParticleLayer_Mid, \ + /* Stain rate, pen chance */ 0, 0, \ + /* Lifetime */ Inf, \ ++ /* Prune speed threshold */ 0.01, \ + /* Base color */ CompVec4(1, 1, 0, 1), \ + /* Dry color factor */ CompVec4(1, 1, 1, 1) \ + ) \ +@@ -168,6 +174,7 @@ Struct(V_ParticleDesc) + f32 stain_rate; + f32 pen_rate; + f32 lifetime; ++ f32 prune_speed_threshold; + Vec4 base_color; + Vec4 dry_factor; + }; +@@ -264,6 +271,7 @@ Struct(V_SharedFrame) +  + b32 tiles_dirty; + b32 should_clear_particles; ++ b32 should_tone_map; +  + b32 is_looking; + b32 is_moving;