diff --git a/src/gpu/gpu_common.c b/src/gpu/gpu_common.c index a9686d87..43835793 100644 --- a/src/gpu/gpu_common.c +++ b/src/gpu/gpu_common.c @@ -25,7 +25,7 @@ void G_BootstrapCommon(void) gpu_perm, cl, G_Format_R8G8B8A8_Uint, VEC2I32(8, 8), - G_Layout_AnyQueue_ShaderRead_CopyRead_CopyWrite_Present, + G_Layout_Simultaneous, .flags = G_ResourceFlag_ZeroMemory ); G.blank_tex = G_PushTexture2DRef(gpu_perm, blank_tex); @@ -44,7 +44,7 @@ void G_BootstrapCommon(void) gpu_perm, cl, G_Format_R16_Uint, noise_dims, - G_Layout_AnyQueue_ShaderRead_CopyRead_CopyWrite_Present + G_Layout_Simultaneous ); G_CopyCpuToTexture( cl, @@ -143,30 +143,55 @@ G_ResourceHandle G_PushBufferFromCpuCopy_(G_ArenaHandle gpu_arena, G_CommandList  //- Mip  -i32 G_DimsFromMip1D(i32 texture_dims, i32 mip) +// Size of mip level `mip` for a resource whose mip 0 size is mip0_dims; a negative mip scales up, and the result is never below 1 +i32 G_DimsFromMip1D(i32 mip0_dims, i32 mip) { - mip = ClampI32(mip, 0, 31); + mip = ClampI32(mip, -31, 31); i32 result = 0; - result = MaxI32(result >> mip, 1); + if (mip >= 0) + { + result = MaxI32(mip0_dims >> mip, 1); + } + else + { + result = MaxI32(mip0_dims << -mip, 1); + } return result; }  -Vec2I32 G_DimsFromMip2D(Vec2I32 texture_dims, i32 mip) +Vec2I32 G_DimsFromMip2D(Vec2I32 mip0_dims, i32 mip) { - mip = ClampI32(mip, 0, 31); + mip = ClampI32(mip, -31, 31); Vec2I32 result = Zi; - result.x = MaxI32(texture_dims.x >> mip, 1); - result.y = MaxI32(texture_dims.y >> mip, 1); + if (mip >= 0) + { + result.x = MaxI32(mip0_dims.x >> mip, 1); + result.y = MaxI32(mip0_dims.y >> mip, 1); + } + else + { + result.x = MaxI32(mip0_dims.x << -mip, 1); + result.y = MaxI32(mip0_dims.y << -mip, 1); + } return result; }  -Vec3I32 G_DimsFromMip3D(Vec3I32 texture_dims, i32 mip) +Vec3I32 G_DimsFromMip3D(Vec3I32 mip0_dims, i32 mip) { - mip = ClampI32(mip, 0, 31); + mip = ClampI32(mip, -31, 31); Vec3I32 result = Zi; - result.x = MaxI32(texture_dims.x >> mip, 1); - result.y = MaxI32(texture_dims.y >> mip, 1); - result.z = MaxI32(texture_dims.z >> mip, 1); + if (mip >= 0) + { + result.x = MaxI32(mip0_dims.x >> mip, 1); + 
result.y = MaxI32(mip0_dims.y >> mip, 1); + result.z = MaxI32(mip0_dims.z >> mip, 1); + } + else + { + result.x = MaxI32(mip0_dims.x << -mip, 1); + result.y = MaxI32(mip0_dims.y << -mip, 1); + result.z = MaxI32(mip0_dims.z << -mip, 1); + } return result; }  diff --git a/src/gpu/gpu_common.h b/src/gpu/gpu_common.h index eb3ee6d2..03927040 100644 --- a/src/gpu/gpu_common.h +++ b/src/gpu/gpu_common.h @@ -35,9 +35,9 @@ G_ResourceHandle G_PushBufferFromCpuCopy_(G_ArenaHandle gpu_arena, G_CommandList G_PushBufferFromCpuCopy_((_arena), (_cl), (_src), (G_BufferDesc) { .size = (_src).len, __VA_ARGS__ })  //- Mip -i32 G_DimsFromMip1D(i32 texture_dims, i32 mip); -Vec2I32 G_DimsFromMip2D(Vec2I32 texture_dims, i32 mip); -Vec3I32 G_DimsFromMip3D(Vec3I32 texture_dims, i32 mip); +i32 G_DimsFromMip1D(i32 mip0_dims, i32 mip); +Vec2I32 G_DimsFromMip2D(Vec2I32 mip0_dims, i32 mip); +Vec3I32 G_DimsFromMip3D(Vec3I32 mip0_dims, i32 mip);  //- Viewport / scissor Rng3 G_ViewportFromTexture(G_ResourceHandle texture); diff --git a/src/gpu/gpu_core.h b/src/gpu/gpu_core.h index 7e1b329a..bed18c93 100644 --- a/src/gpu/gpu_core.h +++ b/src/gpu/gpu_core.h @@ -242,18 +242,16 @@ Enum(G_Access) G_Access_IndexBuffer = (1 << 8), G_Access_IndirectArgument = (1 << 9),  - G_Access_All = 0xFFFFFFFF + G_Access_All = 0xFFFFFFFF // Represents all accesses relevant to the specified sync stage };  Enum(G_Layout) { G_Layout_NoChange,  - // "Simultaneous" allows a resource to be used on any queue with any access - // type, as long as there is only one writer at a time, and the writer is not - // writing to any texels currently being read. - // Resources cannot transition to/from this layout. They must be created - // with it and are locked to it. + // Simultaneous layout allows a resource to be used on any queue with any + // access type (except depth-stencil). Resources cannot transition to/from + // this layout, they must be created with it. 
G_Layout_Simultaneous, // D3D12_BARRIER_LAYOUT_COMMON + D3D12_RESOURCE_FLAG_ALLOW_SIMULTANEOUS_ACCESS  G_Layout_Undefined, // D3D12_BARRIER_LAYOUT_UNDEFINED diff --git a/src/pp/pp_vis/pp_vis.lay b/src/pp/pp_vis/pp_vis.lay index f72dc528..2d916376 100644 --- a/src/pp/pp_vis/pp_vis.lay +++ b/src/pp/pp_vis/pp_vis.lay @@ -26,7 +26,7 @@ @ComputeShader V_CompositeCS @ComputeShader V_BloomDownCS @ComputeShader V_BloomUpCS -@ComputeShader V_PostProcessCS +@ComputeShader V_FinalizeCS @VertexShader V_DVertVS @PixelShader V_DVertPS  diff --git a/src/pp/pp_vis/pp_vis_core.c b/src/pp/pp_vis/pp_vis_core.c index f2f5e6b5..338036ba 100644 --- a/src/pp/pp_vis/pp_vis_core.c +++ b/src/pp/pp_vis/pp_vis_core.c @@ -416,7 +416,7 @@ void V_TickForever(WaveLaneCtx *lane) gpu_perm, cl, G_Format_R8_Uint, tiles_dims, - G_Layout_DirectQueue_ShaderRead, + G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, .flags = G_ResourceFlag_ZeroMemory, .name = Lit("Tiles") ); @@ -441,7 +441,7 @@ void V_TickForever(WaveLaneCtx *lane) gpu_perm, cl, G_Format_R32_Uint, cells_dims, - G_Layout_DirectQueue_ShaderReadWrite, + G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, .name = StringF(perm, "Particle cells - layer %F", FmtSint(layer)) ); @@ -454,7 +454,7 @@ void V_TickForever(WaveLaneCtx *lane) gpu_perm, cl, G_Format_R32_Uint, cells_dims, - G_Layout_DirectQueue_ShaderReadWrite, + G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, .name = StringF(perm, "Particle densities - layer %F", FmtSint(layer)) ); @@ -469,7 +469,7 @@ void V_TickForever(WaveLaneCtx *lane) gpu_perm, cl, G_Format_R16G16B16A16_Float, cells_dims, - G_Layout_DirectQueue_ShaderReadWrite, + G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, .name = 
Lit("Stains") ); @@ -481,7 +481,7 @@ void V_TickForever(WaveLaneCtx *lane) gpu_perm, cl, G_Format_R16G16B16A16_Float, cells_dims, - G_Layout_DirectQueue_ShaderReadWrite, + G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, .name = Lit("Dry stains") ); @@ -493,7 +493,7 @@ void V_TickForever(WaveLaneCtx *lane) gpu_perm, cl, G_Format_R32_Float, cells_dims, - G_Layout_DirectQueue_ShaderReadWrite, + G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, .name = Lit("Drynesses") ); @@ -505,7 +505,7 @@ void V_TickForever(WaveLaneCtx *lane) gpu_perm, cl, G_Format_R32_Uint, cells_dims, - G_Layout_DirectQueue_ShaderReadWrite, + G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, .flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite, .name = Lit("Occluders cells") ); @@ -614,6 +614,8 @@ void V_TickForever(WaveLaneCtx *lane) frame->dt = SecondsFromNs(frame->dt_ns); frame->rand = prev_frame->rand;  + frame->should_tone_map = TweakBool("Tone mapping enabled", 1); + if (P_IsEntKeyNil(V.player_key)) { TrueRand(StringFromStruct(&V.player_key)); @@ -4918,18 +4920,17 @@ void V_TickForever(WaveLaneCtx *lane) frame->tile_descs[tile_kind] = tile_desc; } } + // Upload tiles if (frame->tiles_dirty) { // LogDebugF("Uploading tiles to gpu"); - G_DumbMemoryLayoutSync(frame->cl, gpu_tiles_res, G_Layout_DirectQueue_CopyWrite); G_CopyCpuToTexture( frame->cl, gpu_tiles_res, VEC3I32(0, 0, 0), local_world->tiles, VEC3I32(tiles_dims.x, tiles_dims.y, 1), RNG3I32(VEC3I32(0, 0, 0), VEC3I32(tiles_dims.x, tiles_dims.y, 1)) ); - G_DumbMemoryLayoutSync(frame->cl, gpu_tiles_res, G_Layout_DirectQueue_ShaderRead); }  // Screen texture @@ -4937,7 +4938,7 @@ void V_TickForever(WaveLaneCtx *lane) frame->gpu_arena, frame->cl, G_Format_R16G16B16A16_Float, frame->screen_dims, - 
G_Layout_DirectQueue_ShaderReadWrite, + G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, .flags = G_ResourceFlag_AllowShaderReadWrite | G_ResourceFlag_AllowRenderTarget, .name = StringF(frame->arena, "Screen target [%F]", FmtSint(frame->tick)) ); @@ -4951,11 +4952,10 @@ void V_TickForever(WaveLaneCtx *lane) frame->gpu_arena, frame->cl, G_Format_R16G16B16A16_Float, G_DimsFromMip2D(G_Count2D(screen_target), 1), - G_Layout_DirectQueue_ShaderReadWrite, + G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, .flags = G_ResourceFlag_AllowShaderReadWrite | G_ResourceFlag_AllowRenderTarget, .name = StringF(frame->arena, "Bloom target [%F]", FmtSint(frame->tick)), - // .max_mips = 4 - .max_mips = 8 + .max_mips = 64 ); for (i32 mip_idx = 0; mip_idx < G_CountMips(bloom_target); ++mip_idx) { @@ -4979,7 +4979,7 @@ void V_TickForever(WaveLaneCtx *lane) frame->gpu_arena, frame->cl, G_Format_R16G16B16A16_Float, frame->shade_dims, - G_Layout_DirectQueue_ShaderReadWrite, + G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite, .flags = G_ResourceFlag_AllowShaderReadWrite, .name = StringF(frame->arena, "Shade target [%F]", FmtSint(frame->tick)) ); @@ -5091,6 +5091,9 @@ void V_TickForever(WaveLaneCtx *lane)  // Sync particles & occluders G_DumbGlobalMemorySync(frame->cl); + + // Transition albedo + G_DumbMemoryLayoutSync(frame->cl, albedo_target, G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite); }  ////////////////////////////// @@ -5113,83 +5116,63 @@ void V_TickForever(WaveLaneCtx *lane) G_Compute(frame->cl, V_ShadeCS, V_ThreadGroupSizeFromTexSize(frame->shade_dims)); }  - ////////////////////////////// - //- Transition G-buffers to readonly - - { - G_DumbMemoryLayoutSync(frame->cl, albedo_target, G_Layout_DirectQueue_ShaderRead); - G_DumbMemoryLayoutSync(frame->cl, shade_target, G_Layout_DirectQueue_ShaderRead); - } - ////////////////////////////// //- Composite pass  { G_Compute(frame->cl, V_CompositeCS, 
V_ThreadGroupSizeFromTexSize(frame->screen_dims));  - G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_ShaderRead); + // Sync screen tex + G_DumbGlobalMemorySync(frame->cl); }  ////////////////////////////// //- Bloom passes  { - i32 mips_count = G_CountMips(bloom_target); + i32 mips_count = G_CountMips(bloom_target) + 1; + G_SetConstant(frame->cl, V_GpuConst_MipsCount, mips_count); + + // NOTE: Because bloom mip chain starts at half screen size, mip_idx 0 + // actually represents the screen texture, while mip_idx - 1 represents + // the first mip index in the bloom mip chain  //- Downsample + blur passes - for (i32 mip_idx = 0; mip_idx < mips_count; ++mip_idx) + for (i32 mip_idx = 1; mip_idx < mips_count; ++mip_idx) { - Vec2I32 dims = G_DimsFromMip2D(G_Count2D(bloom_target), mip_idx); - if (mip_idx == 0) - { - // Init bloom pyramid from screen target on first pass (prefilter) - gpu_flags |= V_GpuFlag_InitBloom; - G_SetConstant(frame->cl, V_GpuConst_Flags, gpu_flags); - G_SetConstant(frame->cl, V_GpuConst_BloomRead, frame->screen_ro); - } - else - { - G_DumbMemoryLayoutSync(frame->cl, bloom_target, G_Layout_DirectQueue_ShaderRead, .mips = RNGI32(mip_idx - 1, mip_idx - 1)); - G_SetConstant(frame->cl, V_GpuConst_BloomRead, frame->bloom_mips_ro[mip_idx - 1]); - } - G_SetConstant(frame->cl, V_GpuConst_BloomWrite, frame->bloom_mips_rw[mip_idx]); - { - G_Compute(frame->cl, V_BloomDownCS, V_ThreadGroupSizeFromTexSize(dims)); - } - gpu_flags &= ~V_GpuFlag_InitBloom; - G_SetConstant(frame->cl, V_GpuConst_Flags, gpu_flags); + Vec2I32 down_dims = G_DimsFromMip2D(G_Count2D(screen_target), mip_idx); + + G_SetConstant(frame->cl, V_GpuConst_MipIdx, mip_idx); + G_Compute(frame->cl, V_BloomDownCS, V_ThreadGroupSizeFromTexSize(down_dims)); + + G_DumbGlobalMemorySync(frame->cl); }  //- Upsample passes for (i32 mip_idx = mips_count - 2; mip_idx >= 0; --mip_idx) { - Vec2I32 dims = G_DimsFromMip2D(G_Count2D(bloom_target), mip_idx); - - 
G_DumbMemoryLayoutSync(frame->cl, bloom_target, G_Layout_DirectQueue_ShaderReadWrite, .mips = RNGI32(mip_idx, mip_idx)); - G_DumbMemoryLayoutSync(frame->cl, bloom_target, G_Layout_DirectQueue_ShaderRead, .mips = RNGI32(mip_idx + 1, mip_idx + 1)); + Vec2I32 up_dims = G_DimsFromMip2D(G_Count2D(screen_target), mip_idx);  - G_SetConstant(frame->cl, V_GpuConst_BloomRead, frame->bloom_mips_ro[mip_idx + 1]); - G_SetConstant(frame->cl, V_GpuConst_BloomWrite, frame->bloom_mips_rw[mip_idx]); + G_SetConstant(frame->cl, V_GpuConst_MipIdx, mip_idx); + G_Compute(frame->cl, V_BloomUpCS, V_ThreadGroupSizeFromTexSize(up_dims));  - G_Compute(frame->cl, V_BloomUpCS, V_ThreadGroupSizeFromTexSize(dims)); - } + G_DumbGlobalMemorySync(frame->cl); + } }  ////////////////////////////// - //- Post process pass + //- Finalization pass  { - G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_ShaderReadWrite); - G_DumbMemoryLayoutSync(frame->cl, bloom_target, G_Layout_DirectQueue_ShaderRead, .mips = RNGI32(0, 0)); - G_Compute(frame->cl, V_PostProcessCS, V_ThreadGroupSizeFromTexSize(frame->screen_dims)); + G_Compute(frame->cl, V_FinalizeCS, V_ThreadGroupSizeFromTexSize(frame->screen_dims)); }  ////////////////////////////// //- Debug shapes pass  - G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_RenderTargetWrite); - { + G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_RenderTargetWrite); + G_Rasterize( frame->cl, V_DVertVS, V_DVertPS, @@ -5198,12 +5181,13 @@ void V_TickForever(WaveLaneCtx *lane) screen_viewport, screen_scissor, G_RasterMode_TriangleList ); + + G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite); }  ////////////////////////////// //- Finalize screen target  - G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_ShaderRead); { Rng2 uv = Zi; uv.p0 = Vec2FromVec(screen_viewport.p0); diff --git a/src/pp/pp_vis/pp_vis_gpu.g 
b/src/pp/pp_vis/pp_vis_gpu.g index f8a254de..c0a9e47d 100644 --- a/src/pp/pp_vis/pp_vis_gpu.g +++ b/src/pp/pp_vis/pp_vis_gpu.g @@ -53,13 +53,6 @@ Vec4 V_ColorFromParticle(V_ParticleDesc desc, u32 particle_idx, u32 density) return result; }  -// ACES approximation by Krzysztof Narkowicz -// https://knarkowicz.wordpress.com/2016/01/06/aces-filmic-tone-mapping-curve/ -Vec3 V_ToneMap(Vec3 v) -{ - return saturate((v * (2.51f * v + 0.03f)) / (v * (2.43f * v + 0.59f) + 0.14f)); -} - //////////////////////////////////////////////////////////// //~ Prepare frame  @@ -142,11 +135,11 @@ ComputeShader2D(V_PrepareCellsCS, 8, 8) } else if (over_stain.a > 0) { - Vec4 stain = dry_stains[cell_pos]; Vec4 dry_stain = max(dry_stains[cell_pos], 0); + Vec4 stain = dry_stain;  - stain = BlendPremul(over_stain, stain); dry_stain = BlendPremul(over_dry_stain, dry_stain); + stain = BlendPremul(over_stain, stain);  stains[cell_pos] = stain; dry_stains[cell_pos] = dry_stain; @@ -483,7 +476,7 @@ ComputeShader(V_SimParticlesCS, 64) particle.prev_occluder = occluder; }  - if (!AnyBit(desc.flags, V_ParticleFlag_NoPruneWhenStill) && dot(particle.velocity, particle.velocity) < 0.0001) + if (dot(particle.velocity, particle.velocity) < (desc.prune_speed_threshold * desc.prune_speed_threshold)) { prune = 1; } @@ -723,7 +716,6 @@ ComputeShader2D(V_CompositeCS, 8, 8) Vec4 ground_particle_color = 0; Vec4 air_particle_color = 0;  - for (V_ParticleLayer layer = (V_ParticleLayer)0; layer < V_ParticleLayer_COUNT; layer += (V_ParticleLayer)1) { RWTexture2D cells = G_Dereference(frame.particle_cells[layer]); @@ -752,9 +744,9 @@ ComputeShader2D(V_CompositeCS, 8, 8) // Darken wall particles / stains if (tile == P_TileKind_Wall) { - ground_particle_color *= 0.25; - air_particle_color *= 0.25; - stain_color *= 0.25; + ground_particle_color *= 0.5; + air_particle_color *= 0.5; + stain_color *= 0.5; }  ////////////////////////////// @@ -972,57 +964,73 @@ ComputeShader2D(V_CompositeCS, 8, 8) 
//////////////////////////////////////////////////////////// //~ Bloom  +////////////////////////////// +//- Downsample + ComputeShader2D(V_BloomDownCS, 8, 8) { + i32 mips_count = V_GpuConst_MipsCount; + i32 mip_idx = V_GpuConst_MipIdx; + V_SharedFrame frame = G_Dereference(V_GpuConst_Frame)[0]; - Texture2D bloom_up = G_Dereference(V_GpuConst_BloomRead); - RWTexture2D bloom_down = G_Dereference(V_GpuConst_BloomWrite); SamplerState sampler = G_Dereference(frame.basic_samplers[G_BasicSamplerKind_BilinearClamp]); + RWTexture2D bloom_down = G_Dereference(frame.bloom_mips_rw[mip_idx - 1]); + + Texture2D bloom_up; + b32 is_first_pass = mip_idx == 1; + if (is_first_pass) + { + bloom_up = G_Dereference(frame.screen_ro); + } + else + { + bloom_up = G_Dereference(frame.bloom_mips_ro[mip_idx - 2]); + }  - Vec2 up_dims = countof(bloom_up); Vec2 down_dims = countof(bloom_down);  Vec2 bloom_pos = SV_DispatchThreadID + 0.5; Vec2 bloom_uv = bloom_pos / down_dims; Vec2 off_uv = 0.5 / down_dims; - b32 is_first_pass = !!(V_GpuConst_Flags & V_GpuFlag_InitBloom);  - Struct(SampleDesc) { Vec2 uv; f32 weight; }; - SampleDesc samples[] = { - { bloom_uv + Vec2(0, 0), 0.5 }, - { bloom_uv + Vec2(-off_uv.x, -off_uv.y), 0.125 }, - { bloom_uv + Vec2(off_uv.x, -off_uv.y), 0.125 }, - { bloom_uv + Vec2(off_uv.x, off_uv.y), 0.125 }, - { bloom_uv + Vec2(-off_uv.x, off_uv.y), 0.125 }, - }; + f32 threshold = 0.25; + f32 knee = 0.75;  Vec4 result = 0; - for (u32 sample_idx = 0; sample_idx < countof(samples); ++sample_idx) { - SampleDesc desc = samples[sample_idx]; - Vec4 src = bloom_up.SampleLevel(sampler, desc.uv, 0); - - f32 knee_weight = 1; - if (is_first_pass) + Struct(SampleDesc) { Vec2 uv; f32 weight; }; + SampleDesc samples[] = { + { bloom_uv + Vec2(0, 0), 0.5 }, + { bloom_uv + Vec2(-off_uv.x, -off_uv.y), 0.125 }, + { bloom_uv + Vec2(off_uv.x, -off_uv.y), 0.125 }, + { bloom_uv + Vec2(off_uv.x, off_uv.y), 0.125 }, + { bloom_uv + Vec2(-off_uv.x, off_uv.y), 0.125 }, + }; + for (u32 sample_idx = 0; 
sample_idx < countof(samples); ++sample_idx) { - f32 luminance = LuminanceFromColor(src); - f32 max_rgb = max(max(src.r, src.g), src.b); // So that we can get bloom on colors with high rgb, not just high luminance - f32 bright = max(luminance, (max_rgb - 1.0) * 0.5); - if (bright > 0) - { - f32 threshold = 1.0; - f32 knee = 0.5; - f32 over_threshold = max(bright - threshold, 0.0); - f32 ramp = saturate(over_threshold / knee); - knee_weight = (over_threshold * ramp * ramp) / bright; - } - else + SampleDesc desc = samples[sample_idx]; + Vec4 src = bloom_up.SampleLevel(sampler, desc.uv, 0); + + f32 knee_weight = 1; + if (is_first_pass) { - knee_weight = 0; + f32 luminance = LuminanceFromColor(src); + f32 max_rgb = max(max(src.r, src.g), src.b); // So that we can get bloom on colors with high rgb, not just high luminance + f32 bright = max(luminance, (max_rgb - 1.0) * 0.5); + if (bright > 0) + { + f32 over_threshold = max(bright - threshold, 0.0); + f32 ramp = saturate(over_threshold / knee); + knee_weight = (over_threshold * ramp * ramp) / bright; + } + else + { + knee_weight = 0; + } } - }  - result += src * desc.weight * knee_weight; + result += src * desc.weight * knee_weight; + } }  if (IsInside(bloom_pos, down_dims)) @@ -1031,52 +1039,77 @@ ComputeShader2D(V_BloomDownCS, 8, 8) } }  +////////////////////////////// +//- Upsample + ComputeShader2D(V_BloomUpCS, 8, 8) { + i32 mips_count = V_GpuConst_MipsCount; + i32 mip_idx = V_GpuConst_MipIdx; + V_SharedFrame frame = G_Dereference(V_GpuConst_Frame)[0]; - Texture2D bloom_down = G_Dereference(V_GpuConst_BloomRead); - RWTexture2D bloom_up = G_Dereference(V_GpuConst_BloomWrite); SamplerState sampler = G_Dereference(frame.basic_samplers[G_BasicSamplerKind_BilinearClamp]); + Texture2D bloom_down = G_Dereference(frame.bloom_mips_ro[mip_idx]); + + b32 is_last_pass = mip_idx == 0; + RWTexture2D bloom_up; + if (is_last_pass) + { + bloom_up = G_Dereference(frame.screen_rw); + } + else + { + bloom_up = 
G_Dereference(frame.bloom_mips_rw[mip_idx - 1]); + }  - Vec2 up_dims = countof(bloom_up); Vec2 down_dims = countof(bloom_down); + Vec2 up_dims = countof(bloom_up);  Vec2 bloom_pos = SV_DispatchThreadID + 0.5; Vec2 bloom_uv = bloom_pos / up_dims; - Vec2 off_uv = 1 / up_dims; + Vec2 off_uv0 = 1 / down_dims; + Vec2 off_uv1 = off_uv0 * 2;  Vec4 result = 0; { // Center - result += bloom_down.SampleLevel(sampler, bloom_uv, 0) * 4; - // Edges + result += bloom_down.SampleLevel(sampler, bloom_uv, 0) * 9.0f / 41.0f; + + // Outer Edges result += ( - bloom_down.SampleLevel(sampler, bloom_uv + Vec2(0, -off_uv.y), 0) + - bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv.x, 0), 0) + - bloom_down.SampleLevel(sampler, bloom_uv + Vec2(0, off_uv.y), 0) + - bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv.x, 0), 0) - ) * 2; - // Corners + bloom_down.SampleLevel(sampler, bloom_uv + Vec2(0, -off_uv1.y), 0) + + bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv1.x, 0), 0) + + bloom_down.SampleLevel(sampler, bloom_uv + Vec2(0, off_uv1.y), 0) + + bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv1.x, 0), 0) + ) * 3.0f / 41.0f; + + // Inner corners + result += ( + bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv0.x, -off_uv0.y), 0) + + bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv0.x, -off_uv0.y), 0) + + bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv0.x, off_uv0.y), 0) + + bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv0.x, off_uv0.y), 0) + ) * 4.0f / 41.0f; + + // Outer corners result += ( - bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv.x, -off_uv.y), 0) + - bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv.x, -off_uv.y), 0) + - bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv.x, off_uv.y), 0) + - bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv.x, off_uv.y), 0) - ); - // Normalize - result /= 16; + bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv1.x, -off_uv1.y), 0) + + bloom_down.SampleLevel(sampler, 
bloom_uv + Vec2(off_uv1.x, -off_uv1.y), 0) + + bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv1.x, off_uv1.y), 0) + + bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv1.x, off_uv1.y), 0) + ) * 1.0f / 41.0f; }  if (IsInside(bloom_pos, up_dims)) { - bloom_up[bloom_pos] += result; + bloom_up[bloom_pos] += result * 0.75; } }  //////////////////////////////////////////////////////////// -//~ Post process +//~ Finalize  -ComputeShader2D(V_PostProcessCS, 8, 8) +ComputeShader2D(V_FinalizeCS, 8, 8) { V_SharedFrame frame = G_Dereference(V_GpuConst_Frame)[0]; SamplerState bilinear_sampler = G_Dereference(frame.basic_samplers[G_BasicSamplerKind_BilinearClamp]); @@ -1084,42 +1117,21 @@ ComputeShader2D(V_PostProcessCS, 8, 8) RWTexture2D screen_tex = G_Dereference(frame.screen_rw);  Vec2 screen_pos = SV_DispatchThreadID + 0.5; - Vec2 screen_uv = screen_pos / frame.screen_dims; b32 is_in_screen = IsInside(screen_pos, frame.screen_dims); - - ////////////////////////////// - //- Original - - Vec4 original = 0; if (is_in_screen) { - original = screen_tex[screen_pos]; - original.rgb *= original.a; - } + Vec4 result = screen_tex[screen_pos];  + //- Tone map + if (frame.should_tone_map) + { + // ACES approximation by Krzysztof Narkowicz + // https://knarkowicz.wordpress.com/2016/01/06/aces-filmic-tone-mapping-curve/ + result.rgb = saturate((result.rgb * (2.51f * result.rgb + 0.03f)) / (result.rgb * (2.43f * result.rgb + 0.59f) + 0.14f)); + }  - ////////////////////////////// - //- Bloom - - Vec4 bloom = 0; - if (is_in_screen) - { - bloom = bloom_tex.SampleLevel(bilinear_sampler, screen_uv, 0); - // bloom.rgb *= bloom.a; - } - - ////////////////////////////// - //- Compose - - Vec4 result = Vec4(0, 0, 0, 1); - result = BlendPremul(original, result); - result += bloom; - // result.rgb = V_ToneMap(result); + result = Unpremul(result);  - result = Unpremul(result); - - if (is_in_screen) - { screen_tex[screen_pos] = result; } } diff --git a/src/pp/pp_vis/pp_vis_gpu.gh 
b/src/pp/pp_vis/pp_vis_gpu.gh index a47a2335..f176f2f8 100644 --- a/src/pp/pp_vis/pp_vis_gpu.gh +++ b/src/pp/pp_vis/pp_vis_gpu.gh @@ -46,7 +46,6 @@ Struct(V_DVertPSOutput)  f32 V_RandFromPos(Vec3 pos); Vec4 V_ColorFromParticle(V_ParticleDesc desc, u32 particle_idx, u32 density); -Vec3 V_ToneMap(Vec3 v);  //////////////////////////////////////////////////////////// //~ Shaders @@ -73,8 +72,8 @@ ComputeShader2D(V_CompositeCS, 8, 8); ComputeShader2D(V_BloomDownCS, 8, 8); ComputeShader2D(V_BloomUpCS, 8, 8);  -//- Post process -ComputeShader2D(V_PostProcessCS, 8, 8); +//- Finalize +ComputeShader2D(V_FinalizeCS, 8, 8);  //- Debug shapes VertexShader(V_DVertVS, V_DVertPSInput); diff --git a/src/pp/pp_vis/pp_vis_shared.cg b/src/pp/pp_vis/pp_vis_shared.cg index 2419a6f2..72f6ae8d 100644 --- a/src/pp/pp_vis/pp_vis_shared.cg +++ b/src/pp/pp_vis/pp_vis_shared.cg @@ -11,37 +11,42 @@ V_ParticleDesc V_DescFromParticleKind(V_ParticleKind kind) V_ParticleDesc result; { PERSIST Readonly V_ParticleFlag flags[V_ParticleKind_COUNT] = { - #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) flags, + #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) flags, V_ParticlesXList(X) #undef X }; PERSIST Readonly V_ParticleLayer layers[V_ParticleKind_COUNT] = { - #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) layer, + #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) layer, V_ParticlesXList(X) #undef X }; PERSIST Readonly f32 stain_rates[V_ParticleKind_COUNT] = { - #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) stain_rate, + #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) stain_rate, V_ParticlesXList(X) #undef X }; PERSIST Readonly f32 pen_rates[V_ParticleKind_COUNT] = { - #define X(name, flags, layer, 
stain_rate, pen_rate, lifetime, base_color, dry_factor) pen_rate, + #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) pen_rate, V_ParticlesXList(X) #undef X }; PERSIST Readonly f32 lifetimes[V_ParticleKind_COUNT] = { - #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) lifetime, + #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) lifetime, + V_ParticlesXList(X) + #undef X + }; + PERSIST Readonly f32 prune_speed_thresholds[V_ParticleKind_COUNT] = { + #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) prune_speed_threshold, V_ParticlesXList(X) #undef X }; PERSIST Readonly Vec4 base_colors[V_ParticleKind_COUNT] = { - #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) base_color, + #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) base_color, V_ParticlesXList(X) #undef X }; PERSIST Readonly Vec4 dry_factor[V_ParticleKind_COUNT] = { - #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) dry_factor, + #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) dry_factor, V_ParticlesXList(X) #undef X }; @@ -51,6 +56,7 @@ V_ParticleDesc V_DescFromParticleKind(V_ParticleKind kind) result.stain_rate = stain_rates[kind]; result.pen_rate = pen_rates[kind]; result.lifetime = lifetimes[kind]; + result.prune_speed_threshold = prune_speed_thresholds[kind]; result.base_color = LinearFromSrgb(base_colors[kind]); result.dry_factor = LinearFromSrgb(dry_factor[kind]); } diff --git a/src/pp/pp_vis/pp_vis_shared.cgh b/src/pp/pp_vis/pp_vis_shared.cgh index 16ca6419..71d88ea5 100644 --- a/src/pp/pp_vis/pp_vis_shared.cgh +++ b/src/pp/pp_vis/pp_vis_shared.cgh @@ -9,14 +9,13 @@ Enum(V_GpuFlag) { V_GpuFlag_None = 
0, - V_GpuFlag_InitBloom = (1 << 0), };  G_DeclConstant(V_GpuFlag, V_GpuConst_Flags, 0); G_DeclConstant(G_StructuredBufferRef, V_GpuConst_Frame, 1); G_DeclConstant(G_Texture3DRef, V_GpuConst_NoiseTex, 2); -G_DeclConstant(G_Texture2DRef, V_GpuConst_BloomRead, 3); -G_DeclConstant(G_RWTexture2DRef, V_GpuConst_BloomWrite, 4); +G_DeclConstant(i32, V_GpuConst_MipsCount, 3); +G_DeclConstant(i32, V_GpuConst_MipIdx, 4);  //////////////////////////////////////////////////////////// //~ Particle types @@ -29,7 +28,6 @@ G_DeclConstant(G_RWTexture2DRef, V_GpuConst_BloomWrite, 4); Enum(V_ParticleFlag) { V_ParticleFlag_None = 0, - V_ParticleFlag_NoPruneWhenStill = (1 << 0), V_ParticleFlag_StainWhenPruned = (1 << 1), V_ParticleFlag_NoReflect = (1 << 2), V_ParticleFlag_OnlyCollideWithWalls = (1 << 3), @@ -53,6 +51,7 @@ Enum(V_ParticleLayer) /* Layer */ V_ParticleLayer_Ground, \ /* Stain rate, pen chance */ 30, 0, \ /* Lifetime */ Inf, \ + /* Prune speed threshold */ 0.01, \ /* Base color */ CompVec4(0, 0, 0, 0), \ /* Dry color factor */ CompVec4(1, 1, 1, 1) \ ) \ @@ -64,8 +63,9 @@ Enum(V_ParticleLayer) /* Layer */ V_ParticleLayer_Ground, \ /* Stain rate, pen chance */ 100, 0.25, \ /* Lifetime */ Inf, \ - /* Base color */ CompVec4(0.5, 0.1, 0.1, 0.05), \ - /* Dry color factor */ CompVec4(0.5, 0.5, 0.5, 1) \ + /* Prune speed threshold */ 0.5, \ + /* Base color */ CompVec4(0.6, 0.1, 0.1, 0.05), \ + /* Dry color factor */ CompVec4(0.4, 0.4, 0.4, 1) \ ) \ X( \ /* Name */ BloodDebris, \ @@ -73,6 +73,7 @@ Enum(V_ParticleLayer) /* Layer */ V_ParticleLayer_Mid, \ /* Stain rate, pen chance */ 30, 0, \ /* Lifetime */ Inf, \ + /* Prune speed threshold */ 0.01, \ /* Base color */ CompVec4(0.5, 0.1, 0.1, 0.8), \ /* Dry color factor */ CompVec4(1, 1, 1, 1) \ ) \ @@ -82,6 +83,7 @@ Enum(V_ParticleLayer) /* Layer */ V_ParticleLayer_Mid, \ /* Stain rate, pen chance */ 0, 0, \ /* Lifetime */ Inf, \ + /* Prune speed threshold */ 0.01, \ /* Base color */ CompVec4(0.4, 0.3, 0.2, 1), \ /* Dry color factor 
*/ CompVec4(1, 1, 1, 1) \ ) \ @@ -91,6 +93,7 @@ Enum(V_ParticleLayer) /* Layer */ V_ParticleLayer_Mid, \ /* Stain rate, pen chance */ 0, 0, \ /* Lifetime */ Inf, \ + /* Prune speed threshold */ 0.1, \ /* Base color */ CompVec4(2, 0.5, 0, 1), \ /* Dry color factor */ CompVec4(0.2, 0.1, 0.0, 1) \ ) \ @@ -102,6 +105,7 @@ Enum(V_ParticleLayer) /* Layer */ V_ParticleLayer_Mid, \ /* Stain rate, pen chance */ 0, 0, \ /* Lifetime */ 0.075, \ + /* Prune speed threshold */ 0.01, \ /* Base color */ CompVec4(0.8, 0.6, 0.2, 0.25), \ /* Dry color factor */ CompVec4(1, 1, 1, 1) \ ) \ @@ -111,6 +115,7 @@ Enum(V_ParticleLayer) /* Layer */ V_ParticleLayer_Air, \ /* Stain rate, pen chance */ 0, 0, \ /* Lifetime */ Inf, \ + /* Prune speed threshold */ 0.01, \ /* Base color */ CompVec4(0.25, 0.25, 0.25, 0.75), \ /* Dry color factor */ CompVec4(1, 1, 1, 1) \ ) \ @@ -122,6 +127,7 @@ Enum(V_ParticleLayer) /* Layer */ V_ParticleLayer_Mid, \ /* Stain rate, pen chance */ 0, 0, \ /* Lifetime */ Inf, \ + /* Prune speed threshold */ 0.01, \ /* Base color */ CompVec4(1, 1, 0, 1), \ /* Dry color factor */ CompVec4(1, 1, 1, 1) \ ) \ @@ -168,6 +174,7 @@ Struct(V_ParticleDesc) f32 stain_rate; f32 pen_rate; f32 lifetime; + f32 prune_speed_threshold; Vec4 base_color; Vec4 dry_factor; }; @@ -264,6 +271,7 @@ Struct(V_SharedFrame)  b32 tiles_dirty; b32 should_clear_particles; + b32 should_tone_map;  b32 is_looking; b32 is_moving;