power_play/tatus

927 lines
48 KiB
Plaintext
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

diff --git a/src/gpu/gpu_common.c b/src/gpu/gpu_common.c
index a9686d87..43835793 100644
--- a/src/gpu/gpu_common.c
+++ b/src/gpu/gpu_common.c
@@ -25,7 +25,7 @@ void G_BootstrapCommon(void)
gpu_perm, cl,
G_Format_R8G8B8A8_Uint,
VEC2I32(8, 8),
- G_Layout_AnyQueue_ShaderRead_CopyRead_CopyWrite_Present,
+ G_Layout_Simultaneous,
.flags = G_ResourceFlag_ZeroMemory
);
G.blank_tex = G_PushTexture2DRef(gpu_perm, blank_tex);
@@ -44,7 +44,7 @@ void G_BootstrapCommon(void)
gpu_perm, cl,
G_Format_R16_Uint,
noise_dims,
- G_Layout_AnyQueue_ShaderRead_CopyRead_CopyWrite_Present
+ G_Layout_Simultaneous
);
G_CopyCpuToTexture(
cl,
@@ -143,30 +143,54 @@ G_ResourceHandle G_PushBufferFromCpuCopy_(G_ArenaHandle gpu_arena, G_CommandList

//- Mip

-i32 G_DimsFromMip1D(i32 texture_dims, i32 mip)
+i32 G_DimsFromMip1D(i32 mip0_dims, i32 mip)
{
- mip = ClampI32(mip, 0, 31);
+ mip = ClampI32(mip, -31, 31);
i32 result = 0;
- result = MaxI32(result >> mip, 1);
+ if (mip >= 0)
+ {
+ result = MaxI32(mip0_dims >> mip, 1); // Shift the mip 0 extent (not the zeroed result), matching the 2D/3D variants
+ }
+ else
+ {
+ result = MaxI32(mip0_dims << -mip, 1); // Negative mip scales the mip 0 extent up instead of down
+ }
return result;
}

-Vec2I32 G_DimsFromMip2D(Vec2I32 texture_dims, i32 mip)
+Vec2I32 G_DimsFromMip2D(Vec2I32 mip0_dims, i32 mip)
{
- mip = ClampI32(mip, 0, 31);
+ mip = ClampI32(mip, -31, 31);
Vec2I32 result = Zi;
- result.x = MaxI32(texture_dims.x >> mip, 1);
- result.y = MaxI32(texture_dims.y >> mip, 1);
+ if (mip >= 0)
+ {
+ result.x = MaxI32(mip0_dims.x >> mip, 1);
+ result.y = MaxI32(mip0_dims.y >> mip, 1);
+ }
+ else
+ {
+ result.x = MaxI32(mip0_dims.x << -mip, 1);
+ result.y = MaxI32(mip0_dims.y << -mip, 1);
+ }
return result;
}

-Vec3I32 G_DimsFromMip3D(Vec3I32 texture_dims, i32 mip)
+Vec3I32 G_DimsFromMip3D(Vec3I32 mip0_dims, i32 mip)
{
- mip = ClampI32(mip, 0, 31);
+ mip = ClampI32(mip, -31, 31);
Vec3I32 result = Zi;
- result.x = MaxI32(texture_dims.x >> mip, 1);
- result.y = MaxI32(texture_dims.y >> mip, 1);
- result.z = MaxI32(texture_dims.z >> mip, 1);
+ if (mip >= 0)
+ {
+ result.x = MaxI32(mip0_dims.x >> mip, 1);
+ result.y = MaxI32(mip0_dims.y >> mip, 1);
+ result.z = MaxI32(mip0_dims.z >> mip, 1);
+ }
+ else
+ {
+ result.x = MaxI32(mip0_dims.x << -mip, 1);
+ result.y = MaxI32(mip0_dims.y << -mip, 1);
+ result.z = MaxI32(mip0_dims.z << -mip, 1);
+ }
return result;
}

diff --git a/src/gpu/gpu_common.h b/src/gpu/gpu_common.h
index eb3ee6d2..03927040 100644
--- a/src/gpu/gpu_common.h
+++ b/src/gpu/gpu_common.h
@@ -35,9 +35,9 @@ G_ResourceHandle G_PushBufferFromCpuCopy_(G_ArenaHandle gpu_arena, G_CommandList
G_PushBufferFromCpuCopy_((_arena), (_cl), (_src), (G_BufferDesc) { .size = (_src).len, __VA_ARGS__ })

//- Mip
-i32 G_DimsFromMip1D(i32 texture_dims, i32 mip);
-Vec2I32 G_DimsFromMip2D(Vec2I32 texture_dims, i32 mip);
-Vec3I32 G_DimsFromMip3D(Vec3I32 texture_dims, i32 mip);
+i32 G_DimsFromMip1D(i32 mip0_dims, i32 mip);
+Vec2I32 G_DimsFromMip2D(Vec2I32 mip0_dims, i32 mip);
+Vec3I32 G_DimsFromMip3D(Vec3I32 mip0_dims, i32 mip);

//- Viewport / scissor
Rng3 G_ViewportFromTexture(G_ResourceHandle texture);
diff --git a/src/gpu/gpu_core.h b/src/gpu/gpu_core.h
index 7e1b329a..bed18c93 100644
--- a/src/gpu/gpu_core.h
+++ b/src/gpu/gpu_core.h
@@ -242,18 +242,16 @@ Enum(G_Access)
G_Access_IndexBuffer = (1 << 8),
G_Access_IndirectArgument = (1 << 9),

- G_Access_All = 0xFFFFFFFF
+ G_Access_All = 0xFFFFFFFF // Represents all accesses relevant to the specified sync stage
};

Enum(G_Layout)
{
G_Layout_NoChange,

- // "Simultaneous" allows a resource to be used on any queue with any access
- // type, as long as there is only one writer at a time, and the writer is not
- // writing to any texels currently being read.
- // Resources cannot transition to/from this layout. They must be created
- // with it and are locked to it.
+ // Simultaneous layout allows a resource to be used on any queue with any
+ // access type (except depth-stencil). Resources cannot transition to/from
+ // this layout; they must be created with it.
G_Layout_Simultaneous, // D3D12_BARRIER_LAYOUT_COMMON + D3D12_RESOURCE_FLAG_ALLOW_SIMULTANEOUS_ACCESS

G_Layout_Undefined, // D3D12_BARRIER_LAYOUT_UNDEFINED
diff --git a/src/pp/pp_vis/pp_vis.lay b/src/pp/pp_vis/pp_vis.lay
index f72dc528..2d916376 100644
--- a/src/pp/pp_vis/pp_vis.lay
+++ b/src/pp/pp_vis/pp_vis.lay
@@ -26,7 +26,7 @@
@ComputeShader V_CompositeCS
@ComputeShader V_BloomDownCS
@ComputeShader V_BloomUpCS
-@ComputeShader V_PostProcessCS
+@ComputeShader V_FinalizeCS
@VertexShader V_DVertVS
@PixelShader V_DVertPS

diff --git a/src/pp/pp_vis/pp_vis_core.c b/src/pp/pp_vis/pp_vis_core.c
index f2f5e6b5..338036ba 100644
--- a/src/pp/pp_vis/pp_vis_core.c
+++ b/src/pp/pp_vis/pp_vis_core.c
@@ -416,7 +416,7 @@ void V_TickForever(WaveLaneCtx *lane)
gpu_perm, cl,
G_Format_R8_Uint,
tiles_dims,
- G_Layout_DirectQueue_ShaderRead,
+ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite,
.flags = G_ResourceFlag_ZeroMemory,
.name = Lit("Tiles")
);
@@ -441,7 +441,7 @@ void V_TickForever(WaveLaneCtx *lane)
gpu_perm, cl,
G_Format_R32_Uint,
cells_dims,
- G_Layout_DirectQueue_ShaderReadWrite,
+ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite,
.flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite,
.name = StringF(perm, "Particle cells - layer %F", FmtSint(layer))
);
@@ -454,7 +454,7 @@ void V_TickForever(WaveLaneCtx *lane)
gpu_perm, cl,
G_Format_R32_Uint,
cells_dims,
- G_Layout_DirectQueue_ShaderReadWrite,
+ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite,
.flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite,
.name = StringF(perm, "Particle densities - layer %F", FmtSint(layer))
);
@@ -469,7 +469,7 @@ void V_TickForever(WaveLaneCtx *lane)
gpu_perm, cl,
G_Format_R16G16B16A16_Float,
cells_dims,
- G_Layout_DirectQueue_ShaderReadWrite,
+ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite,
.flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite,
.name = Lit("Stains")
);
@@ -481,7 +481,7 @@ void V_TickForever(WaveLaneCtx *lane)
gpu_perm, cl,
G_Format_R16G16B16A16_Float,
cells_dims,
- G_Layout_DirectQueue_ShaderReadWrite,
+ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite,
.flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite,
.name = Lit("Dry stains")
);
@@ -493,7 +493,7 @@ void V_TickForever(WaveLaneCtx *lane)
gpu_perm, cl,
G_Format_R32_Float,
cells_dims,
- G_Layout_DirectQueue_ShaderReadWrite,
+ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite,
.flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite,
.name = Lit("Drynesses")
);
@@ -505,7 +505,7 @@ void V_TickForever(WaveLaneCtx *lane)
gpu_perm, cl,
G_Format_R32_Uint,
cells_dims,
- G_Layout_DirectQueue_ShaderReadWrite,
+ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite,
.flags = G_ResourceFlag_ZeroMemory | G_ResourceFlag_AllowShaderReadWrite,
.name = Lit("Occluders cells")
);
@@ -614,6 +614,8 @@ void V_TickForever(WaveLaneCtx *lane)
frame->dt = SecondsFromNs(frame->dt_ns);
frame->rand = prev_frame->rand;

+ frame->should_tone_map = TweakBool("Tone mapping enabled", 1);
+
if (P_IsEntKeyNil(V.player_key))
{
TrueRand(StringFromStruct(&V.player_key));
@@ -4918,18 +4920,17 @@ void V_TickForever(WaveLaneCtx *lane)
frame->tile_descs[tile_kind] = tile_desc;
}
}
+
// Upload tiles
if (frame->tiles_dirty)
{
// LogDebugF("Uploading tiles to gpu");
- G_DumbMemoryLayoutSync(frame->cl, gpu_tiles_res, G_Layout_DirectQueue_CopyWrite);
G_CopyCpuToTexture(
frame->cl,
gpu_tiles_res, VEC3I32(0, 0, 0),
local_world->tiles, VEC3I32(tiles_dims.x, tiles_dims.y, 1),
RNG3I32(VEC3I32(0, 0, 0), VEC3I32(tiles_dims.x, tiles_dims.y, 1))
);
- G_DumbMemoryLayoutSync(frame->cl, gpu_tiles_res, G_Layout_DirectQueue_ShaderRead);
}

// Screen texture
@@ -4937,7 +4938,7 @@ void V_TickForever(WaveLaneCtx *lane)
frame->gpu_arena, frame->cl,
G_Format_R16G16B16A16_Float,
frame->screen_dims,
- G_Layout_DirectQueue_ShaderReadWrite,
+ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite,
.flags = G_ResourceFlag_AllowShaderReadWrite | G_ResourceFlag_AllowRenderTarget,
.name = StringF(frame->arena, "Screen target [%F]", FmtSint(frame->tick))
);
@@ -4951,11 +4952,10 @@ void V_TickForever(WaveLaneCtx *lane)
frame->gpu_arena, frame->cl,
G_Format_R16G16B16A16_Float,
G_DimsFromMip2D(G_Count2D(screen_target), 1),
- G_Layout_DirectQueue_ShaderReadWrite,
+ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite,
.flags = G_ResourceFlag_AllowShaderReadWrite | G_ResourceFlag_AllowRenderTarget,
.name = StringF(frame->arena, "Bloom target [%F]", FmtSint(frame->tick)),
- // .max_mips = 4
- .max_mips = 8
+ .max_mips = 64
);
for (i32 mip_idx = 0; mip_idx < G_CountMips(bloom_target); ++mip_idx)
{
@@ -4979,7 +4979,7 @@ void V_TickForever(WaveLaneCtx *lane)
frame->gpu_arena, frame->cl,
G_Format_R16G16B16A16_Float,
frame->shade_dims,
- G_Layout_DirectQueue_ShaderReadWrite,
+ G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite,
.flags = G_ResourceFlag_AllowShaderReadWrite,
.name = StringF(frame->arena, "Shade target [%F]", FmtSint(frame->tick))
);
@@ -5091,6 +5091,9 @@ void V_TickForever(WaveLaneCtx *lane)

// Sync particles & occluders
G_DumbGlobalMemorySync(frame->cl);
+
+ // Transition albedo
+ G_DumbMemoryLayoutSync(frame->cl, albedo_target, G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite);
}

//////////////////////////////
@@ -5113,83 +5116,63 @@ void V_TickForever(WaveLaneCtx *lane)
G_Compute(frame->cl, V_ShadeCS, V_ThreadGroupSizeFromTexSize(frame->shade_dims));
}

- //////////////////////////////
- //- Transition G-buffers to readonly
-
- {
- G_DumbMemoryLayoutSync(frame->cl, albedo_target, G_Layout_DirectQueue_ShaderRead);
- G_DumbMemoryLayoutSync(frame->cl, shade_target, G_Layout_DirectQueue_ShaderRead);
- }
-
//////////////////////////////
//- Composite pass

{
G_Compute(frame->cl, V_CompositeCS, V_ThreadGroupSizeFromTexSize(frame->screen_dims));

- G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_ShaderRead);
+ // Sync screen tex
+ G_DumbGlobalMemorySync(frame->cl);
}

//////////////////////////////
//- Bloom passes

{
- i32 mips_count = G_CountMips(bloom_target);
+ i32 mips_count = G_CountMips(bloom_target) + 1;
+ G_SetConstant(frame->cl, V_GpuConst_MipsCount, mips_count);
+
+ // NOTE: Because the bloom mip chain starts at half screen size, mip_idx 0
+ // represents the screen texture, while mip_idx N (N >= 1) maps to mip
+ // N - 1 of the bloom mip chain

//- Downsample + blur passes
- for (i32 mip_idx = 0; mip_idx < mips_count; ++mip_idx)
+ for (i32 mip_idx = 1; mip_idx < mips_count; ++mip_idx)
{
- Vec2I32 dims = G_DimsFromMip2D(G_Count2D(bloom_target), mip_idx);
- if (mip_idx == 0)
- {
- // Init bloom pyramid from screen target on first pass (prefilter)
- gpu_flags |= V_GpuFlag_InitBloom;
- G_SetConstant(frame->cl, V_GpuConst_Flags, gpu_flags);
- G_SetConstant(frame->cl, V_GpuConst_BloomRead, frame->screen_ro);
- }
- else
- {
- G_DumbMemoryLayoutSync(frame->cl, bloom_target, G_Layout_DirectQueue_ShaderRead, .mips = RNGI32(mip_idx - 1, mip_idx - 1));
- G_SetConstant(frame->cl, V_GpuConst_BloomRead, frame->bloom_mips_ro[mip_idx - 1]);
- }
- G_SetConstant(frame->cl, V_GpuConst_BloomWrite, frame->bloom_mips_rw[mip_idx]);
- {
- G_Compute(frame->cl, V_BloomDownCS, V_ThreadGroupSizeFromTexSize(dims));
- }
- gpu_flags &= ~V_GpuFlag_InitBloom;
- G_SetConstant(frame->cl, V_GpuConst_Flags, gpu_flags);
+ Vec2I32 down_dims = G_DimsFromMip2D(G_Count2D(screen_target), mip_idx);
+
+ G_SetConstant(frame->cl, V_GpuConst_MipIdx, mip_idx);
+ G_Compute(frame->cl, V_BloomDownCS, V_ThreadGroupSizeFromTexSize(down_dims));
+
+ G_DumbGlobalMemorySync(frame->cl);
}

//- Upsample passes
for (i32 mip_idx = mips_count - 2; mip_idx >= 0; --mip_idx)
{
- Vec2I32 dims = G_DimsFromMip2D(G_Count2D(bloom_target), mip_idx);
-
- G_DumbMemoryLayoutSync(frame->cl, bloom_target, G_Layout_DirectQueue_ShaderReadWrite, .mips = RNGI32(mip_idx, mip_idx));
- G_DumbMemoryLayoutSync(frame->cl, bloom_target, G_Layout_DirectQueue_ShaderRead, .mips = RNGI32(mip_idx + 1, mip_idx + 1));
+ Vec2I32 up_dims = G_DimsFromMip2D(G_Count2D(screen_target), mip_idx);

- G_SetConstant(frame->cl, V_GpuConst_BloomRead, frame->bloom_mips_ro[mip_idx + 1]);
- G_SetConstant(frame->cl, V_GpuConst_BloomWrite, frame->bloom_mips_rw[mip_idx]);
+ G_SetConstant(frame->cl, V_GpuConst_MipIdx, mip_idx);
+ G_Compute(frame->cl, V_BloomUpCS, V_ThreadGroupSizeFromTexSize(up_dims));

- G_Compute(frame->cl, V_BloomUpCS, V_ThreadGroupSizeFromTexSize(dims));
- }
+ G_DumbGlobalMemorySync(frame->cl);
+ }
}

//////////////////////////////
- //- Post process pass
+ //- Finalization pass

{
- G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_ShaderReadWrite);
- G_DumbMemoryLayoutSync(frame->cl, bloom_target, G_Layout_DirectQueue_ShaderRead, .mips = RNGI32(0, 0));
- G_Compute(frame->cl, V_PostProcessCS, V_ThreadGroupSizeFromTexSize(frame->screen_dims));
+ G_Compute(frame->cl, V_FinalizeCS, V_ThreadGroupSizeFromTexSize(frame->screen_dims));
}

//////////////////////////////
//- Debug shapes pass

- G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_RenderTargetWrite);
-
{
+ G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_RenderTargetWrite);
+
G_Rasterize(
frame->cl,
V_DVertVS, V_DVertPS,
@@ -5198,12 +5181,13 @@ void V_TickForever(WaveLaneCtx *lane)
screen_viewport, screen_scissor,
G_RasterMode_TriangleList
);
+
+ G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_ShaderRead_ShaderReadWrite_CopyRead_CopyWrite);
}

//////////////////////////////
//- Finalize screen target

- G_DumbMemoryLayoutSync(frame->cl, screen_target, G_Layout_DirectQueue_ShaderRead);
{
Rng2 uv = Zi;
uv.p0 = Vec2FromVec(screen_viewport.p0);
diff --git a/src/pp/pp_vis/pp_vis_gpu.g b/src/pp/pp_vis/pp_vis_gpu.g
index f8a254de..c0a9e47d 100644
--- a/src/pp/pp_vis/pp_vis_gpu.g
+++ b/src/pp/pp_vis/pp_vis_gpu.g
@@ -53,13 +53,6 @@ Vec4 V_ColorFromParticle(V_ParticleDesc desc, u32 particle_idx, u32 density)
return result;
}

-// ACES approximation by Krzysztof Narkowicz
-// https://knarkowicz.wordpress.com/2016/01/06/aces-filmic-tone-mapping-curve/
-Vec3 V_ToneMap(Vec3 v)
-{
- return saturate((v * (2.51f * v + 0.03f)) / (v * (2.43f * v + 0.59f) + 0.14f));
-}
-
////////////////////////////////////////////////////////////
//~ Prepare frame

@@ -142,11 +135,11 @@ ComputeShader2D(V_PrepareCellsCS, 8, 8)
}
else if (over_stain.a > 0)
{
- Vec4 stain = dry_stains[cell_pos];
Vec4 dry_stain = max(dry_stains[cell_pos], 0);
+ Vec4 stain = dry_stain;

- stain = BlendPremul(over_stain, stain);
dry_stain = BlendPremul(over_dry_stain, dry_stain);
+ stain = BlendPremul(over_stain, stain);

stains[cell_pos] = stain;
dry_stains[cell_pos] = dry_stain;
@@ -483,7 +476,7 @@ ComputeShader(V_SimParticlesCS, 64)
particle.prev_occluder = occluder;
}

- if (!AnyBit(desc.flags, V_ParticleFlag_NoPruneWhenStill) && dot(particle.velocity, particle.velocity) < 0.0001)
+ if (dot(particle.velocity, particle.velocity) < (desc.prune_speed_threshold * desc.prune_speed_threshold))
{
prune = 1;
}
@@ -723,7 +716,6 @@ ComputeShader2D(V_CompositeCS, 8, 8)
Vec4 ground_particle_color = 0;
Vec4 air_particle_color = 0;

-
for (V_ParticleLayer layer = (V_ParticleLayer)0; layer < V_ParticleLayer_COUNT; layer += (V_ParticleLayer)1)
{
RWTexture2D<u32> cells = G_Dereference<u32>(frame.particle_cells[layer]);
@@ -752,9 +744,9 @@ ComputeShader2D(V_CompositeCS, 8, 8)
// Darken wall particles / stains
if (tile == P_TileKind_Wall)
{
- ground_particle_color *= 0.25;
- air_particle_color *= 0.25;
- stain_color *= 0.25;
+ ground_particle_color *= 0.5;
+ air_particle_color *= 0.5;
+ stain_color *= 0.5;
}

//////////////////////////////
@@ -972,57 +964,73 @@ ComputeShader2D(V_CompositeCS, 8, 8)
////////////////////////////////////////////////////////////
//~ Bloom

+//////////////////////////////
+//- Downsample
+
ComputeShader2D(V_BloomDownCS, 8, 8)
{
+ i32 mips_count = V_GpuConst_MipsCount;
+ i32 mip_idx = V_GpuConst_MipIdx;
+
V_SharedFrame frame = G_Dereference<V_SharedFrame>(V_GpuConst_Frame)[0];
- Texture2D<Vec4> bloom_up = G_Dereference<Vec4>(V_GpuConst_BloomRead);
- RWTexture2D<Vec4> bloom_down = G_Dereference<Vec4>(V_GpuConst_BloomWrite);
SamplerState sampler = G_Dereference(frame.basic_samplers[G_BasicSamplerKind_BilinearClamp]);
+ RWTexture2D<Vec4> bloom_down = G_Dereference<Vec4>(frame.bloom_mips_rw[mip_idx - 1]);
+
+ Texture2D<Vec4> bloom_up;
+ b32 is_first_pass = mip_idx == 1;
+ if (is_first_pass)
+ {
+ bloom_up = G_Dereference<Vec4>(frame.screen_ro);
+ }
+ else
+ {
+ bloom_up = G_Dereference<Vec4>(frame.bloom_mips_ro[mip_idx - 2]);
+ }

- Vec2 up_dims = countof(bloom_up);
Vec2 down_dims = countof(bloom_down);

Vec2 bloom_pos = SV_DispatchThreadID + 0.5;
Vec2 bloom_uv = bloom_pos / down_dims;
Vec2 off_uv = 0.5 / down_dims;
- b32 is_first_pass = !!(V_GpuConst_Flags & V_GpuFlag_InitBloom);

- Struct(SampleDesc) { Vec2 uv; f32 weight; };
- SampleDesc samples[] = {
- { bloom_uv + Vec2(0, 0), 0.5 },
- { bloom_uv + Vec2(-off_uv.x, -off_uv.y), 0.125 },
- { bloom_uv + Vec2(off_uv.x, -off_uv.y), 0.125 },
- { bloom_uv + Vec2(off_uv.x, off_uv.y), 0.125 },
- { bloom_uv + Vec2(-off_uv.x, off_uv.y), 0.125 },
- };
+ f32 threshold = 0.25;
+ f32 knee = 0.75;

Vec4 result = 0;
- for (u32 sample_idx = 0; sample_idx < countof(samples); ++sample_idx)
{
- SampleDesc desc = samples[sample_idx];
- Vec4 src = bloom_up.SampleLevel(sampler, desc.uv, 0);
-
- f32 knee_weight = 1;
- if (is_first_pass)
+ Struct(SampleDesc) { Vec2 uv; f32 weight; };
+ SampleDesc samples[] = {
+ { bloom_uv + Vec2(0, 0), 0.5 },
+ { bloom_uv + Vec2(-off_uv.x, -off_uv.y), 0.125 },
+ { bloom_uv + Vec2(off_uv.x, -off_uv.y), 0.125 },
+ { bloom_uv + Vec2(off_uv.x, off_uv.y), 0.125 },
+ { bloom_uv + Vec2(-off_uv.x, off_uv.y), 0.125 },
+ };
+ for (u32 sample_idx = 0; sample_idx < countof(samples); ++sample_idx)
{
- f32 luminance = LuminanceFromColor(src);
- f32 max_rgb = max(max(src.r, src.g), src.b); // So that we can get bloom on colors with high rgb, not just high luminance
- f32 bright = max(luminance, (max_rgb - 1.0) * 0.5);
- if (bright > 0)
- {
- f32 threshold = 1.0;
- f32 knee = 0.5;
- f32 over_threshold = max(bright - threshold, 0.0);
- f32 ramp = saturate(over_threshold / knee);
- knee_weight = (over_threshold * ramp * ramp) / bright;
- }
- else
+ SampleDesc desc = samples[sample_idx];
+ Vec4 src = bloom_up.SampleLevel(sampler, desc.uv, 0);
+
+ f32 knee_weight = 1;
+ if (is_first_pass)
{
- knee_weight = 0;
+ f32 luminance = LuminanceFromColor(src);
+ f32 max_rgb = max(max(src.r, src.g), src.b); // So that we can get bloom on colors with high rgb, not just high luminance
+ f32 bright = max(luminance, (max_rgb - 1.0) * 0.5);
+ if (bright > 0)
+ {
+ f32 over_threshold = max(bright - threshold, 0.0);
+ f32 ramp = saturate(over_threshold / knee);
+ knee_weight = (over_threshold * ramp * ramp) / bright;
+ }
+ else
+ {
+ knee_weight = 0;
+ }
}
- }

- result += src * desc.weight * knee_weight;
+ result += src * desc.weight * knee_weight;
+ }
}

if (IsInside(bloom_pos, down_dims))
@@ -1031,52 +1039,77 @@ ComputeShader2D(V_BloomDownCS, 8, 8)
}
}

+//////////////////////////////
+//- Upsample
+
ComputeShader2D(V_BloomUpCS, 8, 8)
{
+ i32 mips_count = V_GpuConst_MipsCount;
+ i32 mip_idx = V_GpuConst_MipIdx;
+
V_SharedFrame frame = G_Dereference<V_SharedFrame>(V_GpuConst_Frame)[0];
- Texture2D<Vec4> bloom_down = G_Dereference<Vec4>(V_GpuConst_BloomRead);
- RWTexture2D<Vec4> bloom_up = G_Dereference<Vec4>(V_GpuConst_BloomWrite);
SamplerState sampler = G_Dereference(frame.basic_samplers[G_BasicSamplerKind_BilinearClamp]);
+ Texture2D<Vec4> bloom_down = G_Dereference<Vec4>(frame.bloom_mips_ro[mip_idx]);
+
+ b32 is_last_pass = mip_idx == 0;
+ RWTexture2D<Vec4> bloom_up;
+ if (is_last_pass)
+ {
+ bloom_up = G_Dereference<Vec4>(frame.screen_rw);
+ }
+ else
+ {
+ bloom_up = G_Dereference<Vec4>(frame.bloom_mips_rw[mip_idx - 1]);
+ }

- Vec2 up_dims = countof(bloom_up);
Vec2 down_dims = countof(bloom_down);
+ Vec2 up_dims = countof(bloom_up);

Vec2 bloom_pos = SV_DispatchThreadID + 0.5;
Vec2 bloom_uv = bloom_pos / up_dims;
- Vec2 off_uv = 1 / up_dims;
+ Vec2 off_uv0 = 1 / down_dims;
+ Vec2 off_uv1 = off_uv0 * 2;

Vec4 result = 0;
{
// Center
- result += bloom_down.SampleLevel(sampler, bloom_uv, 0) * 4;
- // Edges
+ result += bloom_down.SampleLevel(sampler, bloom_uv, 0) * 9.0f / 41.0f;
+
+ // Outer Edges
result += (
- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(0, -off_uv.y), 0) +
- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv.x, 0), 0) +
- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(0, off_uv.y), 0) +
- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv.x, 0), 0)
- ) * 2;
- // Corners
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(0, -off_uv1.y), 0) +
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv1.x, 0), 0) +
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(0, off_uv1.y), 0) +
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv1.x, 0), 0)
+ ) * 3.0f / 41.0f;
+
+ // Inner corners
+ result += (
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv0.x, -off_uv0.y), 0) +
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv0.x, -off_uv0.y), 0) +
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv0.x, off_uv0.y), 0) +
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv0.x, off_uv0.y), 0)
+ ) * 4.0f / 41.0f;
+
+ // Outer corners
result += (
- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv.x, -off_uv.y), 0) +
- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv.x, -off_uv.y), 0) +
- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv.x, off_uv.y), 0) +
- bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv.x, off_uv.y), 0)
- );
- // Normalize
- result /= 16;
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv1.x, -off_uv1.y), 0) +
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv1.x, -off_uv1.y), 0) +
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(off_uv1.x, off_uv1.y), 0) +
+ bloom_down.SampleLevel(sampler, bloom_uv + Vec2(-off_uv1.x, off_uv1.y), 0)
+ ) * 1.0f / 41.0f;
}

if (IsInside(bloom_pos, up_dims))
{
- bloom_up[bloom_pos] += result;
+ bloom_up[bloom_pos] += result * 0.75;
}
}

////////////////////////////////////////////////////////////
-//~ Post process
+//~ Finalize

-ComputeShader2D(V_PostProcessCS, 8, 8)
+ComputeShader2D(V_FinalizeCS, 8, 8)
{
V_SharedFrame frame = G_Dereference<V_SharedFrame>(V_GpuConst_Frame)[0];
SamplerState bilinear_sampler = G_Dereference(frame.basic_samplers[G_BasicSamplerKind_BilinearClamp]);
@@ -1084,42 +1117,21 @@ ComputeShader2D(V_PostProcessCS, 8, 8)
RWTexture2D<Vec4> screen_tex = G_Dereference<Vec4>(frame.screen_rw);

Vec2 screen_pos = SV_DispatchThreadID + 0.5;
- Vec2 screen_uv = screen_pos / frame.screen_dims;
b32 is_in_screen = IsInside(screen_pos, frame.screen_dims);
-
- //////////////////////////////
- //- Original
-
- Vec4 original = 0;
if (is_in_screen)
{
- original = screen_tex[screen_pos];
- original.rgb *= original.a;
- }
+ Vec4 result = screen_tex[screen_pos];

+ //- Tone map
+ if (frame.should_tone_map)
+ {
+ // ACES approximation by Krzysztof Narkowicz
+ // https://knarkowicz.wordpress.com/2016/01/06/aces-filmic-tone-mapping-curve/
+ result.rgb = saturate((result.rgb * (2.51f * result.rgb + 0.03f)) / (result.rgb * (2.43f * result.rgb + 0.59f) + 0.14f));
+ }

- //////////////////////////////
- //- Bloom
-
- Vec4 bloom = 0;
- if (is_in_screen)
- {
- bloom = bloom_tex.SampleLevel(bilinear_sampler, screen_uv, 0);
- // bloom.rgb *= bloom.a;
- }
-
- //////////////////////////////
- //- Compose
-
- Vec4 result = Vec4(0, 0, 0, 1);
- result = BlendPremul(original, result);
- result += bloom;
- // result.rgb = V_ToneMap(result);
+ result = Unpremul(result);

- result = Unpremul(result);
-
- if (is_in_screen)
- {
screen_tex[screen_pos] = result;
}
}
diff --git a/src/pp/pp_vis/pp_vis_gpu.gh b/src/pp/pp_vis/pp_vis_gpu.gh
index a47a2335..f176f2f8 100644
--- a/src/pp/pp_vis/pp_vis_gpu.gh
+++ b/src/pp/pp_vis/pp_vis_gpu.gh
@@ -46,7 +46,6 @@ Struct(V_DVertPSOutput)

f32 V_RandFromPos(Vec3 pos);
Vec4 V_ColorFromParticle(V_ParticleDesc desc, u32 particle_idx, u32 density);
-Vec3 V_ToneMap(Vec3 v);

////////////////////////////////////////////////////////////
//~ Shaders
@@ -73,8 +72,8 @@ ComputeShader2D(V_CompositeCS, 8, 8);
ComputeShader2D(V_BloomDownCS, 8, 8);
ComputeShader2D(V_BloomUpCS, 8, 8);

-//- Post process
-ComputeShader2D(V_PostProcessCS, 8, 8);
+//- Finalize
+ComputeShader2D(V_FinalizeCS, 8, 8);

//- Debug shapes
VertexShader(V_DVertVS, V_DVertPSInput);
diff --git a/src/pp/pp_vis/pp_vis_shared.cg b/src/pp/pp_vis/pp_vis_shared.cg
index 2419a6f2..72f6ae8d 100644
--- a/src/pp/pp_vis/pp_vis_shared.cg
+++ b/src/pp/pp_vis/pp_vis_shared.cg
@@ -11,37 +11,42 @@ V_ParticleDesc V_DescFromParticleKind(V_ParticleKind kind)
V_ParticleDesc result;
{
PERSIST Readonly V_ParticleFlag flags[V_ParticleKind_COUNT] = {
- #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) flags,
+ #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) flags,
V_ParticlesXList(X)
#undef X
};
PERSIST Readonly V_ParticleLayer layers[V_ParticleKind_COUNT] = {
- #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) layer,
+ #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) layer,
V_ParticlesXList(X)
#undef X
};
PERSIST Readonly f32 stain_rates[V_ParticleKind_COUNT] = {
- #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) stain_rate,
+ #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) stain_rate,
V_ParticlesXList(X)
#undef X
};
PERSIST Readonly f32 pen_rates[V_ParticleKind_COUNT] = {
- #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) pen_rate,
+ #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) pen_rate,
V_ParticlesXList(X)
#undef X
};
PERSIST Readonly f32 lifetimes[V_ParticleKind_COUNT] = {
- #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) lifetime,
+ #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) lifetime,
+ V_ParticlesXList(X)
+ #undef X
+ };
+ PERSIST Readonly f32 prune_speed_thresholds[V_ParticleKind_COUNT] = {
+ #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) prune_speed_threshold,
V_ParticlesXList(X)
#undef X
};
PERSIST Readonly Vec4 base_colors[V_ParticleKind_COUNT] = {
- #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) base_color,
+ #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) base_color,
V_ParticlesXList(X)
#undef X
};
PERSIST Readonly Vec4 dry_factor[V_ParticleKind_COUNT] = {
- #define X(name, flags, layer, stain_rate, pen_rate, lifetime, base_color, dry_factor) dry_factor,
+ #define X(name, flags, layer, stain_rate, pen_rate, lifetime, prune_speed_threshold, base_color, dry_factor) dry_factor,
V_ParticlesXList(X)
#undef X
};
@@ -51,6 +56,7 @@ V_ParticleDesc V_DescFromParticleKind(V_ParticleKind kind)
result.stain_rate = stain_rates[kind];
result.pen_rate = pen_rates[kind];
result.lifetime = lifetimes[kind];
+ result.prune_speed_threshold = prune_speed_thresholds[kind];
result.base_color = LinearFromSrgb(base_colors[kind]);
result.dry_factor = LinearFromSrgb(dry_factor[kind]);
}
diff --git a/src/pp/pp_vis/pp_vis_shared.cgh b/src/pp/pp_vis/pp_vis_shared.cgh
index 16ca6419..71d88ea5 100644
--- a/src/pp/pp_vis/pp_vis_shared.cgh
+++ b/src/pp/pp_vis/pp_vis_shared.cgh
@@ -9,14 +9,13 @@
Enum(V_GpuFlag)
{
V_GpuFlag_None = 0,
- V_GpuFlag_InitBloom = (1 << 0),
};

G_DeclConstant(V_GpuFlag, V_GpuConst_Flags, 0);
G_DeclConstant(G_StructuredBufferRef, V_GpuConst_Frame, 1);
G_DeclConstant(G_Texture3DRef, V_GpuConst_NoiseTex, 2);
-G_DeclConstant(G_Texture2DRef, V_GpuConst_BloomRead, 3);
-G_DeclConstant(G_RWTexture2DRef, V_GpuConst_BloomWrite, 4);
+G_DeclConstant(i32, V_GpuConst_MipsCount, 3);
+G_DeclConstant(i32, V_GpuConst_MipIdx, 4);

////////////////////////////////////////////////////////////
//~ Particle types
@@ -29,7 +28,6 @@ G_DeclConstant(G_RWTexture2DRef, V_GpuConst_BloomWrite, 4);
Enum(V_ParticleFlag)
{
V_ParticleFlag_None = 0,
- V_ParticleFlag_NoPruneWhenStill = (1 << 0),
V_ParticleFlag_StainWhenPruned = (1 << 1),
V_ParticleFlag_NoReflect = (1 << 2),
V_ParticleFlag_OnlyCollideWithWalls = (1 << 3),
@@ -53,6 +51,7 @@ Enum(V_ParticleLayer)
/* Layer */ V_ParticleLayer_Ground, \
/* Stain rate, pen chance */ 30, 0, \
/* Lifetime */ Inf, \
+ /* Prune speed threshold */ 0.01, \
/* Base color */ CompVec4(0, 0, 0, 0), \
/* Dry color factor */ CompVec4(1, 1, 1, 1) \
) \
@@ -64,8 +63,9 @@ Enum(V_ParticleLayer)
/* Layer */ V_ParticleLayer_Ground, \
/* Stain rate, pen chance */ 100, 0.25, \
/* Lifetime */ Inf, \
- /* Base color */ CompVec4(0.5, 0.1, 0.1, 0.05), \
- /* Dry color factor */ CompVec4(0.5, 0.5, 0.5, 1) \
+ /* Prune speed threshold */ 0.5, \
+ /* Base color */ CompVec4(0.6, 0.1, 0.1, 0.05), \
+ /* Dry color factor */ CompVec4(0.4, 0.4, 0.4, 1) \
) \
X( \
/* Name */ BloodDebris, \
@@ -73,6 +73,7 @@ Enum(V_ParticleLayer)
/* Layer */ V_ParticleLayer_Mid, \
/* Stain rate, pen chance */ 30, 0, \
/* Lifetime */ Inf, \
+ /* Prune speed threshold */ 0.01, \
/* Base color */ CompVec4(0.5, 0.1, 0.1, 0.8), \
/* Dry color factor */ CompVec4(1, 1, 1, 1) \
) \
@@ -82,6 +83,7 @@ Enum(V_ParticleLayer)
/* Layer */ V_ParticleLayer_Mid, \
/* Stain rate, pen chance */ 0, 0, \
/* Lifetime */ Inf, \
+ /* Prune speed threshold */ 0.01, \
/* Base color */ CompVec4(0.4, 0.3, 0.2, 1), \
/* Dry color factor */ CompVec4(1, 1, 1, 1) \
) \
@@ -91,6 +93,7 @@ Enum(V_ParticleLayer)
/* Layer */ V_ParticleLayer_Mid, \
/* Stain rate, pen chance */ 0, 0, \
/* Lifetime */ Inf, \
+ /* Prune speed threshold */ 0.1, \
/* Base color */ CompVec4(2, 0.5, 0, 1), \
/* Dry color factor */ CompVec4(0.2, 0.1, 0.0, 1) \
) \
@@ -102,6 +105,7 @@ Enum(V_ParticleLayer)
/* Layer */ V_ParticleLayer_Mid, \
/* Stain rate, pen chance */ 0, 0, \
/* Lifetime */ 0.075, \
+ /* Prune speed threshold */ 0.01, \
/* Base color */ CompVec4(0.8, 0.6, 0.2, 0.25), \
/* Dry color factor */ CompVec4(1, 1, 1, 1) \
) \
@@ -111,6 +115,7 @@ Enum(V_ParticleLayer)
/* Layer */ V_ParticleLayer_Air, \
/* Stain rate, pen chance */ 0, 0, \
/* Lifetime */ Inf, \
+ /* Prune speed threshold */ 0.01, \
/* Base color */ CompVec4(0.25, 0.25, 0.25, 0.75), \
/* Dry color factor */ CompVec4(1, 1, 1, 1) \
) \
@@ -122,6 +127,7 @@ Enum(V_ParticleLayer)
/* Layer */ V_ParticleLayer_Mid, \
/* Stain rate, pen chance */ 0, 0, \
/* Lifetime */ Inf, \
+ /* Prune speed threshold */ 0.01, \
/* Base color */ CompVec4(1, 1, 0, 1), \
/* Dry color factor */ CompVec4(1, 1, 1, 1) \
) \
@@ -168,6 +174,7 @@ Struct(V_ParticleDesc)
f32 stain_rate;
f32 pen_rate;
f32 lifetime;
+ f32 prune_speed_threshold;
Vec4 base_color;
Vec4 dry_factor;
};
@@ -264,6 +271,7 @@ Struct(V_SharedFrame)

b32 tiles_dirty;
b32 should_clear_particles;
+ b32 should_tone_map;

b32 is_looking;
b32 is_moving;