use dimension-specific vector types for compute shader parameters

2026-03-19 17:01:55 -05:00 · 2026-03-19 17:01:55 -05:00 · cbcec3639f
commit cbcec3639f
parent b63b6197a6
3 changed files with 310 additions and 258 deletions
--- a/src/base/base.cgh
+++ b/src/base/base.cgh
@@ -744,32 +744,34 @@ Struct(VertexShaderDesc)    { ResourceKey resource; u32 x, y, z; };
 Struct(PixelShaderDesc)     { ResourceKey resource; u32 x, y, z; };
 Struct(ComputeShaderDesc)   { ResourceKey resource; u32 x, y, z; };
-#define GroupSize(name) VEC3U32(CAT(name, __GroupSize_X), CAT(name, __GroupSize_Y), CAT(name, __GroupSize_Z))
+#define GroupSize(name) VEC3U32(CAT(name,__GroupSize_X), CAT(name,__GroupSize_Y), CAT(name,__GroupSize_Z))
 #if IsGpu
  #define Semantic(name) name : name
-  #define VertexShader(name, return_type)         return_type name(u32 Semantic(SV_InstanceID), u32 Semantic(SV_VertexID))
+  #define VertexShader(name, return_type)       return_type name(u32 Semantic(SV_InstanceID), u32 Semantic(SV_VertexID))
-  #define PixelShader(name, return_type, ...)     return_type name(__VA_ARGS__)
+  #define PixelShader(name, return_type, ...)   return_type name(__VA_ARGS__)
-  #define ComputeShader(name)                                                                           \
+  #define ComputeShader(name)                                                               \
-    [numthreads(CAT(name, __GroupSize_X), CAT(name, __GroupSize_Y), CAT(name, __GroupSize_Z))]          \
+    [numthreads(CAT(name,__GroupSize_X), CAT(name,__GroupSize_Y), CAT(name,__GroupSize_Z))] \
-    void name(                                                                                          \
+    void name(                                                                              \
-      u32 Semantic(SV_GroupIndex),                                                                      \
+      u32 Semantic(SV_GroupIndex),                                                          \
-      Vec3U32 Semantic(SV_GroupID),                                                                     \
+      CAT(name,__ThreadDimsType) Semantic(SV_GroupID),                                      \
-      Vec3U32 Semantic(SV_GroupThreadID),                                                               \
+      CAT(name,__ThreadDimsType) Semantic(SV_GroupThreadID),                                \
-      Vec3U32 Semantic(SV_DispatchThreadID)                                                             \
+      CAT(name,__ThreadDimsType) Semantic(SV_DispatchThreadID)                              \
-    )
+    )                                                                                       \
    /* ----------------------------------------------------------------------------------- */
 #endif
 #if IsCpu
-  #define DeclComputeShader(name, resource_hash, x, y, z)   enum { CAT(name, __GroupSize_X) = x, CAT(name, __GroupSize_Y) = y, CAT(name, __GroupSize_Z) = z }; static ComputeShaderDesc name = { resource_hash, x, y, z }
+  #define DeclComputeShader(name, resource_hash, x, y, z)   enum { CAT(name,__GroupSize_X) = x, CAT(name,__GroupSize_Y) = y, CAT(name,__GroupSize_Z) = z }; static ComputeShaderDesc name = { resource_hash, x, y, z }
  #define DeclVertexShader(name, resource_hash)             static VertexShaderDesc name = { resource_hash, 1, 1, 1 }
  #define DeclPixelShader(name, resource_hash)              static PixelShaderDesc name = { resource_hash, 1, 1, 1 }
 #elif IsGpu
-  #define DeclComputeShader(name, resource_hash, x, y, z)   enum { CAT(name, __GroupSize_X) = x, CAT(name, __GroupSize_Y) = y, CAT(name, __GroupSize_Z) = z };
+  #define DeclComputeShader(name, resource_hash, x, y, z)   enum { CAT(name,__GroupSize_X) = x, CAT(name,__GroupSize_Y) = y, CAT(name,__GroupSize_Z) = z };
  #define DeclVertexShader(name, resource_hash)
  #define DeclPixelShader(name, resource_hash)
 #endif
 ////////////////////////////////////////////////////////////
 //~ Dynamic api linkage
--- a/src/meta/meta.c
+++ b/src/meta/meta.c
@@ -669,6 +669,7 @@ void M_BuildEntryPoint(WaveLaneCtx *lane)
    //- Generate C file
    StringList shader_lines = Zi;
    StringList shader_thread_dim_type_lines = Zi;
    {
      StringList c_store_lines = Zi;
      StringList c_include_lines = Zi;
@@ -715,14 +716,9 @@ void M_BuildEntryPoint(WaveLaneCtx *lane)
            {
              if (arg0_tok->valid)
              {
                String decl_type = (
                  kind == M_EntryKind_VertexShader  ? Lit("DeclVertexShader") :
                  kind == M_EntryKind_PixelShader   ? Lit("DeclPixelShader") :
                  kind == M_EntryKind_ComputeShader ? Lit("DeclComputeShader") :
                  Lit("")
                );
                String shader_name = arg0_tok->s;
-                Vec3U32 thread_count = Zi;
+                Vec3U32 thread_dims = Zi;
                i32 thread_dims_count = 1;
                {
                  StringList thread_count_args = Zi;
                  for (i32 arg_idx = 1; arg_idx < countof(entry->arg_tokens); ++arg_idx)
@@ -739,36 +735,70 @@ void M_BuildEntryPoint(WaveLaneCtx *lane)
                  }
                  String thread_count_str = StringFromList(perm, thread_count_args, Lit(" "));
                  Vec3 tmp = CR_Vec3FromString(thread_count_str);
-                  thread_count.x = MaxI32(tmp.x, 1);
+                  thread_dims.x = MaxI32(tmp.x, 1);
-                  thread_count.y = MaxI32(tmp.y, 1);
+                  thread_dims.y = MaxI32(tmp.y, 1);
-                  thread_count.z = MaxI32(tmp.z, 1);
+                  thread_dims.z = MaxI32(tmp.z, 1);
                  // Determine compute shader dimensions by counting comma-separated values in dimensions string
                  for (u64 char_idx = 0; char_idx < thread_count_str.len; ++char_idx)
                  {
                    u8 c = thread_count_str.text[char_idx];
                    if (c == ',')
                    {
                      thread_dims_count += 1;
                    }
                  }
                  thread_dims_count = ClampI32(thread_dims_count, 1, 3);
                }
                String decl_type = (
                  kind == M_EntryKind_VertexShader  ? Lit("DeclVertexShader") :
                  kind == M_EntryKind_PixelShader   ? Lit("DeclPixelShader") :
                  kind == M_EntryKind_ComputeShader ? Lit("DeclComputeShader") :
                  Lit("")
                );
                u64 shader_resource_hash = HashStringEx(shader_store_hash, StringF(perm, "%F.dxil", FmtString(shader_name)));
-                String lines = Zi;
+                // Dims type line
                if (kind == M_EntryKind_ComputeShader)
                {
-                  lines = StringF(
+                  String line = StringF(
                    perm,
-                    "%F(%F, 0x%F, %F, %F, %F);",
+                    "#define %F__ThreadDimsType %F",
                    FmtString(decl_type),
                    FmtString(shader_name),
-                    FmtHex(shader_resource_hash),
+                    FmtString(
-                    FmtUint(thread_count.x),
+                      thread_dims_count == 1 ? Lit("u32") :
-                    FmtUint(thread_count.y),
+                      thread_dims_count == 2 ? Lit("Vec2U32") :
-                    FmtUint(thread_count.z)
+                      Lit("Vec3U32")
                    )
                  );
                  PushStringToList(perm, &shader_thread_dim_type_lines, line);
                }
-                else
+                // Shader line
                {
-                  lines = StringF(
+                  String line = Zi;
-                    perm,
+                  if (kind == M_EntryKind_ComputeShader)
-                    "%F(%F, 0x%F);",
+                  {
-                    FmtString(decl_type),
+                    line = StringF(
-                    FmtString(shader_name),
+                      perm,
-                    FmtHex(shader_resource_hash)
+                      "%F(%F, 0x%F, %F, %F, %F);",
-                  );
+                      FmtString(decl_type),
                      FmtString(shader_name),
                      FmtHex(shader_resource_hash),
                      FmtUint(thread_dims.x),
                      FmtUint(thread_dims.y),
                      FmtUint(thread_dims.z)
                    );
                  }
                  else
                  {
                    line = StringF(
                      perm,
                      "%F(%F, 0x%F);",
                      FmtString(decl_type),
                      FmtString(shader_name),
                      FmtHex(shader_resource_hash)
                    );
                  }
                  PushStringToList(perm, &shader_lines, line);
                }
                PushStringToList(perm, &shader_lines, lines);
              }
              else
              {
@@ -836,6 +866,16 @@ void M_BuildEntryPoint(WaveLaneCtx *lane)
            PushStringToList(perm, &c_out_lines, n->s);
          }
        }
        // Define shader dimension types
        if (shader_thread_dim_type_lines.count > 0)
        {
          PushStringToList(perm, &c_out_lines, Lit(""));
          PushStringToList(perm, &c_out_lines, Lit("//- Shader thread dimension types"));
          for (StringListNode *n = shader_thread_dim_type_lines.first; n; n = n->next)
          {
            PushStringToList(perm, &c_out_lines, n->s);
          }
        }
        // Define shaders
        if (shader_lines.count > 0)
        {
@@ -975,6 +1015,16 @@ void M_BuildEntryPoint(WaveLaneCtx *lane)
            PushStringToList(perm, &gpu_out_lines, Lit("//- Base layer includes"));
            PushStringToList(perm, &gpu_out_lines, StringF(perm, "#include \"%F\"", FmtString(base_inc_path)));
          }
          // Define shader dimension types
          if (shader_thread_dim_type_lines.count > 0)
          {
            PushStringToList(perm, &gpu_out_lines, Lit(""));
            PushStringToList(perm, &gpu_out_lines, Lit("//- Shader thread dimension types"));
            for (StringListNode *n = shader_thread_dim_type_lines.first; n; n = n->next)
            {
              PushStringToList(perm, &gpu_out_lines, n->s);
            }
          }
          // Define shaders
          if (shader_lines.count > 0)
          {
--- a/src/pp/pp_vis/pp_vis_gpu.g
+++ b/src/pp/pp_vis/pp_vis_gpu.g
@@ -373,10 +373,10 @@ ComputeShader(V_EmitParticlesCS)
    {
      u32 particle_idx = (emitter.first_particle_seq + emitter_particle_idx) % (u32)V_ParticlesCap;
-      // InterlockedMin guarantees that the highest emitter index (reflected
+      // Using InterlockedMin guarantees that the highest emitter index
-      // as negative particle kind) will be used to initialize the particle
+      // (reflected as negative particle kind) will be used to initialize the
-      // this frame, in case multiple emitters target the same particle (e.g.
+      // particle this frame, in case multiple emitters target the same particle
-      // more particles pushed this frame than are available in the buffer)
+      // (e.g. more particles were pushed this frame than are available in the buffer)
      InterlockedMin(particles[particle_idx].kind, semantic_particle_kind);
    }
  }
@@ -393,267 +393,267 @@ ComputeShader(V_SimParticlesCS)
  Texture2D<u32> occluders = G_Deref(frame.occluders, Texture2D<u32>);
  u32 particle_idx = SV_DispatchThreadID;
-  if (particle_idx < V_ParticlesCap)
+  if (particle_idx < V_ParticlesCap && particles[particle_idx].kind != V_ParticleKind_None)
  {
    V_Particle particle = particles[particle_idx];
    b32 prune = 0;
    u64 seed0 = MixU64(V_ParticleSimBasis ^ particle_idx);
    f32 rand_offset = Norm16(seed0 >> 0);
    f32 rand_angle = Norm16(seed0 >> 16);
    f32 rand_speed = Norm16(seed0 >> 32);
    f32 rand_falloff = Norm16(seed0 >> 48);
    //////////////////////////////
-    //- Initialize particle
+    //- Init particle
-    if (particle.kind != V_ParticleKind_None)
+    if (particle.kind < 0)
    {
-      u64 seed0 = MixU64(V_ParticleSimBasis ^ particle_idx);
+      u32 emitter_idx = -particle.kind - 1;
-      f32 rand_offset = Norm16(seed0 >> 0);
+      V_Emitter emitter = G_Deref(frame.emitters, StructuredBuffer<V_Emitter>)[emitter_idx];
      f32 rand_angle = Norm16(seed0 >> 16);
      f32 rand_speed = Norm16(seed0 >> 32);
      f32 rand_falloff = Norm16(seed0 >> 48);
-      //////////////////////////////
+      f32 initial_angle = lerp(emitter.angle.min, emitter.angle.max, rand_angle);
-      //- Init
+      f32 initial_speed = lerp(emitter.speed.min, emitter.speed.max, rand_speed);
-      if (particle.kind < 0)
+      particle = (V_Particle)0;
      particle.kind = emitter.kind;
      particle.life = 0;
      particle.pos = lerp(emitter.pos.p0, emitter.pos.p1, rand_offset);
      particle.velocity = Vec2(cos(initial_angle), sin(initial_angle)) * initial_speed;
    }
    //////////////////////////////
    //- Simulate
    if (particle.kind > V_ParticleKind_None && particle.kind < V_ParticleKind_COUNT && !prune)
    {
      V_ParticleDesc desc = V_DescFromParticleKind((V_ParticleKind)particle.kind);
      RWTexture2D<u32> cells = G_Deref(frame.particle_cells[desc.layer], RWTexture2D<u32>);
      RWTexture2D<u32> densities = G_Deref(frame.particle_densities[desc.layer], RWTexture2D<u32>);
      u32 packed = 0;
      packed |= (particle_idx & ((1 >> 24) - 1)) << 0;
      packed |= (particle.kind & 0xFF) << 24;
      StaticAssert(V_ParticlesCap <= (1 << 24));  // particle idx must fit in 24 bits
      StaticAssert(V_ParticleKind_COUNT <= 0x7F); // particle kind must fit in 7 bits
      if (particle.life == 0)
      {
-        u32 emitter_idx = -particle.kind - 1;
+        Vec2 cell_pos = mul(frame.af.world_to_cell, Vec3(particle.pos, 1));
-        V_Emitter emitter = G_Deref(frame.emitters, StructuredBuffer<V_Emitter>)[emitter_idx];
+        if (IsInside(cell_pos, P_WorldCellsDims))
        f32 initial_angle = lerp(emitter.angle.min, emitter.angle.max, rand_angle);
        f32 initial_speed = lerp(emitter.speed.min, emitter.speed.max, rand_speed);
        particle = (V_Particle)0;
        particle.kind = emitter.kind;
        particle.life = 0;
        particle.pos = lerp(emitter.pos.p0, emitter.pos.p1, rand_offset);
        particle.velocity = Vec2(cos(initial_angle), sin(initial_angle)) * initial_speed;
      }
      if (particle.kind > V_ParticleKind_None && particle.kind < V_ParticleKind_COUNT && !prune)
      {
        V_ParticleDesc desc = V_DescFromParticleKind((V_ParticleKind)particle.kind);
        RWTexture2D<u32> cells = G_Deref(frame.particle_cells[desc.layer], RWTexture2D<u32>);
        RWTexture2D<u32> densities = G_Deref(frame.particle_densities[desc.layer], RWTexture2D<u32>);
        u32 packed = 0;
        packed |= (particle_idx & ((1 >> 24) - 1)) << 0;
        packed |= (particle.kind & 0xFF) << 24;
        StaticAssert(V_ParticlesCap <= (1 << 24));  // particle idx must fit in 24 bits
        StaticAssert(V_ParticleKind_COUNT <= 0x7F); // particle kind must fit in 7 bits
        if (particle.life == 0)
        {
-          Vec2 cell_pos = mul(frame.af.world_to_cell, Vec3(particle.pos, 1));
+          u32 occluder = occluders[cell_pos];
-          if (IsInside(cell_pos, P_WorldCellsDims))
+          b32 occluder_is_wall = occluder == 0xFFFFFFFF;
          if (!(AnyBit(desc.flags, V_ParticleFlag_OnlyCollideWithWalls) && !occluder_is_wall))
          {
-            u32 occluder = occluders[cell_pos];
+            particle.origin_occluder = occluders[cell_pos];
-            b32 occluder_is_wall = occluder == 0xFFFFFFFF;
+            particle.prev_occluder = particle.origin_occluder;
            if (!(AnyBit(desc.flags, V_ParticleFlag_OnlyCollideWithWalls) && !occluder_is_wall))
            {
              particle.origin_occluder = occluders[cell_pos];
              particle.prev_occluder = particle.origin_occluder;
            }
          }
          else
          {
            prune = 1;
          }
        }
-
+        else
        //////////////////////////////
        //- Move
        b32 collision = 0;
        // TODO: Clip to avoid unnecessary iterations outside of world bounds
        if (!prune)
        {
-          Vec2 p0 = particle.pos;
+          prune = 1;
-          Vec2 p1 = particle.pos + particle.velocity * frame.dt;
+        }
-          f32 t = 1;
+      }
      //////////////////////////////
      //- Move
      b32 collision = 0;
      // TODO: Clip to avoid unnecessary iterations outside of world bounds
      if (!prune)
      {
        Vec2 p0 = particle.pos;
        Vec2 p1 = particle.pos + particle.velocity * frame.dt;
        f32 t = 1;
        {
          Vec2 occluder_p0 = mul(frame.af.world_to_cell, Vec3(p0, 1));
          Vec2 occluder_p1 = mul(frame.af.world_to_cell, Vec3(p1, 1));
          Vec2I32 cell_p0 = floor(occluder_p0);
          Vec2I32 cell_p1 = floor(occluder_p1);
          Vec2 delta = occluder_p1 - occluder_p0;
          Vec2 inv_delta = 1.0 / delta;
          Vec2 dda_step_dir = Vec2((delta.x > 0) - (delta.x < 0), (delta.y > 0) - (delta.y < 0));
          Vec2 t_delta = abs(inv_delta);
          Vec2 t_max = cell_p0 - occluder_p0;
          t_max.x += dda_step_dir.x > 0;
          t_max.y += dda_step_dir.y > 0;
          t_max *= inv_delta;
          t_max = abs(t_max);
          Vec2 t_hit = 0;
          Vec2I32 cell_pos = cell_p0;
          b32 stepped_x = 0;
          b32 stepped_y = 0;
          // TODO: Tune this
          u32 max_iterations = 128;
          b32 done = 0;
          f32 t_diff = 0;
          u32 iteration_idx = 0;
          for (; iteration_idx < max_iterations && !done; ++iteration_idx)
          {
-            Vec2 occluder_p0 = mul(frame.af.world_to_cell, Vec3(p0, 1));
+            if (cell_pos.x == cell_p1.x && cell_pos.y == cell_p1.y)
            Vec2 occluder_p1 = mul(frame.af.world_to_cell, Vec3(p1, 1));
            Vec2I32 cell_p0 = floor(occluder_p0);
            Vec2I32 cell_p1 = floor(occluder_p1);
            Vec2 delta = occluder_p1 - occluder_p0;
            Vec2 inv_delta = 1.0 / delta;
            Vec2 dda_step_dir = Vec2((delta.x > 0) - (delta.x < 0), (delta.y > 0) - (delta.y < 0));
            Vec2 t_delta = abs(inv_delta);
            Vec2 t_max = cell_p0 - occluder_p0;
            t_max.x += dda_step_dir.x > 0;
            t_max.y += dda_step_dir.y > 0;
            t_max *= inv_delta;
            t_max = abs(t_max);
            Vec2 t_hit = 0;
            Vec2I32 cell_pos = cell_p0;
            b32 stepped_x = 0;
            b32 stepped_y = 0;
            // TODO: Tune this
            u32 max_iterations = 128;
            b32 done = 0;
            f32 t_diff = 0;
            u32 iteration_idx = 0;
            for (; iteration_idx < max_iterations && !done; ++iteration_idx)
            {
-              if (cell_pos.x == cell_p1.x && cell_pos.y == cell_p1.y)
+              done = 1;
-              {
+            }
-                done = 1;
+            else if (t_max.x < t_max.y)
-              }
+            {
-              else if (t_max.x < t_max.y)
+              cell_pos.x += dda_step_dir.x;
-              {
+              f32 old = t_hit.x;
-                cell_pos.x += dda_step_dir.x;
+              t_hit.x = t_max.x - t_delta.x;
-                f32 old = t_hit.x;
+              t_diff = t_hit.x - old;
-                t_hit.x = t_max.x - t_delta.x;
+              t_max.x += t_delta.x;
-                t_diff = t_hit.x - old;
+              stepped_x = 1;
-                t_max.x += t_delta.x;
+              stepped_y = 0;
-                stepped_x = 1;
+            }
-                stepped_y = 0;
+            else
-              }
+            {
-              else
+              cell_pos.y += dda_step_dir.y;
-              {
+              f32 old = t_hit.y;
-                cell_pos.y += dda_step_dir.y;
+              t_hit.y = t_max.y - t_delta.y;
-                f32 old = t_hit.y;
+              t_diff = t_hit.y - old;
-                t_hit.y = t_max.y - t_delta.y;
+              t_max.y += t_delta.y;
-                t_diff = t_hit.y - old;
+              stepped_x = 0;
-                t_max.y += t_delta.y;
+              stepped_y = 1;
-                stepped_x = 0;
+            }
                stepped_y = 1;
              }
-              Vec2 cell_screen_pos_p0 = mul(frame.af.world_to_screen, Vec3(mul(frame.af.cell_to_world, Vec3(floor(cell_pos), 1)), 1));
+            Vec2 cell_screen_pos_p0 = mul(frame.af.world_to_screen, Vec3(mul(frame.af.cell_to_world, Vec3(floor(cell_pos), 1)), 1));
-              Vec2 cell_screen_pos_p1 = mul(frame.af.world_to_screen, Vec3(mul(frame.af.cell_to_world, Vec3(ceil(cell_pos), 1)), 1));
+            Vec2 cell_screen_pos_p1 = mul(frame.af.world_to_screen, Vec3(mul(frame.af.cell_to_world, Vec3(ceil(cell_pos), 1)), 1));
-              cell_screen_pos_p1 = max(cell_screen_pos_p1, cell_screen_pos_p0 + 1);
+            cell_screen_pos_p1 = max(cell_screen_pos_p1, cell_screen_pos_p0 + 1);
-              b32 is_in_world = IsInside(cell_pos, P_WorldCellsDims);
+            b32 is_in_world = IsInside(cell_pos, P_WorldCellsDims);
-              b32 is_visible = all(cell_screen_pos_p1 >= 0) && all(cell_screen_pos_p0 < frame.screen_dims);
+            b32 is_visible = all(cell_screen_pos_p1 >= 0) && all(cell_screen_pos_p0 < frame.screen_dims);
-              if (is_in_world)
+            if (is_in_world)
            {
              f32 stain_delta = abs(t_diff) * desc.stain_rate * frame.dt;
              particle.stain_accum += stain_delta;
              //- Handle collision
              {
-                f32 stain_delta = abs(t_diff) * desc.stain_rate * frame.dt;
+                u32 occluder = occluders[cell_pos];
-                particle.stain_accum += stain_delta;
+                b32 occluder_is_wall = occluder == 0xFFFFFFFF;
-
+                if (occluder != particle.origin_occluder)
                //- Handle collision
                {
-                  u32 occluder = occluders[cell_pos];
+                  particle.origin_occluder = 0;
-                  b32 occluder_is_wall = occluder == 0xFFFFFFFF;
+                }
-                  if (occluder != particle.origin_occluder)
+                if (
                  occluder != 0 &&
                  !(AnyBit(desc.flags, V_ParticleFlag_OnlyCollideWithWalls) && !occluder_is_wall) &&
                  occluder != particle.origin_occluder
                )
                {
                  u64 collision_seed = MixU64(V_ParticleCellBasis ^ seed0 ^ particle.cells_count);
                  f32 rand_collision_angle = Norm16(collision_seed >> 0);
                  f32 rand_collision_velocity = Norm16(collision_seed >> 16);
                  f32 rand_collision_penetration = Norm16(collision_seed >> 32);
                  if (rand_collision_penetration >= desc.pen_rate)
                  {
-                    particle.origin_occluder = 0;
+                    collision = 1;
-                  }
+                    done = 1;
                  if (
                    occluder != 0 &&
                    !(AnyBit(desc.flags, V_ParticleFlag_OnlyCollideWithWalls) && !occluder_is_wall) &&
                    occluder != particle.origin_occluder
                  )
                  {
                    u64 collision_seed = MixU64(V_ParticleCellBasis ^ seed0 ^ particle.cells_count);
                    f32 rand_collision_angle = Norm16(collision_seed >> 0);
                    f32 rand_collision_velocity = Norm16(collision_seed >> 16);
                    f32 rand_collision_penetration = Norm16(collision_seed >> 32);
                    if (rand_collision_penetration >= desc.pen_rate)
                    {
-                      collision = 1;
+                      if (stepped_x)
                      done = 1;
                      {
-                        if (stepped_x)
+                        if (!AnyBit(desc.flags, V_ParticleFlag_NoReflect))
                        {
-                          if (!AnyBit(desc.flags, V_ParticleFlag_NoReflect))
+                          particle.velocity.x *= -1;
                          {
                            particle.velocity.x *= -1;
                          }
                          t = saturate(t_hit.x);
                        }
-                        else if (stepped_y)
+                        t = saturate(t_hit.x);
                      }
                      else if (stepped_y)
                      {
                        if (!AnyBit(desc.flags, V_ParticleFlag_NoReflect))
                        {
-                          if (!AnyBit(desc.flags, V_ParticleFlag_NoReflect))
+                          particle.velocity.y *= -1;
                          {
                            particle.velocity.y *= -1;
                          }
                          t = saturate(t_hit.y);
                        }
-                        {
+                        t = saturate(t_hit.y);
-                          f32 collision_angle = lerp(-0.05 * Tau, 0.05 * Tau, rand_collision_angle);
+                      }
-                          // f32 collision_angle = 0;
+                      {
                        f32 collision_angle = lerp(-0.05 * Tau, 0.05 * Tau, rand_collision_angle);
                        // f32 collision_angle = 0;
-                          // f32 collision_velocity_falloff = lerp(50, 100, rand_collision_velocity);
+                        // f32 collision_velocity_falloff = lerp(50, 100, rand_collision_velocity);
-                          // f32 collision_velocity_falloff = lerp(5000, 10000, rand_collision_velocity);
+                        // f32 collision_velocity_falloff = lerp(5000, 10000, rand_collision_velocity);
-                          // f32 collision_velocity_falloff = lerp(500, 10000, rand_collision_velocity);
+                        // f32 collision_velocity_falloff = lerp(500, 10000, rand_collision_velocity);
-                          f32 collision_velocity_falloff = lerp(50, 100, rand_collision_velocity);
+                        f32 collision_velocity_falloff = lerp(50, 100, rand_collision_velocity);
-                          // f32 collision_velocity_falloff = 0;
+                        // f32 collision_velocity_falloff = 0;
-                          particle.velocity = RotateVec2Angle(particle.velocity, collision_angle);
+                        particle.velocity = RotateVec2Angle(particle.velocity, collision_angle);
-                          particle.velocity *= 1.0f - saturate(collision_velocity_falloff * frame.dt);
+                        particle.velocity *= 1.0f - saturate(collision_velocity_falloff * frame.dt);
                        }
                      }
                    }
                  }
                  particle.prev_occluder = occluder;
                }
                if (dot(particle.velocity, particle.velocity) < (desc.prune_speed_threshold * desc.prune_speed_threshold))
                {
                  prune = 1;
                }
                if (prune)
                {
                  done = 1;
                  if (AnyBit(desc.flags, V_ParticleFlag_StainWhenPruned))
                  {
                    // particle.stain_accum = max(particle.stain_accum, 1);
                    particle.stain_accum += 1;
                    packed |= 1 << 31;
                  }
                }
                if (!collision && particle.origin_occluder != 0xFFFFFFFF)
                {
                  u32 stain_count = floor(particle.stain_accum);
                  u32 density = 1 + stain_count;
                  u32 commit = packed;
                  if (stain_count > 0)
                  {
                    commit |= (1 << 31);
                  }
                  InterlockedMax(cells[cell_pos], commit);
                  InterlockedAdd(densities[cell_pos], density);
                  particle.stain_accum -= stain_count;
                }
                particle.prev_occluder = occluder;
              }
-              else
+
              if (dot(particle.velocity, particle.velocity) < (desc.prune_speed_threshold * desc.prune_speed_threshold))
              {
                done = 1;
                prune = 1;
              }
-              particle.cells_count += 1;
+              if (prune)
              {
                done = 1;
                if (AnyBit(desc.flags, V_ParticleFlag_StainWhenPruned))
                {
                  // particle.stain_accum = max(particle.stain_accum, 1);
                  particle.stain_accum += 1;
                  packed |= 1 << 31;
                }
              }
              if (!collision && particle.origin_occluder != 0xFFFFFFFF)
              {
                u32 stain_count = floor(particle.stain_accum);
                u32 density = 1 + stain_count;
                u32 commit = packed;
                if (stain_count > 0)
                {
                  commit |= (1 << 31);
                }
                InterlockedMax(cells[cell_pos], commit);
                InterlockedAdd(densities[cell_pos], density);
                particle.stain_accum -= stain_count;
              }
            }
            else
            {
              done = 1;
              prune = 1;
            }
            particle.cells_count += 1;
          }
          f32 falloff = saturate(lerp(10, 20, rand_falloff) * frame.dt);
          // f32 falloff = saturate(lerp(1, 2, rand_falloff) * frame.dt);
          particle.velocity *= 1.0f - falloff;
          particle.pos = p0 + (p1 - p0) * t;
        }
-        particle.life += frame.dt;
+        f32 falloff = saturate(lerp(10, 20, rand_falloff) * frame.dt);
        // f32 falloff = saturate(lerp(1, 2, rand_falloff) * frame.dt);
        particle.velocity *= 1.0f - falloff;
        particle.pos = p0 + (p1 - p0) * t;
      }
-      if (prune)
+      particle.life += frame.dt;
      {
        particle.kind = V_ParticleKind_None;
      }
      particles[particle_idx] = particle;
    }
    //////////////////////////////
    //- Commit
    if (prune)
    {
      particle.kind = V_ParticleKind_None;
    }
    particles[particle_idx] = particle;
  }
 }