use dimension-specific vector types for compute shader parameters

2026-03-19 17:01:55 -05:00 · 2026-03-19 17:01:55 -05:00 · cbcec3639f
commit cbcec3639f
parent b63b6197a6
3 changed files with 310 additions and 258 deletions
--- a/src/base/base.cgh
+++ b/src/base/base.cgh
@ -744,32 +744,34 @@ Struct(VertexShaderDesc)    { ResourceKey resource; u32 x, y, z; };
 Struct(PixelShaderDesc)     { ResourceKey resource; u32 x, y, z; };
 Struct(ComputeShaderDesc)   { ResourceKey resource; u32 x, y, z; };

-#define GroupSize(name) VEC3U32(CAT(name, __GroupSize_X), CAT(name, __GroupSize_Y), CAT(name, __GroupSize_Z))
+#define GroupSize(name) VEC3U32(CAT(name,__GroupSize_X), CAT(name,__GroupSize_Y), CAT(name,__GroupSize_Z))

 #if IsGpu
  #define Semantic(name) name : name
-  #define VertexShader(name, return_type)         return_type name(u32 Semantic(SV_InstanceID), u32 Semantic(SV_VertexID))
-  #define PixelShader(name, return_type, ...)     return_type name(__VA_ARGS__)
-  #define ComputeShader(name)                                                                           \
-    [numthreads(CAT(name, __GroupSize_X), CAT(name, __GroupSize_Y), CAT(name, __GroupSize_Z))]          \
-    void name(                                                                                          \
-      u32 Semantic(SV_GroupIndex),                                                                      \
-      Vec3U32 Semantic(SV_GroupID),                                                                     \
-      Vec3U32 Semantic(SV_GroupThreadID),                                                               \
-      Vec3U32 Semantic(SV_DispatchThreadID)                                                             \
-    )
+  #define VertexShader(name, return_type)       return_type name(u32 Semantic(SV_InstanceID), u32 Semantic(SV_VertexID))
+  #define PixelShader(name, return_type, ...)   return_type name(__VA_ARGS__)
+  #define ComputeShader(name)                                                               \
+    [numthreads(CAT(name,__GroupSize_X), CAT(name,__GroupSize_Y), CAT(name,__GroupSize_Z))] \
+    void name(                                                                              \
+      u32 Semantic(SV_GroupIndex),                                                          \
+      CAT(name,__ThreadDimsType) Semantic(SV_GroupID),                                      \
+      CAT(name,__ThreadDimsType) Semantic(SV_GroupThreadID),                                \
+      CAT(name,__ThreadDimsType) Semantic(SV_DispatchThreadID)                              \
+    )                                                                                       \
+    /* ----------------------------------------------------------------------------------- */
 #endif

 #if IsCpu
-  #define DeclComputeShader(name, resource_hash, x, y, z)   enum { CAT(name, __GroupSize_X) = x, CAT(name, __GroupSize_Y) = y, CAT(name, __GroupSize_Z) = z }; static ComputeShaderDesc name = { resource_hash, x, y, z }
+  #define DeclComputeShader(name, resource_hash, x, y, z)   enum { CAT(name,__GroupSize_X) = x, CAT(name,__GroupSize_Y) = y, CAT(name,__GroupSize_Z) = z }; static ComputeShaderDesc name = { resource_hash, x, y, z }
  #define DeclVertexShader(name, resource_hash)             static VertexShaderDesc name = { resource_hash, 1, 1, 1 }
  #define DeclPixelShader(name, resource_hash)              static PixelShaderDesc name = { resource_hash, 1, 1, 1 }
 #elif IsGpu
-  #define DeclComputeShader(name, resource_hash, x, y, z)   enum { CAT(name, __GroupSize_X) = x, CAT(name, __GroupSize_Y) = y, CAT(name, __GroupSize_Z) = z };
+  #define DeclComputeShader(name, resource_hash, x, y, z)   enum { CAT(name,__GroupSize_X) = x, CAT(name,__GroupSize_Y) = y, CAT(name,__GroupSize_Z) = z };
  #define DeclVertexShader(name, resource_hash)
  #define DeclPixelShader(name, resource_hash)
 #endif

+
 ////////////////////////////////////////////////////////////
 //~ Dynamic api linkage

--- a/src/meta/meta.c
+++ b/src/meta/meta.c
@ -669,6 +669,7 @@ void M_BuildEntryPoint(WaveLaneCtx *lane)

    //- Generate C file
    StringList shader_lines = Zi;
+    StringList shader_thread_dim_type_lines = Zi;
    {
      StringList c_store_lines = Zi;
      StringList c_include_lines = Zi;
@ -715,14 +716,9 @@ void M_BuildEntryPoint(WaveLaneCtx *lane)
            {
              if (arg0_tok->valid)
              {
-                String decl_type = (
-                  kind == M_EntryKind_VertexShader  ? Lit("DeclVertexShader") :
-                  kind == M_EntryKind_PixelShader   ? Lit("DeclPixelShader") :
-                  kind == M_EntryKind_ComputeShader ? Lit("DeclComputeShader") :
-                  Lit("")
-                );
                String shader_name = arg0_tok->s;
-                Vec3U32 thread_count = Zi;
+                Vec3U32 thread_dims = Zi;
+                i32 thread_dims_count = 1;
                {
                  StringList thread_count_args = Zi;
                  for (i32 arg_idx = 1; arg_idx < countof(entry->arg_tokens); ++arg_idx)
@ -739,36 +735,70 @@ void M_BuildEntryPoint(WaveLaneCtx *lane)
                  }
                  String thread_count_str = StringFromList(perm, thread_count_args, Lit(" "));
                  Vec3 tmp = CR_Vec3FromString(thread_count_str);
-                  thread_count.x = MaxI32(tmp.x, 1);
-                  thread_count.y = MaxI32(tmp.y, 1);
-                  thread_count.z = MaxI32(tmp.z, 1);
+                  thread_dims.x = MaxI32(tmp.x, 1);
+                  thread_dims.y = MaxI32(tmp.y, 1);
+                  thread_dims.z = MaxI32(tmp.z, 1);
+                  // Determine compute shader dimensions by counting comma-separated values in dimensions string
+                  for (u64 char_idx = 0; char_idx < thread_count_str.len; ++char_idx)
+                  {
+                    u8 c = thread_count_str.text[char_idx];
+                    if (c == ',')
+                    {
+                      thread_dims_count += 1;
+                    }
+                  }
+                  thread_dims_count = ClampI32(thread_dims_count, 1, 3);
                }
+                String decl_type = (
+                  kind == M_EntryKind_VertexShader  ? Lit("DeclVertexShader") :
+                  kind == M_EntryKind_PixelShader   ? Lit("DeclPixelShader") :
+                  kind == M_EntryKind_ComputeShader ? Lit("DeclComputeShader") :
+                  Lit("")
+                );
                u64 shader_resource_hash = HashStringEx(shader_store_hash, StringF(perm, "%F.dxil", FmtString(shader_name)));
-                String lines = Zi;
+                // Dims type line
                if (kind == M_EntryKind_ComputeShader)
                {
-                  lines = StringF(
+                  String line = StringF(
                    perm,
-                    "%F(%F, 0x%F, %F, %F, %F);",
-                    FmtString(decl_type),
+                    "#define %F__ThreadDimsType %F",
                    FmtString(shader_name),
-                    FmtHex(shader_resource_hash),
-                    FmtUint(thread_count.x),
-                    FmtUint(thread_count.y),
-                    FmtUint(thread_count.z)
+                    FmtString(
+                      thread_dims_count == 1 ? Lit("u32") :
+                      thread_dims_count == 2 ? Lit("Vec2U32") :
+                      Lit("Vec3U32")
+                    )
                  );
+                  PushStringToList(perm, &shader_thread_dim_type_lines, line);
                }
-                else
+                // Shader line
                {
-                  lines = StringF(
-                    perm,
-                    "%F(%F, 0x%F);",
-                    FmtString(decl_type),
-                    FmtString(shader_name),
-                    FmtHex(shader_resource_hash)
-                  );
+                  String line = Zi;
+                  if (kind == M_EntryKind_ComputeShader)
+                  {
+                    line = StringF(
+                      perm,
+                      "%F(%F, 0x%F, %F, %F, %F);",
+                      FmtString(decl_type),
+                      FmtString(shader_name),
+                      FmtHex(shader_resource_hash),
+                      FmtUint(thread_dims.x),
+                      FmtUint(thread_dims.y),
+                      FmtUint(thread_dims.z)
+                    );
+                  }
+                  else
+                  {
+                    line = StringF(
+                      perm,
+                      "%F(%F, 0x%F);",
+                      FmtString(decl_type),
+                      FmtString(shader_name),
+                      FmtHex(shader_resource_hash)
+                    );
+                  }
+                  PushStringToList(perm, &shader_lines, line);
                }
-                PushStringToList(perm, &shader_lines, lines);
              }
              else
              {
@ -836,6 +866,16 @@ void M_BuildEntryPoint(WaveLaneCtx *lane)
            PushStringToList(perm, &c_out_lines, n->s);
          }
        }
+        // Define shader dimension types
+        if (shader_thread_dim_type_lines.count > 0)
+        {
+          PushStringToList(perm, &c_out_lines, Lit(""));
+          PushStringToList(perm, &c_out_lines, Lit("//- Shader thread dimension types"));
+          for (StringListNode *n = shader_thread_dim_type_lines.first; n; n = n->next)
+          {
+            PushStringToList(perm, &c_out_lines, n->s);
+          }
+        }
        // Define shaders
        if (shader_lines.count > 0)
        {
@ -975,6 +1015,16 @@ void M_BuildEntryPoint(WaveLaneCtx *lane)
            PushStringToList(perm, &gpu_out_lines, Lit("//- Base layer includes"));
            PushStringToList(perm, &gpu_out_lines, StringF(perm, "#include \"%F\"", FmtString(base_inc_path)));
          }
+          // Define shader dimension types
+          if (shader_thread_dim_type_lines.count > 0)
+          {
+            PushStringToList(perm, &gpu_out_lines, Lit(""));
+            PushStringToList(perm, &gpu_out_lines, Lit("//- Shader thread dimension types"));
+            for (StringListNode *n = shader_thread_dim_type_lines.first; n; n = n->next)
+            {
+              PushStringToList(perm, &gpu_out_lines, n->s);
+            }
+          }
          // Define shaders
          if (shader_lines.count > 0)
          {
--- a/src/pp/pp_vis/pp_vis_gpu.g
+++ b/src/pp/pp_vis/pp_vis_gpu.g
@ -373,10 +373,10 @@ ComputeShader(V_EmitParticlesCS)
    {
      u32 particle_idx = (emitter.first_particle_seq + emitter_particle_idx) % (u32)V_ParticlesCap;

-      // InterlockedMin guarantees that the highest emitter index (reflected
-      // as negative particle kind) will be used to initialize the particle
-      // this frame, in case multiple emitters target the same particle (e.g.
-      // more particles pushed this frame than are available in the buffer)
+      // Using InterlockedMin guarantees that the highest emitter index
+      // (reflected as negative particle kind) will be used to initialize the
+      // particle this frame, in case multiple emitters target the same particle
+      // (e.g. more particles were pushed this frame than are available in the buffer)
      InterlockedMin(particles[particle_idx].kind, semantic_particle_kind);
    }
  }
@ -393,267 +393,267 @@ ComputeShader(V_SimParticlesCS)
  Texture2D<u32> occluders = G_Deref(frame.occluders, Texture2D<u32>);

  u32 particle_idx = SV_DispatchThreadID;
-  if (particle_idx < V_ParticlesCap)
+  if (particle_idx < V_ParticlesCap && particles[particle_idx].kind != V_ParticleKind_None)
  {
    V_Particle particle = particles[particle_idx];
    b32 prune = 0;

+    u64 seed0 = MixU64(V_ParticleSimBasis ^ particle_idx);
+    f32 rand_offset = Norm16(seed0 >> 0);
+    f32 rand_angle = Norm16(seed0 >> 16);
+    f32 rand_speed = Norm16(seed0 >> 32);
+    f32 rand_falloff = Norm16(seed0 >> 48);
+
    //////////////////////////////
-    //- Initialize particle
+    //- Init particle

-    if (particle.kind != V_ParticleKind_None)
+    if (particle.kind < 0)
    {
-      u64 seed0 = MixU64(V_ParticleSimBasis ^ particle_idx);
-      f32 rand_offset = Norm16(seed0 >> 0);
-      f32 rand_angle = Norm16(seed0 >> 16);
-      f32 rand_speed = Norm16(seed0 >> 32);
-      f32 rand_falloff = Norm16(seed0 >> 48);
+      u32 emitter_idx = -particle.kind - 1;
+      V_Emitter emitter = G_Deref(frame.emitters, StructuredBuffer<V_Emitter>)[emitter_idx];

-      //////////////////////////////
-      //- Init
+      f32 initial_angle = lerp(emitter.angle.min, emitter.angle.max, rand_angle);
+      f32 initial_speed = lerp(emitter.speed.min, emitter.speed.max, rand_speed);

-      if (particle.kind < 0)
+      particle = (V_Particle)0;
+      particle.kind = emitter.kind;
+      particle.life = 0;
+      particle.pos = lerp(emitter.pos.p0, emitter.pos.p1, rand_offset);
+      particle.velocity = Vec2(cos(initial_angle), sin(initial_angle)) * initial_speed;
+    }
+
+    //////////////////////////////
+    //- Simulate
+
+    if (particle.kind > V_ParticleKind_None && particle.kind < V_ParticleKind_COUNT && !prune)
+    {
+      V_ParticleDesc desc = V_DescFromParticleKind((V_ParticleKind)particle.kind);
+      RWTexture2D<u32> cells = G_Deref(frame.particle_cells[desc.layer], RWTexture2D<u32>);
+      RWTexture2D<u32> densities = G_Deref(frame.particle_densities[desc.layer], RWTexture2D<u32>);
+
+      u32 packed = 0;
+      packed |= (particle_idx & ((1 << 24) - 1)) << 0; // bits 0..23: particle index
+      packed |= (particle.kind & 0x7F) << 24;          // bits 24..30: particle kind (bit 31 is OR-ed in separately below)
+      StaticAssert(V_ParticlesCap <= (1 << 24));  // particle idx must fit in 24 bits
+      StaticAssert(V_ParticleKind_COUNT <= 0x7F); // particle kind must fit in 7 bits
+
+      if (particle.life == 0)
      {
-        u32 emitter_idx = -particle.kind - 1;
-        V_Emitter emitter = G_Deref(frame.emitters, StructuredBuffer<V_Emitter>)[emitter_idx];
-
-        f32 initial_angle = lerp(emitter.angle.min, emitter.angle.max, rand_angle);
-        f32 initial_speed = lerp(emitter.speed.min, emitter.speed.max, rand_speed);
-
-        particle = (V_Particle)0;
-        particle.kind = emitter.kind;
-        particle.life = 0;
-        particle.pos = lerp(emitter.pos.p0, emitter.pos.p1, rand_offset);
-        particle.velocity = Vec2(cos(initial_angle), sin(initial_angle)) * initial_speed;
-      }
-
-      if (particle.kind > V_ParticleKind_None && particle.kind < V_ParticleKind_COUNT && !prune)
-      {
-        V_ParticleDesc desc = V_DescFromParticleKind((V_ParticleKind)particle.kind);
-        RWTexture2D<u32> cells = G_Deref(frame.particle_cells[desc.layer], RWTexture2D<u32>);
-        RWTexture2D<u32> densities = G_Deref(frame.particle_densities[desc.layer], RWTexture2D<u32>);
-
-        u32 packed = 0;
-        packed |= (particle_idx & ((1 >> 24) - 1)) << 0;
-        packed |= (particle.kind & 0xFF) << 24;
-        StaticAssert(V_ParticlesCap <= (1 << 24));  // particle idx must fit in 24 bits
-        StaticAssert(V_ParticleKind_COUNT <= 0x7F); // particle kind must fit in 7 bits
-
-        if (particle.life == 0)
+        Vec2 cell_pos = mul(frame.af.world_to_cell, Vec3(particle.pos, 1));
+        if (IsInside(cell_pos, P_WorldCellsDims))
        {
-          Vec2 cell_pos = mul(frame.af.world_to_cell, Vec3(particle.pos, 1));
-          if (IsInside(cell_pos, P_WorldCellsDims))
+          u32 occluder = occluders[cell_pos];
+          b32 occluder_is_wall = occluder == 0xFFFFFFFF;
+          if (!(AnyBit(desc.flags, V_ParticleFlag_OnlyCollideWithWalls) && !occluder_is_wall))
          {
-            u32 occluder = occluders[cell_pos];
-            b32 occluder_is_wall = occluder == 0xFFFFFFFF;
-            if (!(AnyBit(desc.flags, V_ParticleFlag_OnlyCollideWithWalls) && !occluder_is_wall))
-            {
-              particle.origin_occluder = occluders[cell_pos];
-              particle.prev_occluder = particle.origin_occluder;
-            }
-          }
-          else
-          {
-            prune = 1;
+            particle.origin_occluder = occluders[cell_pos];
+            particle.prev_occluder = particle.origin_occluder;
          }
        }
-
-        //////////////////////////////
-        //- Move
-
-        b32 collision = 0;
-
-        // TODO: Clip to avoid unnecessary iterations outside of world bounds
-        if (!prune)
+        else
        {
-          Vec2 p0 = particle.pos;
-          Vec2 p1 = particle.pos + particle.velocity * frame.dt;
-          f32 t = 1;
+          prune = 1;
+        }
+      }
+
+      //////////////////////////////
+      //- Move
+
+      b32 collision = 0;
+
+      // TODO: Clip to avoid unnecessary iterations outside of world bounds
+      if (!prune)
+      {
+        Vec2 p0 = particle.pos;
+        Vec2 p1 = particle.pos + particle.velocity * frame.dt;
+        f32 t = 1;
+        {
+          Vec2 occluder_p0 = mul(frame.af.world_to_cell, Vec3(p0, 1));
+          Vec2 occluder_p1 = mul(frame.af.world_to_cell, Vec3(p1, 1));
+          Vec2I32 cell_p0 = floor(occluder_p0);
+          Vec2I32 cell_p1 = floor(occluder_p1);
+
+          Vec2 delta = occluder_p1 - occluder_p0;
+          Vec2 inv_delta = 1.0 / delta;
+          Vec2 dda_step_dir = Vec2((delta.x > 0) - (delta.x < 0), (delta.y > 0) - (delta.y < 0));
+          Vec2 t_delta = abs(inv_delta);
+          Vec2 t_max = cell_p0 - occluder_p0;
+          t_max.x += dda_step_dir.x > 0;
+          t_max.y += dda_step_dir.y > 0;
+          t_max *= inv_delta;
+          t_max = abs(t_max);
+
+          Vec2 t_hit = 0;
+
+          Vec2I32 cell_pos = cell_p0;
+
+          b32 stepped_x = 0;
+          b32 stepped_y = 0;
+
+          // TODO: Tune this
+          u32 max_iterations = 128;
+
+          b32 done = 0;
+          f32 t_diff = 0;
+          u32 iteration_idx = 0;
+          for (; iteration_idx < max_iterations && !done; ++iteration_idx)
          {
-            Vec2 occluder_p0 = mul(frame.af.world_to_cell, Vec3(p0, 1));
-            Vec2 occluder_p1 = mul(frame.af.world_to_cell, Vec3(p1, 1));
-            Vec2I32 cell_p0 = floor(occluder_p0);
-            Vec2I32 cell_p1 = floor(occluder_p1);
-
-            Vec2 delta = occluder_p1 - occluder_p0;
-            Vec2 inv_delta = 1.0 / delta;
-            Vec2 dda_step_dir = Vec2((delta.x > 0) - (delta.x < 0), (delta.y > 0) - (delta.y < 0));
-            Vec2 t_delta = abs(inv_delta);
-            Vec2 t_max = cell_p0 - occluder_p0;
-            t_max.x += dda_step_dir.x > 0;
-            t_max.y += dda_step_dir.y > 0;
-            t_max *= inv_delta;
-            t_max = abs(t_max);
-
-            Vec2 t_hit = 0;
-
-            Vec2I32 cell_pos = cell_p0;
-
-            b32 stepped_x = 0;
-            b32 stepped_y = 0;
-
-            // TODO: Tune this
-            u32 max_iterations = 128;
-
-            b32 done = 0;
-            f32 t_diff = 0;
-            u32 iteration_idx = 0;
-            for (; iteration_idx < max_iterations && !done; ++iteration_idx)
+            if (cell_pos.x == cell_p1.x && cell_pos.y == cell_p1.y)
            {
-              if (cell_pos.x == cell_p1.x && cell_pos.y == cell_p1.y)
-              {
-                done = 1;
-              }
-              else if (t_max.x < t_max.y)
-              {
-                cell_pos.x += dda_step_dir.x;
-                f32 old = t_hit.x;
-                t_hit.x = t_max.x - t_delta.x;
-                t_diff = t_hit.x - old;
-                t_max.x += t_delta.x;
-                stepped_x = 1;
-                stepped_y = 0;
-              }
-              else
-              {
-                cell_pos.y += dda_step_dir.y;
-                f32 old = t_hit.y;
-                t_hit.y = t_max.y - t_delta.y;
-                t_diff = t_hit.y - old;
-                t_max.y += t_delta.y;
-                stepped_x = 0;
-                stepped_y = 1;
-              }
+              done = 1;
+            }
+            else if (t_max.x < t_max.y)
+            {
+              cell_pos.x += dda_step_dir.x;
+              f32 old = t_hit.x;
+              t_hit.x = t_max.x - t_delta.x;
+              t_diff = t_hit.x - old;
+              t_max.x += t_delta.x;
+              stepped_x = 1;
+              stepped_y = 0;
+            }
+            else
+            {
+              cell_pos.y += dda_step_dir.y;
+              f32 old = t_hit.y;
+              t_hit.y = t_max.y - t_delta.y;
+              t_diff = t_hit.y - old;
+              t_max.y += t_delta.y;
+              stepped_x = 0;
+              stepped_y = 1;
+            }

-              Vec2 cell_screen_pos_p0 = mul(frame.af.world_to_screen, Vec3(mul(frame.af.cell_to_world, Vec3(floor(cell_pos), 1)), 1));
-              Vec2 cell_screen_pos_p1 = mul(frame.af.world_to_screen, Vec3(mul(frame.af.cell_to_world, Vec3(ceil(cell_pos), 1)), 1));
-              cell_screen_pos_p1 = max(cell_screen_pos_p1, cell_screen_pos_p0 + 1);
+            Vec2 cell_screen_pos_p0 = mul(frame.af.world_to_screen, Vec3(mul(frame.af.cell_to_world, Vec3(floor(cell_pos), 1)), 1));
+            Vec2 cell_screen_pos_p1 = mul(frame.af.world_to_screen, Vec3(mul(frame.af.cell_to_world, Vec3(ceil(cell_pos), 1)), 1));
+            cell_screen_pos_p1 = max(cell_screen_pos_p1, cell_screen_pos_p0 + 1);

-              b32 is_in_world = IsInside(cell_pos, P_WorldCellsDims);
-              b32 is_visible = all(cell_screen_pos_p1 >= 0) && all(cell_screen_pos_p0 < frame.screen_dims);
+            b32 is_in_world = IsInside(cell_pos, P_WorldCellsDims);
+            b32 is_visible = all(cell_screen_pos_p1 >= 0) && all(cell_screen_pos_p0 < frame.screen_dims);

-              if (is_in_world)
+            if (is_in_world)
+            {
+              f32 stain_delta = abs(t_diff) * desc.stain_rate * frame.dt;
+              particle.stain_accum += stain_delta;
+
+              //- Handle collision
              {
-                f32 stain_delta = abs(t_diff) * desc.stain_rate * frame.dt;
-                particle.stain_accum += stain_delta;
-
-                //- Handle collision
+                u32 occluder = occluders[cell_pos];
+                b32 occluder_is_wall = occluder == 0xFFFFFFFF;
+                if (occluder != particle.origin_occluder)
                {
-                  u32 occluder = occluders[cell_pos];
-                  b32 occluder_is_wall = occluder == 0xFFFFFFFF;
-                  if (occluder != particle.origin_occluder)
+                  particle.origin_occluder = 0;
+                }
+                if (
+                  occluder != 0 &&
+                  !(AnyBit(desc.flags, V_ParticleFlag_OnlyCollideWithWalls) && !occluder_is_wall) &&
+                  occluder != particle.origin_occluder
+                )
+                {
+                  u64 collision_seed = MixU64(V_ParticleCellBasis ^ seed0 ^ particle.cells_count);
+                  f32 rand_collision_angle = Norm16(collision_seed >> 0);
+                  f32 rand_collision_velocity = Norm16(collision_seed >> 16);
+                  f32 rand_collision_penetration = Norm16(collision_seed >> 32);
+                  if (rand_collision_penetration >= desc.pen_rate)
                  {
-                    particle.origin_occluder = 0;
-                  }
-                  if (
-                    occluder != 0 &&
-                    !(AnyBit(desc.flags, V_ParticleFlag_OnlyCollideWithWalls) && !occluder_is_wall) &&
-                    occluder != particle.origin_occluder
-                  )
-                  {
-                    u64 collision_seed = MixU64(V_ParticleCellBasis ^ seed0 ^ particle.cells_count);
-                    f32 rand_collision_angle = Norm16(collision_seed >> 0);
-                    f32 rand_collision_velocity = Norm16(collision_seed >> 16);
-                    f32 rand_collision_penetration = Norm16(collision_seed >> 32);
-                    if (rand_collision_penetration >= desc.pen_rate)
+                    collision = 1;
+                    done = 1;
                    {
-                      collision = 1;
-                      done = 1;
+                      if (stepped_x)
                      {
-                        if (stepped_x)
+                        if (!AnyBit(desc.flags, V_ParticleFlag_NoReflect))
                        {
-                          if (!AnyBit(desc.flags, V_ParticleFlag_NoReflect))
-                          {
-                            particle.velocity.x *= -1;
-                          }
-                          t = saturate(t_hit.x);
+                          particle.velocity.x *= -1;
                        }
-                        else if (stepped_y)
+                        t = saturate(t_hit.x);
+                      }
+                      else if (stepped_y)
+                      {
+                        if (!AnyBit(desc.flags, V_ParticleFlag_NoReflect))
                        {
-                          if (!AnyBit(desc.flags, V_ParticleFlag_NoReflect))
-                          {
-                            particle.velocity.y *= -1;
-                          }
-                          t = saturate(t_hit.y);
+                          particle.velocity.y *= -1;
                        }
-                        {
-                          f32 collision_angle = lerp(-0.05 * Tau, 0.05 * Tau, rand_collision_angle);
-                          // f32 collision_angle = 0;
+                        t = saturate(t_hit.y);
+                      }
+                      {
+                        f32 collision_angle = lerp(-0.05 * Tau, 0.05 * Tau, rand_collision_angle);
+                        // f32 collision_angle = 0;

-                          // f32 collision_velocity_falloff = lerp(50, 100, rand_collision_velocity);
-                          // f32 collision_velocity_falloff = lerp(5000, 10000, rand_collision_velocity);
-                          // f32 collision_velocity_falloff = lerp(500, 10000, rand_collision_velocity);
-                          f32 collision_velocity_falloff = lerp(50, 100, rand_collision_velocity);
-                          // f32 collision_velocity_falloff = 0;
+                        // f32 collision_velocity_falloff = lerp(50, 100, rand_collision_velocity);
+                        // f32 collision_velocity_falloff = lerp(5000, 10000, rand_collision_velocity);
+                        // f32 collision_velocity_falloff = lerp(500, 10000, rand_collision_velocity);
+                        f32 collision_velocity_falloff = lerp(50, 100, rand_collision_velocity);
+                        // f32 collision_velocity_falloff = 0;

-                          particle.velocity = RotateVec2Angle(particle.velocity, collision_angle);
-                          particle.velocity *= 1.0f - saturate(collision_velocity_falloff * frame.dt);
-                        }
+                        particle.velocity = RotateVec2Angle(particle.velocity, collision_angle);
+                        particle.velocity *= 1.0f - saturate(collision_velocity_falloff * frame.dt);
                      }
                    }
                  }
-                  particle.prev_occluder = occluder;
-                }
-
-                if (dot(particle.velocity, particle.velocity) < (desc.prune_speed_threshold * desc.prune_speed_threshold))
-                {
-                  prune = 1;
-                }
-
-                if (prune)
-                {
-                  done = 1;
-                  if (AnyBit(desc.flags, V_ParticleFlag_StainWhenPruned))
-                  {
-                    // particle.stain_accum = max(particle.stain_accum, 1);
-                    particle.stain_accum += 1;
-                    packed |= 1 << 31;
-                  }
-                }
-
-                if (!collision && particle.origin_occluder != 0xFFFFFFFF)
-                {
-                  u32 stain_count = floor(particle.stain_accum);
-                  u32 density = 1 + stain_count;
-
-                  u32 commit = packed;
-                  if (stain_count > 0)
-                  {
-                    commit |= (1 << 31);
-                  }
-
-                  InterlockedMax(cells[cell_pos], commit);
-                  InterlockedAdd(densities[cell_pos], density);
-                  particle.stain_accum -= stain_count;
                }
+                particle.prev_occluder = occluder;
              }
-              else
+
+              if (dot(particle.velocity, particle.velocity) < (desc.prune_speed_threshold * desc.prune_speed_threshold))
              {
-                done = 1;
                prune = 1;
              }

-              particle.cells_count += 1;
+              if (prune)
+              {
+                done = 1;
+                if (AnyBit(desc.flags, V_ParticleFlag_StainWhenPruned))
+                {
+                  // particle.stain_accum = max(particle.stain_accum, 1);
+                  particle.stain_accum += 1;
+                  packed |= 1 << 31;
+                }
+              }
+
+              if (!collision && particle.origin_occluder != 0xFFFFFFFF)
+              {
+                u32 stain_count = floor(particle.stain_accum);
+                u32 density = 1 + stain_count;
+
+                u32 commit = packed;
+                if (stain_count > 0)
+                {
+                  commit |= (1 << 31);
+                }
+
+                InterlockedMax(cells[cell_pos], commit);
+                InterlockedAdd(densities[cell_pos], density);
+                particle.stain_accum -= stain_count;
+              }
            }
+            else
+            {
+              done = 1;
+              prune = 1;
+            }
+
+            particle.cells_count += 1;
          }
-
-          f32 falloff = saturate(lerp(10, 20, rand_falloff) * frame.dt);
-          // f32 falloff = saturate(lerp(1, 2, rand_falloff) * frame.dt);
-          particle.velocity *= 1.0f - falloff;
-
-          particle.pos = p0 + (p1 - p0) * t;
        }

-        particle.life += frame.dt;
+        f32 falloff = saturate(lerp(10, 20, rand_falloff) * frame.dt);
+        // f32 falloff = saturate(lerp(1, 2, rand_falloff) * frame.dt);
+        particle.velocity *= 1.0f - falloff;
+
+        particle.pos = p0 + (p1 - p0) * t;
      }

-      if (prune)
-      {
-        particle.kind = V_ParticleKind_None;
-      }
-
-      particles[particle_idx] = particle;
+      particle.life += frame.dt;
    }
+
+    //////////////////////////////
+    //- Commit
+
+    if (prune)
+    {
+      particle.kind = V_ParticleKind_None;
+    }
+
+    particles[particle_idx] = particle;
  }
 }