From 14091dcdca649b75824b951d8481dce1d5b941ce Mon Sep 17 00:00:00 2001
From: catbot <catbot@bot.local>
Date: Sun, 31 May 2026 15:48:29 +0000
Subject: [PATCH] WebGPU RT: enable TLAS spatial sort via bitonic network
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the disabled LSD radix sort in lbvhBuildMain with a data-oblivious
workgroup bitonic sorting network and enable it. The radix scatter was gated
behind `if (false)` because it produced count/distribution-dependent
corruption (TODO-lbvh-sort.md) — most likely a memory-ordering bug in the
Hillis-Steele scan / parallel scatter, though the exact race was never pinned
down. It surfaced only for certain Morton distributions (a small object
beside a tight cluster), making geometry flicker.

A bitonic network's compare-exchange schedule depends only on N_PADDED, never
on key values, so it sidesteps that entire class of distribution-dependent
races (TODO strategy #5). 105 sub-stages over 2^14 keys, single workgroup of
1024 threads, 8 compare-exchanges/thread/sub-stage, operating in-place on
sortA with a storageBarrier between sub-stages. Sentinel keys (0xFFFFFFFF)
compare largest and settle at the tail, exactly where Phase 4 expects them.
Restores Morton (Z-order) spatial coherence to TLAS BVH leaves, which the
many-instance case needs. Removes the now-dead radix histogram/scan workgroup
memory and constants.

Verified on the Firefox/Dawn WebGPU stack: a GPU unit test diffs the kernel
output against a CPU oracle across all three required distributions
(all-uniform, all-one-bucket, small-object-next-to-cluster) plus random,
reverse, and empty inputs — all match bit-for-bit with a valid index
permutation. Sponza renders correctly with the sort live.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 TODO-lbvh-sort.md        |  15 ++++
 additional/dom-webgpu.js | 183 +++++++++++----------------------------
 2 files changed, 66 insertions(+), 132 deletions(-)
diff --git a/TODO-lbvh-sort.md b/TODO-lbvh-sort.md
index f86373e..7204b68 100644
--- a/TODO-lbvh-sort.md
+++ b/TODO-lbvh-sort.md
@@ -1,5 +1,20 @@
 # LBVH parallel radix sort: count-dependent corruption
 
+> **RESOLVED (strategy #5 — bitonic sort).** The LSD radix scatter was
+> replaced with a data-oblivious workgroup **bitonic sorting network** in
+> `lbvhBuildMain` (`additional/dom-webgpu.js`, Phase 2). Because a bitonic
+> network's compare-exchange schedule depends only on N_PADDED — never on
+> the key distribution — it cannot exhibit the count-dependent corruption
+> documented below. The sort is now enabled (the old `if (false)` guard is
+> gone) so TLAS leaves are Morton (Z-order) coherent again.
+>
+> Verified on the Firefox/Dawn WebGPU stack with a GPU unit test that diffs
+> the kernel output against a CPU oracle across all three required
+> distributions (all-uniform, all-one-bucket, and the "small object next to
+> a tight cluster" repro) plus random/reverse/empty edge cases — all match
+> bit-for-bit, with a valid index permutation. Sponza renders correctly with
+> the sort live. The historical analysis below is retained for context.
+
 ## Summary
 
 The parallel radix sort in `lbvhBuildMain` (additional/dom-webgpu.js) produces
diff --git a/additional/dom-webgpu.js b/additional/dom-webgpu.js
index 4ebb12a..e035d97 100644
--- a/additional/dom-webgpu.js
+++ b/additional/dom-webgpu.js
@@ -1999,26 +1999,23 @@ fn tlasBuildMain(@builtin(global_invocation_id) gid: vec3<u32>) {
 //   1. Pack (morton16 << 16) | tlasIndex16 keys into sortA. Sentinel
 //      slots get 0xFFFFFFFF so they sort to the end; m16 is clamped to
 //      0xFFFE so no real key collides with the sentinel.
-//   2. LSD radix sort — 8 passes × 4 bits, ping-pong sortA ↔ sortB.
-//      STAGE 1: single-thread (thread 0) sequential scatter for stable
-//      ordering. This is the slow-but-trivially-correct baseline; Stage
-//      2 will parallelize the scatter using per-thread local histograms
-//      + a cross-thread scan.
+//   2. Bitonic sort of the packed keys ascending (in-place in sortA).
+//      Data-oblivious network — 105 compare-exchange sub-stages over
+//      2^14 keys — so it cannot exhibit the count-dependent corruption
+//      the old LSD radix scatter did (TODO-lbvh-sort.md, strategy #5).
 //   3. Write sorted instance permutation into outOrder.
 //   4. Initialize BVH leaf AABBs from sorted instances.
 //   5. Bottom-up sweep-tree refit, log2(N_PADDED) levels.
 //
 // Storage-barrier pattern: workgroupBarrier() fences workgroup memory
 // only per WGSL spec; storageBarrier() is required between R/W phases
-// on sortA/B and outBvh. Both are called at every storage boundary —
-// minor perf cost, eliminates the class of bug that hung the GPU on
-// the previous radix attempt.
+// on sortA and outBvh — including between every bitonic sub-stage, which
+// reads and writes sortA. Both are called at every storage boundary.
 //
-// Hard cap: LBVH_MAX = 16384. Parallel scatter (per-bucket Hillis-Steele
-// scan over 1024-lane indicators) made the build cost flat at ~0.5 ms
-// regardless of N_PADDED, and the degenerate-AABB fix in _rtAabb keeps
-// sentinel-only subtrees from being traversed. Per-ray cost scales with
-// log2(N_real), not log2(N_PADDED).
+// Hard cap: LBVH_MAX = 16384. The bitonic sort gives BVH leaves Morton
+// (Z-order) spatial coherence, and the degenerate-AABB fix in _rtAabb
+// keeps sentinel-only subtrees from being traversed. Per-ray cost scales
+// with log2(N_real), not log2(N_PADDED).
 const LBVH_MAX = 16384;
 const lbvhBuildWgsl = String.raw`
 struct TLASEntryStub {
@@ -2065,17 +2062,7 @@ const THREADS:       u32 = 1024u;
 const K_PER:         u32 = 16u;    // = N_PADDED / THREADS
 const REDUCE_LANES:  u32 = 256u;
 const REDUCE_K_PER:  u32 = 64u;    // = N_PADDED / REDUCE_LANES
-const BUCKETS:       u32 = 16u;
-const PASSES:        u32 = 8u;
 const LEVELS:        u32 = 14u;    // log2(N_PADDED)
-const SCAN_STEPS:    u32 = 10u;    // log2(THREADS)
-
-var<workgroup> shHist:    array<atomic<u32>, BUCKETS>;
-var<workgroup> shOffsets: array<u32, BUCKETS>;
-// Hillis-Steele scratch for per-bucket exclusive prefix sum over 1024
-// per-thread bucket counts. 4 KB. Reused across all 8 × 16 bucket scans
-// in the radix passes.
-var<workgroup> shScan:    array<u32, THREADS>;
 
 // Scene-AABB reduction scratch — 256-lane tree reduce. vec3 stride is
 // 16 by WGSL alignment → 4 KB each, 8 KB total. Well under the 16 KB
@@ -2162,122 +2149,54 @@ fn lbvhBuildMain(@builtin(local_invocation_id) lid: vec3<u32>) {
     workgroupBarrier();
     storageBarrier();
 
-    // ── Phase 2: stable LSD radix sort, fully parallel scatter ──────────
-    // Per pass:
-    //   1. Histogram (parallel atomicAdd to shHist).
-    //   2. Global bucket starts: exclusive prefix scan of shHist into
-    //      shOffsets[16] — small, done sequentially by thread 0.
-    //   3. For each of the 16 buckets:
-    //        a. Each thread counts its bucket-b source elements (K_PER
-    //           re-reads, no caching — storage reads are L1-cheap).
-    //        b. Hillis-Steele exclusive prefix scan of those counts
-    //           across all THREADS lanes (log2(THREADS)=10 levels).
-    //           Single-buffered with a read/barrier/write/barrier pattern
-    //           per step so reads and writes don't race.
-    //        c. Each thread re-walks its K_PER elements in source order
-    //           and writes bucket-b ones to dst = shOffsets[b]
-    //              + my_exclusive_prefix
-    //              + my_local_idx_so_far.
-    //      Stability holds because per-thread iteration is in source
-    //      order and the cross-thread offsets respect thread index order.
-    // WORKAROUND: the parallel radix sort below corrupts sortA in a
-    // way that's count-dependent — symptom was mid-game geometry
-    // flicker as soon as ANY extra TLAS instance was added beyond the
-    // initial scene (e.g. firing a projectile would make fort braces
-    // appear to disappear in patterns deterministic on the projectile's
-    // angle). Bisected by skipping each LBVH phase in turn: with the
-    // sort skipped, no flicker. With the sort enabled, flicker.
+    // ── Phase 2: bitonic sort of sortA[0..N_PADDED) ascending ────────────
+    // Replaces the previous LSD radix scatter, which produced
+    // count-dependent corruption (TODO-lbvh-sort.md): a memory-ordering
+    // bug in the Hillis-Steele scan / parallel scatter that surfaced only
+    // for certain Morton distributions (a small object beside a tight
+    // cluster), making fort geometry flicker. Despite careful review the
+    // exact race was never pinned down.
     //
-    // The exact bug in the Hillis-Steele scan + parallel scatter
-    // hasn't been identified despite careful review — likely a subtle
-    // memory-ordering / barrier issue that triggers only with the
-    // specific Morton-code distribution that arises when a small object
-    // (projectile) sits next to a large cluster (fort).
+    // A bitonic sorting network is DATA-OBLIVIOUS: the sequence of
+    // compare-exchanges depends only on N_PADDED, never on the key values.
+    // That eliminates the entire class of distribution-dependent races the
+    // radix sort tripped over (TODO strategy #5). N_PADDED is a power of
+    // two so the network is exact; sentinel keys (0xFFFFFFFF) compare
+    // largest and settle at the tail — exactly where Phase 4 expects them.
     //
-    // Skipping the sort means BVH leaves are in instance-index order
-    // (no spatial coherence). Ray traversal still descends the BVH
-    // tree, but parent AABBs are larger than they would be with sorted
-    // leaves, so more leaves get tested per ray. With the fort's
-    // ~1011-entry scale that's still fast enough; revisit if the
-    // entry count grows toward the LBVH_MAX cap.
-    if (false) {
-    for (var p: u32 = 0u; p < PASSES; p = p + 1u) {
-        let shift = p * 4u;
-        let srcIsA = (p & 1u) == 0u;
-
-        // Clear histogram.
-        if (tid < BUCKETS) {
-            atomicStore(&shHist[tid], 0u);
-        }
-        workgroupBarrier();
-
-        // Histogram pass — K_PER elements per thread.
-        for (var k: u32 = 0u; k < K_PER; k = k + 1u) {
-            let i = k * THREADS + tid;
-            var myKey: u32;
-            if (srcIsA) { myKey = sortA[i]; } else { myKey = sortB[i]; }
-            let bucket = (myKey >> shift) & 0xFu;
-            atomicAdd(&shHist[bucket], 1u);
-        }
-        workgroupBarrier();
-
-        // Global bucket starts (16-wide; thread 0 does it sequentially).
-        if (tid == 0u) {
-            var s2: u32 = 0u;
-            for (var b: u32 = 0u; b < BUCKETS; b = b + 1u) {
-                shOffsets[b] = s2;
-                s2 = s2 + atomicLoad(&shHist[b]);
-            }
-        }
-        workgroupBarrier();
-
-        // Per-bucket parallel scatter.
-        for (var b: u32 = 0u; b < BUCKETS; b = b + 1u) {
-            // (a) Count my bucket-b elements.
-            var localCount: u32 = 0u;
-            for (var k: u32 = 0u; k < K_PER; k = k + 1u) {
-                let i = k * THREADS + tid;
-                var srcKey: u32;
-                if (srcIsA) { srcKey = sortA[i]; } else { srcKey = sortB[i]; }
-                let bk = (srcKey >> shift) & 0xFu;
-                if (bk == b) { localCount = localCount + 1u; }
-            }
-            shScan[tid] = localCount;
-            workgroupBarrier();
-
-            // (b) Hillis-Steele inclusive prefix scan across 1024 lanes.
-            // Single-buffered: read snapshot → barrier → write → barrier.
-            for (var step: u32 = 0u; step < SCAN_STEPS; step = step + 1u) {
-                let offset = 1u << step;
-                let v = shScan[tid];
-                var prev: u32 = 0u;
-                if (tid >= offset) { prev = shScan[tid - offset]; }
-                workgroupBarrier();
-                shScan[tid] = v + prev;
-                workgroupBarrier();
-            }
-            // Convert inclusive→exclusive by subtracting own contribution.
-            let myExclusivePrefix = shScan[tid] - localCount;
-
-            // (c) Scatter my bucket-b elements at the computed positions.
-            var localIdx: u32 = 0u;
-            for (var k: u32 = 0u; k < K_PER; k = k + 1u) {
-                let i = k * THREADS + tid;
-                var srcKey: u32;
-                if (srcIsA) { srcKey = sortA[i]; } else { srcKey = sortB[i]; }
-                let bk = (srcKey >> shift) & 0xFu;
-                if (bk == b) {
-                    let dst = shOffsets[b] + myExclusivePrefix + localIdx;
-                    if (srcIsA) { sortB[dst] = srcKey; } else { sortA[dst] = srcKey; }
-                    localIdx = localIdx + 1u;
+    // Single workgroup, storage-resident: 16384 u32 = 64 KB exceeds the
+    // workgroup-storage cap, so the keys stay in sortA. Each of the 1024
+    // threads owns PAIRS_PER_THREAD = (N_PADDED/2)/THREADS = 8 compare-
+    // exchanges per sub-stage. A bitonic network over 2^14 keys has
+    // 1 + 2 + ... + 14 = 14*15/2 = 105 sub-stages; storageBarrier() fences
+    // sortA between each so one sub-stage's writes are visible to the next.
+    // sortB is unused by this path (its binding is kept; harmless).
+    const PAIRS:            u32 = N_PADDED / 2u;     // 8192 compare-exchanges / sub-stage
+    const PAIRS_PER_THREAD: u32 = PAIRS / THREADS;   // 8
+    for (var k: u32 = 2u; k <= N_PADDED; k = k << 1u) {
+        for (var j: u32 = k >> 1u; j > 0u; j = j >> 1u) {
+            for (var t: u32 = 0u; t < PAIRS_PER_THREAD; t = t + 1u) {
+                // Linear pair id p in [0, N_PADDED/2). Map it to the lower
+                // index lo of the compared pair by inserting a 0 bit at
+                // position log2(j): lo has that bit clear, hi = lo | j.
+                let p  = t * THREADS + tid;
+                let lo = ((p & ~(j - 1u)) << 1u) | (p & (j - 1u));
+                let hi = lo | j;
+                let a  = sortA[lo];
+                let b  = sortA[hi];
+                // Sort direction for this bitonic block. lo and hi differ
+                // only in bit log2(j) (< log2(k)), so both agree on (x & k).
+                let ascending = (lo & k) == 0u;
+                if ((a > b) == ascending) {
+                    sortA[lo] = b;
+                    sortA[hi] = a;
                 }
             }
+            storageBarrier();
             workgroupBarrier();
         }
-        storageBarrier();
     }
-    }
-    // After 8 ping-pongs (even count) the sorted keys live in sortA.
+    // Sorted keys (ascending; sentinels last) now live in sortA.
 
     // ── Phase 3: write sorted instance permutation into outOrder ─────────
     for (var k: u32 = 0u; k < K_PER; k = k + 1u) {