Merge pull request 'WebGPU RT: enable TLAS spatial sort via bitonic network (plan phase 3)' (#2) from claude/issue-1 into master

2026-05-31 17:49:38 +02:00 · 2026-05-31 17:49:38 +02:00 · e0d72f57f2
commit e0d72f57f2
parent 162d98cf5b 14091dcdca
2 changed files with 66 additions and 132 deletions
--- a/TODO-lbvh-sort.md
+++ b/TODO-lbvh-sort.md
@@ -1,5 +1,20 @@
 # LBVH parallel radix sort: count-dependent corruption
 > **RESOLVED (strategy #5 — bitonic sort).** The LSD radix scatter was
 > replaced with a data-oblivious workgroup **bitonic sorting network** in
 > `lbvhBuildMain` (`additional/dom-webgpu.js`, Phase 2). Because a bitonic
 > network's compare-exchange schedule depends only on N_PADDED — never on
 > the key distribution — it cannot exhibit the count-dependent corruption
 > documented below. The sort is now enabled (the old `if (false)` guard is
 > gone) so TLAS leaves are Morton (Z-order) coherent again.
 >
 > Verified on the Firefox/Dawn WebGPU stack with a GPU unit test that diffs
 > the kernel output against a CPU oracle across all three required
 > distributions (all-uniform, all-one-bucket, and the "small object next to
 > a tight cluster" repro) plus random/reverse/empty edge cases — all match
 > bit-for-bit, with a valid index permutation. Sponza renders correctly with
 > the sort live. The historical analysis below is retained for context.
 ## Summary
 The parallel radix sort in `lbvhBuildMain` (additional/dom-webgpu.js) produces
--- a/additional/dom-webgpu.js
+++ b/additional/dom-webgpu.js
@@ -1999,26 +1999,23 @@ fn tlasBuildMain(@builtin(global_invocation_id) gid: vec3<u32>) {
 //   1. Pack (morton16 << 16) | tlasIndex16 keys into sortA. Sentinel
 //      slots get 0xFFFFFFFF so they sort to the end; m16 is clamped to
 //      0xFFFE so no real key collides with the sentinel.
-//   2. LSD radix sort — 8 passes × 4 bits, ping-pong sortA ↔ sortB.
+//   2. Bitonic sort of the packed keys ascending (in-place in sortA).
-//      STAGE 1: single-thread (thread 0) sequential scatter for stable
+//      Data-oblivious network — 105 compare-exchange sub-stages over
-//      ordering. This is the slow-but-trivially-correct baseline; Stage
+//      2^14 keys — so it cannot exhibit the count-dependent corruption
-//      2 will parallelize the scatter using per-thread local histograms
+//      the old LSD radix scatter did (TODO-lbvh-sort.md, strategy #5).
 //      + a cross-thread scan.
 //   3. Write sorted instance permutation into outOrder.
 //   4. Initialize BVH leaf AABBs from sorted instances.
 //   5. Bottom-up sweep-tree refit, log2(N_PADDED) levels.
 //
 // Storage-barrier pattern: workgroupBarrier() fences workgroup memory
 // only per WGSL spec; storageBarrier() is required between R/W phases
-// on sortA/B and outBvh. Both are called at every storage boundary —
+// on sortA and outBvh — including between every bitonic sub-stage, which
-// minor perf cost, eliminates the class of bug that hung the GPU on
+// reads and writes sortA. Both are called at every storage boundary.
 // the previous radix attempt.
 //
-// Hard cap: LBVH_MAX = 16384. Parallel scatter (per-bucket Hillis-Steele
+// Hard cap: LBVH_MAX = 16384. The bitonic sort gives BVH leaves Morton
-// scan over 1024-lane indicators) made the build cost flat at ~0.5 ms
+// (Z-order) spatial coherence, and the degenerate-AABB fix in _rtAabb
-// regardless of N_PADDED, and the degenerate-AABB fix in _rtAabb keeps
+// keeps sentinel-only subtrees from being traversed. Per-ray cost scales
-// sentinel-only subtrees from being traversed. Per-ray cost scales with
+// with log2(N_real), not log2(N_PADDED).
 // log2(N_real), not log2(N_PADDED).
 const LBVH_MAX = 16384;
 const lbvhBuildWgsl = String.raw`
 struct TLASEntryStub {
@@ -2065,17 +2062,7 @@ const THREADS:       u32 = 1024u;
 const K_PER:         u32 = 16u;    // = N_PADDED / THREADS
 const REDUCE_LANES:  u32 = 256u;
 const REDUCE_K_PER:  u32 = 64u;    // = N_PADDED / REDUCE_LANES
 const BUCKETS:       u32 = 16u;
 const PASSES:        u32 = 8u;
 const LEVELS:        u32 = 14u;    // log2(N_PADDED)
 const SCAN_STEPS:    u32 = 10u;    // log2(THREADS)
 var<workgroup> shHist:    array<atomic<u32>, BUCKETS>;
 var<workgroup> shOffsets: array<u32, BUCKETS>;
 // Hillis-Steele scratch for per-bucket exclusive prefix sum over 1024
 // per-thread bucket counts. 4 KB. Reused across all 8 × 16 bucket scans
 // in the radix passes.
 var<workgroup> shScan:    array<u32, THREADS>;
 // Scene-AABB reduction scratch — 256-lane tree reduce. vec3 stride is
 // 16 by WGSL alignment → 4 KB each, 8 KB total. Well under the 16 KB
@@ -2162,122 +2149,54 @@ fn lbvhBuildMain(@builtin(local_invocation_id) lid: vec3<u32>) {
    workgroupBarrier();
    storageBarrier();
-    // ── Phase 2: stable LSD radix sort, fully parallel scatter ──────────
+    // ── Phase 2: bitonic sort of sortA[0..N_PADDED) ascending ────────────
-    // Per pass:
+    // Replaces the previous LSD radix scatter, which produced
-    //   1. Histogram (parallel atomicAdd to shHist).
+    // count-dependent corruption (TODO-lbvh-sort.md): a memory-ordering
-    //   2. Global bucket starts: exclusive prefix scan of shHist into
+    // bug in the Hillis-Steele scan / parallel scatter that surfaced only
-    //      shOffsets[16] — small, done sequentially by thread 0.
+    // for certain Morton distributions (a small object beside a tight
-    //   3. For each of the 16 buckets:
+    // cluster), making fort geometry flicker. Despite careful review the
-    //        a. Each thread counts its bucket-b source elements (K_PER
+    // exact race was never pinned down.
    //           re-reads, no caching — storage reads are L1-cheap).
    //        b. Hillis-Steele exclusive prefix scan of those counts
    //           across all THREADS lanes (log2(THREADS)=10 levels).
    //           Single-buffered with a read/barrier/write/barrier pattern
    //           per step so reads and writes don't race.
    //        c. Each thread re-walks its K_PER elements in source order
    //           and writes bucket-b ones to dst = shOffsets[b]
    //              + my_exclusive_prefix
    //              + my_local_idx_so_far.
    //      Stability holds because per-thread iteration is in source
    //      order and the cross-thread offsets respect thread index order.
    // WORKAROUND: the parallel radix sort below corrupts sortA in a
    // way that's count-dependent — symptom was mid-game geometry
    // flicker as soon as ANY extra TLAS instance was added beyond the
    // initial scene (e.g. firing a projectile would make fort braces
    // appear to disappear in patterns deterministic on the projectile's
    // angle). Bisected by skipping each LBVH phase in turn: with the
    // sort skipped, no flicker. With the sort enabled, flicker.
    //
-    // The exact bug in the Hillis-Steele scan + parallel scatter
+    // A bitonic sorting network is DATA-OBLIVIOUS: the sequence of
-    // hasn't been identified despite careful review — likely a subtle
+    // compare-exchanges depends only on N_PADDED, never on the key values.
-    // memory-ordering / barrier issue that triggers only with the
+    // That eliminates the entire class of distribution-dependent races the
-    // specific Morton-code distribution that arises when a small object
+    // radix sort tripped over (TODO strategy #5). N_PADDED is a power of
-    // (projectile) sits next to a large cluster (fort).
+    // two so the network is exact; sentinel keys (0xFFFFFFFF) compare
    // largest and settle at the tail — exactly where Phase 4 expects them.
    //
-    // Skipping the sort means BVH leaves are in instance-index order
+    // Single workgroup, storage-resident: 16384 u32 = 64 KB exceeds the
-    // (no spatial coherence). Ray traversal still descends the BVH
+    // workgroup-storage cap, so the keys stay in sortA. Each of the 1024
-    // tree, but parent AABBs are larger than they would be with sorted
+    // threads owns PAIRS_PER_THREAD = (N_PADDED/2)/THREADS = 8 compare-
-    // leaves, so more leaves get tested per ray. With the fort's
+    // exchanges per sub-stage. A bitonic network over 2^14 keys has
-    // ~1011-entry scale that's still fast enough; revisit if the
+    // sum(p for p in 1..=14) = 105 sub-stages; storageBarrier() fences
-    // entry count grows toward the LBVH_MAX cap.
+    // sortA between each so one sub-stage's writes are visible to the next.
-    if (false) {
+    // sortB is unused by this path (left bound; harmless).
-    for (var p: u32 = 0u; p < PASSES; p = p + 1u) {
+    const PAIRS:            u32 = N_PADDED / 2u;     // 8192 compare-exchanges / sub-stage
-        let shift = p * 4u;
+    const PAIRS_PER_THREAD: u32 = PAIRS / THREADS;   // 8
-        let srcIsA = (p & 1u) == 0u;
+    for (var k: u32 = 2u; k <= N_PADDED; k = k << 1u) {
-
+        for (var j: u32 = k >> 1u; j > 0u; j = j >> 1u) {
-        // Clear histogram.
+            for (var t: u32 = 0u; t < PAIRS_PER_THREAD; t = t + 1u) {
-        if (tid < BUCKETS) {
+                // Linear pair id p in [0, N_PADDED/2). Map it to the lower
-            atomicStore(&shHist[tid], 0u);
+                // index lo of the compared pair by inserting a 0 bit at
                // position log2(j): lo has that bit clear, hi = lo | j.
                let p  = t * THREADS + tid;
                let lo = ((p & ~(j - 1u)) << 1u) | (p & (j - 1u));
                let hi = lo | j;
                let a  = sortA[lo];
                let b  = sortA[hi];
                // Sort direction for this bitonic block. lo and hi differ
                // only in bit log2(j) (< log2(k)), so both agree on (x & k).
                let ascending = (lo & k) == 0u;
                if ((a > b) == ascending) {
                    sortA[lo] = b;
                    sortA[hi] = a;
                }
        workgroupBarrier();
        // Histogram pass — K_PER elements per thread.
        for (var k: u32 = 0u; k < K_PER; k = k + 1u) {
            let i = k * THREADS + tid;
            var myKey: u32;
            if (srcIsA) { myKey = sortA[i]; } else { myKey = sortB[i]; }
            let bucket = (myKey >> shift) & 0xFu;
            atomicAdd(&shHist[bucket], 1u);
        }
        workgroupBarrier();
        // Global bucket starts (16-wide; thread 0 does it sequentially).
        if (tid == 0u) {
            var s2: u32 = 0u;
            for (var b: u32 = 0u; b < BUCKETS; b = b + 1u) {
                shOffsets[b] = s2;
                s2 = s2 + atomicLoad(&shHist[b]);
            }
        }
        workgroupBarrier();
        // Per-bucket parallel scatter.
        for (var b: u32 = 0u; b < BUCKETS; b = b + 1u) {
            // (a) Count my bucket-b elements.
            var localCount: u32 = 0u;
            for (var k: u32 = 0u; k < K_PER; k = k + 1u) {
                let i = k * THREADS + tid;
                var srcKey: u32;
                if (srcIsA) { srcKey = sortA[i]; } else { srcKey = sortB[i]; }
                let bk = (srcKey >> shift) & 0xFu;
                if (bk == b) { localCount = localCount + 1u; }
            }
            shScan[tid] = localCount;
            workgroupBarrier();
            // (b) Hillis-Steele inclusive prefix scan across 1024 lanes.
            // Single-buffered: read snapshot → barrier → write → barrier.
            for (var step: u32 = 0u; step < SCAN_STEPS; step = step + 1u) {
                let offset = 1u << step;
                let v = shScan[tid];
                var prev: u32 = 0u;
                if (tid >= offset) { prev = shScan[tid - offset]; }
                workgroupBarrier();
                shScan[tid] = v + prev;
                workgroupBarrier();
            }
            // Convert inclusive→exclusive by subtracting own contribution.
            let myExclusivePrefix = shScan[tid] - localCount;
            // (c) Scatter my bucket-b elements at the computed positions.
            var localIdx: u32 = 0u;
            for (var k: u32 = 0u; k < K_PER; k = k + 1u) {
                let i = k * THREADS + tid;
                var srcKey: u32;
                if (srcIsA) { srcKey = sortA[i]; } else { srcKey = sortB[i]; }
                let bk = (srcKey >> shift) & 0xFu;
                if (bk == b) {
                    let dst = shOffsets[b] + myExclusivePrefix + localIdx;
                    if (srcIsA) { sortB[dst] = srcKey; } else { sortA[dst] = srcKey; }
                    localIdx = localIdx + 1u;
                }
            }
            workgroupBarrier();
            }
            storageBarrier();
            workgroupBarrier();
        }
    }
-    // After 8 ping-pongs (even count) the sorted keys live in sortA.
+    // Sorted keys (ascending; sentinels last) now live in sortA.
    // ── Phase 3: write sorted instance permutation into outOrder ─────────
    for (var k: u32 = 0u; k < K_PER; k = k + 1u) {