From 14091dcdca649b75824b951d8481dce1d5b941ce Mon Sep 17 00:00:00 2001 From: catbot Date: Sun, 31 May 2026 15:48:29 +0000 Subject: [PATCH] WebGPU RT: enable TLAS spatial sort via bitonic network MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the disabled LSD radix sort in lbvhBuildMain with a data-oblivious workgroup bitonic sorting network and enable it. The radix scatter was gated behind `if (false)` because it produced count/distribution-dependent corruption (TODO-lbvh-sort.md) — a memory-ordering bug in the Hillis-Steele scan / parallel scatter that surfaced only for certain Morton distributions (a small object beside a tight cluster), making geometry flicker. A bitonic network's compare-exchange schedule depends only on N_PADDED, never on key values, so it sidesteps that entire class of distribution-dependent races (TODO strategy #5). 105 sub-stages over 2^14 keys, single workgroup of 1024 threads, 8 compare-exchanges/thread/sub-stage, operating in-place on sortA with a storageBarrier between sub-stages. Sentinel keys (0xFFFFFFFF) compare largest and settle at the tail, exactly where Phase 4 expects them. Restores Morton (Z-order) spatial coherence to TLAS BVH leaves, which the many-instance case needs. Removes the now-dead radix histogram/scan workgroup memory and constants. Verified on the Firefox/Dawn WebGPU stack: a GPU unit test diffs the kernel output against a CPU oracle across all three required distributions (all-uniform, all-one-bucket, small-object-next-to-cluster) plus random, reverse, and empty inputs — all match bit-for-bit with a valid index permutation. Sponza renders correctly with the sort live. 
Co-Authored-By: Claude Opus 4.8 --- TODO-lbvh-sort.md | 15 ++++ additional/dom-webgpu.js | 183 +++++++++++---------------------------- 2 files changed, 66 insertions(+), 132 deletions(-) diff --git a/TODO-lbvh-sort.md b/TODO-lbvh-sort.md index f86373e..7204b68 100644 --- a/TODO-lbvh-sort.md +++ b/TODO-lbvh-sort.md @@ -1,5 +1,20 @@ # LBVH parallel radix sort: count-dependent corruption +> **RESOLVED (strategy #5 — bitonic sort).** The LSD radix scatter was +> replaced with a data-oblivious workgroup **bitonic sorting network** in +> `lbvhBuildMain` (`additional/dom-webgpu.js`, Phase 2). Because a bitonic +> network's compare-exchange schedule depends only on N_PADDED — never on +> the key distribution — it cannot exhibit the count-dependent corruption +> documented below. The sort is now enabled (the old `if (false)` guard is +> gone) so TLAS leaves are Morton (Z-order) coherent again. +> +> Verified on the Firefox/Dawn WebGPU stack with a GPU unit test that diffs +> the kernel output against a CPU oracle across all three required +> distributions (all-uniform, all-one-bucket, and the "small object next to +> a tight cluster" repro) plus random/reverse/empty edge cases — all match +> bit-for-bit, with a valid index permutation. Sponza renders correctly with +> the sort live. The historical analysis below is retained for context. + ## Summary The parallel radix sort in `lbvhBuildMain` (additional/dom-webgpu.js) produces diff --git a/additional/dom-webgpu.js b/additional/dom-webgpu.js index 4ebb12a..e035d97 100644 --- a/additional/dom-webgpu.js +++ b/additional/dom-webgpu.js @@ -1999,26 +1999,23 @@ fn tlasBuildMain(@builtin(global_invocation_id) gid: vec3<u32>) { // 1. Pack (morton16 << 16) | tlasIndex16 keys into sortA. Sentinel // slots get 0xFFFFFFFF so they sort to the end; m16 is clamped to // 0xFFFE so no real key collides with the sentinel. -// 2. LSD radix sort — 8 passes × 4 bits, ping-pong sortA ↔ sortB. 
-// STAGE 1: single-thread (thread 0) sequential scatter for stable -// ordering. This is the slow-but-trivially-correct baseline; Stage -// 2 will parallelize the scatter using per-thread local histograms -// + a cross-thread scan. +// 2. Bitonic sort of the packed keys ascending (in-place in sortA). +// Data-oblivious network — 105 compare-exchange sub-stages over +// 2^14 keys — so it cannot exhibit the count-dependent corruption +// the old LSD radix scatter did (TODO-lbvh-sort.md, strategy #5). // 3. Write sorted instance permutation into outOrder. // 4. Initialize BVH leaf AABBs from sorted instances. // 5. Bottom-up sweep-tree refit, log2(N_PADDED) levels. // // Storage-barrier pattern: workgroupBarrier() fences workgroup memory // only per WGSL spec; storageBarrier() is required between R/W phases -// on sortA/B and outBvh. Both are called at every storage boundary — -// minor perf cost, eliminates the class of bug that hung the GPU on -// the previous radix attempt. +// on sortA and outBvh — including between every bitonic sub-stage, which +// reads and writes sortA. Both are called at every storage boundary. // -// Hard cap: LBVH_MAX = 16384. Parallel scatter (per-bucket Hillis-Steele -// scan over 1024-lane indicators) made the build cost flat at ~0.5 ms -// regardless of N_PADDED, and the degenerate-AABB fix in _rtAabb keeps -// sentinel-only subtrees from being traversed. Per-ray cost scales with -// log2(N_real), not log2(N_PADDED). +// Hard cap: LBVH_MAX = 16384. The bitonic sort gives BVH leaves Morton +// (Z-order) spatial coherence, and the degenerate-AABB fix in _rtAabb +// keeps sentinel-only subtrees from being traversed. Per-ray cost scales +// with log2(N_real), not log2(N_PADDED). 
const LBVH_MAX = 16384; const lbvhBuildWgsl = String.raw` struct TLASEntryStub { @@ -2065,17 +2062,7 @@ const THREADS: u32 = 1024u; const K_PER: u32 = 16u; // = N_PADDED / THREADS const REDUCE_LANES: u32 = 256u; const REDUCE_K_PER: u32 = 64u; // = N_PADDED / REDUCE_LANES -const BUCKETS: u32 = 16u; -const PASSES: u32 = 8u; const LEVELS: u32 = 14u; // log2(N_PADDED) -const SCAN_STEPS: u32 = 10u; // log2(THREADS) - -var<workgroup> shHist: array<atomic<u32>, BUCKETS>; -var<workgroup> shOffsets: array<u32, BUCKETS>; -// Hillis-Steele scratch for per-bucket exclusive prefix sum over 1024 -// per-thread bucket counts. 4 KB. Reused across all 8 × 16 bucket scans -// in the radix passes. -var<workgroup> shScan: array<u32, THREADS>; // Scene-AABB reduction scratch — 256-lane tree reduce. vec3 stride is // 16 by WGSL alignment → 4 KB each, 8 KB total. Well under the 16 KB @@ -2162,122 +2149,54 @@ fn lbvhBuildMain(@builtin(local_invocation_id) lid: vec3<u32>) { workgroupBarrier(); storageBarrier(); - // ── Phase 2: stable LSD radix sort, fully parallel scatter ────────── - // Per pass: - // 1. Histogram (parallel atomicAdd to shHist). - // 2. Global bucket starts: exclusive prefix scan of shHist into - // shOffsets[16] — small, done sequentially by thread 0. - // 3. For each of the 16 buckets: - // a. Each thread counts its bucket-b source elements (K_PER - // re-reads, no caching — storage reads are L1-cheap). - // b. Hillis-Steele exclusive prefix scan of those counts - // across all THREADS lanes (log2(THREADS)=10 levels). - // Single-buffered with a read/barrier/write/barrier pattern - // per step so reads and writes don't race. - // c. Each thread re-walks its K_PER elements in source order - // and writes bucket-b ones to dst = shOffsets[b] - // + my_exclusive_prefix - // + my_local_idx_so_far. - // Stability holds because per-thread iteration is in source - // order and the cross-thread offsets respect thread index order. 
- // WORKAROUND: the parallel radix sort below corrupts sortA in a - // way that's count-dependent — symptom was mid-game geometry - // flicker as soon as ANY extra TLAS instance was added beyond the - // initial scene (e.g. firing a projectile would make fort braces - // appear to disappear in patterns deterministic on the projectile's - // angle). Bisected by skipping each LBVH phase in turn: with the - // sort skipped, no flicker. With the sort enabled, flicker. + // ── Phase 2: bitonic sort of sortA[0..N_PADDED) ascending ──────────── + // Replaces the previous LSD radix scatter, which produced + // count-dependent corruption (TODO-lbvh-sort.md): a memory-ordering + // bug in the Hillis-Steele scan / parallel scatter that surfaced only + // for certain Morton distributions (a small object beside a tight + // cluster), making fort geometry flicker. Despite careful review the + // exact race was never pinned down. // - // The exact bug in the Hillis-Steele scan + parallel scatter - // hasn't been identified despite careful review — likely a subtle - // memory-ordering / barrier issue that triggers only with the - // specific Morton-code distribution that arises when a small object - // (projectile) sits next to a large cluster (fort). + // A bitonic sorting network is DATA-OBLIVIOUS: the sequence of + // compare-exchanges depends only on N_PADDED, never on the key values. + // That eliminates the entire class of distribution-dependent races the + // radix sort tripped over (TODO strategy #5). N_PADDED is a power of + // two so the network is exact; sentinel keys (0xFFFFFFFF) compare + // largest and settle at the tail — exactly where Phase 4 expects them. // - // Skipping the sort means BVH leaves are in instance-index order - // (no spatial coherence). Ray traversal still descends the BVH - // tree, but parent AABBs are larger than they would be with sorted - // leaves, so more leaves get tested per ray. 
With the fort's - // ~1011-entry scale that's still fast enough; revisit if the - // entry count grows toward the LBVH_MAX cap. - if (false) { - for (var p: u32 = 0u; p < PASSES; p = p + 1u) { - let shift = p * 4u; - let srcIsA = (p & 1u) == 0u; - - // Clear histogram. - if (tid < BUCKETS) { - atomicStore(&shHist[tid], 0u); - } - workgroupBarrier(); - - // Histogram pass — K_PER elements per thread. - for (var k: u32 = 0u; k < K_PER; k = k + 1u) { - let i = k * THREADS + tid; - var myKey: u32; - if (srcIsA) { myKey = sortA[i]; } else { myKey = sortB[i]; } - let bucket = (myKey >> shift) & 0xFu; - atomicAdd(&shHist[bucket], 1u); - } - workgroupBarrier(); - - // Global bucket starts (16-wide; thread 0 does it sequentially). - if (tid == 0u) { - var s2: u32 = 0u; - for (var b: u32 = 0u; b < BUCKETS; b = b + 1u) { - shOffsets[b] = s2; - s2 = s2 + atomicLoad(&shHist[b]); - } - } - workgroupBarrier(); - - // Per-bucket parallel scatter. - for (var b: u32 = 0u; b < BUCKETS; b = b + 1u) { - // (a) Count my bucket-b elements. - var localCount: u32 = 0u; - for (var k: u32 = 0u; k < K_PER; k = k + 1u) { - let i = k * THREADS + tid; - var srcKey: u32; - if (srcIsA) { srcKey = sortA[i]; } else { srcKey = sortB[i]; } - let bk = (srcKey >> shift) & 0xFu; - if (bk == b) { localCount = localCount + 1u; } - } - shScan[tid] = localCount; - workgroupBarrier(); - - // (b) Hillis-Steele inclusive prefix scan across 1024 lanes. - // Single-buffered: read snapshot → barrier → write → barrier. - for (var step: u32 = 0u; step < SCAN_STEPS; step = step + 1u) { - let offset = 1u << step; - let v = shScan[tid]; - var prev: u32 = 0u; - if (tid >= offset) { prev = shScan[tid - offset]; } - workgroupBarrier(); - shScan[tid] = v + prev; - workgroupBarrier(); - } - // Convert inclusive→exclusive by subtracting own contribution. - let myExclusivePrefix = shScan[tid] - localCount; - - // (c) Scatter my bucket-b elements at the computed positions. 
- var localIdx: u32 = 0u; - for (var k: u32 = 0u; k < K_PER; k = k + 1u) { - let i = k * THREADS + tid; - var srcKey: u32; - if (srcIsA) { srcKey = sortA[i]; } else { srcKey = sortB[i]; } - let bk = (srcKey >> shift) & 0xFu; - if (bk == b) { - let dst = shOffsets[b] + myExclusivePrefix + localIdx; - if (srcIsA) { sortB[dst] = srcKey; } else { sortA[dst] = srcKey; } - localIdx = localIdx + 1u; + // Single workgroup, storage-resident: 16384 u32 = 64 KB exceeds the + // workgroup-storage cap, so the keys stay in sortA. Each of the 1024 + // threads owns PAIRS_PER_THREAD = (N_PADDED/2)/THREADS = 8 compare- + // exchanges per sub-stage. A bitonic network over 2^14 keys has + // sum(p for p in 1..=14) = 105 sub-stages; storageBarrier() fences + // sortA between each so one sub-stage's writes are visible to the next. + // sortB is unused by this path (left bound; harmless). + const PAIRS: u32 = N_PADDED / 2u; // 8192 compare-exchanges / sub-stage + const PAIRS_PER_THREAD: u32 = PAIRS / THREADS; // 8 + for (var k: u32 = 2u; k <= N_PADDED; k = k << 1u) { + for (var j: u32 = k >> 1u; j > 0u; j = j >> 1u) { + for (var t: u32 = 0u; t < PAIRS_PER_THREAD; t = t + 1u) { + // Linear pair id p in [0, N_PADDED/2). Map it to the lower + // index lo of the compared pair by inserting a 0 bit at + // position log2(j): lo has that bit clear, hi = lo | j. + let p = t * THREADS + tid; + let lo = ((p & ~(j - 1u)) << 1u) | (p & (j - 1u)); + let hi = lo | j; + let a = sortA[lo]; + let b = sortA[hi]; + // Sort direction for this bitonic block. lo and hi differ + // only in bit log2(j) (< log2(k)), so both agree on (x & k). + let ascending = (lo & k) == 0u; + if ((a > b) == ascending) { + sortA[lo] = b; + sortA[hi] = a; } } + storageBarrier(); workgroupBarrier(); } - storageBarrier(); } - } - // After 8 ping-pongs (even count) the sorted keys live in sortA. + // Sorted keys (ascending; sentinels last) now live in sortA. 
// ── Phase 3: write sorted instance permutation into outOrder ───────── for (var k: u32 = 0u; k < K_PER; k = k + 1u) {