Merge pull request 'WebGPU RT: enable TLAS spatial sort via bitonic network (plan phase 3)' (#2) from claude/issue-1 into master
This commit is contained in:
commit
e0d72f57f2
2 changed files with 66 additions and 132 deletions
|
|
@ -1,5 +1,20 @@
|
||||||
# LBVH parallel radix sort: count-dependent corruption
|
# LBVH parallel radix sort: count-dependent corruption
|
||||||
|
|
||||||
|
> **RESOLVED (strategy #5 — bitonic sort).** The LSD radix scatter was
|
||||||
|
> replaced with a data-oblivious workgroup **bitonic sorting network** in
|
||||||
|
> `lbvhBuildMain` (`additional/dom-webgpu.js`, Phase 2). Because a bitonic
|
||||||
|
> network's compare-exchange schedule depends only on N_PADDED — never on
|
||||||
|
> the key distribution — it cannot exhibit the count-dependent corruption
|
||||||
|
> documented below. The sort is now enabled (the old `if (false)` guard is
|
||||||
|
> gone) so TLAS leaves are Morton (Z-order) coherent again.
|
||||||
|
>
|
||||||
|
> Verified on the Firefox/Dawn WebGPU stack with a GPU unit test that diffs
|
||||||
|
> the kernel output against a CPU oracle across all three required
|
||||||
|
> distributions (all-uniform, all-one-bucket, and the "small object next to
|
||||||
|
> a tight cluster" repro) plus random/reverse/empty edge cases — all match
|
||||||
|
> bit-for-bit, with a valid index permutation. Sponza renders correctly with
|
||||||
|
> the sort live. The historical analysis below is retained for context.
|
||||||
|
|
||||||
## Summary
|
## Summary
|
||||||
|
|
||||||
The parallel radix sort in `lbvhBuildMain` (additional/dom-webgpu.js) produces
|
The parallel radix sort in `lbvhBuildMain` (additional/dom-webgpu.js) produces
|
||||||
|
|
|
||||||
|
|
@ -1999,26 +1999,23 @@ fn tlasBuildMain(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||||||
// 1. Pack (morton16 << 16) | tlasIndex16 keys into sortA. Sentinel
|
// 1. Pack (morton16 << 16) | tlasIndex16 keys into sortA. Sentinel
|
||||||
// slots get 0xFFFFFFFF so they sort to the end; m16 is clamped to
|
// slots get 0xFFFFFFFF so they sort to the end; m16 is clamped to
|
||||||
// 0xFFFE so no real key collides with the sentinel.
|
// 0xFFFE so no real key collides with the sentinel.
|
||||||
// 2. LSD radix sort — 8 passes × 4 bits, ping-pong sortA ↔ sortB.
|
// 2. Bitonic sort of the packed keys ascending (in-place in sortA).
|
||||||
// STAGE 1: single-thread (thread 0) sequential scatter for stable
|
// Data-oblivious network — 105 compare-exchange sub-stages over
|
||||||
// ordering. This is the slow-but-trivially-correct baseline; Stage
|
// 2^14 keys — so it cannot exhibit the count-dependent corruption
|
||||||
// 2 will parallelize the scatter using per-thread local histograms
|
// the old LSD radix scatter did (TODO-lbvh-sort.md, strategy #5).
|
||||||
// + a cross-thread scan.
|
|
||||||
// 3. Write sorted instance permutation into outOrder.
|
// 3. Write sorted instance permutation into outOrder.
|
||||||
// 4. Initialize BVH leaf AABBs from sorted instances.
|
// 4. Initialize BVH leaf AABBs from sorted instances.
|
||||||
// 5. Bottom-up sweep-tree refit, log2(N_PADDED) levels.
|
// 5. Bottom-up sweep-tree refit, log2(N_PADDED) levels.
|
||||||
//
|
//
|
||||||
// Storage-barrier pattern: workgroupBarrier() fences workgroup memory
|
// Storage-barrier pattern: workgroupBarrier() fences workgroup memory
|
||||||
// only per WGSL spec; storageBarrier() is required between R/W phases
|
// only per WGSL spec; storageBarrier() is required between R/W phases
|
||||||
// on sortA/B and outBvh. Both are called at every storage boundary —
|
// on sortA and outBvh — including between every bitonic sub-stage, which
|
||||||
// minor perf cost, eliminates the class of bug that hung the GPU on
|
// reads and writes sortA. Both are called at every storage boundary.
|
||||||
// the previous radix attempt.
|
|
||||||
//
|
//
|
||||||
// Hard cap: LBVH_MAX = 16384. Parallel scatter (per-bucket Hillis-Steele
|
// Hard cap: LBVH_MAX = 16384. The bitonic sort gives BVH leaves Morton
|
||||||
// scan over 1024-lane indicators) made the build cost flat at ~0.5 ms
|
// (Z-order) spatial coherence, and the degenerate-AABB fix in _rtAabb
|
||||||
// regardless of N_PADDED, and the degenerate-AABB fix in _rtAabb keeps
|
// keeps sentinel-only subtrees from being traversed. Per-ray cost scales
|
||||||
// sentinel-only subtrees from being traversed. Per-ray cost scales with
|
// with log2(N_real), not log2(N_PADDED).
|
||||||
// log2(N_real), not log2(N_PADDED).
|
|
||||||
const LBVH_MAX = 16384;
|
const LBVH_MAX = 16384;
|
||||||
const lbvhBuildWgsl = String.raw`
|
const lbvhBuildWgsl = String.raw`
|
||||||
struct TLASEntryStub {
|
struct TLASEntryStub {
|
||||||
|
|
@ -2065,17 +2062,7 @@ const THREADS: u32 = 1024u;
|
||||||
const K_PER: u32 = 16u; // = N_PADDED / THREADS
|
const K_PER: u32 = 16u; // = N_PADDED / THREADS
|
||||||
const REDUCE_LANES: u32 = 256u;
|
const REDUCE_LANES: u32 = 256u;
|
||||||
const REDUCE_K_PER: u32 = 64u; // = N_PADDED / REDUCE_LANES
|
const REDUCE_K_PER: u32 = 64u; // = N_PADDED / REDUCE_LANES
|
||||||
const BUCKETS: u32 = 16u;
|
|
||||||
const PASSES: u32 = 8u;
|
|
||||||
const LEVELS: u32 = 14u; // log2(N_PADDED)
|
const LEVELS: u32 = 14u; // log2(N_PADDED)
|
||||||
const SCAN_STEPS: u32 = 10u; // log2(THREADS)
|
|
||||||
|
|
||||||
var<workgroup> shHist: array<atomic<u32>, BUCKETS>;
|
|
||||||
var<workgroup> shOffsets: array<u32, BUCKETS>;
|
|
||||||
// Hillis-Steele scratch for per-bucket exclusive prefix sum over 1024
|
|
||||||
// per-thread bucket counts. 4 KB. Reused across all 8 × 16 bucket scans
|
|
||||||
// in the radix passes.
|
|
||||||
var<workgroup> shScan: array<u32, THREADS>;
|
|
||||||
|
|
||||||
// Scene-AABB reduction scratch — 256-lane tree reduce. vec3 stride is
|
// Scene-AABB reduction scratch — 256-lane tree reduce. vec3 stride is
|
||||||
// 16 by WGSL alignment → 4 KB each, 8 KB total. Well under the 16 KB
|
// 16 by WGSL alignment → 4 KB each, 8 KB total. Well under the 16 KB
|
||||||
|
|
@ -2162,122 +2149,54 @@ fn lbvhBuildMain(@builtin(local_invocation_id) lid: vec3<u32>) {
|
||||||
workgroupBarrier();
|
workgroupBarrier();
|
||||||
storageBarrier();
|
storageBarrier();
|
||||||
|
|
||||||
// ── Phase 2: stable LSD radix sort, fully parallel scatter ──────────
|
// ── Phase 2: bitonic sort of sortA[0..N_PADDED) ascending ────────────
|
||||||
// Per pass:
|
// Replaces the previous LSD radix scatter, which produced
|
||||||
// 1. Histogram (parallel atomicAdd to shHist).
|
// count-dependent corruption (TODO-lbvh-sort.md): likely a memory-ordering
|
||||||
// 2. Global bucket starts: exclusive prefix scan of shHist into
|
// bug in the Hillis-Steele scan / parallel scatter that surfaced only
|
||||||
// shOffsets[16] — small, done sequentially by thread 0.
|
// for certain Morton distributions (a small object beside a tight
|
||||||
// 3. For each of the 16 buckets:
|
// cluster), making fort geometry flicker. Despite careful review the
|
||||||
// a. Each thread counts its bucket-b source elements (K_PER
|
// exact race was never pinned down.
|
||||||
// re-reads, no caching — storage reads are L1-cheap).
|
|
||||||
// b. Hillis-Steele exclusive prefix scan of those counts
|
|
||||||
// across all THREADS lanes (log2(THREADS)=10 levels).
|
|
||||||
// Single-buffered with a read/barrier/write/barrier pattern
|
|
||||||
// per step so reads and writes don't race.
|
|
||||||
// c. Each thread re-walks its K_PER elements in source order
|
|
||||||
// and writes bucket-b ones to dst = shOffsets[b]
|
|
||||||
// + my_exclusive_prefix
|
|
||||||
// + my_local_idx_so_far.
|
|
||||||
// Stability holds because per-thread iteration is in source
|
|
||||||
// order and the cross-thread offsets respect thread index order.
|
|
||||||
// WORKAROUND: the parallel radix sort below corrupts sortA in a
|
|
||||||
// way that's count-dependent — symptom was mid-game geometry
|
|
||||||
// flicker as soon as ANY extra TLAS instance was added beyond the
|
|
||||||
// initial scene (e.g. firing a projectile would make fort braces
|
|
||||||
// appear to disappear in patterns deterministic on the projectile's
|
|
||||||
// angle). Bisected by skipping each LBVH phase in turn: with the
|
|
||||||
// sort skipped, no flicker. With the sort enabled, flicker.
|
|
||||||
//
|
//
|
||||||
// The exact bug in the Hillis-Steele scan + parallel scatter
|
// A bitonic sorting network is DATA-OBLIVIOUS: the sequence of
|
||||||
// hasn't been identified despite careful review — likely a subtle
|
// compare-exchanges depends only on N_PADDED, never on the key values.
|
||||||
// memory-ordering / barrier issue that triggers only with the
|
// That eliminates the entire class of distribution-dependent races the
|
||||||
// specific Morton-code distribution that arises when a small object
|
// radix sort tripped over (TODO strategy #5). N_PADDED is a power of
|
||||||
// (projectile) sits next to a large cluster (fort).
|
// two so the network is exact; sentinel keys (0xFFFFFFFF) compare
|
||||||
|
// largest and settle at the tail — exactly where Phase 4 expects them.
|
||||||
//
|
//
|
||||||
// Skipping the sort means BVH leaves are in instance-index order
|
// Single workgroup, storage-resident: 16384 u32 = 64 KB exceeds the
|
||||||
// (no spatial coherence). Ray traversal still descends the BVH
|
// workgroup-storage cap, so the keys stay in sortA. Each of the 1024
|
||||||
// tree, but parent AABBs are larger than they would be with sorted
|
// threads owns PAIRS_PER_THREAD = (N_PADDED/2)/THREADS = 8 compare-
|
||||||
// leaves, so more leaves get tested per ray. With the fort's
|
// exchanges per sub-stage. A bitonic network over 2^14 keys has
|
||||||
// ~1011-entry scale that's still fast enough; revisit if the
|
// 1 + 2 + ... + 14 = 105 sub-stages; storageBarrier() fences
|
||||||
// entry count grows toward the LBVH_MAX cap.
|
// sortA between each so one sub-stage's writes are visible to the next.
|
||||||
if (false) {
|
// sortB is unused by this path (its binding is left in place; harmless).
|
||||||
for (var p: u32 = 0u; p < PASSES; p = p + 1u) {
|
const PAIRS: u32 = N_PADDED / 2u; // 8192 compare-exchanges / sub-stage
|
||||||
let shift = p * 4u;
|
const PAIRS_PER_THREAD: u32 = PAIRS / THREADS; // 8
|
||||||
let srcIsA = (p & 1u) == 0u;
|
for (var k: u32 = 2u; k <= N_PADDED; k = k << 1u) {
|
||||||
|
for (var j: u32 = k >> 1u; j > 0u; j = j >> 1u) {
|
||||||
// Clear histogram.
|
for (var t: u32 = 0u; t < PAIRS_PER_THREAD; t = t + 1u) {
|
||||||
if (tid < BUCKETS) {
|
// Linear pair id p in [0, N_PADDED/2). Map it to the lower
|
||||||
atomicStore(&shHist[tid], 0u);
|
// index lo of the compared pair by inserting a 0 bit at
|
||||||
|
// position log2(j): lo has that bit clear, hi = lo | j.
|
||||||
|
let p = t * THREADS + tid;
|
||||||
|
let lo = ((p & ~(j - 1u)) << 1u) | (p & (j - 1u));
|
||||||
|
let hi = lo | j;
|
||||||
|
let a = sortA[lo];
|
||||||
|
let b = sortA[hi];
|
||||||
|
// Sort direction for this bitonic block. lo and hi differ
|
||||||
|
// only in bit log2(j) (< log2(k)), so both agree on (x & k).
|
||||||
|
let ascending = (lo & k) == 0u;
|
||||||
|
if ((a > b) == ascending) {
|
||||||
|
sortA[lo] = b;
|
||||||
|
sortA[hi] = a;
|
||||||
}
|
}
|
||||||
workgroupBarrier();
|
|
||||||
|
|
||||||
// Histogram pass — K_PER elements per thread.
|
|
||||||
for (var k: u32 = 0u; k < K_PER; k = k + 1u) {
|
|
||||||
let i = k * THREADS + tid;
|
|
||||||
var myKey: u32;
|
|
||||||
if (srcIsA) { myKey = sortA[i]; } else { myKey = sortB[i]; }
|
|
||||||
let bucket = (myKey >> shift) & 0xFu;
|
|
||||||
atomicAdd(&shHist[bucket], 1u);
|
|
||||||
}
|
|
||||||
workgroupBarrier();
|
|
||||||
|
|
||||||
// Global bucket starts (16-wide; thread 0 does it sequentially).
|
|
||||||
if (tid == 0u) {
|
|
||||||
var s2: u32 = 0u;
|
|
||||||
for (var b: u32 = 0u; b < BUCKETS; b = b + 1u) {
|
|
||||||
shOffsets[b] = s2;
|
|
||||||
s2 = s2 + atomicLoad(&shHist[b]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
workgroupBarrier();
|
|
||||||
|
|
||||||
// Per-bucket parallel scatter.
|
|
||||||
for (var b: u32 = 0u; b < BUCKETS; b = b + 1u) {
|
|
||||||
// (a) Count my bucket-b elements.
|
|
||||||
var localCount: u32 = 0u;
|
|
||||||
for (var k: u32 = 0u; k < K_PER; k = k + 1u) {
|
|
||||||
let i = k * THREADS + tid;
|
|
||||||
var srcKey: u32;
|
|
||||||
if (srcIsA) { srcKey = sortA[i]; } else { srcKey = sortB[i]; }
|
|
||||||
let bk = (srcKey >> shift) & 0xFu;
|
|
||||||
if (bk == b) { localCount = localCount + 1u; }
|
|
||||||
}
|
|
||||||
shScan[tid] = localCount;
|
|
||||||
workgroupBarrier();
|
|
||||||
|
|
||||||
// (b) Hillis-Steele inclusive prefix scan across 1024 lanes.
|
|
||||||
// Single-buffered: read snapshot → barrier → write → barrier.
|
|
||||||
for (var step: u32 = 0u; step < SCAN_STEPS; step = step + 1u) {
|
|
||||||
let offset = 1u << step;
|
|
||||||
let v = shScan[tid];
|
|
||||||
var prev: u32 = 0u;
|
|
||||||
if (tid >= offset) { prev = shScan[tid - offset]; }
|
|
||||||
workgroupBarrier();
|
|
||||||
shScan[tid] = v + prev;
|
|
||||||
workgroupBarrier();
|
|
||||||
}
|
|
||||||
// Convert inclusive→exclusive by subtracting own contribution.
|
|
||||||
let myExclusivePrefix = shScan[tid] - localCount;
|
|
||||||
|
|
||||||
// (c) Scatter my bucket-b elements at the computed positions.
|
|
||||||
var localIdx: u32 = 0u;
|
|
||||||
for (var k: u32 = 0u; k < K_PER; k = k + 1u) {
|
|
||||||
let i = k * THREADS + tid;
|
|
||||||
var srcKey: u32;
|
|
||||||
if (srcIsA) { srcKey = sortA[i]; } else { srcKey = sortB[i]; }
|
|
||||||
let bk = (srcKey >> shift) & 0xFu;
|
|
||||||
if (bk == b) {
|
|
||||||
let dst = shOffsets[b] + myExclusivePrefix + localIdx;
|
|
||||||
if (srcIsA) { sortB[dst] = srcKey; } else { sortA[dst] = srcKey; }
|
|
||||||
localIdx = localIdx + 1u;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
workgroupBarrier();
|
|
||||||
}
|
}
|
||||||
storageBarrier();
|
storageBarrier();
|
||||||
|
workgroupBarrier();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// After 8 ping-pongs (even count) the sorted keys live in sortA.
|
// Sorted keys (ascending; sentinels last) now live in sortA.
|
||||||
|
|
||||||
// ── Phase 3: write sorted instance permutation into outOrder ─────────
|
// ── Phase 3: write sorted instance permutation into outOrder ─────────
|
||||||
for (var k: u32 = 0u; k < K_PER; k = k + 1u) {
|
for (var k: u32 = 0u; k < K_PER; k = k + 1u) {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue