From afc0292fab6841e281af006bd781a627ae2a5fd0 Mon Sep 17 00:00:00 2001 From: catbot Date: Sun, 31 May 2026 20:28:12 +0000 Subject: [PATCH] WebGPU RT: dynamic TLAS sweep-tree depth (next_pow2 instances) The LBVH bitonic sort still runs over the full 16384 (sentinels sink to the tail), but the sweep tree is now built and traced at depth log2(next_pow2(nReal)) instead of a fixed 14. Add nPadded to LbvhPC; leaf init + bottom-up refit use it; the host passes the same next_pow2 to the trace via WfParams.tlasNPadded. Renders correctly at 512 instances (depth 9). The fragile sort phases are untouched. Co-Authored-By: Claude Opus 4.8 --- additional/dom-webgpu.js | 69 ++++++++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 23 deletions(-) diff --git a/additional/dom-webgpu.js b/additional/dom-webgpu.js index d3a9c72..c0e83d1 100644 --- a/additional/dom-webgpu.js +++ b/additional/dom-webgpu.js @@ -2281,7 +2281,11 @@ struct BvhNode { // runtime resize-on-grow caused subtle BVH corruption (driver-level // memory recycling, suspected) and was the root cause of mid-game // geometry flicker when projectiles entered the TLAS. -struct LbvhPC { nReal: u32, _pad0: u32, _pad1: u32, _pad2: u32 }; +// nPadded = next_pow2(max(nReal,1)), supplied by the host. The bitonic +// sort still runs over the full N_PADDED (sentinels sink to the tail), but +// the sweep tree is built (and traced) at depth log2(nPadded) so descent +// tracks the real instance count instead of a fixed 14. 
+struct LbvhPC { nReal: u32, nPadded: u32, _pad1: u32, _pad2: u32 }; @group(0) @binding(5) var<uniform> lbvhPc : LbvhPC; const N_PADDED: u32 = 16384u; @@ -2436,29 +2440,36 @@ fn lbvhBuildMain(@builtin(local_invocation_id) lid: vec3<u32>) { storageBarrier(); // ── Phase 4: initialize BVH leaf AABBs ─────────────────────────────── - for (var k: u32 = 0u; k < K_PER; k = k + 1u) { + // Only the first nPadded sorted slots become leaves of the (smaller) + // sweep tree; reals occupy [0,nReal), the rest sink as sentinels. + let nPadded = max(lbvhPc.nPadded, 1u); + let leafPerThread = (nPadded + THREADS - 1u) / THREADS; + for (var k: u32 = 0u; k < leafPerThread; k = k + 1u) { let i = k * THREADS + tid; - let leafIdx = N_PADDED - 1u + i; - let leafKey = sortA[i]; - if (leafKey == 0xFFFFFFFFu) { - outBvh[leafIdx].aabbMin = vec3<f32>( 1e30); - outBvh[leafIdx].aabbMax = vec3<f32>(-1e30); - } else { - let e = entries[leafKey & 0xFFFFu]; - outBvh[leafIdx].aabbMin = e.aabbMin; - outBvh[leafIdx].aabbMax = e.aabbMax; + if (i < nPadded) { + let leafIdx = nPadded - 1u + i; + let leafKey = sortA[i]; + if (leafKey == 0xFFFFFFFFu) { + outBvh[leafIdx].aabbMin = vec3<f32>( 1e30); + outBvh[leafIdx].aabbMax = vec3<f32>(-1e30); + } else { + let e = entries[leafKey & 0xFFFFu]; + outBvh[leafIdx].aabbMin = e.aabbMin; + outBvh[leafIdx].aabbMax = e.aabbMax; + } } } workgroupBarrier(); storageBarrier(); // ── Phase 5: bottom-up sweep-tree refit, LEVELS iterations ────────── - // Deepest internal level has N_PADDED/2 nodes; perThread = ceil of - // levelCount / THREADS is uniform per step, so workgroupBarrier - // stays in uniform control flow. - var levelCount: u32 = N_PADDED / 2u; - var levelStart: u32 = N_PADDED / 2u - 1u; - for (var step: u32 = 0u; step < LEVELS; step = step + 1u) { + // ── Phase 5: bottom-up sweep-tree refit, log2(nPadded) levels ─────── + // Deepest internal level has nPadded/2 nodes.
The loop bound is uniform + // across the workgroup (depends only on nPadded), so the barriers stay + // in uniform control flow. + var levelCount: u32 = nPadded / 2u; + var levelStart: u32 = nPadded / 2u - 1u; + loop { + if (levelCount == 0u) { break; } let perThread = (levelCount + THREADS - 1u) / THREADS; for (var k: u32 = 0u; k < perThread; k = k + 1u) { let nodeOff = k * THREADS + tid; @@ -2723,11 +2734,12 @@ env.wgpuBuildTLAS = (instanceBufHandle, instanceCount, tlasOutBufHandle, { binding: 4, resource: { buffer: morton } }, ], }); - // Write the real instance count to the LBVH count uniform so the - // shader can iterate exactly the right number of entries even - // though the storage buffers stay sized for N_PADDED. + // Write the real instance count + the dynamic padded leaf count + // (next_pow2) to the LBVH uniform. The sort still runs over the full + // N_PADDED, but the sweep tree is built at depth log2(nPadded). const countBuf = new Uint32Array(4); countBuf[0] = instanceCount; + countBuf[1] = wfNextPow2(instanceCount); queue.writeBuffer(rtState.lbvhCountBuf, 0, countBuf); const lbvhBg = device.createBindGroup({ @@ -2798,7 +2810,15 @@ const WF_PAYLOAD_BYTES = 64; // Dynamic-offset uniform ring: one WfParams slot per wavefront pass. 128 // slots covers maxDepth up to ~42 (1 + 3·maxDepth + 1 passes). const WF_PARAM_SLOTS = 128; -const WF_FIXED_TLAS_NPADDED = 16384; // matches lbvhBuildWgsl N_PADDED +const WF_TLAS_MAX_NPADDED = 16384; // LBVH sort capacity (N_PADDED) +// Smallest power of two >= max(n,1), clamped to the LBVH capacity. The +// TLAS sweep tree is built and traced at this depth so descent tracks the +// real instance count instead of a fixed 16384-leaf (depth-14) tree. +function wfNextPow2(n) { + let p = 1; + while (p < n && p < WF_TLAS_MAX_NPADDED) p <<= 1; + return p; +} function ensureWavefrontBuffers(W, H) { const cap = W * H; @@ -3041,11 +3061,14 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes, // 1+3*d .. 
+2 PREP / TRACE / SHADE for bounce d // 1+3*depth RESOLVE const passCount = 2 + 3 * depth; + // TLAS descent depth = log2(tlasNPadded); must match the value the + // build used (both derive next_pow2 from the same instance count). + const tlasNPadded = wfNextPow2(instanceCount); const ring = new Uint32Array(WF_PARAM_SLOTS * 64); // 256 B = 64 u32 per slot const writeSlot = (slot, curIsA, bounce) => { const o = slot * 64; ring[o + 0] = W; ring[o + 1] = H; ring[o + 2] = cap; ring[o + 3] = curIsA; - ring[o + 4] = bounce; ring[o + 5] = depth; ring[o + 6] = WF_FIXED_TLAS_NPADDED; ring[o + 7] = 0; + ring[o + 4] = bounce; ring[o + 5] = depth; ring[o + 6] = tlasNPadded; ring[o + 7] = 0; }; writeSlot(0, 1, 0); // GENERATE for (let d = 0; d < depth; d++) {