From 8f6a52a460ef53c9c961bebb10d846ed199f7869 Mon Sep 17 00:00:00 2001
From: catbot <catbot@bot.local>
Date: Thu, 4 Jun 2026 13:32:58 +0000
Subject: [PATCH] fix(webgpu-rt): derive rayQuery TLAS leaf-start from dynamic
 nPadded (#25)

The software rayQuery shim's _rqTraverseTlas detected BVH leaves with a
compile-time constant TLAS_BVH_LEAVES_START = 16384 - 1, while the actual
TLAS sweep tree is built at depth log2(next_pow2(instanceCount)). For any
scene with fewer than 8193 instances the padded leaf count is at most
8192, so the tree holds at most 2*8192 - 1 = 16383 nodes (indices
0..16382) and no node index ever reached 16383: every node looked
internal, the descent walked into zeroed out-of-tree AABBs, and the pick
reported a permanent miss. This broke every rayQuery=true compute shader
(builder picking, splash queries) on the WebGPU backend.

Pass the per-build padded leaf count to the shim the same way the
megakernel _rtwTraverseTlas reads wfParams.tlasNPadded: a small uniform
(RqTlasMeta.nPadded) at @group(1) @binding(10), written on every
wgpuBuildTLAS call from wfNextPow2(instanceCount) and bound by both
rayQuery dispatch paths (wgpuDispatchCustom and wgpuDispatchCompute).
_rqTraverseTlas now computes leavesStart = nPadded - 1 dynamically.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 additional/dom-webgpu.js | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)
diff --git a/additional/dom-webgpu.js b/additional/dom-webgpu.js
index e5fbbd3..facf92c 100644
--- a/additional/dom-webgpu.js
+++ b/additional/dom-webgpu.js
@@ -1108,6 +1108,7 @@ env.wgpuLoadCustomShader = (wgslPtr, wgslLen, bindingsPtr, bindingsCount, rayQue
                 { binding: 7, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
             { binding: 8, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
             { binding: 9, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
+            { binding: 10, visibility: GPUShaderStage.COMPUTE, buffer: { type: "uniform" } },
             ]})
             : device.createBindGroupLayout({ entries: [
                 { binding: 0, visibility: GPUShaderStage.COMPUTE,
@@ -1208,6 +1209,7 @@ env.wgpuDispatchCustom = (pipelineHandle, pushPtr, pushBytes, handlesPtr, handle
                 { binding: 7, resource: { buffer: rtState.attribsHeap.gpu } },
                 { binding: 8, resource: { buffer: orderBuf } },
                 { binding: 9, resource: { buffer: bvhBuf } },
+                { binding: 10, resource: { buffer: rtState.rqTlasMetaBuf } },
             ],
         });
         state.pass.setBindGroup(1, rtBG);
@@ -1464,8 +1466,18 @@ struct BvhNode {
     _pad1:   u32,
 };
 @group(1) @binding(9) var<storage,read>  tlasBvhNodes : array<BvhNode>;
-const TLAS_BVH_N_PADDED: u32 = 16384u;
-const TLAS_BVH_LEAVES_START: u32 = TLAS_BVH_N_PADDED - 1u;
+// Active TLAS sweep-tree padded leaf count, written per build as
+// next_pow2(instanceCount). Leaves live at [nPadded-1, 2*nPadded-1).
+// The rayQuery descent MUST derive its leaf-start from this dynamic value
+// — a fixed 16384-leaf assumption means no node index ever reaches a leaf
+// for realistic (< 8193) instance counts, so every pick misses.
+struct RqTlasMeta {
+    nPadded: u32,
+    _pad0:   u32,
+    _pad1:   u32,
+    _pad2:   u32,
+};
+@group(1) @binding(10) var<uniform> rqTlasMeta : RqTlasMeta;
 `;
 
 
@@ -2145,8 +2157,9 @@ fn _rqTraverseTlas(rq: ptr<function, RayQuery>) {
         if (!_rtAabb(rayWorld.origin, invD, node.aabbMin, node.aabbMax, (*rq).committedT)) {
             continue;
         }
-        if (nodeIdx >= TLAS_BVH_LEAVES_START) {
-            let leafIdx = nodeIdx - TLAS_BVH_LEAVES_START;
+        let leavesStart = rqTlasMeta.nPadded - 1u;
+        if (nodeIdx >= leavesStart) {
+            let leafIdx = nodeIdx - leavesStart;
             let i = tlasEntryOrder[leafIdx];
             if (i == 0xFFFFFFFFu) { continue; }
             let inst = tlasEntries[i];
@@ -2752,6 +2765,13 @@ function rtInit() {
         size: 16,
         usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
     });
+    // RqTlasMeta uniform for the rayQuery shim: holds the active TLAS's
+    // padded leaf count (next_pow2 of the instance count) so the software
+    // traversal derives its leaf-start dynamically, matching the build.
+    rtState.rqTlasMetaBuf = device.createBuffer({
+        size: 16,
+        usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
+    });
 }
 
 function rtMeshRecordsEnsure(meshCount) {
@@ -2888,6 +2908,9 @@ env.wgpuBuildTLAS = (instanceBufHandle, instanceCount, tlasOutBufHandle,
     countBuf[0] = instanceCount;
     countBuf[1] = wfNextPow2(instanceCount);
     queue.writeBuffer(rtState.lbvhCountBuf, 0, countBuf);
+    // Publish the padded leaf count to the rayQuery shim's meta uniform so
+    // its TLAS descent uses the same dynamic leaf-start the build does.
+    queue.writeBuffer(rtState.rqTlasMetaBuf, 0, new Uint32Array([wfNextPow2(instanceCount), 0, 0, 0]));
 
     const lbvhBg = device.createBindGroup({
         layout: rtState.lbvhBuildBgl,
@@ -3444,6 +3467,7 @@ env.wgpuLoadComputePipeline = (wgslPtr, wgslLen, pushUniformSize,
             { binding: 7, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
             { binding: 8, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
             { binding: 9, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
+            { binding: 10, visibility: GPUShaderStage.COMPUTE, buffer: { type: "uniform" } },
         ]}));
     }
 
@@ -3572,6 +3596,7 @@ env.wgpuDispatchCompute = (pipelineHandle, pushPtr, pushBytes,
                 { binding: 7, resource: { buffer: rtState.attribsHeap.gpu } },
                 { binding: 8, resource: { buffer: orderBuf } },
                 { binding: 9, resource: { buffer: bvhBuf } },
+                { binding: 10, resource: { buffer: rtState.rqTlasMetaBuf } },
             ],
         });
     }