From 8f6a52a460ef53c9c961bebb10d846ed199f7869 Mon Sep 17 00:00:00 2001 From: catbot Date: Thu, 4 Jun 2026 13:32:58 +0000 Subject: [PATCH] fix(webgpu-rt): derive rayQuery TLAS leaf-start from dynamic nPadded (#25) The software rayQuery shim's _rqTraverseTlas detected BVH leaves with a compile-time constant TLAS_BVH_LEAVES_START = 16384 - 1, while the actual TLAS sweep tree is built at depth log2(next_pow2(instanceCount)). For any scene with fewer than 8193 instances the padded leaf count is at most 8192, so the tree has at most 16383 nodes (indices 0..16382) and no node index ever reached 16383: every node looked internal, the descent walked into zeroed out-of-tree AABBs, and the pick reported a permanent miss. This broke every rayQuery=true compute shader (builder picking, splash queries) on the WebGPU backend. Pass the per-build padded leaf count to the shim the same way the megakernel _rtwTraverseTlas reads wfParams.tlasNPadded: a small uniform (RqTlasMeta.nPadded) at @group(1) @binding(10), written on each wgpuBuildTLAS call from wfNextPow2(instanceCount), and bound by both rayQuery dispatch paths. _rqTraverseTlas now computes leavesStart = nPadded - 1 dynamically. 
Co-Authored-By: Claude Opus 4.8 --- additional/dom-webgpu.js | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/additional/dom-webgpu.js b/additional/dom-webgpu.js index e5fbbd3..facf92c 100644 --- a/additional/dom-webgpu.js +++ b/additional/dom-webgpu.js @@ -1108,6 +1108,7 @@ env.wgpuLoadCustomShader = (wgslPtr, wgslLen, bindingsPtr, bindingsCount, rayQue { binding: 7, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } }, { binding: 8, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } }, { binding: 9, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } }, + { binding: 10, visibility: GPUShaderStage.COMPUTE, buffer: { type: "uniform" } }, ]}) : device.createBindGroupLayout({ entries: [ { binding: 0, visibility: GPUShaderStage.COMPUTE, @@ -1208,6 +1209,7 @@ env.wgpuDispatchCustom = (pipelineHandle, pushPtr, pushBytes, handlesPtr, handle { binding: 7, resource: { buffer: rtState.attribsHeap.gpu } }, { binding: 8, resource: { buffer: orderBuf } }, { binding: 9, resource: { buffer: bvhBuf } }, + { binding: 10, resource: { buffer: rtState.rqTlasMetaBuf } }, ], }); state.pass.setBindGroup(1, rtBG); @@ -1464,8 +1466,18 @@ struct BvhNode { _pad1: u32, }; @group(1) @binding(9) var tlasBvhNodes : array; -const TLAS_BVH_N_PADDED: u32 = 16384u; -const TLAS_BVH_LEAVES_START: u32 = TLAS_BVH_N_PADDED - 1u; +// Active TLAS sweep-tree padded leaf count, written per build as +// next_pow2(instanceCount). Leaves live at [nPadded-1, 2*nPadded-1). +// The rayQuery descent MUST derive its leaf-start from this dynamic value +// — a fixed 16384-leaf assumption means no node index ever reaches a leaf +// for realistic (< 8193) instance counts, so every pick misses. 
+struct RqTlasMeta { + nPadded: u32, + _pad0: u32, + _pad1: u32, + _pad2: u32, +}; +@group(1) @binding(10) var rqTlasMeta : RqTlasMeta; `; @@ -2145,8 +2157,9 @@ fn _rqTraverseTlas(rq: ptr) { if (!_rtAabb(rayWorld.origin, invD, node.aabbMin, node.aabbMax, (*rq).committedT)) { continue; } - if (nodeIdx >= TLAS_BVH_LEAVES_START) { - let leafIdx = nodeIdx - TLAS_BVH_LEAVES_START; + let leavesStart = rqTlasMeta.nPadded - 1u; + if (nodeIdx >= leavesStart) { + let leafIdx = nodeIdx - leavesStart; let i = tlasEntryOrder[leafIdx]; if (i == 0xFFFFFFFFu) { continue; } let inst = tlasEntries[i]; @@ -2752,6 +2765,13 @@ function rtInit() { size: 16, usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST, }); + // RqTlasMeta uniform for the rayQuery shim: holds the active TLAS's + // padded leaf count (next_pow2 of the instance count) so the software + // traversal derives its leaf-start dynamically, matching the build. + rtState.rqTlasMetaBuf = device.createBuffer({ + size: 16, + usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST, + }); } function rtMeshRecordsEnsure(meshCount) { @@ -2888,6 +2908,9 @@ env.wgpuBuildTLAS = (instanceBufHandle, instanceCount, tlasOutBufHandle, countBuf[0] = instanceCount; countBuf[1] = wfNextPow2(instanceCount); queue.writeBuffer(rtState.lbvhCountBuf, 0, countBuf); + // Publish the padded leaf count to the rayQuery shim's meta uniform so + // its TLAS descent uses the same dynamic leaf-start the build does. 
+ queue.writeBuffer(rtState.rqTlasMetaBuf, 0, new Uint32Array([wfNextPow2(instanceCount), 0, 0, 0])); const lbvhBg = device.createBindGroup({ layout: rtState.lbvhBuildBgl, @@ -3444,6 +3467,7 @@ env.wgpuLoadComputePipeline = (wgslPtr, wgslLen, pushUniformSize, { binding: 7, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } }, { binding: 8, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } }, { binding: 9, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } }, + { binding: 10, visibility: GPUShaderStage.COMPUTE, buffer: { type: "uniform" } }, ]})); } @@ -3572,6 +3596,7 @@ env.wgpuDispatchCompute = (pipelineHandle, pushPtr, pushBytes, { binding: 7, resource: { buffer: rtState.attribsHeap.gpu } }, { binding: 8, resource: { buffer: orderBuf } }, { binding: 9, resource: { buffer: bvhBuf } }, + { binding: 10, resource: { buffer: rtState.rqTlasMetaBuf } }, ], }); }