fix(webgpu-rt): dynamic rayQuery TLAS leaf-start so picks hit for realistic instance counts (#25) #26
1 changed file with 29 additions and 4 deletions
fix(webgpu-rt): derive rayQuery TLAS leaf-start from dynamic nPadded (#25)
The software rayQuery shim's _rqTraverseTlas detected BVH leaves with a compile-time constant TLAS_BVH_LEAVES_START = 16384 - 1, while the actual TLAS sweep tree is built at depth log2(next_pow2(instanceCount)). For any scene with fewer than 8193 instances the padded leaf count is at most 8192, so the tree has at most 16383 nodes (indices 0..16382) and no node index ever reached 16383: every node looked internal, the descent walked into zeroed out-of-tree AABBs, and the pick reported a permanent miss. This broke every rayQuery=true compute shader (builder picking, splash queries) on the WebGPU backend. The fix passes the per-build padded leaf count to the shim the same way the megakernel _rtwTraverseTlas reads wfParams.tlasNPadded: a small uniform (RqTlasMeta.nPadded) at @group(1) @binding(10), written on each wgpuBuildTLAS call from wfNextPow2(instanceCount), and bound by both rayQuery dispatch paths. _rqTraverseTlas now computes leavesStart = nPadded - 1 dynamically. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
commit
8f6a52a460
|
|
@ -1108,6 +1108,7 @@ env.wgpuLoadCustomShader = (wgslPtr, wgslLen, bindingsPtr, bindingsCount, rayQue
|
||||||
{ binding: 7, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
{ binding: 7, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||||||
{ binding: 8, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
{ binding: 8, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||||||
{ binding: 9, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
{ binding: 9, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||||||
|
{ binding: 10, visibility: GPUShaderStage.COMPUTE, buffer: { type: "uniform" } },
|
||||||
]})
|
]})
|
||||||
: device.createBindGroupLayout({ entries: [
|
: device.createBindGroupLayout({ entries: [
|
||||||
{ binding: 0, visibility: GPUShaderStage.COMPUTE,
|
{ binding: 0, visibility: GPUShaderStage.COMPUTE,
|
||||||
|
|
@ -1208,6 +1209,7 @@ env.wgpuDispatchCustom = (pipelineHandle, pushPtr, pushBytes, handlesPtr, handle
|
||||||
{ binding: 7, resource: { buffer: rtState.attribsHeap.gpu } },
|
{ binding: 7, resource: { buffer: rtState.attribsHeap.gpu } },
|
||||||
{ binding: 8, resource: { buffer: orderBuf } },
|
{ binding: 8, resource: { buffer: orderBuf } },
|
||||||
{ binding: 9, resource: { buffer: bvhBuf } },
|
{ binding: 9, resource: { buffer: bvhBuf } },
|
||||||
|
{ binding: 10, resource: { buffer: rtState.rqTlasMetaBuf } },
|
||||||
],
|
],
|
||||||
});
|
});
|
||||||
state.pass.setBindGroup(1, rtBG);
|
state.pass.setBindGroup(1, rtBG);
|
||||||
|
|
@ -1464,8 +1466,18 @@ struct BvhNode {
|
||||||
_pad1: u32,
|
_pad1: u32,
|
||||||
};
|
};
|
||||||
@group(1) @binding(9) var<storage,read> tlasBvhNodes : array<BvhNode>;
|
@group(1) @binding(9) var<storage,read> tlasBvhNodes : array<BvhNode>;
|
||||||
const TLAS_BVH_N_PADDED: u32 = 16384u;
|
// Active TLAS sweep-tree padded leaf count, written per build as
|
||||||
const TLAS_BVH_LEAVES_START: u32 = TLAS_BVH_N_PADDED - 1u;
|
// next_pow2(instanceCount). Leaves live at [nPadded-1, 2*nPadded-1).
|
||||||
|
// The rayQuery descent MUST derive its leaf-start from this dynamic value
|
||||||
|
// — a fixed 16384-leaf assumption means no node index ever reaches a leaf
|
||||||
|
// for realistic (< 8193) instance counts, so every pick misses.
|
||||||
|
struct RqTlasMeta {
|
||||||
|
nPadded: u32,
|
||||||
|
_pad0: u32,
|
||||||
|
_pad1: u32,
|
||||||
|
_pad2: u32,
|
||||||
|
};
|
||||||
|
@group(1) @binding(10) var<uniform> rqTlasMeta : RqTlasMeta;
|
||||||
`;
|
`;
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -2145,8 +2157,9 @@ fn _rqTraverseTlas(rq: ptr<function, RayQuery>) {
|
||||||
if (!_rtAabb(rayWorld.origin, invD, node.aabbMin, node.aabbMax, (*rq).committedT)) {
|
if (!_rtAabb(rayWorld.origin, invD, node.aabbMin, node.aabbMax, (*rq).committedT)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (nodeIdx >= TLAS_BVH_LEAVES_START) {
|
let leavesStart = rqTlasMeta.nPadded - 1u;
|
||||||
let leafIdx = nodeIdx - TLAS_BVH_LEAVES_START;
|
if (nodeIdx >= leavesStart) {
|
||||||
|
let leafIdx = nodeIdx - leavesStart;
|
||||||
let i = tlasEntryOrder[leafIdx];
|
let i = tlasEntryOrder[leafIdx];
|
||||||
if (i == 0xFFFFFFFFu) { continue; }
|
if (i == 0xFFFFFFFFu) { continue; }
|
||||||
let inst = tlasEntries[i];
|
let inst = tlasEntries[i];
|
||||||
|
|
@ -2752,6 +2765,13 @@ function rtInit() {
|
||||||
size: 16,
|
size: 16,
|
||||||
usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
|
usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
|
||||||
});
|
});
|
||||||
|
// RqTlasMeta uniform for the rayQuery shim: holds the active TLAS's
|
||||||
|
// padded leaf count (next_pow2 of the instance count) so the software
|
||||||
|
// traversal derives its leaf-start dynamically, matching the build.
|
||||||
|
rtState.rqTlasMetaBuf = device.createBuffer({
|
||||||
|
size: 16,
|
||||||
|
usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
function rtMeshRecordsEnsure(meshCount) {
|
function rtMeshRecordsEnsure(meshCount) {
|
||||||
|
|
@ -2888,6 +2908,9 @@ env.wgpuBuildTLAS = (instanceBufHandle, instanceCount, tlasOutBufHandle,
|
||||||
countBuf[0] = instanceCount;
|
countBuf[0] = instanceCount;
|
||||||
countBuf[1] = wfNextPow2(instanceCount);
|
countBuf[1] = wfNextPow2(instanceCount);
|
||||||
queue.writeBuffer(rtState.lbvhCountBuf, 0, countBuf);
|
queue.writeBuffer(rtState.lbvhCountBuf, 0, countBuf);
|
||||||
|
// Publish the padded leaf count to the rayQuery shim's meta uniform so
|
||||||
|
// its TLAS descent uses the same dynamic leaf-start the build does.
|
||||||
|
queue.writeBuffer(rtState.rqTlasMetaBuf, 0, new Uint32Array([wfNextPow2(instanceCount), 0, 0, 0]));
|
||||||
|
|
||||||
const lbvhBg = device.createBindGroup({
|
const lbvhBg = device.createBindGroup({
|
||||||
layout: rtState.lbvhBuildBgl,
|
layout: rtState.lbvhBuildBgl,
|
||||||
|
|
@ -3444,6 +3467,7 @@ env.wgpuLoadComputePipeline = (wgslPtr, wgslLen, pushUniformSize,
|
||||||
{ binding: 7, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
{ binding: 7, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||||||
{ binding: 8, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
{ binding: 8, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||||||
{ binding: 9, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
{ binding: 9, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||||||
|
{ binding: 10, visibility: GPUShaderStage.COMPUTE, buffer: { type: "uniform" } },
|
||||||
]}));
|
]}));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -3572,6 +3596,7 @@ env.wgpuDispatchCompute = (pipelineHandle, pushPtr, pushBytes,
|
||||||
{ binding: 7, resource: { buffer: rtState.attribsHeap.gpu } },
|
{ binding: 7, resource: { buffer: rtState.attribsHeap.gpu } },
|
||||||
{ binding: 8, resource: { buffer: orderBuf } },
|
{ binding: 8, resource: { buffer: orderBuf } },
|
||||||
{ binding: 9, resource: { buffer: bvhBuf } },
|
{ binding: 9, resource: { buffer: bvhBuf } },
|
||||||
|
{ binding: 10, resource: { buffer: rtState.rqTlasMetaBuf } },
|
||||||
],
|
],
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue