WebGPU RT: wavefront/streaming tracer (replaces megakernel) #4
1 changed file with 97 additions and 6 deletions
WebGPU RT: GPU timestamp-query per-pass harness
Request the timestamp-query feature; write begin/end timestamps around each wavefront pass via timestampWrites; resolve + read back (deferred to after submit) and print a per-pass us breakdown ~1x/sec. RTStress @ 512 instances, 1920x995: TRACE dominates, total ~1.8-3.0ms/frame. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
commit
1d2e12dbc9
|
|
@ -146,7 +146,10 @@ clamp("maxComputeWorkgroupSizeX", 1024);
|
||||||
clamp("maxBufferSize", 1 << 30);
|
clamp("maxBufferSize", 1 << 30);
|
||||||
clamp("maxStorageBufferBindingSize", 1 << 30);
|
clamp("maxStorageBufferBindingSize", 1 << 30);
|
||||||
clamp("maxComputeWorkgroupsPerDimension", 65535);
|
clamp("maxComputeWorkgroupsPerDimension", 65535);
|
||||||
const device = await adapter.requestDevice({ requiredLimits });
|
// Per-pass GPU timing for the wavefront tracer (RTStress HUD / PR numbers).
|
||||||
|
const tsSupported = adapter.features && adapter.features.has("timestamp-query");
|
||||||
|
const requiredFeatures = tsSupported ? ["timestamp-query"] : [];
|
||||||
|
const device = await adapter.requestDevice({ requiredLimits, requiredFeatures });
|
||||||
const queue = device.queue;
|
const queue = device.queue;
|
||||||
const ctx = canvas.getContext("webgpu");
|
const ctx = canvas.getContext("webgpu");
|
||||||
const canvasFormat = "rgba8unorm"; // match storage textures, skip swizzle blit
|
const canvasFormat = "rgba8unorm"; // match storage textures, skip swizzle blit
|
||||||
|
|
@ -935,6 +938,23 @@ env.wgpuFrameEnd = () => {
|
||||||
queue.submit([state.encoder.finish()]);
|
queue.submit([state.encoder.finish()]);
|
||||||
state.encoder = null;
|
state.encoder = null;
|
||||||
|
|
||||||
|
// Map the wavefront timestamp readback (its resolve/copy was encoded on
|
||||||
|
// the just-submitted encoder) and log a per-pass breakdown ~1×/sec.
|
||||||
|
if (state.tsReadPending) {
|
||||||
|
const ts = state.tsReadPending;
|
||||||
|
state.tsReadPending = null;
|
||||||
|
const n = ts.pendingLabels.length;
|
||||||
|
ts.readBuf.mapAsync(GPUMapMode.READ, 0, 2 * n * 8).then(() => {
|
||||||
|
const data = new BigInt64Array(ts.readBuf.getMappedRange(0, 2 * n * 8).slice(0));
|
||||||
|
ts.readBuf.unmap();
|
||||||
|
ts.inFlight = false;
|
||||||
|
wfLogTimestamps(ts, data);
|
||||||
|
}).catch((e) => {
|
||||||
|
ts.inFlight = false;
|
||||||
|
console.error("[crafter-wgpu] timestamp readback failed:", e);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
// Kick off mapAsync for the readbacks whose copyBufferToBuffer we
|
// Kick off mapAsync for the readbacks whose copyBufferToBuffer we
|
||||||
// piggy-backed onto the just-submitted encoder. Doing this after
|
// piggy-backed onto the just-submitted encoder. Doing this after
|
||||||
// submit ensures the map waits for that submission's GPU work to
|
// submit ensures the map waits for that submission's GPU work to
|
||||||
|
|
@ -2955,6 +2975,51 @@ function ensureWavefrontBuffers(W, H) {
|
||||||
return wf;
|
return wf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── GPU timestamp-query harness ──────────────────────────────────────────
|
||||||
|
//
|
||||||
|
// One QuerySet with 2 slots per wavefront pass; each beginComputePass writes
|
||||||
|
// begin/end timestamps. After the passes we resolve into a buffer and read
|
||||||
|
// it back (deferred to after submit, like the readback path). Deltas are
|
||||||
|
// summed per pass label and printed ~1×/sec as a per-pass breakdown.
|
||||||
|
const WF_TS_MAX_PASSES = 64; // covers maxDepth up to ~20
|
||||||
|
/**
 * Lazily create (and cache on `rtState.ts`) the GPU objects used for
 * per-pass timing of the wavefront tracer.
 *
 * The returned record holds:
 *   - `capacity`      number of query slots (two per pass, WF_TS_MAX_PASSES passes)
 *   - `querySet`      timestamp query set with `capacity` slots
 *   - `resolveBuf`    QUERY_RESOLVE | COPY_SRC buffer, 8 bytes per slot
 *   - `readBuf`       MAP_READ | COPY_DST buffer, 8 bytes per slot
 *   - `inFlight`      initialised to false
 *   - `lastLog`       initialised to 0
 *   - `pendingLabels` initialised to null
 *
 * @returns {object|null} the cached timing record, or null when
 *   `tsSupported` is falsy.
 */
function wfEnsureTimestamps() {
  if (!tsSupported) {
    return null;
  }

  if (!rtState.ts) {
    // Two slots (begin + end) for every pass we are willing to time.
    const slotCount = 2 * WF_TS_MAX_PASSES;
    // Each query slot occupies 8 bytes in the resolve/readback buffers.
    const byteLength = slotCount * 8;

    const querySet = device.createQuerySet({ type: "timestamp", count: slotCount });
    const resolveBuf = device.createBuffer({
      size: byteLength,
      usage: GPUBufferUsage.QUERY_RESOLVE | GPUBufferUsage.COPY_SRC,
    });
    const readBuf = device.createBuffer({
      size: byteLength,
      usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST,
    });

    rtState.ts = {
      capacity: slotCount,
      querySet,
      resolveBuf,
      readBuf,
      inFlight: false,
      lastLog: 0,
      pendingLabels: null,
    };
  }

  return rtState.ts;
}
|
||||||
|
/**
 * Print a per-pass GPU timing breakdown, throttled to roughly once a second.
 *
 * @param {object} ts    Timing record; this function reads `ts.pendingLabels`
 *                       (array of pass labels, one per begin/end pair) and
 *                       reads/writes `ts.lastLog` (ms timestamp of last log).
 * @param {BigInt64Array} data  Timestamps laid out as
 *                       [begin0, end0, begin1, end1, ...]; deltas are treated
 *                       as nanoseconds and reported in microseconds.
 * @returns {void}
 */
function wfLogTimestamps(ts, data) {
  // Bail out before touching the throttle: a call with nothing to report
  // must not consume the one-second logging window.
  const labels = ts.pendingLabels;
  if (!labels) return;

  const now = Date.now();
  if (now - ts.lastLog < 1000) return; // throttle to ~1/sec
  ts.lastLog = now;

  // Only walk pairs that are actually present in the readback, so a short
  // buffer cannot produce NaN sums or a BigInt/undefined TypeError.
  const pairCount = Math.min(labels.length, Math.floor(data.length / 2));

  const sums = new Map(); // label → accumulated ns
  let totalNs = 0;
  for (let i = 0; i < pairCount; i++) {
    const dt = Number(data[2 * i + 1] - data[2 * i]);
    // Skip pairs whose end precedes begin rather than skewing the sums.
    if (dt < 0) continue;
    const label = labels[i];
    sums.set(label, (sums.get(label) || 0) + dt);
    totalNs += dt;
  }

  // Known passes are printed in pipeline order; any other label follows in
  // first-seen order so the listed parts always add up to the total.
  const order = ["GENERATE", "PREP", "TRACE", "SHADE", "RESOLVE"];
  const known = order.filter((k) => sums.has(k));
  const extra = [...sums.keys()].filter((k) => !order.includes(k));
  const parts = [...known, ...extra]
    .map((k) => `${k} ${(sums.get(k) / 1000).toFixed(1)}us`);

  console.log(`[crafter-wgpu] RT passes: ${parts.join(" | ")} | total ${(totalNs / 1000).toFixed(1)}us`);
}
|
||||||
|
|
||||||
env.wgpuLoadRTPipeline = (wgslPtr, wgslLen, bindingsPtr, bindingsCount) => {
|
env.wgpuLoadRTPipeline = (wgslPtr, wgslLen, bindingsPtr, bindingsCount) => {
|
||||||
if (!rtState.vertHeap) rtInit();
|
if (!rtState.vertHeap) rtInit();
|
||||||
const userPart = new TextDecoder().decode(memU8().subarray(wgslPtr, wgslPtr + wgslLen));
|
const userPart = new TextDecoder().decode(memU8().subarray(wgslPtr, wgslPtr + wgslLen));
|
||||||
|
|
@ -3187,9 +3252,27 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
|
||||||
|
|
||||||
const setUser = (pass) => { for (const u of userBgs) pass.setBindGroup(u.group, u.bindGroup); };
|
const setUser = (pass) => { for (const u of userBgs) pass.setBindGroup(u.group, u.bindGroup); };
|
||||||
|
|
||||||
|
// GPU timing: write begin/end timestamps around each pass (2 query
|
||||||
|
// slots per pass), then resolve + read back after submit.
|
||||||
|
const ts = wfEnsureTimestamps();
|
||||||
|
const capture = !!(ts && !ts.inFlight);
|
||||||
|
const tsLabels = [];
|
||||||
|
const beginPass = (label, tsName) => {
|
||||||
|
const desc = { label };
|
||||||
|
if (capture && tsLabels.length < WF_TS_MAX_PASSES) {
|
||||||
|
desc.timestampWrites = {
|
||||||
|
querySet: ts.querySet,
|
||||||
|
beginningOfPassWriteIndex: 2 * tsLabels.length,
|
||||||
|
endOfPassWriteIndex: 2 * tsLabels.length + 1,
|
||||||
|
};
|
||||||
|
tsLabels.push(tsName);
|
||||||
|
}
|
||||||
|
return enc.beginComputePass(desc);
|
||||||
|
};
|
||||||
|
|
||||||
// GENERATE
|
// GENERATE
|
||||||
{
|
{
|
||||||
const p = enc.beginComputePass({ label: "wf-generate" });
|
const p = beginPass("wf-generate", "GENERATE");
|
||||||
p.setPipeline(pipe.genPipe);
|
p.setPipeline(pipe.genPipe);
|
||||||
p.setBindGroup(0, paramsBg, [slotOff(0)]);
|
p.setBindGroup(0, paramsBg, [slotOff(0)]);
|
||||||
p.setBindGroup(1, dataBg);
|
p.setBindGroup(1, dataBg);
|
||||||
|
|
@ -3203,7 +3286,7 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
|
||||||
const shadeSlot = 1 + 3 * d + 2;
|
const shadeSlot = 1 + 3 * d + 2;
|
||||||
// PREP — publish indirect args, zero next counter.
|
// PREP — publish indirect args, zero next counter.
|
||||||
{
|
{
|
||||||
const p = enc.beginComputePass({ label: "wf-prep" });
|
const p = beginPass("wf-prep", "PREP");
|
||||||
p.setPipeline(pipe.prepPipe);
|
p.setPipeline(pipe.prepPipe);
|
||||||
p.setBindGroup(0, paramsBg, [slotOff(prepSlot)]);
|
p.setBindGroup(0, paramsBg, [slotOff(prepSlot)]);
|
||||||
p.setBindGroup(1, dataBg);
|
p.setBindGroup(1, dataBg);
|
||||||
|
|
@ -3213,7 +3296,7 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
|
||||||
}
|
}
|
||||||
// TRACE — indirect over the live ray list.
|
// TRACE — indirect over the live ray list.
|
||||||
{
|
{
|
||||||
const p = enc.beginComputePass({ label: "wf-trace" });
|
const p = beginPass("wf-trace", "TRACE");
|
||||||
p.setPipeline(pipe.tracePipe);
|
p.setPipeline(pipe.tracePipe);
|
||||||
p.setBindGroup(0, paramsBg, [slotOff(traceSlot)]);
|
p.setBindGroup(0, paramsBg, [slotOff(traceSlot)]);
|
||||||
p.setBindGroup(1, dataBg);
|
p.setBindGroup(1, dataBg);
|
||||||
|
|
@ -3222,7 +3305,7 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
|
||||||
}
|
}
|
||||||
// SHADE — indirect; runs user closesthit/miss, may emit + accumulate.
|
// SHADE — indirect; runs user closesthit/miss, may emit + accumulate.
|
||||||
{
|
{
|
||||||
const p = enc.beginComputePass({ label: "wf-shade" });
|
const p = beginPass("wf-shade", "SHADE");
|
||||||
p.setPipeline(pipe.shadePipe);
|
p.setPipeline(pipe.shadePipe);
|
||||||
p.setBindGroup(0, paramsBg, [slotOff(shadeSlot)]);
|
p.setBindGroup(0, paramsBg, [slotOff(shadeSlot)]);
|
||||||
p.setBindGroup(1, dataBg);
|
p.setBindGroup(1, dataBg);
|
||||||
|
|
@ -3233,7 +3316,7 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
|
||||||
}
|
}
|
||||||
// RESOLVE — tonemap accum → output image.
|
// RESOLVE — tonemap accum → output image.
|
||||||
{
|
{
|
||||||
const p = enc.beginComputePass({ label: "wf-resolve" });
|
const p = beginPass("wf-resolve", "RESOLVE");
|
||||||
p.setPipeline(pipe.resolvePipe);
|
p.setPipeline(pipe.resolvePipe);
|
||||||
p.setBindGroup(0, paramsBg, [slotOff(1 + 3 * depth)]);
|
p.setBindGroup(0, paramsBg, [slotOff(1 + 3 * depth)]);
|
||||||
p.setBindGroup(1, dataBg);
|
p.setBindGroup(1, dataBg);
|
||||||
|
|
@ -3242,6 +3325,14 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
|
||||||
p.end();
|
p.end();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (capture && tsLabels.length > 0) {
|
||||||
|
enc.resolveQuerySet(ts.querySet, 0, 2 * tsLabels.length, ts.resolveBuf, 0);
|
||||||
|
enc.copyBufferToBuffer(ts.resolveBuf, 0, ts.readBuf, 0, 2 * tsLabels.length * 8);
|
||||||
|
ts.inFlight = true;
|
||||||
|
ts.pendingLabels = tsLabels;
|
||||||
|
state.tsReadPending = ts;
|
||||||
|
}
|
||||||
|
|
||||||
// Reopen the frame's shared pass so wgpuFrameEnd / later UI work as
|
// Reopen the frame's shared pass so wgpuFrameEnd / later UI work as
|
||||||
// before, and flip ping-pong so the blit picks the texture RESOLVE wrote.
|
// before, and flip ping-pong so the blit picks the texture RESOLVE wrote.
|
||||||
state.pass = enc.beginComputePass();
|
state.pass = enc.beginComputePass();
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue