WebGPU RT: wavefront/streaming tracer (replaces megakernel) #4

Merged
catbot merged 8 commits from claude/issue-3 into master 2026-05-31 22:31:35 +02:00
Showing only changes of commit 1d2e12dbc9 - Show all commits

WebGPU RT: GPU timestamp-query per-pass harness

Request the timestamp-query feature when the adapter supports it; write
begin/end timestamps around each wavefront pass via timestampWrites; resolve
and read them back (the map is deferred until after submit) and print a
per-pass breakdown in microseconds ~1x/sec. RTStress @ 512
instances, 1920x995: TRACE dominates, total ~1.8-3.0ms/frame.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
catbot 2026-05-31 20:08:39 +00:00

View file

@ -146,7 +146,10 @@ clamp("maxComputeWorkgroupSizeX", 1024);
clamp("maxBufferSize", 1 << 30); clamp("maxBufferSize", 1 << 30);
clamp("maxStorageBufferBindingSize", 1 << 30); clamp("maxStorageBufferBindingSize", 1 << 30);
clamp("maxComputeWorkgroupsPerDimension", 65535); clamp("maxComputeWorkgroupsPerDimension", 65535);
const device = await adapter.requestDevice({ requiredLimits }); // Per-pass GPU timing for the wavefront tracer (RTStress HUD / PR numbers).
const tsSupported = adapter.features && adapter.features.has("timestamp-query");
const requiredFeatures = tsSupported ? ["timestamp-query"] : [];
const device = await adapter.requestDevice({ requiredLimits, requiredFeatures });
const queue = device.queue; const queue = device.queue;
const ctx = canvas.getContext("webgpu"); const ctx = canvas.getContext("webgpu");
const canvasFormat = "rgba8unorm"; // match storage textures, skip swizzle blit const canvasFormat = "rgba8unorm"; // match storage textures, skip swizzle blit
@ -935,6 +938,23 @@ env.wgpuFrameEnd = () => {
queue.submit([state.encoder.finish()]); queue.submit([state.encoder.finish()]);
state.encoder = null; state.encoder = null;
// Map the wavefront timestamp readback (its resolve/copy was encoded on
// the just-submitted encoder) and log a per-pass breakdown ~1×/sec.
if (state.tsReadPending) {
const ts = state.tsReadPending;
state.tsReadPending = null;
const n = ts.pendingLabels.length;
ts.readBuf.mapAsync(GPUMapMode.READ, 0, 2 * n * 8).then(() => {
const data = new BigInt64Array(ts.readBuf.getMappedRange(0, 2 * n * 8).slice(0));
ts.readBuf.unmap();
ts.inFlight = false;
wfLogTimestamps(ts, data);
}).catch((e) => {
ts.inFlight = false;
console.error("[crafter-wgpu] timestamp readback failed:", e);
});
}
// Kick off mapAsync for the readbacks whose copyBufferToBuffer we // Kick off mapAsync for the readbacks whose copyBufferToBuffer we
// piggy-backed onto the just-submitted encoder. Doing this after // piggy-backed onto the just-submitted encoder. Doing this after
// submit ensures the map waits for that submission's GPU work to // submit ensures the map waits for that submission's GPU work to
@ -2955,6 +2975,51 @@ function ensureWavefrontBuffers(W, H) {
return wf; return wf;
} }
// ── GPU timestamp-query harness ──────────────────────────────────────────
//
// One QuerySet with 2 slots per wavefront pass; each beginComputePass writes
// begin/end timestamps. After the passes we resolve into a buffer and read
// it back (deferred to after submit, like the readback path). Deltas are
// summed per pass label and printed ~1×/sec as a per-pass breakdown.
// Upper bound on timed passes per dispatch; covers maxDepth up to ~20.
const WF_TS_MAX_PASSES = 64;

/**
 * Return the timestamp-query harness state, creating it on first use.
 *
 * The state is cached on `rtState.ts`, so the GPU objects are created once
 * and every later call hands back the same object.
 *
 * @returns {?object} `null` when `tsSupported` is falsy; otherwise an object
 *   holding `capacity` (query slots, two per pass), `querySet`, `resolveBuf`,
 *   `readBuf`, and the bookkeeping fields `inFlight`, `lastLog` and
 *   `pendingLabels`.
 */
function wfEnsureTimestamps() {
    if (!tsSupported) {
        return null;
    }
    if (!rtState.ts) {
        const capacity = WF_TS_MAX_PASSES * 2; // begin + end slot per pass
        const byteSize = capacity * 8;         // one 64-bit value per slot
        const querySet = device.createQuerySet({ type: "timestamp", count: capacity });
        // Target of resolveQuerySet; also the source of the copy into readBuf.
        const resolveBuf = device.createBuffer({
            size: byteSize,
            usage: GPUBufferUsage.QUERY_RESOLVE | GPUBufferUsage.COPY_SRC,
        });
        // CPU-mappable copy of the resolved timestamps.
        const readBuf = device.createBuffer({
            size: byteSize,
            usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST,
        });
        rtState.ts = {
            capacity,
            querySet,
            resolveBuf,
            readBuf,
            inFlight: false,
            lastLog: 0,
            pendingLabels: null,
        };
    }
    return rtState.ts;
}
/**
 * Print a per-pass GPU timing breakdown to the console, at most ~once per
 * second.
 *
 * Durations are summed per label, so a label that occurs several times
 * (e.g. one PREP per bounce) is reported as a single figure.
 *
 * @param {{lastLog: number, pendingLabels: ?string[]}} ts - harness state.
 *   `lastLog` is overwritten with the current time whenever the throttle
 *   lets a call through; `pendingLabels[i]` names the pass whose timestamps
 *   sit at `data[2*i]` / `data[2*i + 1]`.
 * @param {BigInt64Array} data - ns timestamps laid out as
 *   [begin0, end0, begin1, end1, ...].
 * @returns {void}
 */
function wfLogTimestamps(ts, data) {
    const now = Date.now();
    if (now - ts.lastLog < 1000) return; // throttle to ~1/sec
    ts.lastLog = now;
    const labels = ts.pendingLabels;
    if (!labels) return;

    // Only walk pairs that are really present: an out-of-range read on a
    // typed array yields undefined, which would turn the sums into NaN.
    const pairCount = Math.min(labels.length, Math.floor(data.length / 2));

    const sums = new Map(); // label → ns
    let totalNs = 0;
    for (let i = 0; i < pairCount; i++) {
        const dt = Number(data[2 * i + 1] - data[2 * i]);
        if (dt < 0) continue; // end before begin: drop the pair instead of skewing the sums
        sums.set(labels[i], (sums.get(labels[i]) ?? 0) + dt);
        totalNs += dt;
    }

    // Known passes print in pipeline order. Any other label follows in
    // first-seen order, so everything counted in the total is also listed.
    const pipelineOrder = ["GENERATE", "PREP", "TRACE", "SHADE", "RESOLVE"];
    const printOrder = [
        ...pipelineOrder.filter((k) => sums.has(k)),
        ...[...sums.keys()].filter((k) => !pipelineOrder.includes(k)),
    ];
    const fmtUs = (ns) => `${(ns / 1000).toFixed(1)}us`;
    const parts = printOrder.map((k) => `${k} ${fmtUs(sums.get(k))}`);
    console.log(`[crafter-wgpu] RT passes: ${parts.join(" | ")} | total ${fmtUs(totalNs)}`);
}
env.wgpuLoadRTPipeline = (wgslPtr, wgslLen, bindingsPtr, bindingsCount) => { env.wgpuLoadRTPipeline = (wgslPtr, wgslLen, bindingsPtr, bindingsCount) => {
if (!rtState.vertHeap) rtInit(); if (!rtState.vertHeap) rtInit();
const userPart = new TextDecoder().decode(memU8().subarray(wgslPtr, wgslPtr + wgslLen)); const userPart = new TextDecoder().decode(memU8().subarray(wgslPtr, wgslPtr + wgslLen));
@ -3187,9 +3252,27 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
const setUser = (pass) => { for (const u of userBgs) pass.setBindGroup(u.group, u.bindGroup); }; const setUser = (pass) => { for (const u of userBgs) pass.setBindGroup(u.group, u.bindGroup); };
// GPU timing: write begin/end timestamps around each pass (2 query
// slots per pass), then resolve + read back after submit.
const ts = wfEnsureTimestamps();
const capture = !!(ts && !ts.inFlight);
const tsLabels = [];
// Open a compute pass on `enc`. While a capture is active and query slots
// remain, the pass also gets begin/end timestamp writes and `tsName` is
// appended to `tsLabels`, so entry i of `tsLabels` owns query slots 2i and
// 2i + 1. Otherwise the pass is opened untimed.
const beginPass = (label, tsName) => {
    const passIndex = tsLabels.length;
    if (!capture || passIndex >= WF_TS_MAX_PASSES) {
        return enc.beginComputePass({ label });
    }
    const timestampWrites = {
        querySet: ts.querySet,
        beginningOfPassWriteIndex: 2 * passIndex,
        endOfPassWriteIndex: 2 * passIndex + 1,
    };
    tsLabels.push(tsName);
    return enc.beginComputePass({ label, timestampWrites });
};
// GENERATE // GENERATE
{ {
const p = enc.beginComputePass({ label: "wf-generate" }); const p = beginPass("wf-generate", "GENERATE");
p.setPipeline(pipe.genPipe); p.setPipeline(pipe.genPipe);
p.setBindGroup(0, paramsBg, [slotOff(0)]); p.setBindGroup(0, paramsBg, [slotOff(0)]);
p.setBindGroup(1, dataBg); p.setBindGroup(1, dataBg);
@ -3203,7 +3286,7 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
const shadeSlot = 1 + 3 * d + 2; const shadeSlot = 1 + 3 * d + 2;
// PREP — publish indirect args, zero next counter. // PREP — publish indirect args, zero next counter.
{ {
const p = enc.beginComputePass({ label: "wf-prep" }); const p = beginPass("wf-prep", "PREP");
p.setPipeline(pipe.prepPipe); p.setPipeline(pipe.prepPipe);
p.setBindGroup(0, paramsBg, [slotOff(prepSlot)]); p.setBindGroup(0, paramsBg, [slotOff(prepSlot)]);
p.setBindGroup(1, dataBg); p.setBindGroup(1, dataBg);
@ -3213,7 +3296,7 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
} }
// TRACE — indirect over the live ray list. // TRACE — indirect over the live ray list.
{ {
const p = enc.beginComputePass({ label: "wf-trace" }); const p = beginPass("wf-trace", "TRACE");
p.setPipeline(pipe.tracePipe); p.setPipeline(pipe.tracePipe);
p.setBindGroup(0, paramsBg, [slotOff(traceSlot)]); p.setBindGroup(0, paramsBg, [slotOff(traceSlot)]);
p.setBindGroup(1, dataBg); p.setBindGroup(1, dataBg);
@ -3222,7 +3305,7 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
} }
// SHADE — indirect; runs user closesthit/miss, may emit + accumulate. // SHADE — indirect; runs user closesthit/miss, may emit + accumulate.
{ {
const p = enc.beginComputePass({ label: "wf-shade" }); const p = beginPass("wf-shade", "SHADE");
p.setPipeline(pipe.shadePipe); p.setPipeline(pipe.shadePipe);
p.setBindGroup(0, paramsBg, [slotOff(shadeSlot)]); p.setBindGroup(0, paramsBg, [slotOff(shadeSlot)]);
p.setBindGroup(1, dataBg); p.setBindGroup(1, dataBg);
@ -3233,7 +3316,7 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
} }
// RESOLVE — tonemap accum → output image. // RESOLVE — tonemap accum → output image.
{ {
const p = enc.beginComputePass({ label: "wf-resolve" }); const p = beginPass("wf-resolve", "RESOLVE");
p.setPipeline(pipe.resolvePipe); p.setPipeline(pipe.resolvePipe);
p.setBindGroup(0, paramsBg, [slotOff(1 + 3 * depth)]); p.setBindGroup(0, paramsBg, [slotOff(1 + 3 * depth)]);
p.setBindGroup(1, dataBg); p.setBindGroup(1, dataBg);
@ -3242,6 +3325,14 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
p.end(); p.end();
} }
if (capture && tsLabels.length > 0) {
enc.resolveQuerySet(ts.querySet, 0, 2 * tsLabels.length, ts.resolveBuf, 0);
enc.copyBufferToBuffer(ts.resolveBuf, 0, ts.readBuf, 0, 2 * tsLabels.length * 8);
ts.inFlight = true;
ts.pendingLabels = tsLabels;
state.tsReadPending = ts;
}
// Reopen the frame's shared pass so wgpuFrameEnd / later UI work as // Reopen the frame's shared pass so wgpuFrameEnd / later UI work as
// before, and flip ping-pong so the blit picks the texture RESOLVE wrote. // before, and flip ping-pong so the blit picks the texture RESOLVE wrote.
state.pass = enc.beginComputePass(); state.pass = enc.beginComputePass();