WebGPU RT: GPU timestamp-query per-pass harness
Request the timestamp-query feature; write begin/end timestamps around each wavefront pass via timestampWrites; resolve + read back (deferred to after submit) and print a per-pass us breakdown ~1x/sec. RTStress @ 512 instances, 1920x995: TRACE dominates, total ~1.8-3.0ms/frame. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
f4d6493d91
commit
1d2e12dbc9
1 changed files with 97 additions and 6 deletions
|
|
@ -146,7 +146,10 @@ clamp("maxComputeWorkgroupSizeX", 1024);
|
|||
clamp("maxBufferSize", 1 << 30);
|
||||
clamp("maxStorageBufferBindingSize", 1 << 30);
|
||||
clamp("maxComputeWorkgroupsPerDimension", 65535);
|
||||
const device = await adapter.requestDevice({ requiredLimits });
|
||||
// Per-pass GPU timing for the wavefront tracer (RTStress HUD / PR numbers).
|
||||
const tsSupported = adapter.features && adapter.features.has("timestamp-query");
|
||||
const requiredFeatures = tsSupported ? ["timestamp-query"] : [];
|
||||
const device = await adapter.requestDevice({ requiredLimits, requiredFeatures });
|
||||
const queue = device.queue;
|
||||
const ctx = canvas.getContext("webgpu");
|
||||
const canvasFormat = "rgba8unorm"; // match storage textures, skip swizzle blit
|
||||
|
|
@ -935,6 +938,23 @@ env.wgpuFrameEnd = () => {
|
|||
queue.submit([state.encoder.finish()]);
|
||||
state.encoder = null;
|
||||
|
||||
// Map the wavefront timestamp readback (its resolve/copy was encoded on
|
||||
// the just-submitted encoder) and log a per-pass breakdown ~1×/sec.
|
||||
if (state.tsReadPending) {
|
||||
const ts = state.tsReadPending;
|
||||
state.tsReadPending = null;
|
||||
const n = ts.pendingLabels.length;
|
||||
ts.readBuf.mapAsync(GPUMapMode.READ, 0, 2 * n * 8).then(() => {
|
||||
const data = new BigInt64Array(ts.readBuf.getMappedRange(0, 2 * n * 8).slice(0));
|
||||
ts.readBuf.unmap();
|
||||
ts.inFlight = false;
|
||||
wfLogTimestamps(ts, data);
|
||||
}).catch((e) => {
|
||||
ts.inFlight = false;
|
||||
console.error("[crafter-wgpu] timestamp readback failed:", e);
|
||||
});
|
||||
}
|
||||
|
||||
// Kick off mapAsync for the readbacks whose copyBufferToBuffer we
|
||||
// piggy-backed onto the just-submitted encoder. Doing this after
|
||||
// submit ensures the map waits for that submission's GPU work to
|
||||
|
|
@ -2955,6 +2975,51 @@ function ensureWavefrontBuffers(W, H) {
|
|||
return wf;
|
||||
}
|
||||
|
||||
// ── GPU timestamp-query harness ──────────────────────────────────────────
|
||||
//
|
||||
// One QuerySet with 2 slots per wavefront pass; each beginComputePass writes
|
||||
// begin/end timestamps. After the passes we resolve into a buffer and read
|
||||
// it back (deferred to after submit, like the readback path). Deltas are
|
||||
// summed per pass label and printed ~1×/sec as a per-pass breakdown.
|
||||
const WF_TS_MAX_PASSES = 64; // covers maxDepth up to ~20
|
||||
/**
 * Lazily builds (and caches on `rtState.ts`) the timestamp-query resources
 * used to time the wavefront passes.
 *
 * @returns {object|null} `null` when `tsSupported` is falsy; otherwise the
 *   cached harness state with fields `capacity`, `querySet`, `resolveBuf`,
 *   `readBuf`, `inFlight`, `lastLog` and `pendingLabels`.
 */
function wfEnsureTimestamps() {
  if (!tsSupported) {
    return null;
  }

  if (!rtState.ts) {
    // Two query slots (begin + end) per pass.
    const slotCount = 2 * WF_TS_MAX_PASSES;
    // Both buffers are sized at 8 bytes per query slot.
    const byteSize = slotCount * 8;

    const querySet = device.createQuerySet({ type: "timestamp", count: slotCount });
    // Resolve target; also a copy source so it can be copied into readBuf.
    const resolveBuf = device.createBuffer({
      size: byteSize,
      usage: GPUBufferUsage.QUERY_RESOLVE | GPUBufferUsage.COPY_SRC,
    });
    // Mappable copy destination for reading the results back.
    const readBuf = device.createBuffer({
      size: byteSize,
      usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST,
    });

    rtState.ts = {
      capacity: slotCount,
      querySet,
      resolveBuf,
      readBuf,
      inFlight: false,
      lastLog: 0,
      pendingLabels: null,
    };
  }

  return rtState.ts;
}
|
||||
/**
 * Prints a per-pass GPU time breakdown, throttled to at most one line per
 * second.
 *
 * Deltas (end - begin) are summed per label, so passes that share a label
 * are reported as a single figure.
 *
 * @param {object} ts - Timestamp harness state. Reads `ts.pendingLabels`
 *   (one label per captured pass, in capture order) and reads/writes
 *   `ts.lastLog` (Date.now() of the last printed line).
 * @param {BigInt64Array} data - ns timestamps laid out as
 *   [begin0, end0, begin1, end1, ...].
 * @returns {void}
 */
function wfLogTimestamps(ts, data) {
  const labels = ts.pendingLabels;
  // Validate before touching the throttle so a label-less call does not
  // burn the one-per-second log window.
  if (!labels) return;

  const now = Date.now();
  if (now - ts.lastLog < 1000) return; // throttle to ~1/sec
  ts.lastLog = now;

  // Never read past the end of `data`: a missing pair would otherwise turn
  // into NaN and poison both the per-label sum and the total.
  const passCount = Math.min(labels.length, Math.floor(data.length / 2));

  const sums = new Map(); // label -> ns
  let totalNs = 0;
  for (let i = 0; i < passCount; i++) {
    const dt = Number(data[2 * i + 1] - data[2 * i]);
    // Skip pairs where end precedes begin (and anything non-numeric).
    if (!(dt >= 0)) continue;
    const label = labels[i];
    sums.set(label, (sums.get(label) ?? 0) + dt);
    totalNs += dt;
  }

  // Known passes print in pipeline order; any other label follows in
  // first-seen order so the breakdown always adds up to the printed total.
  const knownOrder = ["GENERATE", "PREP", "TRACE", "SHADE", "RESOLVE"];
  const known = knownOrder.filter((k) => sums.has(k));
  const extras = [...sums.keys()].filter((k) => !knownOrder.includes(k));
  const parts = [...known, ...extras]
    .map((k) => `${k} ${(sums.get(k) / 1000).toFixed(1)}us`);

  console.log(`[crafter-wgpu] RT passes: ${parts.join(" | ")} | total ${(totalNs / 1000).toFixed(1)}us`);
}
|
||||
|
||||
env.wgpuLoadRTPipeline = (wgslPtr, wgslLen, bindingsPtr, bindingsCount) => {
|
||||
if (!rtState.vertHeap) rtInit();
|
||||
const userPart = new TextDecoder().decode(memU8().subarray(wgslPtr, wgslPtr + wgslLen));
|
||||
|
|
@ -3187,9 +3252,27 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
|
|||
|
||||
const setUser = (pass) => { for (const u of userBgs) pass.setBindGroup(u.group, u.bindGroup); };
|
||||
|
||||
// GPU timing: write begin/end timestamps around each pass (2 query
|
||||
// slots per pass), then resolve + read back after submit.
|
||||
const ts = wfEnsureTimestamps();
|
||||
const capture = !!(ts && !ts.inFlight);
|
||||
const tsLabels = [];
|
||||
const beginPass = (label, tsName) => {
|
||||
const desc = { label };
|
||||
if (capture && tsLabels.length < WF_TS_MAX_PASSES) {
|
||||
desc.timestampWrites = {
|
||||
querySet: ts.querySet,
|
||||
beginningOfPassWriteIndex: 2 * tsLabels.length,
|
||||
endOfPassWriteIndex: 2 * tsLabels.length + 1,
|
||||
};
|
||||
tsLabels.push(tsName);
|
||||
}
|
||||
return enc.beginComputePass(desc);
|
||||
};
|
||||
|
||||
// GENERATE
|
||||
{
|
||||
const p = enc.beginComputePass({ label: "wf-generate" });
|
||||
const p = beginPass("wf-generate", "GENERATE");
|
||||
p.setPipeline(pipe.genPipe);
|
||||
p.setBindGroup(0, paramsBg, [slotOff(0)]);
|
||||
p.setBindGroup(1, dataBg);
|
||||
|
|
@ -3203,7 +3286,7 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
|
|||
const shadeSlot = 1 + 3 * d + 2;
|
||||
// PREP — publish indirect args, zero next counter.
|
||||
{
|
||||
const p = enc.beginComputePass({ label: "wf-prep" });
|
||||
const p = beginPass("wf-prep", "PREP");
|
||||
p.setPipeline(pipe.prepPipe);
|
||||
p.setBindGroup(0, paramsBg, [slotOff(prepSlot)]);
|
||||
p.setBindGroup(1, dataBg);
|
||||
|
|
@ -3213,7 +3296,7 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
|
|||
}
|
||||
// TRACE — indirect over the live ray list.
|
||||
{
|
||||
const p = enc.beginComputePass({ label: "wf-trace" });
|
||||
const p = beginPass("wf-trace", "TRACE");
|
||||
p.setPipeline(pipe.tracePipe);
|
||||
p.setBindGroup(0, paramsBg, [slotOff(traceSlot)]);
|
||||
p.setBindGroup(1, dataBg);
|
||||
|
|
@ -3222,7 +3305,7 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
|
|||
}
|
||||
// SHADE — indirect; runs user closesthit/miss, may emit + accumulate.
|
||||
{
|
||||
const p = enc.beginComputePass({ label: "wf-shade" });
|
||||
const p = beginPass("wf-shade", "SHADE");
|
||||
p.setPipeline(pipe.shadePipe);
|
||||
p.setBindGroup(0, paramsBg, [slotOff(shadeSlot)]);
|
||||
p.setBindGroup(1, dataBg);
|
||||
|
|
@ -3233,7 +3316,7 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
|
|||
}
|
||||
// RESOLVE — tonemap accum → output image.
|
||||
{
|
||||
const p = enc.beginComputePass({ label: "wf-resolve" });
|
||||
const p = beginPass("wf-resolve", "RESOLVE");
|
||||
p.setPipeline(pipe.resolvePipe);
|
||||
p.setBindGroup(0, paramsBg, [slotOff(1 + 3 * depth)]);
|
||||
p.setBindGroup(1, dataBg);
|
||||
|
|
@ -3242,6 +3325,14 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
|
|||
p.end();
|
||||
}
|
||||
|
||||
if (capture && tsLabels.length > 0) {
|
||||
enc.resolveQuerySet(ts.querySet, 0, 2 * tsLabels.length, ts.resolveBuf, 0);
|
||||
enc.copyBufferToBuffer(ts.resolveBuf, 0, ts.readBuf, 0, 2 * tsLabels.length * 8);
|
||||
ts.inFlight = true;
|
||||
ts.pendingLabels = tsLabels;
|
||||
state.tsReadPending = ts;
|
||||
}
|
||||
|
||||
// Reopen the frame's shared pass so wgpuFrameEnd / later UI work as
|
||||
// before, and flip ping-pong so the blit picks the texture RESOLVE wrote.
|
||||
state.pass = enc.beginComputePass();
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue