WebGPU RT: GPU timestamp-query per-pass harness

Request the timestamp-query feature; write begin/end timestamps around each wavefront pass via timestampWrites; resolve + read back (deferred to after submit) and print a per-pass us breakdown ~1x/sec. RTStress @ 512 instances, 1920x995: TRACE dominates, total ~1.8-3.0ms/frame. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-31 22:31:35 +02:00 · 2026-05-31 20:08:39 +00:00 · 2026-05-31 20:08:39 +00:00
commit 1d2e12dbc9
1 changed files with 97 additions and 6 deletions
--- a/additional/dom-webgpu.js
+++ b/additional/dom-webgpu.js
@ -146,7 +146,10 @@ clamp("maxComputeWorkgroupSizeX", 1024);
 clamp("maxBufferSize", 1 << 30);
 clamp("maxStorageBufferBindingSize", 1 << 30);
 clamp("maxComputeWorkgroupsPerDimension", 65535);
-const device  = await adapter.requestDevice({ requiredLimits });
+// Per-pass GPU timing for the wavefront tracer (RTStress HUD / PR numbers).
 const tsSupported = adapter.features && adapter.features.has("timestamp-query");
 const requiredFeatures = tsSupported ? ["timestamp-query"] : [];
 const device  = await adapter.requestDevice({ requiredLimits, requiredFeatures });
 const queue   = device.queue;
 const ctx     = canvas.getContext("webgpu");
 const canvasFormat = "rgba8unorm"; // match storage textures, skip swizzle blit
@ -935,6 +938,23 @@ env.wgpuFrameEnd = () => {
    queue.submit([state.encoder.finish()]);
    state.encoder = null;
    // Map the wavefront timestamp readback (its resolve/copy was encoded on
    // the just-submitted encoder) and log a per-pass breakdown ~1×/sec.
    if (state.tsReadPending) {
        const ts = state.tsReadPending;
        state.tsReadPending = null;
        const n = ts.pendingLabels.length;
        ts.readBuf.mapAsync(GPUMapMode.READ, 0, 2 * n * 8).then(() => {
            const data = new BigInt64Array(ts.readBuf.getMappedRange(0, 2 * n * 8).slice(0));
            ts.readBuf.unmap();
            ts.inFlight = false;
            wfLogTimestamps(ts, data);
        }).catch((e) => {
            ts.inFlight = false;
            console.error("[crafter-wgpu] timestamp readback failed:", e);
        });
    }
    // Kick off mapAsync for the readbacks whose copyBufferToBuffer we
    // piggy-backed onto the just-submitted encoder. Doing this after
    // submit ensures the map waits for that submission's GPU work to
@ -2955,6 +2975,51 @@ function ensureWavefrontBuffers(W, H) {
    return wf;
 }
 // ── GPU timestamp-query harness ──────────────────────────────────────────
 //
 // One QuerySet with 2 slots per wavefront pass; each beginComputePass writes
 // begin/end timestamps. After the passes we resolve into a buffer and read
 // it back (deferred to after submit, like the readback path). Deltas are
 // summed per pass label and printed ~1×/sec as a per-pass breakdown.
 // Upper bound on timed passes per dispatch; each pass consumes two query
 // slots (begin + end). Covers maxDepth up to ~20.
 const WF_TS_MAX_PASSES = 64;
 // Lazily creates (once) and returns the shared timestamp-query state stored
 // on rtState.ts: the query set, a resolve buffer, a mappable readback buffer
 // and the bookkeeping fields used by the capture / logging code. Returns
 // null when the adapter did not expose "timestamp-query".
 function wfEnsureTimestamps() {
    if (!tsSupported) {
        return null;
    }
    if (!rtState.ts) {
        const slotCount = 2 * WF_TS_MAX_PASSES;
        const byteSize = slotCount * 8;   // one 64-bit value per query slot
        const querySet = device.createQuerySet({ type: "timestamp", count: slotCount });
        // Destination of resolveQuerySet, then source of the copy into readBuf.
        const resolveBuf = device.createBuffer({
            size: byteSize,
            usage: GPUBufferUsage.QUERY_RESOLVE | GPUBufferUsage.COPY_SRC,
        });
        // CPU-mappable copy target that the readback maps after submit.
        const readBuf = device.createBuffer({
            size: byteSize,
            usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST,
        });
        rtState.ts = {
            capacity: slotCount,
            querySet,
            resolveBuf,
            readBuf,
            inFlight: false,      // true while a readback is outstanding
            lastLog: 0,           // Date.now() of the last printed breakdown
            pendingLabels: null,  // labels of the passes in the outstanding capture
        };
    }
    return rtState.ts;
 }
 // Prints one per-pass GPU timing line for a completed timestamp readback,
 // at most once per second.
 //
 //   ts   — timestamp state; reads ts.pendingLabels (one label per timed
 //          pass, in encode order) and reads/writes ts.lastLog.
 //   data — 64-bit integer typed array of ns timestamps laid out as
 //          [begin0, end0, begin1, end1, ...].
 //
 // Deltas are summed per label. The five wavefront stages are printed in
 // pipeline order; any other label follows in first-seen order so that the
 // printed parts always add up to the printed total.
 function wfLogTimestamps(ts, data) {
    const labels = ts.pendingLabels;
    // Nothing to report: bail out before consuming the throttle window.
    if (!labels) return;
    const now = Date.now();
    if (now - ts.lastLog < 1000) return;   // throttle to ~1/sec
    ts.lastLog = now;
    // Only walk pairs that are really present in `data`; reading past the
    // end would yield undefined and poison the sums with NaN.
    const passCount = Math.min(labels.length, Math.floor(data.length / 2));
    const sums = new Map();   // label → ns
    let totalNs = 0;
    for (let i = 0; i < passCount; i++) {
        const dt = Number(data[2 * i + 1] - data[2 * i]);
        if (dt < 0) continue;   // end before begin: discard the sample
        sums.set(labels[i], (sums.get(labels[i]) || 0) + dt);
        totalNs += dt;
    }
    const order = ["GENERATE", "PREP", "TRACE", "SHADE", "RESOLVE"];
    const known = order.filter((k) => sums.has(k));
    const extras = [...sums.keys()].filter((k) => !order.includes(k));
    const parts = [...known, ...extras]
        .map((k) => `${k} ${(sums.get(k) / 1000).toFixed(1)}us`);
    console.log(`[crafter-wgpu] RT passes: ${parts.join(" | ")} | total ${(totalNs/1000).toFixed(1)}us`);
 }
 env.wgpuLoadRTPipeline = (wgslPtr, wgslLen, bindingsPtr, bindingsCount) => {
    if (!rtState.vertHeap) rtInit();
    const userPart = new TextDecoder().decode(memU8().subarray(wgslPtr, wgslPtr + wgslLen));
@ -3187,9 +3252,27 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
    const setUser = (pass) => { for (const u of userBgs) pass.setBindGroup(u.group, u.bindGroup); };
    // GPU timing: write begin/end timestamps around each pass (2 query
    // slots per pass), then resolve + read back after submit.
    const ts = wfEnsureTimestamps();
    const capture = !!(ts && !ts.inFlight);
    const tsLabels = [];
    // Opens a compute pass on `enc` with the given debug label. While a
    // capture is active and query slots remain, the pass also gets
    // timestampWrites on the next begin/end slot pair and `tsName` is
    // recorded in tsLabels so the readback can attribute the delta.
    const beginPass = (label, tsName) => {
        const passIndex = tsLabels.length;
        const timed = capture && passIndex < WF_TS_MAX_PASSES;
        if (!timed) {
            return enc.beginComputePass({ label });
        }
        // Record the label first so the slot pair is reserved for this pass.
        tsLabels.push(tsName);
        return enc.beginComputePass({
            label,
            timestampWrites: {
                querySet: ts.querySet,
                beginningOfPassWriteIndex: 2 * passIndex,
                endOfPassWriteIndex: 2 * passIndex + 1,
            },
        });
    };
    // GENERATE
    {
-        const p = enc.beginComputePass({ label: "wf-generate" });
+        const p = beginPass("wf-generate", "GENERATE");
        p.setPipeline(pipe.genPipe);
        p.setBindGroup(0, paramsBg, [slotOff(0)]);
        p.setBindGroup(1, dataBg);
@ -3203,7 +3286,7 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
        const shadeSlot = 1 + 3 * d + 2;
        // PREP — publish indirect args, zero next counter.
        {
-            const p = enc.beginComputePass({ label: "wf-prep" });
+            const p = beginPass("wf-prep", "PREP");
            p.setPipeline(pipe.prepPipe);
            p.setBindGroup(0, paramsBg, [slotOff(prepSlot)]);
            p.setBindGroup(1, dataBg);
@ -3213,7 +3296,7 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
        }
        // TRACE — indirect over the live ray list.
        {
-            const p = enc.beginComputePass({ label: "wf-trace" });
+            const p = beginPass("wf-trace", "TRACE");
            p.setPipeline(pipe.tracePipe);
            p.setBindGroup(0, paramsBg, [slotOff(traceSlot)]);
            p.setBindGroup(1, dataBg);
@ -3222,7 +3305,7 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
        }
        // SHADE — indirect; runs user closesthit/miss, may emit + accumulate.
        {
-            const p = enc.beginComputePass({ label: "wf-shade" });
+            const p = beginPass("wf-shade", "SHADE");
            p.setPipeline(pipe.shadePipe);
            p.setBindGroup(0, paramsBg, [slotOff(shadeSlot)]);
            p.setBindGroup(1, dataBg);
@ -3233,7 +3316,7 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
    }
    // RESOLVE — tonemap accum → output image.
    {
-        const p = enc.beginComputePass({ label: "wf-resolve" });
+        const p = beginPass("wf-resolve", "RESOLVE");
        p.setPipeline(pipe.resolvePipe);
        p.setBindGroup(0, paramsBg, [slotOff(1 + 3 * depth)]);
        p.setBindGroup(1, dataBg);
@ -3242,6 +3325,14 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
        p.end();
    }
    if (capture && tsLabels.length > 0) {
        enc.resolveQuerySet(ts.querySet, 0, 2 * tsLabels.length, ts.resolveBuf, 0);
        enc.copyBufferToBuffer(ts.resolveBuf, 0, ts.readBuf, 0, 2 * tsLabels.length * 8);
        ts.inFlight = true;
        ts.pendingLabels = tsLabels;
        state.tsReadPending = ts;
    }
    // Reopen the frame's shared pass so wgpuFrameEnd / later UI work as
    // before, and flip ping-pong so the blit picks the texture RESOLVE wrote.
    state.pass = enc.beginComputePass();