From 1d2e12dbc9623c7b7d6da373185a4cdaa116f5a3 Mon Sep 17 00:00:00 2001
From: catbot
Date: Sun, 31 May 2026 20:08:39 +0000
Subject: [PATCH] WebGPU RT: GPU timestamp-query per-pass harness

Request the timestamp-query feature; write begin/end timestamps around each
wavefront pass via timestampWrites; resolve + read back (deferred to after
submit) and print a per-pass us breakdown ~1x/sec.

RTStress @ 512 instances, 1920x995: TRACE dominates, total ~1.8-3.0ms/frame.

Co-Authored-By: Claude Opus 4.8
---
 additional/dom-webgpu.js | 103 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 97 insertions(+), 6 deletions(-)

diff --git a/additional/dom-webgpu.js b/additional/dom-webgpu.js
index 9380517..fcadaba 100644
--- a/additional/dom-webgpu.js
+++ b/additional/dom-webgpu.js
@@ -146,7 +146,10 @@ clamp("maxComputeWorkgroupSizeX", 1024);
 clamp("maxBufferSize", 1 << 30);
 clamp("maxStorageBufferBindingSize", 1 << 30);
 clamp("maxComputeWorkgroupsPerDimension", 65535);
-const device = await adapter.requestDevice({ requiredLimits });
+// Per-pass GPU timing for the wavefront tracer (RTStress HUD / PR numbers).
+const tsSupported = adapter.features && adapter.features.has("timestamp-query");
+const requiredFeatures = tsSupported ? ["timestamp-query"] : [];
+const device = await adapter.requestDevice({ requiredLimits, requiredFeatures });
 const queue = device.queue;
 const ctx = canvas.getContext("webgpu");
 const canvasFormat = "rgba8unorm"; // match storage textures, skip swizzle blit
@@ -935,6 +938,23 @@ env.wgpuFrameEnd = () => {
     queue.submit([state.encoder.finish()]);
     state.encoder = null;
 
+    // Map the wavefront timestamp readback (its resolve/copy was encoded on
+    // the just-submitted encoder) and log a per-pass breakdown ~1×/sec.
+    if (state.tsReadPending) {
+        const ts = state.tsReadPending;
+        state.tsReadPending = null;
+        const n = ts.pendingLabels.length;
+        ts.readBuf.mapAsync(GPUMapMode.READ, 0, 2 * n * 8).then(() => {
+            const data = new BigUint64Array(ts.readBuf.getMappedRange(0, 2 * n * 8).slice(0));
+            ts.readBuf.unmap();
+            ts.inFlight = false;
+            wfLogTimestamps(ts, data);
+        }).catch((e) => {
+            ts.inFlight = false;
+            console.error("[crafter-wgpu] timestamp readback failed:", e);
+        });
+    }
+
     // Kick off mapAsync for the readbacks whose copyBufferToBuffer we
     // piggy-backed onto the just-submitted encoder. Doing this after
     // submit ensures the map waits for that submission's GPU work to
@@ -2955,6 +2975,51 @@ function ensureWavefrontBuffers(W, H) {
     return wf;
 }
 
+// ── GPU timestamp-query harness ──────────────────────────────────────────
+//
+// One QuerySet with 2 slots per wavefront pass; each beginComputePass writes
+// begin/end timestamps. After the passes we resolve into a buffer and read
+// it back (deferred to after submit, like the readback path). Deltas are
+// summed per pass label and printed ~1×/sec as a per-pass breakdown.
+const WF_TS_MAX_PASSES = 64; // covers maxDepth up to ~20
+function wfEnsureTimestamps() {
+    if (!tsSupported) return null;
+    if (rtState.ts) return rtState.ts;
+    const cap = 2 * WF_TS_MAX_PASSES;
+    rtState.ts = {
+        capacity: cap,
+        querySet: device.createQuerySet({ type: "timestamp", count: cap }),
+        resolveBuf: device.createBuffer({ size: cap * 8,
+            usage: GPUBufferUsage.QUERY_RESOLVE | GPUBufferUsage.COPY_SRC }),
+        readBuf: device.createBuffer({ size: cap * 8,
+            usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST }),
+        inFlight: false,
+        lastLog: 0,
+        pendingLabels: null,
+    };
+    return rtState.ts;
+}
+function wfLogTimestamps(ts, data) {
+    // data: BigUint64Array of ns timestamps, [begin0,end0,begin1,end1,...].
+    const now = Date.now();
+    if (now - ts.lastLog < 1000) return; // throttle to ~1/sec
+    ts.lastLog = now;
+    const labels = ts.pendingLabels;
+    if (!labels) return;
+    const sums = new Map(); // label → ns
+    let totalNs = 0;
+    for (let i = 0; i < labels.length; i++) {
+        const dt = Number(data[2*i + 1] - data[2*i + 0]);
+        if (dt < 0) continue; // GPU timestamps are not guaranteed monotonic; drop the sample
+        sums.set(labels[i], (sums.get(labels[i]) || 0) + dt);
+        totalNs += dt;
+    }
+    const order = ["GENERATE", "PREP", "TRACE", "SHADE", "RESOLVE"];
+    const parts = order.filter(k => sums.has(k))
+        .map(k => `${k} ${(sums.get(k)/1000).toFixed(1)}us`);
+    console.log(`[crafter-wgpu] RT passes: ${parts.join(" | ")} | total ${(totalNs/1000).toFixed(1)}us`);
+}
+
 env.wgpuLoadRTPipeline = (wgslPtr, wgslLen, bindingsPtr, bindingsCount) => {
     if (!rtState.vertHeap) rtInit();
     const userPart = new TextDecoder().decode(memU8().subarray(wgslPtr, wgslPtr + wgslLen));
@@ -3187,9 +3252,27 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
     const setUser = (pass) => {
         for (const u of userBgs) pass.setBindGroup(u.group, u.bindGroup);
     };
+    // GPU timing: write begin/end timestamps around each pass (2 query
+    // slots per pass), then resolve + read back after submit.
+    const ts = wfEnsureTimestamps();
+    const capture = !!(ts && !ts.inFlight);
+    const tsLabels = [];
+    const beginPass = (label, tsName) => {
+        const desc = { label };
+        if (capture && tsLabels.length < WF_TS_MAX_PASSES) {
+            desc.timestampWrites = {
+                querySet: ts.querySet,
+                beginningOfPassWriteIndex: 2 * tsLabels.length,
+                endOfPassWriteIndex: 2 * tsLabels.length + 1,
+            };
+            tsLabels.push(tsName);
+        }
+        return enc.beginComputePass(desc);
+    };
+
     // GENERATE
     {
-        const p = enc.beginComputePass({ label: "wf-generate" });
+        const p = beginPass("wf-generate", "GENERATE");
         p.setPipeline(pipe.genPipe);
         p.setBindGroup(0, paramsBg, [slotOff(0)]);
         p.setBindGroup(1, dataBg);
@@ -3203,7 +3286,7 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
         const shadeSlot = 1 + 3 * d + 2;
         // PREP — publish indirect args, zero next counter.
         {
-            const p = enc.beginComputePass({ label: "wf-prep" });
+            const p = beginPass("wf-prep", "PREP");
             p.setPipeline(pipe.prepPipe);
             p.setBindGroup(0, paramsBg, [slotOff(prepSlot)]);
             p.setBindGroup(1, dataBg);
@@ -3213,7 +3296,7 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
         }
         // TRACE — indirect over the live ray list.
         {
-            const p = enc.beginComputePass({ label: "wf-trace" });
+            const p = beginPass("wf-trace", "TRACE");
             p.setPipeline(pipe.tracePipe);
             p.setBindGroup(0, paramsBg, [slotOff(traceSlot)]);
             p.setBindGroup(1, dataBg);
@@ -3222,7 +3305,7 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
         }
         // SHADE — indirect; runs user closesthit/miss, may emit + accumulate.
         {
-            const p = enc.beginComputePass({ label: "wf-shade" });
+            const p = beginPass("wf-shade", "SHADE");
             p.setPipeline(pipe.shadePipe);
             p.setBindGroup(0, paramsBg, [slotOff(shadeSlot)]);
             p.setBindGroup(1, dataBg);
@@ -3233,7 +3316,7 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
     }
     // RESOLVE — tonemap accum → output image.
     {
-        const p = enc.beginComputePass({ label: "wf-resolve" });
+        const p = beginPass("wf-resolve", "RESOLVE");
         p.setPipeline(pipe.resolvePipe);
         p.setBindGroup(0, paramsBg, [slotOff(1 + 3 * depth)]);
         p.setBindGroup(1, dataBg);
@@ -3242,6 +3325,14 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
         p.end();
     }
 
+    if (capture && tsLabels.length > 0) {
+        enc.resolveQuerySet(ts.querySet, 0, 2 * tsLabels.length, ts.resolveBuf, 0);
+        enc.copyBufferToBuffer(ts.resolveBuf, 0, ts.readBuf, 0, 2 * tsLabels.length * 8);
+        ts.inFlight = true;
+        ts.pendingLabels = tsLabels;
+        state.tsReadPending = ts;
+    }
+
     // Reopen the frame's shared pass so wgpuFrameEnd / later UI work as
     // before, and flip ping-pong so the blit picks the texture RESOLVE wrote.
     state.pass = enc.beginComputePass();