WebGPU RT: wavefront tracer core (GENERATE/PREP/TRACE/SHADE/RESOLVE)

Replace the megakernel @compute entry with five wavefront kernels sharing one module, connected by GPU ray/hit/payload buffers and a GPU-driven indirect bounce loop: GENERATE -> (PREP -> TRACE -> SHADE) x maxDepth -> RESOLVE - TRACE contains zero user code (pure _rtwTraverseTlas/Blas, opaque-only). - PREP publishes dispatchWorkgroupsIndirect args from the live ray count; the indirect-args buffer lives in its own bind group so it is never bound read-write in the same dispatch that consumes it as INDIRECT. - New emit/accumulate API: rtEmitPrimaryRay / rtEmitRay / rtAccumulate, plus an optional user Resolve stage (tonemap hook; identity by default). - Per-pass WfParams via a dynamic-offset uniform ring (curIsA/bounce vary between passes within one submit). - Payload-typed wfPayload binding emitted in the codegen region after the user's struct Payload; payload travels with each ray (2*W*H slots). - Request maxBufferSize / maxStorageBufferBindingSize / maxComputeWorkgroups PerDimension so the W*H-sized work buffers fit past the 128MB baseline. VulkanTriangle ported to the new API and renders bit-identical to the megakernel baseline at maxDepth=1. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-31 16:24:41 +00:00 · 2026-05-31 16:24:41 +00:00 · 4e42d663a6
commit 4e42d663a6
parent e0d72f57f2
9 changed files with 755 additions and 101 deletions
--- a/additional/dom-webgpu.js
+++ b/additional/dom-webgpu.js
@ -136,6 +136,16 @@ clamp("maxStorageTexturesPerShaderStage", 8);
 // per-workgroup invocation cap raised from the default 256.
 clamp("maxComputeInvocationsPerWorkgroup", 1024);
 clamp("maxComputeWorkgroupSizeX", 1024);
+// Wavefront RT work buffers are sized to W·H rays. At 1080p the payload
+// store (≈245 MB) and hit buffer (≈214 MB) blow past the 128 MB baseline
+// storage-buffer binding size, and the whole set past the 256 MB baseline
+// maxBufferSize — request whatever the adapter actually allows (4090/Dawn
+// reports 1 GB+). maxComputeWorkgroupsPerDimension bounds the indirect
+// TRACE/SHADE 1-D dispatch (ceil(W·H/64) ≈ 32k workgroups at 1080p; the
+// 65535 default covers it, but request the adapter max for headroom).
+clamp("maxBufferSize", 1 << 30);
+clamp("maxStorageBufferBindingSize", 1 << 30);
+clamp("maxComputeWorkgroupsPerDimension", 65535);
 const device  = await adapter.requestDevice({ requiredLimits });
 const queue   = device.queue;
 const ctx     = canvas.getContext("webgpu");
@ -147,6 +157,9 @@ device.lost.then((info) => {
    console.error("[crafter-wgpu] device lost:", info.message);
    state.gpuLost = true;
 });
+device.addEventListener("uncapturederror", (e) => {
+    console.error("[crafter-wgpu] uncaptured error:", e.error && e.error.message);
+});

 // ─── handle tables ─────────────────────────────────────────────────────

@ -1671,6 +1684,350 @@ fn traceRay(tlasIdx: u32, flags: u32, cullMask: u32,
 }
 `;

+// ════════════════════════════════════════════════════════════════════════
+// WAVEFRONT RT — streaming tracer (GENERATE → PREP → TRACE → SHADE →
+// RESOLVE). Replaces the megakernel. The C++ side (PipelineRTWebGPU) emits
+// the user sources, the per-stage SBT switches, the Payload-typed wfPayload
+// binding, and the five @compute entry points; this JS injects the bindings
+// + library helpers the entry points call.
+// ════════════════════════════════════════════════════════════════════════
+
+// Bindings prelude for the wavefront pipeline. group(0) is the per-pass
+// WfParams uniform (dynamic-offset ring — one slot per pass so curIsA /
+// bounce can vary between passes inside one submit). group(1) carries the
+// geometry heaps (0..9, identical to the old megakernel layout so the
+// register/build paths are unchanged) plus the wavefront work buffers
+// (10..14); wfPayload at binding 15 is emitted in the codegen region after
+// the user's `struct Payload`. group(2) is the indirect-args buffer, bound
+// only by PREP (a buffer used as INDIRECT in a dispatch may not also be
+// bound read-write in that same dispatch — so TRACE/SHADE must not bind it).
+const rtWgslWavefrontBindings = String.raw`
+struct WfParams {
+    surfaceW:    u32,
+    surfaceH:    u32,
+    rayCapacity: u32,
+    curIsA:      u32,   // 1 → current ray buffer is A (emit-next = B); 0 → B
+    bounce:      u32,
+    maxDepth:    u32,
+    tlasNPadded: u32,   // TLAS sweep-tree padded leaf count (descent depth)
+    flags:       u32,
+};
+
+// One in-flight ray. 64 bytes; origin/direction vec3-aligned to 16.
+struct WfRay {
+    origin:          vec3<f32>,
+    tMin:            f32,
+    direction:       vec3<f32>,
+    tMax:            f32,
+    pixel:           u32,   // linear framebuffer pixel this ray contributes to
+    flags:           u32,
+    cullMask:        u32,
+    missIndex:       u32,
+    sbtRecordOffset: u32,
+    payloadSlot:     u32,   // index into wfPayload
+    kind:            u32,   // 0 primary, 1 continuation (informational)
+    _pad:            u32,
+};
+
+// TRACE → SHADE handoff. Mirrors HitInfo + a hitKind (0 miss, 1 triangle).
+struct HitResult {
+    t:                  f32,
+    instanceId:         u32,
+    primitiveId:        u32,
+    hitGroupIndex:      u32,
+    attribs:            vec2<f32>,
+    hitKind:            u32,
+    customIndex:        u32,
+    objectRayOrigin:    vec3<f32>,
+    _p0:                f32,
+    objectRayDirection: vec3<f32>,
+    _p1:                f32,
+    objectToWorldR0:    vec4<f32>,
+    objectToWorldR1:    vec4<f32>,
+    objectToWorldR2:    vec4<f32>,
+};
+
+struct BvhNode {
+    aabbMin: vec3<f32>,
+    _pad0:   u32,
+    aabbMax: vec3<f32>,
+    _pad1:   u32,
+};
+
+@group(0) @binding(0) var<uniform> wfParams : WfParams;
+
+@group(1) @binding(0) var<storage,read>  tlasEntries    : array<TLASEntry>;
+@group(1) @binding(1) var<storage,read>  bvhNodes       : array<BVHNode>;
+@group(1) @binding(2) var<storage,read>  meshRecords    : array<MeshRecord>;
+@group(1) @binding(3) var<storage,read>  vertices       : array<f32>;
+@group(1) @binding(4) var<storage,read>  indices        : array<u32>;
+@group(1) @binding(5) var<storage,read>  primRemap      : array<u32>;
+@group(1) @binding(6) var outImage : texture_storage_2d<rgba8unorm, write>;
+@group(1) @binding(7) var<storage,read>  vertexAttribs  : array<u32>;
+@group(1) @binding(8) var<storage,read>  tlasEntryOrder : array<u32>;
+@group(1) @binding(9) var<storage,read>  tlasBvhNodes   : array<BvhNode>;
+@group(1) @binding(10) var<storage,read_write> wfRaysA   : array<WfRay>;
+@group(1) @binding(11) var<storage,read_write> wfRaysB   : array<WfRay>;
+@group(1) @binding(12) var<storage,read_write> wfHits    : array<HitResult>;
+@group(1) @binding(13) var<storage,read_write> wfAccum   : array<vec4<f32>>;
+@group(1) @binding(14) var<storage,read_write> wfCounters : array<atomic<u32>>;
+// @group(1) @binding(15) wfPayload : array<Payload>  — emitted by codegen.
+
+@group(2) @binding(0) var<storage,read_write> wfIndirect : array<u32>;
+`;
+
+// Library helpers the codegen entry points call. Sits after the pure
+// helpers (_rtAabb/_rtTri/_rtFetchTri) and after the user's Payload +
+// wfPayload binding, so rtEmit*/_wfShade can name Payload/wfPayload.
+const rtWgslWavefrontHelpers = String.raw`
+var<private> _wfPixel: u32 = 0u;
+
+// Live ray count for the current buffer, clamped to capacity (the emit
+// counter can overshoot capacity; dropped rays were never written).
+fn _wfCurCount() -> u32 {
+    let raw = select(atomicLoad(&wfCounters[1]), atomicLoad(&wfCounters[0]),
+                     wfParams.curIsA == 1u);
+    return min(raw, wfParams.rayCapacity);
+}
+
+// Add linear radiance to the pixel this SHADE/GENERATE thread owns. Safe
+// without atomics: at most one ray per pixel per bounce, and bounces run
+// in separate passes (implicit barrier between them).
+fn rtAccumulate(rgb: vec3<f32>) {
+    wfAccum[_wfPixel] = wfAccum[_wfPixel] + vec4<f32>(rgb, 0.0);
+}
+
+// raygen → emit the pixel's primary ray. Bounce 0's current buffer is
+// always A, so primaries land in A with their payload in the A region
+// [0, rayCapacity).
+fn rtEmitPrimaryRay(origin: vec3<f32>, tMin: f32, dir: vec3<f32>, tMax: f32,
+                    flags: u32, cullMask: u32, sbtRecordOffset: u32,
+                    missIndex: u32, payload: Payload) {
+    let slot = atomicAdd(&wfCounters[0], 1u);
+    if (slot >= wfParams.rayCapacity) { return; }
+    var r: WfRay;
+    r.origin = origin; r.tMin = tMin; r.direction = dir; r.tMax = tMax;
+    r.pixel = _wfPixel; r.flags = flags; r.cullMask = cullMask;
+    r.missIndex = missIndex; r.sbtRecordOffset = sbtRecordOffset;
+    r.payloadSlot = slot; r.kind = 0u;
+    wfRaysA[slot] = r;
+    wfPayload[slot] = payload;
+}
+
+// closesthit/miss → spawn a continuation/shadow ray into the NEXT buffer
+// (the one the upcoming TRACE will read). Payload travels with it; the
+// next buffer's payload region is [rayCapacity, 2*rayCapacity) for B.
+fn rtEmitRay(origin: vec3<f32>, tMin: f32, dir: vec3<f32>, tMax: f32,
+             flags: u32, cullMask: u32, sbtRecordOffset: u32,
+             missIndex: u32, payload: Payload) {
+    let nextIsA   = wfParams.curIsA == 0u;
+    let counterIdx = select(1u, 0u, nextIsA);
+    let slot = atomicAdd(&wfCounters[counterIdx], 1u);
+    if (slot >= wfParams.rayCapacity) { return; }
+    let payloadBase = select(wfParams.rayCapacity, 0u, nextIsA);
+    var r: WfRay;
+    r.origin = origin; r.tMin = tMin; r.direction = dir; r.tMax = tMax;
+    r.pixel = _wfPixel; r.flags = flags; r.cullMask = cullMask;
+    r.missIndex = missIndex; r.sbtRecordOffset = sbtRecordOffset;
+    r.payloadSlot = payloadBase + slot; r.kind = 1u;
+    if (nextIsA) { wfRaysA[slot] = r; } else { wfRaysB[slot] = r; }
+    wfPayload[r.payloadSlot] = payload;
+}
+
+// Opaque-only BLAS descent (no anyhit — TRACE runs zero user code).
+fn _rtwTraverseBlas(rayObj: RayDesc, flags: u32, meshRec: MeshRecord,
+                    instanceId: u32, hitGroupBase: u32,
+                    bestHit: ptr<function, HitInfo>,
+                    bestT:   ptr<function, f32>) -> bool {
+    let invD = vec3<f32>(1.0) / rayObj.direction;
+    var stack: array<u32, 32>;
+    var sp: u32 = 0u;
+    var nodeRel: u32 = 0u;
+    loop {
+        let absI = meshRec.bvhOffset + nodeRel;
+        let node = bvhNodes[absI];
+        if (!_rtAabb(rayObj.origin, invD, node.aabbMin, node.aabbMax, *bestT)) {
+            if (sp == 0u) { break; }
+            sp = sp - 1u; nodeRel = stack[sp]; continue;
+        }
+        if (node.primCount > 0u) {
+            for (var i: u32 = 0u; i < node.primCount; i = i + 1u) {
+                let triIndex = primRemap[meshRec.primRemapOffset + node.firstChildOrPrim + i];
+                let verts = _rtFetchTri(meshRec, triIndex);
+                let tr = _rtTri(rayObj.origin, rayObj.direction,
+                                verts[0], verts[1], verts[2], rayObj.tMin, *bestT);
+                if (!tr.hit) { continue; }
+                let geomNormal = cross(verts[1] - verts[0], verts[2] - verts[0]);
+                let facing = dot(geomNormal, rayObj.direction);
+                if ((flags & RT_FLAG_CULL_BACK_FACING_TRIANGLES)  != 0u && facing > 0.0) { continue; }
+                if ((flags & RT_FLAG_CULL_FRONT_FACING_TRIANGLES) != 0u && facing < 0.0) { continue; }
+                var candidate: HitInfo;
+                candidate.t             = tr.t;
+                candidate.instanceId    = instanceId;
+                candidate.primitiveId   = triIndex;
+                candidate.hitGroupIndex = hitGroupBase;
+                candidate.attribs       = vec2<f32>(tr.u, tr.v);
+                candidate.objectRayOrigin    = rayObj.origin;
+                candidate.objectRayDirection = rayObj.direction;
+                *bestHit = candidate;
+                *bestT   = tr.t;
+                if ((flags & RT_FLAG_TERMINATE_ON_FIRST_HIT) != 0u) { return true; }
+            }
+            if (sp == 0u) { break; }
+            sp = sp - 1u; nodeRel = stack[sp]; continue;
+        }
+        let left  = node.firstChildOrPrim;
+        let right = left + 1u;
+        if (sp < 32u) { stack[sp] = right; sp = sp + 1u; }
+        nodeRel = left;
+    }
+    return false;
+}
+
+fn _rtwTraverseTlas(rayWorld: RayDesc, flags: u32, cullMask: u32,
+                    sbtRecordOffset: u32,
+                    bestHit: ptr<function, HitInfo>,
+                    bestT:   ptr<function, f32>) -> bool {
+    let invD = vec3<f32>(1.0) / rayWorld.direction;
+    let leavesStart = wfParams.tlasNPadded - 1u;
+    var stack: array<u32, 32>;
+    var sp: u32 = 0u;
+    stack[sp] = 0u; sp = sp + 1u;
+    loop {
+        if (sp == 0u) { break; }
+        sp = sp - 1u;
+        let nodeIdx = stack[sp];
+        let node = tlasBvhNodes[nodeIdx];
+        if (!_rtAabb(rayWorld.origin, invD, node.aabbMin, node.aabbMax, *bestT)) { continue; }
+        if (nodeIdx >= leavesStart) {
+            let leafIdx = nodeIdx - leavesStart;
+            let i = tlasEntryOrder[leafIdx];
+            if (i == 0xFFFFFFFFu) { continue; }
+            let inst = tlasEntries[i];
+            let instanceMask = inst.maskHGOffset & 0xFFu;
+            if ((instanceMask & cullMask) == 0u) { continue; }
+            if (!_rtAabb(rayWorld.origin, invD, inst.aabbMin, inst.aabbMax, *bestT)) { continue; }
+            let r0 = inst.worldToObjectR0;
+            let r1 = inst.worldToObjectR1;
+            let r2 = inst.worldToObjectR2;
+            var rayObj: RayDesc;
+            rayObj.origin = vec3<f32>(
+                dot(r0.xyz, rayWorld.origin) + r0.w,
+                dot(r1.xyz, rayWorld.origin) + r1.w,
+                dot(r2.xyz, rayWorld.origin) + r2.w);
+            rayObj.direction = vec3<f32>(
+                dot(r0.xyz, rayWorld.direction),
+                dot(r1.xyz, rayWorld.direction),
+                dot(r2.xyz, rayWorld.direction));
+            rayObj.tMin = rayWorld.tMin;
+            rayObj.tMax = *bestT;
+            var effective = flags;
+            let iflags = inst.instanceFlags;
+            if ((iflags & RT_INSTANCE_TRIANGLE_FACING_CULL_DISABLE) != 0u) {
+                effective = effective & ~(RT_FLAG_CULL_BACK_FACING_TRIANGLES | RT_FLAG_CULL_FRONT_FACING_TRIANGLES);
+            }
+            let hitGroupOffset = inst.maskHGOffset >> 8u;
+            let hitGroupBase   = sbtRecordOffset + hitGroupOffset;
+            let meshRec        = meshRecords[inst.blasMeshIdx];
+            let pre = *bestT;
+            let endSearch = _rtwTraverseBlas(rayObj, effective, meshRec, i, hitGroupBase, bestHit, bestT);
+            if ((*bestT) < pre || endSearch) {
+                (*bestHit).objectToWorldR0 = inst.objectToWorldR0;
+                (*bestHit).objectToWorldR1 = inst.objectToWorldR1;
+                (*bestHit).objectToWorldR2 = inst.objectToWorldR2;
+                (*bestHit).customIndex     = inst.customIndex;
+            }
+            if (endSearch) { return true; }
+            if ((*bestT) < pre && (effective & RT_FLAG_TERMINATE_ON_FIRST_HIT) != 0u) { return true; }
+        } else {
+            let left  = 2u * nodeIdx + 1u;
+            let right = 2u * nodeIdx + 2u;
+            if (sp + 1u < 32u) {
+                stack[sp] = right; sp = sp + 1u;
+                stack[sp] = left;  sp = sp + 1u;
+            }
+        }
+    }
+    return false;
+}
+
+fn _wfReadRay(i: u32) -> WfRay {
+    if (wfParams.curIsA == 1u) { return wfRaysA[i]; }
+    return wfRaysB[i];
+}
+
+// PREP — publish indirect args for the upcoming TRACE/SHADE; zero the next
+// buffer's emit counter.
+fn _wfPrep() {
+    let n = _wfCurCount();
+    wfIndirect[0] = (n + 63u) / 64u;
+    wfIndirect[1] = 1u;
+    wfIndirect[2] = 1u;
+    if (wfParams.curIsA == 1u) { atomicStore(&wfCounters[1], 0u); }
+    else                       { atomicStore(&wfCounters[0], 0u); }
+}
+
+// TRACE — pure traversal, zero user code.
+fn _wfTrace(i: u32) {
+    if (i >= _wfCurCount()) { return; }
+    let ray = _wfReadRay(i);
+    var rd: RayDesc;
+    rd.origin = ray.origin; rd.tMin = ray.tMin;
+    rd.direction = ray.direction; rd.tMax = ray.tMax;
+    var bestHit: HitInfo;
+    bestHit.t = ray.tMax;
+    var bestT = ray.tMax;
+    _rtwTraverseTlas(rd, ray.flags, ray.cullMask & 0xFFu, ray.sbtRecordOffset, &bestHit, &bestT);
+    var hr: HitResult;
+    if (bestT < ray.tMax) {
+        hr.t = bestHit.t;
+        hr.instanceId = bestHit.instanceId;
+        hr.primitiveId = bestHit.primitiveId;
+        hr.hitGroupIndex = bestHit.hitGroupIndex;
+        hr.attribs = bestHit.attribs;
+        hr.hitKind = 1u;
+        hr.customIndex = bestHit.customIndex;
+        hr.objectRayOrigin = bestHit.objectRayOrigin;
+        hr.objectRayDirection = bestHit.objectRayDirection;
+        hr.objectToWorldR0 = bestHit.objectToWorldR0;
+        hr.objectToWorldR1 = bestHit.objectToWorldR1;
+        hr.objectToWorldR2 = bestHit.objectToWorldR2;
+    } else {
+        hr.hitKind = 0u;
+    }
+    wfHits[i] = hr;
+}
+
+// SHADE — dispatch to runMiss / runClosestHit with the ray's payload.
+fn _wfShade(i: u32) {
+    if (i >= _wfCurCount()) { return; }
+    let ray = _wfReadRay(i);
+    let hr = wfHits[i];
+    _wfPixel = ray.pixel;
+    var payload: Payload = wfPayload[ray.payloadSlot];
+    var rd: RayDesc;
+    rd.origin = ray.origin; rd.tMin = ray.tMin;
+    rd.direction = ray.direction; rd.tMax = ray.tMax;
+    if (hr.hitKind == 0u) {
+        runMiss(ray.missIndex, rd, &payload);
+    } else if ((ray.flags & RT_FLAG_SKIP_CLOSEST_HIT) == 0u) {
+        var hit: HitInfo;
+        hit.t = hr.t;
+        hit.instanceId = hr.instanceId;
+        hit.primitiveId = hr.primitiveId;
+        hit.hitGroupIndex = hr.hitGroupIndex;
+        hit.attribs = hr.attribs;
+        hit.customIndex = hr.customIndex;
+        hit.objectRayOrigin = hr.objectRayOrigin;
+        hit.objectRayDirection = hr.objectRayDirection;
+        hit.objectToWorldR0 = hr.objectToWorldR0;
+        hit.objectToWorldR1 = hr.objectToWorldR1;
+        hit.objectToWorldR2 = hr.objectToWorldR2;
+        runClosestHit(hr.hitGroupIndex, rd, hit, &payload);
+    }
+}
+`;
+
 // ── WGSL library: rayQuery API for non-megakernel compute shaders ────
 //
 // Mirrors GL_EXT_ray_query semantics that 3DForts's physics shaders use
@ -2564,6 +2921,40 @@ env.wgpuBuildTLAS = (instanceBufHandle, instanceCount, tlasOutBufHandle,
 // textures, samplers). Pass (0, 0) for a pipeline with no user bindings.
 const rtPipelines = new Map(); // handle → { pipeline, bgls, byGroup, sortedGroups }

+// Per-payload byte budget in wfPayload (rounded up; user Payload structs
+// must fit). Sponza's Payload is 48 B; 64 leaves headroom while keeping
+// 2·W·H·64 B ≈ 265 MB at 1080p.
+const WF_PAYLOAD_BYTES = 64;
+// Dynamic-offset uniform ring: one WfParams slot per wavefront pass. 128
+// slots covers maxDepth up to ~42 (1 + 3·maxDepth + 1 passes).
+const WF_PARAM_SLOTS   = 128;
+const WF_FIXED_TLAS_NPADDED = 16384;   // matches lbvhBuildWgsl N_PADDED
+
+function ensureWavefrontBuffers(W, H) {
+    const cap = W * H;
+    rtState.wf = rtState.wf || { cap: 0 };
+    const wf = rtState.wf;
+    if (wf.cap === cap && wf.raysA) return wf;
+    for (const b of [wf.raysA, wf.raysB, wf.hits, wf.accum, wf.counters,
+                     wf.payload, wf.indirect]) { if (b) b.destroy(); }
+    const S = GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST;
+    wf.raysA    = device.createBuffer({ size: cap * 64,                 usage: S, label: "wf-raysA" });
+    wf.raysB    = device.createBuffer({ size: cap * 64,                 usage: S, label: "wf-raysB" });
+    wf.hits     = device.createBuffer({ size: cap * 112,                usage: S, label: "wf-hits" });
+    wf.accum    = device.createBuffer({ size: cap * 16,                 usage: S, label: "wf-accum" });
+    wf.payload  = device.createBuffer({ size: 2 * cap * WF_PAYLOAD_BYTES, usage: S, label: "wf-payload" });
+    wf.counters = device.createBuffer({ size: 64,
+        usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC, label: "wf-counters" });
+    wf.indirect = device.createBuffer({ size: 16,
+        usage: GPUBufferUsage.STORAGE | GPUBufferUsage.INDIRECT | GPUBufferUsage.COPY_DST, label: "wf-indirect" });
+    if (!wf.paramsRing) {
+        wf.paramsRing = device.createBuffer({ size: WF_PARAM_SLOTS * 256,
+            usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST, label: "wf-params" });
+    }
+    wf.cap = cap;
+    return wf;
+}
+
 env.wgpuLoadRTPipeline = (wgslPtr, wgslLen, bindingsPtr, bindingsCount) => {
    if (!rtState.vertHeap) rtInit();
    const userPart = new TextDecoder().decode(memU8().subarray(wgslPtr, wgslPtr + wgslLen));
@ -2577,16 +2968,23 @@ env.wgpuLoadRTPipeline = (wgslPtr, wgslLen, bindingsPtr, bindingsCount) => {
        beforeHelpers = userPart.substring(0, mi);
        afterHelpers  = userPart.substring(mi + marker.length);
    }
-    const fullWgsl = rtWgslPrelude + "\n" + beforeHelpers + "\n" + rtWgslPureHelpers + "\n" + rtWgslMegakernelHelpers + "\n" + afterHelpers;
+    // Wavefront assembly: types + bindings | user CH/miss/resolve + wfPayload
+    // + switches (beforeHelpers) | pure helpers | wavefront helpers | user
+    // raygen + the five @compute entry points (afterHelpers).
+    const fullWgsl = rtWgslTypes + rtWgslWavefrontBindings + "\n"
+        + beforeHelpers + "\n" + rtWgslPureHelpers + "\n"
+        + rtWgslWavefrontHelpers + "\n" + afterHelpers;

-    // Parse user bindings (same wire format as wgpuLoadCustomShader).
+    // Parse user bindings (same wire format as wgpuLoadCustomShader). For
+    // the wavefront RT pipeline, group 0 = WfParams, group 1 = data heaps,
+    // group 2 = indirect args — so user bindings must start at group 3.
    const userBindings = [];
    if (bindingsCount > 0) {
        const dv = new DataView(memU8().buffer, bindingsPtr, bindingsCount * 8);
        for (let i = 0; i < bindingsCount; i++) {
            const g = dv.getUint8(i*8 + 0);
-            if (g < 2) {
-                console.error(`[crafter-wgpu] RT pipeline: @group(${g}) reserved; user bindings need group >= 2`);
+            if (g < 3) {
+                console.error(`[crafter-wgpu] RT pipeline: @group(${g}) reserved; user bindings need group >= 3`);
                return 0;
            }
            userBindings.push({
@ -2604,33 +3002,28 @@ env.wgpuLoadRTPipeline = (wgslPtr, wgslLen, bindingsPtr, bindingsCount) => {
    }
    const sortedGroups = [...byGroup.keys()].sort((a, b) => a - b);

-    let pipeline;
    try {
-        const mod = device.createShaderModule({ code: fullWgsl, label: "rt-megakernel" });
-        // RTDispatchHeader is 16 bytes; bind exactly that.
-        const headerBgl = device.createBindGroupLayout({ entries: [
+        const mod = device.createShaderModule({ code: fullWgsl, label: "rt-wavefront" });
+        const paramsBgl = device.createBindGroupLayout({ entries: [
            { binding: 0, visibility: GPUShaderStage.COMPUTE,
-              buffer: { type: "uniform", minBindingSize: 16 } },
+              buffer: { type: "uniform", hasDynamicOffset: true, minBindingSize: 32 } },
        ]});
+        const sb = (b) => ({ binding: b, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } });
+        const rw = (b) => ({ binding: b, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } });
        const dataBgl = device.createBindGroupLayout({ entries: [
-            { binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
-            { binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
-            { binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
-            { binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
-            { binding: 4, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
-            { binding: 5, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
+            sb(0), sb(1), sb(2), sb(3), sb(4), sb(5),
            { binding: 6, visibility: GPUShaderStage.COMPUTE,
              storageTexture: { format: "rgba8unorm", access: "write-only", viewDimension: "2d" } },
-            { binding: 7, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
-            { binding: 8, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
-            { binding: 9, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
+            sb(7), sb(8), sb(9),
+            rw(10), rw(11), rw(12), rw(13), rw(14), rw(15),
        ]});
-        // User binding-group layouts. WebGPU pipeline layouts need a
-        // contiguous array up to the highest group used, so pad any gaps
-        // with empty bgls (same rule as wgpuLoadCustomShader).
+        const indirectBgl = device.createBindGroupLayout({ entries: [ rw(0) ]});
+        const emptyBgl    = device.createBindGroupLayout({ entries: [] });
+
+        // User binding-group layouts for groups 3..highest (pad gaps).
        const userBgls = [];
-        const highest = sortedGroups.length ? sortedGroups[sortedGroups.length - 1] : 1;
-        for (let g = 2; g <= highest; g++) {
+        const highest = sortedGroups.length ? sortedGroups[sortedGroups.length - 1] : 2;
+        for (let g = 3; g <= highest; g++) {
            if (byGroup.has(g)) {
                const entries = byGroup.get(g).map(b => {
                    const e = { binding: b.binding, visibility: GPUShaderStage.COMPUTE };
@ -2638,6 +3031,7 @@ env.wgpuLoadRTPipeline = (wgslPtr, wgslLen, bindingsPtr, bindingsCount) => {
                    else if (b.kind === 1) e.texture = { sampleType: "float", viewDimension: "2d" };
                    else if (b.kind === 2) e.sampler = { type: "filtering" };
                    else if (b.kind === 3) e.texture = { sampleType: "float", viewDimension: "2d-array" };
+                    else if (b.kind === 4) e.buffer  = { type: "storage" };
                    return e;
                });
                userBgls.push(device.createBindGroupLayout({ entries }));
@ -2645,12 +3039,29 @@ env.wgpuLoadRTPipeline = (wgslPtr, wgslLen, bindingsPtr, bindingsCount) => {
                userBgls.push(device.createBindGroupLayout({ entries: [] }));
            }
        }
-        pipeline = device.createComputePipeline({
-            layout: device.createPipelineLayout({ bindGroupLayouts: [headerBgl, dataBgl, ...userBgls] }),
-            compute: { module: mod, entryPoint: "main" },
-        });
+        // GENERATE / SHADE / RESOLVE may touch user bindings (raygen camera,
+        // closesthit albedo, resolve params) → params + data + empty(group2)
+        // + user. PREP → params + data + indirect. TRACE → params + data.
+        const userLayout = device.createPipelineLayout({
+            bindGroupLayouts: [paramsBgl, dataBgl, emptyBgl, ...userBgls] });
+        const prepLayout = device.createPipelineLayout({
+            bindGroupLayouts: [paramsBgl, dataBgl, indirectBgl] });
+        const traceLayout = device.createPipelineLayout({
+            bindGroupLayouts: [paramsBgl, dataBgl] });
+        const mk = (layout, ep) => device.createComputePipeline({
+            layout, compute: { module: mod, entryPoint: ep } });
+
+        const entry = {
+            genPipe:     mk(userLayout,  "wfGenerate"),
+            prepPipe:    mk(prepLayout,  "wfPrep"),
+            tracePipe:   mk(traceLayout, "wfTrace"),
+            shadePipe:   mk(userLayout,  "wfShade"),
+            resolvePipe: mk(userLayout,  "wfResolve"),
+            paramsBgl, dataBgl, indirectBgl, emptyBgl, userBgls,
+            byGroup, sortedGroups,
+        };
        const handle = newHandle();
-        rtPipelines.set(handle, { pipeline, headerBgl, dataBgl, userBgls, byGroup, sortedGroups });
+        rtPipelines.set(handle, entry);
        return handle;
    } catch (e) {
        console.error("[crafter-wgpu] RT pipeline compile failed:", e);
@ -2659,35 +3070,83 @@ env.wgpuLoadRTPipeline = (wgslPtr, wgslLen, bindingsPtr, bindingsCount) => {
    }
 };

+// Build the user @group(3+) bind groups for a pass, returning a list of
+// { group, bindGroup } to set. Shared by GENERATE / SHADE / RESOLVE.
+function wfUserBindGroups(pipe, handlesPtr, handlesCount) {
+    const out = [];
+    if (handlesCount <= 0) return out;
+    const handles = new Uint32Array(memU8().buffer, handlesPtr, handlesCount);
+    let handleIdx = 0;
+    let bglIdx = 0;
+    const highest = pipe.sortedGroups.length ? pipe.sortedGroups[pipe.sortedGroups.length - 1] : 2;
+    for (let g = 3; g <= highest; g++) {
+        if (pipe.byGroup.has(g)) {
+            const entries = pipe.byGroup.get(g).map(b => {
+                const h = handles[handleIdx++];
+                let resource;
+                if      (b.kind === 0) resource = { buffer: buffers.get(h) };
+                else if (b.kind === 1) resource = textureViews.get(h);
+                else if (b.kind === 2) resource = samplers.get(h);
+                else if (b.kind === 3) resource = textureViews.get(h);
+                else if (b.kind === 4) resource = { buffer: buffers.get(h) };
+                return { binding: b.binding, resource };
+            });
+            out.push({ group: g, bindGroup: device.createBindGroup({ layout: pipe.userBgls[bglIdx], entries }) });
+        }
+        bglIdx++;
+    }
+    return out;
+}
+
 env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
                      tlasBufHandle, instanceCount, gx, gy,
-                      handlesPtr, handlesCount) => {
-    if (!state.pass) return;
+                      handlesPtr, handlesCount, maxDepth) => {
+    if (!state.encoder) return;
    const pipe = rtPipelines.get(pipelineHandle);
    const tlas = buffers.get(tlasBufHandle);
    if (!pipe || !tlas) {
        console.error("[crafter-wgpu] wgpuDispatchRT: unknown pipeline or tlas");
        return;
    }
-    // Write RT header from push data (first 16 bytes). Surface dims + instance count + flags.
-    const hdr32 = new Uint32Array(4);
-    hdr32[0] = state.width;
-    hdr32[1] = state.height;
-    hdr32[2] = instanceCount;
-    hdr32[3] = 0;
-    queue.writeBuffer(rtState.rtHeader, 0, hdr32);
-
-    const headerBg = device.createBindGroup({
-        layout: pipe.headerBgl,
-        entries: [{ binding: 0, resource: { buffer: rtState.rtHeader, offset: 0, size: 16 } }],
-    });
-    const outView = state.outIsPing ? state.pingView : state.pongView;
    const entryOrderBuf = buffers.get(rtState.currentEntryOrder);
    const bvhBuf        = buffers.get(rtState.currentBvh);
    if (!entryOrderBuf || !bvhBuf) {
-        console.error("[crafter-wgpu] wgpuDispatchRT: missing entryOrder/bins (no TLAS built yet?)");
+        console.error("[crafter-wgpu] wgpuDispatchRT: missing entryOrder/bvh (no TLAS built yet?)");
        return;
    }
+    const W = state.width, H = state.height;
+    const cap = W * H;
+    const depth = Math.max(1, maxDepth | 0);
+    const wf = ensureWavefrontBuffers(W, H);
+
+    // ── Per-pass WfParams ring. queue.writeBuffer lands before submit, so
+    // we can't mutate the uniform between passes — instead we pre-write one
+    // slot per pass and bind it with a dynamic offset. Slot order:
+    //   0            GENERATE
+    //   1+3*d .. +2  PREP / TRACE / SHADE for bounce d
+    //   1+3*depth    RESOLVE
+    const passCount = 2 + 3 * depth;
+    const ring = new Uint32Array(WF_PARAM_SLOTS * 64);  // 256 B = 64 u32 per slot
+    const writeSlot = (slot, curIsA, bounce) => {
+        const o = slot * 64;
+        ring[o + 0] = W; ring[o + 1] = H; ring[o + 2] = cap; ring[o + 3] = curIsA;
+        ring[o + 4] = bounce; ring[o + 5] = depth; ring[o + 6] = WF_FIXED_TLAS_NPADDED; ring[o + 7] = 0;
+    };
+    writeSlot(0, 1, 0);                              // GENERATE
+    for (let d = 0; d < depth; d++) {
+        const curIsA = (d % 2 === 0) ? 1 : 0;
+        writeSlot(1 + 3 * d + 0, curIsA, d);         // PREP
+        writeSlot(1 + 3 * d + 1, curIsA, d);         // TRACE
+        writeSlot(1 + 3 * d + 2, curIsA, d);         // SHADE
+    }
+    writeSlot(1 + 3 * depth, 1, depth);              // RESOLVE
+    queue.writeBuffer(wf.paramsRing, 0, ring, 0, passCount * 64);
+
+    const outView = state.outIsPing ? state.pingView : state.pongView;
+    const paramsBg = device.createBindGroup({
+        layout: pipe.paramsBgl,
+        entries: [{ binding: 0, resource: { buffer: wf.paramsRing, offset: 0, size: 256 } }],
+    });
    const dataBg = device.createBindGroup({
        layout: pipe.dataBgl,
        entries: [
@ -2701,41 +3160,91 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
            { binding: 7, resource: { buffer: rtState.attribsHeap.gpu } },
            { binding: 8, resource: { buffer: entryOrderBuf } },
            { binding: 9, resource: { buffer: bvhBuf } },
+            { binding: 10, resource: { buffer: wf.raysA } },
+            { binding: 11, resource: { buffer: wf.raysB } },
+            { binding: 12, resource: { buffer: wf.hits } },
+            { binding: 13, resource: { buffer: wf.accum } },
+            { binding: 14, resource: { buffer: wf.counters } },
+            { binding: 15, resource: { buffer: wf.payload } },
        ],
    });
+    const indirectBg = device.createBindGroup({
+        layout: pipe.indirectBgl,
+        entries: [{ binding: 0, resource: { buffer: wf.indirect } }],
+    });
+    const userBgs = wfUserBindGroups(pipe, handlesPtr, handlesCount);

-    state.pass.setPipeline(pipe.pipeline);
-    state.pass.setBindGroup(0, headerBg);
-    state.pass.setBindGroup(1, dataBg);
+    // Close the frame's shared compute pass; the wavefront runs as its own
+    // sequence of passes on the same encoder (implicit barrier between each
+    // makes PREP's atomic writes visible to TRACE, etc.), then we reopen it.
+    if (state.pass) { state.pass.end(); state.pass = null; }
+    const enc = state.encoder;
+    const tileX = gx, tileY = gy;
+    const slotOff = (slot) => slot * 256;

-    // User bindings: walk byGroup in the same sorted order the C++ side
-    // packed handles[], picking up indices linearly.
-    if (handlesCount > 0) {
-        const handles = new Uint32Array(memU8().buffer, handlesPtr, handlesCount);
-        let handleIdx = 0;
-        let bglIdx = 0;
-        for (let g = 2; g <= (pipe.sortedGroups[pipe.sortedGroups.length - 1] || 1); g++) {
-            if (pipe.byGroup.has(g)) {
-                const entries = pipe.byGroup.get(g).map(b => {
-                    const h = handles[handleIdx++];
-                    let resource;
-                    if      (b.kind === 0) resource = { buffer: buffers.get(h) };
-                    else if (b.kind === 1) resource = textureViews.get(h);
-                    else if (b.kind === 2) resource = samplers.get(h);
-                    else if (b.kind === 3) resource = textureViews.get(h);
-                    return { binding: b.binding, resource };
-                });
-                const bg = device.createBindGroup({
-                    layout: pipe.userBgls[bglIdx],
-                    entries,
-                });
-                state.pass.setBindGroup(g, bg);
-            }
-            bglIdx++;
+    // Zero the two emit counters before GENERATE.
+    enc.clearBuffer(wf.counters, 0, 64);
+
+    const setUser = (pass) => { for (const u of userBgs) pass.setBindGroup(u.group, u.bindGroup); };
+
+    // GENERATE
+    {
+        const p = enc.beginComputePass({ label: "wf-generate" });
+        p.setPipeline(pipe.genPipe);
+        p.setBindGroup(0, paramsBg, [slotOff(0)]);
+        p.setBindGroup(1, dataBg);
+        setUser(p);
+        p.dispatchWorkgroups(tileX, tileY, 1);
+        p.end();
+    }
+    for (let d = 0; d < depth; d++) {
+        const prepSlot  = 1 + 3 * d + 0;
+        const traceSlot = 1 + 3 * d + 1;
+        const shadeSlot = 1 + 3 * d + 2;
+        // PREP — publish indirect args, zero next counter.
+        {
+            const p = enc.beginComputePass({ label: "wf-prep" });
+            p.setPipeline(pipe.prepPipe);
+            p.setBindGroup(0, paramsBg, [slotOff(prepSlot)]);
+            p.setBindGroup(1, dataBg);
+            p.setBindGroup(2, indirectBg);
+            p.dispatchWorkgroups(1, 1, 1);
+            p.end();
+        }
+        // TRACE — indirect over the live ray list.
+        {
+            const p = enc.beginComputePass({ label: "wf-trace" });
+            p.setPipeline(pipe.tracePipe);
+            p.setBindGroup(0, paramsBg, [slotOff(traceSlot)]);
+            p.setBindGroup(1, dataBg);
+            p.dispatchWorkgroupsIndirect(wf.indirect, 0);
+            p.end();
+        }
+        // SHADE — indirect; runs user closesthit/miss, may emit + accumulate.
+        {
+            const p = enc.beginComputePass({ label: "wf-shade" });
+            p.setPipeline(pipe.shadePipe);
+            p.setBindGroup(0, paramsBg, [slotOff(shadeSlot)]);
+            p.setBindGroup(1, dataBg);
+            setUser(p);
+            p.dispatchWorkgroupsIndirect(wf.indirect, 0);
+            p.end();
        }
    }
+    // RESOLVE — tonemap accum → output image.
+    {
+        const p = enc.beginComputePass({ label: "wf-resolve" });
+        p.setPipeline(pipe.resolvePipe);
+        p.setBindGroup(0, paramsBg, [slotOff(1 + 3 * depth)]);
+        p.setBindGroup(1, dataBg);
+        setUser(p);
+        p.dispatchWorkgroups(tileX, tileY, 1);
+        p.end();
+    }

-    state.pass.dispatchWorkgroups(gx, gy, 1);
+    // Reopen the frame's shared pass so wgpuFrameEnd / later UI work as
+    // before, and flip ping-pong so the blit picks the texture RESOLVE wrote.
+    state.pass = enc.beginComputePass();
    state.outIsPing = !state.outIsPing;
 };