WebGPU RT: wavefront tracer core (GENERATE/PREP/TRACE/SHADE/RESOLVE)

Replace the megakernel @compute entry with five wavefront kernels sharing one module, connected by GPU ray/hit/payload buffers and a GPU-driven indirect bounce loop:

    GENERATE -> (PREP -> TRACE -> SHADE) x maxDepth -> RESOLVE

- TRACE contains zero user code (pure _rtwTraverseTlas/Blas, opaque-only).
- PREP publishes dispatchWorkgroupsIndirect args from the live ray count; the indirect-args buffer lives in its own bind group so it is never bound read-write in the same dispatch that consumes it as INDIRECT.
- New emit/accumulate API: rtEmitPrimaryRay / rtEmitRay / rtAccumulate, plus an optional user Resolve stage (tonemap hook; identity by default).
- Per-pass WfParams via a dynamic-offset uniform ring (curIsA/bounce vary between passes within one submit).
- Payload-typed wfPayload binding emitted in the codegen region after the user's struct Payload; payload travels with each ray (2*W*H slots).
- Request maxBufferSize / maxStorageBufferBindingSize / maxComputeWorkgroupsPerDimension so the W*H-sized work buffers fit past the 128MB baseline.

VulkanTriangle ported to the new API and renders bit-identical to the megakernel baseline at maxDepth=1.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-31 16:24:41 +00:00 · 2026-05-31 16:24:41 +00:00 · 4e42d663a6
commit 4e42d663a6
parent e0d72f57f2
9 changed files with 755 additions and 101 deletions
--- a/implementations/Crafter.Graphics-PipelineRTWebGPU.cpp
+++ b/implementations/Crafter.Graphics-PipelineRTWebGPU.cpp
@ -78,13 +78,22 @@ void PipelineRTWebGPU::Init(WebGPUCommandEncoderRef                 /*cmd*/,
    // shaders by stage. Concatenating *all* non-raygen sources here lets
    // them declare shared helpers, `struct Payload`, etc., in any order.

-    wgsl += "// ── user closesthit / anyhit / miss sources ───────────────\n";
+    wgsl += "// ── user closesthit / anyhit / miss / resolve sources ─────\n";
    for (const auto& shader : sbt.shaders) {
        if (shader.stage == WebGPURTStage::Raygen) continue;
        wgsl += shader.source;
        wgsl += "\n";
    }

+    // ── Payload-typed wavefront storage binding ────────────────────────
+    //
+    // Emitted *after* the user sources so it can name the user's `Payload`
+    // type. Holds one Payload per in-flight ray slot across both ping/pong
+    // ray buffers (capacity = 2·W·H). SHADE loads ray.payloadSlot here;
+    // emit helpers (rtEmitPrimaryRay / rtEmitRay) store into it.
+    wgsl += "\n@group(1) @binding(15) var<storage, read_write> "
+            "wfPayload : array<Payload>;\n";
+
    // ── Section 2: mega-switch dispatchers ─────────────────────────────
    //
    // runClosestHit, runAnyHit, runMiss each dispatch on the per-hit /
@ -141,6 +150,24 @@ void PipelineRTWebGPU::Init(WebGPUCommandEncoderRef                 /*cmd*/,
    wgsl += "    }\n";
    wgsl += "}\n";

+    // runResolve — RESOLVE-stage tonemap hook. The first registered
+    // Resolve shader wins; with none, identity passthrough (alpha forced
+    // to 1) so the wavefront output matches a megakernel that wrote raw
+    // colors.
+    std::string resolveEntryFn;
+    for (const auto& shader : sbt.shaders) {
+        if (shader.stage == WebGPURTStage::Resolve) { resolveEntryFn = shader.entryFn; break; }
+    }
+    wgsl += "\nfn runResolve(coord: vec2<u32>, hdr: vec4<f32>) -> vec4<f32> {\n";
+    if (!resolveEntryFn.empty()) {
+        wgsl += "    return ";
+        wgsl += resolveEntryFn;
+        wgsl += "(coord, hdr);\n";
+    } else {
+        wgsl += "    return vec4<f32>(hdr.rgb, 1.0);\n";
+    }
+    wgsl += "}\n";
+
    // Marker — JS-side prelude/post-amble searches for this token to know
    // where the library helpers (traverseBlas/traverseTlas/traceRay) get
    // injected, followed by raygen sources and the @compute entry point.
@ -173,17 +200,55 @@ void PipelineRTWebGPU::Init(WebGPUCommandEncoderRef                 /*cmd*/,
        return;
    }

-    // ── Section 4: @compute entry point ────────────────────────────────
+    // ── Section 4: wavefront @compute entry points ─────────────────────
    //
-    // 8x8 tile workgroup matching the rest of the WebGPU backend.
+    // Five kernels share this one module; createComputePipeline selects
+    // each by entryPoint name. GENERATE/RESOLVE are 8x8 screen tiles;
+    // TRACE/SHADE are 64-wide 1-D over the compacted ray list (dispatched
+    // indirectly from PREP); PREP is a single thread. The library helper
+    // bodies (_rtwTraverseTlas, rtEmit*, rtAccumulate, _wfCurCount, …) are
+    // injected JS-side at the marker above.

+    // GENERATE — one thread per pixel; clears the pixel's accumulator and
+    // runs the user raygen, which calls rtEmitPrimaryRay.
    wgsl += "\n@compute @workgroup_size(8, 8, 1)\n";
-    wgsl += "fn main(@builtin(global_invocation_id) gid: vec3<u32>) {\n";
+    wgsl += "fn wfGenerate(@builtin(global_invocation_id) gid: vec3<u32>) {\n";
+    wgsl += "    if (gid.x >= wfParams.surfaceW || gid.y >= wfParams.surfaceH) { return; }\n";
+    wgsl += "    let pixel = gid.y * wfParams.surfaceW + gid.x;\n";
+    wgsl += "    wfAccum[pixel] = vec4<f32>(0.0, 0.0, 0.0, 0.0);\n";
+    wgsl += "    _wfPixel = pixel;\n";
    wgsl += "    ";
    wgsl += raygenEntryFn;
    wgsl += "(gid);\n";
    wgsl += "}\n";

+    // PREP — single thread; reads the live ray count and publishes the
+    // indirect dispatch args for the upcoming TRACE/SHADE, then zeroes the
+    // next buffer's emit counter so SHADE starts compacting from 0.
+    wgsl += "\n@compute @workgroup_size(1)\n";
+    wgsl += "fn wfPrep() { _wfPrep(); }\n";
+
+    // TRACE — zero user code: pure traversal + intersection. One thread
+    // per live ray; writes a HitResult into wfHits[i].
+    wgsl += "\n@compute @workgroup_size(64)\n";
+    wgsl += "fn wfTrace(@builtin(global_invocation_id) gid: vec3<u32>) { _wfTrace(gid.x); }\n";
+
+    // SHADE — one thread per live ray; loads the ray + its hit + payload,
+    // dispatches to runMiss / runClosestHit, which may rtAccumulate and
+    // rtEmitRay continuation/shadow rays into the next buffer.
+    wgsl += "\n@compute @workgroup_size(64)\n";
+    wgsl += "fn wfShade(@builtin(global_invocation_id) gid: vec3<u32>) { _wfShade(gid.x); }\n";
+
+    // RESOLVE — one thread per pixel; runs the user resolve (or identity)
+    // over the linear accumulator and stores to the output image.
+    wgsl += "\n@compute @workgroup_size(8, 8, 1)\n";
+    wgsl += "fn wfResolve(@builtin(global_invocation_id) gid: vec3<u32>) {\n";
+    wgsl += "    if (gid.x >= wfParams.surfaceW || gid.y >= wfParams.surfaceH) { return; }\n";
+    wgsl += "    let pixel = gid.y * wfParams.surfaceW + gid.x;\n";
+    wgsl += "    let outc = runResolve(gid.xy, wfAccum[pixel]);\n";
+    wgsl += "    textureStore(outImage, vec2<i32>(i32(gid.x), i32(gid.y)), outc);\n";
+    wgsl += "}\n";
+
    pipelineHandle = WebGPU::wgpuLoadRTPipeline(
        wgsl.data(),
        static_cast<std::int32_t>(wgsl.size()),