WebGPU RT: wavefront tracer core (GENERATE/PREP/TRACE/SHADE/RESOLVE)
Replace the megakernel @compute entry with five wavefront kernels sharing one module, connected by GPU ray/hit/payload buffers and a GPU-driven indirect bounce loop:

    GENERATE -> (PREP -> TRACE -> SHADE) x maxDepth -> RESOLVE

- TRACE contains zero user code (pure _rtwTraverseTlas/Blas, opaque-only).
- PREP publishes dispatchWorkgroupsIndirect args from the live ray count; the indirect-args buffer lives in its own bind group so it is never bound read-write in the same dispatch that consumes it as INDIRECT.
- New emit/accumulate API: rtEmitPrimaryRay / rtEmitRay / rtAccumulate, plus an optional user Resolve stage (tonemap hook; identity by default).
- Per-pass WfParams via a dynamic-offset uniform ring (curIsA/bounce vary between passes within one submit).
- Payload-typed wfPayload binding emitted in the codegen region after the user's struct Payload; payload travels with each ray (2*W*H slots).
- Request maxBufferSize / maxStorageBufferBindingSize / maxComputeWorkgroupsPerDimension so the W*H-sized work buffers fit past the 128MB baseline.

VulkanTriangle ported to the new API and renders bit-identical to the megakernel baseline at maxDepth=1.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
e0d72f57f2
commit
4e42d663a6
9 changed files with 755 additions and 101 deletions
|
|
@ -78,13 +78,22 @@ void PipelineRTWebGPU::Init(WebGPUCommandEncoderRef /*cmd*/,
|
|||
// shaders by stage. Concatenating *all* non-raygen sources here lets
|
||||
// them declare shared helpers, `struct Payload`, etc., in any order.
|
||||
|
||||
wgsl += "// ── user closesthit / anyhit / miss sources ───────────────\n";
|
||||
wgsl += "// ── user closesthit / anyhit / miss / resolve sources ─────\n";
|
||||
for (const auto& shader : sbt.shaders) {
|
||||
if (shader.stage == WebGPURTStage::Raygen) continue;
|
||||
wgsl += shader.source;
|
||||
wgsl += "\n";
|
||||
}
|
||||
|
||||
// ── Payload-typed wavefront storage binding ────────────────────────
|
||||
//
|
||||
// Emitted *after* the user sources so it can name the user's `Payload`
|
||||
// type. Holds one Payload per in-flight ray slot across both ping/pong
|
||||
// ray buffers (capacity = 2·W·H). SHADE loads ray.payloadSlot here;
|
||||
// emit helpers (rtEmitPrimaryRay / rtEmitRay) store into it.
|
||||
wgsl += "\n@group(1) @binding(15) var<storage, read_write> "
|
||||
"wfPayload : array<Payload>;\n";
|
||||
|
||||
// ── Section 2: mega-switch dispatchers ─────────────────────────────
|
||||
//
|
||||
// runClosestHit, runAnyHit, runMiss each dispatch on the per-hit /
|
||||
|
|
@ -141,6 +150,24 @@ void PipelineRTWebGPU::Init(WebGPUCommandEncoderRef /*cmd*/,
|
|||
wgsl += " }\n";
|
||||
wgsl += "}\n";
|
||||
|
||||
// runResolve — RESOLVE-stage tonemap hook. The first registered
|
||||
// Resolve shader wins; with none, identity passthrough (alpha forced
|
||||
// to 1) so the wavefront output matches a megakernel that wrote raw
|
||||
// colors.
|
||||
std::string resolveEntryFn;
|
||||
for (const auto& shader : sbt.shaders) {
|
||||
if (shader.stage == WebGPURTStage::Resolve) { resolveEntryFn = shader.entryFn; break; }
|
||||
}
|
||||
wgsl += "\nfn runResolve(coord: vec2<u32>, hdr: vec4<f32>) -> vec4<f32> {\n";
|
||||
if (!resolveEntryFn.empty()) {
|
||||
wgsl += " return ";
|
||||
wgsl += resolveEntryFn;
|
||||
wgsl += "(coord, hdr);\n";
|
||||
} else {
|
||||
wgsl += " return vec4<f32>(hdr.rgb, 1.0);\n";
|
||||
}
|
||||
wgsl += "}\n";
|
||||
|
||||
// Marker — JS-side prelude/post-amble searches for this token to know
|
||||
// where the library helpers (traverseBlas/traverseTlas/traceRay) get
|
||||
// injected, followed by raygen sources and the @compute entry point.
|
||||
|
|
@ -173,17 +200,55 @@ void PipelineRTWebGPU::Init(WebGPUCommandEncoderRef /*cmd*/,
|
|||
return;
|
||||
}
|
||||
|
||||
// ── Section 4: @compute entry point ────────────────────────────────
|
||||
// ── Section 4: wavefront @compute entry points ─────────────────────
|
||||
//
|
||||
// 8x8 tile workgroup matching the rest of the WebGPU backend.
|
||||
// Five kernels share this one module; createComputePipeline selects
|
||||
// each by entryPoint name. GENERATE/RESOLVE are 8x8 screen tiles;
|
||||
// TRACE/SHADE are 64-wide 1-D over the compacted ray list (dispatched
|
||||
// indirectly from PREP); PREP is a single thread. The library helper
|
||||
// bodies (_rtwTraverseTlas, rtEmit*, rtAccumulate, _wfCurCount, …) are
|
||||
// injected JS-side at the marker above.
|
||||
|
||||
// GENERATE — one thread per pixel; clears the pixel's accumulator and
|
||||
// runs the user raygen, which calls rtEmitPrimaryRay.
|
||||
wgsl += "\n@compute @workgroup_size(8, 8, 1)\n";
|
||||
wgsl += "fn main(@builtin(global_invocation_id) gid: vec3<u32>) {\n";
|
||||
wgsl += "fn wfGenerate(@builtin(global_invocation_id) gid: vec3<u32>) {\n";
|
||||
wgsl += " if (gid.x >= wfParams.surfaceW || gid.y >= wfParams.surfaceH) { return; }\n";
|
||||
wgsl += " let pixel = gid.y * wfParams.surfaceW + gid.x;\n";
|
||||
wgsl += " wfAccum[pixel] = vec4<f32>(0.0, 0.0, 0.0, 0.0);\n";
|
||||
wgsl += " _wfPixel = pixel;\n";
|
||||
wgsl += " ";
|
||||
wgsl += raygenEntryFn;
|
||||
wgsl += "(gid);\n";
|
||||
wgsl += "}\n";
|
||||
|
||||
// PREP — single thread; reads the live ray count and publishes the
|
||||
// indirect dispatch args for the upcoming TRACE/SHADE, then zeroes the
|
||||
// next buffer's emit counter so SHADE starts compacting from 0.
|
||||
wgsl += "\n@compute @workgroup_size(1)\n";
|
||||
wgsl += "fn wfPrep() { _wfPrep(); }\n";
|
||||
|
||||
// TRACE — zero user code: pure traversal + intersection. One thread
|
||||
// per live ray; writes a HitResult into wfHits[i].
|
||||
wgsl += "\n@compute @workgroup_size(64)\n";
|
||||
wgsl += "fn wfTrace(@builtin(global_invocation_id) gid: vec3<u32>) { _wfTrace(gid.x); }\n";
|
||||
|
||||
// SHADE — one thread per live ray; loads the ray + its hit + payload,
|
||||
// dispatches to runMiss / runClosestHit, which may rtAccumulate and
|
||||
// rtEmitRay continuation/shadow rays into the next buffer.
|
||||
wgsl += "\n@compute @workgroup_size(64)\n";
|
||||
wgsl += "fn wfShade(@builtin(global_invocation_id) gid: vec3<u32>) { _wfShade(gid.x); }\n";
|
||||
|
||||
// RESOLVE — one thread per pixel; runs the user resolve (or identity)
|
||||
// over the linear accumulator and stores to the output image.
|
||||
wgsl += "\n@compute @workgroup_size(8, 8, 1)\n";
|
||||
wgsl += "fn wfResolve(@builtin(global_invocation_id) gid: vec3<u32>) {\n";
|
||||
wgsl += " if (gid.x >= wfParams.surfaceW || gid.y >= wfParams.surfaceH) { return; }\n";
|
||||
wgsl += " let pixel = gid.y * wfParams.surfaceW + gid.x;\n";
|
||||
wgsl += " let outc = runResolve(gid.xy, wfAccum[pixel]);\n";
|
||||
wgsl += " textureStore(outImage, vec2<i32>(i32(gid.x), i32(gid.y)), outc);\n";
|
||||
wgsl += "}\n";
|
||||
|
||||
pipelineHandle = WebGPU::wgpuLoadRTPipeline(
|
||||
wgsl.data(),
|
||||
static_cast<std::int32_t>(wgsl.size()),
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue