From 4e42d663a6179a179196ef3a8f46474a7df9bc7a Mon Sep 17 00:00:00 2001 From: catbot Date: Sun, 31 May 2026 16:24:41 +0000 Subject: [PATCH] WebGPU RT: wavefront tracer core (GENERATE/PREP/TRACE/SHADE/RESOLVE) Replace the megakernel @compute entry with five wavefront kernels sharing one module, connected by GPU ray/hit/payload buffers and a GPU-driven indirect bounce loop: GENERATE -> (PREP -> TRACE -> SHADE) x maxDepth -> RESOLVE - TRACE contains zero user code (pure _rtwTraverseTlas/Blas, opaque-only). - PREP publishes dispatchWorkgroupsIndirect args from the live ray count; the indirect-args buffer lives in its own bind group so it is never bound read-write in the same dispatch that consumes it as INDIRECT. - New emit/accumulate API: rtEmitPrimaryRay / rtEmitRay / rtAccumulate, plus an optional user Resolve stage (tonemap hook; identity by default). - Per-pass WfParams via a dynamic-offset uniform ring (curIsA/bounce vary between passes within one submit). - Payload-typed wfPayload binding emitted in the codegen region after the user's struct Payload; payload travels with each ray (2*W*H slots). - Request maxBufferSize / maxStorageBufferBindingSize / maxComputeWorkgroups PerDimension so the W*H-sized work buffers fit past the 128MB baseline. VulkanTriangle ported to the new API and renders bit-identical to the megakernel baseline at maxDepth=1. Co-Authored-By: Claude Opus 4.8 --- WAVEFRONT-DESIGN.md | 69 ++ additional/dom-webgpu.js | 653 ++++++++++++++++-- examples/VulkanTriangle/closesthit.wgsl | 11 +- examples/VulkanTriangle/miss.wgsl | 5 +- examples/VulkanTriangle/raygen.wgsl | 25 +- .../Crafter.Graphics-PipelineRTWebGPU.cpp | 73 +- interfaces/Crafter.Graphics-RTPass.cppm | 8 +- ...ter.Graphics-ShaderBindingTableWebGPU.cppm | 5 + interfaces/Crafter.Graphics-WebGPU.cppm | 3 +- 9 files changed, 753 insertions(+), 99 deletions(-) create mode 100644 WAVEFRONT-DESIGN.md diff --git a/WAVEFRONT-DESIGN.md b/WAVEFRONT-DESIGN.md new file mode 100644 index 0000000..78d77e2 --- /dev/null +++ b/WAVEFRONT-DESIGN.md @@ -0,0 +1,69 @@ +# WebGPU wavefront RT rewrite — design & progress (issue #3) + +Replaces the single megakernel (`main`, 8×8 tile, per-pixel +raygen→traceRay→CH/miss→store) with a streaming wavefront tracer: +`GENERATE → PREP → (TRACE → SHADE → PREP)×maxDepth → RESOLVE`, each its own +compute pass, dispatch sizes driven by `dispatchWorkgroupsIndirect`. + +## Kernels (all generated/assembled the same megakernel way, just split) +- **GENERATE** (1 thread/pixel, 8×8): runs user `raygen_main(gid)` which calls + `rtEmitPrimaryRay(...)`. Clears accum slot + payload slot for the pixel. +- **PREP** (1 thread): reads emit counter for the just-filled ray buffer, + writes indirect args `[ceil(n/64),1,1]`, publishes `traceCount=n`, swaps + cur/next ray buffer, resets next emit counter. One PREP before first TRACE + and one after each SHADE. +- **TRACE** (1 thread/ray, 64-wide, indirect): ZERO user code. Reads ray i, + runs `_rtTraverseTlas`, writes `HitResult` i (t/instanceId/primId/hg/attribs + /objToWorld/customIndex/missFlag). +- **SHADE** (1 thread/ray, 64-wide, indirect): reads ray i + hit i + payload + slot p. miss→`runMiss`, hit→`runClosestHit` (unless SKIP_CLOSEST_HIT). User + code calls `rtAccumulate(pixel,rgb)` and `rtEmitRay(...)`. +- **RESOLVE** (1 thread/pixel, 8×8): reads accum slot, runs user `resolve_main` + if present else passthrough; writes outImage. + +## Buffers (rtState, sized to 2*W*H rays) +- `wfRaysA`,`wfRaysB`: array, ping/pong. WfRay = origin,tMin,dir,tMax, + pixel,flags,cullMask,missIndex,sbtOffset,payloadSlot,kind,_pad. +- `wfHits`: array (sized = ray capacity). +- `wfPayload`: array — declared in CODEGEN region after user Payload. +- `wfAccum`: array> per pixel (W*H). +- `wfCounters`: atomic counters: emitA, emitB, trace dispatch args, etc. +- `wfIndirect`: INDIRECT dispatch-args buffer. + +## API (new, breaking) +- raygen: `rtEmitPrimaryRay(origin,tMin,dir,tMax,flags,cullMask,sbtOff,missIdx)` + → allocates payloadSlot=pixel, writes ray to current buffer (atomic bump). +- CH/miss: `rtEmitRay(origin,tMin,dir,tMax,flags,cullMask,sbtOff,missIdx,payload)` + spawns into NEXT buffer carrying a payload slot; `rtAccumulate(pixel,rgb)`. +- `rtGetPayload(slot)` / payload passed by value into CH/miss via slot. + +## Tonemap / resolve +Accum buffer is linear. Optional user `WebGPURTStage::Resolve` entry +`resolve_main(coord:vec2, hdr:vec4)->vec4`. None → passthrough. +VulkanTriangle: no resolve (exact match). Sponza: resolve does Reinhard+gamma. + +## Indirect dispatch (Phase 2 de-risk) +Prove `dispatchWorkgroupsIndirect` + cross-pass atomic visibility with a toy +"emit N → dispatch N" before wiring real kernels. WebGPU inserts an implicit +barrier between compute passes in one submit, so atomics written in PREP are +visible to TRACE. + +## maxDepth +Compile/runtime knob. JS unrolls the chain to maxDepth. VulkanTriangle +maxDepth=1 (primary only). Sponza maxDepth=2 (primary + shadow). + +## Status / progress +- [x] baseline VulkanTriangle renders (megakernel) — /tmp/baseline-triangle.png +- [ ] wavefront prelude + codegen +- [ ] VulkanTriangle on wavefront (maxDepth=1) +- [ ] bounce loop + indirect + Sponza shadow port +- [ ] RTStress example + timestamp queries +- [ ] ordered traversal, dynamic TLAS depth, device limits +- [ ] remove megakernel dual path; final validation; PR + +## Files +- `additional/dom-webgpu.js` — prelude (`rtWgsl*`), `wgpuLoadRTPipeline`, + `wgpuDispatchRT`, LBVH build, rtState/buffers, device-limit clamp (~L131). +- `implementations/Crafter.Graphics-PipelineRTWebGPU.cpp` — assembles user + WGSL + entry glue; must emit 5 entry points + payloadStore binding. +- examples/{VulkanTriangle,Sponza,RTStress}/*.wgsl + main.cpp. diff --git a/additional/dom-webgpu.js b/additional/dom-webgpu.js index e035d97..9380517 100644 --- a/additional/dom-webgpu.js +++ b/additional/dom-webgpu.js @@ -136,6 +136,16 @@ clamp("maxStorageTexturesPerShaderStage", 8); // per-workgroup invocation cap raised from the default 256. clamp("maxComputeInvocationsPerWorkgroup", 1024); clamp("maxComputeWorkgroupSizeX", 1024); +// Wavefront RT work buffers are sized to W·H rays. At 1080p the payload +// store (≈245 MB) and hit buffer (≈214 MB) blow past the 128 MB baseline +// storage-buffer binding size, and the whole set past the 256 MB baseline +// maxBufferSize — request whatever the adapter actually allows (4090/Dawn +// reports 1 GB+). maxComputeWorkgroupsPerDimension bounds the indirect +// TRACE/SHADE 1-D dispatch (ceil(W·H/64) ≈ 32k workgroups at 1080p; the +// 65535 default covers it, but request the adapter max for headroom). +clamp("maxBufferSize", 1 << 30); +clamp("maxStorageBufferBindingSize", 1 << 30); +clamp("maxComputeWorkgroupsPerDimension", 65535); const device = await adapter.requestDevice({ requiredLimits }); const queue = device.queue; const ctx = canvas.getContext("webgpu"); @@ -147,6 +157,9 @@ device.lost.then((info) => { console.error("[crafter-wgpu] device lost:", info.message); state.gpuLost = true; }); +device.addEventListener("uncapturederror", (e) => { + console.error("[crafter-wgpu] uncaptured error:", e.error && e.error.message); +}); // ─── handle tables ───────────────────────────────────────────────────── @@ -1671,6 +1684,350 @@ fn traceRay(tlasIdx: u32, flags: u32, cullMask: u32, } `; +// ════════════════════════════════════════════════════════════════════════ +// WAVEFRONT RT — streaming tracer (GENERATE → PREP → TRACE → SHADE → +// RESOLVE). Replaces the megakernel. The C++ side (PipelineRTWebGPU) emits +// the user sources, the per-stage SBT switches, the Payload-typed wfPayload +// binding, and the five @compute entry points; this JS injects the bindings +// + library helpers the entry points call. +// ════════════════════════════════════════════════════════════════════════ + +// Bindings prelude for the wavefront pipeline. group(0) is the per-pass +// WfParams uniform (dynamic-offset ring — one slot per pass so curIsA / +// bounce can vary between passes inside one submit). group(1) carries the +// geometry heaps (0..9, identical to the old megakernel layout so the +// register/build paths are unchanged) plus the wavefront work buffers +// (10..14); wfPayload at binding 15 is emitted in the codegen region after +// the user's `struct Payload`. group(2) is the indirect-args buffer, bound +// only by PREP (a buffer used as INDIRECT in a dispatch may not also be +// bound read-write in that same dispatch — so TRACE/SHADE must not bind it). +const rtWgslWavefrontBindings = String.raw` +struct WfParams { + surfaceW: u32, + surfaceH: u32, + rayCapacity: u32, + curIsA: u32, // 1 → current ray buffer is A (emit-next = B); 0 → B + bounce: u32, + maxDepth: u32, + tlasNPadded: u32, // TLAS sweep-tree padded leaf count (descent depth) + flags: u32, +}; + +// One in-flight ray. 64 bytes; origin/direction vec3-aligned to 16. +struct WfRay { + origin: vec3, + tMin: f32, + direction: vec3, + tMax: f32, + pixel: u32, // linear framebuffer pixel this ray contributes to + flags: u32, + cullMask: u32, + missIndex: u32, + sbtRecordOffset: u32, + payloadSlot: u32, // index into wfPayload + kind: u32, // 0 primary, 1 continuation (informational) + _pad: u32, +}; + +// TRACE → SHADE handoff. Mirrors HitInfo + a hitKind (0 miss, 1 triangle). +struct HitResult { + t: f32, + instanceId: u32, + primitiveId: u32, + hitGroupIndex: u32, + attribs: vec2, + hitKind: u32, + customIndex: u32, + objectRayOrigin: vec3, + _p0: f32, + objectRayDirection: vec3, + _p1: f32, + objectToWorldR0: vec4, + objectToWorldR1: vec4, + objectToWorldR2: vec4, +}; + +struct BvhNode { + aabbMin: vec3, + _pad0: u32, + aabbMax: vec3, + _pad1: u32, +}; + +@group(0) @binding(0) var wfParams : WfParams; + +@group(1) @binding(0) var tlasEntries : array; +@group(1) @binding(1) var bvhNodes : array; +@group(1) @binding(2) var meshRecords : array; +@group(1) @binding(3) var vertices : array; +@group(1) @binding(4) var indices : array; +@group(1) @binding(5) var primRemap : array; +@group(1) @binding(6) var outImage : texture_storage_2d; +@group(1) @binding(7) var vertexAttribs : array; +@group(1) @binding(8) var tlasEntryOrder : array; +@group(1) @binding(9) var tlasBvhNodes : array; +@group(1) @binding(10) var wfRaysA : array; +@group(1) @binding(11) var wfRaysB : array; +@group(1) @binding(12) var wfHits : array; +@group(1) @binding(13) var wfAccum : array>; +@group(1) @binding(14) var wfCounters : array>; +// @group(1) @binding(15) wfPayload : array — emitted by codegen. + +@group(2) @binding(0) var wfIndirect : array; +`; + +// Library helpers the codegen entry points call. Sits after the pure +// helpers (_rtAabb/_rtTri/_rtFetchTri) and after the user's Payload + +// wfPayload binding, so rtEmit*/_wfShade can name Payload/wfPayload. +const rtWgslWavefrontHelpers = String.raw` +var _wfPixel: u32 = 0u; + +// Live ray count for the current buffer, clamped to capacity (the emit +// counter can overshoot capacity; dropped rays were never written). +fn _wfCurCount() -> u32 { + let raw = select(atomicLoad(&wfCounters[1]), atomicLoad(&wfCounters[0]), + wfParams.curIsA == 1u); + return min(raw, wfParams.rayCapacity); +} + +// Add linear radiance to the pixel this SHADE/GENERATE thread owns. Safe +// without atomics: at most one ray per pixel per bounce, and bounces run +// in separate passes (implicit barrier between them). +fn rtAccumulate(rgb: vec3) { + wfAccum[_wfPixel] = wfAccum[_wfPixel] + vec4(rgb, 0.0); +} + +// raygen → emit the pixel's primary ray. Bounce 0's current buffer is +// always A, so primaries land in A with their payload in the A region +// [0, rayCapacity). +fn rtEmitPrimaryRay(origin: vec3, tMin: f32, dir: vec3, tMax: f32, + flags: u32, cullMask: u32, sbtRecordOffset: u32, + missIndex: u32, payload: Payload) { + let slot = atomicAdd(&wfCounters[0], 1u); + if (slot >= wfParams.rayCapacity) { return; } + var r: WfRay; + r.origin = origin; r.tMin = tMin; r.direction = dir; r.tMax = tMax; + r.pixel = _wfPixel; r.flags = flags; r.cullMask = cullMask; + r.missIndex = missIndex; r.sbtRecordOffset = sbtRecordOffset; + r.payloadSlot = slot; r.kind = 0u; + wfRaysA[slot] = r; + wfPayload[slot] = payload; +} + +// closesthit/miss → spawn a continuation/shadow ray into the NEXT buffer +// (the one the upcoming TRACE will read). Payload travels with it; the +// next buffer's payload region is [rayCapacity, 2*rayCapacity) for B. +fn rtEmitRay(origin: vec3, tMin: f32, dir: vec3, tMax: f32, + flags: u32, cullMask: u32, sbtRecordOffset: u32, + missIndex: u32, payload: Payload) { + let nextIsA = wfParams.curIsA == 0u; + let counterIdx = select(1u, 0u, nextIsA); + let slot = atomicAdd(&wfCounters[counterIdx], 1u); + if (slot >= wfParams.rayCapacity) { return; } + let payloadBase = select(wfParams.rayCapacity, 0u, nextIsA); + var r: WfRay; + r.origin = origin; r.tMin = tMin; r.direction = dir; r.tMax = tMax; + r.pixel = _wfPixel; r.flags = flags; r.cullMask = cullMask; + r.missIndex = missIndex; r.sbtRecordOffset = sbtRecordOffset; + r.payloadSlot = payloadBase + slot; r.kind = 1u; + if (nextIsA) { wfRaysA[slot] = r; } else { wfRaysB[slot] = r; } + wfPayload[r.payloadSlot] = payload; +} + +// Opaque-only BLAS descent (no anyhit — TRACE runs zero user code). +fn _rtwTraverseBlas(rayObj: RayDesc, flags: u32, meshRec: MeshRecord, + instanceId: u32, hitGroupBase: u32, + bestHit: ptr, + bestT: ptr) -> bool { + let invD = vec3(1.0) / rayObj.direction; + var stack: array; + var sp: u32 = 0u; + var nodeRel: u32 = 0u; + loop { + let absI = meshRec.bvhOffset + nodeRel; + let node = bvhNodes[absI]; + if (!_rtAabb(rayObj.origin, invD, node.aabbMin, node.aabbMax, *bestT)) { + if (sp == 0u) { break; } + sp = sp - 1u; nodeRel = stack[sp]; continue; + } + if (node.primCount > 0u) { + for (var i: u32 = 0u; i < node.primCount; i = i + 1u) { + let triIndex = primRemap[meshRec.primRemapOffset + node.firstChildOrPrim + i]; + let verts = _rtFetchTri(meshRec, triIndex); + let tr = _rtTri(rayObj.origin, rayObj.direction, + verts[0], verts[1], verts[2], rayObj.tMin, *bestT); + if (!tr.hit) { continue; } + let geomNormal = cross(verts[1] - verts[0], verts[2] - verts[0]); + let facing = dot(geomNormal, rayObj.direction); + if ((flags & RT_FLAG_CULL_BACK_FACING_TRIANGLES) != 0u && facing > 0.0) { continue; } + if ((flags & RT_FLAG_CULL_FRONT_FACING_TRIANGLES) != 0u && facing < 0.0) { continue; } + var candidate: HitInfo; + candidate.t = tr.t; + candidate.instanceId = instanceId; + candidate.primitiveId = triIndex; + candidate.hitGroupIndex = hitGroupBase; + candidate.attribs = vec2(tr.u, tr.v); + candidate.objectRayOrigin = rayObj.origin; + candidate.objectRayDirection = rayObj.direction; + *bestHit = candidate; + *bestT = tr.t; + if ((flags & RT_FLAG_TERMINATE_ON_FIRST_HIT) != 0u) { return true; } + } + if (sp == 0u) { break; } + sp = sp - 1u; nodeRel = stack[sp]; continue; + } + let left = node.firstChildOrPrim; + let right = left + 1u; + if (sp < 32u) { stack[sp] = right; sp = sp + 1u; } + nodeRel = left; + } + return false; +} + +fn _rtwTraverseTlas(rayWorld: RayDesc, flags: u32, cullMask: u32, + sbtRecordOffset: u32, + bestHit: ptr, + bestT: ptr) -> bool { + let invD = vec3(1.0) / rayWorld.direction; + let leavesStart = wfParams.tlasNPadded - 1u; + var stack: array; + var sp: u32 = 0u; + stack[sp] = 0u; sp = sp + 1u; + loop { + if (sp == 0u) { break; } + sp = sp - 1u; + let nodeIdx = stack[sp]; + let node = tlasBvhNodes[nodeIdx]; + if (!_rtAabb(rayWorld.origin, invD, node.aabbMin, node.aabbMax, *bestT)) { continue; } + if (nodeIdx >= leavesStart) { + let leafIdx = nodeIdx - leavesStart; + let i = tlasEntryOrder[leafIdx]; + if (i == 0xFFFFFFFFu) { continue; } + let inst = tlasEntries[i]; + let instanceMask = inst.maskHGOffset & 0xFFu; + if ((instanceMask & cullMask) == 0u) { continue; } + if (!_rtAabb(rayWorld.origin, invD, inst.aabbMin, inst.aabbMax, *bestT)) { continue; } + let r0 = inst.worldToObjectR0; + let r1 = inst.worldToObjectR1; + let r2 = inst.worldToObjectR2; + var rayObj: RayDesc; + rayObj.origin = vec3( + dot(r0.xyz, rayWorld.origin) + r0.w, + dot(r1.xyz, rayWorld.origin) + r1.w, + dot(r2.xyz, rayWorld.origin) + r2.w); + rayObj.direction = vec3( + dot(r0.xyz, rayWorld.direction), + dot(r1.xyz, rayWorld.direction), + dot(r2.xyz, rayWorld.direction)); + rayObj.tMin = rayWorld.tMin; + rayObj.tMax = *bestT; + var effective = flags; + let iflags = inst.instanceFlags; + if ((iflags & RT_INSTANCE_TRIANGLE_FACING_CULL_DISABLE) != 0u) { + effective = effective & ~(RT_FLAG_CULL_BACK_FACING_TRIANGLES | RT_FLAG_CULL_FRONT_FACING_TRIANGLES); + } + let hitGroupOffset = inst.maskHGOffset >> 8u; + let hitGroupBase = sbtRecordOffset + hitGroupOffset; + let meshRec = meshRecords[inst.blasMeshIdx]; + let pre = *bestT; + let endSearch = _rtwTraverseBlas(rayObj, effective, meshRec, i, hitGroupBase, bestHit, bestT); + if ((*bestT) < pre || endSearch) { + (*bestHit).objectToWorldR0 = inst.objectToWorldR0; + (*bestHit).objectToWorldR1 = inst.objectToWorldR1; + (*bestHit).objectToWorldR2 = inst.objectToWorldR2; + (*bestHit).customIndex = inst.customIndex; + } + if (endSearch) { return true; } + if ((*bestT) < pre && (effective & RT_FLAG_TERMINATE_ON_FIRST_HIT) != 0u) { return true; } + } else { + let left = 2u * nodeIdx + 1u; + let right = 2u * nodeIdx + 2u; + if (sp + 1u < 32u) { + stack[sp] = right; sp = sp + 1u; + stack[sp] = left; sp = sp + 1u; + } + } + } + return false; +} + +fn _wfReadRay(i: u32) -> WfRay { + if (wfParams.curIsA == 1u) { return wfRaysA[i]; } + return wfRaysB[i]; +} + +// PREP — publish indirect args for the upcoming TRACE/SHADE; zero the next +// buffer's emit counter. +fn _wfPrep() { + let n = _wfCurCount(); + wfIndirect[0] = (n + 63u) / 64u; + wfIndirect[1] = 1u; + wfIndirect[2] = 1u; + if (wfParams.curIsA == 1u) { atomicStore(&wfCounters[1], 0u); } + else { atomicStore(&wfCounters[0], 0u); } +} + +// TRACE — pure traversal, zero user code. +fn _wfTrace(i: u32) { + if (i >= _wfCurCount()) { return; } + let ray = _wfReadRay(i); + var rd: RayDesc; + rd.origin = ray.origin; rd.tMin = ray.tMin; + rd.direction = ray.direction; rd.tMax = ray.tMax; + var bestHit: HitInfo; + bestHit.t = ray.tMax; + var bestT = ray.tMax; + _rtwTraverseTlas(rd, ray.flags, ray.cullMask & 0xFFu, ray.sbtRecordOffset, &bestHit, &bestT); + var hr: HitResult; + if (bestT < ray.tMax) { + hr.t = bestHit.t; + hr.instanceId = bestHit.instanceId; + hr.primitiveId = bestHit.primitiveId; + hr.hitGroupIndex = bestHit.hitGroupIndex; + hr.attribs = bestHit.attribs; + hr.hitKind = 1u; + hr.customIndex = bestHit.customIndex; + hr.objectRayOrigin = bestHit.objectRayOrigin; + hr.objectRayDirection = bestHit.objectRayDirection; + hr.objectToWorldR0 = bestHit.objectToWorldR0; + hr.objectToWorldR1 = bestHit.objectToWorldR1; + hr.objectToWorldR2 = bestHit.objectToWorldR2; + } else { + hr.hitKind = 0u; + } + wfHits[i] = hr; +} + +// SHADE — dispatch to runMiss / runClosestHit with the ray's payload. +fn _wfShade(i: u32) { + if (i >= _wfCurCount()) { return; } + let ray = _wfReadRay(i); + let hr = wfHits[i]; + _wfPixel = ray.pixel; + var payload: Payload = wfPayload[ray.payloadSlot]; + var rd: RayDesc; + rd.origin = ray.origin; rd.tMin = ray.tMin; + rd.direction = ray.direction; rd.tMax = ray.tMax; + if (hr.hitKind == 0u) { + runMiss(ray.missIndex, rd, &payload); + } else if ((ray.flags & RT_FLAG_SKIP_CLOSEST_HIT) == 0u) { + var hit: HitInfo; + hit.t = hr.t; + hit.instanceId = hr.instanceId; + hit.primitiveId = hr.primitiveId; + hit.hitGroupIndex = hr.hitGroupIndex; + hit.attribs = hr.attribs; + hit.customIndex = hr.customIndex; + hit.objectRayOrigin = hr.objectRayOrigin; + hit.objectRayDirection = hr.objectRayDirection; + hit.objectToWorldR0 = hr.objectToWorldR0; + hit.objectToWorldR1 = hr.objectToWorldR1; + hit.objectToWorldR2 = hr.objectToWorldR2; + runClosestHit(hr.hitGroupIndex, rd, hit, &payload); + } +} +`; + // ── WGSL library: rayQuery API for non-megakernel compute shaders ──── // // Mirrors GL_EXT_ray_query semantics that 3DForts's physics shaders use @@ -2564,6 +2921,40 @@ env.wgpuBuildTLAS = (instanceBufHandle, instanceCount, tlasOutBufHandle, // textures, samplers). Pass (0, 0) for a pipeline with no user bindings. const rtPipelines = new Map(); // handle → { pipeline, bgls, byGroup, sortedGroups } +// Per-payload byte budget in wfPayload (rounded up; user Payload structs +// must fit). Sponza's Payload is 48 B; 64 leaves headroom while keeping +// 2·W·H·64 B ≈ 265 MB at 1080p. +const WF_PAYLOAD_BYTES = 64; +// Dynamic-offset uniform ring: one WfParams slot per wavefront pass. 128 +// slots covers maxDepth up to ~42 (1 + 3·maxDepth + 1 passes). +const WF_PARAM_SLOTS = 128; +const WF_FIXED_TLAS_NPADDED = 16384; // matches lbvhBuildWgsl N_PADDED + +function ensureWavefrontBuffers(W, H) { + const cap = W * H; + rtState.wf = rtState.wf || { cap: 0 }; + const wf = rtState.wf; + if (wf.cap === cap && wf.raysA) return wf; + for (const b of [wf.raysA, wf.raysB, wf.hits, wf.accum, wf.counters, + wf.payload, wf.indirect]) { if (b) b.destroy(); } + const S = GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST; + wf.raysA = device.createBuffer({ size: cap * 64, usage: S, label: "wf-raysA" }); + wf.raysB = device.createBuffer({ size: cap * 64, usage: S, label: "wf-raysB" }); + wf.hits = device.createBuffer({ size: cap * 112, usage: S, label: "wf-hits" }); + wf.accum = device.createBuffer({ size: cap * 16, usage: S, label: "wf-accum" }); + wf.payload = device.createBuffer({ size: 2 * cap * WF_PAYLOAD_BYTES, usage: S, label: "wf-payload" }); + wf.counters = device.createBuffer({ size: 64, + usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC, label: "wf-counters" }); + wf.indirect = device.createBuffer({ size: 16, + usage: GPUBufferUsage.STORAGE | GPUBufferUsage.INDIRECT | GPUBufferUsage.COPY_DST, label: "wf-indirect" }); + if (!wf.paramsRing) { + wf.paramsRing = device.createBuffer({ size: WF_PARAM_SLOTS * 256, + usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST, label: "wf-params" }); + } + wf.cap = cap; + return wf; +} + env.wgpuLoadRTPipeline = (wgslPtr, wgslLen, bindingsPtr, bindingsCount) => { if (!rtState.vertHeap) rtInit(); const userPart = new TextDecoder().decode(memU8().subarray(wgslPtr, wgslPtr + wgslLen)); @@ -2577,16 +2968,23 @@ env.wgpuLoadRTPipeline = (wgslPtr, wgslLen, bindingsPtr, bindingsCount) => { beforeHelpers = userPart.substring(0, mi); afterHelpers = userPart.substring(mi + marker.length); } - const fullWgsl = rtWgslPrelude + "\n" + beforeHelpers + "\n" + rtWgslPureHelpers + "\n" + rtWgslMegakernelHelpers + "\n" + afterHelpers; + // Wavefront assembly: types + bindings | user CH/miss/resolve + wfPayload + // + switches (beforeHelpers) | pure helpers | wavefront helpers | user + // raygen + the five @compute entry points (afterHelpers). + const fullWgsl = rtWgslTypes + rtWgslWavefrontBindings + "\n" + + beforeHelpers + "\n" + rtWgslPureHelpers + "\n" + + rtWgslWavefrontHelpers + "\n" + afterHelpers; - // Parse user bindings (same wire format as wgpuLoadCustomShader). + // Parse user bindings (same wire format as wgpuLoadCustomShader). For + // the wavefront RT pipeline, group 0 = WfParams, group 1 = data heaps, + // group 2 = indirect args — so user bindings must start at group 3. const userBindings = []; if (bindingsCount > 0) { const dv = new DataView(memU8().buffer, bindingsPtr, bindingsCount * 8); for (let i = 0; i < bindingsCount; i++) { const g = dv.getUint8(i*8 + 0); - if (g < 2) { - console.error(`[crafter-wgpu] RT pipeline: @group(${g}) reserved; user bindings need group >= 2`); + if (g < 3) { + console.error(`[crafter-wgpu] RT pipeline: @group(${g}) reserved; user bindings need group >= 3`); return 0; } userBindings.push({ @@ -2604,33 +3002,28 @@ env.wgpuLoadRTPipeline = (wgslPtr, wgslLen, bindingsPtr, bindingsCount) => { } const sortedGroups = [...byGroup.keys()].sort((a, b) => a - b); - let pipeline; try { - const mod = device.createShaderModule({ code: fullWgsl, label: "rt-megakernel" }); - // RTDispatchHeader is 16 bytes; bind exactly that. - const headerBgl = device.createBindGroupLayout({ entries: [ + const mod = device.createShaderModule({ code: fullWgsl, label: "rt-wavefront" }); + const paramsBgl = device.createBindGroupLayout({ entries: [ { binding: 0, visibility: GPUShaderStage.COMPUTE, - buffer: { type: "uniform", minBindingSize: 16 } }, + buffer: { type: "uniform", hasDynamicOffset: true, minBindingSize: 32 } }, ]}); + const sb = (b) => ({ binding: b, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } }); + const rw = (b) => ({ binding: b, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } }); const dataBgl = device.createBindGroupLayout({ entries: [ - { binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } }, - { binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } }, - { binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } }, - { binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } }, - { binding: 4, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } }, - { binding: 5, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } }, + sb(0), sb(1), sb(2), sb(3), sb(4), sb(5), { binding: 6, visibility: GPUShaderStage.COMPUTE, storageTexture: { format: "rgba8unorm", access: "write-only", viewDimension: "2d" } }, - { binding: 7, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } }, - { binding: 8, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } }, - { binding: 9, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } }, + sb(7), sb(8), sb(9), + rw(10), rw(11), rw(12), rw(13), rw(14), rw(15), ]}); - // User binding-group layouts. WebGPU pipeline layouts need a - // contiguous array up to the highest group used, so pad any gaps - // with empty bgls (same rule as wgpuLoadCustomShader). + const indirectBgl = device.createBindGroupLayout({ entries: [ rw(0) ]}); + const emptyBgl = device.createBindGroupLayout({ entries: [] }); + + // User binding-group layouts for groups 3..highest (pad gaps). const userBgls = []; - const highest = sortedGroups.length ? sortedGroups[sortedGroups.length - 1] : 1; - for (let g = 2; g <= highest; g++) { + const highest = sortedGroups.length ? sortedGroups[sortedGroups.length - 1] : 2; + for (let g = 3; g <= highest; g++) { if (byGroup.has(g)) { const entries = byGroup.get(g).map(b => { const e = { binding: b.binding, visibility: GPUShaderStage.COMPUTE }; @@ -2638,6 +3031,7 @@ env.wgpuLoadRTPipeline = (wgslPtr, wgslLen, bindingsPtr, bindingsCount) => { else if (b.kind === 1) e.texture = { sampleType: "float", viewDimension: "2d" }; else if (b.kind === 2) e.sampler = { type: "filtering" }; else if (b.kind === 3) e.texture = { sampleType: "float", viewDimension: "2d-array" }; + else if (b.kind === 4) e.buffer = { type: "storage" }; return e; }); userBgls.push(device.createBindGroupLayout({ entries })); @@ -2645,12 +3039,29 @@ env.wgpuLoadRTPipeline = (wgslPtr, wgslLen, bindingsPtr, bindingsCount) => { userBgls.push(device.createBindGroupLayout({ entries: [] })); } } - pipeline = device.createComputePipeline({ - layout: device.createPipelineLayout({ bindGroupLayouts: [headerBgl, dataBgl, ...userBgls] }), - compute: { module: mod, entryPoint: "main" }, - }); + // GENERATE / SHADE / RESOLVE may touch user bindings (raygen camera, + // closesthit albedo, resolve params) → params + data + empty(group2) + // + user. PREP → params + data + indirect. TRACE → params + data. + const userLayout = device.createPipelineLayout({ + bindGroupLayouts: [paramsBgl, dataBgl, emptyBgl, ...userBgls] }); + const prepLayout = device.createPipelineLayout({ + bindGroupLayouts: [paramsBgl, dataBgl, indirectBgl] }); + const traceLayout = device.createPipelineLayout({ + bindGroupLayouts: [paramsBgl, dataBgl] }); + const mk = (layout, ep) => device.createComputePipeline({ + layout, compute: { module: mod, entryPoint: ep } }); + + const entry = { + genPipe: mk(userLayout, "wfGenerate"), + prepPipe: mk(prepLayout, "wfPrep"), + tracePipe: mk(traceLayout, "wfTrace"), + shadePipe: mk(userLayout, "wfShade"), + resolvePipe: mk(userLayout, "wfResolve"), + paramsBgl, dataBgl, indirectBgl, emptyBgl, userBgls, + byGroup, sortedGroups, + }; const handle = newHandle(); - rtPipelines.set(handle, { pipeline, headerBgl, dataBgl, userBgls, byGroup, sortedGroups }); + rtPipelines.set(handle, entry); return handle; } catch (e) { console.error("[crafter-wgpu] RT pipeline compile failed:", e); @@ -2659,35 +3070,83 @@ env.wgpuLoadRTPipeline = (wgslPtr, wgslLen, bindingsPtr, bindingsCount) => { } }; +// Build the user @group(3+) bind groups for a pass, returning a list of +// { group, bindGroup } to set. Shared by GENERATE / SHADE / RESOLVE. +function wfUserBindGroups(pipe, handlesPtr, handlesCount) { + const out = []; + if (handlesCount <= 0) return out; + const handles = new Uint32Array(memU8().buffer, handlesPtr, handlesCount); + let handleIdx = 0; + let bglIdx = 0; + const highest = pipe.sortedGroups.length ? pipe.sortedGroups[pipe.sortedGroups.length - 1] : 2; + for (let g = 3; g <= highest; g++) { + if (pipe.byGroup.has(g)) { + const entries = pipe.byGroup.get(g).map(b => { + const h = handles[handleIdx++]; + let resource; + if (b.kind === 0) resource = { buffer: buffers.get(h) }; + else if (b.kind === 1) resource = textureViews.get(h); + else if (b.kind === 2) resource = samplers.get(h); + else if (b.kind === 3) resource = textureViews.get(h); + else if (b.kind === 4) resource = { buffer: buffers.get(h) }; + return { binding: b.binding, resource }; + }); + out.push({ group: g, bindGroup: device.createBindGroup({ layout: pipe.userBgls[bglIdx], entries }) }); + } + bglIdx++; + } + return out; +} + env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes, tlasBufHandle, instanceCount, gx, gy, - handlesPtr, handlesCount) => { - if (!state.pass) return; + handlesPtr, handlesCount, maxDepth) => { + if (!state.encoder) return; const pipe = rtPipelines.get(pipelineHandle); const tlas = buffers.get(tlasBufHandle); if (!pipe || !tlas) { console.error("[crafter-wgpu] wgpuDispatchRT: unknown pipeline or tlas"); return; } - // Write RT header from push data (first 16 bytes). Surface dims + instance count + flags. - const hdr32 = new Uint32Array(4); - hdr32[0] = state.width; - hdr32[1] = state.height; - hdr32[2] = instanceCount; - hdr32[3] = 0; - queue.writeBuffer(rtState.rtHeader, 0, hdr32); - - const headerBg = device.createBindGroup({ - layout: pipe.headerBgl, - entries: [{ binding: 0, resource: { buffer: rtState.rtHeader, offset: 0, size: 16 } }], - }); - const outView = state.outIsPing ? state.pingView : state.pongView; const entryOrderBuf = buffers.get(rtState.currentEntryOrder); const bvhBuf = buffers.get(rtState.currentBvh); if (!entryOrderBuf || !bvhBuf) { - console.error("[crafter-wgpu] wgpuDispatchRT: missing entryOrder/bins (no TLAS built yet?)"); + console.error("[crafter-wgpu] wgpuDispatchRT: missing entryOrder/bvh (no TLAS built yet?)"); return; } + const W = state.width, H = state.height; + const cap = W * H; + const depth = Math.max(1, maxDepth | 0); + const wf = ensureWavefrontBuffers(W, H); + + // ── Per-pass WfParams ring. queue.writeBuffer lands before submit, so + // we can't mutate the uniform between passes — instead we pre-write one + // slot per pass and bind it with a dynamic offset. Slot order: + // 0 GENERATE + // 1+3*d .. +2 PREP / TRACE / SHADE for bounce d + // 1+3*depth RESOLVE + const passCount = 2 + 3 * depth; + const ring = new Uint32Array(WF_PARAM_SLOTS * 64); // 256 B = 64 u32 per slot + const writeSlot = (slot, curIsA, bounce) => { + const o = slot * 64; + ring[o + 0] = W; ring[o + 1] = H; ring[o + 2] = cap; ring[o + 3] = curIsA; + ring[o + 4] = bounce; ring[o + 5] = depth; ring[o + 6] = WF_FIXED_TLAS_NPADDED; ring[o + 7] = 0; + }; + writeSlot(0, 1, 0); // GENERATE + for (let d = 0; d < depth; d++) { + const curIsA = (d % 2 === 0) ? 1 : 0; + writeSlot(1 + 3 * d + 0, curIsA, d); // PREP + writeSlot(1 + 3 * d + 1, curIsA, d); // TRACE + writeSlot(1 + 3 * d + 2, curIsA, d); // SHADE + } + writeSlot(1 + 3 * depth, 1, depth); // RESOLVE + queue.writeBuffer(wf.paramsRing, 0, ring, 0, passCount * 64); + + const outView = state.outIsPing ? state.pingView : state.pongView; + const paramsBg = device.createBindGroup({ + layout: pipe.paramsBgl, + entries: [{ binding: 0, resource: { buffer: wf.paramsRing, offset: 0, size: 256 } }], + }); const dataBg = device.createBindGroup({ layout: pipe.dataBgl, entries: [ @@ -2701,41 +3160,91 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes, { binding: 7, resource: { buffer: rtState.attribsHeap.gpu } }, { binding: 8, resource: { buffer: entryOrderBuf } }, { binding: 9, resource: { buffer: bvhBuf } }, + { binding: 10, resource: { buffer: wf.raysA } }, + { binding: 11, resource: { buffer: wf.raysB } }, + { binding: 12, resource: { buffer: wf.hits } }, + { binding: 13, resource: { buffer: wf.accum } }, + { binding: 14, resource: { buffer: wf.counters } }, + { binding: 15, resource: { buffer: wf.payload } }, ], }); + const indirectBg = device.createBindGroup({ + layout: pipe.indirectBgl, + entries: [{ binding: 0, resource: { buffer: wf.indirect } }], + }); + const userBgs = wfUserBindGroups(pipe, handlesPtr, handlesCount); - state.pass.setPipeline(pipe.pipeline); - state.pass.setBindGroup(0, headerBg); - state.pass.setBindGroup(1, dataBg); + // Close the frame's shared compute pass; the wavefront runs as its own + // sequence of passes on the same encoder (implicit barrier between each + // makes PREP's atomic writes visible to TRACE, etc.), then we reopen it. + if (state.pass) { state.pass.end(); state.pass = null; } + const enc = state.encoder; + const tileX = gx, tileY = gy; + const slotOff = (slot) => slot * 256; - // User bindings: walk byGroup in the same sorted order the C++ side - // packed handles[], picking up indices linearly. - if (handlesCount > 0) { - const handles = new Uint32Array(memU8().buffer, handlesPtr, handlesCount); - let handleIdx = 0; - let bglIdx = 0; - for (let g = 2; g <= (pipe.sortedGroups[pipe.sortedGroups.length - 1] || 1); g++) { - if (pipe.byGroup.has(g)) { - const entries = pipe.byGroup.get(g).map(b => { - const h = handles[handleIdx++]; - let resource; - if (b.kind === 0) resource = { buffer: buffers.get(h) }; - else if (b.kind === 1) resource = textureViews.get(h); - else if (b.kind === 2) resource = samplers.get(h); - else if (b.kind === 3) resource = textureViews.get(h); - return { binding: b.binding, resource }; - }); - const bg = device.createBindGroup({ - layout: pipe.userBgls[bglIdx], - entries, - }); - state.pass.setBindGroup(g, bg); - } - bglIdx++; + // Zero the two emit counters before GENERATE. + enc.clearBuffer(wf.counters, 0, 64); + + const setUser = (pass) => { for (const u of userBgs) pass.setBindGroup(u.group, u.bindGroup); }; + + // GENERATE + { + const p = enc.beginComputePass({ label: "wf-generate" }); + p.setPipeline(pipe.genPipe); + p.setBindGroup(0, paramsBg, [slotOff(0)]); + p.setBindGroup(1, dataBg); + setUser(p); + p.dispatchWorkgroups(tileX, tileY, 1); + p.end(); + } + for (let d = 0; d < depth; d++) { + const prepSlot = 1 + 3 * d + 0; + const traceSlot = 1 + 3 * d + 1; + const shadeSlot = 1 + 3 * d + 2; + // PREP — publish indirect args, zero next counter. + { + const p = enc.beginComputePass({ label: "wf-prep" }); + p.setPipeline(pipe.prepPipe); + p.setBindGroup(0, paramsBg, [slotOff(prepSlot)]); + p.setBindGroup(1, dataBg); + p.setBindGroup(2, indirectBg); + p.dispatchWorkgroups(1, 1, 1); + p.end(); + } + // TRACE — indirect over the live ray list. + { + const p = enc.beginComputePass({ label: "wf-trace" }); + p.setPipeline(pipe.tracePipe); + p.setBindGroup(0, paramsBg, [slotOff(traceSlot)]); + p.setBindGroup(1, dataBg); + p.dispatchWorkgroupsIndirect(wf.indirect, 0); + p.end(); + } + // SHADE — indirect; runs user closesthit/miss, may emit + accumulate. + { + const p = enc.beginComputePass({ label: "wf-shade" }); + p.setPipeline(pipe.shadePipe); + p.setBindGroup(0, paramsBg, [slotOff(shadeSlot)]); + p.setBindGroup(1, dataBg); + setUser(p); + p.dispatchWorkgroupsIndirect(wf.indirect, 0); + p.end(); } } + // RESOLVE — tonemap accum → output image. + { + const p = enc.beginComputePass({ label: "wf-resolve" }); + p.setPipeline(pipe.resolvePipe); + p.setBindGroup(0, paramsBg, [slotOff(1 + 3 * depth)]); + p.setBindGroup(1, dataBg); + setUser(p); + p.dispatchWorkgroups(tileX, tileY, 1); + p.end(); + } - state.pass.dispatchWorkgroups(gx, gy, 1); + // Reopen the frame's shared pass so wgpuFrameEnd / later UI work as + // before, and flip ping-pong so the blit picks the texture RESOLVE wrote. + state.pass = enc.beginComputePass(); state.outIsPing = !state.outIsPing; }; diff --git a/examples/VulkanTriangle/closesthit.wgsl b/examples/VulkanTriangle/closesthit.wgsl index e0d29ae..91758ba 100644 --- a/examples/VulkanTriangle/closesthit.wgsl +++ b/examples/VulkanTriangle/closesthit.wgsl @@ -1,6 +1,9 @@ -// WebGPU port of closesthit.glsl. Library concatenates this BEFORE the -// library helpers, so `Payload` declared here is visible to traceRay, -// runClosestHit, the mega-switch, and the user's raygen source. +// Payload declared here so the WGSL assembler sees it before the wfPayload +// binding, the SHADE dispatch, and the raygen source. +// +// Wavefront model: closesthit_main runs in SHADE and accumulates the +// pixel's color directly (rtAccumulate) instead of writing a payload that +// raygen reads back. struct Payload { color: vec3, @@ -8,5 +11,5 @@ struct Payload { fn closesthit_main(ray: RayDesc, hit: HitInfo, payload: ptr) { let bary = vec3(1.0 - hit.attribs.x - hit.attribs.y, hit.attribs.x, hit.attribs.y); - (*payload).color = bary; + rtAccumulate(bary); } diff --git a/examples/VulkanTriangle/miss.wgsl b/examples/VulkanTriangle/miss.wgsl index a0ba944..39ee6b6 100644 --- a/examples/VulkanTriangle/miss.wgsl +++ b/examples/VulkanTriangle/miss.wgsl @@ -1,5 +1,6 @@ -// WebGPU port of miss.glsl. +// Wavefront miss: runs in SHADE for rays that hit nothing. Accumulate the +// white background directly. fn miss_main(ray: RayDesc, payload: ptr) { - (*payload).color = vec3(1.0, 1.0, 1.0); + rtAccumulate(vec3(1.0, 1.0, 1.0)); } diff --git a/examples/VulkanTriangle/raygen.wgsl b/examples/VulkanTriangle/raygen.wgsl index cdf66fd..3014e8a 100644 --- a/examples/VulkanTriangle/raygen.wgsl +++ b/examples/VulkanTriangle/raygen.wgsl @@ -1,11 +1,12 @@ -// WebGPU port of raygen.glsl. Mirrors the pinhole camera setup — the -// Payload type is declared in closesthit.wgsl (concatenated earlier). +// WebGPU wavefront raygen. Runs in GENERATE: compute the pinhole camera +// ray and emit it as the pixel's primary ray. Shading happens later in +// SHADE (closesthit/miss). The Payload type is declared in closesthit.wgsl. fn raygen_main(gid: vec3) { - if (gid.x >= hdr.surfaceW || gid.y >= hdr.surfaceH) { return; } + if (gid.x >= wfParams.surfaceW || gid.y >= wfParams.surfaceH) { return; } let pixel = vec2(f32(gid.x), f32(gid.y)); - let resolution = vec2(f32(hdr.surfaceW), f32(hdr.surfaceH)); + let resolution = vec2(f32(wfParams.surfaceW), f32(wfParams.surfaceH)); let uv = (pixel + vec2(0.5)) / resolution; let ndc = uv * 2.0 - vec2(1.0); @@ -23,17 +24,11 @@ fn raygen_main(gid: vec3) { var payload: Payload; payload.color = vec3(0.0); - traceRay( - 0u, // tlasIdx (unused) - 0u, // ray flags - 0xFFu, // cull mask - 0u, 0u, 0u, // sbtRecordOffset, sbtRecordStride, missIndex + rtEmitPrimaryRay( origin, 0.001, direction, 10000.0, - &payload, - ); - - textureStore(outImage, - vec2(i32(gid.x), i32(gid.y)), - vec4(payload.color, 1.0)); + 0u, // ray flags + 0xFFu, // cull mask + 0u, 0u, // sbtRecordOffset, missIndex + payload); } diff --git a/implementations/Crafter.Graphics-PipelineRTWebGPU.cpp b/implementations/Crafter.Graphics-PipelineRTWebGPU.cpp index c4265da..8dd949d 100644 --- a/implementations/Crafter.Graphics-PipelineRTWebGPU.cpp +++ b/implementations/Crafter.Graphics-PipelineRTWebGPU.cpp @@ -78,13 +78,22 @@ void PipelineRTWebGPU::Init(WebGPUCommandEncoderRef /*cmd*/, // shaders by stage. Concatenating *all* non-raygen sources here lets // them declare shared helpers, `struct Payload`, etc., in any order. - wgsl += "// ── user closesthit / anyhit / miss sources ───────────────\n"; + wgsl += "// ── user closesthit / anyhit / miss / resolve sources ─────\n"; for (const auto& shader : sbt.shaders) { if (shader.stage == WebGPURTStage::Raygen) continue; wgsl += shader.source; wgsl += "\n"; } + // ── Payload-typed wavefront storage binding ──────────────────────── + // + // Emitted *after* the user sources so it can name the user's `Payload` + // type. Holds one Payload per in-flight ray slot across both ping/pong + // ray buffers (capacity = 2·W·H). SHADE loads ray.payloadSlot here; + // emit helpers (rtEmitPrimaryRay / rtEmitRay) store into it. + wgsl += "\n@group(1) @binding(15) var " + "wfPayload : array;\n"; + // ── Section 2: mega-switch dispatchers ───────────────────────────── // // runClosestHit, runAnyHit, runMiss each dispatch on the per-hit / @@ -141,6 +150,24 @@ void PipelineRTWebGPU::Init(WebGPUCommandEncoderRef /*cmd*/, wgsl += " }\n"; wgsl += "}\n"; + // runResolve — RESOLVE-stage tonemap hook. The first registered + // Resolve shader wins; with none, identity passthrough (alpha forced + // to 1) so the wavefront output matches a megakernel that wrote raw + // colors. + std::string resolveEntryFn; + for (const auto& shader : sbt.shaders) { + if (shader.stage == WebGPURTStage::Resolve) { resolveEntryFn = shader.entryFn; break; } + } + wgsl += "\nfn runResolve(coord: vec2, hdr: vec4) -> vec4 {\n"; + if (!resolveEntryFn.empty()) { + wgsl += " return "; + wgsl += resolveEntryFn; + wgsl += "(coord, hdr);\n"; + } else { + wgsl += " return vec4(hdr.rgb, 1.0);\n"; + } + wgsl += "}\n"; + // Marker — JS-side prelude/post-amble searches for this token to know // where the library helpers (traverseBlas/traverseTlas/traceRay) get // injected, followed by raygen sources and the @compute entry point. @@ -173,17 +200,55 @@ void PipelineRTWebGPU::Init(WebGPUCommandEncoderRef /*cmd*/, return; } - // ── Section 4: @compute entry point ──────────────────────────────── + // ── Section 4: wavefront @compute entry points ───────────────────── // - // 8x8 tile workgroup matching the rest of the WebGPU backend. + // Five kernels share this one module; createComputePipeline selects + // each by entryPoint name. GENERATE/RESOLVE are 8x8 screen tiles; + // TRACE/SHADE are 64-wide 1-D over the compacted ray list (dispatched + // indirectly from PREP); PREP is a single thread. The library helper + // bodies (_rtwTraverseTlas, rtEmit*, rtAccumulate, _wfCurCount, …) are + // injected JS-side at the marker above. + // GENERATE — one thread per pixel; clears the pixel's accumulator and + // runs the user raygen, which calls rtEmitPrimaryRay. wgsl += "\n@compute @workgroup_size(8, 8, 1)\n"; - wgsl += "fn main(@builtin(global_invocation_id) gid: vec3) {\n"; + wgsl += "fn wfGenerate(@builtin(global_invocation_id) gid: vec3) {\n"; + wgsl += " if (gid.x >= wfParams.surfaceW || gid.y >= wfParams.surfaceH) { return; }\n"; + wgsl += " let pixel = gid.y * wfParams.surfaceW + gid.x;\n"; + wgsl += " wfAccum[pixel] = vec4(0.0, 0.0, 0.0, 0.0);\n"; + wgsl += " _wfPixel = pixel;\n"; wgsl += " "; wgsl += raygenEntryFn; wgsl += "(gid);\n"; wgsl += "}\n"; + // PREP — single thread; reads the live ray count and publishes the + // indirect dispatch args for the upcoming TRACE/SHADE, then zeroes the + // next buffer's emit counter so SHADE starts compacting from 0. + wgsl += "\n@compute @workgroup_size(1)\n"; + wgsl += "fn wfPrep() { _wfPrep(); }\n"; + + // TRACE — zero user code: pure traversal + intersection. One thread + // per live ray; writes a HitResult into wfHits[i]. + wgsl += "\n@compute @workgroup_size(64)\n"; + wgsl += "fn wfTrace(@builtin(global_invocation_id) gid: vec3) { _wfTrace(gid.x); }\n"; + + // SHADE — one thread per live ray; loads the ray + its hit + payload, + // dispatches to runMiss / runClosestHit, which may rtAccumulate and + // rtEmitRay continuation/shadow rays into the next buffer. + wgsl += "\n@compute @workgroup_size(64)\n"; + wgsl += "fn wfShade(@builtin(global_invocation_id) gid: vec3) { _wfShade(gid.x); }\n"; + + // RESOLVE — one thread per pixel; runs the user resolve (or identity) + // over the linear accumulator and stores to the output image. + wgsl += "\n@compute @workgroup_size(8, 8, 1)\n"; + wgsl += "fn wfResolve(@builtin(global_invocation_id) gid: vec3) {\n"; + wgsl += " if (gid.x >= wfParams.surfaceW || gid.y >= wfParams.surfaceH) { return; }\n"; + wgsl += " let pixel = gid.y * wfParams.surfaceW + gid.x;\n"; + wgsl += " let outc = runResolve(gid.xy, wfAccum[pixel]);\n"; + wgsl += " textureStore(outImage, vec2(i32(gid.x), i32(gid.y)), outc);\n"; + wgsl += "}\n"; + pipelineHandle = WebGPU::wgpuLoadRTPipeline( wgsl.data(), static_cast(wgsl.size()), diff --git a/interfaces/Crafter.Graphics-RTPass.cppm b/interfaces/Crafter.Graphics-RTPass.cppm index 1397a68..0da7088 100644 --- a/interfaces/Crafter.Graphics-RTPass.cppm +++ b/interfaces/Crafter.Graphics-RTPass.cppm @@ -72,6 +72,11 @@ export namespace Crafter { // 0 means "no user bindings". const void* handlesPtr = nullptr; std::uint32_t handlesCount = 0; + // Wavefront bounce budget: number of (TRACE; SHADE) iterations. + // 1 = primary rays only; 2 = primary + one continuation/shadow + // bounce; etc. The library unrolls GENERATE; (PREP; TRACE; SHADE) + // ×maxDepth; RESOLVE. + std::uint32_t maxDepth = 1; RTPass(PipelineRTWebGPU* p) : pipeline(p) {} @@ -88,7 +93,8 @@ export namespace Crafter { static_cast(gx), static_cast(gy), handlesPtr, - static_cast(handlesCount)); + static_cast(handlesCount), + static_cast(maxDepth)); } }; } diff --git a/interfaces/Crafter.Graphics-ShaderBindingTableWebGPU.cppm b/interfaces/Crafter.Graphics-ShaderBindingTableWebGPU.cppm index c175a52..73c2285 100644 --- a/interfaces/Crafter.Graphics-ShaderBindingTableWebGPU.cppm +++ b/interfaces/Crafter.Graphics-ShaderBindingTableWebGPU.cppm @@ -18,6 +18,11 @@ export namespace Crafter { Miss = 1, ClosestHit = 2, AnyHit = 3, + // Wavefront RESOLVE-stage tonemap/output hook. Optional: if no + // Resolve shader is registered, RESOLVE writes the linear accum + // buffer through unchanged. Signature: + // fn (coord: vec2, hdr: vec4) -> vec4 + Resolve = 4, }; // One WGSL shader source + the function name PipelineRTWebGPU should diff --git a/interfaces/Crafter.Graphics-WebGPU.cppm b/interfaces/Crafter.Graphics-WebGPU.cppm index dc695fe..9089bbd 100644 --- a/interfaces/Crafter.Graphics-WebGPU.cppm +++ b/interfaces/Crafter.Graphics-WebGPU.cppm @@ -201,7 +201,8 @@ namespace Crafter::WebGPU { std::uint32_t tlasBufHandle, std::int32_t instanceCount, std::int32_t gx, std::int32_t gy, - const void* handlesPtr, std::int32_t handlesCount); + const void* handlesPtr, std::int32_t handlesCount, + std::int32_t maxDepth); // GPU TLAS-build dispatch. Two sequential compute passes: // 1. tlasBuildMain — per-instance world AABB + identity permutation