Replace the megakernel @compute entry with five wavefront kernels sharing one module, connected by GPU ray/hit/payload buffers and a GPU-driven indirect bounce loop: GENERATE -> (PREP -> TRACE -> SHADE) x maxDepth -> RESOLVE - TRACE contains zero user code (pure _rtwTraverseTlas/Blas, opaque-only). - PREP publishes dispatchWorkgroupsIndirect args from the live ray count; the indirect-args buffer lives in its own bind group so it is never bound read-write in the same dispatch that consumes it as INDIRECT. - New emit/accumulate API: rtEmitPrimaryRay / rtEmitRay / rtAccumulate, plus an optional user Resolve stage (tonemap hook; identity by default). - Per-pass WfParams via a dynamic-offset uniform ring (curIsA/bounce vary between passes within one submit). - Payload-typed wfPayload binding emitted in the codegen region after the user's struct Payload; payload travels with each ray (2*W*H slots). - Request maxBufferSize / maxStorageBufferBindingSize / maxComputeWorkgroups PerDimension so the W*H-sized work buffers fit past the 128MB baseline. VulkanTriangle ported to the new API and renders bit-identical to the megakernel baseline at maxDepth=1. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
3532 lines
155 KiB
JavaScript
3532 lines
155 KiB
JavaScript
/*
|
||
Crafter.Graphics WebGPU bridge — DOM mode UI compute pipeline.
|
||
|
||
Surface model (high-level, deliberately not 1:1 with WebGPU):
|
||
- JS owns the GPUDevice/queue/compute pipelines/bind-group cache.
|
||
- C++ owns slot allocation and per-frame logic; it calls into ~15 imports.
|
||
- Standard UI shaders are embedded as WGSL strings at the bottom of this
|
||
file and compiled once at startup.
|
||
|
||
Ping-pong output strategy (per Decision 2 in plan):
|
||
- Two rgba8unorm storage textures sized to the canvas.
|
||
- Each Dispatch reads `prev` (sampled), writes `out` (storage, write-only).
|
||
- JS swaps the two between dispatches, so item-order overdraw works.
|
||
- At frame end, the current `out` is blitted to the canvas via
|
||
copyTextureToTexture (canvas configured as rgba8unorm to match).
|
||
|
||
This file is loaded as <script type="module">. Top-level await blocks
|
||
runtime.js's _start() until adapter + device are resolved, so by the time
|
||
main() runs every import here is fully wired.
|
||
*/
|
||
|
||
// ─── env stubs (assigned synchronously, BEFORE any async work) ────────────
|
||
// The wasm module's import-link step needs every declared wgpu* import to
|
||
// resolve to a Function. If init below throws, the stubs stay in place so
|
||
// the wasm still links — and the call site gets a clear error at runtime
|
||
// instead of "import object field X is not a Function" at link time.
|
||
|
||
window.crafter_webbuild_env = window.crafter_webbuild_env || {};
|
||
window.crafter_webbuild_env.table = window.crafter_webbuild_env.table
|
||
|| new WebAssembly.Table({ initial: 4, element: "anyfunc" });
|
||
|
||
let initError = null;
|
||
function stub(name) {
|
||
return (...args) => {
|
||
const msg = `[crafter-wgpu] ${name}() called but WebGPU init failed: ${initError?.message ?? "(no error captured)"}`;
|
||
console.error(msg);
|
||
throw new Error(msg);
|
||
};
|
||
}
|
||
{
|
||
const e = window.crafter_webbuild_env;
|
||
for (const n of [
|
||
"wgpuGetCanvasWidth", "wgpuGetCanvasHeight", "wgpuSurfaceWidth", "wgpuSurfaceHeight",
|
||
"wgpuInit", "wgpuCreateBuffer", "wgpuWriteBuffer", "wgpuWriteBufferRange",
|
||
"wgpuReadbackEnqueue", "wgpuReadbackPoll", "wgpuReadbackReady", "wgpuDestroyBuffer",
|
||
"wgpuCreateAtlasTexture", "wgpuWriteAtlasRegion", "wgpuDestroyTexture",
|
||
"wgpuCreateImage2D", "wgpuWriteImage2D",
|
||
"wgpuCreateImage2DArray", "wgpuWriteImage2DLayer",
|
||
"wgpuCreateLinearClampSampler", "wgpuCreateLinearRepeatSampler",
|
||
"wgpuFrameBegin", "wgpuFrameEnd",
|
||
"wgpuDispatchQuads", "wgpuDispatchCircles", "wgpuDispatchImages", "wgpuDispatchText",
|
||
"wgpuLoadCustomShader", "wgpuDispatchCustom",
|
||
"wgpuRegisterMeshBLAS", "wgpuLoadRTPipeline", "wgpuDispatchRT", "wgpuBuildTLAS",
|
||
"wgpuLoadComputePipeline", "wgpuDispatchCompute",
|
||
]) {
|
||
// Read-write ints don't need a stub-throw; return 0 for the size queries.
|
||
e[n] = n.endsWith("Width") || n.endsWith("Height")
|
||
? () => 0
|
||
: (n === "wgpuRegisterMeshBLAS" ? () => 0 : stub(n));
|
||
}
|
||
}
|
||
|
||
// ─── canvas + device init (runs before _start) ───────────────────────────
|
||
// Wrapped in an async IIFE assigned to window.crafter_webbuild_env_ready so
|
||
// the runtime.js shim can `await` it explicitly before calling _start().
|
||
// Sibling <script type="module"> top-level awaits are NOT reliably
|
||
// serialized in Firefox (verified 2026-05), so we can't depend on this
|
||
// file's TLA to block runtime.js by itself.
|
||
|
||
window.crafter_webbuild_env_ready = (async () => {
|
||
try {
|
||
|
||
if (!navigator.gpu) {
|
||
document.body.innerHTML = "<p style=\"font-family:sans-serif;padding:24px\">"
|
||
+ "WebGPU not available in this browser. Try Chrome 121+ / Firefox 141+ / Safari 26+.</p>";
|
||
initError = new Error("WebGPU unavailable");
|
||
throw initError;
|
||
}
|
||
|
||
const canvas = document.createElement("canvas");
|
||
canvas.id = "crafter-canvas";
|
||
canvas.style.cssText = "position:fixed;inset:0;width:100vw;height:100vh;display:block;";
|
||
document.body.style.margin = "0";
|
||
document.body.appendChild(canvas);
|
||
|
||
function syncCanvasSize() {
|
||
// Canvas pixel size = CSS size × devicePixelRatio so the GPU draws
|
||
// at physical pixel resolution on HiDPI displays — otherwise the
|
||
// browser upscales whatever we rendered at logical size and the
|
||
// result looks blurry. The CSS rule still pins display size at
|
||
// 100vw/100vh, so the canvas paints its physical buffer back down
|
||
// into logical pixels with no perceived layout change.
|
||
//
|
||
// Mouse events arrive in CSS pixels; dom-env.js multiplies them by
|
||
// window.crafter_dpr before dispatching so the wasm-side hit tests
|
||
// share the physical-pixel coordinate space with window.width/.height.
|
||
const dpr = window.devicePixelRatio || 1;
|
||
window.crafter_dpr = dpr;
|
||
const w = Math.max(1, Math.round(window.innerWidth * dpr));
|
||
const h = Math.max(1, Math.round(window.innerHeight * dpr));
|
||
if (canvas.width !== w) canvas.width = w;
|
||
if (canvas.height !== h) canvas.height = h;
|
||
return { w, h };
|
||
}
|
||
syncCanvasSize();
|
||
|
||
const adapter = await navigator.gpu.requestAdapter();
|
||
if (!adapter) {
|
||
initError = new Error("navigator.gpu.requestAdapter() returned null (no compatible adapter)");
|
||
console.error("[crafter-wgpu]", initError.message);
|
||
throw initError;
|
||
}
|
||
// Ask for everything the adapter is willing to give us, up to the values
|
||
// the RT pipeline actually needs. The megakernel prelude declares 7
|
||
// storage buffers at group(1) (tlasEntries / bvhNodes / meshRecords /
|
||
// vertices / indices / primRemap / vertexAttribs); user pipelines like
|
||
// 3DForts add more at group(2), and the WebGPU baseline of 8 isn't
|
||
// enough. Adapters routinely report 10+ — clamp our request to whatever
|
||
// the adapter actually supports so the call doesn't reject on baseline-
|
||
// only devices. Same pattern for storage textures (we use 1 output image
|
||
// per dispatch but headroom is cheap) and for the global storage-buffer
|
||
// pool which is the per-pipeline count's parent budget.
|
||
const adapterLimits = adapter.limits || {};
|
||
const requiredLimits = {};
|
||
const clamp = (name, want) => {
|
||
const cap = adapterLimits[name];
|
||
if (typeof cap === "number" && cap > 0) {
|
||
requiredLimits[name] = Math.min(want, cap);
|
||
}
|
||
};
|
||
clamp("maxStorageBuffersPerShaderStage", 16);
|
||
clamp("maxStorageBuffersInPipelineLayout", 16);
|
||
clamp("maxStorageTexturesPerShaderStage", 8);
|
||
// The TLAS BVH build runs one workgroup of up to N threads in shared
|
||
// memory (bitonic sort over morton codes + sweep-tree refit). Need the
|
||
// per-workgroup invocation cap raised from the default 256.
|
||
clamp("maxComputeInvocationsPerWorkgroup", 1024);
|
||
clamp("maxComputeWorkgroupSizeX", 1024);
|
||
// Wavefront RT work buffers are sized to W·H rays. At 1080p the payload
|
||
// store (≈245 MB) and hit buffer (≈214 MB) blow past the 128 MB baseline
|
||
// storage-buffer binding size, and the whole set past the 256 MB baseline
|
||
// maxBufferSize — request whatever the adapter actually allows (4090/Dawn
|
||
// reports 1 GB+). maxComputeWorkgroupsPerDimension bounds the indirect
|
||
// TRACE/SHADE 1-D dispatch (ceil(W·H/64) ≈ 32k workgroups at 1080p; the
|
||
// 65535 default covers it, but request the adapter max for headroom).
|
||
clamp("maxBufferSize", 1 << 30);
|
||
clamp("maxStorageBufferBindingSize", 1 << 30);
|
||
clamp("maxComputeWorkgroupsPerDimension", 65535);
|
||
const device = await adapter.requestDevice({ requiredLimits });
|
||
const queue = device.queue;
|
||
const ctx = canvas.getContext("webgpu");
|
||
const canvasFormat = "rgba8unorm"; // match storage textures, skip swizzle blit
|
||
ctx.configure({ device, format: canvasFormat, alphaMode: "opaque",
|
||
usage: GPUTextureUsage.RENDER_ATTACHMENT | GPUTextureUsage.COPY_DST });
|
||
|
||
device.lost.then((info) => {
|
||
console.error("[crafter-wgpu] device lost:", info.message);
|
||
state.gpuLost = true;
|
||
});
|
||
device.addEventListener("uncapturederror", (e) => {
|
||
console.error("[crafter-wgpu] uncaptured error:", e.error && e.error.message);
|
||
});
|
||
|
||
// ─── handle tables ─────────────────────────────────────────────────────
|
||
|
||
const buffers = new Map(); // handle → GPUBuffer
|
||
const textures = new Map(); // handle → GPUTexture
|
||
const textureViews = new Map(); // handle → GPUTextureView (mirrors textures key for the view)
|
||
const samplers = new Map(); // handle → GPUSampler
|
||
let nextHandle = 1;
|
||
function newHandle() { return nextHandle++; }
|
||
|
||
// ─── ping-pong storage textures ────────────────────────────────────────
|
||
|
||
const state = {
|
||
pingTex: null, pingView: null,
|
||
pongTex: null, pongView: null,
|
||
outIsPing: true, // current "out" target
|
||
width: 0, height: 0,
|
||
encoder: null,
|
||
pass: null,
|
||
headerRing: null, // GPUBuffer; uniform header writes ring through this
|
||
headerRingSize: 0,
|
||
headerRingOffset: 0,
|
||
bindGroupCache: new Map(), // key → GPUBindGroup
|
||
gpuLost: false,
|
||
};
|
||
|
||
function recreatePingPong(w, h) {
|
||
const usage = GPUTextureUsage.STORAGE_BINDING
|
||
| GPUTextureUsage.TEXTURE_BINDING
|
||
| GPUTextureUsage.COPY_SRC
|
||
| GPUTextureUsage.COPY_DST; // COPY_DST so we can clear it
|
||
if (state.pingTex) state.pingTex.destroy();
|
||
if (state.pongTex) state.pongTex.destroy();
|
||
state.pingTex = device.createTexture({ size: [w, h], format: "rgba8unorm", usage });
|
||
state.pongTex = device.createTexture({ size: [w, h], format: "rgba8unorm", usage });
|
||
state.pingView = state.pingTex.createView();
|
||
state.pongView = state.pongTex.createView();
|
||
state.width = w; state.height = h;
|
||
state.outIsPing = true;
|
||
state.bindGroupCache.clear();
|
||
}
|
||
|
||
function ensureSized() {
|
||
const { w, h } = syncCanvasSize();
|
||
if (w !== state.width || h !== state.height) {
|
||
recreatePingPong(w, h);
|
||
// Notify the wasm side that the surface size changed so it can
|
||
// fire onResize through Window. The wasm export is added by
|
||
// Crafter.Graphics-Window.cpp.
|
||
const onResize = wasmExports && wasmExports.__crafterDom_resize;
|
||
if (onResize) onResize(1, w, h);
|
||
}
|
||
}
|
||
|
||
// Header ring buffer: 256-byte-aligned slots holding UIDispatchHeader (48
|
||
// bytes of meaningful data, padded to 256). Wraps at frame boundary.
|
||
const HEADER_ALIGN = 256;
|
||
const HEADER_RING_SLOTS = 64;
|
||
state.headerRingSize = HEADER_ALIGN * HEADER_RING_SLOTS;
|
||
state.headerRing = device.createBuffer({
|
||
size: state.headerRingSize,
|
||
usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
|
||
});
|
||
|
||
// ─── pipelines ─────────────────────────────────────────────────────────
|
||
|
||
const wgslShared = String.raw`
|
||
struct UIDispatchHeader {
|
||
outImage: u32,
|
||
itemBuffer: u32,
|
||
surfaceW: u32,
|
||
surfaceH: u32,
|
||
clipX: f32,
|
||
clipY: f32,
|
||
clipW: f32,
|
||
clipH: f32,
|
||
itemCount: u32,
|
||
frameIdx: u32,
|
||
flags: u32,
|
||
_pad: u32,
|
||
};
|
||
|
||
@group(0) @binding(0) var<uniform> hdr : UIDispatchHeader;
|
||
@group(1) @binding(0) var outTex : texture_storage_2d<rgba8unorm, write>;
|
||
@group(1) @binding(1) var prevTex : texture_2d<f32>;
|
||
|
||
fn uiResolvePixel(coord: vec2<u32>) -> bool {
|
||
if (coord.x >= hdr.surfaceW || coord.y >= hdr.surfaceH) { return false; }
|
||
let fx = f32(coord.x); let fy = f32(coord.y);
|
||
if (fx < hdr.clipX || fy < hdr.clipY) { return false; }
|
||
if (fx >= hdr.clipX + hdr.clipW) { return false; }
|
||
if (fy >= hdr.clipY + hdr.clipH) { return false; }
|
||
return true;
|
||
}
|
||
|
||
fn uiBlendOver(dst: vec4<f32>, src: vec4<f32>) -> vec4<f32> {
|
||
let a = clamp(src.a, 0.0, 1.0);
|
||
let rgb = mix(dst.rgb, src.rgb, vec3<f32>(a));
|
||
let outA = a + dst.a * (1.0 - a);
|
||
return vec4<f32>(rgb, outA);
|
||
}
|
||
|
||
fn uiSdRoundRect(p: vec2<f32>, halfSize: vec2<f32>, r4: vec4<f32>) -> f32 {
|
||
var r: vec4<f32> = r4;
|
||
// Pick radius for the quadrant p is in. r order: (TL, TR, BR, BL).
|
||
let rx = select(r.x, r.z, p.x > 0.0);
|
||
let ry = select(r.w, r.y, p.x > 0.0);
|
||
let radius = select(ry, rx, p.y > 0.0);
|
||
let q = abs(p) - halfSize + vec2<f32>(radius);
|
||
return min(max(q.x, q.y), 0.0) + length(max(q, vec2<f32>(0.0))) - radius;
|
||
}
|
||
`;
|
||
|
||
const wgslQuads = wgslShared + String.raw`
|
||
struct QuadItem {
|
||
rect: vec4<f32>,
|
||
color: vec4<f32>,
|
||
corners: vec4<f32>,
|
||
outline: vec4<f32>,
|
||
};
|
||
@group(2) @binding(0) var<storage, read> items : array<QuadItem>;
|
||
|
||
@compute @workgroup_size(8, 8, 1)
|
||
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||
if (!uiResolvePixel(gid.xy)) { return; }
|
||
let coord = vec2<i32>(i32(gid.x), i32(gid.y));
|
||
var dst = textureLoad(prevTex, coord, 0);
|
||
let sp = vec2<f32>(f32(gid.x) + 0.5, f32(gid.y) + 0.5);
|
||
for (var i: u32 = 0u; i < hdr.itemCount; i = i + 1u) {
|
||
let it = items[i];
|
||
let lo = it.rect.xy;
|
||
let hi = it.rect.xy + it.rect.zw;
|
||
if (sp.x < lo.x || sp.y < lo.y || sp.x >= hi.x || sp.y >= hi.y) { continue; }
|
||
let halfSize = it.rect.zw * 0.5;
|
||
let p = sp - (it.rect.xy + halfSize);
|
||
let d = uiSdRoundRect(p, halfSize, it.corners);
|
||
let bodyA = clamp(0.5 - d, 0.0, 1.0);
|
||
if (bodyA <= 0.0 && it.outline.x <= 0.0) { continue; }
|
||
var src = vec4<f32>(it.color.rgb, it.color.a * bodyA);
|
||
if (it.outline.x > 0.0) {
|
||
let t = abs(d + it.outline.x * 0.5) - it.outline.x * 0.5;
|
||
let outlineA = clamp(0.5 - t, 0.0, 1.0);
|
||
src = vec4<f32>(mix(src.rgb, it.outline.yzw, vec3<f32>(outlineA)),
|
||
max(src.a, outlineA));
|
||
}
|
||
if (src.a <= 0.0) { continue; }
|
||
dst = uiBlendOver(dst, src);
|
||
}
|
||
textureStore(outTex, coord, dst);
|
||
}
|
||
`;
|
||
|
||
const wgslCircles = wgslShared + String.raw`
|
||
struct CircleItem {
|
||
centerRadius: vec4<f32>,
|
||
color: vec4<f32>,
|
||
outline: vec4<f32>,
|
||
};
|
||
@group(2) @binding(0) var<storage, read> items : array<CircleItem>;
|
||
|
||
@compute @workgroup_size(8, 8, 1)
|
||
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||
if (!uiResolvePixel(gid.xy)) { return; }
|
||
let coord = vec2<i32>(i32(gid.x), i32(gid.y));
|
||
var dst = textureLoad(prevTex, coord, 0);
|
||
let sp = vec2<f32>(f32(gid.x) + 0.5, f32(gid.y) + 0.5);
|
||
for (var i: u32 = 0u; i < hdr.itemCount; i = i + 1u) {
|
||
let it = items[i];
|
||
let center = it.centerRadius.xy;
|
||
let radius = it.centerRadius.z;
|
||
let d = length(sp - center) - radius;
|
||
let bodyA = clamp(0.5 - d, 0.0, 1.0);
|
||
if (bodyA <= 0.0 && it.outline.x <= 0.0) { continue; }
|
||
var src = vec4<f32>(it.color.rgb, it.color.a * bodyA);
|
||
if (it.outline.x > 0.0) {
|
||
let t = abs(d + it.outline.x * 0.5) - it.outline.x * 0.5;
|
||
let outlineA = clamp(0.5 - t, 0.0, 1.0);
|
||
src = vec4<f32>(mix(src.rgb, it.outline.yzw, vec3<f32>(outlineA)),
|
||
max(src.a, outlineA));
|
||
}
|
||
if (src.a <= 0.0) { continue; }
|
||
dst = uiBlendOver(dst, src);
|
||
}
|
||
textureStore(outTex, coord, dst);
|
||
}
|
||
`;
|
||
|
||
const wgslImages = wgslShared + String.raw`
|
||
struct ImageItem {
|
||
rect: vec4<f32>,
|
||
uv: vec4<f32>,
|
||
tint: vec4<f32>,
|
||
slots: vec4<u32>,
|
||
};
|
||
@group(2) @binding(0) var<storage, read> items : array<ImageItem>;
|
||
@group(3) @binding(0) var imgTex : texture_2d<f32>;
|
||
@group(3) @binding(1) var imgSampler : sampler;
|
||
|
||
@compute @workgroup_size(8, 8, 1)
|
||
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||
if (!uiResolvePixel(gid.xy)) { return; }
|
||
let coord = vec2<i32>(i32(gid.x), i32(gid.y));
|
||
var dst = textureLoad(prevTex, coord, 0);
|
||
let sp = vec2<f32>(f32(gid.x) + 0.5, f32(gid.y) + 0.5);
|
||
for (var i: u32 = 0u; i < hdr.itemCount; i = i + 1u) {
|
||
let it = items[i];
|
||
let lo = it.rect.xy;
|
||
let hi = it.rect.xy + it.rect.zw;
|
||
if (sp.x < lo.x || sp.y < lo.y || sp.x >= hi.x || sp.y >= hi.y) { continue; }
|
||
let t = (sp - lo) / it.rect.zw;
|
||
let uv = vec2<f32>(mix(it.uv.x, it.uv.z, t.x), mix(it.uv.y, it.uv.w, t.y));
|
||
let sample = textureSampleLevel(imgTex, imgSampler, uv, 0.0);
|
||
let src = sample * it.tint;
|
||
if (src.a <= 0.0) { continue; }
|
||
dst = uiBlendOver(dst, src);
|
||
}
|
||
textureStore(outTex, coord, dst);
|
||
}
|
||
`;
|
||
|
||
const wgslText = wgslShared + String.raw`
|
||
struct GlyphItem {
|
||
rect: vec4<f32>,
|
||
uv: vec4<f32>,
|
||
color: vec4<f32>,
|
||
};
|
||
@group(2) @binding(0) var<storage, read> items : array<GlyphItem>;
|
||
@group(3) @binding(0) var atlasTex : texture_2d<f32>;
|
||
@group(3) @binding(1) var atlasSampler : sampler;
|
||
|
||
@compute @workgroup_size(8, 8, 1)
|
||
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||
if (!uiResolvePixel(gid.xy)) { return; }
|
||
let coord = vec2<i32>(i32(gid.x), i32(gid.y));
|
||
var dst = textureLoad(prevTex, coord, 0);
|
||
let sp = vec2<f32>(f32(gid.x) + 0.5, f32(gid.y) + 0.5);
|
||
for (var i: u32 = 0u; i < hdr.itemCount; i = i + 1u) {
|
||
let it = items[i];
|
||
let lo = it.rect.xy;
|
||
let hi = it.rect.xy + it.rect.zw;
|
||
if (sp.x < lo.x || sp.y < lo.y || sp.x >= hi.x || sp.y >= hi.y) { continue; }
|
||
let t = (sp - lo) / it.rect.zw;
|
||
let uv = vec2<f32>(mix(it.uv.x, it.uv.z, t.x), mix(it.uv.y, it.uv.w, t.y));
|
||
// stb_truetype SDF: pixel value ~128 is the edge. Treat alpha as
|
||
// the smoothed step around that midpoint.
|
||
let sdf = textureSampleLevel(atlasTex, atlasSampler, uv, 0.0).r;
|
||
let alpha = clamp((sdf - 0.5) * 8.0 + 0.5, 0.0, 1.0);
|
||
if (alpha <= 0.0) { continue; }
|
||
let src = vec4<f32>(it.color.rgb, it.color.a * alpha);
|
||
dst = uiBlendOver(dst, src);
|
||
}
|
||
textureStore(outTex, coord, dst);
|
||
}
|
||
`;
|
||
|
||
function makePipeline(label, wgsl, hasGroup3) {
|
||
const mod = device.createShaderModule({ label, code: wgsl });
|
||
// Layout: group 0 uniform header, group 1 (out storage + prev sampled),
|
||
// group 2 storage items SSBO, optional group 3 (texture + sampler).
|
||
const bgl0 = device.createBindGroupLayout({ entries: [
|
||
{ binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: "uniform", hasDynamicOffset: true, minBindingSize: 48 } },
|
||
]});
|
||
const bgl1 = device.createBindGroupLayout({ entries: [
|
||
{ binding: 0, visibility: GPUShaderStage.COMPUTE, storageTexture: { format: "rgba8unorm", access: "write-only", viewDimension: "2d" } },
|
||
{ binding: 1, visibility: GPUShaderStage.COMPUTE, texture: { sampleType: "float", viewDimension: "2d" } },
|
||
]});
|
||
const bgl2 = device.createBindGroupLayout({ entries: [
|
||
{ binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||
]});
|
||
const layouts = [bgl0, bgl1, bgl2];
|
||
let bgl3 = null;
|
||
if (hasGroup3) {
|
||
bgl3 = device.createBindGroupLayout({ entries: [
|
||
{ binding: 0, visibility: GPUShaderStage.COMPUTE, texture: { sampleType: "float", viewDimension: "2d" } },
|
||
{ binding: 1, visibility: GPUShaderStage.COMPUTE, sampler: { type: "filtering" } },
|
||
]});
|
||
layouts.push(bgl3);
|
||
}
|
||
const pl = device.createPipelineLayout({ bindGroupLayouts: layouts });
|
||
const pipeline = device.createComputePipeline({
|
||
layout: pl,
|
||
compute: { module: mod, entryPoint: "main" },
|
||
});
|
||
return { pipeline, bgl0, bgl1, bgl2, bgl3 };
|
||
}
|
||
|
||
const pipeQuads = makePipeline("ui-quads", wgslQuads, false);
|
||
const pipeCircles = makePipeline("ui-circles", wgslCircles, false);
|
||
const pipeImages = makePipeline("ui-images", wgslImages, true);
|
||
const pipeText = makePipeline("ui-text", wgslText, true);
|
||
|
||
// Bind groups for group 0 (header uniform with dynamic offset) — one per
|
||
// pipeline, references the same ring buffer.
|
||
function makeHdrBG(pipe) {
|
||
return device.createBindGroup({
|
||
layout: pipe.bgl0,
|
||
entries: [{ binding: 0, resource: { buffer: state.headerRing, offset: 0, size: 48 } }],
|
||
});
|
||
}
|
||
const hdrBG = {
|
||
quads: makeHdrBG(pipeQuads),
|
||
circles: makeHdrBG(pipeCircles),
|
||
images: makeHdrBG(pipeImages),
|
||
text: makeHdrBG(pipeText),
|
||
};
|
||
|
||
// Group 1 changes between dispatches because `out` and `prev` swap on the
|
||
// ping-pong. Cached by current ping-pong direction and texture size; the
|
||
// stored bind group is reusable across all pipelines that share a
|
||
// layout-compatible bgl1 (all standard pipelines and custom shaders do,
|
||
// since they declare identical group-1 entries per the contract).
|
||
function getGroup1BG(bgl1) {
|
||
const key = `g1/${state.outIsPing ? 1 : 0}/${state.width}x${state.height}`;
|
||
let bg = state.bindGroupCache.get(key);
|
||
if (bg) return bg;
|
||
const outView = state.outIsPing ? state.pingView : state.pongView;
|
||
const prevView = state.outIsPing ? state.pongView : state.pingView;
|
||
bg = device.createBindGroup({
|
||
layout: bgl1,
|
||
entries: [
|
||
{ binding: 0, resource: outView },
|
||
{ binding: 1, resource: prevView },
|
||
],
|
||
});
|
||
state.bindGroupCache.set(key, bg);
|
||
return bg;
|
||
}
|
||
|
||
function getGroup2BG(pipe, itemsHandle) {
|
||
const key = `items/${pipe === pipeQuads ? "q" : pipe === pipeCircles ? "c" : pipe === pipeImages ? "i" : "t"}/${itemsHandle}`;
|
||
let bg = state.bindGroupCache.get(key);
|
||
if (bg) return bg;
|
||
const buf = buffers.get(itemsHandle);
|
||
if (!buf) throw new Error(`getGroup2BG: unknown items buffer ${itemsHandle}`);
|
||
bg = device.createBindGroup({
|
||
layout: pipe.bgl2,
|
||
entries: [{ binding: 0, resource: { buffer: buf } }],
|
||
});
|
||
state.bindGroupCache.set(key, bg);
|
||
return bg;
|
||
}
|
||
|
||
function getGroup3BG(pipe, texHandle, sampHandle) {
|
||
const key = `t3/${texHandle}/${sampHandle}/${pipe === pipeImages ? "i" : "x"}`;
|
||
let bg = state.bindGroupCache.get(key);
|
||
if (bg) return bg;
|
||
const tex = textureViews.get(texHandle);
|
||
const sam = samplers.get(sampHandle);
|
||
if (!tex || !sam) throw new Error(`getGroup3BG: unknown view ${texHandle} / sampler ${sampHandle}`);
|
||
bg = device.createBindGroup({
|
||
layout: pipe.bgl3,
|
||
entries: [
|
||
{ binding: 0, resource: tex },
|
||
{ binding: 1, resource: sam },
|
||
],
|
||
});
|
||
state.bindGroupCache.set(key, bg);
|
||
return bg;
|
||
}
|
||
|
||
// ─── wasm import surface ───────────────────────────────────────────────
|
||
|
||
let wasmExports = null;
|
||
|
||
// Crafter.Build's runtime.js exposes the wasi instance on
|
||
// window.crafter_wasi after instantiation. We grab the exports lazily so
|
||
// every import-side function works regardless of call order. memU8 /
|
||
// memF32 / memU32 always re-derive the typed-array view because the
|
||
// wasm memory's backing ArrayBuffer is detached and replaced whenever
|
||
// the wasm grows its memory; caching a typed array would alias to
|
||
// freed memory after a grow.
|
||
function getExports() {
|
||
if (wasmExports) return wasmExports;
|
||
const wasi = window.crafter_wasi;
|
||
if (!wasi || !wasi.instance) {
|
||
throw new Error("[crafter-wgpu] wasm exports not available yet (called too early)");
|
||
}
|
||
wasmExports = wasi.instance.exports;
|
||
return wasmExports;
|
||
}
|
||
function memU8() { return new Uint8Array(getExports().memory.buffer); }
|
||
function memF32() { return new Float32Array(getExports().memory.buffer); }
|
||
function memU32() { return new Uint32Array(getExports().memory.buffer); }
|
||
|
||
// Stubs were assigned at the top of this file; we now overwrite them with
|
||
// real implementations now that init has succeeded.
|
||
const env = window.crafter_webbuild_env;
|
||
|
||
env.wgpuGetCanvasWidth = () => canvas.width;
|
||
env.wgpuGetCanvasHeight = () => canvas.height;
|
||
|
||
env.wgpuCreateBuffer = (byteSize) => {
|
||
const h = newHandle();
|
||
const buf = device.createBuffer({
|
||
size: Math.max(16, byteSize),
|
||
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC,
|
||
});
|
||
buffers.set(h, buf);
|
||
return h;
|
||
};
|
||
env.wgpuWriteBuffer = (handle, srcPtr, byteSize) => {
|
||
state.writeBufferCount = (state.writeBufferCount || 0) + 1;
|
||
state.lastWriteHandle = handle;
|
||
state.lastWriteSize = byteSize;
|
||
const buf = buffers.get(handle);
|
||
if (!buf) return;
|
||
// writeBuffer requires a multiple of 4 bytes and an aligned offset.
|
||
const aligned = (byteSize + 3) & ~3;
|
||
queue.writeBuffer(buf, 0, memU8().buffer, srcPtr, aligned);
|
||
};
|
||
// Partial write — copies a sub-range of `srcPtr` into the GPU buffer at
|
||
// `dstByteOffset`. Used by BuildTLAS to skip the transform field of
|
||
// GPU-owned instances; the physics-tlas-transform compute shader is the
|
||
// sole writer of those bytes and we must not clobber its output with a
|
||
// stale CPU mirror.
|
||
env.wgpuWriteBufferRange = (handle, dstByteOffset, srcPtr, byteSize) => {
|
||
const buf = buffers.get(handle);
|
||
if (!buf) return;
|
||
const aligned = (byteSize + 3) & ~3;
|
||
queue.writeBuffer(buf, dstByteOffset, memU8().buffer, srcPtr, aligned);
|
||
};
|
||
|
||
// ── GPU→CPU readback (staging + mapAsync) ──────────────────────────────
|
||
//
|
||
// WebGPU storage buffers can't be CPU-mapped directly (STORAGE usage is
|
||
// incompatible with MAP_READ). Each readback keeps a parallel staging
|
||
// buffer (MAP_READ | COPY_DST) at the same size. wgpuReadbackEnqueue
|
||
// copies the storage buffer into the staging buffer and kicks off an
|
||
// async map. wgpuReadbackPoll synchronously returns whether the map has
|
||
// resolved; if so it copies the bytes into the caller's wasm pointer
|
||
// and the slot is ready for the next Enqueue.
|
||
//
|
||
// Used by Forts3D's physics event drain to read the GPU-written
|
||
// destroy/hit/splash event queues with a one-frame latency.
|
||
|
||
const READBACK_IDLE = 0;
|
||
const READBACK_PENDING = 1;
|
||
const READBACK_READY = 2;
|
||
const readbacks = new Map(); // device-buffer handle → { staging, size, state, pendingData }
|
||
// Readbacks scheduled this frame that still need their mapAsync kicked
|
||
// off — done after the frame's queue.submit so the map waits for the
|
||
// compute writes that wrote to `buf` to finish, not just the standalone
|
||
// copy encoder.
|
||
const pendingReadbackMaps = [];
|
||
|
||
env.wgpuReadbackEnqueue = (handle, byteSize, resetBytes) => {
|
||
const buf = buffers.get(handle);
|
||
if (!buf) return;
|
||
const aligned = (byteSize + 3) & ~3;
|
||
const resetAligned = resetBytes > 0 ? ((resetBytes + 3) & ~3) : 0;
|
||
let rb = readbacks.get(handle);
|
||
if (!rb) {
|
||
rb = {
|
||
staging: device.createBuffer({
|
||
size: Math.max(16, aligned),
|
||
usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST,
|
||
}),
|
||
size: aligned,
|
||
state: READBACK_IDLE,
|
||
pendingData: null,
|
||
};
|
||
readbacks.set(handle, rb);
|
||
}
|
||
if (rb.state !== READBACK_IDLE) {
|
||
// Previous map still in flight (or has data nobody polled yet);
|
||
// skip this enqueue AND the paired reset. Events written by the
|
||
// current frame's substeps will accumulate in `buf` and get
|
||
// picked up by the next successful enqueue (which captures the
|
||
// post-accumulation count and then clears it). Resetting here
|
||
// without copying would wipe those events before they could be
|
||
// drained, causing destroyed projectiles to never reach the CPU.
|
||
return;
|
||
}
|
||
|
||
if (state.encoder) {
|
||
// Mid-frame: piggy-back the copy on the frame's main encoder so
|
||
// it runs AFTER the compute dispatches that wrote `buf`. End the
|
||
// active pass, encode the copy (+ optional reset), reopen the
|
||
// pass. mapAsync gets deferred to wgpuFrameEnd (after
|
||
// queue.submit) — the map resolves once the GPU has finished
|
||
// the submitted work, including our just-encoded copy.
|
||
if (state.pass) {
|
||
state.pass.end();
|
||
state.pass = null;
|
||
}
|
||
state.encoder.copyBufferToBuffer(buf, 0, rb.staging, 0, aligned);
|
||
if (resetAligned > 0) {
|
||
// Clear the first `resetAligned` bytes of `buf` so the next
|
||
// frame's atomic-add starts at 0. Encoded after the copy,
|
||
// so the staging captures the pre-clear count. Tied to a
|
||
// successful enqueue — if the enqueue was skipped above the
|
||
// clear is skipped too, preserving events for the next
|
||
// successful drain.
|
||
state.encoder.clearBuffer(buf, 0, resetAligned);
|
||
}
|
||
state.pass = state.encoder.beginComputePass();
|
||
rb.state = READBACK_PENDING;
|
||
pendingReadbackMaps.push(rb);
|
||
} else {
|
||
// Standalone (no frame in progress): submit our own encoder and
|
||
// kick off mapAsync immediately.
|
||
const enc = device.createCommandEncoder();
|
||
enc.copyBufferToBuffer(buf, 0, rb.staging, 0, aligned);
|
||
if (resetAligned > 0) enc.clearBuffer(buf, 0, resetAligned);
|
||
queue.submit([enc.finish()]);
|
||
rb.state = READBACK_PENDING;
|
||
rb.staging.mapAsync(GPUMapMode.READ).then(() => {
|
||
rb.pendingData = new Uint8Array(rb.staging.getMappedRange()).slice();
|
||
rb.staging.unmap();
|
||
rb.state = READBACK_READY;
|
||
}).catch(e => {
|
||
console.error("[crafter-wgpu] readback mapAsync failed:", e);
|
||
rb.state = READBACK_IDLE;
|
||
});
|
||
}
|
||
};
|
||
|
||
// Returns 1 if the readback for `handle` has completed and bytes were
|
||
// copied into `dstPtr`; returns 0 otherwise (caller retries next frame).
|
||
// After a successful poll the slot is idle again, ready for the next
|
||
// Enqueue.
|
||
env.wgpuReadbackPoll = (handle, dstPtr, byteSize) => {
|
||
const rb = readbacks.get(handle);
|
||
if (!rb || rb.state !== READBACK_READY) return 0;
|
||
memU8().set(rb.pendingData.subarray(0, byteSize), dstPtr);
|
||
rb.pendingData = null;
|
||
rb.state = READBACK_IDLE;
|
||
return 1;
|
||
};
|
||
// Non-consuming readiness check. Lets callers verify multiple readbacks
|
||
// have all resolved before consuming any — needed when a logical drain
|
||
// reads parallel header/array buffers and mustn't ack the header until
|
||
// the array bytes are also available, or the events get lost.
|
||
env.wgpuReadbackReady = (handle) => {
|
||
const rb = readbacks.get(handle);
|
||
return (rb && rb.state === READBACK_READY) ? 1 : 0;
|
||
};
|
||
env.wgpuDestroyBuffer = (handle) => {
|
||
const buf = buffers.get(handle);
|
||
if (buf) { buf.destroy(); buffers.delete(handle); }
|
||
// Invalidate any cached bind group that referenced this handle.
|
||
for (const k of state.bindGroupCache.keys()) {
|
||
if (k.startsWith("items/") && k.endsWith("/" + handle)) {
|
||
state.bindGroupCache.delete(k);
|
||
}
|
||
}
|
||
};
|
||
|
||
env.wgpuCreateAtlasTexture = (w, h) => {
|
||
const handle = newHandle();
|
||
const tex = device.createTexture({
|
||
size: [w, h],
|
||
format: "r8unorm",
|
||
usage: GPUTextureUsage.TEXTURE_BINDING | GPUTextureUsage.COPY_DST,
|
||
});
|
||
textures.set(handle, tex);
|
||
textureViews.set(handle, tex.createView());
|
||
return handle;
|
||
};
|
||
env.wgpuWriteAtlasRegion = (handle, srcPtr, srcW, srcH, srcBytesPerRow, dstX, dstY, copyW, copyH) => {
|
||
const tex = textures.get(handle);
|
||
if (!tex) return;
|
||
// For r8unorm, 1 byte per pixel; writeTexture requires bytesPerRow >= 256
|
||
// OR == width if width*1 % 256 === 0 — for arbitrary widths we need to
|
||
// re-pack into a 256-aligned staging buffer.
|
||
const alignedBPR = Math.max(256, (srcBytesPerRow + 255) & ~255);
|
||
if (alignedBPR === srcBytesPerRow) {
|
||
const bytes = memU8().subarray(srcPtr + dstY * srcBytesPerRow + dstX,
|
||
srcPtr + (dstY + copyH) * srcBytesPerRow);
|
||
queue.writeTexture(
|
||
{ texture: tex, origin: { x: dstX, y: dstY } },
|
||
bytes,
|
||
{ bytesPerRow: srcBytesPerRow, rowsPerImage: copyH },
|
||
{ width: copyW, height: copyH }
|
||
);
|
||
} else {
|
||
// Repack copyW × copyH starting at (dstX, dstY) in the source.
|
||
const staging = new Uint8Array(alignedBPR * copyH);
|
||
const src = memU8();
|
||
for (let y = 0; y < copyH; y++) {
|
||
const srcRow = (dstY + y) * srcBytesPerRow + dstX;
|
||
staging.set(src.subarray(srcPtr + srcRow, srcPtr + srcRow + copyW),
|
||
y * alignedBPR);
|
||
}
|
||
queue.writeTexture(
|
||
{ texture: tex, origin: { x: dstX, y: dstY } },
|
||
staging,
|
||
{ bytesPerRow: alignedBPR, rowsPerImage: copyH },
|
||
{ width: copyW, height: copyH }
|
||
);
|
||
}
|
||
};
|
||
env.wgpuDestroyTexture = (handle) => {
|
||
const tex = textures.get(handle);
|
||
if (tex) { tex.destroy(); textures.delete(handle); textureViews.delete(handle); }
|
||
};
|
||
|
||
// General-purpose 2D rgba8unorm texture, used by Image2D<RGBA8>. Distinct
|
||
// from the atlas path (r8unorm, sub-region writes) — this one's a one-shot
|
||
// upload of a whole image, sized to the pixel data the caller hands over.
|
||
env.wgpuCreateImage2D = (w, h) => {
|
||
const handle = newHandle();
|
||
const tex = device.createTexture({
|
||
size: [w, h],
|
||
format: "rgba8unorm",
|
||
usage: GPUTextureUsage.TEXTURE_BINDING | GPUTextureUsage.COPY_DST,
|
||
});
|
||
textures.set(handle, tex);
|
||
textureViews.set(handle, tex.createView());
|
||
return handle;
|
||
};
|
||
// 2D texture array — N layers of identical (w × h) rgba8unorm with
|
||
// `mipLevels` mip levels. Pass mipLevels=1 for a single-level texture.
|
||
// Used by Image2DArray<RGBA8> to back one material albedo per layer;
|
||
// shaders sample with `textureSampleLevel(tex, samp, uv, layerIdx, lod)`
|
||
// where lod ∈ [0, mipLevels-1].
|
||
env.wgpuCreateImage2DArray = (w, h, layerCount, mipLevels) => {
|
||
const handle = newHandle();
|
||
const mips = (typeof mipLevels === "number" && mipLevels > 0) ? mipLevels : 1;
|
||
const tex = device.createTexture({
|
||
size: [w, h, layerCount],
|
||
dimension: "2d",
|
||
format: "rgba8unorm",
|
||
mipLevelCount: mips,
|
||
usage: GPUTextureUsage.TEXTURE_BINDING | GPUTextureUsage.COPY_DST,
|
||
});
|
||
textures.set(handle, tex);
|
||
textureViews.set(handle, tex.createView({
|
||
dimension: "2d-array",
|
||
arrayLayerCount: layerCount,
|
||
mipLevelCount: mips,
|
||
}));
|
||
return handle;
|
||
};
|
||
// Upload a single mip level of one array layer. `level` indexes into the
|
||
// texture's mip chain; `w` / `h` are the dimensions at that level (= base
|
||
// dimensions >> level). Caller supplies the pre-downsampled bytes for
|
||
// each level — Image2DArray::UpdateLayer on the C++ side does the box-
|
||
// filter chain.
|
||
env.wgpuWriteImage2DLayer = (handle, layer, level, srcPtr, byteSize, w, h) => {
|
||
const tex = textures.get(handle);
|
||
if (!tex) return;
|
||
const srcBPR = w * 4;
|
||
const alignedBPR = (srcBPR + 255) & ~255;
|
||
if (alignedBPR === srcBPR) {
|
||
queue.writeTexture(
|
||
{ texture: tex, mipLevel: level, origin: [0, 0, layer] },
|
||
memU8().subarray(srcPtr, srcPtr + byteSize),
|
||
{ bytesPerRow: srcBPR, rowsPerImage: h },
|
||
{ width: w, height: h, depthOrArrayLayers: 1 }
|
||
);
|
||
} else {
|
||
const staging = new Uint8Array(alignedBPR * h);
|
||
const src = memU8();
|
||
for (let y = 0; y < h; y++) {
|
||
staging.set(src.subarray(srcPtr + y * srcBPR, srcPtr + (y + 1) * srcBPR),
|
||
y * alignedBPR);
|
||
}
|
||
queue.writeTexture(
|
||
{ texture: tex, mipLevel: level, origin: [0, 0, layer] },
|
||
staging,
|
||
{ bytesPerRow: alignedBPR, rowsPerImage: h },
|
||
{ width: w, height: h, depthOrArrayLayers: 1 }
|
||
);
|
||
}
|
||
};
|
||
|
||
env.wgpuWriteImage2D = (handle, srcPtr, byteSize, w, h) => {
|
||
const tex = textures.get(handle);
|
||
if (!tex) return;
|
||
// queue.writeTexture wants bytesPerRow as a multiple of 256, OR == width*bpp
|
||
// when the source is contiguous. RGBA8 = 4 bpp, so bytesPerRow = w*4.
|
||
const srcBPR = w * 4;
|
||
const alignedBPR = (srcBPR + 255) & ~255;
|
||
if (alignedBPR === srcBPR) {
|
||
// Already aligned (w * 4 is a multiple of 256 → w is a multiple of 64).
|
||
queue.writeTexture(
|
||
{ texture: tex },
|
||
memU8().subarray(srcPtr, srcPtr + byteSize),
|
||
{ bytesPerRow: srcBPR, rowsPerImage: h },
|
||
{ width: w, height: h }
|
||
);
|
||
} else {
|
||
// Repack into a 256-aligned staging buffer. One alloc per Update,
|
||
// freed when the function returns — fine for asset-load time use.
|
||
const staging = new Uint8Array(alignedBPR * h);
|
||
const src = memU8();
|
||
for (let y = 0; y < h; y++) {
|
||
staging.set(src.subarray(srcPtr + y * srcBPR, srcPtr + (y + 1) * srcBPR),
|
||
y * alignedBPR);
|
||
}
|
||
queue.writeTexture(
|
||
{ texture: tex },
|
||
staging,
|
||
{ bytesPerRow: alignedBPR, rowsPerImage: h },
|
||
{ width: w, height: h }
|
||
);
|
||
}
|
||
};
|
||
|
||
env.wgpuCreateLinearClampSampler = () => {
|
||
const handle = newHandle();
|
||
samplers.set(handle, device.createSampler({
|
||
magFilter: "linear", minFilter: "linear",
|
||
addressModeU: "clamp-to-edge", addressModeV: "clamp-to-edge",
|
||
}));
|
||
return handle;
|
||
};
|
||
|
||
env.wgpuCreateLinearRepeatSampler = () => {
|
||
const handle = newHandle();
|
||
samplers.set(handle, device.createSampler({
|
||
magFilter: "linear", minFilter: "linear",
|
||
mipmapFilter: "linear",
|
||
addressModeU: "repeat", addressModeV: "repeat",
|
||
}));
|
||
return handle;
|
||
};
|
||
|
||
// ─── per-frame ──────────────────────────────────────────────────────────
|
||
|
||
env.wgpuFrameBegin = () => {
|
||
state.frameBeginCount = (state.frameBeginCount || 0) + 1;
|
||
if (state.gpuLost) return;
|
||
ensureSized();
|
||
state.encoder = device.createCommandEncoder();
|
||
state.outIsPing = true; // reset so each frame starts on the same target
|
||
state.headerRingOffset = 0;
|
||
// DON'T clearBuffer the header ring here. queue.writeBuffer ops from
|
||
// writeHeader() are enqueued BEFORE this command buffer's submit,
|
||
// so an encoded clearBuffer would wipe them — the dispatches would
|
||
// then read all-zero headers and uiResolvePixel would reject every
|
||
// pixel (surfaceW=0).
|
||
clearStorageTexture(state.encoder, state.outIsPing ? state.pingTex : state.pongTex,
|
||
state.width, state.height);
|
||
state.pass = state.encoder.beginComputePass();
|
||
};
|
||
|
||
let zeroBuffer = null;
|
||
let zeroBufferSize = 0;
|
||
function clearStorageTexture(encoder, tex, w, h) {
|
||
const bpr = (w * 4 + 255) & ~255;
|
||
const need = bpr * h;
|
||
if (!zeroBuffer || zeroBufferSize < need) {
|
||
if (zeroBuffer) zeroBuffer.destroy();
|
||
zeroBuffer = device.createBuffer({ size: need, usage: GPUBufferUsage.COPY_SRC, mappedAtCreation: true });
|
||
new Uint8Array(zeroBuffer.getMappedRange()).fill(0);
|
||
zeroBuffer.unmap();
|
||
zeroBufferSize = need;
|
||
}
|
||
encoder.copyBufferToTexture(
|
||
{ buffer: zeroBuffer, bytesPerRow: bpr, rowsPerImage: h },
|
||
{ texture: tex },
|
||
{ width: w, height: h, depthOrArrayLayers: 1 }
|
||
);
|
||
}
|
||
|
||
env.wgpuFrameEnd = () => {
|
||
state.frameEndCount = (state.frameEndCount || 0) + 1;
|
||
if (state.gpuLost || !state.encoder) return;
|
||
state.pass.end();
|
||
state.pass = null;
|
||
// Blit last-written ping-pong texture → canvas. After N dispatches,
|
||
// state.outIsPing points at the NEXT write target, so the latest
|
||
// content lives in the OPPOSITE texture.
|
||
const finalTex = state.outIsPing ? state.pongTex : state.pingTex;
|
||
const canvasTex = ctx.getCurrentTexture();
|
||
state.encoder.copyTextureToTexture(
|
||
{ texture: finalTex },
|
||
{ texture: canvasTex },
|
||
{ width: state.width, height: state.height, depthOrArrayLayers: 1 }
|
||
);
|
||
queue.submit([state.encoder.finish()]);
|
||
state.encoder = null;
|
||
|
||
// Kick off mapAsync for the readbacks whose copyBufferToBuffer we
|
||
// piggy-backed onto the just-submitted encoder. Doing this after
|
||
// submit ensures the map waits for that submission's GPU work to
|
||
// complete, so the staging buffer reflects this frame's compute
|
||
// writes (not pre-substep state).
|
||
while (pendingReadbackMaps.length > 0) {
|
||
const rb = pendingReadbackMaps.pop();
|
||
rb.staging.mapAsync(GPUMapMode.READ).then(() => {
|
||
rb.pendingData = new Uint8Array(rb.staging.getMappedRange()).slice();
|
||
rb.staging.unmap();
|
||
rb.state = READBACK_READY;
|
||
}).catch(e => {
|
||
console.error("[crafter-wgpu] readback mapAsync failed:", e);
|
||
rb.state = READBACK_IDLE;
|
||
});
|
||
}
|
||
};
|
||
|
||
// Write a push struct into the ring buffer at the current offset (which
|
||
// is incremented and 256-aligned). Standard dispatches pass just the
|
||
// 48-byte UIDispatchHeader; custom shaders may pass up to HEADER_ALIGN
|
||
// bytes — anything past the header is the user's per-dispatch struct,
|
||
// declared in WGSL as additional fields after UIDispatchHeader at
|
||
// @group(0) @binding(0). Returns the dynamic offset to pass to
|
||
// setBindGroup.
|
||
function writeHeader(headerPtr, bytes = 48) {
|
||
const upload = Math.min(bytes, HEADER_ALIGN);
|
||
const offset = state.headerRingOffset;
|
||
if (offset + HEADER_ALIGN > state.headerRingSize) {
|
||
// Ring is small enough that overrun in one frame means too many
|
||
// dispatches. Soft-wrap; correctness already requires the ring
|
||
// be large enough.
|
||
state.headerRingOffset = 0;
|
||
}
|
||
queue.writeBuffer(state.headerRing, state.headerRingOffset,
|
||
memU8().buffer, headerPtr, upload);
|
||
state.headerRingOffset += HEADER_ALIGN;
|
||
return offset;
|
||
}
|
||
|
||
function dispatchStandard(pipe, hdrBindGroup, headerPtr, gx, gy, itemsHandle, group3) {
|
||
if (!state.pass) return;
|
||
const off = writeHeader(headerPtr);
|
||
state.pass.setPipeline(pipe.pipeline);
|
||
state.pass.setBindGroup(0, hdrBindGroup, [off]);
|
||
state.pass.setBindGroup(1, getGroup1BG(pipe.bgl1));
|
||
state.pass.setBindGroup(2, getGroup2BG(pipe, itemsHandle));
|
||
if (group3) state.pass.setBindGroup(3, group3);
|
||
state.pass.dispatchWorkgroups(gx, gy, 1);
|
||
// Flip ping-pong: the texture we just wrote becomes next dispatch's prev.
|
||
state.outIsPing = !state.outIsPing;
|
||
}
|
||
|
||
env.wgpuDispatchQuads = (itemsHandle, headerPtr, gx, gy) => {
|
||
state.dispatchQuadsCount = (state.dispatchQuadsCount || 0) + 1;
|
||
dispatchStandard(pipeQuads, hdrBG.quads, headerPtr, gx, gy, itemsHandle, null);
|
||
};
|
||
env.wgpuDispatchCircles = (itemsHandle, headerPtr, gx, gy) => {
|
||
dispatchStandard(pipeCircles, hdrBG.circles, headerPtr, gx, gy, itemsHandle, null);
|
||
};
|
||
env.wgpuDispatchImages = (itemsHandle, headerPtr, gx, gy, texHandle, sampHandle) => {
|
||
const g3 = getGroup3BG(pipeImages, texHandle, sampHandle);
|
||
dispatchStandard(pipeImages, hdrBG.images, headerPtr, gx, gy, itemsHandle, g3);
|
||
};
|
||
env.wgpuDispatchText = (itemsHandle, headerPtr, gx, gy, atlasHandle, sampHandle) => {
|
||
const g3 = getGroup3BG(pipeText, atlasHandle, sampHandle);
|
||
dispatchStandard(pipeText, hdrBG.text, headerPtr, gx, gy, itemsHandle, g3);
|
||
};
|
||
|
||
// ─── custom user-authored shaders ─────────────────────────────────────
|
||
//
|
||
// Bind-group contract (mirrors :WebGPUComputeShader.cppm):
|
||
// group 0 binding 0 — uniform UIDispatchHeader (dynamic offset, 48b)
|
||
// group 1 binding 0 — texture_storage_2d<rgba8unorm, write> out
|
||
// group 1 binding 1 — texture_2d<f32> prev
|
||
// group 2+ — user-declared (UICustomBinding entries)
|
||
//
|
||
// Each UICustomBinding entry on the wasm side is 8 bytes:
|
||
// u8 group, u8 binding, u8 kind, u8 pad, u32 pushOffset
|
||
// kind: 0 = read-only-storage SSBO, 1 = sampled tex 2d, 2 = filtering sampler.
|
||
|
||
const customPipelines = new Map(); // handle → { pipeline, bgls, hdrBG, byGroup }
|
||
|
||
env.wgpuLoadCustomShader = (wgslPtr, wgslLen, bindingsPtr, bindingsCount, rayQueryFlag) => {
|
||
if (!rtState.vertHeap && rayQueryFlag) rtInit();
|
||
const userWgsl = new TextDecoder().decode(memU8().subarray(wgslPtr, wgslPtr + wgslLen));
|
||
// For rayQuery-capable shaders, prepend the RT prelude + ray-query
|
||
// library. The user shader can declare its own group 0 / 2+ bindings
|
||
// but MUST NOT redeclare group(1) — that's reserved for RT data.
|
||
const wgsl = rayQueryFlag
|
||
// rayQueryLib's _rqTraverseBlas/_rqTraverseTlas call _rtAabb,
|
||
// _rtFetchTri, _rtTri from rtWgslPureHelpers — must prepend
|
||
// the pure helper subset (NOT the megakernel-only traversal
|
||
// routines, which reference user-emitted runAnyHit/runMiss/
|
||
// runClosestHit and won't compile outside the raygen pipeline).
|
||
? (rtWgslTypes + rtWgslMegakernelBindings + rtWgslPureHelpers + rtWgslRayQueryLib + "\n" + userWgsl)
|
||
: userWgsl;
|
||
|
||
const bindings = [];
|
||
const dv = new DataView(memU8().buffer, bindingsPtr, bindingsCount * 8);
|
||
for (let i = 0; i < bindingsCount; i++) {
|
||
bindings.push({
|
||
group: dv.getUint8(i*8 + 0),
|
||
binding: dv.getUint8(i*8 + 1),
|
||
kind: dv.getUint8(i*8 + 2),
|
||
pushOffset: dv.getUint32(i*8 + 4, true),
|
||
});
|
||
}
|
||
|
||
// Group bindings by @group(N) for layout creation.
|
||
const byGroup = new Map();
|
||
for (const b of bindings) {
|
||
if (b.group < 2) {
|
||
console.error(`[crafter-wgpu] custom shader: @group(${b.group}) reserved; use groups >= 2`);
|
||
return 0;
|
||
}
|
||
if (!byGroup.has(b.group)) byGroup.set(b.group, []);
|
||
byGroup.get(b.group).push(b);
|
||
}
|
||
|
||
// Group 0 = header uniform (same for both paths).
|
||
// Group 1 = ping-pong out+prev OR RT data (TLAS, BVH, meshRecs, verts,
|
||
// idx, primRemap, outImage) when rayQuery flag is on.
|
||
//
|
||
// Custom shaders may declare a struct at @group(0) @binding(0) up to
|
||
// HEADER_ALIGN (256) bytes — the standard UIDispatchHeader is 48 but
|
||
// the wgpuDispatchCustom path uploads the full push buffer so user
|
||
// shaders can read extra fields past the header. minBindingSize=0
|
||
// disables the bound-size lower bound; the actual entry is sized to
|
||
// HEADER_ALIGN at bind time.
|
||
const bgls = [
|
||
device.createBindGroupLayout({ entries: [
|
||
{ binding: 0, visibility: GPUShaderStage.COMPUTE,
|
||
buffer: { type: "uniform", hasDynamicOffset: true, minBindingSize: 0 } },
|
||
]}),
|
||
rayQueryFlag
|
||
? device.createBindGroupLayout({ entries: [
|
||
{ binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||
{ binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||
{ binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||
{ binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||
{ binding: 4, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||
{ binding: 5, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||
{ binding: 6, visibility: GPUShaderStage.COMPUTE,
|
||
storageTexture: { format: "rgba8unorm", access: "write-only", viewDimension: "2d" } },
|
||
{ binding: 7, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||
{ binding: 8, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||
{ binding: 9, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||
]})
|
||
: device.createBindGroupLayout({ entries: [
|
||
{ binding: 0, visibility: GPUShaderStage.COMPUTE,
|
||
storageTexture: { format: "rgba8unorm", access: "write-only", viewDimension: "2d" } },
|
||
{ binding: 1, visibility: GPUShaderStage.COMPUTE,
|
||
texture: { sampleType: "float", viewDimension: "2d" } },
|
||
]}),
|
||
];
|
||
// Sorted custom groups. Pad any gaps with empty bgls (WebGPU pipeline
|
||
// layouts require a contiguous array of GPUBindGroupLayout per group
|
||
// index up to the highest used).
|
||
const sortedGroups = [...byGroup.keys()].sort((a, b) => a - b);
|
||
const highest = sortedGroups.length ? sortedGroups[sortedGroups.length - 1] : 1;
|
||
for (let g = 2; g <= highest; g++) {
|
||
if (byGroup.has(g)) {
|
||
const entries = byGroup.get(g).map(b => {
|
||
const e = { binding: b.binding, visibility: GPUShaderStage.COMPUTE };
|
||
if (b.kind === 0) e.buffer = { type: "read-only-storage" };
|
||
else if (b.kind === 1) e.texture = { sampleType: "float", viewDimension: "2d" };
|
||
else if (b.kind === 2) e.sampler = { type: "filtering" };
|
||
else if (b.kind === 3) e.texture = { sampleType: "float", viewDimension: "2d-array" };
|
||
return e;
|
||
});
|
||
bgls.push(device.createBindGroupLayout({ entries }));
|
||
} else {
|
||
bgls.push(device.createBindGroupLayout({ entries: [] }));
|
||
}
|
||
}
|
||
|
||
let pipeline;
|
||
try {
|
||
const mod = device.createShaderModule({ code: wgsl });
|
||
const layout = device.createPipelineLayout({ bindGroupLayouts: bgls });
|
||
pipeline = device.createComputePipeline({ layout, compute: { module: mod, entryPoint: "main" } });
|
||
} catch (e) {
|
||
console.error("[crafter-wgpu] custom shader compile failed:", e);
|
||
if (rayQueryFlag) console.error("[crafter-wgpu] WGSL was:\n", wgsl);
|
||
return 0;
|
||
}
|
||
|
||
// Bind the full ring slot (HEADER_ALIGN bytes) so custom shaders can
|
||
// declare a struct at @group(0) @binding(0) that's larger than the
|
||
// 48-byte UIDispatchHeader. Standard pipelines keep their tight 48-
|
||
// byte slice. The bgl0 layout's minBindingSize stays at 48 — bigger
|
||
// is allowed.
|
||
const hdrBG = device.createBindGroup({
|
||
layout: bgls[0],
|
||
entries: [{ binding: 0, resource: { buffer: state.headerRing, offset: 0, size: HEADER_ALIGN } }],
|
||
});
|
||
|
||
const handle = newHandle();
|
||
customPipelines.set(handle, { pipeline, bgls, hdrBG, byGroup, sortedGroups, rayQueryCapable: !!rayQueryFlag });
|
||
return handle;
|
||
};
|
||
|
||
env.wgpuDispatchCustom = (pipelineHandle, pushPtr, pushBytes, handlesPtr, handlesCount,
|
||
gx, gy, gz) => {
|
||
state.dispatchCustomCount = (state.dispatchCustomCount || 0) + 1;
|
||
if (!state.pass) return;
|
||
const pipe = customPipelines.get(pipelineHandle);
|
||
if (!pipe) {
|
||
console.error("[crafter-wgpu] wgpuDispatchCustom: unknown pipeline", pipelineHandle);
|
||
return;
|
||
}
|
||
|
||
// Write the full push struct (UIDispatchHeader + any custom tail
|
||
// fields the shader declares) into the ring. WGSL's bound struct
|
||
// size decides how many bytes are actually read on the GPU.
|
||
const off = writeHeader(pushPtr, pushBytes);
|
||
|
||
state.pass.setPipeline(pipe.pipeline);
|
||
state.pass.setBindGroup(0, pipe.hdrBG, [off]);
|
||
// Group 1: rayQuery-capable shaders get the RT data heaps + the most
|
||
// recently built TLAS; everyone else gets the standard ping-pong pair.
|
||
if (pipe.rayQueryCapable) {
|
||
const tlasBuf = buffers.get(rtState.currentTlas);
|
||
if (!tlasBuf) {
|
||
console.error("[crafter-wgpu] rayQuery dispatch but no TLAS built yet");
|
||
return;
|
||
}
|
||
const outView = state.outIsPing ? state.pingView : state.pongView;
|
||
const orderBuf = buffers.get(rtState.currentEntryOrder);
|
||
const bvhBuf = buffers.get(rtState.currentBvh);
|
||
if (!orderBuf || !bvhBuf) {
|
||
console.error("[crafter-wgpu] wgpuDispatchCustom rayQuery: no entryOrder/bins (TLAS not built)");
|
||
return;
|
||
}
|
||
const rtBG = device.createBindGroup({
|
||
layout: pipe.bgls[1],
|
||
entries: [
|
||
{ binding: 0, resource: { buffer: tlasBuf } },
|
||
{ binding: 1, resource: { buffer: rtState.bvhHeap.gpu } },
|
||
{ binding: 2, resource: { buffer: rtState.meshRecordsBuffer } },
|
||
{ binding: 3, resource: { buffer: rtState.vertHeap.gpu } },
|
||
{ binding: 4, resource: { buffer: rtState.indexHeap.gpu } },
|
||
{ binding: 5, resource: { buffer: rtState.primRemapHeap.gpu } },
|
||
{ binding: 6, resource: outView },
|
||
{ binding: 7, resource: { buffer: rtState.attribsHeap.gpu } },
|
||
{ binding: 8, resource: { buffer: orderBuf } },
|
||
{ binding: 9, resource: { buffer: bvhBuf } },
|
||
],
|
||
});
|
||
state.pass.setBindGroup(1, rtBG);
|
||
} else {
|
||
state.pass.setBindGroup(1, getGroup1BG(pipe.bgls[1]));
|
||
}
|
||
|
||
// Walk bindings in declaration order and assemble bind groups.
|
||
// handles[] from wasm is in the SAME order as customBindings, so we
|
||
// pick up indices by walking byGroup in the same sorted order.
|
||
const handles = new Uint32Array(memU8().buffer, handlesPtr, handlesCount);
|
||
let handleIdx = 0;
|
||
for (const g of pipe.sortedGroups) {
|
||
const entries = pipe.byGroup.get(g).map(b => {
|
||
const h = handles[handleIdx++];
|
||
let resource;
|
||
if (b.kind === 0) resource = { buffer: buffers.get(h) };
|
||
else if (b.kind === 1) resource = textureViews.get(h);
|
||
else if (b.kind === 2) resource = samplers.get(h);
|
||
else if (b.kind === 3) resource = textureViews.get(h);
|
||
return { binding: b.binding, resource };
|
||
});
|
||
const bg = device.createBindGroup({ layout: pipe.bgls[g], entries });
|
||
state.pass.setBindGroup(g, bg);
|
||
}
|
||
|
||
state.pass.dispatchWorkgroups(gx, gy, gz);
|
||
state.outIsPing = !state.outIsPing;
|
||
};
|
||
|
||
// Debug accessor for browser-console diagnostics.
|
||
window.crafter_wgpu_state = state;
|
||
window.crafter_wgpu_device = device;
|
||
window.crafter_wgpu_canvasCtx = ctx;
|
||
window.crafter_wgpu_debug = () => ({
|
||
width: state.width, height: state.height,
|
||
outIsPing: state.outIsPing,
|
||
encoderActive: !!state.encoder,
|
||
passActive: !!state.pass,
|
||
bgCacheSize: state.bindGroupCache.size,
|
||
bufferHandles: buffers.size,
|
||
textureHandles: textures.size,
|
||
samplerHandles: samplers.size,
|
||
headerRingOffset: state.headerRingOffset,
|
||
frameBeginCount: state.frameBeginCount || 0,
|
||
frameEndCount: state.frameEndCount || 0,
|
||
dispatchQuadsCount: state.dispatchQuadsCount || 0,
|
||
writeBufferCount: state.writeBufferCount || 0,
|
||
lastWriteHandle: state.lastWriteHandle,
|
||
lastWriteSize: state.lastWriteSize,
|
||
});
|
||
|
||
window.crafter_wgpu_bufferKeys = () => [...buffers.keys()];
|
||
|
||
// Read back the first QuadItem from a registered buffer to verify the
|
||
// GPU sees what the CPU wrote.
|
||
window.crafter_wgpu_readBuffer = async (handle, byteSize = 64) => {
|
||
const buf = buffers.get(handle);
|
||
if (!buf) return "no buffer for handle " + handle;
|
||
const read = device.createBuffer({ size: 256, usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST });
|
||
const enc = device.createCommandEncoder();
|
||
enc.copyBufferToBuffer(buf, 0, read, 0, byteSize);
|
||
device.queue.submit([enc.finish()]);
|
||
await read.mapAsync(GPUMapMode.READ);
|
||
const data = new Float32Array(read.getMappedRange().slice(0, byteSize));
|
||
read.unmap();
|
||
return [...data];
|
||
};
|
||
|
||
// Surface size getters (the wasm side may query these on Resize events).
|
||
env.wgpuSurfaceWidth = () => state.width || canvas.width;
|
||
env.wgpuSurfaceHeight = () => state.height || canvas.height;
|
||
|
||
// One-shot init: forces ping-pong allocation at current canvas size so
|
||
// any Buffer/Texture creation before the first frame works against a
|
||
// concrete size. Called by Crafter::Device::Initialize on the wasm side.
|
||
env.wgpuInit = () => {
|
||
const { w, h } = syncCanvasSize();
|
||
recreatePingPong(w, h);
|
||
};
|
||
|
||
// Resize listener — wires up to the same `resize` event dom-env.js
|
||
// listens to. We trigger sizing on next frame begin; no work here.
|
||
window.addEventListener("resize", () => { /* ensureSized in wgpuFrameBegin */ });
|
||
|
||
// ─────────────────────────────────────────────────────────────────────
|
||
// ── Software raytracing subsystem ────────────────────────────────────
|
||
// ─────────────────────────────────────────────────────────────────────
|
||
//
|
||
// WebGPU has no hardware RT. The library emulates DXR/VKRT semantics in
|
||
// compute: a megakernel raygen pipeline traverses a CPU-built BLAS BVH +
|
||
// GPU-built TLAS, dispatches user closesthit / anyhit / miss via a
|
||
// generated `switch`. The same traversal kernel is also exposed as a
|
||
// rayQuery* function set for regular compute shaders (see the
|
||
// rayQueryCapable path under wgpuLoadCustomShader).
|
||
//
|
||
// The four mesh data streams live in *shared* GPU heaps; each Mesh::Build
|
||
// appends to them and gets a u32 handle back. The handle is what the
|
||
// application stores in RTInstance::accelerationStructureReference.
|
||
|
||
// ── WGSL library: shared types + constants (no bindings) ─────────────
|
||
// Used by both the megakernel pipeline (which adds group(0..1) bindings)
|
||
// and the TLAS-build pipeline (which only uses group(2)). Keeping bindings
|
||
// out of the shared block avoids inflating storage-buffer count past the
|
||
// 8-per-stage baseline limit on pipelines that don't actually use them.
|
||
const rtWgslTypes = String.raw`
|
||
struct RTDispatchHeader {
|
||
surfaceW: u32,
|
||
surfaceH: u32,
|
||
instanceCount: u32,
|
||
flags: u32,
|
||
};
|
||
|
||
struct RayDesc {
|
||
origin: vec3<f32>,
|
||
tMin: f32,
|
||
direction: vec3<f32>,
|
||
tMax: f32,
|
||
};
|
||
|
||
struct HitInfo {
|
||
t: f32,
|
||
instanceId: u32,
|
||
primitiveId: u32,
|
||
hitGroupIndex: u32,
|
||
attribs: vec2<f32>,
|
||
objectRayOrigin: vec3<f32>,
|
||
objectRayDirection: vec3<f32>,
|
||
objectToWorldR0: vec4<f32>,
|
||
objectToWorldR1: vec4<f32>,
|
||
objectToWorldR2: vec4<f32>,
|
||
customIndex: u32,
|
||
};
|
||
|
||
// Matches Crafter::BVHNode in interfaces/Crafter.Graphics-Mesh.cppm.
|
||
struct BVHNode {
|
||
aabbMin: vec3<f32>,
|
||
firstChildOrPrim: u32,
|
||
aabbMax: vec3<f32>,
|
||
primCount: u32,
|
||
};
|
||
|
||
// Per-mesh record. Indexed by RTInstance::accelerationStructureReference.
|
||
// attribsOffset is the per-mesh base index (in u32 words) into the
|
||
// vertexAttribs heap; meshes registered without per-vertex attribs leave
|
||
// it 0 (the heap entries at that range are also 0 / never touched). The
|
||
// per-vertex stride lives in the user's WGSL — the library doesn't store
|
||
// it because the layout is example-defined (Sponza uses 8 u32 / vertex
|
||
// for VertexNormalTangentUVPacked).
|
||
struct MeshRecord {
|
||
rootAabbMin: vec3<f32>,
|
||
vertexOffset: u32,
|
||
rootAabbMax: vec3<f32>,
|
||
indexOffset: u32,
|
||
bvhOffset: u32,
|
||
primRemapOffset: u32,
|
||
triangleCount: u32,
|
||
attribsOffset: u32,
|
||
};
|
||
|
||
// Per-instance TLAS record built by the TLAS-build compute pass.
|
||
struct TLASEntry {
|
||
aabbMin: vec3<f32>,
|
||
maskHGOffset: u32,
|
||
aabbMax: vec3<f32>,
|
||
blasMeshIdx: u32,
|
||
objectToWorldR0: vec4<f32>,
|
||
objectToWorldR1: vec4<f32>,
|
||
objectToWorldR2: vec4<f32>,
|
||
worldToObjectR0: vec4<f32>,
|
||
worldToObjectR1: vec4<f32>,
|
||
worldToObjectR2: vec4<f32>,
|
||
customIndex: u32,
|
||
instanceFlags: u32,
|
||
_pad0: u32,
|
||
_pad1: u32,
|
||
};
|
||
|
||
// ── Ray flag mirror of VkGeometryInstanceFlagBitsKHR + DXR ray flags ──
|
||
const RT_FLAG_OPAQUE: u32 = 0x1u;
|
||
const RT_FLAG_NO_OPAQUE: u32 = 0x2u;
|
||
const RT_FLAG_TERMINATE_ON_FIRST_HIT: u32 = 0x4u;
|
||
const RT_FLAG_SKIP_CLOSEST_HIT: u32 = 0x8u;
|
||
const RT_FLAG_CULL_BACK_FACING_TRIANGLES: u32 = 0x10u;
|
||
const RT_FLAG_CULL_FRONT_FACING_TRIANGLES: u32 = 0x20u;
|
||
const RT_FLAG_CULL_OPAQUE: u32 = 0x40u;
|
||
const RT_FLAG_CULL_NO_OPAQUE: u32 = 0x80u;
|
||
const RT_FLAG_SKIP_TRIANGLES: u32 = 0x100u;
|
||
const RT_FLAG_SKIP_AABBS: u32 = 0x200u;
|
||
|
||
const RT_INSTANCE_TRIANGLE_FACING_CULL_DISABLE: u32 = 0x1u;
|
||
const RT_INSTANCE_TRIANGLE_FLIP_FACING: u32 = 0x2u;
|
||
const RT_INSTANCE_FORCE_OPAQUE: u32 = 0x4u;
|
||
const RT_INSTANCE_FORCE_NO_OPAQUE: u32 = 0x8u;
|
||
|
||
const RT_ANYHIT_ACCEPT: u32 = 0u;
|
||
const RT_ANYHIT_IGNORE: u32 = 1u;
|
||
const RT_ANYHIT_END_SEARCH: u32 = 2u;
|
||
|
||
const RT_INTERSECTION_NONE: u32 = 0u;
|
||
const RT_INTERSECTION_TRIANGLE: u32 = 1u;
|
||
`;
|
||
|
||
// Megakernel-only bindings. Concatenated after rtWgslTypes for the
|
||
// raygen pipeline; the TLAS-build pipeline omits these because it doesn't
|
||
// touch them — declaring them would push it past 8 storage buffers per
|
||
// stage on the WebGPU baseline.
|
||
const rtWgslMegakernelBindings = String.raw`
|
||
@group(0) @binding(0) var<uniform> hdr : RTDispatchHeader;
|
||
@group(1) @binding(0) var<storage,read> tlasEntries : array<TLASEntry>;
|
||
@group(1) @binding(1) var<storage,read> bvhNodes : array<BVHNode>;
|
||
@group(1) @binding(2) var<storage,read> meshRecords : array<MeshRecord>;
|
||
@group(1) @binding(3) var<storage,read> vertices : array<f32>;
|
||
@group(1) @binding(4) var<storage,read> indices : array<u32>;
|
||
@group(1) @binding(5) var<storage,read> primRemap : array<u32>;
|
||
@group(1) @binding(6) var outImage : texture_storage_2d<rgba8unorm, write>;
|
||
@group(1) @binding(7) var<storage,read> vertexAttribs : array<u32>;
|
||
// TLAS Morton-sorted permutation: tlasEntryOrder[i] gives the
|
||
// tlasEntries[] index that BVH leaf i should sample.
|
||
@group(1) @binding(8) var<storage,read> tlasEntryOrder : array<u32>;
|
||
// Sweep-tree BVH built by the LBVH-build pass. 2 * N_PADDED - 1
|
||
// nodes = 2047 for N_PADDED = 1024. Internal nodes at [0, N_PADDED - 1);
|
||
// leaves at [N_PADDED - 1, 2 * N_PADDED - 1). For internal node i,
|
||
// children are 2i+1 and 2i+2 (implicit perfect binary tree). Each node
|
||
// stores just its world-space AABB.
|
||
struct BvhNode {
|
||
aabbMin: vec3<f32>,
|
||
_pad0: u32,
|
||
aabbMax: vec3<f32>,
|
||
_pad1: u32,
|
||
};
|
||
@group(1) @binding(9) var<storage,read> tlasBvhNodes : array<BvhNode>;
|
||
const TLAS_BVH_N_PADDED: u32 = 16384u;
|
||
const TLAS_BVH_LEAVES_START: u32 = TLAS_BVH_N_PADDED - 1u;
|
||
`;
|
||
|
||
const rtWgslPrelude = rtWgslTypes + rtWgslMegakernelBindings;
|
||
|
||
// ── WGSL library: helpers + traverseBlas + traverseTlas + traceRay ───
|
||
// Injected after the user-supplied closesthit/anyhit/miss sources +
|
||
// mega-switch dispatchers (which PipelineRTWebGPU emits). User raygen
|
||
// sources sit after this block so they can call traceRay.
|
||
// The "pure" subset of the RT helpers — no calls into runAnyHit /
|
||
// runClosestHit / traceRay, so this can be prepended ahead of compute
|
||
// pipelines using rayQuery without dragging in megakernel-only symbols.
|
||
const rtWgslPureHelpers = String.raw`
|
||
fn _rtFetchTri(meshRec: MeshRecord, triIndex: u32) -> array<vec3<f32>, 3> {
|
||
let baseIdx = meshRec.indexOffset + triIndex * 3u;
|
||
let i0 = indices[baseIdx + 0u];
|
||
let i1 = indices[baseIdx + 1u];
|
||
let i2 = indices[baseIdx + 2u];
|
||
let baseV = meshRec.vertexOffset;
|
||
let v0i = (baseV + i0) * 3u;
|
||
let v1i = (baseV + i1) * 3u;
|
||
let v2i = (baseV + i2) * 3u;
|
||
return array<vec3<f32>, 3>(
|
||
vec3<f32>(vertices[v0i + 0u], vertices[v0i + 1u], vertices[v0i + 2u]),
|
||
vec3<f32>(vertices[v1i + 0u], vertices[v1i + 1u], vertices[v1i + 2u]),
|
||
vec3<f32>(vertices[v2i + 0u], vertices[v2i + 1u], vertices[v2i + 2u]),
|
||
);
|
||
}
|
||
|
||
fn _rtAabb(ro: vec3<f32>, invRd: vec3<f32>, mn: vec3<f32>, mx: vec3<f32>, tMax: f32) -> bool {
|
||
// Reject degenerate (mn > mx) boxes outright. The min(t0,t1)/
|
||
// max(t0,t1) trick below silently re-orients an inverted box
|
||
// and would otherwise return true for sentinel-padded BVH leaves
|
||
// — letting rays "hit" empty slots and accidentally re-traverse
|
||
// instance 0 via the OOB outOrder → tlasEntries[0] path.
|
||
if (any(mn > mx)) { return false; }
|
||
let t0 = (mn - ro) * invRd;
|
||
let t1 = (mx - ro) * invRd;
|
||
let tmin = min(t0, t1);
|
||
let tmax = max(t0, t1);
|
||
let tEnter = max(max(tmin.x, tmin.y), tmin.z);
|
||
let tExit = min(min(tmax.x, tmax.y), tmax.z);
|
||
return tExit >= max(tEnter, 0.0) && tEnter <= tMax;
|
||
}
|
||
|
||
struct _RtTriHit { hit: bool, t: f32, u: f32, v: f32 };
|
||
fn _rtTri(ro: vec3<f32>, rd: vec3<f32>, p0: vec3<f32>, p1: vec3<f32>, p2: vec3<f32>,
|
||
tMin: f32, tMax: f32) -> _RtTriHit {
|
||
var r: _RtTriHit;
|
||
r.hit = false;
|
||
let e1 = p1 - p0;
|
||
let e2 = p2 - p0;
|
||
let h = cross(rd, e2);
|
||
let a = dot(e1, h);
|
||
if (abs(a) < 1e-8) { return r; }
|
||
let f = 1.0 / a;
|
||
let s = ro - p0;
|
||
let u = f * dot(s, h);
|
||
if (u < 0.0 || u > 1.0) { return r; }
|
||
let q = cross(s, e1);
|
||
let v = f * dot(rd, q);
|
||
if (v < 0.0 || u + v > 1.0) { return r; }
|
||
let t = f * dot(e2, q);
|
||
if (t < tMin || t > tMax) { return r; }
|
||
r.hit = true; r.t = t; r.u = u; r.v = v;
|
||
return r;
|
||
}
|
||
`;
|
||
|
||
// Megakernel-only helpers: traversal routines that invoke runAnyHit /
|
||
// runClosestHit / runMiss (emitted by the megakernel SBT switch) and
|
||
// `traceRay` that closes over them. Only the raygen-pipeline path
|
||
// prepends this.
|
||
const rtWgslMegakernelHelpers = String.raw`
|
||
// Iterative stack-based BLAS traversal. Returns true if traversal was
|
||
// terminated by an END_SEARCH from anyhit (caller should stop entirely).
|
||
fn _rtTraverseBlas(rayObj: RayDesc, flags: u32, meshRec: MeshRecord,
|
||
instanceId: u32, hitGroupBase: u32,
|
||
bestHit: ptr<function, HitInfo>,
|
||
bestT: ptr<function, f32>,
|
||
payload: ptr<function, Payload>) -> bool {
|
||
let invD = vec3<f32>(1.0) / rayObj.direction;
|
||
var stack: array<u32, 32>;
|
||
var sp: u32 = 0u;
|
||
var nodeRel: u32 = 0u;
|
||
|
||
loop {
|
||
let abs = meshRec.bvhOffset + nodeRel;
|
||
let node = bvhNodes[abs];
|
||
if (!_rtAabb(rayObj.origin, invD, node.aabbMin, node.aabbMax, *bestT)) {
|
||
if (sp == 0u) { break; }
|
||
sp = sp - 1u; nodeRel = stack[sp]; continue;
|
||
}
|
||
if (node.primCount > 0u) {
|
||
for (var i: u32 = 0u; i < node.primCount; i = i + 1u) {
|
||
let triIndex = primRemap[meshRec.primRemapOffset + node.firstChildOrPrim + i];
|
||
let verts = _rtFetchTri(meshRec, triIndex);
|
||
let tr = _rtTri(rayObj.origin, rayObj.direction,
|
||
verts[0], verts[1], verts[2],
|
||
rayObj.tMin, *bestT);
|
||
if (!tr.hit) { continue; }
|
||
|
||
let geomNormal = cross(verts[1] - verts[0], verts[2] - verts[0]);
|
||
let facing = dot(geomNormal, rayObj.direction);
|
||
if ((flags & RT_FLAG_CULL_BACK_FACING_TRIANGLES) != 0u && facing > 0.0) { continue; }
|
||
if ((flags & RT_FLAG_CULL_FRONT_FACING_TRIANGLES) != 0u && facing < 0.0) { continue; }
|
||
|
||
var candidate: HitInfo;
|
||
candidate.t = tr.t;
|
||
candidate.instanceId = instanceId;
|
||
candidate.primitiveId = triIndex;
|
||
candidate.hitGroupIndex = hitGroupBase;
|
||
candidate.attribs = vec2<f32>(tr.u, tr.v);
|
||
candidate.objectRayOrigin = rayObj.origin;
|
||
candidate.objectRayDirection = rayObj.direction;
|
||
|
||
let opaque = (flags & RT_FLAG_OPAQUE) != 0u
|
||
|| (flags & RT_FLAG_NO_OPAQUE) == 0u; // default opaque
|
||
|
||
if (opaque) {
|
||
*bestHit = candidate;
|
||
*bestT = tr.t;
|
||
if ((flags & RT_FLAG_TERMINATE_ON_FIRST_HIT) != 0u) { return true; }
|
||
} else {
|
||
let r = runAnyHit(hitGroupBase, rayObj, candidate, payload);
|
||
if (r == RT_ANYHIT_END_SEARCH) {
|
||
*bestHit = candidate;
|
||
*bestT = tr.t;
|
||
return true;
|
||
}
|
||
if (r == RT_ANYHIT_ACCEPT) {
|
||
*bestHit = candidate;
|
||
*bestT = tr.t;
|
||
if ((flags & RT_FLAG_TERMINATE_ON_FIRST_HIT) != 0u) { return true; }
|
||
}
|
||
}
|
||
}
|
||
if (sp == 0u) { break; }
|
||
sp = sp - 1u; nodeRel = stack[sp]; continue;
|
||
}
|
||
// inner node — push right, descend left
|
||
let left = node.firstChildOrPrim;
|
||
let right = left + 1u;
|
||
if (sp < 32u) { stack[sp] = right; sp = sp + 1u; }
|
||
nodeRel = left;
|
||
}
|
||
return false;
|
||
}
|
||
|
||
fn _rtTraverseTlas(rayWorld: RayDesc, flags: u32, cullMask: u32,
|
||
sbtRecordOffset: u32, sbtRecordStride: u32,
|
||
bestHit: ptr<function, HitInfo>,
|
||
bestT: ptr<function, f32>,
|
||
payload: ptr<function, Payload>) -> bool {
|
||
let invD = vec3<f32>(1.0) / rayWorld.direction;
|
||
// Stack-based descent of the sweep-tree BVH. Internal nodes
|
||
// [0, TLAS_BVH_LEAVES_START); leaves [LEAVES_START, 2*N_PADDED-1).
|
||
// Node i's children are 2i+1 / 2i+2 (implicit perfect binary tree).
|
||
// Stack depth = tree depth = log2(N_PADDED) = 14 for N_PADDED=16384;
|
||
// 24 gives generous headroom.
|
||
var stack: array<u32, 24>;
|
||
var sp: u32 = 0u;
|
||
stack[sp] = 0u; sp = sp + 1u;
|
||
loop {
|
||
if (sp == 0u) { break; }
|
||
sp = sp - 1u;
|
||
let nodeIdx = stack[sp];
|
||
let node = tlasBvhNodes[nodeIdx];
|
||
if (!_rtAabb(rayWorld.origin, invD, node.aabbMin, node.aabbMax, *bestT)) {
|
||
continue;
|
||
}
|
||
if (nodeIdx >= TLAS_BVH_LEAVES_START) {
|
||
// Leaf: resolve entry, do the existing per-instance test.
|
||
let leafIdx = nodeIdx - TLAS_BVH_LEAVES_START;
|
||
let i = tlasEntryOrder[leafIdx];
|
||
// Sentinel-padded leaves get instanceMask=0; cullMask check
|
||
// (and degenerate AABB above) means they fall out cheaply.
|
||
if (i == 0xFFFFFFFFu) { continue; }
|
||
let inst = tlasEntries[i];
|
||
let instanceMask = inst.maskHGOffset & 0xFFu;
|
||
if ((instanceMask & cullMask) == 0u) { continue; }
|
||
if (!_rtAabb(rayWorld.origin, invD, inst.aabbMin, inst.aabbMax, *bestT)) { continue; }
|
||
|
||
// Transform ray to object space.
|
||
let r0 = inst.worldToObjectR0;
|
||
let r1 = inst.worldToObjectR1;
|
||
let r2 = inst.worldToObjectR2;
|
||
var rayObj: RayDesc;
|
||
rayObj.origin = vec3<f32>(
|
||
dot(r0.xyz, rayWorld.origin) + r0.w,
|
||
dot(r1.xyz, rayWorld.origin) + r1.w,
|
||
dot(r2.xyz, rayWorld.origin) + r2.w,
|
||
);
|
||
rayObj.direction = vec3<f32>(
|
||
dot(r0.xyz, rayWorld.direction),
|
||
dot(r1.xyz, rayWorld.direction),
|
||
dot(r2.xyz, rayWorld.direction),
|
||
);
|
||
rayObj.tMin = rayWorld.tMin;
|
||
rayObj.tMax = *bestT;
|
||
|
||
var effective = flags;
|
||
let iflags = inst.instanceFlags;
|
||
if ((iflags & RT_INSTANCE_FORCE_OPAQUE) != 0u) {
|
||
effective = (effective | RT_FLAG_OPAQUE) & ~RT_FLAG_NO_OPAQUE;
|
||
}
|
||
if ((iflags & RT_INSTANCE_FORCE_NO_OPAQUE) != 0u) {
|
||
effective = (effective | RT_FLAG_NO_OPAQUE) & ~RT_FLAG_OPAQUE;
|
||
}
|
||
if ((iflags & RT_INSTANCE_TRIANGLE_FACING_CULL_DISABLE) != 0u) {
|
||
effective = effective & ~(RT_FLAG_CULL_BACK_FACING_TRIANGLES | RT_FLAG_CULL_FRONT_FACING_TRIANGLES);
|
||
}
|
||
|
||
let hitGroupOffset = inst.maskHGOffset >> 8u;
|
||
let hitGroupBase = sbtRecordOffset + hitGroupOffset;
|
||
let meshRec = meshRecords[inst.blasMeshIdx];
|
||
|
||
let pre = *bestT;
|
||
let endSearch = _rtTraverseBlas(rayObj, effective, meshRec, i, hitGroupBase,
|
||
bestHit, bestT, payload);
|
||
if (endSearch) { return true; }
|
||
if ((*bestT) < pre) {
|
||
// record world-space object-to-world for the closest-hit shader
|
||
(*bestHit).objectToWorldR0 = inst.objectToWorldR0;
|
||
(*bestHit).objectToWorldR1 = inst.objectToWorldR1;
|
||
(*bestHit).objectToWorldR2 = inst.objectToWorldR2;
|
||
(*bestHit).customIndex = inst.customIndex;
|
||
if ((effective & RT_FLAG_TERMINATE_ON_FIRST_HIT) != 0u) { return true; }
|
||
}
|
||
} else {
|
||
// Internal node: push both children (skip overflow).
|
||
let left = 2u * nodeIdx + 1u;
|
||
let right = 2u * nodeIdx + 2u;
|
||
if (sp + 1u < 24u) {
|
||
stack[sp] = right; sp = sp + 1u;
|
||
stack[sp] = left; sp = sp + 1u;
|
||
}
|
||
}
|
||
}
|
||
return false;
|
||
}
|
||
|
||
fn traceRay(tlasIdx: u32, flags: u32, cullMask: u32,
|
||
sbtRecordOffset: u32, sbtRecordStride: u32, missIndex: u32,
|
||
rayOrigin: vec3<f32>, rayTMin: f32,
|
||
rayDir: vec3<f32>, rayTMax: f32,
|
||
payload: ptr<function, Payload>) {
|
||
var ray: RayDesc;
|
||
ray.origin = rayOrigin;
|
||
ray.direction = rayDir;
|
||
ray.tMin = rayTMin;
|
||
ray.tMax = rayTMax;
|
||
var bestHit: HitInfo;
|
||
bestHit.t = rayTMax;
|
||
var bestT = rayTMax;
|
||
let ended = _rtTraverseTlas(ray, flags, cullMask & 0xFFu,
|
||
sbtRecordOffset, sbtRecordStride,
|
||
&bestHit, &bestT, payload);
|
||
if (bestT < rayTMax) {
|
||
if ((flags & RT_FLAG_SKIP_CLOSEST_HIT) == 0u) {
|
||
runClosestHit(bestHit.hitGroupIndex, ray, bestHit, payload);
|
||
}
|
||
} else {
|
||
runMiss(missIndex, ray, payload);
|
||
}
|
||
}
|
||
`;
|
||
|
||
// ════════════════════════════════════════════════════════════════════════
|
||
// WAVEFRONT RT — streaming tracer (GENERATE → PREP → TRACE → SHADE →
|
||
// RESOLVE). Replaces the megakernel. The C++ side (PipelineRTWebGPU) emits
|
||
// the user sources, the per-stage SBT switches, the Payload-typed wfPayload
|
||
// binding, and the five @compute entry points; this JS injects the bindings
|
||
// + library helpers the entry points call.
|
||
// ════════════════════════════════════════════════════════════════════════
|
||
|
||
// Bindings prelude for the wavefront pipeline. group(0) is the per-pass
|
||
// WfParams uniform (dynamic-offset ring — one slot per pass so curIsA /
|
||
// bounce can vary between passes inside one submit). group(1) carries the
|
||
// geometry heaps (0..9, identical to the old megakernel layout so the
|
||
// register/build paths are unchanged) plus the wavefront work buffers
|
||
// (10..14); wfPayload at binding 15 is emitted in the codegen region after
|
||
// the user's `struct Payload`. group(2) is the indirect-args buffer, bound
|
||
// only by PREP (a buffer used as INDIRECT in a dispatch may not also be
|
||
// bound read-write in that same dispatch — so TRACE/SHADE must not bind it).
|
||
const rtWgslWavefrontBindings = String.raw`
|
||
struct WfParams {
|
||
surfaceW: u32,
|
||
surfaceH: u32,
|
||
rayCapacity: u32,
|
||
curIsA: u32, // 1 → current ray buffer is A (emit-next = B); 0 → B
|
||
bounce: u32,
|
||
maxDepth: u32,
|
||
tlasNPadded: u32, // TLAS sweep-tree padded leaf count (descent depth)
|
||
flags: u32,
|
||
};
|
||
|
||
// One in-flight ray. 64 bytes; origin/direction vec3-aligned to 16.
|
||
struct WfRay {
|
||
origin: vec3<f32>,
|
||
tMin: f32,
|
||
direction: vec3<f32>,
|
||
tMax: f32,
|
||
pixel: u32, // linear framebuffer pixel this ray contributes to
|
||
flags: u32,
|
||
cullMask: u32,
|
||
missIndex: u32,
|
||
sbtRecordOffset: u32,
|
||
payloadSlot: u32, // index into wfPayload
|
||
kind: u32, // 0 primary, 1 continuation (informational)
|
||
_pad: u32,
|
||
};
|
||
|
||
// TRACE → SHADE handoff. Mirrors HitInfo + a hitKind (0 miss, 1 triangle).
|
||
struct HitResult {
|
||
t: f32,
|
||
instanceId: u32,
|
||
primitiveId: u32,
|
||
hitGroupIndex: u32,
|
||
attribs: vec2<f32>,
|
||
hitKind: u32,
|
||
customIndex: u32,
|
||
objectRayOrigin: vec3<f32>,
|
||
_p0: f32,
|
||
objectRayDirection: vec3<f32>,
|
||
_p1: f32,
|
||
objectToWorldR0: vec4<f32>,
|
||
objectToWorldR1: vec4<f32>,
|
||
objectToWorldR2: vec4<f32>,
|
||
};
|
||
|
||
struct BvhNode {
|
||
aabbMin: vec3<f32>,
|
||
_pad0: u32,
|
||
aabbMax: vec3<f32>,
|
||
_pad1: u32,
|
||
};
|
||
|
||
@group(0) @binding(0) var<uniform> wfParams : WfParams;
|
||
|
||
@group(1) @binding(0) var<storage,read> tlasEntries : array<TLASEntry>;
|
||
@group(1) @binding(1) var<storage,read> bvhNodes : array<BVHNode>;
|
||
@group(1) @binding(2) var<storage,read> meshRecords : array<MeshRecord>;
|
||
@group(1) @binding(3) var<storage,read> vertices : array<f32>;
|
||
@group(1) @binding(4) var<storage,read> indices : array<u32>;
|
||
@group(1) @binding(5) var<storage,read> primRemap : array<u32>;
|
||
@group(1) @binding(6) var outImage : texture_storage_2d<rgba8unorm, write>;
|
||
@group(1) @binding(7) var<storage,read> vertexAttribs : array<u32>;
|
||
@group(1) @binding(8) var<storage,read> tlasEntryOrder : array<u32>;
|
||
@group(1) @binding(9) var<storage,read> tlasBvhNodes : array<BvhNode>;
|
||
@group(1) @binding(10) var<storage,read_write> wfRaysA : array<WfRay>;
|
||
@group(1) @binding(11) var<storage,read_write> wfRaysB : array<WfRay>;
|
||
@group(1) @binding(12) var<storage,read_write> wfHits : array<HitResult>;
|
||
@group(1) @binding(13) var<storage,read_write> wfAccum : array<vec4<f32>>;
|
||
@group(1) @binding(14) var<storage,read_write> wfCounters : array<atomic<u32>>;
|
||
// @group(1) @binding(15) wfPayload : array<Payload> — emitted by codegen.
|
||
|
||
@group(2) @binding(0) var<storage,read_write> wfIndirect : array<u32>;
|
||
`;
|
||
|
||
// Library helpers the codegen entry points call. Sits after the pure
|
||
// helpers (_rtAabb/_rtTri/_rtFetchTri) and after the user's Payload +
|
||
// wfPayload binding, so rtEmit*/_wfShade can name Payload/wfPayload.
|
||
const rtWgslWavefrontHelpers = String.raw`
|
||
var<private> _wfPixel: u32 = 0u;
|
||
|
||
// Live ray count for the current buffer, clamped to capacity (the emit
|
||
// counter can overshoot capacity; dropped rays were never written).
|
||
fn _wfCurCount() -> u32 {
|
||
let raw = select(atomicLoad(&wfCounters[1]), atomicLoad(&wfCounters[0]),
|
||
wfParams.curIsA == 1u);
|
||
return min(raw, wfParams.rayCapacity);
|
||
}
|
||
|
||
// Add linear radiance to the pixel this SHADE/GENERATE thread owns. Safe
|
||
// without atomics: at most one ray per pixel per bounce, and bounces run
|
||
// in separate passes (implicit barrier between them).
|
||
fn rtAccumulate(rgb: vec3<f32>) {
|
||
wfAccum[_wfPixel] = wfAccum[_wfPixel] + vec4<f32>(rgb, 0.0);
|
||
}
|
||
|
||
// raygen → emit the pixel's primary ray. Bounce 0's current buffer is
|
||
// always A, so primaries land in A with their payload in the A region
|
||
// [0, rayCapacity).
|
||
fn rtEmitPrimaryRay(origin: vec3<f32>, tMin: f32, dir: vec3<f32>, tMax: f32,
|
||
flags: u32, cullMask: u32, sbtRecordOffset: u32,
|
||
missIndex: u32, payload: Payload) {
|
||
let slot = atomicAdd(&wfCounters[0], 1u);
|
||
if (slot >= wfParams.rayCapacity) { return; }
|
||
var r: WfRay;
|
||
r.origin = origin; r.tMin = tMin; r.direction = dir; r.tMax = tMax;
|
||
r.pixel = _wfPixel; r.flags = flags; r.cullMask = cullMask;
|
||
r.missIndex = missIndex; r.sbtRecordOffset = sbtRecordOffset;
|
||
r.payloadSlot = slot; r.kind = 0u;
|
||
wfRaysA[slot] = r;
|
||
wfPayload[slot] = payload;
|
||
}
|
||
|
||
// closesthit/miss → spawn a continuation/shadow ray into the NEXT buffer
|
||
// (the one the upcoming TRACE will read). Payload travels with it; the
|
||
// next buffer's payload region is [rayCapacity, 2*rayCapacity) for B.
|
||
fn rtEmitRay(origin: vec3<f32>, tMin: f32, dir: vec3<f32>, tMax: f32,
|
||
flags: u32, cullMask: u32, sbtRecordOffset: u32,
|
||
missIndex: u32, payload: Payload) {
|
||
let nextIsA = wfParams.curIsA == 0u;
|
||
let counterIdx = select(1u, 0u, nextIsA);
|
||
let slot = atomicAdd(&wfCounters[counterIdx], 1u);
|
||
if (slot >= wfParams.rayCapacity) { return; }
|
||
let payloadBase = select(wfParams.rayCapacity, 0u, nextIsA);
|
||
var r: WfRay;
|
||
r.origin = origin; r.tMin = tMin; r.direction = dir; r.tMax = tMax;
|
||
r.pixel = _wfPixel; r.flags = flags; r.cullMask = cullMask;
|
||
r.missIndex = missIndex; r.sbtRecordOffset = sbtRecordOffset;
|
||
r.payloadSlot = payloadBase + slot; r.kind = 1u;
|
||
if (nextIsA) { wfRaysA[slot] = r; } else { wfRaysB[slot] = r; }
|
||
wfPayload[r.payloadSlot] = payload;
|
||
}
|
||
|
||
// Opaque-only BLAS descent (no anyhit — TRACE runs zero user code).
|
||
fn _rtwTraverseBlas(rayObj: RayDesc, flags: u32, meshRec: MeshRecord,
|
||
instanceId: u32, hitGroupBase: u32,
|
||
bestHit: ptr<function, HitInfo>,
|
||
bestT: ptr<function, f32>) -> bool {
|
||
let invD = vec3<f32>(1.0) / rayObj.direction;
|
||
var stack: array<u32, 32>;
|
||
var sp: u32 = 0u;
|
||
var nodeRel: u32 = 0u;
|
||
loop {
|
||
let absI = meshRec.bvhOffset + nodeRel;
|
||
let node = bvhNodes[absI];
|
||
if (!_rtAabb(rayObj.origin, invD, node.aabbMin, node.aabbMax, *bestT)) {
|
||
if (sp == 0u) { break; }
|
||
sp = sp - 1u; nodeRel = stack[sp]; continue;
|
||
}
|
||
if (node.primCount > 0u) {
|
||
for (var i: u32 = 0u; i < node.primCount; i = i + 1u) {
|
||
let triIndex = primRemap[meshRec.primRemapOffset + node.firstChildOrPrim + i];
|
||
let verts = _rtFetchTri(meshRec, triIndex);
|
||
let tr = _rtTri(rayObj.origin, rayObj.direction,
|
||
verts[0], verts[1], verts[2], rayObj.tMin, *bestT);
|
||
if (!tr.hit) { continue; }
|
||
let geomNormal = cross(verts[1] - verts[0], verts[2] - verts[0]);
|
||
let facing = dot(geomNormal, rayObj.direction);
|
||
if ((flags & RT_FLAG_CULL_BACK_FACING_TRIANGLES) != 0u && facing > 0.0) { continue; }
|
||
if ((flags & RT_FLAG_CULL_FRONT_FACING_TRIANGLES) != 0u && facing < 0.0) { continue; }
|
||
var candidate: HitInfo;
|
||
candidate.t = tr.t;
|
||
candidate.instanceId = instanceId;
|
||
candidate.primitiveId = triIndex;
|
||
candidate.hitGroupIndex = hitGroupBase;
|
||
candidate.attribs = vec2<f32>(tr.u, tr.v);
|
||
candidate.objectRayOrigin = rayObj.origin;
|
||
candidate.objectRayDirection = rayObj.direction;
|
||
*bestHit = candidate;
|
||
*bestT = tr.t;
|
||
if ((flags & RT_FLAG_TERMINATE_ON_FIRST_HIT) != 0u) { return true; }
|
||
}
|
||
if (sp == 0u) { break; }
|
||
sp = sp - 1u; nodeRel = stack[sp]; continue;
|
||
}
|
||
let left = node.firstChildOrPrim;
|
||
let right = left + 1u;
|
||
if (sp < 32u) { stack[sp] = right; sp = sp + 1u; }
|
||
nodeRel = left;
|
||
}
|
||
return false;
|
||
}
|
||
|
||
fn _rtwTraverseTlas(rayWorld: RayDesc, flags: u32, cullMask: u32,
|
||
sbtRecordOffset: u32,
|
||
bestHit: ptr<function, HitInfo>,
|
||
bestT: ptr<function, f32>) -> bool {
|
||
let invD = vec3<f32>(1.0) / rayWorld.direction;
|
||
let leavesStart = wfParams.tlasNPadded - 1u;
|
||
var stack: array<u32, 32>;
|
||
var sp: u32 = 0u;
|
||
stack[sp] = 0u; sp = sp + 1u;
|
||
loop {
|
||
if (sp == 0u) { break; }
|
||
sp = sp - 1u;
|
||
let nodeIdx = stack[sp];
|
||
let node = tlasBvhNodes[nodeIdx];
|
||
if (!_rtAabb(rayWorld.origin, invD, node.aabbMin, node.aabbMax, *bestT)) { continue; }
|
||
if (nodeIdx >= leavesStart) {
|
||
let leafIdx = nodeIdx - leavesStart;
|
||
let i = tlasEntryOrder[leafIdx];
|
||
if (i == 0xFFFFFFFFu) { continue; }
|
||
let inst = tlasEntries[i];
|
||
let instanceMask = inst.maskHGOffset & 0xFFu;
|
||
if ((instanceMask & cullMask) == 0u) { continue; }
|
||
if (!_rtAabb(rayWorld.origin, invD, inst.aabbMin, inst.aabbMax, *bestT)) { continue; }
|
||
let r0 = inst.worldToObjectR0;
|
||
let r1 = inst.worldToObjectR1;
|
||
let r2 = inst.worldToObjectR2;
|
||
var rayObj: RayDesc;
|
||
rayObj.origin = vec3<f32>(
|
||
dot(r0.xyz, rayWorld.origin) + r0.w,
|
||
dot(r1.xyz, rayWorld.origin) + r1.w,
|
||
dot(r2.xyz, rayWorld.origin) + r2.w);
|
||
rayObj.direction = vec3<f32>(
|
||
dot(r0.xyz, rayWorld.direction),
|
||
dot(r1.xyz, rayWorld.direction),
|
||
dot(r2.xyz, rayWorld.direction));
|
||
rayObj.tMin = rayWorld.tMin;
|
||
rayObj.tMax = *bestT;
|
||
var effective = flags;
|
||
let iflags = inst.instanceFlags;
|
||
if ((iflags & RT_INSTANCE_TRIANGLE_FACING_CULL_DISABLE) != 0u) {
|
||
effective = effective & ~(RT_FLAG_CULL_BACK_FACING_TRIANGLES | RT_FLAG_CULL_FRONT_FACING_TRIANGLES);
|
||
}
|
||
let hitGroupOffset = inst.maskHGOffset >> 8u;
|
||
let hitGroupBase = sbtRecordOffset + hitGroupOffset;
|
||
let meshRec = meshRecords[inst.blasMeshIdx];
|
||
let pre = *bestT;
|
||
let endSearch = _rtwTraverseBlas(rayObj, effective, meshRec, i, hitGroupBase, bestHit, bestT);
|
||
if ((*bestT) < pre || endSearch) {
|
||
(*bestHit).objectToWorldR0 = inst.objectToWorldR0;
|
||
(*bestHit).objectToWorldR1 = inst.objectToWorldR1;
|
||
(*bestHit).objectToWorldR2 = inst.objectToWorldR2;
|
||
(*bestHit).customIndex = inst.customIndex;
|
||
}
|
||
if (endSearch) { return true; }
|
||
if ((*bestT) < pre && (effective & RT_FLAG_TERMINATE_ON_FIRST_HIT) != 0u) { return true; }
|
||
} else {
|
||
let left = 2u * nodeIdx + 1u;
|
||
let right = 2u * nodeIdx + 2u;
|
||
if (sp + 1u < 32u) {
|
||
stack[sp] = right; sp = sp + 1u;
|
||
stack[sp] = left; sp = sp + 1u;
|
||
}
|
||
}
|
||
}
|
||
return false;
|
||
}
|
||
|
||
fn _wfReadRay(i: u32) -> WfRay {
|
||
if (wfParams.curIsA == 1u) { return wfRaysA[i]; }
|
||
return wfRaysB[i];
|
||
}
|
||
|
||
// PREP — publish indirect args for the upcoming TRACE/SHADE; zero the next
|
||
// buffer's emit counter.
|
||
fn _wfPrep() {
|
||
let n = _wfCurCount();
|
||
wfIndirect[0] = (n + 63u) / 64u;
|
||
wfIndirect[1] = 1u;
|
||
wfIndirect[2] = 1u;
|
||
if (wfParams.curIsA == 1u) { atomicStore(&wfCounters[1], 0u); }
|
||
else { atomicStore(&wfCounters[0], 0u); }
|
||
}
|
||
|
||
// TRACE — pure traversal, zero user code.
|
||
fn _wfTrace(i: u32) {
|
||
if (i >= _wfCurCount()) { return; }
|
||
let ray = _wfReadRay(i);
|
||
var rd: RayDesc;
|
||
rd.origin = ray.origin; rd.tMin = ray.tMin;
|
||
rd.direction = ray.direction; rd.tMax = ray.tMax;
|
||
var bestHit: HitInfo;
|
||
bestHit.t = ray.tMax;
|
||
var bestT = ray.tMax;
|
||
_rtwTraverseTlas(rd, ray.flags, ray.cullMask & 0xFFu, ray.sbtRecordOffset, &bestHit, &bestT);
|
||
var hr: HitResult;
|
||
if (bestT < ray.tMax) {
|
||
hr.t = bestHit.t;
|
||
hr.instanceId = bestHit.instanceId;
|
||
hr.primitiveId = bestHit.primitiveId;
|
||
hr.hitGroupIndex = bestHit.hitGroupIndex;
|
||
hr.attribs = bestHit.attribs;
|
||
hr.hitKind = 1u;
|
||
hr.customIndex = bestHit.customIndex;
|
||
hr.objectRayOrigin = bestHit.objectRayOrigin;
|
||
hr.objectRayDirection = bestHit.objectRayDirection;
|
||
hr.objectToWorldR0 = bestHit.objectToWorldR0;
|
||
hr.objectToWorldR1 = bestHit.objectToWorldR1;
|
||
hr.objectToWorldR2 = bestHit.objectToWorldR2;
|
||
} else {
|
||
hr.hitKind = 0u;
|
||
}
|
||
wfHits[i] = hr;
|
||
}
|
||
|
||
// SHADE — dispatch to runMiss / runClosestHit with the ray's payload.
|
||
fn _wfShade(i: u32) {
|
||
if (i >= _wfCurCount()) { return; }
|
||
let ray = _wfReadRay(i);
|
||
let hr = wfHits[i];
|
||
_wfPixel = ray.pixel;
|
||
var payload: Payload = wfPayload[ray.payloadSlot];
|
||
var rd: RayDesc;
|
||
rd.origin = ray.origin; rd.tMin = ray.tMin;
|
||
rd.direction = ray.direction; rd.tMax = ray.tMax;
|
||
if (hr.hitKind == 0u) {
|
||
runMiss(ray.missIndex, rd, &payload);
|
||
} else if ((ray.flags & RT_FLAG_SKIP_CLOSEST_HIT) == 0u) {
|
||
var hit: HitInfo;
|
||
hit.t = hr.t;
|
||
hit.instanceId = hr.instanceId;
|
||
hit.primitiveId = hr.primitiveId;
|
||
hit.hitGroupIndex = hr.hitGroupIndex;
|
||
hit.attribs = hr.attribs;
|
||
hit.customIndex = hr.customIndex;
|
||
hit.objectRayOrigin = hr.objectRayOrigin;
|
||
hit.objectRayDirection = hr.objectRayDirection;
|
||
hit.objectToWorldR0 = hr.objectToWorldR0;
|
||
hit.objectToWorldR1 = hr.objectToWorldR1;
|
||
hit.objectToWorldR2 = hr.objectToWorldR2;
|
||
runClosestHit(hr.hitGroupIndex, rd, hit, &payload);
|
||
}
|
||
}
|
||
`;
|
||
|
||
// ── WGSL library: rayQuery API for non-megakernel compute shaders ────
|
||
//
|
||
// Mirrors GL_EXT_ray_query semantics that 3DForts's physics shaders use
|
||
// (projectile-collide, splash, builder-pick). User WGSL:
|
||
// var rq: RayQuery;
|
||
// rayQueryInitialize(&rq, 0u, flags, mask, origin, tMin, dir, tMax);
|
||
// while (rayQueryProceed(&rq)) {} // run traversal to completion
|
||
// if (rayQueryGetCommittedIntersectionType(&rq) != RT_INTERSECTION_NONE) {
|
||
// let t = rayQueryGetCommittedT(&rq);
|
||
// ...
|
||
// }
|
||
//
|
||
// v1 simplification: traversal force-opaques every hit (no anyhit). The
|
||
// user can still test for triangle vs miss and read t/instance/bary.
|
||
// Anyhit-style candidate inspection is a future extension.
|
||
const rtWgslRayQueryLib = String.raw`
|
||
struct RayQuery {
|
||
ray: RayDesc,
|
||
flags: u32,
|
||
cullMask: u32,
|
||
committedType: u32,
|
||
committedT: f32,
|
||
committedInstanceId: u32,
|
||
committedInstanceCustomIndex: u32,
|
||
committedPrimitiveIndex: u32,
|
||
committedBarycentrics: vec2<f32>,
|
||
committedObjectRayOrigin: vec3<f32>,
|
||
committedObjectRayDirection: vec3<f32>,
|
||
committedObjectToWorldR0: vec4<f32>,
|
||
committedObjectToWorldR1: vec4<f32>,
|
||
committedObjectToWorldR2: vec4<f32>,
|
||
committedWorldToObjectR0: vec4<f32>,
|
||
committedWorldToObjectR1: vec4<f32>,
|
||
committedWorldToObjectR2: vec4<f32>,
|
||
done: u32,
|
||
};
|
||
|
||
fn _rqTraverseBlas(rayObj: RayDesc, flags: u32, meshRec: MeshRecord,
|
||
instanceId: u32, customIndex: u32,
|
||
inst: TLASEntry,
|
||
rq: ptr<function, RayQuery>) {
|
||
let invD = vec3<f32>(1.0) / rayObj.direction;
|
||
var stack: array<u32, 32>;
|
||
var sp: u32 = 0u;
|
||
var nodeRel: u32 = 0u;
|
||
|
||
loop {
|
||
let abs = meshRec.bvhOffset + nodeRel;
|
||
let node = bvhNodes[abs];
|
||
if (!_rtAabb(rayObj.origin, invD, node.aabbMin, node.aabbMax, (*rq).committedT)) {
|
||
if (sp == 0u) { break; }
|
||
sp = sp - 1u; nodeRel = stack[sp]; continue;
|
||
}
|
||
if (node.primCount > 0u) {
|
||
for (var i: u32 = 0u; i < node.primCount; i = i + 1u) {
|
||
let triIndex = primRemap[meshRec.primRemapOffset + node.firstChildOrPrim + i];
|
||
let verts = _rtFetchTri(meshRec, triIndex);
|
||
let tr = _rtTri(rayObj.origin, rayObj.direction,
|
||
verts[0], verts[1], verts[2],
|
||
rayObj.tMin, (*rq).committedT);
|
||
if (!tr.hit) { continue; }
|
||
|
||
let geomNormal = cross(verts[1] - verts[0], verts[2] - verts[0]);
|
||
let facing = dot(geomNormal, rayObj.direction);
|
||
if ((flags & RT_FLAG_CULL_BACK_FACING_TRIANGLES) != 0u && facing > 0.0) { continue; }
|
||
if ((flags & RT_FLAG_CULL_FRONT_FACING_TRIANGLES) != 0u && facing < 0.0) { continue; }
|
||
|
||
(*rq).committedType = RT_INTERSECTION_TRIANGLE;
|
||
(*rq).committedT = tr.t;
|
||
(*rq).committedInstanceId = instanceId;
|
||
(*rq).committedInstanceCustomIndex = customIndex;
|
||
(*rq).committedPrimitiveIndex = triIndex;
|
||
(*rq).committedBarycentrics = vec2<f32>(tr.u, tr.v);
|
||
(*rq).committedObjectRayOrigin = rayObj.origin;
|
||
(*rq).committedObjectRayDirection = rayObj.direction;
|
||
(*rq).committedObjectToWorldR0 = inst.objectToWorldR0;
|
||
(*rq).committedObjectToWorldR1 = inst.objectToWorldR1;
|
||
(*rq).committedObjectToWorldR2 = inst.objectToWorldR2;
|
||
(*rq).committedWorldToObjectR0 = inst.worldToObjectR0;
|
||
(*rq).committedWorldToObjectR1 = inst.worldToObjectR1;
|
||
(*rq).committedWorldToObjectR2 = inst.worldToObjectR2;
|
||
}
|
||
if (sp == 0u) { break; }
|
||
sp = sp - 1u; nodeRel = stack[sp]; continue;
|
||
}
|
||
let left = node.firstChildOrPrim;
|
||
let right = left + 1u;
|
||
if (sp < 32u) { stack[sp] = right; sp = sp + 1u; }
|
||
nodeRel = left;
|
||
}
|
||
}
|
||
|
||
fn _rqTraverseTlas(rq: ptr<function, RayQuery>) {
|
||
let rayWorld = (*rq).ray;
|
||
let invD = vec3<f32>(1.0) / rayWorld.direction;
|
||
let cullMask = (*rq).cullMask;
|
||
let rayFlags = (*rq).flags;
|
||
// Stack-based BVH descent — same shape as _rtTraverseTlas.
|
||
var stack: array<u32, 24>;
|
||
var sp: u32 = 0u;
|
||
stack[sp] = 0u; sp = sp + 1u;
|
||
loop {
|
||
if (sp == 0u) { break; }
|
||
sp = sp - 1u;
|
||
let nodeIdx = stack[sp];
|
||
let node = tlasBvhNodes[nodeIdx];
|
||
if (!_rtAabb(rayWorld.origin, invD, node.aabbMin, node.aabbMax, (*rq).committedT)) {
|
||
continue;
|
||
}
|
||
if (nodeIdx >= TLAS_BVH_LEAVES_START) {
|
||
let leafIdx = nodeIdx - TLAS_BVH_LEAVES_START;
|
||
let i = tlasEntryOrder[leafIdx];
|
||
if (i == 0xFFFFFFFFu) { continue; }
|
||
let inst = tlasEntries[i];
|
||
let instanceMask = inst.maskHGOffset & 0xFFu;
|
||
if ((instanceMask & cullMask) == 0u) { continue; }
|
||
if (!_rtAabb(rayWorld.origin, invD, inst.aabbMin, inst.aabbMax, (*rq).committedT)) { continue; }
|
||
|
||
let r0 = inst.worldToObjectR0;
|
||
let r1 = inst.worldToObjectR1;
|
||
let r2 = inst.worldToObjectR2;
|
||
var rayObj: RayDesc;
|
||
rayObj.origin = vec3<f32>(
|
||
dot(r0.xyz, rayWorld.origin) + r0.w,
|
||
dot(r1.xyz, rayWorld.origin) + r1.w,
|
||
dot(r2.xyz, rayWorld.origin) + r2.w,
|
||
);
|
||
rayObj.direction = vec3<f32>(
|
||
dot(r0.xyz, rayWorld.direction),
|
||
dot(r1.xyz, rayWorld.direction),
|
||
dot(r2.xyz, rayWorld.direction),
|
||
);
|
||
rayObj.tMin = rayWorld.tMin;
|
||
rayObj.tMax = (*rq).committedT;
|
||
|
||
var effective = rayFlags;
|
||
let iflags = inst.instanceFlags;
|
||
if ((iflags & RT_INSTANCE_TRIANGLE_FACING_CULL_DISABLE) != 0u) {
|
||
effective = effective & ~(RT_FLAG_CULL_BACK_FACING_TRIANGLES | RT_FLAG_CULL_FRONT_FACING_TRIANGLES);
|
||
}
|
||
|
||
let meshRec = meshRecords[inst.blasMeshIdx];
|
||
_rqTraverseBlas(rayObj, effective, meshRec, i, inst.customIndex, inst, rq);
|
||
|
||
if ((rayFlags & RT_FLAG_TERMINATE_ON_FIRST_HIT) != 0u
|
||
&& (*rq).committedType != RT_INTERSECTION_NONE) {
|
||
return;
|
||
}
|
||
} else {
|
||
let left = 2u * nodeIdx + 1u;
|
||
let right = 2u * nodeIdx + 2u;
|
||
if (sp + 1u < 24u) {
|
||
stack[sp] = right; sp = sp + 1u;
|
||
stack[sp] = left; sp = sp + 1u;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
fn rayQueryInitialize(rq: ptr<function, RayQuery>, tlasIdx: u32, flags: u32, cullMask: u32,
|
||
origin: vec3<f32>, tMin: f32, direction: vec3<f32>, tMax: f32) {
|
||
(*rq).ray.origin = origin;
|
||
(*rq).ray.tMin = tMin;
|
||
(*rq).ray.direction = direction;
|
||
(*rq).ray.tMax = tMax;
|
||
(*rq).flags = flags;
|
||
(*rq).cullMask = cullMask & 0xFFu;
|
||
(*rq).committedType = RT_INTERSECTION_NONE;
|
||
(*rq).committedT = tMax;
|
||
(*rq).done = 0u;
|
||
}
|
||
|
||
fn rayQueryProceed(rq: ptr<function, RayQuery>) -> bool {
|
||
if ((*rq).done != 0u) { return false; }
|
||
_rqTraverseTlas(rq);
|
||
(*rq).done = 1u;
|
||
return false;
|
||
}
|
||
|
||
fn rayQueryTerminate(rq: ptr<function, RayQuery>) {
|
||
(*rq).done = 1u;
|
||
}
|
||
|
||
fn rayQueryGetCommittedIntersectionType(rq: ptr<function, RayQuery>) -> u32 { return (*rq).committedType; }
|
||
fn rayQueryGetCommittedT(rq: ptr<function, RayQuery>) -> f32 { return (*rq).committedT; }
|
||
fn rayQueryGetCommittedInstanceId(rq: ptr<function, RayQuery>) -> u32 { return (*rq).committedInstanceId; }
|
||
fn rayQueryGetCommittedInstanceCustomIndex(rq: ptr<function, RayQuery>) -> u32 { return (*rq).committedInstanceCustomIndex; }
|
||
fn rayQueryGetCommittedPrimitiveIndex(rq: ptr<function, RayQuery>) -> u32 { return (*rq).committedPrimitiveIndex; }
|
||
fn rayQueryGetCommittedBarycentrics(rq: ptr<function, RayQuery>) -> vec2<f32> { return (*rq).committedBarycentrics; }
|
||
fn rayQueryGetCommittedObjectRayOrigin(rq: ptr<function, RayQuery>) -> vec3<f32> { return (*rq).committedObjectRayOrigin; }
|
||
fn rayQueryGetCommittedObjectRayDirection(rq: ptr<function, RayQuery>) -> vec3<f32> { return (*rq).committedObjectRayDirection; }
|
||
fn rayQueryGetCommittedWorldPosition(rq: ptr<function, RayQuery>) -> vec3<f32> {
|
||
return (*rq).ray.origin + (*rq).ray.direction * (*rq).committedT;
|
||
}
|
||
`;
|
||
|
||
// TLAS spatial-partition bin. 64 of these per frame partition instances
|
||
// by the top 6 bits of their Morton code. RT traversal walks bin AABBs
|
||
// first; only bins a ray hits get their per-instance loop. Layout
|
||
// matches the WGSL struct below.
|
||
const TLAS_BIN_COUNT = 64;
|
||
const TLAS_BIN_SIZE = 32;
|
||
|
||
// ── Internal compute pipeline: builds TLASEntry[] from the RTInstance[]
|
||
// + the meshRecords table. One thread per instance.
|
||
// Uses only rtWgslTypes (no megakernel bindings) so it stays well under
|
||
// the 8-storage-buffer-per-stage baseline limit.
|
||
const tlasBuildWgsl = rtWgslTypes /* needs MeshRecord + TLASEntry */ + String.raw`
|
||
struct RTInstance {
|
||
transformR0: vec4<f32>,
|
||
transformR1: vec4<f32>,
|
||
transformR2: vec4<f32>,
|
||
customIndexMask: u32, // customIndex: low 24, mask: high 8
|
||
sbtFlags: u32, // sbtOffset: low 24, flags: high 8
|
||
accelStructureRef: vec2<u32>,
|
||
};
|
||
|
||
@group(0) @binding(0) var<storage, read> inInstances : array<RTInstance>;
|
||
@group(0) @binding(1) var<storage, read> inMeshes : array<MeshRecord>;
|
||
@group(0) @binding(2) var<storage, read_write> outEntries : array<TLASEntry>;
|
||
@group(0) @binding(3) var<storage, read_write> outOrder : array<u32>; // identity perm for stage 1
|
||
@group(0) @binding(4) var<storage, read_write> outMorton : array<u32>; // 30-bit Morton from world-AABB centroid
|
||
|
||
// Spread the low 10 bits of v across every 3rd bit (so three of these
|
||
// interleave into a 30-bit Morton code). Standard "magic number" version.
|
||
fn _expandBits10(v0: u32) -> u32 {
|
||
var v = v0 & 0x000003FFu;
|
||
v = (v * 0x00010001u) & 0xFF0000FFu;
|
||
v = (v * 0x00000101u) & 0x0F00F00Fu;
|
||
v = (v * 0x00000011u) & 0xC30C30C3u;
|
||
v = (v * 0x00000005u) & 0x49249249u;
|
||
return v;
|
||
}
|
||
fn _mortonCode3D(c: vec3<f32>) -> u32 {
|
||
// c assumed in [0, 1]^3.
|
||
let q = clamp(c, vec3<f32>(0.0), vec3<f32>(1.0));
|
||
let xi = u32(q.x * 1023.0);
|
||
let yi = u32(q.y * 1023.0);
|
||
let zi = u32(q.z * 1023.0);
|
||
return (_expandBits10(xi) << 2u)
|
||
| (_expandBits10(yi) << 1u)
|
||
| _expandBits10(zi);
|
||
}
|
||
|
||
fn _invMat3(c0: vec3<f32>, c1: vec3<f32>, c2: vec3<f32>) -> array<vec3<f32>, 3> {
|
||
let m00 = c1.y*c2.z - c1.z*c2.y;
|
||
let m01 = c1.z*c2.x - c1.x*c2.z;
|
||
let m02 = c1.x*c2.y - c1.y*c2.x;
|
||
let det = c0.x*m00 + c0.y*m01 + c0.z*m02;
|
||
let inv = 1.0 / det;
|
||
return array<vec3<f32>, 3>(
|
||
vec3<f32>(m00, c0.z*c2.y - c0.y*c2.z, c0.y*c1.z - c0.z*c1.y) * inv,
|
||
vec3<f32>(m01, c0.x*c2.z - c0.z*c2.x, c0.z*c1.x - c0.x*c1.z) * inv,
|
||
vec3<f32>(m02, c0.y*c2.x - c0.x*c2.y, c0.x*c1.y - c0.y*c1.x) * inv,
|
||
);
|
||
}
|
||
|
||
@compute @workgroup_size(64, 1, 1)
|
||
fn tlasBuildMain(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||
let i = gid.x;
|
||
if (i >= arrayLength(&inInstances)) { return; }
|
||
let inst = inInstances[i];
|
||
let meshIdx = inst.accelStructureRef.x;
|
||
let custom = inst.customIndexMask & 0xFFFFFFu;
|
||
let mask = (inst.customIndexMask >> 24u) & 0xFFu;
|
||
let sbtOffset = inst.sbtFlags & 0xFFFFFFu;
|
||
let iflags = (inst.sbtFlags >> 24u) & 0xFFu;
|
||
|
||
let rec = inMeshes[meshIdx];
|
||
// 8 corners → world AABB.
|
||
var worldMin = vec3<f32>(1e30);
|
||
var worldMax = vec3<f32>(-1e30);
|
||
for (var c: u32 = 0u; c < 8u; c = c + 1u) {
|
||
let corner = vec3<f32>(
|
||
select(rec.rootAabbMin.x, rec.rootAabbMax.x, (c & 1u) != 0u),
|
||
select(rec.rootAabbMin.y, rec.rootAabbMax.y, (c & 2u) != 0u),
|
||
select(rec.rootAabbMin.z, rec.rootAabbMax.z, (c & 4u) != 0u),
|
||
);
|
||
let wc = vec3<f32>(
|
||
dot(inst.transformR0.xyz, corner) + inst.transformR0.w,
|
||
dot(inst.transformR1.xyz, corner) + inst.transformR1.w,
|
||
dot(inst.transformR2.xyz, corner) + inst.transformR2.w,
|
||
);
|
||
worldMin = min(worldMin, wc);
|
||
worldMax = max(worldMax, wc);
|
||
}
|
||
|
||
// Inverse 3x4 affine.
|
||
let inv = _invMat3(inst.transformR0.xyz, inst.transformR1.xyz, inst.transformR2.xyz);
|
||
let T = vec3<f32>(inst.transformR0.w, inst.transformR1.w, inst.transformR2.w);
|
||
let invT = vec3<f32>(-dot(inv[0], T), -dot(inv[1], T), -dot(inv[2], T));
|
||
|
||
var e: TLASEntry;
|
||
e.aabbMin = worldMin;
|
||
e.aabbMax = worldMax;
|
||
e.maskHGOffset = mask | (sbtOffset << 8u);
|
||
e.blasMeshIdx = meshIdx;
|
||
e.objectToWorldR0 = inst.transformR0;
|
||
e.objectToWorldR1 = inst.transformR1;
|
||
e.objectToWorldR2 = inst.transformR2;
|
||
e.worldToObjectR0 = vec4<f32>(inv[0], invT.x);
|
||
e.worldToObjectR1 = vec4<f32>(inv[1], invT.y);
|
||
e.worldToObjectR2 = vec4<f32>(inv[2], invT.z);
|
||
e.customIndex = custom;
|
||
e.instanceFlags = iflags;
|
||
outEntries[i] = e;
|
||
|
||
// LBVH pre-pass output:
|
||
// outOrder[i] = i — identity permutation (Stage 1; the
|
||
// radix sort in Stage 2 rewrites this).
|
||
// outMorton[i] = morton(centroid normalized to [0,1]^3 over the
|
||
// fixed world bound).
|
||
// Fixed bound is conservative for 3DForts (map ~2000 units, fort ~50).
|
||
// Stage 2 swaps to a true scene-AABB reduction.
|
||
outOrder[i] = i;
|
||
let centroid = (worldMin + worldMax) * 0.5;
|
||
let worldHalfExtent = 4000.0;
|
||
let normalized = (centroid + vec3<f32>(worldHalfExtent)) * (1.0 / (2.0 * worldHalfExtent));
|
||
outMorton[i] = _mortonCode3D(normalized);
|
||
}
|
||
`;
|
||
|
||
// TLAS LBVH build — single dispatch of one 1024-thread workgroup.
|
||
// Phases:
|
||
// 0. Scene AABB reduction (256-wide tree reduce).
|
||
// 1. Pack (morton16 << 16) | tlasIndex16 keys into sortA. Sentinel
|
||
// slots get 0xFFFFFFFF so they sort to the end; m16 is clamped to
|
||
// 0xFFFE so no real key collides with the sentinel.
|
||
// 2. Bitonic sort of the packed keys ascending (in-place in sortA).
|
||
// Data-oblivious network — 105 compare-exchange sub-stages over
|
||
// 2^14 keys — so it cannot exhibit the count-dependent corruption
|
||
// the old LSD radix scatter did (TODO-lbvh-sort.md, strategy #5).
|
||
// 3. Write sorted instance permutation into outOrder.
|
||
// 4. Initialize BVH leaf AABBs from sorted instances.
|
||
// 5. Bottom-up sweep-tree refit, log2(N_PADDED) levels.
|
||
//
|
||
// Storage-barrier pattern: workgroupBarrier() fences workgroup memory
|
||
// only per WGSL spec; storageBarrier() is required between R/W phases
|
||
// on sortA and outBvh — including between every bitonic sub-stage, which
|
||
// reads and writes sortA. Both are called at every storage boundary.
|
||
//
|
||
// Hard cap: LBVH_MAX = 16384. The bitonic sort gives BVH leaves Morton
|
||
// (Z-order) spatial coherence, and the degenerate-AABB fix in _rtAabb
|
||
// keeps sentinel-only subtrees from being traversed. Per-ray cost scales
|
||
// with log2(N_real), not log2(N_PADDED).
|
||
const LBVH_MAX = 16384;
|
||
const lbvhBuildWgsl = String.raw`
|
||
struct TLASEntryStub {
|
||
aabbMin: vec3<f32>,
|
||
maskHGOffset: u32,
|
||
aabbMax: vec3<f32>,
|
||
blasMeshIdx: u32,
|
||
objectToWorldR0: vec4<f32>,
|
||
objectToWorldR1: vec4<f32>,
|
||
objectToWorldR2: vec4<f32>,
|
||
worldToObjectR0: vec4<f32>,
|
||
worldToObjectR1: vec4<f32>,
|
||
worldToObjectR2: vec4<f32>,
|
||
customIndex: u32,
|
||
instanceFlags: u32,
|
||
_pad0: u32,
|
||
_pad1: u32,
|
||
};
|
||
struct BvhNode {
|
||
aabbMin: vec3<f32>,
|
||
_pad0: u32,
|
||
aabbMax: vec3<f32>,
|
||
_pad1: u32,
|
||
};
|
||
|
||
@group(0) @binding(0) var<storage, read> entries : array<TLASEntryStub>;
|
||
@group(0) @binding(1) var<storage, read_write> outOrder : array<u32>;
|
||
@group(0) @binding(2) var<storage, read_write> outBvh : array<BvhNode>;
|
||
// Radix-sort ping-pong buffers. One u32 per element — the packed
|
||
// (morton16 << 16) | tlasIndex16 key. Sized for N_PADDED.
|
||
@group(0) @binding(3) var<storage, read_write> sortA : array<u32>;
|
||
@group(0) @binding(4) var<storage, read_write> sortB : array<u32>;
|
||
// Real instance count. Passed as a uniform so the entries / outOrder /
|
||
// sortA / sortB / outBvh buffers can be allocated ONCE at N_PADDED and
|
||
// never resized as the application's TLAS instance count changes —
|
||
// runtime resize-on-grow caused subtle BVH corruption (driver-level
|
||
// memory recycling, suspected) and was the root cause of mid-game
|
||
// geometry flicker when projectiles entered the TLAS.
|
||
struct LbvhPC { nReal: u32, _pad0: u32, _pad1: u32, _pad2: u32 };
|
||
@group(0) @binding(5) var<uniform> lbvhPc : LbvhPC;
|
||
|
||
const N_PADDED: u32 = 16384u;
|
||
const THREADS: u32 = 1024u;
|
||
const K_PER: u32 = 16u; // = N_PADDED / THREADS
|
||
const REDUCE_LANES: u32 = 256u;
|
||
const REDUCE_K_PER: u32 = 64u; // = N_PADDED / REDUCE_LANES
|
||
const LEVELS: u32 = 14u; // log2(N_PADDED)
|
||
|
||
// Scene-AABB reduction scratch — 256-lane tree reduce. vec3 stride is
|
||
// 16 by WGSL alignment → 4 KB each, 8 KB total. Well under the 16 KB
|
||
// default workgroup-storage cap.
|
||
var<workgroup> shRedMin: array<vec3<f32>, 256>;
|
||
var<workgroup> shRedMax: array<vec3<f32>, 256>;
|
||
var<workgroup> shSceneMin: vec3<f32>;
|
||
var<workgroup> shSceneMax: vec3<f32>;
|
||
|
||
fn _expandBits10(v0: u32) -> u32 {
|
||
var v = v0 & 0x000003FFu;
|
||
v = (v * 0x00010001u) & 0xFF0000FFu;
|
||
v = (v * 0x00000101u) & 0x0F00F00Fu;
|
||
v = (v * 0x00000011u) & 0xC30C30C3u;
|
||
v = (v * 0x00000005u) & 0x49249249u;
|
||
return v;
|
||
}
|
||
fn _mortonCode3D(c: vec3<f32>) -> u32 {
|
||
let q = clamp(c, vec3<f32>(0.0), vec3<f32>(1.0));
|
||
let xi = u32(q.x * 1023.0);
|
||
let yi = u32(q.y * 1023.0);
|
||
let zi = u32(q.z * 1023.0);
|
||
return (_expandBits10(xi) << 2u)
|
||
| (_expandBits10(yi) << 1u)
|
||
| _expandBits10(zi);
|
||
}
|
||
|
||
@compute @workgroup_size(1024, 1, 1)
|
||
fn lbvhBuildMain(@builtin(local_invocation_id) lid: vec3<u32>) {
|
||
let tid = lid.x;
|
||
let n = lbvhPc.nReal;
|
||
|
||
// ── Phase 0: scene AABB reduction across centroids ───────────────────
|
||
// REDUCE_LANES=256 lanes each fold REDUCE_K_PER stripes (covers
|
||
// N_PADDED), then an 8-step tree reduce across those lanes gives
|
||
// the final AABB.
|
||
if (tid < REDUCE_LANES) {
|
||
var lMin = vec3<f32>( 1e30);
|
||
var lMax = vec3<f32>(-1e30);
|
||
for (var k: u32 = 0u; k < REDUCE_K_PER; k = k + 1u) {
|
||
let i = tid * REDUCE_K_PER + k;
|
||
if (i < n) {
|
||
let c = (entries[i].aabbMin + entries[i].aabbMax) * 0.5;
|
||
lMin = min(lMin, c);
|
||
lMax = max(lMax, c);
|
||
}
|
||
}
|
||
shRedMin[tid] = lMin;
|
||
shRedMax[tid] = lMax;
|
||
}
|
||
workgroupBarrier();
|
||
var stride: u32 = 128u;
|
||
for (var s: u32 = 0u; s < 8u; s = s + 1u) {
|
||
if (tid < stride) {
|
||
shRedMin[tid] = min(shRedMin[tid], shRedMin[tid + stride]);
|
||
shRedMax[tid] = max(shRedMax[tid], shRedMax[tid + stride]);
|
||
}
|
||
workgroupBarrier();
|
||
stride = stride / 2u;
|
||
}
|
||
if (tid == 0u) {
|
||
shSceneMin = shRedMin[0];
|
||
shSceneMax = shRedMax[0];
|
||
}
|
||
workgroupBarrier();
|
||
|
||
// ── Phase 1: emit packed sort keys into sortA ────────────────────────
|
||
let extent = max(shSceneMax - shSceneMin, vec3<f32>(1e-3));
|
||
let invExtent = vec3<f32>(1.0) / extent;
|
||
for (var k: u32 = 0u; k < K_PER; k = k + 1u) {
|
||
let i = k * THREADS + tid;
|
||
var key: u32;
|
||
if (i < n) {
|
||
let c = (entries[i].aabbMin + entries[i].aabbMax) * 0.5;
|
||
let nrm = (c - shSceneMin) * invExtent;
|
||
let m30 = _mortonCode3D(nrm);
|
||
let m16 = min(m30 >> 14u, 0xFFFEu);
|
||
key = (m16 << 16u) | (i & 0xFFFFu);
|
||
} else {
|
||
key = 0xFFFFFFFFu;
|
||
}
|
||
sortA[i] = key;
|
||
}
|
||
workgroupBarrier();
|
||
storageBarrier();
|
||
|
||
// ── Phase 2: bitonic sort of sortA[0..N_PADDED) ascending ────────────
|
||
// Replaces the previous LSD radix scatter, which produced
|
||
// count-dependent corruption (TODO-lbvh-sort.md): a memory-ordering
|
||
// bug in the Hillis-Steele scan / parallel scatter that surfaced only
|
||
// for certain Morton distributions (a small object beside a tight
|
||
// cluster), making fort geometry flicker. Despite careful review the
|
||
// exact race was never pinned down.
|
||
//
|
||
// A bitonic sorting network is DATA-OBLIVIOUS: the sequence of
|
||
// compare-exchanges depends only on N_PADDED, never on the key values.
|
||
// That eliminates the entire class of distribution-dependent races the
|
||
// radix sort tripped over (TODO strategy #5). N_PADDED is a power of
|
||
// two so the network is exact; sentinel keys (0xFFFFFFFF) compare
|
||
// largest and settle at the tail — exactly where Phase 4 expects them.
|
||
//
|
||
// Single workgroup, storage-resident: 16384 u32 = 64 KB exceeds the
|
||
// workgroup-storage cap, so the keys stay in sortA. Each of the 1024
|
||
// threads owns PAIRS_PER_THREAD = (N_PADDED/2)/THREADS = 8 compare-
|
||
// exchanges per sub-stage. A bitonic network over 2^14 keys has
|
||
// sum(p for p in 1..=14) = 105 sub-stages; storageBarrier() fences
|
||
// sortA between each so one sub-stage's writes are visible to the next.
|
||
// sortB is unused by this path (left bound; harmless).
|
||
const PAIRS: u32 = N_PADDED / 2u; // 8192 compare-exchanges / sub-stage
|
||
const PAIRS_PER_THREAD: u32 = PAIRS / THREADS; // 8
|
||
for (var k: u32 = 2u; k <= N_PADDED; k = k << 1u) {
|
||
for (var j: u32 = k >> 1u; j > 0u; j = j >> 1u) {
|
||
for (var t: u32 = 0u; t < PAIRS_PER_THREAD; t = t + 1u) {
|
||
// Linear pair id p in [0, N_PADDED/2). Map it to the lower
|
||
// index lo of the compared pair by inserting a 0 bit at
|
||
// position log2(j): lo has that bit clear, hi = lo | j.
|
||
let p = t * THREADS + tid;
|
||
let lo = ((p & ~(j - 1u)) << 1u) | (p & (j - 1u));
|
||
let hi = lo | j;
|
||
let a = sortA[lo];
|
||
let b = sortA[hi];
|
||
// Sort direction for this bitonic block. lo and hi differ
|
||
// only in bit log2(j) (< log2(k)), so both agree on (x & k).
|
||
let ascending = (lo & k) == 0u;
|
||
if ((a > b) == ascending) {
|
||
sortA[lo] = b;
|
||
sortA[hi] = a;
|
||
}
|
||
}
|
||
storageBarrier();
|
||
workgroupBarrier();
|
||
}
|
||
}
|
||
// Sorted keys (ascending; sentinels last) now live in sortA.
|
||
|
||
// ── Phase 3: write sorted instance permutation into outOrder ─────────
|
||
for (var k: u32 = 0u; k < K_PER; k = k + 1u) {
|
||
let i = k * THREADS + tid;
|
||
if (i < n) {
|
||
outOrder[i] = sortA[i] & 0xFFFFu;
|
||
}
|
||
}
|
||
workgroupBarrier();
|
||
storageBarrier();
|
||
|
||
// ── Phase 4: initialize BVH leaf AABBs ───────────────────────────────
|
||
for (var k: u32 = 0u; k < K_PER; k = k + 1u) {
|
||
let i = k * THREADS + tid;
|
||
let leafIdx = N_PADDED - 1u + i;
|
||
let leafKey = sortA[i];
|
||
if (leafKey == 0xFFFFFFFFu) {
|
||
outBvh[leafIdx].aabbMin = vec3<f32>( 1e30);
|
||
outBvh[leafIdx].aabbMax = vec3<f32>(-1e30);
|
||
} else {
|
||
let e = entries[leafKey & 0xFFFFu];
|
||
outBvh[leafIdx].aabbMin = e.aabbMin;
|
||
outBvh[leafIdx].aabbMax = e.aabbMax;
|
||
}
|
||
}
|
||
workgroupBarrier();
|
||
storageBarrier();
|
||
|
||
// ── Phase 5: bottom-up sweep-tree refit, LEVELS iterations ──────────
|
||
// Deepest internal level has N_PADDED/2 nodes; perThread = ceil of
|
||
// levelCount / THREADS is uniform per step, so workgroupBarrier
|
||
// stays in uniform control flow.
|
||
var levelCount: u32 = N_PADDED / 2u;
|
||
var levelStart: u32 = N_PADDED / 2u - 1u;
|
||
for (var step: u32 = 0u; step < LEVELS; step = step + 1u) {
|
||
let perThread = (levelCount + THREADS - 1u) / THREADS;
|
||
for (var k: u32 = 0u; k < perThread; k = k + 1u) {
|
||
let nodeOff = k * THREADS + tid;
|
||
if (nodeOff < levelCount) {
|
||
let nodeIdx = levelStart + nodeOff;
|
||
let leftIdx = 2u * nodeIdx + 1u;
|
||
let rightIdx = 2u * nodeIdx + 2u;
|
||
let lMin = outBvh[leftIdx].aabbMin;
|
||
let lMax = outBvh[leftIdx].aabbMax;
|
||
let rMin = outBvh[rightIdx].aabbMin;
|
||
let rMax = outBvh[rightIdx].aabbMax;
|
||
outBvh[nodeIdx].aabbMin = min(lMin, rMin);
|
||
outBvh[nodeIdx].aabbMax = max(lMax, rMax);
|
||
}
|
||
}
|
||
workgroupBarrier();
|
||
storageBarrier();
|
||
levelCount = levelCount / 2u;
|
||
levelStart = (levelStart - 1u) / 2u;
|
||
}
|
||
}
|
||
`;
|
||
|
||
// ── RT runtime state ──────────────────────────────────────────────────
|
||
// Mesh heaps grow geometrically; each Mesh::Build appends + records its
|
||
// offsets in meshRecordsCpu/Buffer. nextMeshHandle is what gets returned
|
||
// to C++ as RTInstance::accelerationStructureReference.
|
||
const RT_HEAP_INITIAL_BYTES = 64 * 1024;
|
||
function makeRtHeap() {
|
||
return {
|
||
gpu: device.createBuffer({
|
||
size: RT_HEAP_INITIAL_BYTES,
|
||
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST,
|
||
}),
|
||
capacity: RT_HEAP_INITIAL_BYTES,
|
||
cursor: 0,
|
||
};
|
||
}
|
||
function rtHeapEnsure(h, neededBytes) {
|
||
if (h.cursor + neededBytes <= h.capacity) return;
|
||
let cap = h.capacity;
|
||
while (h.cursor + neededBytes > cap) cap *= 2;
|
||
const ng = device.createBuffer({
|
||
size: cap,
|
||
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST,
|
||
});
|
||
const enc = device.createCommandEncoder();
|
||
if (h.cursor > 0) enc.copyBufferToBuffer(h.gpu, 0, ng, 0, h.cursor);
|
||
queue.submit([enc.finish()]);
|
||
h.gpu.destroy();
|
||
h.gpu = ng;
|
||
h.capacity = cap;
|
||
// Invalidate cached bind groups that referenced the heap.
|
||
rtState.bindGroupCache.clear();
|
||
}
|
||
|
||
const rtState = {
|
||
vertHeap: null, // f32 stream (3 floats per vertex)
|
||
indexHeap: null, // u32 stream
|
||
bvhHeap: null, // BVHNode stream (32 bytes per node)
|
||
primRemapHeap: null, // u32 stream
|
||
attribsHeap: null, // u32 stream (per-vertex attribute payload; example-defined stride)
|
||
|
||
meshRecordsBuffer: null, // GPUBuffer of MeshRecord[]
|
||
meshRecordsCapacity: 0,
|
||
nextMeshHandle: 1,
|
||
|
||
rtHeader: null, // uniform buffer for RTDispatchHeader (256 B aligned)
|
||
|
||
bindGroupCache: new Map(), // key → bind group
|
||
|
||
tlasBuildPipeline: null,
|
||
tlasBuildBgl: null,
|
||
|
||
// Latest TLAS buffer handle from wgpuBuildTLAS, used by rayQuery-capable
|
||
// compute shaders at dispatch time.
|
||
currentTlas: 0,
|
||
currentTlasInstanceCount: 0,
|
||
};
|
||
|
||
function rtInit() {
|
||
rtState.vertHeap = makeRtHeap();
|
||
rtState.indexHeap = makeRtHeap();
|
||
rtState.bvhHeap = makeRtHeap();
|
||
rtState.primRemapHeap = makeRtHeap();
|
||
rtState.attribsHeap = makeRtHeap();
|
||
rtState.meshRecordsCapacity = 16;
|
||
rtState.meshRecordsBuffer = device.createBuffer({
|
||
size: rtState.meshRecordsCapacity * 48,
|
||
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC,
|
||
});
|
||
rtState.rtHeader = device.createBuffer({
|
||
size: 256,
|
||
usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
|
||
});
|
||
|
||
// TLAS-build compute pipeline. group(0) carries:
|
||
// 0 inInstances (read-only-storage)
|
||
// 1 inMeshes (read-only-storage)
|
||
// 2 outEntries (read-write storage)
|
||
// 3 outOrder (read-write — identity perm, sort overwrites later)
|
||
// 4 outMorton (read-write — sort key, 30-bit Morton from centroid)
|
||
const mod = device.createShaderModule({ code: tlasBuildWgsl, label: "rt-tlas-build" });
|
||
const tlasBuildBgl = device.createBindGroupLayout({ entries: [
|
||
{ binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||
{ binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||
{ binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } },
|
||
{ binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } },
|
||
{ binding: 4, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } },
|
||
]});
|
||
rtState.tlasBuildBgl = tlasBuildBgl;
|
||
rtState.tlasBuildPipeline = device.createComputePipeline({
|
||
layout: device.createPipelineLayout({ bindGroupLayouts: [tlasBuildBgl] }),
|
||
compute: { module: mod, entryPoint: "tlasBuildMain" },
|
||
});
|
||
|
||
// LBVH-build follow-up pipeline. Single workgroup of 1024 threads
|
||
// sorts instances by packed (morton16:idx16) keys (LSD radix) and
|
||
// refits a sweep-tree BVH. N_PADDED = 1024 ceiling for Stage 1.
|
||
// 0 entries (read-only)
|
||
// 1 outOrder (read-write, sorted permutation)
|
||
// 2 outBvh (read-write, 2*N_PADDED - 1 nodes × 32 bytes)
|
||
// 3 sortA (read-write, N_PADDED u32 ping-pong)
|
||
// 4 sortB (read-write, N_PADDED u32 ping-pong)
|
||
const lbvhMod = device.createShaderModule({ code: lbvhBuildWgsl, label: "rt-tlas-lbvh-build" });
|
||
const lbvhBuildBgl = device.createBindGroupLayout({ entries: [
|
||
{ binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||
{ binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } },
|
||
{ binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } },
|
||
{ binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } },
|
||
{ binding: 4, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } },
|
||
{ binding: 5, visibility: GPUShaderStage.COMPUTE, buffer: { type: "uniform" } },
|
||
]});
|
||
rtState.lbvhBuildBgl = lbvhBuildBgl;
|
||
rtState.lbvhBuildPipeline = device.createComputePipeline({
|
||
layout: device.createPipelineLayout({ bindGroupLayouts: [lbvhBuildBgl] }),
|
||
compute: { module: lbvhMod, entryPoint: "lbvhBuildMain" },
|
||
});
|
||
// Tiny uniform buffer for the LBVH's `nReal` field. Written each
|
||
// wgpuBuildTLAS call before dispatch.
|
||
rtState.lbvhCountBuf = device.createBuffer({
|
||
size: 16,
|
||
usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
|
||
});
|
||
}
|
||
|
||
function rtMeshRecordsEnsure(meshCount) {
|
||
if (meshCount <= rtState.meshRecordsCapacity) return;
|
||
let cap = rtState.meshRecordsCapacity;
|
||
while (cap < meshCount) cap *= 2;
|
||
const ng = device.createBuffer({
|
||
size: cap * 48,
|
||
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC,
|
||
});
|
||
const enc = device.createCommandEncoder();
|
||
enc.copyBufferToBuffer(rtState.meshRecordsBuffer, 0, ng, 0,
|
||
rtState.meshRecordsCapacity * 48);
|
||
queue.submit([enc.finish()]);
|
||
rtState.meshRecordsBuffer.destroy();
|
||
rtState.meshRecordsBuffer = ng;
|
||
rtState.meshRecordsCapacity = cap;
|
||
rtState.bindGroupCache.clear();
|
||
}
|
||
|
||
env.wgpuRegisterMeshBLAS = (minX, minY, minZ, maxX, maxY, maxZ,
|
||
verticesPtr, vertexCount,
|
||
indicesPtr, indexCount,
|
||
bvhNodesPtr, bvhNodeCount,
|
||
primRemapPtr, primRemapCount,
|
||
attribsPtr, attribsByteCount) => {
|
||
if (!rtState.vertHeap) rtInit();
|
||
console.log(`[crafter-wgpu] mesh BLAS: bbox=(${minX.toFixed(1)}..${maxX.toFixed(1)}, ${minY.toFixed(1)}..${maxY.toFixed(1)}, ${minZ.toFixed(1)}..${maxZ.toFixed(1)}), ${vertexCount} verts, ${indexCount/3} tris, attribs=${attribsByteCount}B`);
|
||
|
||
const vBytes = vertexCount * 12;
|
||
const iBytes = indexCount * 4;
|
||
const nBytes = bvhNodeCount * 32;
|
||
const rBytes = primRemapCount * 4;
|
||
// attribsByteCount must be a multiple of 4 (the heap is array<u32>).
|
||
// Round up the upload size; the in-MeshRecord offset is in u32 words.
|
||
const aBytes = (attribsByteCount + 3) & ~3;
|
||
|
||
rtHeapEnsure(rtState.vertHeap, vBytes);
|
||
rtHeapEnsure(rtState.indexHeap, iBytes);
|
||
rtHeapEnsure(rtState.bvhHeap, nBytes);
|
||
rtHeapEnsure(rtState.primRemapHeap, rBytes);
|
||
if (aBytes > 0) rtHeapEnsure(rtState.attribsHeap, aBytes);
|
||
|
||
const vOff = rtState.vertHeap.cursor / 12; // in vec3 units
|
||
const iOff = rtState.indexHeap.cursor / 4; // in u32 units
|
||
const nOff = rtState.bvhHeap.cursor / 32; // in BVHNode units
|
||
const rOff = rtState.primRemapHeap.cursor / 4;
|
||
const aOff = rtState.attribsHeap.cursor / 4; // in u32 units
|
||
|
||
// queue.writeBuffer requires multiple-of-4 sizes. Vertex byte count is
|
||
// already 12*n; index/bvh/remap are 4*n / 32*n / 4*n — all multiples of 4.
|
||
queue.writeBuffer(rtState.vertHeap.gpu, rtState.vertHeap.cursor,
|
||
memU8().buffer, verticesPtr, vBytes);
|
||
queue.writeBuffer(rtState.indexHeap.gpu, rtState.indexHeap.cursor,
|
||
memU8().buffer, indicesPtr, iBytes);
|
||
queue.writeBuffer(rtState.bvhHeap.gpu, rtState.bvhHeap.cursor,
|
||
memU8().buffer, bvhNodesPtr, nBytes);
|
||
queue.writeBuffer(rtState.primRemapHeap.gpu, rtState.primRemapHeap.cursor,
|
||
memU8().buffer, primRemapPtr, rBytes);
|
||
if (aBytes > 0) {
|
||
queue.writeBuffer(rtState.attribsHeap.gpu, rtState.attribsHeap.cursor,
|
||
memU8().buffer, attribsPtr, aBytes);
|
||
}
|
||
|
||
rtState.vertHeap.cursor += vBytes;
|
||
rtState.indexHeap.cursor += iBytes;
|
||
rtState.bvhHeap.cursor += nBytes;
|
||
rtState.primRemapHeap.cursor += rBytes;
|
||
rtState.attribsHeap.cursor += aBytes;
|
||
|
||
const handle = rtState.nextMeshHandle++;
|
||
rtMeshRecordsEnsure(handle + 1);
|
||
|
||
// Build the MeshRecord (48 bytes) and write it.
|
||
const rec = new ArrayBuffer(48);
|
||
const f32 = new Float32Array(rec);
|
||
const u32 = new Uint32Array(rec);
|
||
f32[0] = minX; f32[1] = minY; f32[2] = minZ;
|
||
u32[3] = vOff;
|
||
f32[4] = maxX; f32[5] = maxY; f32[6] = maxZ;
|
||
u32[7] = iOff;
|
||
u32[8] = nOff;
|
||
u32[9] = rOff;
|
||
u32[10] = (vertexCount > 0) ? (indexCount / 3) : 0;
|
||
u32[11] = aOff;
|
||
queue.writeBuffer(rtState.meshRecordsBuffer, handle * 48, rec);
|
||
|
||
return handle;
|
||
};
|
||
|
||
env.wgpuBuildTLAS = (instanceBufHandle, instanceCount, tlasOutBufHandle,
|
||
entryOrderHandle, mortonHandle, binsHandle,
|
||
bvhNodesHandle, sortABufHandle, sortBBufHandle) => {
|
||
if (!rtState.tlasBuildPipeline || !rtState.lbvhBuildPipeline) return;
|
||
const inst = buffers.get(instanceBufHandle);
|
||
const out = buffers.get(tlasOutBufHandle);
|
||
const order = buffers.get(entryOrderHandle);
|
||
const morton = buffers.get(mortonHandle);
|
||
const bvh = buffers.get(bvhNodesHandle);
|
||
const sortA = buffers.get(sortABufHandle);
|
||
const sortB = buffers.get(sortBBufHandle);
|
||
if (!inst || !out || !order || !morton || !bvh || !sortA || !sortB) {
|
||
console.error("[crafter-wgpu] wgpuBuildTLAS: unknown buffer handle");
|
||
return;
|
||
}
|
||
if (instanceCount > LBVH_MAX) {
|
||
console.error(`[crafter-wgpu] wgpuBuildTLAS: instance count ${instanceCount} > LBVH cap ${LBVH_MAX}`);
|
||
return;
|
||
}
|
||
|
||
const bg = device.createBindGroup({
|
||
layout: rtState.tlasBuildBgl,
|
||
entries: [
|
||
{ binding: 0, resource: { buffer: inst } },
|
||
{ binding: 1, resource: { buffer: rtState.meshRecordsBuffer } },
|
||
{ binding: 2, resource: { buffer: out } },
|
||
{ binding: 3, resource: { buffer: order } },
|
||
{ binding: 4, resource: { buffer: morton } },
|
||
],
|
||
});
|
||
// Write the real instance count to the LBVH count uniform so the
|
||
// shader can iterate exactly the right number of entries even
|
||
// though the storage buffers stay sized for N_PADDED.
|
||
const countBuf = new Uint32Array(4);
|
||
countBuf[0] = instanceCount;
|
||
queue.writeBuffer(rtState.lbvhCountBuf, 0, countBuf);
|
||
|
||
const lbvhBg = device.createBindGroup({
|
||
layout: rtState.lbvhBuildBgl,
|
||
entries: [
|
||
{ binding: 0, resource: { buffer: out } },
|
||
{ binding: 1, resource: { buffer: order } },
|
||
{ binding: 2, resource: { buffer: bvh } },
|
||
{ binding: 3, resource: { buffer: sortA } },
|
||
{ binding: 4, resource: { buffer: sortB } },
|
||
{ binding: 5, resource: { buffer: rtState.lbvhCountBuf } },
|
||
],
|
||
});
|
||
|
||
if (state.pass) {
|
||
// Mid-frame — close the user's compute pass, run our build pass
|
||
// on the same encoder, then reopen.
|
||
state.pass.end();
|
||
state.pass = null;
|
||
}
|
||
const enc = state.encoder || device.createCommandEncoder();
|
||
const ownEncoder = !state.encoder;
|
||
// Pass 1: TLAS entry build (existing).
|
||
{
|
||
const pass = enc.beginComputePass({ label: "tlas-build" });
|
||
pass.setPipeline(rtState.tlasBuildPipeline);
|
||
pass.setBindGroup(0, bg);
|
||
const groups = Math.ceil(instanceCount / 64);
|
||
pass.dispatchWorkgroups(groups, 1, 1);
|
||
pass.end();
|
||
}
|
||
// Pass 2: LBVH-build. Single workgroup of 1024 threads runs the
|
||
// entire BVH build (Morton, sort, sweep-tree refit) in shared
|
||
// memory. Same encoder → pipeline barrier sequences pass1 → pass2.
|
||
if (instanceCount > 0) {
|
||
const pass = enc.beginComputePass({ label: "tlas-lbvh-build" });
|
||
pass.setPipeline(rtState.lbvhBuildPipeline);
|
||
pass.setBindGroup(0, lbvhBg);
|
||
pass.dispatchWorkgroups(1, 1, 1);
|
||
pass.end();
|
||
}
|
||
if (ownEncoder) {
|
||
queue.submit([enc.finish()]);
|
||
} else {
|
||
state.pass = state.encoder.beginComputePass();
|
||
}
|
||
|
||
// Publish so rayQuery-capable compute pipelines pick up the latest TLAS
|
||
// without each dispatch having to thread the handle explicitly.
|
||
rtState.currentTlas = tlasOutBufHandle;
|
||
rtState.currentTlasInstanceCount = instanceCount;
|
||
rtState.currentEntryOrder = entryOrderHandle;
|
||
rtState.currentBvh = bvhNodesHandle;
|
||
};
|
||
|
||
// RT pipeline loader — wraps user-supplied WGSL (sources + generated mega
|
||
// switches + raygen + @compute entry) with the library prelude/helpers.
|
||
// `bindingsPtr` / `bindingsCount` are UICustomBinding entries (same 8-byte
|
||
// shape as wgpuLoadCustomShader) declaring extra @group(2)+ resources the
|
||
// closest-hit / miss / raygen WGSL touches (material SSBOs, albedo
|
||
// textures, samplers). Pass (0, 0) for a pipeline with no user bindings.
|
||
const rtPipelines = new Map(); // handle → { pipeline, bgls, byGroup, sortedGroups }
|
||
|
||
// Per-payload byte budget in wfPayload (rounded up; user Payload structs
|
||
// must fit). Sponza's Payload is 48 B; 64 leaves headroom while keeping
|
||
// 2·W·H·64 B ≈ 265 MB at 1080p.
|
||
const WF_PAYLOAD_BYTES = 64;
|
||
// Dynamic-offset uniform ring: one WfParams slot per wavefront pass. 128
|
||
// slots covers maxDepth up to ~42 (1 + 3·maxDepth + 1 passes).
|
||
const WF_PARAM_SLOTS = 128;
|
||
const WF_FIXED_TLAS_NPADDED = 16384; // matches lbvhBuildWgsl N_PADDED
|
||
|
||
function ensureWavefrontBuffers(W, H) {
|
||
const cap = W * H;
|
||
rtState.wf = rtState.wf || { cap: 0 };
|
||
const wf = rtState.wf;
|
||
if (wf.cap === cap && wf.raysA) return wf;
|
||
for (const b of [wf.raysA, wf.raysB, wf.hits, wf.accum, wf.counters,
|
||
wf.payload, wf.indirect]) { if (b) b.destroy(); }
|
||
const S = GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST;
|
||
wf.raysA = device.createBuffer({ size: cap * 64, usage: S, label: "wf-raysA" });
|
||
wf.raysB = device.createBuffer({ size: cap * 64, usage: S, label: "wf-raysB" });
|
||
wf.hits = device.createBuffer({ size: cap * 112, usage: S, label: "wf-hits" });
|
||
wf.accum = device.createBuffer({ size: cap * 16, usage: S, label: "wf-accum" });
|
||
wf.payload = device.createBuffer({ size: 2 * cap * WF_PAYLOAD_BYTES, usage: S, label: "wf-payload" });
|
||
wf.counters = device.createBuffer({ size: 64,
|
||
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC, label: "wf-counters" });
|
||
wf.indirect = device.createBuffer({ size: 16,
|
||
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.INDIRECT | GPUBufferUsage.COPY_DST, label: "wf-indirect" });
|
||
if (!wf.paramsRing) {
|
||
wf.paramsRing = device.createBuffer({ size: WF_PARAM_SLOTS * 256,
|
||
usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST, label: "wf-params" });
|
||
}
|
||
wf.cap = cap;
|
||
return wf;
|
||
}
|
||
|
||
env.wgpuLoadRTPipeline = (wgslPtr, wgslLen, bindingsPtr, bindingsCount) => {
|
||
if (!rtState.vertHeap) rtInit();
|
||
const userPart = new TextDecoder().decode(memU8().subarray(wgslPtr, wgslPtr + wgslLen));
|
||
|
||
// Insert helpers at the marker; prepend prelude.
|
||
const marker = "// @CRAFTER_RT_LIBRARY_HELPERS_HERE";
|
||
let beforeHelpers = userPart;
|
||
let afterHelpers = "";
|
||
const mi = userPart.indexOf(marker);
|
||
if (mi >= 0) {
|
||
beforeHelpers = userPart.substring(0, mi);
|
||
afterHelpers = userPart.substring(mi + marker.length);
|
||
}
|
||
// Wavefront assembly: types + bindings | user CH/miss/resolve + wfPayload
|
||
// + switches (beforeHelpers) | pure helpers | wavefront helpers | user
|
||
// raygen + the five @compute entry points (afterHelpers).
|
||
const fullWgsl = rtWgslTypes + rtWgslWavefrontBindings + "\n"
|
||
+ beforeHelpers + "\n" + rtWgslPureHelpers + "\n"
|
||
+ rtWgslWavefrontHelpers + "\n" + afterHelpers;
|
||
|
||
// Parse user bindings (same wire format as wgpuLoadCustomShader). For
|
||
// the wavefront RT pipeline, group 0 = WfParams, group 1 = data heaps,
|
||
// group 2 = indirect args — so user bindings must start at group 3.
|
||
const userBindings = [];
|
||
if (bindingsCount > 0) {
|
||
const dv = new DataView(memU8().buffer, bindingsPtr, bindingsCount * 8);
|
||
for (let i = 0; i < bindingsCount; i++) {
|
||
const g = dv.getUint8(i*8 + 0);
|
||
if (g < 3) {
|
||
console.error(`[crafter-wgpu] RT pipeline: @group(${g}) reserved; user bindings need group >= 3`);
|
||
return 0;
|
||
}
|
||
userBindings.push({
|
||
group: g,
|
||
binding: dv.getUint8(i*8 + 1),
|
||
kind: dv.getUint8(i*8 + 2),
|
||
pushOffset: dv.getUint32(i*8 + 4, true),
|
||
});
|
||
}
|
||
}
|
||
const byGroup = new Map();
|
||
for (const b of userBindings) {
|
||
if (!byGroup.has(b.group)) byGroup.set(b.group, []);
|
||
byGroup.get(b.group).push(b);
|
||
}
|
||
const sortedGroups = [...byGroup.keys()].sort((a, b) => a - b);
|
||
|
||
try {
|
||
const mod = device.createShaderModule({ code: fullWgsl, label: "rt-wavefront" });
|
||
const paramsBgl = device.createBindGroupLayout({ entries: [
|
||
{ binding: 0, visibility: GPUShaderStage.COMPUTE,
|
||
buffer: { type: "uniform", hasDynamicOffset: true, minBindingSize: 32 } },
|
||
]});
|
||
const sb = (b) => ({ binding: b, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } });
|
||
const rw = (b) => ({ binding: b, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } });
|
||
const dataBgl = device.createBindGroupLayout({ entries: [
|
||
sb(0), sb(1), sb(2), sb(3), sb(4), sb(5),
|
||
{ binding: 6, visibility: GPUShaderStage.COMPUTE,
|
||
storageTexture: { format: "rgba8unorm", access: "write-only", viewDimension: "2d" } },
|
||
sb(7), sb(8), sb(9),
|
||
rw(10), rw(11), rw(12), rw(13), rw(14), rw(15),
|
||
]});
|
||
const indirectBgl = device.createBindGroupLayout({ entries: [ rw(0) ]});
|
||
const emptyBgl = device.createBindGroupLayout({ entries: [] });
|
||
|
||
// User binding-group layouts for groups 3..highest (pad gaps).
|
||
const userBgls = [];
|
||
const highest = sortedGroups.length ? sortedGroups[sortedGroups.length - 1] : 2;
|
||
for (let g = 3; g <= highest; g++) {
|
||
if (byGroup.has(g)) {
|
||
const entries = byGroup.get(g).map(b => {
|
||
const e = { binding: b.binding, visibility: GPUShaderStage.COMPUTE };
|
||
if (b.kind === 0) e.buffer = { type: "read-only-storage" };
|
||
else if (b.kind === 1) e.texture = { sampleType: "float", viewDimension: "2d" };
|
||
else if (b.kind === 2) e.sampler = { type: "filtering" };
|
||
else if (b.kind === 3) e.texture = { sampleType: "float", viewDimension: "2d-array" };
|
||
else if (b.kind === 4) e.buffer = { type: "storage" };
|
||
return e;
|
||
});
|
||
userBgls.push(device.createBindGroupLayout({ entries }));
|
||
} else {
|
||
userBgls.push(device.createBindGroupLayout({ entries: [] }));
|
||
}
|
||
}
|
||
// GENERATE / SHADE / RESOLVE may touch user bindings (raygen camera,
|
||
// closesthit albedo, resolve params) → params + data + empty(group2)
|
||
// + user. PREP → params + data + indirect. TRACE → params + data.
|
||
const userLayout = device.createPipelineLayout({
|
||
bindGroupLayouts: [paramsBgl, dataBgl, emptyBgl, ...userBgls] });
|
||
const prepLayout = device.createPipelineLayout({
|
||
bindGroupLayouts: [paramsBgl, dataBgl, indirectBgl] });
|
||
const traceLayout = device.createPipelineLayout({
|
||
bindGroupLayouts: [paramsBgl, dataBgl] });
|
||
const mk = (layout, ep) => device.createComputePipeline({
|
||
layout, compute: { module: mod, entryPoint: ep } });
|
||
|
||
const entry = {
|
||
genPipe: mk(userLayout, "wfGenerate"),
|
||
prepPipe: mk(prepLayout, "wfPrep"),
|
||
tracePipe: mk(traceLayout, "wfTrace"),
|
||
shadePipe: mk(userLayout, "wfShade"),
|
||
resolvePipe: mk(userLayout, "wfResolve"),
|
||
paramsBgl, dataBgl, indirectBgl, emptyBgl, userBgls,
|
||
byGroup, sortedGroups,
|
||
};
|
||
const handle = newHandle();
|
||
rtPipelines.set(handle, entry);
|
||
return handle;
|
||
} catch (e) {
|
||
console.error("[crafter-wgpu] RT pipeline compile failed:", e);
|
||
console.error("[crafter-wgpu] WGSL was:\n", fullWgsl);
|
||
return 0;
|
||
}
|
||
};
|
||
|
||
// Build the user @group(3+) bind groups for a pass, returning a list of
|
||
// { group, bindGroup } to set. Shared by GENERATE / SHADE / RESOLVE.
|
||
function wfUserBindGroups(pipe, handlesPtr, handlesCount) {
|
||
const out = [];
|
||
if (handlesCount <= 0) return out;
|
||
const handles = new Uint32Array(memU8().buffer, handlesPtr, handlesCount);
|
||
let handleIdx = 0;
|
||
let bglIdx = 0;
|
||
const highest = pipe.sortedGroups.length ? pipe.sortedGroups[pipe.sortedGroups.length - 1] : 2;
|
||
for (let g = 3; g <= highest; g++) {
|
||
if (pipe.byGroup.has(g)) {
|
||
const entries = pipe.byGroup.get(g).map(b => {
|
||
const h = handles[handleIdx++];
|
||
let resource;
|
||
if (b.kind === 0) resource = { buffer: buffers.get(h) };
|
||
else if (b.kind === 1) resource = textureViews.get(h);
|
||
else if (b.kind === 2) resource = samplers.get(h);
|
||
else if (b.kind === 3) resource = textureViews.get(h);
|
||
else if (b.kind === 4) resource = { buffer: buffers.get(h) };
|
||
return { binding: b.binding, resource };
|
||
});
|
||
out.push({ group: g, bindGroup: device.createBindGroup({ layout: pipe.userBgls[bglIdx], entries }) });
|
||
}
|
||
bglIdx++;
|
||
}
|
||
return out;
|
||
}
|
||
|
||
env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
|
||
tlasBufHandle, instanceCount, gx, gy,
|
||
handlesPtr, handlesCount, maxDepth) => {
|
||
if (!state.encoder) return;
|
||
const pipe = rtPipelines.get(pipelineHandle);
|
||
const tlas = buffers.get(tlasBufHandle);
|
||
if (!pipe || !tlas) {
|
||
console.error("[crafter-wgpu] wgpuDispatchRT: unknown pipeline or tlas");
|
||
return;
|
||
}
|
||
const entryOrderBuf = buffers.get(rtState.currentEntryOrder);
|
||
const bvhBuf = buffers.get(rtState.currentBvh);
|
||
if (!entryOrderBuf || !bvhBuf) {
|
||
console.error("[crafter-wgpu] wgpuDispatchRT: missing entryOrder/bvh (no TLAS built yet?)");
|
||
return;
|
||
}
|
||
const W = state.width, H = state.height;
|
||
const cap = W * H;
|
||
const depth = Math.max(1, maxDepth | 0);
|
||
const wf = ensureWavefrontBuffers(W, H);
|
||
|
||
// ── Per-pass WfParams ring. queue.writeBuffer lands before submit, so
|
||
// we can't mutate the uniform between passes — instead we pre-write one
|
||
// slot per pass and bind it with a dynamic offset. Slot order:
|
||
// 0 GENERATE
|
||
// 1+3*d .. +2 PREP / TRACE / SHADE for bounce d
|
||
// 1+3*depth RESOLVE
|
||
const passCount = 2 + 3 * depth;
|
||
const ring = new Uint32Array(WF_PARAM_SLOTS * 64); // 256 B = 64 u32 per slot
|
||
const writeSlot = (slot, curIsA, bounce) => {
|
||
const o = slot * 64;
|
||
ring[o + 0] = W; ring[o + 1] = H; ring[o + 2] = cap; ring[o + 3] = curIsA;
|
||
ring[o + 4] = bounce; ring[o + 5] = depth; ring[o + 6] = WF_FIXED_TLAS_NPADDED; ring[o + 7] = 0;
|
||
};
|
||
writeSlot(0, 1, 0); // GENERATE
|
||
for (let d = 0; d < depth; d++) {
|
||
const curIsA = (d % 2 === 0) ? 1 : 0;
|
||
writeSlot(1 + 3 * d + 0, curIsA, d); // PREP
|
||
writeSlot(1 + 3 * d + 1, curIsA, d); // TRACE
|
||
writeSlot(1 + 3 * d + 2, curIsA, d); // SHADE
|
||
}
|
||
writeSlot(1 + 3 * depth, 1, depth); // RESOLVE
|
||
queue.writeBuffer(wf.paramsRing, 0, ring, 0, passCount * 64);
|
||
|
||
const outView = state.outIsPing ? state.pingView : state.pongView;
|
||
const paramsBg = device.createBindGroup({
|
||
layout: pipe.paramsBgl,
|
||
entries: [{ binding: 0, resource: { buffer: wf.paramsRing, offset: 0, size: 256 } }],
|
||
});
|
||
const dataBg = device.createBindGroup({
|
||
layout: pipe.dataBgl,
|
||
entries: [
|
||
{ binding: 0, resource: { buffer: tlas } },
|
||
{ binding: 1, resource: { buffer: rtState.bvhHeap.gpu } },
|
||
{ binding: 2, resource: { buffer: rtState.meshRecordsBuffer } },
|
||
{ binding: 3, resource: { buffer: rtState.vertHeap.gpu } },
|
||
{ binding: 4, resource: { buffer: rtState.indexHeap.gpu } },
|
||
{ binding: 5, resource: { buffer: rtState.primRemapHeap.gpu } },
|
||
{ binding: 6, resource: outView },
|
||
{ binding: 7, resource: { buffer: rtState.attribsHeap.gpu } },
|
||
{ binding: 8, resource: { buffer: entryOrderBuf } },
|
||
{ binding: 9, resource: { buffer: bvhBuf } },
|
||
{ binding: 10, resource: { buffer: wf.raysA } },
|
||
{ binding: 11, resource: { buffer: wf.raysB } },
|
||
{ binding: 12, resource: { buffer: wf.hits } },
|
||
{ binding: 13, resource: { buffer: wf.accum } },
|
||
{ binding: 14, resource: { buffer: wf.counters } },
|
||
{ binding: 15, resource: { buffer: wf.payload } },
|
||
],
|
||
});
|
||
const indirectBg = device.createBindGroup({
|
||
layout: pipe.indirectBgl,
|
||
entries: [{ binding: 0, resource: { buffer: wf.indirect } }],
|
||
});
|
||
const userBgs = wfUserBindGroups(pipe, handlesPtr, handlesCount);
|
||
|
||
// Close the frame's shared compute pass; the wavefront runs as its own
|
||
// sequence of passes on the same encoder (implicit barrier between each
|
||
// makes PREP's atomic writes visible to TRACE, etc.), then we reopen it.
|
||
if (state.pass) { state.pass.end(); state.pass = null; }
|
||
const enc = state.encoder;
|
||
const tileX = gx, tileY = gy;
|
||
const slotOff = (slot) => slot * 256;
|
||
|
||
// Zero the two emit counters before GENERATE.
|
||
enc.clearBuffer(wf.counters, 0, 64);
|
||
|
||
const setUser = (pass) => { for (const u of userBgs) pass.setBindGroup(u.group, u.bindGroup); };
|
||
|
||
// GENERATE
|
||
{
|
||
const p = enc.beginComputePass({ label: "wf-generate" });
|
||
p.setPipeline(pipe.genPipe);
|
||
p.setBindGroup(0, paramsBg, [slotOff(0)]);
|
||
p.setBindGroup(1, dataBg);
|
||
setUser(p);
|
||
p.dispatchWorkgroups(tileX, tileY, 1);
|
||
p.end();
|
||
}
|
||
for (let d = 0; d < depth; d++) {
|
||
const prepSlot = 1 + 3 * d + 0;
|
||
const traceSlot = 1 + 3 * d + 1;
|
||
const shadeSlot = 1 + 3 * d + 2;
|
||
// PREP — publish indirect args, zero next counter.
|
||
{
|
||
const p = enc.beginComputePass({ label: "wf-prep" });
|
||
p.setPipeline(pipe.prepPipe);
|
||
p.setBindGroup(0, paramsBg, [slotOff(prepSlot)]);
|
||
p.setBindGroup(1, dataBg);
|
||
p.setBindGroup(2, indirectBg);
|
||
p.dispatchWorkgroups(1, 1, 1);
|
||
p.end();
|
||
}
|
||
// TRACE — indirect over the live ray list.
|
||
{
|
||
const p = enc.beginComputePass({ label: "wf-trace" });
|
||
p.setPipeline(pipe.tracePipe);
|
||
p.setBindGroup(0, paramsBg, [slotOff(traceSlot)]);
|
||
p.setBindGroup(1, dataBg);
|
||
p.dispatchWorkgroupsIndirect(wf.indirect, 0);
|
||
p.end();
|
||
}
|
||
// SHADE — indirect; runs user closesthit/miss, may emit + accumulate.
|
||
{
|
||
const p = enc.beginComputePass({ label: "wf-shade" });
|
||
p.setPipeline(pipe.shadePipe);
|
||
p.setBindGroup(0, paramsBg, [slotOff(shadeSlot)]);
|
||
p.setBindGroup(1, dataBg);
|
||
setUser(p);
|
||
p.dispatchWorkgroupsIndirect(wf.indirect, 0);
|
||
p.end();
|
||
}
|
||
}
|
||
// RESOLVE — tonemap accum → output image.
|
||
{
|
||
const p = enc.beginComputePass({ label: "wf-resolve" });
|
||
p.setPipeline(pipe.resolvePipe);
|
||
p.setBindGroup(0, paramsBg, [slotOff(1 + 3 * depth)]);
|
||
p.setBindGroup(1, dataBg);
|
||
setUser(p);
|
||
p.dispatchWorkgroups(tileX, tileY, 1);
|
||
p.end();
|
||
}
|
||
|
||
// Reopen the frame's shared pass so wgpuFrameEnd / later UI work as
|
||
// before, and flip ping-pong so the blit picks the texture RESOLVE wrote.
|
||
state.pass = enc.beginComputePass();
|
||
state.outIsPing = !state.outIsPing;
|
||
};
|
||
|
||
// ── Standalone compute pipelines ────────────────────────────────────────
|
||
//
|
||
// Mirrors the native Vulkan `ComputeShader` API: user-authored compute
|
||
// pipelines that dispatch outside any UI render pass. Unlike
|
||
// wgpuLoadCustomShader (which lives inside the UI flow with ping/pong
|
||
// textures at @group(1)), these have NO library-supplied bindings —
|
||
// users declare all groups themselves.
|
||
//
|
||
// Layout contract:
|
||
// @group(0) @binding(0) uniform PushData // optional, only if pushUniformSize > 0
|
||
// @group(1+) @binding(N) // user bindings via UICustomBinding
|
||
//
|
||
// rayQuery: same flag as wgpuLoadCustomShader. When set, prepends the
|
||
// RT prelude + rayQuery library; user shader must NOT redeclare the
|
||
// resulting @group(1) layout (tlas/bvh/mesh heaps). The `bindings`
|
||
// list must start at @group(2) in that case.
|
||
|
||
const computePipelines = new Map(); // handle → { pipeline, bgls, byGroup, sortedGroups, pushUniformSize, rayQueryCapable }
|
||
|
||
env.wgpuLoadComputePipeline = (wgslPtr, wgslLen, pushUniformSize,
|
||
bindingsPtr, bindingsCount, rayQueryFlag) => {
|
||
if (!rtState.vertHeap && rayQueryFlag) rtInit();
|
||
const userWgsl = new TextDecoder().decode(memU8().subarray(wgslPtr, wgslPtr + wgslLen));
|
||
const wgsl = rayQueryFlag
|
||
// rayQueryLib's _rqTraverseBlas/_rqTraverseTlas call _rtAabb,
|
||
// _rtFetchTri, _rtTri from rtWgslPureHelpers — must prepend
|
||
// the pure helper subset (NOT the megakernel-only traversal
|
||
// routines, which reference user-emitted runAnyHit/runMiss/
|
||
// runClosestHit and won't compile outside the raygen pipeline).
|
||
? (rtWgslTypes + rtWgslMegakernelBindings + rtWgslPureHelpers + rtWgslRayQueryLib + "\n" + userWgsl)
|
||
: userWgsl;
|
||
|
||
const bindings = [];
|
||
if (bindingsCount > 0) {
|
||
const dv = new DataView(memU8().buffer, bindingsPtr, bindingsCount * 8);
|
||
for (let i = 0; i < bindingsCount; i++) {
|
||
bindings.push({
|
||
group: dv.getUint8(i*8 + 0),
|
||
binding: dv.getUint8(i*8 + 1),
|
||
kind: dv.getUint8(i*8 + 2),
|
||
pushOffset: dv.getUint32(i*8 + 4, true),
|
||
});
|
||
}
|
||
}
|
||
|
||
// Bind-group layouts. The push uniform sits at @group(0) if present;
|
||
// user bindings start at @group(1) (or @group(2) when rayQuery is on
|
||
// — see rtWgslMegakernelBindings).
|
||
const bgls = [];
|
||
if (pushUniformSize > 0) {
|
||
bgls.push(device.createBindGroupLayout({ entries: [
|
||
{ binding: 0, visibility: GPUShaderStage.COMPUTE,
|
||
buffer: { type: "uniform", minBindingSize: pushUniformSize } },
|
||
]}));
|
||
} else {
|
||
bgls.push(device.createBindGroupLayout({ entries: [] }));
|
||
}
|
||
|
||
if (rayQueryFlag) {
|
||
// group(1) is the megakernel RT bindings — same layout as the RT
|
||
// pipeline's group(1). The injected rayQuery library reads it.
|
||
bgls.push(device.createBindGroupLayout({ entries: [
|
||
{ binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||
{ binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||
{ binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||
{ binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||
{ binding: 4, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||
{ binding: 5, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||
{ binding: 6, visibility: GPUShaderStage.COMPUTE,
|
||
storageTexture: { format: "rgba8unorm", access: "write-only", viewDimension: "2d" } },
|
||
{ binding: 7, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||
{ binding: 8, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||
{ binding: 9, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
|
||
]}));
|
||
}
|
||
|
||
const byGroup = new Map();
|
||
const userGroupBase = rayQueryFlag ? 2 : 1;
|
||
for (const b of bindings) {
|
||
if (b.group < userGroupBase) {
|
||
console.error(`[crafter-wgpu] compute pipeline: @group(${b.group}) reserved; user groups must be >= ${userGroupBase}`);
|
||
return 0;
|
||
}
|
||
if (!byGroup.has(b.group)) byGroup.set(b.group, []);
|
||
byGroup.get(b.group).push(b);
|
||
}
|
||
const sortedGroups = [...byGroup.keys()].sort((a, b) => a - b);
|
||
const highest = sortedGroups.length ? sortedGroups[sortedGroups.length - 1] : userGroupBase - 1;
|
||
for (let g = userGroupBase; g <= highest; g++) {
|
||
if (byGroup.has(g)) {
|
||
const entries = byGroup.get(g).map(b => {
|
||
const e = { binding: b.binding, visibility: GPUShaderStage.COMPUTE };
|
||
if (b.kind === 0) e.buffer = { type: "read-only-storage" };
|
||
else if (b.kind === 1) e.texture = { sampleType: "float", viewDimension: "2d" };
|
||
else if (b.kind === 2) e.sampler = { type: "filtering" };
|
||
else if (b.kind === 3) e.texture = { sampleType: "float", viewDimension: "2d-array" };
|
||
else if (b.kind === 4) e.buffer = { type: "storage" }; // read-write storage
|
||
return e;
|
||
});
|
||
bgls.push(device.createBindGroupLayout({ entries }));
|
||
} else {
|
||
bgls.push(device.createBindGroupLayout({ entries: [] }));
|
||
}
|
||
}
|
||
|
||
let pipeline;
|
||
try {
|
||
const mod = device.createShaderModule({ code: wgsl, label: "compute-pipeline" });
|
||
// Async compile-info to surface parse errors with line numbers.
|
||
mod.getCompilationInfo().then(info => {
|
||
const issues = info.messages.filter(m => m.type === "error" || m.type === "warning");
|
||
if (issues.length === 0) return;
|
||
const lines = wgsl.split("\n");
|
||
for (const m of issues) {
|
||
const ln = m.lineNum || 0, col = m.linePos || 0;
|
||
console[m.type === "error" ? "error" : "warn"](
|
||
`[crafter-wgpu] compute-pipeline ${m.type} at ${ln}:${col}: ${m.message}`);
|
||
for (let i = Math.max(1, ln - 3); i <= Math.min(lines.length, ln + 3); i++) {
|
||
const marker = i === ln ? ">> " : " ";
|
||
console.log(`${marker}${String(i).padStart(4)}: ${lines[i - 1]}`);
|
||
}
|
||
}
|
||
});
|
||
const layout = device.createPipelineLayout({ bindGroupLayouts: bgls });
|
||
pipeline = device.createComputePipeline({ layout, compute: { module: mod, entryPoint: "main" } });
|
||
} catch (e) {
|
||
console.error("[crafter-wgpu] compute pipeline compile failed:", e);
|
||
return 0;
|
||
}
|
||
|
||
const handle = newHandle();
|
||
computePipelines.set(handle, {
|
||
pipeline, bgls, byGroup, sortedGroups,
|
||
pushUniformSize, userGroupBase,
|
||
rayQueryCapable: !!rayQueryFlag,
|
||
});
|
||
return handle;
|
||
};
|
||
|
||
// Dispatch a standalone compute pipeline. Works both inside the per-frame
|
||
// UI compute pass (mid-frame physics tick) and outside it (game-tick
|
||
// dispatch from update lambda) — matching the wgpuBuildTLAS pattern.
|
||
//
|
||
// Push data: if the pipeline declared a push uniform (pushUniformSize > 0
|
||
// at load), `pushPtr` points at exactly that many bytes. We allocate a
|
||
// transient uniform buffer per dispatch — fine for the physics rate
|
||
// (handful of dispatches per substep, single-digit substeps per frame).
|
||
//
|
||
// Handles: parallel to the UICustomBinding[] declaration order at load
|
||
// time. Each handle resolves through the engine's buffer/texture/
|
||
// sampler tables to the live WebGPU resource.
|
||
env.wgpuDispatchCompute = (pipelineHandle, pushPtr, pushBytes,
|
||
handlesPtr, handlesCount, gx, gy, gz) => {
|
||
const pipe = computePipelines.get(pipelineHandle);
|
||
if (!pipe) {
|
||
console.error("[crafter-wgpu] wgpuDispatchCompute: unknown pipeline", pipelineHandle);
|
||
return;
|
||
}
|
||
|
||
// ── Push uniform: transient buffer per dispatch ────────────────────
|
||
let pushBG = null;
|
||
if (pipe.pushUniformSize > 0) {
|
||
const buf = device.createBuffer({
|
||
size: Math.max(16, (pipe.pushUniformSize + 15) & ~15),
|
||
usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
|
||
});
|
||
if (pushBytes > 0) {
|
||
queue.writeBuffer(buf, 0, memU8().subarray(pushPtr, pushPtr + pushBytes));
|
||
}
|
||
pushBG = device.createBindGroup({
|
||
layout: pipe.bgls[0],
|
||
entries: [{ binding: 0, resource: { buffer: buf, offset: 0, size: pipe.pushUniformSize } }],
|
||
});
|
||
} else {
|
||
pushBG = device.createBindGroup({ layout: pipe.bgls[0], entries: [] });
|
||
}
|
||
|
||
// ── RT bindings (rayQuery only) ────────────────────────────────────
|
||
let rtBG = null;
|
||
if (pipe.rayQueryCapable) {
|
||
const tlasBuf = buffers.get(rtState.currentTlas);
|
||
const orderBuf = buffers.get(rtState.currentEntryOrder);
|
||
const bvhBuf = buffers.get(rtState.currentBvh);
|
||
if (!tlasBuf || !orderBuf || !bvhBuf) {
|
||
console.error("[crafter-wgpu] wgpuDispatchCompute rayQuery: no TLAS built yet");
|
||
return;
|
||
}
|
||
const outView = state.outIsPing ? state.pingView : state.pongView;
|
||
rtBG = device.createBindGroup({
|
||
layout: pipe.bgls[1],
|
||
entries: [
|
||
{ binding: 0, resource: { buffer: tlasBuf } },
|
||
{ binding: 1, resource: { buffer: rtState.bvhHeap.gpu } },
|
||
{ binding: 2, resource: { buffer: rtState.meshRecordsBuffer } },
|
||
{ binding: 3, resource: { buffer: rtState.vertHeap.gpu } },
|
||
{ binding: 4, resource: { buffer: rtState.indexHeap.gpu } },
|
||
{ binding: 5, resource: { buffer: rtState.primRemapHeap.gpu } },
|
||
{ binding: 6, resource: outView },
|
||
{ binding: 7, resource: { buffer: rtState.attribsHeap.gpu } },
|
||
{ binding: 8, resource: { buffer: orderBuf } },
|
||
{ binding: 9, resource: { buffer: bvhBuf } },
|
||
],
|
||
});
|
||
}
|
||
|
||
// ── User bind groups from handles ──────────────────────────────────
|
||
const handleArr = new Uint32Array(memU8().buffer, handlesPtr, handlesCount);
|
||
const userBGs = [];
|
||
let handleCursor = 0;
|
||
const userGroupBase = pipe.userGroupBase;
|
||
const userBglStart = pipe.bgls.length - (pipe.sortedGroups.length
|
||
? (pipe.sortedGroups[pipe.sortedGroups.length - 1] - userGroupBase + 1)
|
||
: 0);
|
||
let bglIdx = userBglStart;
|
||
if (pipe.sortedGroups.length > 0) {
|
||
for (let g = userGroupBase; g <= pipe.sortedGroups[pipe.sortedGroups.length - 1]; g++) {
|
||
if (pipe.byGroup.has(g)) {
|
||
const entries = pipe.byGroup.get(g).map(b => {
|
||
const h = handleArr[handleCursor++];
|
||
let resource;
|
||
if (b.kind === 0 || b.kind === 4) resource = { buffer: buffers.get(h) };
|
||
else if (b.kind === 1) resource = textureViews.get(h);
|
||
else if (b.kind === 2) resource = samplers.get(h);
|
||
else if (b.kind === 3) resource = textureViews.get(h);
|
||
return { binding: b.binding, resource };
|
||
});
|
||
userBGs.push(device.createBindGroup({ layout: pipe.bgls[bglIdx++], entries }));
|
||
} else {
|
||
userBGs.push(device.createBindGroup({ layout: pipe.bgls[bglIdx++], entries: [] }));
|
||
}
|
||
}
|
||
}
|
||
|
||
// ── Execute: attach to active pass, or create a standalone one. ────
|
||
const runDispatch = (pass) => {
|
||
pass.setPipeline(pipe.pipeline);
|
||
pass.setBindGroup(0, pushBG);
|
||
if (rtBG) pass.setBindGroup(1, rtBG);
|
||
for (let i = 0; i < userBGs.length; i++) {
|
||
pass.setBindGroup(userGroupBase + i, userBGs[i]);
|
||
}
|
||
pass.dispatchWorkgroups(gx, gy, gz);
|
||
};
|
||
|
||
if (state.pass) {
|
||
// Mid-frame: dispatch into the current UI compute pass.
|
||
runDispatch(state.pass);
|
||
} else {
|
||
// Standalone: build encoder + pass + submit. Mirrors wgpuBuildTLAS.
|
||
const enc = state.encoder || device.createCommandEncoder();
|
||
const ownEncoder = !state.encoder;
|
||
const pass = enc.beginComputePass({ label: "plain-compute" });
|
||
runDispatch(pass);
|
||
pass.end();
|
||
if (ownEncoder) queue.submit([enc.finish()]);
|
||
}
|
||
};
|
||
|
||
console.log("[crafter-wgpu] init complete; env handlers wired");
|
||
|
||
// Memory diagnostic. Logs handle-table sizes every 5s so a slow leak
|
||
// shows up as monotonic growth. Comment out for production builds.
|
||
setInterval(() => {
|
||
const m = (performance.memory && performance.memory.usedJSHeapSize) || 0;
|
||
console.log(`[crafter-wgpu] mem: js=${(m / 1024 / 1024).toFixed(1)}MB`
|
||
+ ` buffers=${buffers.size}`
|
||
+ ` textures=${textures.size}`
|
||
+ ` samplers=${samplers.size}`
|
||
+ ` customPipelines=${customPipelines.size}`
|
||
+ ` rtPipelines=${rtPipelines.size}`
|
||
+ ` computePipelines=${computePipelines.size}`
|
||
+ ` bindGroupCache=${state.bindGroupCache ? state.bindGroupCache.size : 0}`);
|
||
}, 5000);
|
||
} catch (e) {
|
||
// Capture any throw so the stub error messages name the real cause
|
||
// instead of "(no error captured)". Re-throw so runtime.js's catch
|
||
// also logs it.
|
||
initError = e instanceof Error ? e : new Error(String(e));
|
||
console.error("[crafter-wgpu] init failed:", initError);
|
||
throw initError;
|
||
}
|
||
})(); // end window.crafter_webbuild_env_ready
|