Crafter.Graphics/additional/dom-webgpu.js
catbot afc0292fab WebGPU RT: dynamic TLAS sweep-tree depth (next_pow2 instances)
The LBVH bitonic sort still runs over the full 16384 (sentinels sink to
the tail), but the sweep tree is now built and traced at depth
log2(next_pow2(nReal)) instead of a fixed 14. Add nPadded to LbvhPC; leaf
init + bottom-up refit use it; the host passes the same next_pow2 to the
trace via WfParams.tlasNPadded. Renders correctly at 512 instances
(depth 9). The fragile sort phases are untouched.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-31 20:28:12 +00:00

3496 lines
154 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
Crafter.Graphics WebGPU bridge — DOM mode UI compute pipeline.
Surface model (high-level, deliberately not 1:1 with WebGPU):
- JS owns the GPUDevice/queue/compute pipelines/bind-group cache.
- C++ owns slot allocation and per-frame logic; it calls into ~15 imports.
- Standard UI shaders are embedded as WGSL strings at the bottom of this
file and compiled once at startup.
Ping-pong output strategy (per Decision 2 in plan):
- Two rgba8unorm storage textures sized to the canvas.
- Each Dispatch reads `prev` (sampled), writes `out` (storage, write-only).
- JS swaps the two between dispatches, so item-order overdraw works.
- At frame end, the current `out` is blitted to the canvas via
copyTextureToTexture (canvas configured as rgba8unorm to match).
This file is loaded as <script type="module">. Top-level await blocks
runtime.js's _start() until adapter + device are resolved, so by the time
main() runs every import here is fully wired.
*/
// ─── env stubs (assigned synchronously, BEFORE any async work) ────────────
// The wasm module's import-link step needs every declared wgpu* import to
// resolve to a Function. If init below throws, the stubs stay in place so
// the wasm still links — and the call site gets a clear error at runtime
// instead of "import object field X is not a Function" at link time.
window.crafter_webbuild_env = window.crafter_webbuild_env || {};
window.crafter_webbuild_env.table = window.crafter_webbuild_env.table
|| new WebAssembly.Table({ initial: 4, element: "anyfunc" });
let initError = null;
function stub(name) {
return (...args) => {
const msg = `[crafter-wgpu] ${name}() called but WebGPU init failed: ${initError?.message ?? "(no error captured)"}`;
console.error(msg);
throw new Error(msg);
};
}
{
const e = window.crafter_webbuild_env;
for (const n of [
"wgpuGetCanvasWidth", "wgpuGetCanvasHeight", "wgpuSurfaceWidth", "wgpuSurfaceHeight",
"wgpuInit", "wgpuCreateBuffer", "wgpuWriteBuffer", "wgpuWriteBufferRange",
"wgpuReadbackEnqueue", "wgpuReadbackPoll", "wgpuReadbackReady", "wgpuDestroyBuffer",
"wgpuCreateAtlasTexture", "wgpuWriteAtlasRegion", "wgpuDestroyTexture",
"wgpuCreateImage2D", "wgpuWriteImage2D",
"wgpuCreateImage2DArray", "wgpuWriteImage2DLayer",
"wgpuCreateLinearClampSampler", "wgpuCreateLinearRepeatSampler",
"wgpuFrameBegin", "wgpuFrameEnd",
"wgpuDispatchQuads", "wgpuDispatchCircles", "wgpuDispatchImages", "wgpuDispatchText",
"wgpuLoadCustomShader", "wgpuDispatchCustom",
"wgpuRegisterMeshBLAS", "wgpuLoadRTPipeline", "wgpuDispatchRT", "wgpuBuildTLAS",
"wgpuLoadComputePipeline", "wgpuDispatchCompute",
]) {
// Read-write ints don't need a stub-throw; return 0 for the size queries.
e[n] = n.endsWith("Width") || n.endsWith("Height")
? () => 0
: (n === "wgpuRegisterMeshBLAS" ? () => 0 : stub(n));
}
}
// ─── canvas + device init (runs before _start) ───────────────────────────
// Wrapped in an async IIFE assigned to window.crafter_webbuild_env_ready so
// the runtime.js shim can `await` it explicitly before calling _start().
// Sibling <script type="module"> top-level awaits are NOT reliably
// serialized in Firefox (verified 2026-05), so we can't depend on this
// file's TLA to block runtime.js by itself.
window.crafter_webbuild_env_ready = (async () => {
try {
if (!navigator.gpu) {
document.body.innerHTML = "<p style=\"font-family:sans-serif;padding:24px\">"
+ "WebGPU not available in this browser. Try Chrome 121+ / Firefox 141+ / Safari 26+.</p>";
initError = new Error("WebGPU unavailable");
throw initError;
}
const canvas = document.createElement("canvas");
canvas.id = "crafter-canvas";
canvas.style.cssText = "position:fixed;inset:0;width:100vw;height:100vh;display:block;";
document.body.style.margin = "0";
document.body.appendChild(canvas);
function syncCanvasSize() {
// Canvas pixel size = CSS size × devicePixelRatio so the GPU draws
// at physical pixel resolution on HiDPI displays — otherwise the
// browser upscales whatever we rendered at logical size and the
// result looks blurry. The CSS rule still pins display size at
// 100vw/100vh, so the canvas paints its physical buffer back down
// into logical pixels with no perceived layout change.
//
// Mouse events arrive in CSS pixels; dom-env.js multiplies them by
// window.crafter_dpr before dispatching so the wasm-side hit tests
// share the physical-pixel coordinate space with window.width/.height.
const dpr = window.devicePixelRatio || 1;
window.crafter_dpr = dpr;
const w = Math.max(1, Math.round(window.innerWidth * dpr));
const h = Math.max(1, Math.round(window.innerHeight * dpr));
if (canvas.width !== w) canvas.width = w;
if (canvas.height !== h) canvas.height = h;
return { w, h };
}
syncCanvasSize();
const adapter = await navigator.gpu.requestAdapter();
if (!adapter) {
initError = new Error("navigator.gpu.requestAdapter() returned null (no compatible adapter)");
console.error("[crafter-wgpu]", initError.message);
throw initError;
}
// Ask for everything the adapter is willing to give us, up to the values
// the RT pipeline actually needs. The megakernel prelude declares 7
// storage buffers at group(1) (tlasEntries / bvhNodes / meshRecords /
// vertices / indices / primRemap / vertexAttribs); user pipelines like
// 3DForts add more at group(2), and the WebGPU baseline of 8 isn't
// enough. Adapters routinely report 10+ — clamp our request to whatever
// the adapter actually supports so the call doesn't reject on baseline-
// only devices. Same pattern for storage textures (we use 1 output image
// per dispatch but headroom is cheap) and for the global storage-buffer
// pool which is the per-pipeline count's parent budget.
const adapterLimits = adapter.limits || {};
const requiredLimits = {};
const clamp = (name, want) => {
const cap = adapterLimits[name];
if (typeof cap === "number" && cap > 0) {
requiredLimits[name] = Math.min(want, cap);
}
};
clamp("maxStorageBuffersPerShaderStage", 16);
clamp("maxStorageBuffersInPipelineLayout", 16);
clamp("maxStorageTexturesPerShaderStage", 8);
// The TLAS BVH build runs one workgroup of up to N threads in shared
// memory (bitonic sort over morton codes + sweep-tree refit). Need the
// per-workgroup invocation cap raised from the default 256.
clamp("maxComputeInvocationsPerWorkgroup", 1024);
clamp("maxComputeWorkgroupSizeX", 1024);
// Wavefront RT work buffers are sized to W·H rays. At 1080p the payload
// store (≈245 MB) and hit buffer (≈214 MB) blow past the 128 MB baseline
// storage-buffer binding size, and the whole set past the 256 MB baseline
// maxBufferSize — request whatever the adapter actually allows (4090/Dawn
// reports 1 GB+). maxComputeWorkgroupsPerDimension bounds the indirect
// TRACE/SHADE 1-D dispatch (ceil(W·H/64) ≈ 32k workgroups at 1080p; the
// 65535 default covers it, but request the adapter max for headroom).
clamp("maxBufferSize", 1 << 30);
clamp("maxStorageBufferBindingSize", 1 << 30);
clamp("maxComputeWorkgroupsPerDimension", 65535);
// Per-pass GPU timing for the wavefront tracer (RTStress HUD / PR numbers).
const tsSupported = adapter.features && adapter.features.has("timestamp-query");
const requiredFeatures = tsSupported ? ["timestamp-query"] : [];
const device = await adapter.requestDevice({ requiredLimits, requiredFeatures });
const queue = device.queue;
const ctx = canvas.getContext("webgpu");
const canvasFormat = "rgba8unorm"; // match storage textures, skip swizzle blit
ctx.configure({ device, format: canvasFormat, alphaMode: "opaque",
usage: GPUTextureUsage.RENDER_ATTACHMENT | GPUTextureUsage.COPY_DST });
device.lost.then((info) => {
console.error("[crafter-wgpu] device lost:", info.message);
state.gpuLost = true;
});
device.addEventListener("uncapturederror", (e) => {
console.error("[crafter-wgpu] uncaptured error:", e.error && e.error.message);
});
// ─── handle tables ─────────────────────────────────────────────────────
const buffers = new Map(); // handle → GPUBuffer
const textures = new Map(); // handle → GPUTexture
const textureViews = new Map(); // handle → GPUTextureView (mirrors textures key for the view)
const samplers = new Map(); // handle → GPUSampler
let nextHandle = 1;
function newHandle() { return nextHandle++; }
// ─── ping-pong storage textures ────────────────────────────────────────
const state = {
pingTex: null, pingView: null,
pongTex: null, pongView: null,
outIsPing: true, // current "out" target
width: 0, height: 0,
encoder: null,
pass: null,
headerRing: null, // GPUBuffer; uniform header writes ring through this
headerRingSize: 0,
headerRingOffset: 0,
bindGroupCache: new Map(), // key → GPUBindGroup
gpuLost: false,
};
function recreatePingPong(w, h) {
const usage = GPUTextureUsage.STORAGE_BINDING
| GPUTextureUsage.TEXTURE_BINDING
| GPUTextureUsage.COPY_SRC
| GPUTextureUsage.COPY_DST; // COPY_DST so we can clear it
if (state.pingTex) state.pingTex.destroy();
if (state.pongTex) state.pongTex.destroy();
state.pingTex = device.createTexture({ size: [w, h], format: "rgba8unorm", usage });
state.pongTex = device.createTexture({ size: [w, h], format: "rgba8unorm", usage });
state.pingView = state.pingTex.createView();
state.pongView = state.pongTex.createView();
state.width = w; state.height = h;
state.outIsPing = true;
state.bindGroupCache.clear();
}
function ensureSized() {
const { w, h } = syncCanvasSize();
if (w !== state.width || h !== state.height) {
recreatePingPong(w, h);
// Notify the wasm side that the surface size changed so it can
// fire onResize through Window. The wasm export is added by
// Crafter.Graphics-Window.cpp.
const onResize = wasmExports && wasmExports.__crafterDom_resize;
if (onResize) onResize(1, w, h);
}
}
// Header ring buffer: 256-byte-aligned slots holding UIDispatchHeader (48
// bytes of meaningful data, padded to 256). Wraps at frame boundary.
const HEADER_ALIGN = 256;
const HEADER_RING_SLOTS = 64;
state.headerRingSize = HEADER_ALIGN * HEADER_RING_SLOTS;
state.headerRing = device.createBuffer({
size: state.headerRingSize,
usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
});
// ─── pipelines ─────────────────────────────────────────────────────────
const wgslShared = String.raw`
struct UIDispatchHeader {
outImage: u32,
itemBuffer: u32,
surfaceW: u32,
surfaceH: u32,
clipX: f32,
clipY: f32,
clipW: f32,
clipH: f32,
itemCount: u32,
frameIdx: u32,
flags: u32,
_pad: u32,
};
@group(0) @binding(0) var<uniform> hdr : UIDispatchHeader;
@group(1) @binding(0) var outTex : texture_storage_2d<rgba8unorm, write>;
@group(1) @binding(1) var prevTex : texture_2d<f32>;
fn uiResolvePixel(coord: vec2<u32>) -> bool {
if (coord.x >= hdr.surfaceW || coord.y >= hdr.surfaceH) { return false; }
let fx = f32(coord.x); let fy = f32(coord.y);
if (fx < hdr.clipX || fy < hdr.clipY) { return false; }
if (fx >= hdr.clipX + hdr.clipW) { return false; }
if (fy >= hdr.clipY + hdr.clipH) { return false; }
return true;
}
fn uiBlendOver(dst: vec4<f32>, src: vec4<f32>) -> vec4<f32> {
let a = clamp(src.a, 0.0, 1.0);
let rgb = mix(dst.rgb, src.rgb, vec3<f32>(a));
let outA = a + dst.a * (1.0 - a);
return vec4<f32>(rgb, outA);
}
fn uiSdRoundRect(p: vec2<f32>, halfSize: vec2<f32>, r4: vec4<f32>) -> f32 {
var r: vec4<f32> = r4;
// Pick radius for the quadrant p is in. r order: (TL, TR, BR, BL).
let rx = select(r.x, r.z, p.x > 0.0);
let ry = select(r.w, r.y, p.x > 0.0);
let radius = select(ry, rx, p.y > 0.0);
let q = abs(p) - halfSize + vec2<f32>(radius);
return min(max(q.x, q.y), 0.0) + length(max(q, vec2<f32>(0.0))) - radius;
}
`;
const wgslQuads = wgslShared + String.raw`
struct QuadItem {
rect: vec4<f32>,
color: vec4<f32>,
corners: vec4<f32>,
outline: vec4<f32>,
};
@group(2) @binding(0) var<storage, read> items : array<QuadItem>;
@compute @workgroup_size(8, 8, 1)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
if (!uiResolvePixel(gid.xy)) { return; }
let coord = vec2<i32>(i32(gid.x), i32(gid.y));
var dst = textureLoad(prevTex, coord, 0);
let sp = vec2<f32>(f32(gid.x) + 0.5, f32(gid.y) + 0.5);
for (var i: u32 = 0u; i < hdr.itemCount; i = i + 1u) {
let it = items[i];
let lo = it.rect.xy;
let hi = it.rect.xy + it.rect.zw;
if (sp.x < lo.x || sp.y < lo.y || sp.x >= hi.x || sp.y >= hi.y) { continue; }
let halfSize = it.rect.zw * 0.5;
let p = sp - (it.rect.xy + halfSize);
let d = uiSdRoundRect(p, halfSize, it.corners);
let bodyA = clamp(0.5 - d, 0.0, 1.0);
if (bodyA <= 0.0 && it.outline.x <= 0.0) { continue; }
var src = vec4<f32>(it.color.rgb, it.color.a * bodyA);
if (it.outline.x > 0.0) {
let t = abs(d + it.outline.x * 0.5) - it.outline.x * 0.5;
let outlineA = clamp(0.5 - t, 0.0, 1.0);
src = vec4<f32>(mix(src.rgb, it.outline.yzw, vec3<f32>(outlineA)),
max(src.a, outlineA));
}
if (src.a <= 0.0) { continue; }
dst = uiBlendOver(dst, src);
}
textureStore(outTex, coord, dst);
}
`;
const wgslCircles = wgslShared + String.raw`
struct CircleItem {
centerRadius: vec4<f32>,
color: vec4<f32>,
outline: vec4<f32>,
};
@group(2) @binding(0) var<storage, read> items : array<CircleItem>;
@compute @workgroup_size(8, 8, 1)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
if (!uiResolvePixel(gid.xy)) { return; }
let coord = vec2<i32>(i32(gid.x), i32(gid.y));
var dst = textureLoad(prevTex, coord, 0);
let sp = vec2<f32>(f32(gid.x) + 0.5, f32(gid.y) + 0.5);
for (var i: u32 = 0u; i < hdr.itemCount; i = i + 1u) {
let it = items[i];
let center = it.centerRadius.xy;
let radius = it.centerRadius.z;
let d = length(sp - center) - radius;
let bodyA = clamp(0.5 - d, 0.0, 1.0);
if (bodyA <= 0.0 && it.outline.x <= 0.0) { continue; }
var src = vec4<f32>(it.color.rgb, it.color.a * bodyA);
if (it.outline.x > 0.0) {
let t = abs(d + it.outline.x * 0.5) - it.outline.x * 0.5;
let outlineA = clamp(0.5 - t, 0.0, 1.0);
src = vec4<f32>(mix(src.rgb, it.outline.yzw, vec3<f32>(outlineA)),
max(src.a, outlineA));
}
if (src.a <= 0.0) { continue; }
dst = uiBlendOver(dst, src);
}
textureStore(outTex, coord, dst);
}
`;
const wgslImages = wgslShared + String.raw`
struct ImageItem {
rect: vec4<f32>,
uv: vec4<f32>,
tint: vec4<f32>,
slots: vec4<u32>,
};
@group(2) @binding(0) var<storage, read> items : array<ImageItem>;
@group(3) @binding(0) var imgTex : texture_2d<f32>;
@group(3) @binding(1) var imgSampler : sampler;
@compute @workgroup_size(8, 8, 1)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
if (!uiResolvePixel(gid.xy)) { return; }
let coord = vec2<i32>(i32(gid.x), i32(gid.y));
var dst = textureLoad(prevTex, coord, 0);
let sp = vec2<f32>(f32(gid.x) + 0.5, f32(gid.y) + 0.5);
for (var i: u32 = 0u; i < hdr.itemCount; i = i + 1u) {
let it = items[i];
let lo = it.rect.xy;
let hi = it.rect.xy + it.rect.zw;
if (sp.x < lo.x || sp.y < lo.y || sp.x >= hi.x || sp.y >= hi.y) { continue; }
let t = (sp - lo) / it.rect.zw;
let uv = vec2<f32>(mix(it.uv.x, it.uv.z, t.x), mix(it.uv.y, it.uv.w, t.y));
let sample = textureSampleLevel(imgTex, imgSampler, uv, 0.0);
let src = sample * it.tint;
if (src.a <= 0.0) { continue; }
dst = uiBlendOver(dst, src);
}
textureStore(outTex, coord, dst);
}
`;
const wgslText = wgslShared + String.raw`
struct GlyphItem {
rect: vec4<f32>,
uv: vec4<f32>,
color: vec4<f32>,
};
@group(2) @binding(0) var<storage, read> items : array<GlyphItem>;
@group(3) @binding(0) var atlasTex : texture_2d<f32>;
@group(3) @binding(1) var atlasSampler : sampler;
@compute @workgroup_size(8, 8, 1)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
if (!uiResolvePixel(gid.xy)) { return; }
let coord = vec2<i32>(i32(gid.x), i32(gid.y));
var dst = textureLoad(prevTex, coord, 0);
let sp = vec2<f32>(f32(gid.x) + 0.5, f32(gid.y) + 0.5);
for (var i: u32 = 0u; i < hdr.itemCount; i = i + 1u) {
let it = items[i];
let lo = it.rect.xy;
let hi = it.rect.xy + it.rect.zw;
if (sp.x < lo.x || sp.y < lo.y || sp.x >= hi.x || sp.y >= hi.y) { continue; }
let t = (sp - lo) / it.rect.zw;
let uv = vec2<f32>(mix(it.uv.x, it.uv.z, t.x), mix(it.uv.y, it.uv.w, t.y));
// stb_truetype SDF: pixel value ~128 is the edge. Treat alpha as
// the smoothed step around that midpoint.
let sdf = textureSampleLevel(atlasTex, atlasSampler, uv, 0.0).r;
let alpha = clamp((sdf - 0.5) * 8.0 + 0.5, 0.0, 1.0);
if (alpha <= 0.0) { continue; }
let src = vec4<f32>(it.color.rgb, it.color.a * alpha);
dst = uiBlendOver(dst, src);
}
textureStore(outTex, coord, dst);
}
`;
function makePipeline(label, wgsl, hasGroup3) {
const mod = device.createShaderModule({ label, code: wgsl });
// Layout: group 0 uniform header, group 1 (out storage + prev sampled),
// group 2 storage items SSBO, optional group 3 (texture + sampler).
const bgl0 = device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: "uniform", hasDynamicOffset: true, minBindingSize: 48 } },
]});
const bgl1 = device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE, storageTexture: { format: "rgba8unorm", access: "write-only", viewDimension: "2d" } },
{ binding: 1, visibility: GPUShaderStage.COMPUTE, texture: { sampleType: "float", viewDimension: "2d" } },
]});
const bgl2 = device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
]});
const layouts = [bgl0, bgl1, bgl2];
let bgl3 = null;
if (hasGroup3) {
bgl3 = device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE, texture: { sampleType: "float", viewDimension: "2d" } },
{ binding: 1, visibility: GPUShaderStage.COMPUTE, sampler: { type: "filtering" } },
]});
layouts.push(bgl3);
}
const pl = device.createPipelineLayout({ bindGroupLayouts: layouts });
const pipeline = device.createComputePipeline({
layout: pl,
compute: { module: mod, entryPoint: "main" },
});
return { pipeline, bgl0, bgl1, bgl2, bgl3 };
}
const pipeQuads = makePipeline("ui-quads", wgslQuads, false);
const pipeCircles = makePipeline("ui-circles", wgslCircles, false);
const pipeImages = makePipeline("ui-images", wgslImages, true);
const pipeText = makePipeline("ui-text", wgslText, true);
// Bind groups for group 0 (header uniform with dynamic offset) — one per
// pipeline, references the same ring buffer.
function makeHdrBG(pipe) {
return device.createBindGroup({
layout: pipe.bgl0,
entries: [{ binding: 0, resource: { buffer: state.headerRing, offset: 0, size: 48 } }],
});
}
const hdrBG = {
quads: makeHdrBG(pipeQuads),
circles: makeHdrBG(pipeCircles),
images: makeHdrBG(pipeImages),
text: makeHdrBG(pipeText),
};
// Group 1 changes between dispatches because `out` and `prev` swap on the
// ping-pong. Cached by current ping-pong direction and texture size; the
// stored bind group is reusable across all pipelines that share a
// layout-compatible bgl1 (all standard pipelines and custom shaders do,
// since they declare identical group-1 entries per the contract).
function getGroup1BG(bgl1) {
const key = `g1/${state.outIsPing ? 1 : 0}/${state.width}x${state.height}`;
let bg = state.bindGroupCache.get(key);
if (bg) return bg;
const outView = state.outIsPing ? state.pingView : state.pongView;
const prevView = state.outIsPing ? state.pongView : state.pingView;
bg = device.createBindGroup({
layout: bgl1,
entries: [
{ binding: 0, resource: outView },
{ binding: 1, resource: prevView },
],
});
state.bindGroupCache.set(key, bg);
return bg;
}
function getGroup2BG(pipe, itemsHandle) {
const key = `items/${pipe === pipeQuads ? "q" : pipe === pipeCircles ? "c" : pipe === pipeImages ? "i" : "t"}/${itemsHandle}`;
let bg = state.bindGroupCache.get(key);
if (bg) return bg;
const buf = buffers.get(itemsHandle);
if (!buf) throw new Error(`getGroup2BG: unknown items buffer ${itemsHandle}`);
bg = device.createBindGroup({
layout: pipe.bgl2,
entries: [{ binding: 0, resource: { buffer: buf } }],
});
state.bindGroupCache.set(key, bg);
return bg;
}
function getGroup3BG(pipe, texHandle, sampHandle) {
const key = `t3/${texHandle}/${sampHandle}/${pipe === pipeImages ? "i" : "x"}`;
let bg = state.bindGroupCache.get(key);
if (bg) return bg;
const tex = textureViews.get(texHandle);
const sam = samplers.get(sampHandle);
if (!tex || !sam) throw new Error(`getGroup3BG: unknown view ${texHandle} / sampler ${sampHandle}`);
bg = device.createBindGroup({
layout: pipe.bgl3,
entries: [
{ binding: 0, resource: tex },
{ binding: 1, resource: sam },
],
});
state.bindGroupCache.set(key, bg);
return bg;
}
// ─── wasm import surface ───────────────────────────────────────────────
let wasmExports = null;
// Crafter.Build's runtime.js exposes the wasi instance on
// window.crafter_wasi after instantiation. We grab the exports lazily so
// every import-side function works regardless of call order. memU8 /
// memF32 / memU32 always re-derive the typed-array view because the
// wasm memory's backing ArrayBuffer is detached and replaced whenever
// the wasm grows its memory; caching a typed array would alias to
// freed memory after a grow.
function getExports() {
if (wasmExports) return wasmExports;
const wasi = window.crafter_wasi;
if (!wasi || !wasi.instance) {
throw new Error("[crafter-wgpu] wasm exports not available yet (called too early)");
}
wasmExports = wasi.instance.exports;
return wasmExports;
}
function memU8() { return new Uint8Array(getExports().memory.buffer); }
function memF32() { return new Float32Array(getExports().memory.buffer); }
function memU32() { return new Uint32Array(getExports().memory.buffer); }
// Stubs were assigned at the top of this file; we now overwrite them with
// real implementations now that init has succeeded.
const env = window.crafter_webbuild_env;
env.wgpuGetCanvasWidth = () => canvas.width;
env.wgpuGetCanvasHeight = () => canvas.height;
env.wgpuCreateBuffer = (byteSize) => {
const h = newHandle();
const buf = device.createBuffer({
size: Math.max(16, byteSize),
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC,
});
buffers.set(h, buf);
return h;
};
env.wgpuWriteBuffer = (handle, srcPtr, byteSize) => {
state.writeBufferCount = (state.writeBufferCount || 0) + 1;
state.lastWriteHandle = handle;
state.lastWriteSize = byteSize;
const buf = buffers.get(handle);
if (!buf) return;
// writeBuffer requires a multiple of 4 bytes and an aligned offset.
const aligned = (byteSize + 3) & ~3;
queue.writeBuffer(buf, 0, memU8().buffer, srcPtr, aligned);
};
// Partial write — copies a sub-range of `srcPtr` into the GPU buffer at
// `dstByteOffset`. Used by BuildTLAS to skip the transform field of
// GPU-owned instances; the physics-tlas-transform compute shader is the
// sole writer of those bytes and we must not clobber its output with a
// stale CPU mirror.
env.wgpuWriteBufferRange = (handle, dstByteOffset, srcPtr, byteSize) => {
const buf = buffers.get(handle);
if (!buf) return;
const aligned = (byteSize + 3) & ~3;
queue.writeBuffer(buf, dstByteOffset, memU8().buffer, srcPtr, aligned);
};
// ── GPU→CPU readback (staging + mapAsync) ──────────────────────────────
//
// WebGPU storage buffers can't be CPU-mapped directly (STORAGE usage is
// incompatible with MAP_READ). Each readback keeps a parallel staging
// buffer (MAP_READ | COPY_DST) at the same size. wgpuReadbackEnqueue
// copies the storage buffer into the staging buffer and kicks off an
// async map. wgpuReadbackPoll synchronously returns whether the map has
// resolved; if so it copies the bytes into the caller's wasm pointer
// and the slot is ready for the next Enqueue.
//
// Used by Forts3D's physics event drain to read the GPU-written
// destroy/hit/splash event queues with a one-frame latency.
const READBACK_IDLE = 0;
const READBACK_PENDING = 1;
const READBACK_READY = 2;
const readbacks = new Map(); // device-buffer handle → { staging, size, state, pendingData }
// Readbacks scheduled this frame that still need their mapAsync kicked
// off — done after the frame's queue.submit so the map waits for the
// compute writes that wrote to `buf` to finish, not just the standalone
// copy encoder.
const pendingReadbackMaps = [];
env.wgpuReadbackEnqueue = (handle, byteSize, resetBytes) => {
const buf = buffers.get(handle);
if (!buf) return;
const aligned = (byteSize + 3) & ~3;
const resetAligned = resetBytes > 0 ? ((resetBytes + 3) & ~3) : 0;
let rb = readbacks.get(handle);
if (!rb) {
rb = {
staging: device.createBuffer({
size: Math.max(16, aligned),
usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST,
}),
size: aligned,
state: READBACK_IDLE,
pendingData: null,
};
readbacks.set(handle, rb);
}
if (rb.state !== READBACK_IDLE) {
// Previous map still in flight (or has data nobody polled yet);
// skip this enqueue AND the paired reset. Events written by the
// current frame's substeps will accumulate in `buf` and get
// picked up by the next successful enqueue (which captures the
// post-accumulation count and then clears it). Resetting here
// without copying would wipe those events before they could be
// drained, causing destroyed projectiles to never reach the CPU.
return;
}
if (state.encoder) {
// Mid-frame: piggy-back the copy on the frame's main encoder so
// it runs AFTER the compute dispatches that wrote `buf`. End the
// active pass, encode the copy (+ optional reset), reopen the
// pass. mapAsync gets deferred to wgpuFrameEnd (after
// queue.submit) — the map resolves once the GPU has finished
// the submitted work, including our just-encoded copy.
if (state.pass) {
state.pass.end();
state.pass = null;
}
state.encoder.copyBufferToBuffer(buf, 0, rb.staging, 0, aligned);
if (resetAligned > 0) {
// Clear the first `resetAligned` bytes of `buf` so the next
// frame's atomic-add starts at 0. Encoded after the copy,
// so the staging captures the pre-clear count. Tied to a
// successful enqueue — if the enqueue was skipped above the
// clear is skipped too, preserving events for the next
// successful drain.
state.encoder.clearBuffer(buf, 0, resetAligned);
}
state.pass = state.encoder.beginComputePass();
rb.state = READBACK_PENDING;
pendingReadbackMaps.push(rb);
} else {
// Standalone (no frame in progress): submit our own encoder and
// kick off mapAsync immediately.
const enc = device.createCommandEncoder();
enc.copyBufferToBuffer(buf, 0, rb.staging, 0, aligned);
if (resetAligned > 0) enc.clearBuffer(buf, 0, resetAligned);
queue.submit([enc.finish()]);
rb.state = READBACK_PENDING;
rb.staging.mapAsync(GPUMapMode.READ).then(() => {
rb.pendingData = new Uint8Array(rb.staging.getMappedRange()).slice();
rb.staging.unmap();
rb.state = READBACK_READY;
}).catch(e => {
console.error("[crafter-wgpu] readback mapAsync failed:", e);
rb.state = READBACK_IDLE;
});
}
};
// Returns 1 if the readback for `handle` has completed and bytes were
// copied into `dstPtr`; returns 0 otherwise (caller retries next frame).
// After a successful poll the slot is idle again, ready for the next
// Enqueue.
env.wgpuReadbackPoll = (handle, dstPtr, byteSize) => {
const rb = readbacks.get(handle);
if (!rb || rb.state !== READBACK_READY) return 0;
memU8().set(rb.pendingData.subarray(0, byteSize), dstPtr);
rb.pendingData = null;
rb.state = READBACK_IDLE;
return 1;
};
// Non-consuming readiness check. Lets callers verify multiple readbacks
// have all resolved before consuming any — needed when a logical drain
// reads parallel header/array buffers and mustn't ack the header until
// the array bytes are also available, or the events get lost.
env.wgpuReadbackReady = (handle) => {
const rb = readbacks.get(handle);
return (rb && rb.state === READBACK_READY) ? 1 : 0;
};
env.wgpuDestroyBuffer = (handle) => {
const buf = buffers.get(handle);
if (buf) { buf.destroy(); buffers.delete(handle); }
// Invalidate any cached bind group that referenced this handle.
for (const k of state.bindGroupCache.keys()) {
if (k.startsWith("items/") && k.endsWith("/" + handle)) {
state.bindGroupCache.delete(k);
}
}
};
env.wgpuCreateAtlasTexture = (w, h) => {
const handle = newHandle();
const tex = device.createTexture({
size: [w, h],
format: "r8unorm",
usage: GPUTextureUsage.TEXTURE_BINDING | GPUTextureUsage.COPY_DST,
});
textures.set(handle, tex);
textureViews.set(handle, tex.createView());
return handle;
};
env.wgpuWriteAtlasRegion = (handle, srcPtr, srcW, srcH, srcBytesPerRow, dstX, dstY, copyW, copyH) => {
const tex = textures.get(handle);
if (!tex) return;
// For r8unorm, 1 byte per pixel; writeTexture requires bytesPerRow >= 256
// OR == width if width*1 % 256 === 0 — for arbitrary widths we need to
// re-pack into a 256-aligned staging buffer.
const alignedBPR = Math.max(256, (srcBytesPerRow + 255) & ~255);
if (alignedBPR === srcBytesPerRow) {
const bytes = memU8().subarray(srcPtr + dstY * srcBytesPerRow + dstX,
srcPtr + (dstY + copyH) * srcBytesPerRow);
queue.writeTexture(
{ texture: tex, origin: { x: dstX, y: dstY } },
bytes,
{ bytesPerRow: srcBytesPerRow, rowsPerImage: copyH },
{ width: copyW, height: copyH }
);
} else {
// Repack copyW × copyH starting at (dstX, dstY) in the source.
const staging = new Uint8Array(alignedBPR * copyH);
const src = memU8();
for (let y = 0; y < copyH; y++) {
const srcRow = (dstY + y) * srcBytesPerRow + dstX;
staging.set(src.subarray(srcPtr + srcRow, srcPtr + srcRow + copyW),
y * alignedBPR);
}
queue.writeTexture(
{ texture: tex, origin: { x: dstX, y: dstY } },
staging,
{ bytesPerRow: alignedBPR, rowsPerImage: copyH },
{ width: copyW, height: copyH }
);
}
};
env.wgpuDestroyTexture = (handle) => {
const tex = textures.get(handle);
if (tex) { tex.destroy(); textures.delete(handle); textureViews.delete(handle); }
};
// General-purpose 2D rgba8unorm texture, used by Image2D<RGBA8>. Distinct
// from the atlas path (r8unorm, sub-region writes) — this one's a one-shot
// upload of a whole image, sized to the pixel data the caller hands over.
env.wgpuCreateImage2D = (w, h) => {
const handle = newHandle();
const tex = device.createTexture({
size: [w, h],
format: "rgba8unorm",
usage: GPUTextureUsage.TEXTURE_BINDING | GPUTextureUsage.COPY_DST,
});
textures.set(handle, tex);
textureViews.set(handle, tex.createView());
return handle;
};
// 2D texture array — N layers of identical (w × h) rgba8unorm with
// `mipLevels` mip levels. Pass mipLevels=1 for a single-level texture.
// Used by Image2DArray<RGBA8> to back one material albedo per layer;
// shaders sample with `textureSampleLevel(tex, samp, uv, layerIdx, lod)`
// where lod ∈ [0, mipLevels-1].
env.wgpuCreateImage2DArray = (w, h, layerCount, mipLevels) => {
const handle = newHandle();
const mips = (typeof mipLevels === "number" && mipLevels > 0) ? mipLevels : 1;
const tex = device.createTexture({
size: [w, h, layerCount],
dimension: "2d",
format: "rgba8unorm",
mipLevelCount: mips,
usage: GPUTextureUsage.TEXTURE_BINDING | GPUTextureUsage.COPY_DST,
});
textures.set(handle, tex);
textureViews.set(handle, tex.createView({
dimension: "2d-array",
arrayLayerCount: layerCount,
mipLevelCount: mips,
}));
return handle;
};
// Upload a single mip level of one array layer. `level` indexes into the
// texture's mip chain; `w` / `h` are the dimensions at that level (= base
// dimensions >> level). Caller supplies the pre-downsampled bytes for
// each level — Image2DArray::UpdateLayer on the C++ side does the box-
// filter chain.
env.wgpuWriteImage2DLayer = (handle, layer, level, srcPtr, byteSize, w, h) => {
const tex = textures.get(handle);
if (!tex) return;
const srcBPR = w * 4;
const alignedBPR = (srcBPR + 255) & ~255;
if (alignedBPR === srcBPR) {
queue.writeTexture(
{ texture: tex, mipLevel: level, origin: [0, 0, layer] },
memU8().subarray(srcPtr, srcPtr + byteSize),
{ bytesPerRow: srcBPR, rowsPerImage: h },
{ width: w, height: h, depthOrArrayLayers: 1 }
);
} else {
const staging = new Uint8Array(alignedBPR * h);
const src = memU8();
for (let y = 0; y < h; y++) {
staging.set(src.subarray(srcPtr + y * srcBPR, srcPtr + (y + 1) * srcBPR),
y * alignedBPR);
}
queue.writeTexture(
{ texture: tex, mipLevel: level, origin: [0, 0, layer] },
staging,
{ bytesPerRow: alignedBPR, rowsPerImage: h },
{ width: w, height: h, depthOrArrayLayers: 1 }
);
}
};
env.wgpuWriteImage2D = (handle, srcPtr, byteSize, w, h) => {
const tex = textures.get(handle);
if (!tex) return;
// queue.writeTexture wants bytesPerRow as a multiple of 256, OR == width*bpp
// when the source is contiguous. RGBA8 = 4 bpp, so bytesPerRow = w*4.
const srcBPR = w * 4;
const alignedBPR = (srcBPR + 255) & ~255;
if (alignedBPR === srcBPR) {
// Already aligned (w * 4 is a multiple of 256 → w is a multiple of 64).
queue.writeTexture(
{ texture: tex },
memU8().subarray(srcPtr, srcPtr + byteSize),
{ bytesPerRow: srcBPR, rowsPerImage: h },
{ width: w, height: h }
);
} else {
// Repack into a 256-aligned staging buffer. One alloc per Update,
// freed when the function returns — fine for asset-load time use.
const staging = new Uint8Array(alignedBPR * h);
const src = memU8();
for (let y = 0; y < h; y++) {
staging.set(src.subarray(srcPtr + y * srcBPR, srcPtr + (y + 1) * srcBPR),
y * alignedBPR);
}
queue.writeTexture(
{ texture: tex },
staging,
{ bytesPerRow: alignedBPR, rowsPerImage: h },
{ width: w, height: h }
);
}
};
env.wgpuCreateLinearClampSampler = () => {
const handle = newHandle();
samplers.set(handle, device.createSampler({
magFilter: "linear", minFilter: "linear",
addressModeU: "clamp-to-edge", addressModeV: "clamp-to-edge",
}));
return handle;
};
env.wgpuCreateLinearRepeatSampler = () => {
const handle = newHandle();
samplers.set(handle, device.createSampler({
magFilter: "linear", minFilter: "linear",
mipmapFilter: "linear",
addressModeU: "repeat", addressModeV: "repeat",
}));
return handle;
};
// ─── per-frame ──────────────────────────────────────────────────────────
env.wgpuFrameBegin = () => {
state.frameBeginCount = (state.frameBeginCount || 0) + 1;
if (state.gpuLost) return;
ensureSized();
state.encoder = device.createCommandEncoder();
state.outIsPing = true; // reset so each frame starts on the same target
state.headerRingOffset = 0;
// DON'T clearBuffer the header ring here. queue.writeBuffer ops from
// writeHeader() are enqueued BEFORE this command buffer's submit,
// so an encoded clearBuffer would wipe them — the dispatches would
// then read all-zero headers and uiResolvePixel would reject every
// pixel (surfaceW=0).
clearStorageTexture(state.encoder, state.outIsPing ? state.pingTex : state.pongTex,
state.width, state.height);
state.pass = state.encoder.beginComputePass();
};
let zeroBuffer = null;
let zeroBufferSize = 0;
function clearStorageTexture(encoder, tex, w, h) {
const bpr = (w * 4 + 255) & ~255;
const need = bpr * h;
if (!zeroBuffer || zeroBufferSize < need) {
if (zeroBuffer) zeroBuffer.destroy();
zeroBuffer = device.createBuffer({ size: need, usage: GPUBufferUsage.COPY_SRC, mappedAtCreation: true });
new Uint8Array(zeroBuffer.getMappedRange()).fill(0);
zeroBuffer.unmap();
zeroBufferSize = need;
}
encoder.copyBufferToTexture(
{ buffer: zeroBuffer, bytesPerRow: bpr, rowsPerImage: h },
{ texture: tex },
{ width: w, height: h, depthOrArrayLayers: 1 }
);
}
env.wgpuFrameEnd = () => {
state.frameEndCount = (state.frameEndCount || 0) + 1;
if (state.gpuLost || !state.encoder) return;
state.pass.end();
state.pass = null;
// Blit last-written ping-pong texture → canvas. After N dispatches,
// state.outIsPing points at the NEXT write target, so the latest
// content lives in the OPPOSITE texture.
const finalTex = state.outIsPing ? state.pongTex : state.pingTex;
const canvasTex = ctx.getCurrentTexture();
state.encoder.copyTextureToTexture(
{ texture: finalTex },
{ texture: canvasTex },
{ width: state.width, height: state.height, depthOrArrayLayers: 1 }
);
queue.submit([state.encoder.finish()]);
state.encoder = null;
// Map the wavefront timestamp readback (its resolve/copy was encoded on
// the just-submitted encoder) and log a per-pass breakdown ~1×/sec.
if (state.tsReadPending) {
const ts = state.tsReadPending;
state.tsReadPending = null;
const n = ts.pendingLabels.length;
ts.readBuf.mapAsync(GPUMapMode.READ, 0, 2 * n * 8).then(() => {
const data = new BigInt64Array(ts.readBuf.getMappedRange(0, 2 * n * 8).slice(0));
ts.readBuf.unmap();
ts.inFlight = false;
wfLogTimestamps(ts, data);
}).catch((e) => {
ts.inFlight = false;
console.error("[crafter-wgpu] timestamp readback failed:", e);
});
}
// Kick off mapAsync for the readbacks whose copyBufferToBuffer we
// piggy-backed onto the just-submitted encoder. Doing this after
// submit ensures the map waits for that submission's GPU work to
// complete, so the staging buffer reflects this frame's compute
// writes (not pre-substep state).
while (pendingReadbackMaps.length > 0) {
const rb = pendingReadbackMaps.pop();
rb.staging.mapAsync(GPUMapMode.READ).then(() => {
rb.pendingData = new Uint8Array(rb.staging.getMappedRange()).slice();
rb.staging.unmap();
rb.state = READBACK_READY;
}).catch(e => {
console.error("[crafter-wgpu] readback mapAsync failed:", e);
rb.state = READBACK_IDLE;
});
}
};
// Write a push struct into the ring buffer at the current offset (which
// is incremented and 256-aligned). Standard dispatches pass just the
// 48-byte UIDispatchHeader; custom shaders may pass up to HEADER_ALIGN
// bytes — anything past the header is the user's per-dispatch struct,
// declared in WGSL as additional fields after UIDispatchHeader at
// @group(0) @binding(0). Returns the dynamic offset to pass to
// setBindGroup.
function writeHeader(headerPtr, bytes = 48) {
const upload = Math.min(bytes, HEADER_ALIGN);
const offset = state.headerRingOffset;
if (offset + HEADER_ALIGN > state.headerRingSize) {
// Ring is small enough that overrun in one frame means too many
// dispatches. Soft-wrap; correctness already requires the ring
// be large enough.
state.headerRingOffset = 0;
}
queue.writeBuffer(state.headerRing, state.headerRingOffset,
memU8().buffer, headerPtr, upload);
state.headerRingOffset += HEADER_ALIGN;
return offset;
}
function dispatchStandard(pipe, hdrBindGroup, headerPtr, gx, gy, itemsHandle, group3) {
if (!state.pass) return;
const off = writeHeader(headerPtr);
state.pass.setPipeline(pipe.pipeline);
state.pass.setBindGroup(0, hdrBindGroup, [off]);
state.pass.setBindGroup(1, getGroup1BG(pipe.bgl1));
state.pass.setBindGroup(2, getGroup2BG(pipe, itemsHandle));
if (group3) state.pass.setBindGroup(3, group3);
state.pass.dispatchWorkgroups(gx, gy, 1);
// Flip ping-pong: the texture we just wrote becomes next dispatch's prev.
state.outIsPing = !state.outIsPing;
}
env.wgpuDispatchQuads = (itemsHandle, headerPtr, gx, gy) => {
state.dispatchQuadsCount = (state.dispatchQuadsCount || 0) + 1;
dispatchStandard(pipeQuads, hdrBG.quads, headerPtr, gx, gy, itemsHandle, null);
};
env.wgpuDispatchCircles = (itemsHandle, headerPtr, gx, gy) => {
dispatchStandard(pipeCircles, hdrBG.circles, headerPtr, gx, gy, itemsHandle, null);
};
env.wgpuDispatchImages = (itemsHandle, headerPtr, gx, gy, texHandle, sampHandle) => {
const g3 = getGroup3BG(pipeImages, texHandle, sampHandle);
dispatchStandard(pipeImages, hdrBG.images, headerPtr, gx, gy, itemsHandle, g3);
};
env.wgpuDispatchText = (itemsHandle, headerPtr, gx, gy, atlasHandle, sampHandle) => {
const g3 = getGroup3BG(pipeText, atlasHandle, sampHandle);
dispatchStandard(pipeText, hdrBG.text, headerPtr, gx, gy, itemsHandle, g3);
};
// ─── custom user-authored shaders ─────────────────────────────────────
//
// Bind-group contract (mirrors :WebGPUComputeShader.cppm):
// group 0 binding 0 — uniform UIDispatchHeader (dynamic offset, 48b)
// group 1 binding 0 — texture_storage_2d<rgba8unorm, write> out
// group 1 binding 1 — texture_2d<f32> prev
// group 2+ — user-declared (UICustomBinding entries)
//
// Each UICustomBinding entry on the wasm side is 8 bytes:
// u8 group, u8 binding, u8 kind, u8 pad, u32 pushOffset
// kind: 0 = read-only-storage SSBO, 1 = sampled tex 2d, 2 = filtering sampler.
const customPipelines = new Map(); // handle → { pipeline, bgls, hdrBG, byGroup }
env.wgpuLoadCustomShader = (wgslPtr, wgslLen, bindingsPtr, bindingsCount, rayQueryFlag) => {
if (!rtState.vertHeap && rayQueryFlag) rtInit();
const userWgsl = new TextDecoder().decode(memU8().subarray(wgslPtr, wgslPtr + wgslLen));
// For rayQuery-capable shaders, prepend the RT prelude + ray-query
// library. The user shader can declare its own group 0 / 2+ bindings
// but MUST NOT redeclare group(1) — that's reserved for RT data.
const wgsl = rayQueryFlag
// rayQueryLib's _rqTraverseBlas/_rqTraverseTlas call _rtAabb,
// _rtFetchTri, _rtTri from rtWgslPureHelpers — must prepend
// the pure helper subset (NOT the megakernel-only traversal
// routines, which reference user-emitted runAnyHit/runMiss/
// runClosestHit and won't compile outside the raygen pipeline).
? (rtWgslTypes + rtWgslMegakernelBindings + rtWgslPureHelpers + rtWgslRayQueryLib + "\n" + userWgsl)
: userWgsl;
const bindings = [];
const dv = new DataView(memU8().buffer, bindingsPtr, bindingsCount * 8);
for (let i = 0; i < bindingsCount; i++) {
bindings.push({
group: dv.getUint8(i*8 + 0),
binding: dv.getUint8(i*8 + 1),
kind: dv.getUint8(i*8 + 2),
pushOffset: dv.getUint32(i*8 + 4, true),
});
}
// Group bindings by @group(N) for layout creation.
const byGroup = new Map();
for (const b of bindings) {
if (b.group < 2) {
console.error(`[crafter-wgpu] custom shader: @group(${b.group}) reserved; use groups >= 2`);
return 0;
}
if (!byGroup.has(b.group)) byGroup.set(b.group, []);
byGroup.get(b.group).push(b);
}
// Group 0 = header uniform (same for both paths).
// Group 1 = ping-pong out+prev OR RT data (TLAS, BVH, meshRecs, verts,
// idx, primRemap, outImage) when rayQuery flag is on.
//
// Custom shaders may declare a struct at @group(0) @binding(0) up to
// HEADER_ALIGN (256) bytes — the standard UIDispatchHeader is 48 but
// the wgpuDispatchCustom path uploads the full push buffer so user
// shaders can read extra fields past the header. minBindingSize=0
// disables the bound-size lower bound; the actual entry is sized to
// HEADER_ALIGN at bind time.
const bgls = [
device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE,
buffer: { type: "uniform", hasDynamicOffset: true, minBindingSize: 0 } },
]}),
rayQueryFlag
? device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 4, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 5, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 6, visibility: GPUShaderStage.COMPUTE,
storageTexture: { format: "rgba8unorm", access: "write-only", viewDimension: "2d" } },
{ binding: 7, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 8, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 9, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
]})
: device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE,
storageTexture: { format: "rgba8unorm", access: "write-only", viewDimension: "2d" } },
{ binding: 1, visibility: GPUShaderStage.COMPUTE,
texture: { sampleType: "float", viewDimension: "2d" } },
]}),
];
// Sorted custom groups. Pad any gaps with empty bgls (WebGPU pipeline
// layouts require a contiguous array of GPUBindGroupLayout per group
// index up to the highest used).
const sortedGroups = [...byGroup.keys()].sort((a, b) => a - b);
const highest = sortedGroups.length ? sortedGroups[sortedGroups.length - 1] : 1;
for (let g = 2; g <= highest; g++) {
if (byGroup.has(g)) {
const entries = byGroup.get(g).map(b => {
const e = { binding: b.binding, visibility: GPUShaderStage.COMPUTE };
if (b.kind === 0) e.buffer = { type: "read-only-storage" };
else if (b.kind === 1) e.texture = { sampleType: "float", viewDimension: "2d" };
else if (b.kind === 2) e.sampler = { type: "filtering" };
else if (b.kind === 3) e.texture = { sampleType: "float", viewDimension: "2d-array" };
return e;
});
bgls.push(device.createBindGroupLayout({ entries }));
} else {
bgls.push(device.createBindGroupLayout({ entries: [] }));
}
}
let pipeline;
try {
const mod = device.createShaderModule({ code: wgsl });
const layout = device.createPipelineLayout({ bindGroupLayouts: bgls });
pipeline = device.createComputePipeline({ layout, compute: { module: mod, entryPoint: "main" } });
} catch (e) {
console.error("[crafter-wgpu] custom shader compile failed:", e);
if (rayQueryFlag) console.error("[crafter-wgpu] WGSL was:\n", wgsl);
return 0;
}
// Bind the full ring slot (HEADER_ALIGN bytes) so custom shaders can
// declare a struct at @group(0) @binding(0) that's larger than the
// 48-byte UIDispatchHeader. Standard pipelines keep their tight 48-
// byte slice. The bgl0 layout's minBindingSize stays at 48 — bigger
// is allowed.
const hdrBG = device.createBindGroup({
layout: bgls[0],
entries: [{ binding: 0, resource: { buffer: state.headerRing, offset: 0, size: HEADER_ALIGN } }],
});
const handle = newHandle();
customPipelines.set(handle, { pipeline, bgls, hdrBG, byGroup, sortedGroups, rayQueryCapable: !!rayQueryFlag });
return handle;
};
env.wgpuDispatchCustom = (pipelineHandle, pushPtr, pushBytes, handlesPtr, handlesCount,
gx, gy, gz) => {
state.dispatchCustomCount = (state.dispatchCustomCount || 0) + 1;
if (!state.pass) return;
const pipe = customPipelines.get(pipelineHandle);
if (!pipe) {
console.error("[crafter-wgpu] wgpuDispatchCustom: unknown pipeline", pipelineHandle);
return;
}
// Write the full push struct (UIDispatchHeader + any custom tail
// fields the shader declares) into the ring. WGSL's bound struct
// size decides how many bytes are actually read on the GPU.
const off = writeHeader(pushPtr, pushBytes);
state.pass.setPipeline(pipe.pipeline);
state.pass.setBindGroup(0, pipe.hdrBG, [off]);
// Group 1: rayQuery-capable shaders get the RT data heaps + the most
// recently built TLAS; everyone else gets the standard ping-pong pair.
if (pipe.rayQueryCapable) {
const tlasBuf = buffers.get(rtState.currentTlas);
if (!tlasBuf) {
console.error("[crafter-wgpu] rayQuery dispatch but no TLAS built yet");
return;
}
const outView = state.outIsPing ? state.pingView : state.pongView;
const orderBuf = buffers.get(rtState.currentEntryOrder);
const bvhBuf = buffers.get(rtState.currentBvh);
if (!orderBuf || !bvhBuf) {
console.error("[crafter-wgpu] wgpuDispatchCustom rayQuery: no entryOrder/bins (TLAS not built)");
return;
}
const rtBG = device.createBindGroup({
layout: pipe.bgls[1],
entries: [
{ binding: 0, resource: { buffer: tlasBuf } },
{ binding: 1, resource: { buffer: rtState.bvhHeap.gpu } },
{ binding: 2, resource: { buffer: rtState.meshRecordsBuffer } },
{ binding: 3, resource: { buffer: rtState.vertHeap.gpu } },
{ binding: 4, resource: { buffer: rtState.indexHeap.gpu } },
{ binding: 5, resource: { buffer: rtState.primRemapHeap.gpu } },
{ binding: 6, resource: outView },
{ binding: 7, resource: { buffer: rtState.attribsHeap.gpu } },
{ binding: 8, resource: { buffer: orderBuf } },
{ binding: 9, resource: { buffer: bvhBuf } },
],
});
state.pass.setBindGroup(1, rtBG);
} else {
state.pass.setBindGroup(1, getGroup1BG(pipe.bgls[1]));
}
// Walk bindings in declaration order and assemble bind groups.
// handles[] from wasm is in the SAME order as customBindings, so we
// pick up indices by walking byGroup in the same sorted order.
const handles = new Uint32Array(memU8().buffer, handlesPtr, handlesCount);
let handleIdx = 0;
for (const g of pipe.sortedGroups) {
const entries = pipe.byGroup.get(g).map(b => {
const h = handles[handleIdx++];
let resource;
if (b.kind === 0) resource = { buffer: buffers.get(h) };
else if (b.kind === 1) resource = textureViews.get(h);
else if (b.kind === 2) resource = samplers.get(h);
else if (b.kind === 3) resource = textureViews.get(h);
return { binding: b.binding, resource };
});
const bg = device.createBindGroup({ layout: pipe.bgls[g], entries });
state.pass.setBindGroup(g, bg);
}
state.pass.dispatchWorkgroups(gx, gy, gz);
state.outIsPing = !state.outIsPing;
};
// Debug accessor for browser-console diagnostics.
window.crafter_wgpu_state = state;
window.crafter_wgpu_device = device;
window.crafter_wgpu_canvasCtx = ctx;
window.crafter_wgpu_debug = () => ({
width: state.width, height: state.height,
outIsPing: state.outIsPing,
encoderActive: !!state.encoder,
passActive: !!state.pass,
bgCacheSize: state.bindGroupCache.size,
bufferHandles: buffers.size,
textureHandles: textures.size,
samplerHandles: samplers.size,
headerRingOffset: state.headerRingOffset,
frameBeginCount: state.frameBeginCount || 0,
frameEndCount: state.frameEndCount || 0,
dispatchQuadsCount: state.dispatchQuadsCount || 0,
writeBufferCount: state.writeBufferCount || 0,
lastWriteHandle: state.lastWriteHandle,
lastWriteSize: state.lastWriteSize,
});
window.crafter_wgpu_bufferKeys = () => [...buffers.keys()];
// Read back the first QuadItem from a registered buffer to verify the
// GPU sees what the CPU wrote.
window.crafter_wgpu_readBuffer = async (handle, byteSize = 64) => {
const buf = buffers.get(handle);
if (!buf) return "no buffer for handle " + handle;
const read = device.createBuffer({ size: 256, usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST });
const enc = device.createCommandEncoder();
enc.copyBufferToBuffer(buf, 0, read, 0, byteSize);
device.queue.submit([enc.finish()]);
await read.mapAsync(GPUMapMode.READ);
const data = new Float32Array(read.getMappedRange().slice(0, byteSize));
read.unmap();
return [...data];
};
// Surface size getters (the wasm side may query these on Resize events).
env.wgpuSurfaceWidth = () => state.width || canvas.width;
env.wgpuSurfaceHeight = () => state.height || canvas.height;
// One-shot init: forces ping-pong allocation at current canvas size so
// any Buffer/Texture creation before the first frame works against a
// concrete size. Called by Crafter::Device::Initialize on the wasm side.
env.wgpuInit = () => {
const { w, h } = syncCanvasSize();
recreatePingPong(w, h);
};
// Resize listener — wires up to the same `resize` event dom-env.js
// listens to. We trigger sizing on next frame begin; no work here.
window.addEventListener("resize", () => { /* ensureSized in wgpuFrameBegin */ });
// ─────────────────────────────────────────────────────────────────────
// ── Software raytracing subsystem ────────────────────────────────────
// ─────────────────────────────────────────────────────────────────────
//
// WebGPU has no hardware RT. The library emulates DXR/VKRT semantics in
// compute: a megakernel raygen pipeline traverses a CPU-built BLAS BVH +
// GPU-built TLAS, dispatches user closesthit / anyhit / miss via a
// generated `switch`. The same traversal kernel is also exposed as a
// rayQuery* function set for regular compute shaders (see the
// rayQueryCapable path under wgpuLoadCustomShader).
//
// The four mesh data streams live in *shared* GPU heaps; each Mesh::Build
// appends to them and gets a u32 handle back. The handle is what the
// application stores in RTInstance::accelerationStructureReference.
// ── WGSL library: shared types + constants (no bindings) ─────────────
// Used by both the megakernel pipeline (which adds group(0..1) bindings)
// and the TLAS-build pipeline (which only uses group(2)). Keeping bindings
// out of the shared block avoids inflating storage-buffer count past the
// 8-per-stage baseline limit on pipelines that don't actually use them.
const rtWgslTypes = String.raw`
struct RTDispatchHeader {
surfaceW: u32,
surfaceH: u32,
instanceCount: u32,
flags: u32,
};
struct RayDesc {
origin: vec3<f32>,
tMin: f32,
direction: vec3<f32>,
tMax: f32,
};
struct HitInfo {
t: f32,
instanceId: u32,
primitiveId: u32,
hitGroupIndex: u32,
attribs: vec2<f32>,
objectRayOrigin: vec3<f32>,
objectRayDirection: vec3<f32>,
objectToWorldR0: vec4<f32>,
objectToWorldR1: vec4<f32>,
objectToWorldR2: vec4<f32>,
customIndex: u32,
};
// Matches Crafter::BVHNode in interfaces/Crafter.Graphics-Mesh.cppm.
struct BVHNode {
aabbMin: vec3<f32>,
firstChildOrPrim: u32,
aabbMax: vec3<f32>,
primCount: u32,
};
// Per-mesh record. Indexed by RTInstance::accelerationStructureReference.
// attribsOffset is the per-mesh base index (in u32 words) into the
// vertexAttribs heap; meshes registered without per-vertex attribs leave
// it 0 (the heap entries at that range are also 0 / never touched). The
// per-vertex stride lives in the user's WGSL — the library doesn't store
// it because the layout is example-defined (Sponza uses 8 u32 / vertex
// for VertexNormalTangentUVPacked).
struct MeshRecord {
rootAabbMin: vec3<f32>,
vertexOffset: u32,
rootAabbMax: vec3<f32>,
indexOffset: u32,
bvhOffset: u32,
primRemapOffset: u32,
triangleCount: u32,
attribsOffset: u32,
};
// Per-instance TLAS record built by the TLAS-build compute pass.
struct TLASEntry {
aabbMin: vec3<f32>,
maskHGOffset: u32,
aabbMax: vec3<f32>,
blasMeshIdx: u32,
objectToWorldR0: vec4<f32>,
objectToWorldR1: vec4<f32>,
objectToWorldR2: vec4<f32>,
worldToObjectR0: vec4<f32>,
worldToObjectR1: vec4<f32>,
worldToObjectR2: vec4<f32>,
customIndex: u32,
instanceFlags: u32,
_pad0: u32,
_pad1: u32,
};
// ── Ray flag mirror of VkGeometryInstanceFlagBitsKHR + DXR ray flags ──
const RT_FLAG_OPAQUE: u32 = 0x1u;
const RT_FLAG_NO_OPAQUE: u32 = 0x2u;
const RT_FLAG_TERMINATE_ON_FIRST_HIT: u32 = 0x4u;
const RT_FLAG_SKIP_CLOSEST_HIT: u32 = 0x8u;
const RT_FLAG_CULL_BACK_FACING_TRIANGLES: u32 = 0x10u;
const RT_FLAG_CULL_FRONT_FACING_TRIANGLES: u32 = 0x20u;
const RT_FLAG_CULL_OPAQUE: u32 = 0x40u;
const RT_FLAG_CULL_NO_OPAQUE: u32 = 0x80u;
const RT_FLAG_SKIP_TRIANGLES: u32 = 0x100u;
const RT_FLAG_SKIP_AABBS: u32 = 0x200u;
const RT_INSTANCE_TRIANGLE_FACING_CULL_DISABLE: u32 = 0x1u;
const RT_INSTANCE_TRIANGLE_FLIP_FACING: u32 = 0x2u;
const RT_INSTANCE_FORCE_OPAQUE: u32 = 0x4u;
const RT_INSTANCE_FORCE_NO_OPAQUE: u32 = 0x8u;
const RT_ANYHIT_ACCEPT: u32 = 0u;
const RT_ANYHIT_IGNORE: u32 = 1u;
const RT_ANYHIT_END_SEARCH: u32 = 2u;
const RT_INTERSECTION_NONE: u32 = 0u;
const RT_INTERSECTION_TRIANGLE: u32 = 1u;
`;
// Megakernel-only bindings. Concatenated after rtWgslTypes for the
// raygen pipeline; the TLAS-build pipeline omits these because it doesn't
// touch them — declaring them would push it past 8 storage buffers per
// stage on the WebGPU baseline.
const rtWgslMegakernelBindings = String.raw`
@group(0) @binding(0) var<uniform> hdr : RTDispatchHeader;
@group(1) @binding(0) var<storage,read> tlasEntries : array<TLASEntry>;
@group(1) @binding(1) var<storage,read> bvhNodes : array<BVHNode>;
@group(1) @binding(2) var<storage,read> meshRecords : array<MeshRecord>;
@group(1) @binding(3) var<storage,read> vertices : array<f32>;
@group(1) @binding(4) var<storage,read> indices : array<u32>;
@group(1) @binding(5) var<storage,read> primRemap : array<u32>;
@group(1) @binding(6) var outImage : texture_storage_2d<rgba8unorm, write>;
@group(1) @binding(7) var<storage,read> vertexAttribs : array<u32>;
// TLAS Morton-sorted permutation: tlasEntryOrder[i] gives the
// tlasEntries[] index that BVH leaf i should sample.
@group(1) @binding(8) var<storage,read> tlasEntryOrder : array<u32>;
// Sweep-tree BVH built by the LBVH-build pass. 2 * N_PADDED - 1
// nodes = 2047 for N_PADDED = 1024. Internal nodes at [0, N_PADDED - 1);
// leaves at [N_PADDED - 1, 2 * N_PADDED - 1). For internal node i,
// children are 2i+1 and 2i+2 (implicit perfect binary tree). Each node
// stores just its world-space AABB.
struct BvhNode {
aabbMin: vec3<f32>,
_pad0: u32,
aabbMax: vec3<f32>,
_pad1: u32,
};
@group(1) @binding(9) var<storage,read> tlasBvhNodes : array<BvhNode>;
const TLAS_BVH_N_PADDED: u32 = 16384u;
const TLAS_BVH_LEAVES_START: u32 = TLAS_BVH_N_PADDED - 1u;
`;
// ── WGSL library: helpers + traverseBlas + traverseTlas + traceRay ───
// Injected after the user-supplied closesthit/anyhit/miss sources +
// mega-switch dispatchers (which PipelineRTWebGPU emits). User raygen
// sources sit after this block so they can call traceRay.
// The "pure" subset of the RT helpers — no calls into runAnyHit /
// runClosestHit / traceRay, so this can be prepended ahead of compute
// pipelines using rayQuery without dragging in megakernel-only symbols.
const rtWgslPureHelpers = String.raw`
fn _rtFetchTri(meshRec: MeshRecord, triIndex: u32) -> array<vec3<f32>, 3> {
let baseIdx = meshRec.indexOffset + triIndex * 3u;
let i0 = indices[baseIdx + 0u];
let i1 = indices[baseIdx + 1u];
let i2 = indices[baseIdx + 2u];
let baseV = meshRec.vertexOffset;
let v0i = (baseV + i0) * 3u;
let v1i = (baseV + i1) * 3u;
let v2i = (baseV + i2) * 3u;
return array<vec3<f32>, 3>(
vec3<f32>(vertices[v0i + 0u], vertices[v0i + 1u], vertices[v0i + 2u]),
vec3<f32>(vertices[v1i + 0u], vertices[v1i + 1u], vertices[v1i + 2u]),
vec3<f32>(vertices[v2i + 0u], vertices[v2i + 1u], vertices[v2i + 2u]),
);
}
fn _rtAabb(ro: vec3<f32>, invRd: vec3<f32>, mn: vec3<f32>, mx: vec3<f32>, tMax: f32) -> bool {
// Reject degenerate (mn > mx) boxes outright. The min(t0,t1)/
// max(t0,t1) trick below silently re-orients an inverted box
// and would otherwise return true for sentinel-padded BVH leaves
// — letting rays "hit" empty slots and accidentally re-traverse
// instance 0 via the OOB outOrder → tlasEntries[0] path.
if (any(mn > mx)) { return false; }
let t0 = (mn - ro) * invRd;
let t1 = (mx - ro) * invRd;
let tmin = min(t0, t1);
let tmax = max(t0, t1);
let tEnter = max(max(tmin.x, tmin.y), tmin.z);
let tExit = min(min(tmax.x, tmax.y), tmax.z);
return tExit >= max(tEnter, 0.0) && tEnter <= tMax;
}
// AABB test that also returns the (clamped) entry distance, for ordered
// nearest-child-first traversal. t is meaningless when hit == false.
struct _RtAabbHit { hit: bool, t: f32 };
fn _rtAabbT(ro: vec3<f32>, invRd: vec3<f32>, mn: vec3<f32>, mx: vec3<f32>, tMax: f32) -> _RtAabbHit {
var r: _RtAabbHit;
r.hit = false;
r.t = 0.0;
if (any(mn > mx)) { return r; }
let t0 = (mn - ro) * invRd;
let t1 = (mx - ro) * invRd;
let tmin = min(t0, t1);
let tmax = max(t0, t1);
let tEnter = max(max(tmin.x, tmin.y), tmin.z);
let tExit = min(min(tmax.x, tmax.y), tmax.z);
if (tExit >= max(tEnter, 0.0) && tEnter <= tMax) {
r.hit = true;
r.t = max(tEnter, 0.0);
}
return r;
}
struct _RtTriHit { hit: bool, t: f32, u: f32, v: f32 };
fn _rtTri(ro: vec3<f32>, rd: vec3<f32>, p0: vec3<f32>, p1: vec3<f32>, p2: vec3<f32>,
tMin: f32, tMax: f32) -> _RtTriHit {
var r: _RtTriHit;
r.hit = false;
let e1 = p1 - p0;
let e2 = p2 - p0;
let h = cross(rd, e2);
let a = dot(e1, h);
if (abs(a) < 1e-8) { return r; }
let f = 1.0 / a;
let s = ro - p0;
let u = f * dot(s, h);
if (u < 0.0 || u > 1.0) { return r; }
let q = cross(s, e1);
let v = f * dot(rd, q);
if (v < 0.0 || u + v > 1.0) { return r; }
let t = f * dot(e2, q);
if (t < tMin || t > tMax) { return r; }
r.hit = true; r.t = t; r.u = u; r.v = v;
return r;
}
`;
// ════════════════════════════════════════════════════════════════════════
// WAVEFRONT RT — streaming tracer (GENERATE → PREP → TRACE → SHADE →
// RESOLVE). Replaces the megakernel. The C++ side (PipelineRTWebGPU) emits
// the user sources, the per-stage SBT switches, the Payload-typed wfPayload
// binding, and the five @compute entry points; this JS injects the bindings
// + library helpers the entry points call.
// ════════════════════════════════════════════════════════════════════════
// Bindings prelude for the wavefront pipeline. group(0) is the per-pass
// WfParams uniform (dynamic-offset ring — one slot per pass so curIsA /
// bounce can vary between passes inside one submit). group(1) carries the
// geometry heaps (0..9, identical to the old megakernel layout so the
// register/build paths are unchanged) plus the wavefront work buffers
// (10..14); wfPayload at binding 15 is emitted in the codegen region after
// the user's `struct Payload`. group(2) is the indirect-args buffer, bound
// only by PREP (a buffer used as INDIRECT in a dispatch may not also be
// bound read-write in that same dispatch — so TRACE/SHADE must not bind it).
const rtWgslWavefrontBindings = String.raw`
struct WfParams {
surfaceW: u32,
surfaceH: u32,
rayCapacity: u32,
curIsA: u32, // 1 → current ray buffer is A (emit-next = B); 0 → B
bounce: u32,
maxDepth: u32,
tlasNPadded: u32, // TLAS sweep-tree padded leaf count (descent depth)
flags: u32,
};
// One in-flight ray. 64 bytes; origin/direction vec3-aligned to 16.
struct WfRay {
origin: vec3<f32>,
tMin: f32,
direction: vec3<f32>,
tMax: f32,
pixel: u32, // linear framebuffer pixel this ray contributes to
flags: u32,
cullMask: u32,
missIndex: u32,
sbtRecordOffset: u32,
payloadSlot: u32, // index into wfPayload
kind: u32, // 0 primary, 1 continuation (informational)
_pad: u32,
};
// TRACE → SHADE handoff. Mirrors HitInfo + a hitKind (0 miss, 1 triangle).
struct HitResult {
t: f32,
instanceId: u32,
primitiveId: u32,
hitGroupIndex: u32,
attribs: vec2<f32>,
hitKind: u32,
customIndex: u32,
objectRayOrigin: vec3<f32>,
_p0: f32,
objectRayDirection: vec3<f32>,
_p1: f32,
objectToWorldR0: vec4<f32>,
objectToWorldR1: vec4<f32>,
objectToWorldR2: vec4<f32>,
};
struct BvhNode {
aabbMin: vec3<f32>,
_pad0: u32,
aabbMax: vec3<f32>,
_pad1: u32,
};
@group(0) @binding(0) var<uniform> wfParams : WfParams;
@group(1) @binding(0) var<storage,read> tlasEntries : array<TLASEntry>;
@group(1) @binding(1) var<storage,read> bvhNodes : array<BVHNode>;
@group(1) @binding(2) var<storage,read> meshRecords : array<MeshRecord>;
@group(1) @binding(3) var<storage,read> vertices : array<f32>;
@group(1) @binding(4) var<storage,read> indices : array<u32>;
@group(1) @binding(5) var<storage,read> primRemap : array<u32>;
@group(1) @binding(6) var outImage : texture_storage_2d<rgba8unorm, write>;
@group(1) @binding(7) var<storage,read> vertexAttribs : array<u32>;
@group(1) @binding(8) var<storage,read> tlasEntryOrder : array<u32>;
@group(1) @binding(9) var<storage,read> tlasBvhNodes : array<BvhNode>;
@group(1) @binding(10) var<storage,read_write> wfRaysA : array<WfRay>;
@group(1) @binding(11) var<storage,read_write> wfRaysB : array<WfRay>;
@group(1) @binding(12) var<storage,read_write> wfHits : array<HitResult>;
@group(1) @binding(13) var<storage,read_write> wfAccum : array<vec4<f32>>;
@group(1) @binding(14) var<storage,read_write> wfCounters : array<atomic<u32>>;
// @group(1) @binding(15) wfPayload : array<Payload> — emitted by codegen.
@group(2) @binding(0) var<storage,read_write> wfIndirect : array<u32>;
`;
// Library helpers the codegen entry points call. Sits after the pure
// helpers (_rtAabb/_rtTri/_rtFetchTri) and after the user's Payload +
// wfPayload binding, so rtEmit*/_wfShade can name Payload/wfPayload.
const rtWgslWavefrontHelpers = String.raw`
var<private> _wfPixel: u32 = 0u;
// Live ray count for the current buffer, clamped to capacity (the emit
// counter can overshoot capacity; dropped rays were never written).
fn _wfCurCount() -> u32 {
let raw = select(atomicLoad(&wfCounters[1]), atomicLoad(&wfCounters[0]),
wfParams.curIsA == 1u);
return min(raw, wfParams.rayCapacity);
}
// Add linear radiance to the pixel this SHADE/GENERATE thread owns. Safe
// without atomics: at most one ray per pixel per bounce, and bounces run
// in separate passes (implicit barrier between them).
fn rtAccumulate(rgb: vec3<f32>) {
wfAccum[_wfPixel] = wfAccum[_wfPixel] + vec4<f32>(rgb, 0.0);
}
// raygen → emit the pixel's primary ray. Bounce 0's current buffer is
// always A, so primaries land in A with their payload in the A region
// [0, rayCapacity).
fn rtEmitPrimaryRay(origin: vec3<f32>, tMin: f32, dir: vec3<f32>, tMax: f32,
flags: u32, cullMask: u32, sbtRecordOffset: u32,
missIndex: u32, payload: Payload) {
let slot = atomicAdd(&wfCounters[0], 1u);
if (slot >= wfParams.rayCapacity) { return; }
var r: WfRay;
r.origin = origin; r.tMin = tMin; r.direction = dir; r.tMax = tMax;
r.pixel = _wfPixel; r.flags = flags; r.cullMask = cullMask;
r.missIndex = missIndex; r.sbtRecordOffset = sbtRecordOffset;
r.payloadSlot = slot; r.kind = 0u;
wfRaysA[slot] = r;
wfPayload[slot] = payload;
}
// closesthit/miss → spawn a continuation/shadow ray into the NEXT buffer
// (the one the upcoming TRACE will read). Payload travels with it; the
// next buffer's payload region is [rayCapacity, 2*rayCapacity) for B.
fn rtEmitRay(origin: vec3<f32>, tMin: f32, dir: vec3<f32>, tMax: f32,
flags: u32, cullMask: u32, sbtRecordOffset: u32,
missIndex: u32, payload: Payload) {
let nextIsA = wfParams.curIsA == 0u;
let counterIdx = select(1u, 0u, nextIsA);
let slot = atomicAdd(&wfCounters[counterIdx], 1u);
if (slot >= wfParams.rayCapacity) { return; }
let payloadBase = select(wfParams.rayCapacity, 0u, nextIsA);
var r: WfRay;
r.origin = origin; r.tMin = tMin; r.direction = dir; r.tMax = tMax;
r.pixel = _wfPixel; r.flags = flags; r.cullMask = cullMask;
r.missIndex = missIndex; r.sbtRecordOffset = sbtRecordOffset;
r.payloadSlot = payloadBase + slot; r.kind = 1u;
if (nextIsA) { wfRaysA[slot] = r; } else { wfRaysB[slot] = r; }
wfPayload[r.payloadSlot] = payload;
}
// Opaque-only BLAS descent (no anyhit — TRACE runs zero user code).
fn _rtwTraverseBlas(rayObj: RayDesc, flags: u32, meshRec: MeshRecord,
instanceId: u32, hitGroupBase: u32,
bestHit: ptr<function, HitInfo>,
bestT: ptr<function, f32>) -> bool {
let invD = vec3<f32>(1.0) / rayObj.direction;
var stack: array<u32, 32>;
var sp: u32 = 0u;
var nodeRel: u32 = 0u;
loop {
let absI = meshRec.bvhOffset + nodeRel;
let node = bvhNodes[absI];
if (!_rtAabb(rayObj.origin, invD, node.aabbMin, node.aabbMax, *bestT)) {
if (sp == 0u) { break; }
sp = sp - 1u; nodeRel = stack[sp]; continue;
}
if (node.primCount > 0u) {
for (var i: u32 = 0u; i < node.primCount; i = i + 1u) {
let triIndex = primRemap[meshRec.primRemapOffset + node.firstChildOrPrim + i];
let verts = _rtFetchTri(meshRec, triIndex);
let tr = _rtTri(rayObj.origin, rayObj.direction,
verts[0], verts[1], verts[2], rayObj.tMin, *bestT);
if (!tr.hit) { continue; }
let geomNormal = cross(verts[1] - verts[0], verts[2] - verts[0]);
let facing = dot(geomNormal, rayObj.direction);
if ((flags & RT_FLAG_CULL_BACK_FACING_TRIANGLES) != 0u && facing > 0.0) { continue; }
if ((flags & RT_FLAG_CULL_FRONT_FACING_TRIANGLES) != 0u && facing < 0.0) { continue; }
var candidate: HitInfo;
candidate.t = tr.t;
candidate.instanceId = instanceId;
candidate.primitiveId = triIndex;
candidate.hitGroupIndex = hitGroupBase;
candidate.attribs = vec2<f32>(tr.u, tr.v);
candidate.objectRayOrigin = rayObj.origin;
candidate.objectRayDirection = rayObj.direction;
*bestHit = candidate;
*bestT = tr.t;
if ((flags & RT_FLAG_TERMINATE_ON_FIRST_HIT) != 0u) { return true; }
}
if (sp == 0u) { break; }
sp = sp - 1u; nodeRel = stack[sp]; continue;
}
// Internal node: descend the nearer child first; push the farther
// only when it hits (and re-cull it against bestT when popped).
let left = node.firstChildOrPrim;
let right = left + 1u;
let ln = bvhNodes[meshRec.bvhOffset + left];
let rn = bvhNodes[meshRec.bvhOffset + right];
let lr = _rtAabbT(rayObj.origin, invD, ln.aabbMin, ln.aabbMax, *bestT);
let rr = _rtAabbT(rayObj.origin, invD, rn.aabbMin, rn.aabbMax, *bestT);
if (lr.hit && rr.hit) {
if (sp + 1u < 32u) {
if (lr.t <= rr.t) { stack[sp] = right; sp = sp + 1u; stack[sp] = left; sp = sp + 1u; }
else { stack[sp] = left; sp = sp + 1u; stack[sp] = right; sp = sp + 1u; }
}
} else if (lr.hit) {
if (sp < 32u) { stack[sp] = left; sp = sp + 1u; }
} else if (rr.hit) {
if (sp < 32u) { stack[sp] = right; sp = sp + 1u; }
}
if (sp == 0u) { break; }
sp = sp - 1u; nodeRel = stack[sp];
}
return false;
}
fn _rtwTraverseTlas(rayWorld: RayDesc, flags: u32, cullMask: u32,
sbtRecordOffset: u32,
bestHit: ptr<function, HitInfo>,
bestT: ptr<function, f32>) -> bool {
let invD = vec3<f32>(1.0) / rayWorld.direction;
let leavesStart = wfParams.tlasNPadded - 1u;
var stack: array<u32, 32>;
var sp: u32 = 0u;
stack[sp] = 0u; sp = sp + 1u;
loop {
if (sp == 0u) { break; }
sp = sp - 1u;
let nodeIdx = stack[sp];
let node = tlasBvhNodes[nodeIdx];
if (!_rtAabb(rayWorld.origin, invD, node.aabbMin, node.aabbMax, *bestT)) { continue; }
if (nodeIdx >= leavesStart) {
let leafIdx = nodeIdx - leavesStart;
let i = tlasEntryOrder[leafIdx];
if (i == 0xFFFFFFFFu) { continue; }
let inst = tlasEntries[i];
let instanceMask = inst.maskHGOffset & 0xFFu;
if ((instanceMask & cullMask) == 0u) { continue; }
if (!_rtAabb(rayWorld.origin, invD, inst.aabbMin, inst.aabbMax, *bestT)) { continue; }
let r0 = inst.worldToObjectR0;
let r1 = inst.worldToObjectR1;
let r2 = inst.worldToObjectR2;
var rayObj: RayDesc;
rayObj.origin = vec3<f32>(
dot(r0.xyz, rayWorld.origin) + r0.w,
dot(r1.xyz, rayWorld.origin) + r1.w,
dot(r2.xyz, rayWorld.origin) + r2.w);
rayObj.direction = vec3<f32>(
dot(r0.xyz, rayWorld.direction),
dot(r1.xyz, rayWorld.direction),
dot(r2.xyz, rayWorld.direction));
rayObj.tMin = rayWorld.tMin;
rayObj.tMax = *bestT;
var effective = flags;
let iflags = inst.instanceFlags;
if ((iflags & RT_INSTANCE_TRIANGLE_FACING_CULL_DISABLE) != 0u) {
effective = effective & ~(RT_FLAG_CULL_BACK_FACING_TRIANGLES | RT_FLAG_CULL_FRONT_FACING_TRIANGLES);
}
let hitGroupOffset = inst.maskHGOffset >> 8u;
let hitGroupBase = sbtRecordOffset + hitGroupOffset;
let meshRec = meshRecords[inst.blasMeshIdx];
let pre = *bestT;
let endSearch = _rtwTraverseBlas(rayObj, effective, meshRec, i, hitGroupBase, bestHit, bestT);
if ((*bestT) < pre || endSearch) {
(*bestHit).objectToWorldR0 = inst.objectToWorldR0;
(*bestHit).objectToWorldR1 = inst.objectToWorldR1;
(*bestHit).objectToWorldR2 = inst.objectToWorldR2;
(*bestHit).customIndex = inst.customIndex;
}
if (endSearch) { return true; }
if ((*bestT) < pre && (effective & RT_FLAG_TERMINATE_ON_FIRST_HIT) != 0u) { return true; }
} else {
// Internal node: nearest-child-first. Sentinel-padded leaves
// carry an inverted AABB so _rtAabbT rejects them for free.
let left = 2u * nodeIdx + 1u;
let right = 2u * nodeIdx + 2u;
let ln = tlasBvhNodes[left];
let rn = tlasBvhNodes[right];
let lr = _rtAabbT(rayWorld.origin, invD, ln.aabbMin, ln.aabbMax, *bestT);
let rr = _rtAabbT(rayWorld.origin, invD, rn.aabbMin, rn.aabbMax, *bestT);
if (lr.hit && rr.hit) {
if (sp + 1u < 32u) {
if (lr.t <= rr.t) { stack[sp] = right; sp = sp + 1u; stack[sp] = left; sp = sp + 1u; }
else { stack[sp] = left; sp = sp + 1u; stack[sp] = right; sp = sp + 1u; }
}
} else if (lr.hit) {
if (sp < 32u) { stack[sp] = left; sp = sp + 1u; }
} else if (rr.hit) {
if (sp < 32u) { stack[sp] = right; sp = sp + 1u; }
}
}
}
return false;
}
fn _wfReadRay(i: u32) -> WfRay {
if (wfParams.curIsA == 1u) { return wfRaysA[i]; }
return wfRaysB[i];
}
// PREP — publish indirect args for the upcoming TRACE/SHADE; zero the next
// buffer's emit counter.
fn _wfPrep() {
let n = _wfCurCount();
wfIndirect[0] = (n + 63u) / 64u;
wfIndirect[1] = 1u;
wfIndirect[2] = 1u;
if (wfParams.curIsA == 1u) { atomicStore(&wfCounters[1], 0u); }
else { atomicStore(&wfCounters[0], 0u); }
}
// TRACE — pure traversal, zero user code.
fn _wfTrace(i: u32) {
if (i >= _wfCurCount()) { return; }
let ray = _wfReadRay(i);
var rd: RayDesc;
rd.origin = ray.origin; rd.tMin = ray.tMin;
rd.direction = ray.direction; rd.tMax = ray.tMax;
var bestHit: HitInfo;
bestHit.t = ray.tMax;
var bestT = ray.tMax;
_rtwTraverseTlas(rd, ray.flags, ray.cullMask & 0xFFu, ray.sbtRecordOffset, &bestHit, &bestT);
var hr: HitResult;
if (bestT < ray.tMax) {
hr.t = bestHit.t;
hr.instanceId = bestHit.instanceId;
hr.primitiveId = bestHit.primitiveId;
hr.hitGroupIndex = bestHit.hitGroupIndex;
hr.attribs = bestHit.attribs;
hr.hitKind = 1u;
hr.customIndex = bestHit.customIndex;
hr.objectRayOrigin = bestHit.objectRayOrigin;
hr.objectRayDirection = bestHit.objectRayDirection;
hr.objectToWorldR0 = bestHit.objectToWorldR0;
hr.objectToWorldR1 = bestHit.objectToWorldR1;
hr.objectToWorldR2 = bestHit.objectToWorldR2;
} else {
hr.hitKind = 0u;
}
wfHits[i] = hr;
}
// SHADE — dispatch to runMiss / runClosestHit with the ray's payload.
fn _wfShade(i: u32) {
if (i >= _wfCurCount()) { return; }
let ray = _wfReadRay(i);
let hr = wfHits[i];
_wfPixel = ray.pixel;
var payload: Payload = wfPayload[ray.payloadSlot];
var rd: RayDesc;
rd.origin = ray.origin; rd.tMin = ray.tMin;
rd.direction = ray.direction; rd.tMax = ray.tMax;
if (hr.hitKind == 0u) {
runMiss(ray.missIndex, rd, &payload);
} else if ((ray.flags & RT_FLAG_SKIP_CLOSEST_HIT) == 0u) {
var hit: HitInfo;
hit.t = hr.t;
hit.instanceId = hr.instanceId;
hit.primitiveId = hr.primitiveId;
hit.hitGroupIndex = hr.hitGroupIndex;
hit.attribs = hr.attribs;
hit.customIndex = hr.customIndex;
hit.objectRayOrigin = hr.objectRayOrigin;
hit.objectRayDirection = hr.objectRayDirection;
hit.objectToWorldR0 = hr.objectToWorldR0;
hit.objectToWorldR1 = hr.objectToWorldR1;
hit.objectToWorldR2 = hr.objectToWorldR2;
runClosestHit(hr.hitGroupIndex, rd, hit, &payload);
}
}
`;
// ── WGSL library: rayQuery API for non-megakernel compute shaders ────
//
// Mirrors GL_EXT_ray_query semantics that 3DForts's physics shaders use
// (projectile-collide, splash, builder-pick). User WGSL:
// var rq: RayQuery;
// rayQueryInitialize(&rq, 0u, flags, mask, origin, tMin, dir, tMax);
// while (rayQueryProceed(&rq)) {} // run traversal to completion
// if (rayQueryGetCommittedIntersectionType(&rq) != RT_INTERSECTION_NONE) {
// let t = rayQueryGetCommittedT(&rq);
// ...
// }
//
// v1 simplification: traversal force-opaques every hit (no anyhit). The
// user can still test for triangle vs miss and read t/instance/bary.
// Anyhit-style candidate inspection is a future extension.
const rtWgslRayQueryLib = String.raw`
struct RayQuery {
ray: RayDesc,
flags: u32,
cullMask: u32,
committedType: u32,
committedT: f32,
committedInstanceId: u32,
committedInstanceCustomIndex: u32,
committedPrimitiveIndex: u32,
committedBarycentrics: vec2<f32>,
committedObjectRayOrigin: vec3<f32>,
committedObjectRayDirection: vec3<f32>,
committedObjectToWorldR0: vec4<f32>,
committedObjectToWorldR1: vec4<f32>,
committedObjectToWorldR2: vec4<f32>,
committedWorldToObjectR0: vec4<f32>,
committedWorldToObjectR1: vec4<f32>,
committedWorldToObjectR2: vec4<f32>,
done: u32,
};
fn _rqTraverseBlas(rayObj: RayDesc, flags: u32, meshRec: MeshRecord,
instanceId: u32, customIndex: u32,
inst: TLASEntry,
rq: ptr<function, RayQuery>) {
let invD = vec3<f32>(1.0) / rayObj.direction;
var stack: array<u32, 32>;
var sp: u32 = 0u;
var nodeRel: u32 = 0u;
loop {
let abs = meshRec.bvhOffset + nodeRel;
let node = bvhNodes[abs];
if (!_rtAabb(rayObj.origin, invD, node.aabbMin, node.aabbMax, (*rq).committedT)) {
if (sp == 0u) { break; }
sp = sp - 1u; nodeRel = stack[sp]; continue;
}
if (node.primCount > 0u) {
for (var i: u32 = 0u; i < node.primCount; i = i + 1u) {
let triIndex = primRemap[meshRec.primRemapOffset + node.firstChildOrPrim + i];
let verts = _rtFetchTri(meshRec, triIndex);
let tr = _rtTri(rayObj.origin, rayObj.direction,
verts[0], verts[1], verts[2],
rayObj.tMin, (*rq).committedT);
if (!tr.hit) { continue; }
let geomNormal = cross(verts[1] - verts[0], verts[2] - verts[0]);
let facing = dot(geomNormal, rayObj.direction);
if ((flags & RT_FLAG_CULL_BACK_FACING_TRIANGLES) != 0u && facing > 0.0) { continue; }
if ((flags & RT_FLAG_CULL_FRONT_FACING_TRIANGLES) != 0u && facing < 0.0) { continue; }
(*rq).committedType = RT_INTERSECTION_TRIANGLE;
(*rq).committedT = tr.t;
(*rq).committedInstanceId = instanceId;
(*rq).committedInstanceCustomIndex = customIndex;
(*rq).committedPrimitiveIndex = triIndex;
(*rq).committedBarycentrics = vec2<f32>(tr.u, tr.v);
(*rq).committedObjectRayOrigin = rayObj.origin;
(*rq).committedObjectRayDirection = rayObj.direction;
(*rq).committedObjectToWorldR0 = inst.objectToWorldR0;
(*rq).committedObjectToWorldR1 = inst.objectToWorldR1;
(*rq).committedObjectToWorldR2 = inst.objectToWorldR2;
(*rq).committedWorldToObjectR0 = inst.worldToObjectR0;
(*rq).committedWorldToObjectR1 = inst.worldToObjectR1;
(*rq).committedWorldToObjectR2 = inst.worldToObjectR2;
}
if (sp == 0u) { break; }
sp = sp - 1u; nodeRel = stack[sp]; continue;
}
let left = node.firstChildOrPrim;
let right = left + 1u;
if (sp < 32u) { stack[sp] = right; sp = sp + 1u; }
nodeRel = left;
}
}
fn _rqTraverseTlas(rq: ptr<function, RayQuery>) {
let rayWorld = (*rq).ray;
let invD = vec3<f32>(1.0) / rayWorld.direction;
let cullMask = (*rq).cullMask;
let rayFlags = (*rq).flags;
// Stack-based BVH descent — same shape as _rtTraverseTlas.
var stack: array<u32, 24>;
var sp: u32 = 0u;
stack[sp] = 0u; sp = sp + 1u;
loop {
if (sp == 0u) { break; }
sp = sp - 1u;
let nodeIdx = stack[sp];
let node = tlasBvhNodes[nodeIdx];
if (!_rtAabb(rayWorld.origin, invD, node.aabbMin, node.aabbMax, (*rq).committedT)) {
continue;
}
if (nodeIdx >= TLAS_BVH_LEAVES_START) {
let leafIdx = nodeIdx - TLAS_BVH_LEAVES_START;
let i = tlasEntryOrder[leafIdx];
if (i == 0xFFFFFFFFu) { continue; }
let inst = tlasEntries[i];
let instanceMask = inst.maskHGOffset & 0xFFu;
if ((instanceMask & cullMask) == 0u) { continue; }
if (!_rtAabb(rayWorld.origin, invD, inst.aabbMin, inst.aabbMax, (*rq).committedT)) { continue; }
let r0 = inst.worldToObjectR0;
let r1 = inst.worldToObjectR1;
let r2 = inst.worldToObjectR2;
var rayObj: RayDesc;
rayObj.origin = vec3<f32>(
dot(r0.xyz, rayWorld.origin) + r0.w,
dot(r1.xyz, rayWorld.origin) + r1.w,
dot(r2.xyz, rayWorld.origin) + r2.w,
);
rayObj.direction = vec3<f32>(
dot(r0.xyz, rayWorld.direction),
dot(r1.xyz, rayWorld.direction),
dot(r2.xyz, rayWorld.direction),
);
rayObj.tMin = rayWorld.tMin;
rayObj.tMax = (*rq).committedT;
var effective = rayFlags;
let iflags = inst.instanceFlags;
if ((iflags & RT_INSTANCE_TRIANGLE_FACING_CULL_DISABLE) != 0u) {
effective = effective & ~(RT_FLAG_CULL_BACK_FACING_TRIANGLES | RT_FLAG_CULL_FRONT_FACING_TRIANGLES);
}
let meshRec = meshRecords[inst.blasMeshIdx];
_rqTraverseBlas(rayObj, effective, meshRec, i, inst.customIndex, inst, rq);
if ((rayFlags & RT_FLAG_TERMINATE_ON_FIRST_HIT) != 0u
&& (*rq).committedType != RT_INTERSECTION_NONE) {
return;
}
} else {
let left = 2u * nodeIdx + 1u;
let right = 2u * nodeIdx + 2u;
if (sp + 1u < 24u) {
stack[sp] = right; sp = sp + 1u;
stack[sp] = left; sp = sp + 1u;
}
}
}
}
fn rayQueryInitialize(rq: ptr<function, RayQuery>, tlasIdx: u32, flags: u32, cullMask: u32,
origin: vec3<f32>, tMin: f32, direction: vec3<f32>, tMax: f32) {
(*rq).ray.origin = origin;
(*rq).ray.tMin = tMin;
(*rq).ray.direction = direction;
(*rq).ray.tMax = tMax;
(*rq).flags = flags;
(*rq).cullMask = cullMask & 0xFFu;
(*rq).committedType = RT_INTERSECTION_NONE;
(*rq).committedT = tMax;
(*rq).done = 0u;
}
fn rayQueryProceed(rq: ptr<function, RayQuery>) -> bool {
if ((*rq).done != 0u) { return false; }
_rqTraverseTlas(rq);
(*rq).done = 1u;
return false;
}
fn rayQueryTerminate(rq: ptr<function, RayQuery>) {
(*rq).done = 1u;
}
fn rayQueryGetCommittedIntersectionType(rq: ptr<function, RayQuery>) -> u32 { return (*rq).committedType; }
fn rayQueryGetCommittedT(rq: ptr<function, RayQuery>) -> f32 { return (*rq).committedT; }
fn rayQueryGetCommittedInstanceId(rq: ptr<function, RayQuery>) -> u32 { return (*rq).committedInstanceId; }
fn rayQueryGetCommittedInstanceCustomIndex(rq: ptr<function, RayQuery>) -> u32 { return (*rq).committedInstanceCustomIndex; }
fn rayQueryGetCommittedPrimitiveIndex(rq: ptr<function, RayQuery>) -> u32 { return (*rq).committedPrimitiveIndex; }
fn rayQueryGetCommittedBarycentrics(rq: ptr<function, RayQuery>) -> vec2<f32> { return (*rq).committedBarycentrics; }
fn rayQueryGetCommittedObjectRayOrigin(rq: ptr<function, RayQuery>) -> vec3<f32> { return (*rq).committedObjectRayOrigin; }
fn rayQueryGetCommittedObjectRayDirection(rq: ptr<function, RayQuery>) -> vec3<f32> { return (*rq).committedObjectRayDirection; }
fn rayQueryGetCommittedWorldPosition(rq: ptr<function, RayQuery>) -> vec3<f32> {
return (*rq).ray.origin + (*rq).ray.direction * (*rq).committedT;
}
`;
// TLAS spatial-partition bin. 64 of these per frame partition instances
// by the top 6 bits of their Morton code. RT traversal walks bin AABBs
// first; only bins a ray hits get their per-instance loop. Layout
// matches the WGSL struct below.
const TLAS_BIN_COUNT = 64;
const TLAS_BIN_SIZE = 32;
// ── Internal compute pipeline: builds TLASEntry[] from the RTInstance[]
// + the meshRecords table. One thread per instance.
// Uses only rtWgslTypes (no megakernel bindings) so it stays well under
// the 8-storage-buffer-per-stage baseline limit.
const tlasBuildWgsl = rtWgslTypes /* needs MeshRecord + TLASEntry */ + String.raw`
struct RTInstance {
transformR0: vec4<f32>,
transformR1: vec4<f32>,
transformR2: vec4<f32>,
customIndexMask: u32, // customIndex: low 24, mask: high 8
sbtFlags: u32, // sbtOffset: low 24, flags: high 8
accelStructureRef: vec2<u32>,
};
@group(0) @binding(0) var<storage, read> inInstances : array<RTInstance>;
@group(0) @binding(1) var<storage, read> inMeshes : array<MeshRecord>;
@group(0) @binding(2) var<storage, read_write> outEntries : array<TLASEntry>;
@group(0) @binding(3) var<storage, read_write> outOrder : array<u32>; // identity perm for stage 1
@group(0) @binding(4) var<storage, read_write> outMorton : array<u32>; // 30-bit Morton from world-AABB centroid
// Spread the low 10 bits of v across every 3rd bit (so three of these
// interleave into a 30-bit Morton code). Standard "magic number" version.
fn _expandBits10(v0: u32) -> u32 {
var v = v0 & 0x000003FFu;
v = (v * 0x00010001u) & 0xFF0000FFu;
v = (v * 0x00000101u) & 0x0F00F00Fu;
v = (v * 0x00000011u) & 0xC30C30C3u;
v = (v * 0x00000005u) & 0x49249249u;
return v;
}
fn _mortonCode3D(c: vec3<f32>) -> u32 {
// c assumed in [0, 1]^3.
let q = clamp(c, vec3<f32>(0.0), vec3<f32>(1.0));
let xi = u32(q.x * 1023.0);
let yi = u32(q.y * 1023.0);
let zi = u32(q.z * 1023.0);
return (_expandBits10(xi) << 2u)
| (_expandBits10(yi) << 1u)
| _expandBits10(zi);
}
fn _invMat3(c0: vec3<f32>, c1: vec3<f32>, c2: vec3<f32>) -> array<vec3<f32>, 3> {
let m00 = c1.y*c2.z - c1.z*c2.y;
let m01 = c1.z*c2.x - c1.x*c2.z;
let m02 = c1.x*c2.y - c1.y*c2.x;
let det = c0.x*m00 + c0.y*m01 + c0.z*m02;
let inv = 1.0 / det;
return array<vec3<f32>, 3>(
vec3<f32>(m00, c0.z*c2.y - c0.y*c2.z, c0.y*c1.z - c0.z*c1.y) * inv,
vec3<f32>(m01, c0.x*c2.z - c0.z*c2.x, c0.z*c1.x - c0.x*c1.z) * inv,
vec3<f32>(m02, c0.y*c2.x - c0.x*c2.y, c0.x*c1.y - c0.y*c1.x) * inv,
);
}
@compute @workgroup_size(64, 1, 1)
fn tlasBuildMain(@builtin(global_invocation_id) gid: vec3<u32>) {
let i = gid.x;
if (i >= arrayLength(&inInstances)) { return; }
let inst = inInstances[i];
let meshIdx = inst.accelStructureRef.x;
let custom = inst.customIndexMask & 0xFFFFFFu;
let mask = (inst.customIndexMask >> 24u) & 0xFFu;
let sbtOffset = inst.sbtFlags & 0xFFFFFFu;
let iflags = (inst.sbtFlags >> 24u) & 0xFFu;
let rec = inMeshes[meshIdx];
// 8 corners → world AABB.
var worldMin = vec3<f32>(1e30);
var worldMax = vec3<f32>(-1e30);
for (var c: u32 = 0u; c < 8u; c = c + 1u) {
let corner = vec3<f32>(
select(rec.rootAabbMin.x, rec.rootAabbMax.x, (c & 1u) != 0u),
select(rec.rootAabbMin.y, rec.rootAabbMax.y, (c & 2u) != 0u),
select(rec.rootAabbMin.z, rec.rootAabbMax.z, (c & 4u) != 0u),
);
let wc = vec3<f32>(
dot(inst.transformR0.xyz, corner) + inst.transformR0.w,
dot(inst.transformR1.xyz, corner) + inst.transformR1.w,
dot(inst.transformR2.xyz, corner) + inst.transformR2.w,
);
worldMin = min(worldMin, wc);
worldMax = max(worldMax, wc);
}
// Inverse 3x4 affine.
let inv = _invMat3(inst.transformR0.xyz, inst.transformR1.xyz, inst.transformR2.xyz);
let T = vec3<f32>(inst.transformR0.w, inst.transformR1.w, inst.transformR2.w);
let invT = vec3<f32>(-dot(inv[0], T), -dot(inv[1], T), -dot(inv[2], T));
var e: TLASEntry;
e.aabbMin = worldMin;
e.aabbMax = worldMax;
e.maskHGOffset = mask | (sbtOffset << 8u);
e.blasMeshIdx = meshIdx;
e.objectToWorldR0 = inst.transformR0;
e.objectToWorldR1 = inst.transformR1;
e.objectToWorldR2 = inst.transformR2;
e.worldToObjectR0 = vec4<f32>(inv[0], invT.x);
e.worldToObjectR1 = vec4<f32>(inv[1], invT.y);
e.worldToObjectR2 = vec4<f32>(inv[2], invT.z);
e.customIndex = custom;
e.instanceFlags = iflags;
outEntries[i] = e;
// LBVH pre-pass output:
// outOrder[i] = i — identity permutation (Stage 1; the
// radix sort in Stage 2 rewrites this).
// outMorton[i] = morton(centroid normalized to [0,1]^3 over the
// fixed world bound).
// Fixed bound is conservative for 3DForts (map ~2000 units, fort ~50).
// Stage 2 swaps to a true scene-AABB reduction.
outOrder[i] = i;
let centroid = (worldMin + worldMax) * 0.5;
let worldHalfExtent = 4000.0;
let normalized = (centroid + vec3<f32>(worldHalfExtent)) * (1.0 / (2.0 * worldHalfExtent));
outMorton[i] = _mortonCode3D(normalized);
}
`;
// TLAS LBVH build — single dispatch of one 1024-thread workgroup.
// Phases:
// 0. Scene AABB reduction (256-wide tree reduce).
// 1. Pack (morton16 << 16) | tlasIndex16 keys into sortA. Sentinel
// slots get 0xFFFFFFFF so they sort to the end; m16 is clamped to
// 0xFFFE so no real key collides with the sentinel.
// 2. Bitonic sort of the packed keys ascending (in-place in sortA).
// Data-oblivious network — 105 compare-exchange sub-stages over
// 2^14 keys — so it cannot exhibit the count-dependent corruption
// the old LSD radix scatter did (TODO-lbvh-sort.md, strategy #5).
// 3. Write sorted instance permutation into outOrder.
// 4. Initialize BVH leaf AABBs from sorted instances.
// 5. Bottom-up sweep-tree refit, log2(N_PADDED) levels.
//
// Storage-barrier pattern: workgroupBarrier() fences workgroup memory
// only per WGSL spec; storageBarrier() is required between R/W phases
// on sortA and outBvh — including between every bitonic sub-stage, which
// reads and writes sortA. Both are called at every storage boundary.
//
// Hard cap: LBVH_MAX = 16384. The bitonic sort gives BVH leaves Morton
// (Z-order) spatial coherence, and the degenerate-AABB fix in _rtAabb
// keeps sentinel-only subtrees from being traversed. Per-ray cost scales
// with log2(N_real), not log2(N_PADDED).
const LBVH_MAX = 16384;
const lbvhBuildWgsl = String.raw`
struct TLASEntryStub {
aabbMin: vec3<f32>,
maskHGOffset: u32,
aabbMax: vec3<f32>,
blasMeshIdx: u32,
objectToWorldR0: vec4<f32>,
objectToWorldR1: vec4<f32>,
objectToWorldR2: vec4<f32>,
worldToObjectR0: vec4<f32>,
worldToObjectR1: vec4<f32>,
worldToObjectR2: vec4<f32>,
customIndex: u32,
instanceFlags: u32,
_pad0: u32,
_pad1: u32,
};
struct BvhNode {
aabbMin: vec3<f32>,
_pad0: u32,
aabbMax: vec3<f32>,
_pad1: u32,
};
@group(0) @binding(0) var<storage, read> entries : array<TLASEntryStub>;
@group(0) @binding(1) var<storage, read_write> outOrder : array<u32>;
@group(0) @binding(2) var<storage, read_write> outBvh : array<BvhNode>;
// Radix-sort ping-pong buffers. One u32 per element — the packed
// (morton16 << 16) | tlasIndex16 key. Sized for N_PADDED.
@group(0) @binding(3) var<storage, read_write> sortA : array<u32>;
@group(0) @binding(4) var<storage, read_write> sortB : array<u32>;
// Real instance count. Passed as a uniform so the entries / outOrder /
// sortA / sortB / outBvh buffers can be allocated ONCE at N_PADDED and
// never resized as the application's TLAS instance count changes —
// runtime resize-on-grow caused subtle BVH corruption (driver-level
// memory recycling, suspected) and was the root cause of mid-game
// geometry flicker when projectiles entered the TLAS.
// nPadded = next_pow2(max(nReal,1)), supplied by the host. The bitonic
// sort still runs over the full N_PADDED (sentinels sink to the tail), but
// the sweep tree is built (and traced) at depth log2(nPadded) so descent
// tracks the real instance count instead of a fixed 14.
struct LbvhPC { nReal: u32, nPadded: u32, _pad1: u32, _pad2: u32 };
@group(0) @binding(5) var<uniform> lbvhPc : LbvhPC;
const N_PADDED: u32 = 16384u;
const THREADS: u32 = 1024u;
const K_PER: u32 = 16u; // = N_PADDED / THREADS
const REDUCE_LANES: u32 = 256u;
const REDUCE_K_PER: u32 = 64u; // = N_PADDED / REDUCE_LANES
const LEVELS: u32 = 14u; // log2(N_PADDED)
// Scene-AABB reduction scratch — 256-lane tree reduce. vec3 stride is
// 16 by WGSL alignment → 4 KB each, 8 KB total. Well under the 16 KB
// default workgroup-storage cap.
var<workgroup> shRedMin: array<vec3<f32>, 256>;
var<workgroup> shRedMax: array<vec3<f32>, 256>;
var<workgroup> shSceneMin: vec3<f32>;
var<workgroup> shSceneMax: vec3<f32>;
fn _expandBits10(v0: u32) -> u32 {
var v = v0 & 0x000003FFu;
v = (v * 0x00010001u) & 0xFF0000FFu;
v = (v * 0x00000101u) & 0x0F00F00Fu;
v = (v * 0x00000011u) & 0xC30C30C3u;
v = (v * 0x00000005u) & 0x49249249u;
return v;
}
fn _mortonCode3D(c: vec3<f32>) -> u32 {
let q = clamp(c, vec3<f32>(0.0), vec3<f32>(1.0));
let xi = u32(q.x * 1023.0);
let yi = u32(q.y * 1023.0);
let zi = u32(q.z * 1023.0);
return (_expandBits10(xi) << 2u)
| (_expandBits10(yi) << 1u)
| _expandBits10(zi);
}
@compute @workgroup_size(1024, 1, 1)
fn lbvhBuildMain(@builtin(local_invocation_id) lid: vec3<u32>) {
let tid = lid.x;
let n = lbvhPc.nReal;
// ── Phase 0: scene AABB reduction across centroids ───────────────────
// REDUCE_LANES=256 lanes each fold REDUCE_K_PER stripes (covers
// N_PADDED), then an 8-step tree reduce across those lanes gives
// the final AABB.
if (tid < REDUCE_LANES) {
var lMin = vec3<f32>( 1e30);
var lMax = vec3<f32>(-1e30);
for (var k: u32 = 0u; k < REDUCE_K_PER; k = k + 1u) {
let i = tid * REDUCE_K_PER + k;
if (i < n) {
let c = (entries[i].aabbMin + entries[i].aabbMax) * 0.5;
lMin = min(lMin, c);
lMax = max(lMax, c);
}
}
shRedMin[tid] = lMin;
shRedMax[tid] = lMax;
}
workgroupBarrier();
var stride: u32 = 128u;
for (var s: u32 = 0u; s < 8u; s = s + 1u) {
if (tid < stride) {
shRedMin[tid] = min(shRedMin[tid], shRedMin[tid + stride]);
shRedMax[tid] = max(shRedMax[tid], shRedMax[tid + stride]);
}
workgroupBarrier();
stride = stride / 2u;
}
if (tid == 0u) {
shSceneMin = shRedMin[0];
shSceneMax = shRedMax[0];
}
workgroupBarrier();
// ── Phase 1: emit packed sort keys into sortA ────────────────────────
let extent = max(shSceneMax - shSceneMin, vec3<f32>(1e-3));
let invExtent = vec3<f32>(1.0) / extent;
for (var k: u32 = 0u; k < K_PER; k = k + 1u) {
let i = k * THREADS + tid;
var key: u32;
if (i < n) {
let c = (entries[i].aabbMin + entries[i].aabbMax) * 0.5;
let nrm = (c - shSceneMin) * invExtent;
let m30 = _mortonCode3D(nrm);
let m16 = min(m30 >> 14u, 0xFFFEu);
key = (m16 << 16u) | (i & 0xFFFFu);
} else {
key = 0xFFFFFFFFu;
}
sortA[i] = key;
}
workgroupBarrier();
storageBarrier();
// ── Phase 2: bitonic sort of sortA[0..N_PADDED) ascending ────────────
// Replaces the previous LSD radix scatter, which produced
// count-dependent corruption (TODO-lbvh-sort.md): a memory-ordering
// bug in the Hillis-Steele scan / parallel scatter that surfaced only
// for certain Morton distributions (a small object beside a tight
// cluster), making fort geometry flicker. Despite careful review the
// exact race was never pinned down.
//
// A bitonic sorting network is DATA-OBLIVIOUS: the sequence of
// compare-exchanges depends only on N_PADDED, never on the key values.
// That eliminates the entire class of distribution-dependent races the
// radix sort tripped over (TODO strategy #5). N_PADDED is a power of
// two so the network is exact; sentinel keys (0xFFFFFFFF) compare
// largest and settle at the tail — exactly where Phase 4 expects them.
//
// Single workgroup, storage-resident: 16384 u32 = 64 KB exceeds the
// workgroup-storage cap, so the keys stay in sortA. Each of the 1024
// threads owns PAIRS_PER_THREAD = (N_PADDED/2)/THREADS = 8 compare-
// exchanges per sub-stage. A bitonic network over 2^14 keys has
// sum(p for p in 1..=14) = 105 sub-stages; storageBarrier() fences
// sortA between each so one sub-stage's writes are visible to the next.
// sortB is unused by this path (left bound; harmless).
const PAIRS: u32 = N_PADDED / 2u; // 8192 compare-exchanges / sub-stage
const PAIRS_PER_THREAD: u32 = PAIRS / THREADS; // 8
for (var k: u32 = 2u; k <= N_PADDED; k = k << 1u) {
for (var j: u32 = k >> 1u; j > 0u; j = j >> 1u) {
for (var t: u32 = 0u; t < PAIRS_PER_THREAD; t = t + 1u) {
// Linear pair id p in [0, N_PADDED/2). Map it to the lower
// index lo of the compared pair by inserting a 0 bit at
// position log2(j): lo has that bit clear, hi = lo | j.
let p = t * THREADS + tid;
let lo = ((p & ~(j - 1u)) << 1u) | (p & (j - 1u));
let hi = lo | j;
let a = sortA[lo];
let b = sortA[hi];
// Sort direction for this bitonic block. lo and hi differ
// only in bit log2(j) (< log2(k)), so both agree on (x & k).
let ascending = (lo & k) == 0u;
if ((a > b) == ascending) {
sortA[lo] = b;
sortA[hi] = a;
}
}
storageBarrier();
workgroupBarrier();
}
}
// Sorted keys (ascending; sentinels last) now live in sortA.
// ── Phase 3: write sorted instance permutation into outOrder ─────────
for (var k: u32 = 0u; k < K_PER; k = k + 1u) {
let i = k * THREADS + tid;
if (i < n) {
outOrder[i] = sortA[i] & 0xFFFFu;
}
}
workgroupBarrier();
storageBarrier();
// ── Phase 4: initialize BVH leaf AABBs ───────────────────────────────
// Only the first nPadded sorted slots become leaves of the (smaller)
// sweep tree; reals occupy [0,nReal), the rest sink as sentinels.
let nPadded = max(lbvhPc.nPadded, 1u);
let leafPerThread = (nPadded + THREADS - 1u) / THREADS;
for (var k: u32 = 0u; k < leafPerThread; k = k + 1u) {
let i = k * THREADS + tid;
if (i < nPadded) {
let leafIdx = nPadded - 1u + i;
let leafKey = sortA[i];
if (leafKey == 0xFFFFFFFFu) {
outBvh[leafIdx].aabbMin = vec3<f32>( 1e30);
outBvh[leafIdx].aabbMax = vec3<f32>(-1e30);
} else {
let e = entries[leafKey & 0xFFFFu];
outBvh[leafIdx].aabbMin = e.aabbMin;
outBvh[leafIdx].aabbMax = e.aabbMax;
}
}
}
workgroupBarrier();
storageBarrier();
// ── Phase 5: bottom-up sweep-tree refit, log2(nPadded) levels ───────
// Deepest internal level has nPadded/2 nodes. The loop bound is uniform
// across the workgroup (depends only on nPadded), so the barriers stay
// in uniform control flow.
var levelCount: u32 = nPadded / 2u;
var levelStart: u32 = nPadded / 2u - 1u;
loop {
if (levelCount == 0u) { break; }
let perThread = (levelCount + THREADS - 1u) / THREADS;
for (var k: u32 = 0u; k < perThread; k = k + 1u) {
let nodeOff = k * THREADS + tid;
if (nodeOff < levelCount) {
let nodeIdx = levelStart + nodeOff;
let leftIdx = 2u * nodeIdx + 1u;
let rightIdx = 2u * nodeIdx + 2u;
let lMin = outBvh[leftIdx].aabbMin;
let lMax = outBvh[leftIdx].aabbMax;
let rMin = outBvh[rightIdx].aabbMin;
let rMax = outBvh[rightIdx].aabbMax;
outBvh[nodeIdx].aabbMin = min(lMin, rMin);
outBvh[nodeIdx].aabbMax = max(lMax, rMax);
}
}
workgroupBarrier();
storageBarrier();
levelCount = levelCount / 2u;
levelStart = (levelStart - 1u) / 2u;
}
}
`;
// ── RT runtime state ──────────────────────────────────────────────────
// Mesh heaps grow geometrically; each Mesh::Build appends + records its
// offsets in meshRecordsCpu/Buffer. nextMeshHandle is what gets returned
// to C++ as RTInstance::accelerationStructureReference.
const RT_HEAP_INITIAL_BYTES = 64 * 1024;
function makeRtHeap() {
return {
gpu: device.createBuffer({
size: RT_HEAP_INITIAL_BYTES,
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST,
}),
capacity: RT_HEAP_INITIAL_BYTES,
cursor: 0,
};
}
function rtHeapEnsure(h, neededBytes) {
if (h.cursor + neededBytes <= h.capacity) return;
let cap = h.capacity;
while (h.cursor + neededBytes > cap) cap *= 2;
const ng = device.createBuffer({
size: cap,
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST,
});
const enc = device.createCommandEncoder();
if (h.cursor > 0) enc.copyBufferToBuffer(h.gpu, 0, ng, 0, h.cursor);
queue.submit([enc.finish()]);
h.gpu.destroy();
h.gpu = ng;
h.capacity = cap;
// Invalidate cached bind groups that referenced the heap.
rtState.bindGroupCache.clear();
}
const rtState = {
vertHeap: null, // f32 stream (3 floats per vertex)
indexHeap: null, // u32 stream
bvhHeap: null, // BVHNode stream (32 bytes per node)
primRemapHeap: null, // u32 stream
attribsHeap: null, // u32 stream (per-vertex attribute payload; example-defined stride)
meshRecordsBuffer: null, // GPUBuffer of MeshRecord[]
meshRecordsCapacity: 0,
nextMeshHandle: 1,
rtHeader: null, // uniform buffer for RTDispatchHeader (256 B aligned)
bindGroupCache: new Map(), // key → bind group
tlasBuildPipeline: null,
tlasBuildBgl: null,
// Latest TLAS buffer handle from wgpuBuildTLAS, used by rayQuery-capable
// compute shaders at dispatch time.
currentTlas: 0,
currentTlasInstanceCount: 0,
};
function rtInit() {
rtState.vertHeap = makeRtHeap();
rtState.indexHeap = makeRtHeap();
rtState.bvhHeap = makeRtHeap();
rtState.primRemapHeap = makeRtHeap();
rtState.attribsHeap = makeRtHeap();
rtState.meshRecordsCapacity = 16;
rtState.meshRecordsBuffer = device.createBuffer({
size: rtState.meshRecordsCapacity * 48,
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC,
});
rtState.rtHeader = device.createBuffer({
size: 256,
usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
});
// TLAS-build compute pipeline. group(0) carries:
// 0 inInstances (read-only-storage)
// 1 inMeshes (read-only-storage)
// 2 outEntries (read-write storage)
// 3 outOrder (read-write — identity perm, sort overwrites later)
// 4 outMorton (read-write — sort key, 30-bit Morton from centroid)
const mod = device.createShaderModule({ code: tlasBuildWgsl, label: "rt-tlas-build" });
const tlasBuildBgl = device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } },
{ binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } },
{ binding: 4, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } },
]});
rtState.tlasBuildBgl = tlasBuildBgl;
rtState.tlasBuildPipeline = device.createComputePipeline({
layout: device.createPipelineLayout({ bindGroupLayouts: [tlasBuildBgl] }),
compute: { module: mod, entryPoint: "tlasBuildMain" },
});
// LBVH-build follow-up pipeline. Single workgroup of 1024 threads
// sorts instances by packed (morton16:idx16) keys (LSD radix) and
// refits a sweep-tree BVH. N_PADDED = 1024 ceiling for Stage 1.
// 0 entries (read-only)
// 1 outOrder (read-write, sorted permutation)
// 2 outBvh (read-write, 2*N_PADDED - 1 nodes × 32 bytes)
// 3 sortA (read-write, N_PADDED u32 ping-pong)
// 4 sortB (read-write, N_PADDED u32 ping-pong)
const lbvhMod = device.createShaderModule({ code: lbvhBuildWgsl, label: "rt-tlas-lbvh-build" });
const lbvhBuildBgl = device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } },
{ binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } },
{ binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } },
{ binding: 4, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } },
{ binding: 5, visibility: GPUShaderStage.COMPUTE, buffer: { type: "uniform" } },
]});
rtState.lbvhBuildBgl = lbvhBuildBgl;
rtState.lbvhBuildPipeline = device.createComputePipeline({
layout: device.createPipelineLayout({ bindGroupLayouts: [lbvhBuildBgl] }),
compute: { module: lbvhMod, entryPoint: "lbvhBuildMain" },
});
// Tiny uniform buffer for the LBVH's `nReal` field. Written each
// wgpuBuildTLAS call before dispatch.
rtState.lbvhCountBuf = device.createBuffer({
size: 16,
usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
});
}
function rtMeshRecordsEnsure(meshCount) {
if (meshCount <= rtState.meshRecordsCapacity) return;
let cap = rtState.meshRecordsCapacity;
while (cap < meshCount) cap *= 2;
const ng = device.createBuffer({
size: cap * 48,
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC,
});
const enc = device.createCommandEncoder();
enc.copyBufferToBuffer(rtState.meshRecordsBuffer, 0, ng, 0,
rtState.meshRecordsCapacity * 48);
queue.submit([enc.finish()]);
rtState.meshRecordsBuffer.destroy();
rtState.meshRecordsBuffer = ng;
rtState.meshRecordsCapacity = cap;
rtState.bindGroupCache.clear();
}
env.wgpuRegisterMeshBLAS = (minX, minY, minZ, maxX, maxY, maxZ,
verticesPtr, vertexCount,
indicesPtr, indexCount,
bvhNodesPtr, bvhNodeCount,
primRemapPtr, primRemapCount,
attribsPtr, attribsByteCount) => {
if (!rtState.vertHeap) rtInit();
console.log(`[crafter-wgpu] mesh BLAS: bbox=(${minX.toFixed(1)}..${maxX.toFixed(1)}, ${minY.toFixed(1)}..${maxY.toFixed(1)}, ${minZ.toFixed(1)}..${maxZ.toFixed(1)}), ${vertexCount} verts, ${indexCount/3} tris, attribs=${attribsByteCount}B`);
const vBytes = vertexCount * 12;
const iBytes = indexCount * 4;
const nBytes = bvhNodeCount * 32;
const rBytes = primRemapCount * 4;
// attribsByteCount must be a multiple of 4 (the heap is array<u32>).
// Round up the upload size; the in-MeshRecord offset is in u32 words.
const aBytes = (attribsByteCount + 3) & ~3;
rtHeapEnsure(rtState.vertHeap, vBytes);
rtHeapEnsure(rtState.indexHeap, iBytes);
rtHeapEnsure(rtState.bvhHeap, nBytes);
rtHeapEnsure(rtState.primRemapHeap, rBytes);
if (aBytes > 0) rtHeapEnsure(rtState.attribsHeap, aBytes);
const vOff = rtState.vertHeap.cursor / 12; // in vec3 units
const iOff = rtState.indexHeap.cursor / 4; // in u32 units
const nOff = rtState.bvhHeap.cursor / 32; // in BVHNode units
const rOff = rtState.primRemapHeap.cursor / 4;
const aOff = rtState.attribsHeap.cursor / 4; // in u32 units
// queue.writeBuffer requires multiple-of-4 sizes. Vertex byte count is
// already 12*n; index/bvh/remap are 4*n / 32*n / 4*n — all multiples of 4.
queue.writeBuffer(rtState.vertHeap.gpu, rtState.vertHeap.cursor,
memU8().buffer, verticesPtr, vBytes);
queue.writeBuffer(rtState.indexHeap.gpu, rtState.indexHeap.cursor,
memU8().buffer, indicesPtr, iBytes);
queue.writeBuffer(rtState.bvhHeap.gpu, rtState.bvhHeap.cursor,
memU8().buffer, bvhNodesPtr, nBytes);
queue.writeBuffer(rtState.primRemapHeap.gpu, rtState.primRemapHeap.cursor,
memU8().buffer, primRemapPtr, rBytes);
if (aBytes > 0) {
queue.writeBuffer(rtState.attribsHeap.gpu, rtState.attribsHeap.cursor,
memU8().buffer, attribsPtr, aBytes);
}
rtState.vertHeap.cursor += vBytes;
rtState.indexHeap.cursor += iBytes;
rtState.bvhHeap.cursor += nBytes;
rtState.primRemapHeap.cursor += rBytes;
rtState.attribsHeap.cursor += aBytes;
const handle = rtState.nextMeshHandle++;
rtMeshRecordsEnsure(handle + 1);
// Build the MeshRecord (48 bytes) and write it.
const rec = new ArrayBuffer(48);
const f32 = new Float32Array(rec);
const u32 = new Uint32Array(rec);
f32[0] = minX; f32[1] = minY; f32[2] = minZ;
u32[3] = vOff;
f32[4] = maxX; f32[5] = maxY; f32[6] = maxZ;
u32[7] = iOff;
u32[8] = nOff;
u32[9] = rOff;
u32[10] = (vertexCount > 0) ? (indexCount / 3) : 0;
u32[11] = aOff;
queue.writeBuffer(rtState.meshRecordsBuffer, handle * 48, rec);
return handle;
};
env.wgpuBuildTLAS = (instanceBufHandle, instanceCount, tlasOutBufHandle,
entryOrderHandle, mortonHandle, binsHandle,
bvhNodesHandle, sortABufHandle, sortBBufHandle) => {
if (!rtState.tlasBuildPipeline || !rtState.lbvhBuildPipeline) return;
const inst = buffers.get(instanceBufHandle);
const out = buffers.get(tlasOutBufHandle);
const order = buffers.get(entryOrderHandle);
const morton = buffers.get(mortonHandle);
const bvh = buffers.get(bvhNodesHandle);
const sortA = buffers.get(sortABufHandle);
const sortB = buffers.get(sortBBufHandle);
if (!inst || !out || !order || !morton || !bvh || !sortA || !sortB) {
console.error("[crafter-wgpu] wgpuBuildTLAS: unknown buffer handle");
return;
}
if (instanceCount > LBVH_MAX) {
console.error(`[crafter-wgpu] wgpuBuildTLAS: instance count ${instanceCount} > LBVH cap ${LBVH_MAX}`);
return;
}
const bg = device.createBindGroup({
layout: rtState.tlasBuildBgl,
entries: [
{ binding: 0, resource: { buffer: inst } },
{ binding: 1, resource: { buffer: rtState.meshRecordsBuffer } },
{ binding: 2, resource: { buffer: out } },
{ binding: 3, resource: { buffer: order } },
{ binding: 4, resource: { buffer: morton } },
],
});
// Write the real instance count + the dynamic padded leaf count
// (next_pow2) to the LBVH uniform. The sort still runs over the full
// N_PADDED, but the sweep tree is built at depth log2(nPadded).
const countBuf = new Uint32Array(4);
countBuf[0] = instanceCount;
countBuf[1] = wfNextPow2(instanceCount);
queue.writeBuffer(rtState.lbvhCountBuf, 0, countBuf);
const lbvhBg = device.createBindGroup({
layout: rtState.lbvhBuildBgl,
entries: [
{ binding: 0, resource: { buffer: out } },
{ binding: 1, resource: { buffer: order } },
{ binding: 2, resource: { buffer: bvh } },
{ binding: 3, resource: { buffer: sortA } },
{ binding: 4, resource: { buffer: sortB } },
{ binding: 5, resource: { buffer: rtState.lbvhCountBuf } },
],
});
if (state.pass) {
// Mid-frame — close the user's compute pass, run our build pass
// on the same encoder, then reopen.
state.pass.end();
state.pass = null;
}
const enc = state.encoder || device.createCommandEncoder();
const ownEncoder = !state.encoder;
// Pass 1: TLAS entry build (existing).
{
const pass = enc.beginComputePass({ label: "tlas-build" });
pass.setPipeline(rtState.tlasBuildPipeline);
pass.setBindGroup(0, bg);
const groups = Math.ceil(instanceCount / 64);
pass.dispatchWorkgroups(groups, 1, 1);
pass.end();
}
// Pass 2: LBVH-build. Single workgroup of 1024 threads runs the
// entire BVH build (Morton, sort, sweep-tree refit) in shared
// memory. Same encoder → pipeline barrier sequences pass1 → pass2.
if (instanceCount > 0) {
const pass = enc.beginComputePass({ label: "tlas-lbvh-build" });
pass.setPipeline(rtState.lbvhBuildPipeline);
pass.setBindGroup(0, lbvhBg);
pass.dispatchWorkgroups(1, 1, 1);
pass.end();
}
if (ownEncoder) {
queue.submit([enc.finish()]);
} else {
state.pass = state.encoder.beginComputePass();
}
// Publish so rayQuery-capable compute pipelines pick up the latest TLAS
// without each dispatch having to thread the handle explicitly.
rtState.currentTlas = tlasOutBufHandle;
rtState.currentTlasInstanceCount = instanceCount;
rtState.currentEntryOrder = entryOrderHandle;
rtState.currentBvh = bvhNodesHandle;
};
// RT pipeline loader — wraps user-supplied WGSL (sources + generated mega
// switches + raygen + @compute entry) with the library prelude/helpers.
// `bindingsPtr` / `bindingsCount` are UICustomBinding entries (same 8-byte
// shape as wgpuLoadCustomShader) declaring extra @group(2)+ resources the
// closest-hit / miss / raygen WGSL touches (material SSBOs, albedo
// textures, samplers). Pass (0, 0) for a pipeline with no user bindings.
const rtPipelines = new Map(); // handle → { pipeline, bgls, byGroup, sortedGroups }
// Per-payload byte budget in wfPayload (rounded up; user Payload structs
// must fit). Sponza's Payload is 48 B; 64 leaves headroom while keeping
// 2·W·H·64 B ≈ 265 MB at 1080p.
const WF_PAYLOAD_BYTES = 64;
// Dynamic-offset uniform ring: one WfParams slot per wavefront pass. 128
// slots covers maxDepth up to ~42 (1 + 3·maxDepth + 1 passes).
const WF_PARAM_SLOTS = 128;
const WF_TLAS_MAX_NPADDED = 16384; // LBVH sort capacity (N_PADDED)
// Smallest power of two >= max(n,1), clamped to the LBVH capacity. The
// TLAS sweep tree is built and traced at this depth so descent tracks the
// real instance count instead of a fixed 16384-leaf (depth-14) tree.
function wfNextPow2(n) {
let p = 1;
while (p < n && p < WF_TLAS_MAX_NPADDED) p <<= 1;
return p;
}
function ensureWavefrontBuffers(W, H) {
const cap = W * H;
rtState.wf = rtState.wf || { cap: 0 };
const wf = rtState.wf;
if (wf.cap === cap && wf.raysA) return wf;
for (const b of [wf.raysA, wf.raysB, wf.hits, wf.accum, wf.counters,
wf.payload, wf.indirect]) { if (b) b.destroy(); }
const S = GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST;
wf.raysA = device.createBuffer({ size: cap * 64, usage: S, label: "wf-raysA" });
wf.raysB = device.createBuffer({ size: cap * 64, usage: S, label: "wf-raysB" });
wf.hits = device.createBuffer({ size: cap * 112, usage: S, label: "wf-hits" });
wf.accum = device.createBuffer({ size: cap * 16, usage: S, label: "wf-accum" });
wf.payload = device.createBuffer({ size: 2 * cap * WF_PAYLOAD_BYTES, usage: S, label: "wf-payload" });
wf.counters = device.createBuffer({ size: 64,
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC, label: "wf-counters" });
wf.indirect = device.createBuffer({ size: 16,
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.INDIRECT | GPUBufferUsage.COPY_DST, label: "wf-indirect" });
if (!wf.paramsRing) {
wf.paramsRing = device.createBuffer({ size: WF_PARAM_SLOTS * 256,
usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST, label: "wf-params" });
}
wf.cap = cap;
return wf;
}
// ── GPU timestamp-query harness ──────────────────────────────────────────
//
// One QuerySet with 2 slots per wavefront pass; each beginComputePass writes
// begin/end timestamps. After the passes we resolve into a buffer and read
// it back (deferred to after submit, like the readback path). Deltas are
// summed per pass label and printed ~1×/sec as a per-pass breakdown.
const WF_TS_MAX_PASSES = 64; // covers maxDepth up to ~20
function wfEnsureTimestamps() {
if (!tsSupported) return null;
if (rtState.ts) return rtState.ts;
const cap = 2 * WF_TS_MAX_PASSES;
rtState.ts = {
capacity: cap,
querySet: device.createQuerySet({ type: "timestamp", count: cap }),
resolveBuf: device.createBuffer({ size: cap * 8,
usage: GPUBufferUsage.QUERY_RESOLVE | GPUBufferUsage.COPY_SRC }),
readBuf: device.createBuffer({ size: cap * 8,
usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST }),
inFlight: false,
lastLog: 0,
pendingLabels: null,
};
return rtState.ts;
}
function wfLogTimestamps(ts, data) {
// data: BigInt64Array of ns timestamps, [begin0,end0,begin1,end1,...].
const now = Date.now();
if (now - ts.lastLog < 1000) return; // throttle to ~1/sec
ts.lastLog = now;
const labels = ts.pendingLabels;
if (!labels) return;
const sums = new Map(); // label → ns
let totalNs = 0;
for (let i = 0; i < labels.length; i++) {
const dt = Number(data[2*i + 1] - data[2*i + 0]);
if (dt < 0) continue;
sums.set(labels[i], (sums.get(labels[i]) || 0) + dt);
totalNs += dt;
}
const order = ["GENERATE", "PREP", "TRACE", "SHADE", "RESOLVE"];
const parts = order.filter(k => sums.has(k))
.map(k => `${k} ${(sums.get(k)/1000).toFixed(1)}us`);
console.log(`[crafter-wgpu] RT passes: ${parts.join(" | ")} | total ${(totalNs/1000).toFixed(1)}us`);
}
env.wgpuLoadRTPipeline = (wgslPtr, wgslLen, bindingsPtr, bindingsCount) => {
if (!rtState.vertHeap) rtInit();
const userPart = new TextDecoder().decode(memU8().subarray(wgslPtr, wgslPtr + wgslLen));
// Insert helpers at the marker; prepend prelude.
const marker = "// @CRAFTER_RT_LIBRARY_HELPERS_HERE";
let beforeHelpers = userPart;
let afterHelpers = "";
const mi = userPart.indexOf(marker);
if (mi >= 0) {
beforeHelpers = userPart.substring(0, mi);
afterHelpers = userPart.substring(mi + marker.length);
}
// Wavefront assembly: types + bindings | user CH/miss/resolve + wfPayload
// + switches (beforeHelpers) | pure helpers | wavefront helpers | user
// raygen + the five @compute entry points (afterHelpers).
const fullWgsl = rtWgslTypes + rtWgslWavefrontBindings + "\n"
+ beforeHelpers + "\n" + rtWgslPureHelpers + "\n"
+ rtWgslWavefrontHelpers + "\n" + afterHelpers;
// Parse user bindings (same wire format as wgpuLoadCustomShader). For
// the wavefront RT pipeline, group 0 = WfParams, group 1 = data heaps,
// group 2 = indirect args — so user bindings must start at group 3.
const userBindings = [];
if (bindingsCount > 0) {
const dv = new DataView(memU8().buffer, bindingsPtr, bindingsCount * 8);
for (let i = 0; i < bindingsCount; i++) {
const g = dv.getUint8(i*8 + 0);
if (g < 3) {
console.error(`[crafter-wgpu] RT pipeline: @group(${g}) reserved; user bindings need group >= 3`);
return 0;
}
userBindings.push({
group: g,
binding: dv.getUint8(i*8 + 1),
kind: dv.getUint8(i*8 + 2),
pushOffset: dv.getUint32(i*8 + 4, true),
});
}
}
const byGroup = new Map();
for (const b of userBindings) {
if (!byGroup.has(b.group)) byGroup.set(b.group, []);
byGroup.get(b.group).push(b);
}
const sortedGroups = [...byGroup.keys()].sort((a, b) => a - b);
try {
const mod = device.createShaderModule({ code: fullWgsl, label: "rt-wavefront" });
const paramsBgl = device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE,
buffer: { type: "uniform", hasDynamicOffset: true, minBindingSize: 32 } },
]});
const sb = (b) => ({ binding: b, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } });
const rw = (b) => ({ binding: b, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } });
const dataBgl = device.createBindGroupLayout({ entries: [
sb(0), sb(1), sb(2), sb(3), sb(4), sb(5),
{ binding: 6, visibility: GPUShaderStage.COMPUTE,
storageTexture: { format: "rgba8unorm", access: "write-only", viewDimension: "2d" } },
sb(7), sb(8), sb(9),
rw(10), rw(11), rw(12), rw(13), rw(14), rw(15),
]});
const indirectBgl = device.createBindGroupLayout({ entries: [ rw(0) ]});
const emptyBgl = device.createBindGroupLayout({ entries: [] });
// User binding-group layouts for groups 3..highest (pad gaps).
const userBgls = [];
const highest = sortedGroups.length ? sortedGroups[sortedGroups.length - 1] : 2;
for (let g = 3; g <= highest; g++) {
if (byGroup.has(g)) {
const entries = byGroup.get(g).map(b => {
const e = { binding: b.binding, visibility: GPUShaderStage.COMPUTE };
if (b.kind === 0) e.buffer = { type: "read-only-storage" };
else if (b.kind === 1) e.texture = { sampleType: "float", viewDimension: "2d" };
else if (b.kind === 2) e.sampler = { type: "filtering" };
else if (b.kind === 3) e.texture = { sampleType: "float", viewDimension: "2d-array" };
else if (b.kind === 4) e.buffer = { type: "storage" };
return e;
});
userBgls.push(device.createBindGroupLayout({ entries }));
} else {
userBgls.push(device.createBindGroupLayout({ entries: [] }));
}
}
// GENERATE / SHADE / RESOLVE may touch user bindings (raygen camera,
// closesthit albedo, resolve params) → params + data + empty(group2)
// + user. PREP → params + data + indirect. TRACE → params + data.
const userLayout = device.createPipelineLayout({
bindGroupLayouts: [paramsBgl, dataBgl, emptyBgl, ...userBgls] });
const prepLayout = device.createPipelineLayout({
bindGroupLayouts: [paramsBgl, dataBgl, indirectBgl] });
const traceLayout = device.createPipelineLayout({
bindGroupLayouts: [paramsBgl, dataBgl] });
const mk = (layout, ep) => device.createComputePipeline({
layout, compute: { module: mod, entryPoint: ep } });
const entry = {
genPipe: mk(userLayout, "wfGenerate"),
prepPipe: mk(prepLayout, "wfPrep"),
tracePipe: mk(traceLayout, "wfTrace"),
shadePipe: mk(userLayout, "wfShade"),
resolvePipe: mk(userLayout, "wfResolve"),
paramsBgl, dataBgl, indirectBgl, emptyBgl, userBgls,
byGroup, sortedGroups,
};
const handle = newHandle();
rtPipelines.set(handle, entry);
return handle;
} catch (e) {
console.error("[crafter-wgpu] RT pipeline compile failed:", e);
console.error("[crafter-wgpu] WGSL was:\n", fullWgsl);
return 0;
}
};
// Build the user @group(3+) bind groups for a pass, returning a list of
// { group, bindGroup } to set. Shared by GENERATE / SHADE / RESOLVE.
function wfUserBindGroups(pipe, handlesPtr, handlesCount) {
const out = [];
if (handlesCount <= 0) return out;
const handles = new Uint32Array(memU8().buffer, handlesPtr, handlesCount);
let handleIdx = 0;
let bglIdx = 0;
const highest = pipe.sortedGroups.length ? pipe.sortedGroups[pipe.sortedGroups.length - 1] : 2;
for (let g = 3; g <= highest; g++) {
if (pipe.byGroup.has(g)) {
const entries = pipe.byGroup.get(g).map(b => {
const h = handles[handleIdx++];
let resource;
if (b.kind === 0) resource = { buffer: buffers.get(h) };
else if (b.kind === 1) resource = textureViews.get(h);
else if (b.kind === 2) resource = samplers.get(h);
else if (b.kind === 3) resource = textureViews.get(h);
else if (b.kind === 4) resource = { buffer: buffers.get(h) };
return { binding: b.binding, resource };
});
out.push({ group: g, bindGroup: device.createBindGroup({ layout: pipe.userBgls[bglIdx], entries }) });
}
bglIdx++;
}
return out;
}
env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
tlasBufHandle, instanceCount, gx, gy,
handlesPtr, handlesCount, maxDepth) => {
if (!state.encoder) return;
const pipe = rtPipelines.get(pipelineHandle);
const tlas = buffers.get(tlasBufHandle);
if (!pipe || !tlas) {
console.error("[crafter-wgpu] wgpuDispatchRT: unknown pipeline or tlas");
return;
}
const entryOrderBuf = buffers.get(rtState.currentEntryOrder);
const bvhBuf = buffers.get(rtState.currentBvh);
if (!entryOrderBuf || !bvhBuf) {
console.error("[crafter-wgpu] wgpuDispatchRT: missing entryOrder/bvh (no TLAS built yet?)");
return;
}
const W = state.width, H = state.height;
const cap = W * H;
const depth = Math.max(1, maxDepth | 0);
const wf = ensureWavefrontBuffers(W, H);
// ── Per-pass WfParams ring. queue.writeBuffer lands before submit, so
// we can't mutate the uniform between passes — instead we pre-write one
// slot per pass and bind it with a dynamic offset. Slot order:
// 0 GENERATE
// 1+3*d .. +2 PREP / TRACE / SHADE for bounce d
// 1+3*depth RESOLVE
const passCount = 2 + 3 * depth;
// TLAS descent depth = log2(tlasNPadded); must match the value the
// build used (both derive next_pow2 from the same instance count).
const tlasNPadded = wfNextPow2(instanceCount);
const ring = new Uint32Array(WF_PARAM_SLOTS * 64); // 256 B = 64 u32 per slot
const writeSlot = (slot, curIsA, bounce) => {
const o = slot * 64;
ring[o + 0] = W; ring[o + 1] = H; ring[o + 2] = cap; ring[o + 3] = curIsA;
ring[o + 4] = bounce; ring[o + 5] = depth; ring[o + 6] = tlasNPadded; ring[o + 7] = 0;
};
writeSlot(0, 1, 0); // GENERATE
for (let d = 0; d < depth; d++) {
const curIsA = (d % 2 === 0) ? 1 : 0;
writeSlot(1 + 3 * d + 0, curIsA, d); // PREP
writeSlot(1 + 3 * d + 1, curIsA, d); // TRACE
writeSlot(1 + 3 * d + 2, curIsA, d); // SHADE
}
writeSlot(1 + 3 * depth, 1, depth); // RESOLVE
queue.writeBuffer(wf.paramsRing, 0, ring, 0, passCount * 64);
const outView = state.outIsPing ? state.pingView : state.pongView;
const paramsBg = device.createBindGroup({
layout: pipe.paramsBgl,
entries: [{ binding: 0, resource: { buffer: wf.paramsRing, offset: 0, size: 256 } }],
});
const dataBg = device.createBindGroup({
layout: pipe.dataBgl,
entries: [
{ binding: 0, resource: { buffer: tlas } },
{ binding: 1, resource: { buffer: rtState.bvhHeap.gpu } },
{ binding: 2, resource: { buffer: rtState.meshRecordsBuffer } },
{ binding: 3, resource: { buffer: rtState.vertHeap.gpu } },
{ binding: 4, resource: { buffer: rtState.indexHeap.gpu } },
{ binding: 5, resource: { buffer: rtState.primRemapHeap.gpu } },
{ binding: 6, resource: outView },
{ binding: 7, resource: { buffer: rtState.attribsHeap.gpu } },
{ binding: 8, resource: { buffer: entryOrderBuf } },
{ binding: 9, resource: { buffer: bvhBuf } },
{ binding: 10, resource: { buffer: wf.raysA } },
{ binding: 11, resource: { buffer: wf.raysB } },
{ binding: 12, resource: { buffer: wf.hits } },
{ binding: 13, resource: { buffer: wf.accum } },
{ binding: 14, resource: { buffer: wf.counters } },
{ binding: 15, resource: { buffer: wf.payload } },
],
});
const indirectBg = device.createBindGroup({
layout: pipe.indirectBgl,
entries: [{ binding: 0, resource: { buffer: wf.indirect } }],
});
const userBgs = wfUserBindGroups(pipe, handlesPtr, handlesCount);
// Close the frame's shared compute pass; the wavefront runs as its own
// sequence of passes on the same encoder (implicit barrier between each
// makes PREP's atomic writes visible to TRACE, etc.), then we reopen it.
if (state.pass) { state.pass.end(); state.pass = null; }
const enc = state.encoder;
const tileX = gx, tileY = gy;
const slotOff = (slot) => slot * 256;
// Zero the two emit counters before GENERATE.
enc.clearBuffer(wf.counters, 0, 64);
const setUser = (pass) => { for (const u of userBgs) pass.setBindGroup(u.group, u.bindGroup); };
// GPU timing: write begin/end timestamps around each pass (2 query
// slots per pass), then resolve + read back after submit.
const ts = wfEnsureTimestamps();
const capture = !!(ts && !ts.inFlight);
const tsLabels = [];
const beginPass = (label, tsName) => {
const desc = { label };
if (capture && tsLabels.length < WF_TS_MAX_PASSES) {
desc.timestampWrites = {
querySet: ts.querySet,
beginningOfPassWriteIndex: 2 * tsLabels.length,
endOfPassWriteIndex: 2 * tsLabels.length + 1,
};
tsLabels.push(tsName);
}
return enc.beginComputePass(desc);
};
// GENERATE
{
const p = beginPass("wf-generate", "GENERATE");
p.setPipeline(pipe.genPipe);
p.setBindGroup(0, paramsBg, [slotOff(0)]);
p.setBindGroup(1, dataBg);
setUser(p);
p.dispatchWorkgroups(tileX, tileY, 1);
p.end();
}
for (let d = 0; d < depth; d++) {
const prepSlot = 1 + 3 * d + 0;
const traceSlot = 1 + 3 * d + 1;
const shadeSlot = 1 + 3 * d + 2;
// PREP — publish indirect args, zero next counter.
{
const p = beginPass("wf-prep", "PREP");
p.setPipeline(pipe.prepPipe);
p.setBindGroup(0, paramsBg, [slotOff(prepSlot)]);
p.setBindGroup(1, dataBg);
p.setBindGroup(2, indirectBg);
p.dispatchWorkgroups(1, 1, 1);
p.end();
}
// TRACE — indirect over the live ray list.
{
const p = beginPass("wf-trace", "TRACE");
p.setPipeline(pipe.tracePipe);
p.setBindGroup(0, paramsBg, [slotOff(traceSlot)]);
p.setBindGroup(1, dataBg);
p.dispatchWorkgroupsIndirect(wf.indirect, 0);
p.end();
}
// SHADE — indirect; runs user closesthit/miss, may emit + accumulate.
{
const p = beginPass("wf-shade", "SHADE");
p.setPipeline(pipe.shadePipe);
p.setBindGroup(0, paramsBg, [slotOff(shadeSlot)]);
p.setBindGroup(1, dataBg);
setUser(p);
p.dispatchWorkgroupsIndirect(wf.indirect, 0);
p.end();
}
}
// RESOLVE — tonemap accum → output image.
{
const p = beginPass("wf-resolve", "RESOLVE");
p.setPipeline(pipe.resolvePipe);
p.setBindGroup(0, paramsBg, [slotOff(1 + 3 * depth)]);
p.setBindGroup(1, dataBg);
setUser(p);
p.dispatchWorkgroups(tileX, tileY, 1);
p.end();
}
if (capture && tsLabels.length > 0) {
enc.resolveQuerySet(ts.querySet, 0, 2 * tsLabels.length, ts.resolveBuf, 0);
enc.copyBufferToBuffer(ts.resolveBuf, 0, ts.readBuf, 0, 2 * tsLabels.length * 8);
ts.inFlight = true;
ts.pendingLabels = tsLabels;
state.tsReadPending = ts;
}
// Reopen the frame's shared pass so wgpuFrameEnd / later UI work as
// before, and flip ping-pong so the blit picks the texture RESOLVE wrote.
state.pass = enc.beginComputePass();
state.outIsPing = !state.outIsPing;
};
// ── Standalone compute pipelines ────────────────────────────────────────
//
// Mirrors the native Vulkan `ComputeShader` API: user-authored compute
// pipelines that dispatch outside any UI render pass. Unlike
// wgpuLoadCustomShader (which lives inside the UI flow with ping/pong
// textures at @group(1)), these have NO library-supplied bindings —
// users declare all groups themselves.
//
// Layout contract:
// @group(0) @binding(0) uniform PushData // optional, only if pushUniformSize > 0
// @group(1+) @binding(N) // user bindings via UICustomBinding
//
// rayQuery: same flag as wgpuLoadCustomShader. When set, prepends the
// RT prelude + rayQuery library; user shader must NOT redeclare the
// resulting @group(1) layout (tlas/bvh/mesh heaps). The `bindings`
// list must start at @group(2) in that case.
const computePipelines = new Map(); // handle → { pipeline, bgls, byGroup, sortedGroups, pushUniformSize, rayQueryCapable }
env.wgpuLoadComputePipeline = (wgslPtr, wgslLen, pushUniformSize,
bindingsPtr, bindingsCount, rayQueryFlag) => {
if (!rtState.vertHeap && rayQueryFlag) rtInit();
const userWgsl = new TextDecoder().decode(memU8().subarray(wgslPtr, wgslPtr + wgslLen));
const wgsl = rayQueryFlag
// rayQueryLib's _rqTraverseBlas/_rqTraverseTlas call _rtAabb,
// _rtFetchTri, _rtTri from rtWgslPureHelpers — must prepend
// the pure helper subset (NOT the megakernel-only traversal
// routines, which reference user-emitted runAnyHit/runMiss/
// runClosestHit and won't compile outside the raygen pipeline).
? (rtWgslTypes + rtWgslMegakernelBindings + rtWgslPureHelpers + rtWgslRayQueryLib + "\n" + userWgsl)
: userWgsl;
const bindings = [];
if (bindingsCount > 0) {
const dv = new DataView(memU8().buffer, bindingsPtr, bindingsCount * 8);
for (let i = 0; i < bindingsCount; i++) {
bindings.push({
group: dv.getUint8(i*8 + 0),
binding: dv.getUint8(i*8 + 1),
kind: dv.getUint8(i*8 + 2),
pushOffset: dv.getUint32(i*8 + 4, true),
});
}
}
// Bind-group layouts. The push uniform sits at @group(0) if present;
// user bindings start at @group(1) (or @group(2) when rayQuery is on
// — see rtWgslMegakernelBindings).
const bgls = [];
if (pushUniformSize > 0) {
bgls.push(device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE,
buffer: { type: "uniform", minBindingSize: pushUniformSize } },
]}));
} else {
bgls.push(device.createBindGroupLayout({ entries: [] }));
}
if (rayQueryFlag) {
// group(1) is the megakernel RT bindings — same layout as the RT
// pipeline's group(1). The injected rayQuery library reads it.
bgls.push(device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 4, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 5, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 6, visibility: GPUShaderStage.COMPUTE,
storageTexture: { format: "rgba8unorm", access: "write-only", viewDimension: "2d" } },
{ binding: 7, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 8, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 9, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
]}));
}
const byGroup = new Map();
const userGroupBase = rayQueryFlag ? 2 : 1;
for (const b of bindings) {
if (b.group < userGroupBase) {
console.error(`[crafter-wgpu] compute pipeline: @group(${b.group}) reserved; user groups must be >= ${userGroupBase}`);
return 0;
}
if (!byGroup.has(b.group)) byGroup.set(b.group, []);
byGroup.get(b.group).push(b);
}
const sortedGroups = [...byGroup.keys()].sort((a, b) => a - b);
const highest = sortedGroups.length ? sortedGroups[sortedGroups.length - 1] : userGroupBase - 1;
for (let g = userGroupBase; g <= highest; g++) {
if (byGroup.has(g)) {
const entries = byGroup.get(g).map(b => {
const e = { binding: b.binding, visibility: GPUShaderStage.COMPUTE };
if (b.kind === 0) e.buffer = { type: "read-only-storage" };
else if (b.kind === 1) e.texture = { sampleType: "float", viewDimension: "2d" };
else if (b.kind === 2) e.sampler = { type: "filtering" };
else if (b.kind === 3) e.texture = { sampleType: "float", viewDimension: "2d-array" };
else if (b.kind === 4) e.buffer = { type: "storage" }; // read-write storage
return e;
});
bgls.push(device.createBindGroupLayout({ entries }));
} else {
bgls.push(device.createBindGroupLayout({ entries: [] }));
}
}
let pipeline;
try {
const mod = device.createShaderModule({ code: wgsl, label: "compute-pipeline" });
// Async compile-info to surface parse errors with line numbers.
mod.getCompilationInfo().then(info => {
const issues = info.messages.filter(m => m.type === "error" || m.type === "warning");
if (issues.length === 0) return;
const lines = wgsl.split("\n");
for (const m of issues) {
const ln = m.lineNum || 0, col = m.linePos || 0;
console[m.type === "error" ? "error" : "warn"](
`[crafter-wgpu] compute-pipeline ${m.type} at ${ln}:${col}: ${m.message}`);
for (let i = Math.max(1, ln - 3); i <= Math.min(lines.length, ln + 3); i++) {
const marker = i === ln ? ">> " : " ";
console.log(`${marker}${String(i).padStart(4)}: ${lines[i - 1]}`);
}
}
});
const layout = device.createPipelineLayout({ bindGroupLayouts: bgls });
pipeline = device.createComputePipeline({ layout, compute: { module: mod, entryPoint: "main" } });
} catch (e) {
console.error("[crafter-wgpu] compute pipeline compile failed:", e);
return 0;
}
const handle = newHandle();
computePipelines.set(handle, {
pipeline, bgls, byGroup, sortedGroups,
pushUniformSize, userGroupBase,
rayQueryCapable: !!rayQueryFlag,
});
return handle;
};
// Dispatch a standalone compute pipeline. Works both inside the per-frame
// UI compute pass (mid-frame physics tick) and outside it (game-tick
// dispatch from update lambda) — matching the wgpuBuildTLAS pattern.
//
// Push data: if the pipeline declared a push uniform (pushUniformSize > 0
// at load), `pushPtr` points at exactly that many bytes. We allocate a
// transient uniform buffer per dispatch — fine for the physics rate
// (handful of dispatches per substep, single-digit substeps per frame).
//
// Handles: parallel to the UICustomBinding[] declaration order at load
// time. Each handle resolves through the engine's buffer/texture/
// sampler tables to the live WebGPU resource.
env.wgpuDispatchCompute = (pipelineHandle, pushPtr, pushBytes,
handlesPtr, handlesCount, gx, gy, gz) => {
const pipe = computePipelines.get(pipelineHandle);
if (!pipe) {
console.error("[crafter-wgpu] wgpuDispatchCompute: unknown pipeline", pipelineHandle);
return;
}
// ── Push uniform: transient buffer per dispatch ────────────────────
let pushBG = null;
if (pipe.pushUniformSize > 0) {
const buf = device.createBuffer({
size: Math.max(16, (pipe.pushUniformSize + 15) & ~15),
usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
});
if (pushBytes > 0) {
queue.writeBuffer(buf, 0, memU8().subarray(pushPtr, pushPtr + pushBytes));
}
pushBG = device.createBindGroup({
layout: pipe.bgls[0],
entries: [{ binding: 0, resource: { buffer: buf, offset: 0, size: pipe.pushUniformSize } }],
});
} else {
pushBG = device.createBindGroup({ layout: pipe.bgls[0], entries: [] });
}
// ── RT bindings (rayQuery only) ────────────────────────────────────
let rtBG = null;
if (pipe.rayQueryCapable) {
const tlasBuf = buffers.get(rtState.currentTlas);
const orderBuf = buffers.get(rtState.currentEntryOrder);
const bvhBuf = buffers.get(rtState.currentBvh);
if (!tlasBuf || !orderBuf || !bvhBuf) {
console.error("[crafter-wgpu] wgpuDispatchCompute rayQuery: no TLAS built yet");
return;
}
const outView = state.outIsPing ? state.pingView : state.pongView;
rtBG = device.createBindGroup({
layout: pipe.bgls[1],
entries: [
{ binding: 0, resource: { buffer: tlasBuf } },
{ binding: 1, resource: { buffer: rtState.bvhHeap.gpu } },
{ binding: 2, resource: { buffer: rtState.meshRecordsBuffer } },
{ binding: 3, resource: { buffer: rtState.vertHeap.gpu } },
{ binding: 4, resource: { buffer: rtState.indexHeap.gpu } },
{ binding: 5, resource: { buffer: rtState.primRemapHeap.gpu } },
{ binding: 6, resource: outView },
{ binding: 7, resource: { buffer: rtState.attribsHeap.gpu } },
{ binding: 8, resource: { buffer: orderBuf } },
{ binding: 9, resource: { buffer: bvhBuf } },
],
});
}
// ── User bind groups from handles ──────────────────────────────────
const handleArr = new Uint32Array(memU8().buffer, handlesPtr, handlesCount);
const userBGs = [];
let handleCursor = 0;
const userGroupBase = pipe.userGroupBase;
const userBglStart = pipe.bgls.length - (pipe.sortedGroups.length
? (pipe.sortedGroups[pipe.sortedGroups.length - 1] - userGroupBase + 1)
: 0);
let bglIdx = userBglStart;
if (pipe.sortedGroups.length > 0) {
for (let g = userGroupBase; g <= pipe.sortedGroups[pipe.sortedGroups.length - 1]; g++) {
if (pipe.byGroup.has(g)) {
const entries = pipe.byGroup.get(g).map(b => {
const h = handleArr[handleCursor++];
let resource;
if (b.kind === 0 || b.kind === 4) resource = { buffer: buffers.get(h) };
else if (b.kind === 1) resource = textureViews.get(h);
else if (b.kind === 2) resource = samplers.get(h);
else if (b.kind === 3) resource = textureViews.get(h);
return { binding: b.binding, resource };
});
userBGs.push(device.createBindGroup({ layout: pipe.bgls[bglIdx++], entries }));
} else {
userBGs.push(device.createBindGroup({ layout: pipe.bgls[bglIdx++], entries: [] }));
}
}
}
// ── Execute: attach to active pass, or create a standalone one. ────
const runDispatch = (pass) => {
pass.setPipeline(pipe.pipeline);
pass.setBindGroup(0, pushBG);
if (rtBG) pass.setBindGroup(1, rtBG);
for (let i = 0; i < userBGs.length; i++) {
pass.setBindGroup(userGroupBase + i, userBGs[i]);
}
pass.dispatchWorkgroups(gx, gy, gz);
};
if (state.pass) {
// Mid-frame: dispatch into the current UI compute pass.
runDispatch(state.pass);
} else {
// Standalone: build encoder + pass + submit. Mirrors wgpuBuildTLAS.
const enc = state.encoder || device.createCommandEncoder();
const ownEncoder = !state.encoder;
const pass = enc.beginComputePass({ label: "plain-compute" });
runDispatch(pass);
pass.end();
if (ownEncoder) queue.submit([enc.finish()]);
}
};
console.log("[crafter-wgpu] init complete; env handlers wired");
// Memory diagnostic. Logs handle-table sizes every 5s so a slow leak
// shows up as monotonic growth. Comment out for production builds.
setInterval(() => {
const m = (performance.memory && performance.memory.usedJSHeapSize) || 0;
console.log(`[crafter-wgpu] mem: js=${(m / 1024 / 1024).toFixed(1)}MB`
+ ` buffers=${buffers.size}`
+ ` textures=${textures.size}`
+ ` samplers=${samplers.size}`
+ ` customPipelines=${customPipelines.size}`
+ ` rtPipelines=${rtPipelines.size}`
+ ` computePipelines=${computePipelines.size}`
+ ` bindGroupCache=${state.bindGroupCache ? state.bindGroupCache.size : 0}`);
}, 5000);
} catch (e) {
// Capture any throw so the stub error messages name the real cause
// instead of "(no error captured)". Re-throw so runtime.js's catch
// also logs it.
initError = e instanceof Error ? e : new Error(String(e));
console.error("[crafter-wgpu] init failed:", initError);
throw initError;
}
})(); // end window.crafter_webbuild_env_ready