Crafter.Graphics/additional/dom-webgpu.js

3104 lines
135 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
Crafter.Graphics WebGPU bridge — DOM mode UI compute pipeline.
Surface model (high-level, deliberately not 1:1 with WebGPU):
- JS owns the GPUDevice/queue/compute pipelines/bind-group cache.
- C++ owns slot allocation and per-frame logic; it calls into ~15 imports.
- Standard UI shaders are embedded as WGSL strings at the bottom of this
file and compiled once at startup.
Ping-pong output strategy (per Decision 2 in plan):
- Two rgba8unorm storage textures sized to the canvas.
- Each Dispatch reads `prev` (sampled), writes `out` (storage, write-only).
- JS swaps the two between dispatches, so item-order overdraw works.
- At frame end, the current `out` is blitted to the canvas via
copyTextureToTexture (canvas configured as rgba8unorm to match).
This file is loaded as <script type="module">. Top-level await blocks
runtime.js's _start() until adapter + device are resolved, so by the time
main() runs every import here is fully wired.
*/
// ─── env stubs (assigned synchronously, BEFORE any async work) ────────────
// The wasm module's import-link step needs every declared wgpu* import to
// resolve to a Function. If init below throws, the stubs stay in place so
// the wasm still links — and the call site gets a clear error at runtime
// instead of "import object field X is not a Function" at link time.
window.crafter_webbuild_env = window.crafter_webbuild_env || {};
window.crafter_webbuild_env.table = window.crafter_webbuild_env.table
|| new WebAssembly.Table({ initial: 4, element: "anyfunc" });
let initError = null;
function stub(name) {
return (...args) => {
const msg = `[crafter-wgpu] ${name}() called but WebGPU init failed: ${initError?.message ?? "(no error captured)"}`;
console.error(msg);
throw new Error(msg);
};
}
{
const e = window.crafter_webbuild_env;
for (const n of [
"wgpuGetCanvasWidth", "wgpuGetCanvasHeight", "wgpuSurfaceWidth", "wgpuSurfaceHeight",
"wgpuInit", "wgpuCreateBuffer", "wgpuWriteBuffer", "wgpuWriteBufferRange",
"wgpuReadbackEnqueue", "wgpuReadbackPoll", "wgpuReadbackReady", "wgpuDestroyBuffer",
"wgpuCreateAtlasTexture", "wgpuWriteAtlasRegion", "wgpuDestroyTexture",
"wgpuCreateImage2D", "wgpuWriteImage2D",
"wgpuCreateImage2DArray", "wgpuWriteImage2DLayer",
"wgpuCreateLinearClampSampler", "wgpuCreateLinearRepeatSampler",
"wgpuFrameBegin", "wgpuFrameEnd",
"wgpuDispatchQuads", "wgpuDispatchCircles", "wgpuDispatchImages", "wgpuDispatchText",
"wgpuLoadCustomShader", "wgpuDispatchCustom",
"wgpuRegisterMeshBLAS", "wgpuLoadRTPipeline", "wgpuDispatchRT", "wgpuBuildTLAS",
"wgpuLoadComputePipeline", "wgpuDispatchCompute",
]) {
// Read-write ints don't need a stub-throw; return 0 for the size queries.
e[n] = n.endsWith("Width") || n.endsWith("Height")
? () => 0
: (n === "wgpuRegisterMeshBLAS" ? () => 0 : stub(n));
}
}
// ─── canvas + device init (runs before _start) ───────────────────────────
// Wrapped in an async IIFE assigned to window.crafter_webbuild_env_ready so
// the runtime.js shim can `await` it explicitly before calling _start().
// Sibling <script type="module"> top-level awaits are NOT reliably
// serialized in Firefox (verified 2026-05), so we can't depend on this
// file's TLA to block runtime.js by itself.
window.crafter_webbuild_env_ready = (async () => {
try {
if (!navigator.gpu) {
document.body.innerHTML = "<p style=\"font-family:sans-serif;padding:24px\">"
+ "WebGPU not available in this browser. Try Chrome 121+ / Firefox 141+ / Safari 26+.</p>";
initError = new Error("WebGPU unavailable");
throw initError;
}
const canvas = document.createElement("canvas");
canvas.id = "crafter-canvas";
canvas.style.cssText = "position:fixed;inset:0;width:100vw;height:100vh;display:block;";
document.body.style.margin = "0";
document.body.appendChild(canvas);
function syncCanvasSize() {
// Canvas pixel size = CSS size × devicePixelRatio so the GPU draws
// at physical pixel resolution on HiDPI displays — otherwise the
// browser upscales whatever we rendered at logical size and the
// result looks blurry. The CSS rule still pins display size at
// 100vw/100vh, so the canvas paints its physical buffer back down
// into logical pixels with no perceived layout change.
//
// Mouse events arrive in CSS pixels; dom-env.js multiplies them by
// window.crafter_dpr before dispatching so the wasm-side hit tests
// share the physical-pixel coordinate space with window.width/.height.
const dpr = window.devicePixelRatio || 1;
window.crafter_dpr = dpr;
const w = Math.max(1, Math.round(window.innerWidth * dpr));
const h = Math.max(1, Math.round(window.innerHeight * dpr));
if (canvas.width !== w) canvas.width = w;
if (canvas.height !== h) canvas.height = h;
return { w, h };
}
syncCanvasSize();
const adapter = await navigator.gpu.requestAdapter();
if (!adapter) {
initError = new Error("navigator.gpu.requestAdapter() returned null (no compatible adapter)");
console.error("[crafter-wgpu]", initError.message);
throw initError;
}
// Ask for everything the adapter is willing to give us, up to the values
// the RT pipeline actually needs. The megakernel prelude declares 7
// storage buffers at group(1) (tlasEntries / bvhNodes / meshRecords /
// vertices / indices / primRemap / vertexAttribs); user pipelines like
// 3DForts add more at group(2), and the WebGPU baseline of 8 isn't
// enough. Adapters routinely report 10+ — clamp our request to whatever
// the adapter actually supports so the call doesn't reject on baseline-
// only devices. Same pattern for storage textures (we use 1 output image
// per dispatch but headroom is cheap) and for the global storage-buffer
// pool which is the per-pipeline count's parent budget.
const adapterLimits = adapter.limits || {};
const requiredLimits = {};
const clamp = (name, want) => {
const cap = adapterLimits[name];
if (typeof cap === "number" && cap > 0) {
requiredLimits[name] = Math.min(want, cap);
}
};
clamp("maxStorageBuffersPerShaderStage", 16);
clamp("maxStorageBuffersInPipelineLayout", 16);
clamp("maxStorageTexturesPerShaderStage", 8);
// The TLAS BVH build runs one workgroup of up to N threads in shared
// memory (bitonic sort over morton codes + sweep-tree refit). Need the
// per-workgroup invocation cap raised from the default 256.
clamp("maxComputeInvocationsPerWorkgroup", 1024);
clamp("maxComputeWorkgroupSizeX", 1024);
const device = await adapter.requestDevice({ requiredLimits });
const queue = device.queue;
const ctx = canvas.getContext("webgpu");
const canvasFormat = "rgba8unorm"; // match storage textures, skip swizzle blit
ctx.configure({ device, format: canvasFormat, alphaMode: "opaque",
usage: GPUTextureUsage.RENDER_ATTACHMENT | GPUTextureUsage.COPY_DST });
device.lost.then((info) => {
console.error("[crafter-wgpu] device lost:", info.message);
state.gpuLost = true;
});
// ─── handle tables ─────────────────────────────────────────────────────
const buffers = new Map(); // handle → GPUBuffer
const textures = new Map(); // handle → GPUTexture
const textureViews = new Map(); // handle → GPUTextureView (mirrors textures key for the view)
const samplers = new Map(); // handle → GPUSampler
let nextHandle = 1;
function newHandle() { return nextHandle++; }
// ─── ping-pong storage textures ────────────────────────────────────────
const state = {
pingTex: null, pingView: null,
pongTex: null, pongView: null,
outIsPing: true, // current "out" target
width: 0, height: 0,
encoder: null,
pass: null,
headerRing: null, // GPUBuffer; uniform header writes ring through this
headerRingSize: 0,
headerRingOffset: 0,
bindGroupCache: new Map(), // key → GPUBindGroup
gpuLost: false,
};
function recreatePingPong(w, h) {
const usage = GPUTextureUsage.STORAGE_BINDING
| GPUTextureUsage.TEXTURE_BINDING
| GPUTextureUsage.COPY_SRC
| GPUTextureUsage.COPY_DST; // COPY_DST so we can clear it
if (state.pingTex) state.pingTex.destroy();
if (state.pongTex) state.pongTex.destroy();
state.pingTex = device.createTexture({ size: [w, h], format: "rgba8unorm", usage });
state.pongTex = device.createTexture({ size: [w, h], format: "rgba8unorm", usage });
state.pingView = state.pingTex.createView();
state.pongView = state.pongTex.createView();
state.width = w; state.height = h;
state.outIsPing = true;
state.bindGroupCache.clear();
}
function ensureSized() {
const { w, h } = syncCanvasSize();
if (w !== state.width || h !== state.height) {
recreatePingPong(w, h);
// Notify the wasm side that the surface size changed so it can
// fire onResize through Window. The wasm export is added by
// Crafter.Graphics-Window.cpp.
const onResize = wasmExports && wasmExports.__crafterDom_resize;
if (onResize) onResize(1, w, h);
}
}
// Header ring buffer: 256-byte-aligned slots holding UIDispatchHeader (48
// bytes of meaningful data, padded to 256). Wraps at frame boundary.
const HEADER_ALIGN = 256;
const HEADER_RING_SLOTS = 64;
state.headerRingSize = HEADER_ALIGN * HEADER_RING_SLOTS;
state.headerRing = device.createBuffer({
size: state.headerRingSize,
usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
});
// ─── pipelines ─────────────────────────────────────────────────────────
const wgslShared = String.raw`
struct UIDispatchHeader {
outImage: u32,
itemBuffer: u32,
surfaceW: u32,
surfaceH: u32,
clipX: f32,
clipY: f32,
clipW: f32,
clipH: f32,
itemCount: u32,
frameIdx: u32,
flags: u32,
_pad: u32,
};
@group(0) @binding(0) var<uniform> hdr : UIDispatchHeader;
@group(1) @binding(0) var outTex : texture_storage_2d<rgba8unorm, write>;
@group(1) @binding(1) var prevTex : texture_2d<f32>;
fn uiResolvePixel(coord: vec2<u32>) -> bool {
if (coord.x >= hdr.surfaceW || coord.y >= hdr.surfaceH) { return false; }
let fx = f32(coord.x); let fy = f32(coord.y);
if (fx < hdr.clipX || fy < hdr.clipY) { return false; }
if (fx >= hdr.clipX + hdr.clipW) { return false; }
if (fy >= hdr.clipY + hdr.clipH) { return false; }
return true;
}
fn uiBlendOver(dst: vec4<f32>, src: vec4<f32>) -> vec4<f32> {
let a = clamp(src.a, 0.0, 1.0);
let rgb = mix(dst.rgb, src.rgb, vec3<f32>(a));
let outA = a + dst.a * (1.0 - a);
return vec4<f32>(rgb, outA);
}
fn uiSdRoundRect(p: vec2<f32>, halfSize: vec2<f32>, r4: vec4<f32>) -> f32 {
var r: vec4<f32> = r4;
// Pick radius for the quadrant p is in. r order: (TL, TR, BR, BL).
let rx = select(r.x, r.z, p.x > 0.0);
let ry = select(r.w, r.y, p.x > 0.0);
let radius = select(ry, rx, p.y > 0.0);
let q = abs(p) - halfSize + vec2<f32>(radius);
return min(max(q.x, q.y), 0.0) + length(max(q, vec2<f32>(0.0))) - radius;
}
`;
const wgslQuads = wgslShared + String.raw`
struct QuadItem {
rect: vec4<f32>,
color: vec4<f32>,
corners: vec4<f32>,
outline: vec4<f32>,
};
@group(2) @binding(0) var<storage, read> items : array<QuadItem>;
@compute @workgroup_size(8, 8, 1)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
if (!uiResolvePixel(gid.xy)) { return; }
let coord = vec2<i32>(i32(gid.x), i32(gid.y));
var dst = textureLoad(prevTex, coord, 0);
let sp = vec2<f32>(f32(gid.x) + 0.5, f32(gid.y) + 0.5);
for (var i: u32 = 0u; i < hdr.itemCount; i = i + 1u) {
let it = items[i];
let lo = it.rect.xy;
let hi = it.rect.xy + it.rect.zw;
if (sp.x < lo.x || sp.y < lo.y || sp.x >= hi.x || sp.y >= hi.y) { continue; }
let halfSize = it.rect.zw * 0.5;
let p = sp - (it.rect.xy + halfSize);
let d = uiSdRoundRect(p, halfSize, it.corners);
let bodyA = clamp(0.5 - d, 0.0, 1.0);
if (bodyA <= 0.0 && it.outline.x <= 0.0) { continue; }
var src = vec4<f32>(it.color.rgb, it.color.a * bodyA);
if (it.outline.x > 0.0) {
let t = abs(d + it.outline.x * 0.5) - it.outline.x * 0.5;
let outlineA = clamp(0.5 - t, 0.0, 1.0);
src = vec4<f32>(mix(src.rgb, it.outline.yzw, vec3<f32>(outlineA)),
max(src.a, outlineA));
}
if (src.a <= 0.0) { continue; }
dst = uiBlendOver(dst, src);
}
textureStore(outTex, coord, dst);
}
`;
const wgslCircles = wgslShared + String.raw`
struct CircleItem {
centerRadius: vec4<f32>,
color: vec4<f32>,
outline: vec4<f32>,
};
@group(2) @binding(0) var<storage, read> items : array<CircleItem>;
@compute @workgroup_size(8, 8, 1)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
if (!uiResolvePixel(gid.xy)) { return; }
let coord = vec2<i32>(i32(gid.x), i32(gid.y));
var dst = textureLoad(prevTex, coord, 0);
let sp = vec2<f32>(f32(gid.x) + 0.5, f32(gid.y) + 0.5);
for (var i: u32 = 0u; i < hdr.itemCount; i = i + 1u) {
let it = items[i];
let center = it.centerRadius.xy;
let radius = it.centerRadius.z;
let d = length(sp - center) - radius;
let bodyA = clamp(0.5 - d, 0.0, 1.0);
if (bodyA <= 0.0 && it.outline.x <= 0.0) { continue; }
var src = vec4<f32>(it.color.rgb, it.color.a * bodyA);
if (it.outline.x > 0.0) {
let t = abs(d + it.outline.x * 0.5) - it.outline.x * 0.5;
let outlineA = clamp(0.5 - t, 0.0, 1.0);
src = vec4<f32>(mix(src.rgb, it.outline.yzw, vec3<f32>(outlineA)),
max(src.a, outlineA));
}
if (src.a <= 0.0) { continue; }
dst = uiBlendOver(dst, src);
}
textureStore(outTex, coord, dst);
}
`;
const wgslImages = wgslShared + String.raw`
struct ImageItem {
rect: vec4<f32>,
uv: vec4<f32>,
tint: vec4<f32>,
slots: vec4<u32>,
};
@group(2) @binding(0) var<storage, read> items : array<ImageItem>;
@group(3) @binding(0) var imgTex : texture_2d<f32>;
@group(3) @binding(1) var imgSampler : sampler;
@compute @workgroup_size(8, 8, 1)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
if (!uiResolvePixel(gid.xy)) { return; }
let coord = vec2<i32>(i32(gid.x), i32(gid.y));
var dst = textureLoad(prevTex, coord, 0);
let sp = vec2<f32>(f32(gid.x) + 0.5, f32(gid.y) + 0.5);
for (var i: u32 = 0u; i < hdr.itemCount; i = i + 1u) {
let it = items[i];
let lo = it.rect.xy;
let hi = it.rect.xy + it.rect.zw;
if (sp.x < lo.x || sp.y < lo.y || sp.x >= hi.x || sp.y >= hi.y) { continue; }
let t = (sp - lo) / it.rect.zw;
let uv = vec2<f32>(mix(it.uv.x, it.uv.z, t.x), mix(it.uv.y, it.uv.w, t.y));
let sample = textureSampleLevel(imgTex, imgSampler, uv, 0.0);
let src = sample * it.tint;
if (src.a <= 0.0) { continue; }
dst = uiBlendOver(dst, src);
}
textureStore(outTex, coord, dst);
}
`;
const wgslText = wgslShared + String.raw`
struct GlyphItem {
rect: vec4<f32>,
uv: vec4<f32>,
color: vec4<f32>,
};
@group(2) @binding(0) var<storage, read> items : array<GlyphItem>;
@group(3) @binding(0) var atlasTex : texture_2d<f32>;
@group(3) @binding(1) var atlasSampler : sampler;
@compute @workgroup_size(8, 8, 1)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
if (!uiResolvePixel(gid.xy)) { return; }
let coord = vec2<i32>(i32(gid.x), i32(gid.y));
var dst = textureLoad(prevTex, coord, 0);
let sp = vec2<f32>(f32(gid.x) + 0.5, f32(gid.y) + 0.5);
for (var i: u32 = 0u; i < hdr.itemCount; i = i + 1u) {
let it = items[i];
let lo = it.rect.xy;
let hi = it.rect.xy + it.rect.zw;
if (sp.x < lo.x || sp.y < lo.y || sp.x >= hi.x || sp.y >= hi.y) { continue; }
let t = (sp - lo) / it.rect.zw;
let uv = vec2<f32>(mix(it.uv.x, it.uv.z, t.x), mix(it.uv.y, it.uv.w, t.y));
// stb_truetype SDF: pixel value ~128 is the edge. Treat alpha as
// the smoothed step around that midpoint.
let sdf = textureSampleLevel(atlasTex, atlasSampler, uv, 0.0).r;
let alpha = clamp((sdf - 0.5) * 8.0 + 0.5, 0.0, 1.0);
if (alpha <= 0.0) { continue; }
let src = vec4<f32>(it.color.rgb, it.color.a * alpha);
dst = uiBlendOver(dst, src);
}
textureStore(outTex, coord, dst);
}
`;
function makePipeline(label, wgsl, hasGroup3) {
const mod = device.createShaderModule({ label, code: wgsl });
// Layout: group 0 uniform header, group 1 (out storage + prev sampled),
// group 2 storage items SSBO, optional group 3 (texture + sampler).
const bgl0 = device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: "uniform", hasDynamicOffset: true, minBindingSize: 48 } },
]});
const bgl1 = device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE, storageTexture: { format: "rgba8unorm", access: "write-only", viewDimension: "2d" } },
{ binding: 1, visibility: GPUShaderStage.COMPUTE, texture: { sampleType: "float", viewDimension: "2d" } },
]});
const bgl2 = device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
]});
const layouts = [bgl0, bgl1, bgl2];
let bgl3 = null;
if (hasGroup3) {
bgl3 = device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE, texture: { sampleType: "float", viewDimension: "2d" } },
{ binding: 1, visibility: GPUShaderStage.COMPUTE, sampler: { type: "filtering" } },
]});
layouts.push(bgl3);
}
const pl = device.createPipelineLayout({ bindGroupLayouts: layouts });
const pipeline = device.createComputePipeline({
layout: pl,
compute: { module: mod, entryPoint: "main" },
});
return { pipeline, bgl0, bgl1, bgl2, bgl3 };
}
const pipeQuads = makePipeline("ui-quads", wgslQuads, false);
const pipeCircles = makePipeline("ui-circles", wgslCircles, false);
const pipeImages = makePipeline("ui-images", wgslImages, true);
const pipeText = makePipeline("ui-text", wgslText, true);
// Bind groups for group 0 (header uniform with dynamic offset) — one per
// pipeline, references the same ring buffer.
function makeHdrBG(pipe) {
return device.createBindGroup({
layout: pipe.bgl0,
entries: [{ binding: 0, resource: { buffer: state.headerRing, offset: 0, size: 48 } }],
});
}
const hdrBG = {
quads: makeHdrBG(pipeQuads),
circles: makeHdrBG(pipeCircles),
images: makeHdrBG(pipeImages),
text: makeHdrBG(pipeText),
};
// Group 1 changes between dispatches because `out` and `prev` swap on the
// ping-pong. Cached by current ping-pong direction and texture size; the
// stored bind group is reusable across all pipelines that share a
// layout-compatible bgl1 (all standard pipelines and custom shaders do,
// since they declare identical group-1 entries per the contract).
function getGroup1BG(bgl1) {
const key = `g1/${state.outIsPing ? 1 : 0}/${state.width}x${state.height}`;
let bg = state.bindGroupCache.get(key);
if (bg) return bg;
const outView = state.outIsPing ? state.pingView : state.pongView;
const prevView = state.outIsPing ? state.pongView : state.pingView;
bg = device.createBindGroup({
layout: bgl1,
entries: [
{ binding: 0, resource: outView },
{ binding: 1, resource: prevView },
],
});
state.bindGroupCache.set(key, bg);
return bg;
}
function getGroup2BG(pipe, itemsHandle) {
const key = `items/${pipe === pipeQuads ? "q" : pipe === pipeCircles ? "c" : pipe === pipeImages ? "i" : "t"}/${itemsHandle}`;
let bg = state.bindGroupCache.get(key);
if (bg) return bg;
const buf = buffers.get(itemsHandle);
if (!buf) throw new Error(`getGroup2BG: unknown items buffer ${itemsHandle}`);
bg = device.createBindGroup({
layout: pipe.bgl2,
entries: [{ binding: 0, resource: { buffer: buf } }],
});
state.bindGroupCache.set(key, bg);
return bg;
}
function getGroup3BG(pipe, texHandle, sampHandle) {
const key = `t3/${texHandle}/${sampHandle}/${pipe === pipeImages ? "i" : "x"}`;
let bg = state.bindGroupCache.get(key);
if (bg) return bg;
const tex = textureViews.get(texHandle);
const sam = samplers.get(sampHandle);
if (!tex || !sam) throw new Error(`getGroup3BG: unknown view ${texHandle} / sampler ${sampHandle}`);
bg = device.createBindGroup({
layout: pipe.bgl3,
entries: [
{ binding: 0, resource: tex },
{ binding: 1, resource: sam },
],
});
state.bindGroupCache.set(key, bg);
return bg;
}
// ─── wasm import surface ───────────────────────────────────────────────
let wasmExports = null;
// Crafter.Build's runtime.js exposes the wasi instance on
// window.crafter_wasi after instantiation. We grab the exports lazily so
// every import-side function works regardless of call order. memU8 /
// memF32 / memU32 always re-derive the typed-array view because the
// wasm memory's backing ArrayBuffer is detached and replaced whenever
// the wasm grows its memory; caching a typed array would alias to
// freed memory after a grow.
function getExports() {
if (wasmExports) return wasmExports;
const wasi = window.crafter_wasi;
if (!wasi || !wasi.instance) {
throw new Error("[crafter-wgpu] wasm exports not available yet (called too early)");
}
wasmExports = wasi.instance.exports;
return wasmExports;
}
function memU8() { return new Uint8Array(getExports().memory.buffer); }
function memF32() { return new Float32Array(getExports().memory.buffer); }
function memU32() { return new Uint32Array(getExports().memory.buffer); }
// Stubs were assigned at the top of this file; we now overwrite them with
// real implementations now that init has succeeded.
const env = window.crafter_webbuild_env;
env.wgpuGetCanvasWidth = () => canvas.width;
env.wgpuGetCanvasHeight = () => canvas.height;
env.wgpuCreateBuffer = (byteSize) => {
const h = newHandle();
const buf = device.createBuffer({
size: Math.max(16, byteSize),
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC,
});
buffers.set(h, buf);
return h;
};
env.wgpuWriteBuffer = (handle, srcPtr, byteSize) => {
state.writeBufferCount = (state.writeBufferCount || 0) + 1;
state.lastWriteHandle = handle;
state.lastWriteSize = byteSize;
const buf = buffers.get(handle);
if (!buf) return;
// writeBuffer requires a multiple of 4 bytes and an aligned offset.
const aligned = (byteSize + 3) & ~3;
queue.writeBuffer(buf, 0, memU8().buffer, srcPtr, aligned);
};
// Partial write — copies a sub-range of `srcPtr` into the GPU buffer at
// `dstByteOffset`. Used by BuildTLAS to skip the transform field of
// GPU-owned instances; the physics-tlas-transform compute shader is the
// sole writer of those bytes and we must not clobber its output with a
// stale CPU mirror.
env.wgpuWriteBufferRange = (handle, dstByteOffset, srcPtr, byteSize) => {
const buf = buffers.get(handle);
if (!buf) return;
const aligned = (byteSize + 3) & ~3;
queue.writeBuffer(buf, dstByteOffset, memU8().buffer, srcPtr, aligned);
};
// ── GPU→CPU readback (staging + mapAsync) ──────────────────────────────
//
// WebGPU storage buffers can't be CPU-mapped directly (STORAGE usage is
// incompatible with MAP_READ). Each readback keeps a parallel staging
// buffer (MAP_READ | COPY_DST) at the same size. wgpuReadbackEnqueue
// copies the storage buffer into the staging buffer and kicks off an
// async map. wgpuReadbackPoll synchronously returns whether the map has
// resolved; if so it copies the bytes into the caller's wasm pointer
// and the slot is ready for the next Enqueue.
//
// Used by Forts3D's physics event drain to read the GPU-written
// destroy/hit/splash event queues with a one-frame latency.
const READBACK_IDLE = 0;
const READBACK_PENDING = 1;
const READBACK_READY = 2;
const readbacks = new Map(); // device-buffer handle → { staging, size, state, pendingData }
// Readbacks scheduled this frame that still need their mapAsync kicked
// off — done after the frame's queue.submit so the map waits for the
// compute writes that wrote to `buf` to finish, not just the standalone
// copy encoder.
const pendingReadbackMaps = [];
env.wgpuReadbackEnqueue = (handle, byteSize, resetBytes) => {
const buf = buffers.get(handle);
if (!buf) return;
const aligned = (byteSize + 3) & ~3;
const resetAligned = resetBytes > 0 ? ((resetBytes + 3) & ~3) : 0;
let rb = readbacks.get(handle);
if (!rb) {
rb = {
staging: device.createBuffer({
size: Math.max(16, aligned),
usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST,
}),
size: aligned,
state: READBACK_IDLE,
pendingData: null,
};
readbacks.set(handle, rb);
}
if (rb.state !== READBACK_IDLE) {
// Previous map still in flight (or has data nobody polled yet);
// skip this enqueue AND the paired reset. Events written by the
// current frame's substeps will accumulate in `buf` and get
// picked up by the next successful enqueue (which captures the
// post-accumulation count and then clears it). Resetting here
// without copying would wipe those events before they could be
// drained, causing destroyed projectiles to never reach the CPU.
return;
}
if (state.encoder) {
// Mid-frame: piggy-back the copy on the frame's main encoder so
// it runs AFTER the compute dispatches that wrote `buf`. End the
// active pass, encode the copy (+ optional reset), reopen the
// pass. mapAsync gets deferred to wgpuFrameEnd (after
// queue.submit) — the map resolves once the GPU has finished
// the submitted work, including our just-encoded copy.
if (state.pass) {
state.pass.end();
state.pass = null;
}
state.encoder.copyBufferToBuffer(buf, 0, rb.staging, 0, aligned);
if (resetAligned > 0) {
// Clear the first `resetAligned` bytes of `buf` so the next
// frame's atomic-add starts at 0. Encoded after the copy,
// so the staging captures the pre-clear count. Tied to a
// successful enqueue — if the enqueue was skipped above the
// clear is skipped too, preserving events for the next
// successful drain.
state.encoder.clearBuffer(buf, 0, resetAligned);
}
state.pass = state.encoder.beginComputePass();
rb.state = READBACK_PENDING;
pendingReadbackMaps.push(rb);
} else {
// Standalone (no frame in progress): submit our own encoder and
// kick off mapAsync immediately.
const enc = device.createCommandEncoder();
enc.copyBufferToBuffer(buf, 0, rb.staging, 0, aligned);
if (resetAligned > 0) enc.clearBuffer(buf, 0, resetAligned);
queue.submit([enc.finish()]);
rb.state = READBACK_PENDING;
rb.staging.mapAsync(GPUMapMode.READ).then(() => {
rb.pendingData = new Uint8Array(rb.staging.getMappedRange()).slice();
rb.staging.unmap();
rb.state = READBACK_READY;
}).catch(e => {
console.error("[crafter-wgpu] readback mapAsync failed:", e);
rb.state = READBACK_IDLE;
});
}
};
// Returns 1 if the readback for `handle` has completed and bytes were
// copied into `dstPtr`; returns 0 otherwise (caller retries next frame).
// After a successful poll the slot is idle again, ready for the next
// Enqueue.
env.wgpuReadbackPoll = (handle, dstPtr, byteSize) => {
const rb = readbacks.get(handle);
if (!rb || rb.state !== READBACK_READY) return 0;
memU8().set(rb.pendingData.subarray(0, byteSize), dstPtr);
rb.pendingData = null;
rb.state = READBACK_IDLE;
return 1;
};
// Non-consuming readiness check. Lets callers verify multiple readbacks
// have all resolved before consuming any — needed when a logical drain
// reads parallel header/array buffers and mustn't ack the header until
// the array bytes are also available, or the events get lost.
env.wgpuReadbackReady = (handle) => {
const rb = readbacks.get(handle);
return (rb && rb.state === READBACK_READY) ? 1 : 0;
};
env.wgpuDestroyBuffer = (handle) => {
const buf = buffers.get(handle);
if (buf) { buf.destroy(); buffers.delete(handle); }
// Invalidate any cached bind group that referenced this handle.
for (const k of state.bindGroupCache.keys()) {
if (k.startsWith("items/") && k.endsWith("/" + handle)) {
state.bindGroupCache.delete(k);
}
}
};
env.wgpuCreateAtlasTexture = (w, h) => {
const handle = newHandle();
const tex = device.createTexture({
size: [w, h],
format: "r8unorm",
usage: GPUTextureUsage.TEXTURE_BINDING | GPUTextureUsage.COPY_DST,
});
textures.set(handle, tex);
textureViews.set(handle, tex.createView());
return handle;
};
env.wgpuWriteAtlasRegion = (handle, srcPtr, srcW, srcH, srcBytesPerRow, dstX, dstY, copyW, copyH) => {
const tex = textures.get(handle);
if (!tex) return;
// For r8unorm, 1 byte per pixel; writeTexture requires bytesPerRow >= 256
// OR == width if width*1 % 256 === 0 — for arbitrary widths we need to
// re-pack into a 256-aligned staging buffer.
const alignedBPR = Math.max(256, (srcBytesPerRow + 255) & ~255);
if (alignedBPR === srcBytesPerRow) {
const bytes = memU8().subarray(srcPtr + dstY * srcBytesPerRow + dstX,
srcPtr + (dstY + copyH) * srcBytesPerRow);
queue.writeTexture(
{ texture: tex, origin: { x: dstX, y: dstY } },
bytes,
{ bytesPerRow: srcBytesPerRow, rowsPerImage: copyH },
{ width: copyW, height: copyH }
);
} else {
// Repack copyW × copyH starting at (dstX, dstY) in the source.
const staging = new Uint8Array(alignedBPR * copyH);
const src = memU8();
for (let y = 0; y < copyH; y++) {
const srcRow = (dstY + y) * srcBytesPerRow + dstX;
staging.set(src.subarray(srcPtr + srcRow, srcPtr + srcRow + copyW),
y * alignedBPR);
}
queue.writeTexture(
{ texture: tex, origin: { x: dstX, y: dstY } },
staging,
{ bytesPerRow: alignedBPR, rowsPerImage: copyH },
{ width: copyW, height: copyH }
);
}
};
env.wgpuDestroyTexture = (handle) => {
const tex = textures.get(handle);
if (tex) { tex.destroy(); textures.delete(handle); textureViews.delete(handle); }
};
// General-purpose 2D rgba8unorm texture, used by Image2D<RGBA8>. Distinct
// from the atlas path (r8unorm, sub-region writes) — this one's a one-shot
// upload of a whole image, sized to the pixel data the caller hands over.
env.wgpuCreateImage2D = (w, h) => {
const handle = newHandle();
const tex = device.createTexture({
size: [w, h],
format: "rgba8unorm",
usage: GPUTextureUsage.TEXTURE_BINDING | GPUTextureUsage.COPY_DST,
});
textures.set(handle, tex);
textureViews.set(handle, tex.createView());
return handle;
};
// 2D texture array — N layers of identical (w × h) rgba8unorm with
// `mipLevels` mip levels. Pass mipLevels=1 for a single-level texture.
// Used by Image2DArray<RGBA8> to back one material albedo per layer;
// shaders sample with `textureSampleLevel(tex, samp, uv, layerIdx, lod)`
// where lod ∈ [0, mipLevels-1].
env.wgpuCreateImage2DArray = (w, h, layerCount, mipLevels) => {
const handle = newHandle();
const mips = (typeof mipLevels === "number" && mipLevels > 0) ? mipLevels : 1;
const tex = device.createTexture({
size: [w, h, layerCount],
dimension: "2d",
format: "rgba8unorm",
mipLevelCount: mips,
usage: GPUTextureUsage.TEXTURE_BINDING | GPUTextureUsage.COPY_DST,
});
textures.set(handle, tex);
textureViews.set(handle, tex.createView({
dimension: "2d-array",
arrayLayerCount: layerCount,
mipLevelCount: mips,
}));
return handle;
};
// Upload a single mip level of one array layer. `level` indexes into the
// texture's mip chain; `w` / `h` are the dimensions at that level (= base
// dimensions >> level). Caller supplies the pre-downsampled bytes for
// each level — Image2DArray::UpdateLayer on the C++ side does the box-
// filter chain.
env.wgpuWriteImage2DLayer = (handle, layer, level, srcPtr, byteSize, w, h) => {
const tex = textures.get(handle);
if (!tex) return;
const srcBPR = w * 4;
const alignedBPR = (srcBPR + 255) & ~255;
if (alignedBPR === srcBPR) {
queue.writeTexture(
{ texture: tex, mipLevel: level, origin: [0, 0, layer] },
memU8().subarray(srcPtr, srcPtr + byteSize),
{ bytesPerRow: srcBPR, rowsPerImage: h },
{ width: w, height: h, depthOrArrayLayers: 1 }
);
} else {
const staging = new Uint8Array(alignedBPR * h);
const src = memU8();
for (let y = 0; y < h; y++) {
staging.set(src.subarray(srcPtr + y * srcBPR, srcPtr + (y + 1) * srcBPR),
y * alignedBPR);
}
queue.writeTexture(
{ texture: tex, mipLevel: level, origin: [0, 0, layer] },
staging,
{ bytesPerRow: alignedBPR, rowsPerImage: h },
{ width: w, height: h, depthOrArrayLayers: 1 }
);
}
};
env.wgpuWriteImage2D = (handle, srcPtr, byteSize, w, h) => {
const tex = textures.get(handle);
if (!tex) return;
// queue.writeTexture wants bytesPerRow as a multiple of 256, OR == width*bpp
// when the source is contiguous. RGBA8 = 4 bpp, so bytesPerRow = w*4.
const srcBPR = w * 4;
const alignedBPR = (srcBPR + 255) & ~255;
if (alignedBPR === srcBPR) {
// Already aligned (w * 4 is a multiple of 256 → w is a multiple of 64).
queue.writeTexture(
{ texture: tex },
memU8().subarray(srcPtr, srcPtr + byteSize),
{ bytesPerRow: srcBPR, rowsPerImage: h },
{ width: w, height: h }
);
} else {
// Repack into a 256-aligned staging buffer. One alloc per Update,
// freed when the function returns — fine for asset-load time use.
const staging = new Uint8Array(alignedBPR * h);
const src = memU8();
for (let y = 0; y < h; y++) {
staging.set(src.subarray(srcPtr + y * srcBPR, srcPtr + (y + 1) * srcBPR),
y * alignedBPR);
}
queue.writeTexture(
{ texture: tex },
staging,
{ bytesPerRow: alignedBPR, rowsPerImage: h },
{ width: w, height: h }
);
}
};
env.wgpuCreateLinearClampSampler = () => {
const handle = newHandle();
samplers.set(handle, device.createSampler({
magFilter: "linear", minFilter: "linear",
addressModeU: "clamp-to-edge", addressModeV: "clamp-to-edge",
}));
return handle;
};
env.wgpuCreateLinearRepeatSampler = () => {
const handle = newHandle();
samplers.set(handle, device.createSampler({
magFilter: "linear", minFilter: "linear",
mipmapFilter: "linear",
addressModeU: "repeat", addressModeV: "repeat",
}));
return handle;
};
// ─── per-frame ──────────────────────────────────────────────────────────
env.wgpuFrameBegin = () => {
state.frameBeginCount = (state.frameBeginCount || 0) + 1;
if (state.gpuLost) return;
ensureSized();
state.encoder = device.createCommandEncoder();
state.outIsPing = true; // reset so each frame starts on the same target
state.headerRingOffset = 0;
// DON'T clearBuffer the header ring here. queue.writeBuffer ops from
// writeHeader() are enqueued BEFORE this command buffer's submit,
// so an encoded clearBuffer would wipe them — the dispatches would
// then read all-zero headers and uiResolvePixel would reject every
// pixel (surfaceW=0).
clearStorageTexture(state.encoder, state.outIsPing ? state.pingTex : state.pongTex,
state.width, state.height);
state.pass = state.encoder.beginComputePass();
};
let zeroBuffer = null;
let zeroBufferSize = 0;
function clearStorageTexture(encoder, tex, w, h) {
const bpr = (w * 4 + 255) & ~255;
const need = bpr * h;
if (!zeroBuffer || zeroBufferSize < need) {
if (zeroBuffer) zeroBuffer.destroy();
zeroBuffer = device.createBuffer({ size: need, usage: GPUBufferUsage.COPY_SRC, mappedAtCreation: true });
new Uint8Array(zeroBuffer.getMappedRange()).fill(0);
zeroBuffer.unmap();
zeroBufferSize = need;
}
encoder.copyBufferToTexture(
{ buffer: zeroBuffer, bytesPerRow: bpr, rowsPerImage: h },
{ texture: tex },
{ width: w, height: h, depthOrArrayLayers: 1 }
);
}
env.wgpuFrameEnd = () => {
state.frameEndCount = (state.frameEndCount || 0) + 1;
if (state.gpuLost || !state.encoder) return;
state.pass.end();
state.pass = null;
// Blit last-written ping-pong texture → canvas. After N dispatches,
// state.outIsPing points at the NEXT write target, so the latest
// content lives in the OPPOSITE texture.
const finalTex = state.outIsPing ? state.pongTex : state.pingTex;
const canvasTex = ctx.getCurrentTexture();
state.encoder.copyTextureToTexture(
{ texture: finalTex },
{ texture: canvasTex },
{ width: state.width, height: state.height, depthOrArrayLayers: 1 }
);
queue.submit([state.encoder.finish()]);
state.encoder = null;
// Kick off mapAsync for the readbacks whose copyBufferToBuffer we
// piggy-backed onto the just-submitted encoder. Doing this after
// submit ensures the map waits for that submission's GPU work to
// complete, so the staging buffer reflects this frame's compute
// writes (not pre-substep state).
while (pendingReadbackMaps.length > 0) {
const rb = pendingReadbackMaps.pop();
rb.staging.mapAsync(GPUMapMode.READ).then(() => {
rb.pendingData = new Uint8Array(rb.staging.getMappedRange()).slice();
rb.staging.unmap();
rb.state = READBACK_READY;
}).catch(e => {
console.error("[crafter-wgpu] readback mapAsync failed:", e);
rb.state = READBACK_IDLE;
});
}
};
// Write a push struct into the ring buffer at the current offset (which
// is incremented and 256-aligned). Standard dispatches pass just the
// 48-byte UIDispatchHeader; custom shaders may pass up to HEADER_ALIGN
// bytes — anything past the header is the user's per-dispatch struct,
// declared in WGSL as additional fields after UIDispatchHeader at
// @group(0) @binding(0). Returns the dynamic offset to pass to
// setBindGroup.
function writeHeader(headerPtr, bytes = 48) {
const upload = Math.min(bytes, HEADER_ALIGN);
const offset = state.headerRingOffset;
if (offset + HEADER_ALIGN > state.headerRingSize) {
// Ring is small enough that overrun in one frame means too many
// dispatches. Soft-wrap; correctness already requires the ring
// be large enough.
state.headerRingOffset = 0;
}
queue.writeBuffer(state.headerRing, state.headerRingOffset,
memU8().buffer, headerPtr, upload);
state.headerRingOffset += HEADER_ALIGN;
return offset;
}
function dispatchStandard(pipe, hdrBindGroup, headerPtr, gx, gy, itemsHandle, group3) {
if (!state.pass) return;
const off = writeHeader(headerPtr);
state.pass.setPipeline(pipe.pipeline);
state.pass.setBindGroup(0, hdrBindGroup, [off]);
state.pass.setBindGroup(1, getGroup1BG(pipe.bgl1));
state.pass.setBindGroup(2, getGroup2BG(pipe, itemsHandle));
if (group3) state.pass.setBindGroup(3, group3);
state.pass.dispatchWorkgroups(gx, gy, 1);
// Flip ping-pong: the texture we just wrote becomes next dispatch's prev.
state.outIsPing = !state.outIsPing;
}
env.wgpuDispatchQuads = (itemsHandle, headerPtr, gx, gy) => {
state.dispatchQuadsCount = (state.dispatchQuadsCount || 0) + 1;
dispatchStandard(pipeQuads, hdrBG.quads, headerPtr, gx, gy, itemsHandle, null);
};
env.wgpuDispatchCircles = (itemsHandle, headerPtr, gx, gy) => {
dispatchStandard(pipeCircles, hdrBG.circles, headerPtr, gx, gy, itemsHandle, null);
};
env.wgpuDispatchImages = (itemsHandle, headerPtr, gx, gy, texHandle, sampHandle) => {
const g3 = getGroup3BG(pipeImages, texHandle, sampHandle);
dispatchStandard(pipeImages, hdrBG.images, headerPtr, gx, gy, itemsHandle, g3);
};
env.wgpuDispatchText = (itemsHandle, headerPtr, gx, gy, atlasHandle, sampHandle) => {
const g3 = getGroup3BG(pipeText, atlasHandle, sampHandle);
dispatchStandard(pipeText, hdrBG.text, headerPtr, gx, gy, itemsHandle, g3);
};
// ─── custom user-authored shaders ─────────────────────────────────────
//
// Bind-group contract (mirrors :WebGPUComputeShader.cppm):
// group 0 binding 0 — uniform UIDispatchHeader (dynamic offset, 48b)
// group 1 binding 0 — texture_storage_2d<rgba8unorm, write> out
// group 1 binding 1 — texture_2d<f32> prev
// group 2+ — user-declared (UICustomBinding entries)
//
// Each UICustomBinding entry on the wasm side is 8 bytes:
// u8 group, u8 binding, u8 kind, u8 pad, u32 pushOffset
// kind: 0 = read-only-storage SSBO, 1 = sampled tex 2d, 2 = filtering sampler.
const customPipelines = new Map(); // handle → { pipeline, bgls, hdrBG, byGroup }
env.wgpuLoadCustomShader = (wgslPtr, wgslLen, bindingsPtr, bindingsCount, rayQueryFlag) => {
if (!rtState.vertHeap && rayQueryFlag) rtInit();
const userWgsl = new TextDecoder().decode(memU8().subarray(wgslPtr, wgslPtr + wgslLen));
// For rayQuery-capable shaders, prepend the RT prelude + ray-query
// library. The user shader can declare its own group 0 / 2+ bindings
// but MUST NOT redeclare group(1) — that's reserved for RT data.
const wgsl = rayQueryFlag
// rayQueryLib's _rqTraverseBlas/_rqTraverseTlas call _rtAabb,
// _rtFetchTri, _rtTri from rtWgslPureHelpers — must prepend
// the pure helper subset (NOT the megakernel-only traversal
// routines, which reference user-emitted runAnyHit/runMiss/
// runClosestHit and won't compile outside the raygen pipeline).
? (rtWgslTypes + rtWgslMegakernelBindings + rtWgslPureHelpers + rtWgslRayQueryLib + "\n" + userWgsl)
: userWgsl;
const bindings = [];
const dv = new DataView(memU8().buffer, bindingsPtr, bindingsCount * 8);
for (let i = 0; i < bindingsCount; i++) {
bindings.push({
group: dv.getUint8(i*8 + 0),
binding: dv.getUint8(i*8 + 1),
kind: dv.getUint8(i*8 + 2),
pushOffset: dv.getUint32(i*8 + 4, true),
});
}
// Group bindings by @group(N) for layout creation.
const byGroup = new Map();
for (const b of bindings) {
if (b.group < 2) {
console.error(`[crafter-wgpu] custom shader: @group(${b.group}) reserved; use groups >= 2`);
return 0;
}
if (!byGroup.has(b.group)) byGroup.set(b.group, []);
byGroup.get(b.group).push(b);
}
// Group 0 = header uniform (same for both paths).
// Group 1 = ping-pong out+prev OR RT data (TLAS, BVH, meshRecs, verts,
// idx, primRemap, outImage) when rayQuery flag is on.
//
// Custom shaders may declare a struct at @group(0) @binding(0) up to
// HEADER_ALIGN (256) bytes — the standard UIDispatchHeader is 48 but
// the wgpuDispatchCustom path uploads the full push buffer so user
// shaders can read extra fields past the header. minBindingSize=0
// disables the bound-size lower bound; the actual entry is sized to
// HEADER_ALIGN at bind time.
const bgls = [
device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE,
buffer: { type: "uniform", hasDynamicOffset: true, minBindingSize: 0 } },
]}),
rayQueryFlag
? device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 4, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 5, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 6, visibility: GPUShaderStage.COMPUTE,
storageTexture: { format: "rgba8unorm", access: "write-only", viewDimension: "2d" } },
{ binding: 7, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 8, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 9, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
]})
: device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE,
storageTexture: { format: "rgba8unorm", access: "write-only", viewDimension: "2d" } },
{ binding: 1, visibility: GPUShaderStage.COMPUTE,
texture: { sampleType: "float", viewDimension: "2d" } },
]}),
];
// Sorted custom groups. Pad any gaps with empty bgls (WebGPU pipeline
// layouts require a contiguous array of GPUBindGroupLayout per group
// index up to the highest used).
const sortedGroups = [...byGroup.keys()].sort((a, b) => a - b);
const highest = sortedGroups.length ? sortedGroups[sortedGroups.length - 1] : 1;
for (let g = 2; g <= highest; g++) {
if (byGroup.has(g)) {
const entries = byGroup.get(g).map(b => {
const e = { binding: b.binding, visibility: GPUShaderStage.COMPUTE };
if (b.kind === 0) e.buffer = { type: "read-only-storage" };
else if (b.kind === 1) e.texture = { sampleType: "float", viewDimension: "2d" };
else if (b.kind === 2) e.sampler = { type: "filtering" };
else if (b.kind === 3) e.texture = { sampleType: "float", viewDimension: "2d-array" };
return e;
});
bgls.push(device.createBindGroupLayout({ entries }));
} else {
bgls.push(device.createBindGroupLayout({ entries: [] }));
}
}
let pipeline;
try {
const mod = device.createShaderModule({ code: wgsl });
const layout = device.createPipelineLayout({ bindGroupLayouts: bgls });
pipeline = device.createComputePipeline({ layout, compute: { module: mod, entryPoint: "main" } });
} catch (e) {
console.error("[crafter-wgpu] custom shader compile failed:", e);
if (rayQueryFlag) console.error("[crafter-wgpu] WGSL was:\n", wgsl);
return 0;
}
// Bind the full ring slot (HEADER_ALIGN bytes) so custom shaders can
// declare a struct at @group(0) @binding(0) that's larger than the
// 48-byte UIDispatchHeader. Standard pipelines keep their tight 48-
// byte slice. The bgl0 layout's minBindingSize stays at 48 — bigger
// is allowed.
const hdrBG = device.createBindGroup({
layout: bgls[0],
entries: [{ binding: 0, resource: { buffer: state.headerRing, offset: 0, size: HEADER_ALIGN } }],
});
const handle = newHandle();
customPipelines.set(handle, { pipeline, bgls, hdrBG, byGroup, sortedGroups, rayQueryCapable: !!rayQueryFlag });
return handle;
};
env.wgpuDispatchCustom = (pipelineHandle, pushPtr, pushBytes, handlesPtr, handlesCount,
gx, gy, gz) => {
state.dispatchCustomCount = (state.dispatchCustomCount || 0) + 1;
if (!state.pass) return;
const pipe = customPipelines.get(pipelineHandle);
if (!pipe) {
console.error("[crafter-wgpu] wgpuDispatchCustom: unknown pipeline", pipelineHandle);
return;
}
// Write the full push struct (UIDispatchHeader + any custom tail
// fields the shader declares) into the ring. WGSL's bound struct
// size decides how many bytes are actually read on the GPU.
const off = writeHeader(pushPtr, pushBytes);
state.pass.setPipeline(pipe.pipeline);
state.pass.setBindGroup(0, pipe.hdrBG, [off]);
// Group 1: rayQuery-capable shaders get the RT data heaps + the most
// recently built TLAS; everyone else gets the standard ping-pong pair.
if (pipe.rayQueryCapable) {
const tlasBuf = buffers.get(rtState.currentTlas);
if (!tlasBuf) {
console.error("[crafter-wgpu] rayQuery dispatch but no TLAS built yet");
return;
}
const outView = state.outIsPing ? state.pingView : state.pongView;
const orderBuf = buffers.get(rtState.currentEntryOrder);
const bvhBuf = buffers.get(rtState.currentBvh);
if (!orderBuf || !bvhBuf) {
console.error("[crafter-wgpu] wgpuDispatchCustom rayQuery: no entryOrder/bins (TLAS not built)");
return;
}
const rtBG = device.createBindGroup({
layout: pipe.bgls[1],
entries: [
{ binding: 0, resource: { buffer: tlasBuf } },
{ binding: 1, resource: { buffer: rtState.bvhHeap.gpu } },
{ binding: 2, resource: { buffer: rtState.meshRecordsBuffer } },
{ binding: 3, resource: { buffer: rtState.vertHeap.gpu } },
{ binding: 4, resource: { buffer: rtState.indexHeap.gpu } },
{ binding: 5, resource: { buffer: rtState.primRemapHeap.gpu } },
{ binding: 6, resource: outView },
{ binding: 7, resource: { buffer: rtState.attribsHeap.gpu } },
{ binding: 8, resource: { buffer: orderBuf } },
{ binding: 9, resource: { buffer: bvhBuf } },
],
});
state.pass.setBindGroup(1, rtBG);
} else {
state.pass.setBindGroup(1, getGroup1BG(pipe.bgls[1]));
}
// Walk bindings in declaration order and assemble bind groups.
// handles[] from wasm is in the SAME order as customBindings, so we
// pick up indices by walking byGroup in the same sorted order.
const handles = new Uint32Array(memU8().buffer, handlesPtr, handlesCount);
let handleIdx = 0;
for (const g of pipe.sortedGroups) {
const entries = pipe.byGroup.get(g).map(b => {
const h = handles[handleIdx++];
let resource;
if (b.kind === 0) resource = { buffer: buffers.get(h) };
else if (b.kind === 1) resource = textureViews.get(h);
else if (b.kind === 2) resource = samplers.get(h);
else if (b.kind === 3) resource = textureViews.get(h);
return { binding: b.binding, resource };
});
const bg = device.createBindGroup({ layout: pipe.bgls[g], entries });
state.pass.setBindGroup(g, bg);
}
state.pass.dispatchWorkgroups(gx, gy, gz);
state.outIsPing = !state.outIsPing;
};
// Debug accessor for browser-console diagnostics.
window.crafter_wgpu_state = state;
window.crafter_wgpu_device = device;
window.crafter_wgpu_canvasCtx = ctx;
window.crafter_wgpu_debug = () => ({
width: state.width, height: state.height,
outIsPing: state.outIsPing,
encoderActive: !!state.encoder,
passActive: !!state.pass,
bgCacheSize: state.bindGroupCache.size,
bufferHandles: buffers.size,
textureHandles: textures.size,
samplerHandles: samplers.size,
headerRingOffset: state.headerRingOffset,
frameBeginCount: state.frameBeginCount || 0,
frameEndCount: state.frameEndCount || 0,
dispatchQuadsCount: state.dispatchQuadsCount || 0,
writeBufferCount: state.writeBufferCount || 0,
lastWriteHandle: state.lastWriteHandle,
lastWriteSize: state.lastWriteSize,
});
window.crafter_wgpu_bufferKeys = () => [...buffers.keys()];
// Read back the first QuadItem from a registered buffer to verify the
// GPU sees what the CPU wrote.
window.crafter_wgpu_readBuffer = async (handle, byteSize = 64) => {
const buf = buffers.get(handle);
if (!buf) return "no buffer for handle " + handle;
const read = device.createBuffer({ size: 256, usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST });
const enc = device.createCommandEncoder();
enc.copyBufferToBuffer(buf, 0, read, 0, byteSize);
device.queue.submit([enc.finish()]);
await read.mapAsync(GPUMapMode.READ);
const data = new Float32Array(read.getMappedRange().slice(0, byteSize));
read.unmap();
return [...data];
};
// Surface size getters (the wasm side may query these on Resize events).
env.wgpuSurfaceWidth = () => state.width || canvas.width;
env.wgpuSurfaceHeight = () => state.height || canvas.height;
// One-shot init: forces ping-pong allocation at current canvas size so
// any Buffer/Texture creation before the first frame works against a
// concrete size. Called by Crafter::Device::Initialize on the wasm side.
env.wgpuInit = () => {
const { w, h } = syncCanvasSize();
recreatePingPong(w, h);
};
// Resize listener — wires up to the same `resize` event dom-env.js
// listens to. We trigger sizing on next frame begin; no work here.
window.addEventListener("resize", () => { /* ensureSized in wgpuFrameBegin */ });
// ─────────────────────────────────────────────────────────────────────
// ── Software raytracing subsystem ────────────────────────────────────
// ─────────────────────────────────────────────────────────────────────
//
// WebGPU has no hardware RT. The library emulates DXR/VKRT semantics in
// compute: a megakernel raygen pipeline traverses a CPU-built BLAS BVH +
// GPU-built TLAS, dispatches user closesthit / anyhit / miss via a
// generated `switch`. The same traversal kernel is also exposed as a
// rayQuery* function set for regular compute shaders (see the
// rayQueryCapable path under wgpuLoadCustomShader).
//
// The four mesh data streams live in *shared* GPU heaps; each Mesh::Build
// appends to them and gets a u32 handle back. The handle is what the
// application stores in RTInstance::accelerationStructureReference.
// ── WGSL library: shared types + constants (no bindings) ─────────────
// Used by both the megakernel pipeline (which adds group(0..1) bindings)
// and the TLAS-build pipeline (which only uses group(2)). Keeping bindings
// out of the shared block avoids inflating storage-buffer count past the
// 8-per-stage baseline limit on pipelines that don't actually use them.
const rtWgslTypes = String.raw`
struct RTDispatchHeader {
surfaceW: u32,
surfaceH: u32,
instanceCount: u32,
flags: u32,
};
struct RayDesc {
origin: vec3<f32>,
tMin: f32,
direction: vec3<f32>,
tMax: f32,
};
struct HitInfo {
t: f32,
instanceId: u32,
primitiveId: u32,
hitGroupIndex: u32,
attribs: vec2<f32>,
objectRayOrigin: vec3<f32>,
objectRayDirection: vec3<f32>,
objectToWorldR0: vec4<f32>,
objectToWorldR1: vec4<f32>,
objectToWorldR2: vec4<f32>,
customIndex: u32,
};
// Matches Crafter::BVHNode in interfaces/Crafter.Graphics-Mesh.cppm.
struct BVHNode {
aabbMin: vec3<f32>,
firstChildOrPrim: u32,
aabbMax: vec3<f32>,
primCount: u32,
};
// Per-mesh record. Indexed by RTInstance::accelerationStructureReference.
// attribsOffset is the per-mesh base index (in u32 words) into the
// vertexAttribs heap; meshes registered without per-vertex attribs leave
// it 0 (the heap entries at that range are also 0 / never touched). The
// per-vertex stride lives in the user's WGSL — the library doesn't store
// it because the layout is example-defined (Sponza uses 8 u32 / vertex
// for VertexNormalTangentUVPacked).
struct MeshRecord {
rootAabbMin: vec3<f32>,
vertexOffset: u32,
rootAabbMax: vec3<f32>,
indexOffset: u32,
bvhOffset: u32,
primRemapOffset: u32,
triangleCount: u32,
attribsOffset: u32,
};
// Per-instance TLAS record built by the TLAS-build compute pass.
struct TLASEntry {
aabbMin: vec3<f32>,
maskHGOffset: u32,
aabbMax: vec3<f32>,
blasMeshIdx: u32,
objectToWorldR0: vec4<f32>,
objectToWorldR1: vec4<f32>,
objectToWorldR2: vec4<f32>,
worldToObjectR0: vec4<f32>,
worldToObjectR1: vec4<f32>,
worldToObjectR2: vec4<f32>,
customIndex: u32,
instanceFlags: u32,
_pad0: u32,
_pad1: u32,
};
// ── Ray flag mirror of VkGeometryInstanceFlagBitsKHR + DXR ray flags ──
const RT_FLAG_OPAQUE: u32 = 0x1u;
const RT_FLAG_NO_OPAQUE: u32 = 0x2u;
const RT_FLAG_TERMINATE_ON_FIRST_HIT: u32 = 0x4u;
const RT_FLAG_SKIP_CLOSEST_HIT: u32 = 0x8u;
const RT_FLAG_CULL_BACK_FACING_TRIANGLES: u32 = 0x10u;
const RT_FLAG_CULL_FRONT_FACING_TRIANGLES: u32 = 0x20u;
const RT_FLAG_CULL_OPAQUE: u32 = 0x40u;
const RT_FLAG_CULL_NO_OPAQUE: u32 = 0x80u;
const RT_FLAG_SKIP_TRIANGLES: u32 = 0x100u;
const RT_FLAG_SKIP_AABBS: u32 = 0x200u;
const RT_INSTANCE_TRIANGLE_FACING_CULL_DISABLE: u32 = 0x1u;
const RT_INSTANCE_TRIANGLE_FLIP_FACING: u32 = 0x2u;
const RT_INSTANCE_FORCE_OPAQUE: u32 = 0x4u;
const RT_INSTANCE_FORCE_NO_OPAQUE: u32 = 0x8u;
const RT_ANYHIT_ACCEPT: u32 = 0u;
const RT_ANYHIT_IGNORE: u32 = 1u;
const RT_ANYHIT_END_SEARCH: u32 = 2u;
const RT_INTERSECTION_NONE: u32 = 0u;
const RT_INTERSECTION_TRIANGLE: u32 = 1u;
`;
// Megakernel-only bindings. Concatenated after rtWgslTypes for the
// raygen pipeline; the TLAS-build pipeline omits these because it doesn't
// touch them — declaring them would push it past 8 storage buffers per
// stage on the WebGPU baseline.
const rtWgslMegakernelBindings = String.raw`
@group(0) @binding(0) var<uniform> hdr : RTDispatchHeader;
@group(1) @binding(0) var<storage,read> tlasEntries : array<TLASEntry>;
@group(1) @binding(1) var<storage,read> bvhNodes : array<BVHNode>;
@group(1) @binding(2) var<storage,read> meshRecords : array<MeshRecord>;
@group(1) @binding(3) var<storage,read> vertices : array<f32>;
@group(1) @binding(4) var<storage,read> indices : array<u32>;
@group(1) @binding(5) var<storage,read> primRemap : array<u32>;
@group(1) @binding(6) var outImage : texture_storage_2d<rgba8unorm, write>;
@group(1) @binding(7) var<storage,read> vertexAttribs : array<u32>;
// TLAS Morton-sorted permutation: tlasEntryOrder[i] gives the
// tlasEntries[] index that BVH leaf i should sample.
@group(1) @binding(8) var<storage,read> tlasEntryOrder : array<u32>;
// Sweep-tree BVH built by the LBVH-build pass. 2 * N_PADDED - 1
// nodes = 2047 for N_PADDED = 1024. Internal nodes at [0, N_PADDED - 1);
// leaves at [N_PADDED - 1, 2 * N_PADDED - 1). For internal node i,
// children are 2i+1 and 2i+2 (implicit perfect binary tree). Each node
// stores just its world-space AABB.
struct BvhNode {
aabbMin: vec3<f32>,
_pad0: u32,
aabbMax: vec3<f32>,
_pad1: u32,
};
@group(1) @binding(9) var<storage,read> tlasBvhNodes : array<BvhNode>;
const TLAS_BVH_N_PADDED: u32 = 16384u;
const TLAS_BVH_LEAVES_START: u32 = TLAS_BVH_N_PADDED - 1u;
`;
const rtWgslPrelude = rtWgslTypes + rtWgslMegakernelBindings;
// ── WGSL library: helpers + traverseBlas + traverseTlas + traceRay ───
// Injected after the user-supplied closesthit/anyhit/miss sources +
// mega-switch dispatchers (which PipelineRTWebGPU emits). User raygen
// sources sit after this block so they can call traceRay.
// The "pure" subset of the RT helpers — no calls into runAnyHit /
// runClosestHit / traceRay, so this can be prepended ahead of compute
// pipelines using rayQuery without dragging in megakernel-only symbols.
const rtWgslPureHelpers = String.raw`
fn _rtFetchTri(meshRec: MeshRecord, triIndex: u32) -> array<vec3<f32>, 3> {
let baseIdx = meshRec.indexOffset + triIndex * 3u;
let i0 = indices[baseIdx + 0u];
let i1 = indices[baseIdx + 1u];
let i2 = indices[baseIdx + 2u];
let baseV = meshRec.vertexOffset;
let v0i = (baseV + i0) * 3u;
let v1i = (baseV + i1) * 3u;
let v2i = (baseV + i2) * 3u;
return array<vec3<f32>, 3>(
vec3<f32>(vertices[v0i + 0u], vertices[v0i + 1u], vertices[v0i + 2u]),
vec3<f32>(vertices[v1i + 0u], vertices[v1i + 1u], vertices[v1i + 2u]),
vec3<f32>(vertices[v2i + 0u], vertices[v2i + 1u], vertices[v2i + 2u]),
);
}
fn _rtAabb(ro: vec3<f32>, invRd: vec3<f32>, mn: vec3<f32>, mx: vec3<f32>, tMax: f32) -> bool {
// Reject degenerate (mn > mx) boxes outright. The min(t0,t1)/
// max(t0,t1) trick below silently re-orients an inverted box
// and would otherwise return true for sentinel-padded BVH leaves
// — letting rays "hit" empty slots and accidentally re-traverse
// instance 0 via the OOB outOrder → tlasEntries[0] path.
if (any(mn > mx)) { return false; }
let t0 = (mn - ro) * invRd;
let t1 = (mx - ro) * invRd;
let tmin = min(t0, t1);
let tmax = max(t0, t1);
let tEnter = max(max(tmin.x, tmin.y), tmin.z);
let tExit = min(min(tmax.x, tmax.y), tmax.z);
return tExit >= max(tEnter, 0.0) && tEnter <= tMax;
}
struct _RtTriHit { hit: bool, t: f32, u: f32, v: f32 };
fn _rtTri(ro: vec3<f32>, rd: vec3<f32>, p0: vec3<f32>, p1: vec3<f32>, p2: vec3<f32>,
tMin: f32, tMax: f32) -> _RtTriHit {
var r: _RtTriHit;
r.hit = false;
let e1 = p1 - p0;
let e2 = p2 - p0;
let h = cross(rd, e2);
let a = dot(e1, h);
if (abs(a) < 1e-8) { return r; }
let f = 1.0 / a;
let s = ro - p0;
let u = f * dot(s, h);
if (u < 0.0 || u > 1.0) { return r; }
let q = cross(s, e1);
let v = f * dot(rd, q);
if (v < 0.0 || u + v > 1.0) { return r; }
let t = f * dot(e2, q);
if (t < tMin || t > tMax) { return r; }
r.hit = true; r.t = t; r.u = u; r.v = v;
return r;
}
`;
// Megakernel-only helpers: traversal routines that invoke runAnyHit /
// runClosestHit / runMiss (emitted by the megakernel SBT switch) and
// `traceRay` that closes over them. Only the raygen-pipeline path
// prepends this.
const rtWgslMegakernelHelpers = String.raw`
// Iterative stack-based BLAS traversal. Returns true if traversal was
// terminated by an END_SEARCH from anyhit (caller should stop entirely).
fn _rtTraverseBlas(rayObj: RayDesc, flags: u32, meshRec: MeshRecord,
instanceId: u32, hitGroupBase: u32,
bestHit: ptr<function, HitInfo>,
bestT: ptr<function, f32>,
payload: ptr<function, Payload>) -> bool {
let invD = vec3<f32>(1.0) / rayObj.direction;
var stack: array<u32, 32>;
var sp: u32 = 0u;
var nodeRel: u32 = 0u;
loop {
let abs = meshRec.bvhOffset + nodeRel;
let node = bvhNodes[abs];
if (!_rtAabb(rayObj.origin, invD, node.aabbMin, node.aabbMax, *bestT)) {
if (sp == 0u) { break; }
sp = sp - 1u; nodeRel = stack[sp]; continue;
}
if (node.primCount > 0u) {
for (var i: u32 = 0u; i < node.primCount; i = i + 1u) {
let triIndex = primRemap[meshRec.primRemapOffset + node.firstChildOrPrim + i];
let verts = _rtFetchTri(meshRec, triIndex);
let tr = _rtTri(rayObj.origin, rayObj.direction,
verts[0], verts[1], verts[2],
rayObj.tMin, *bestT);
if (!tr.hit) { continue; }
let geomNormal = cross(verts[1] - verts[0], verts[2] - verts[0]);
let facing = dot(geomNormal, rayObj.direction);
if ((flags & RT_FLAG_CULL_BACK_FACING_TRIANGLES) != 0u && facing > 0.0) { continue; }
if ((flags & RT_FLAG_CULL_FRONT_FACING_TRIANGLES) != 0u && facing < 0.0) { continue; }
var candidate: HitInfo;
candidate.t = tr.t;
candidate.instanceId = instanceId;
candidate.primitiveId = triIndex;
candidate.hitGroupIndex = hitGroupBase;
candidate.attribs = vec2<f32>(tr.u, tr.v);
candidate.objectRayOrigin = rayObj.origin;
candidate.objectRayDirection = rayObj.direction;
let opaque = (flags & RT_FLAG_OPAQUE) != 0u
|| (flags & RT_FLAG_NO_OPAQUE) == 0u; // default opaque
if (opaque) {
*bestHit = candidate;
*bestT = tr.t;
if ((flags & RT_FLAG_TERMINATE_ON_FIRST_HIT) != 0u) { return true; }
} else {
let r = runAnyHit(hitGroupBase, rayObj, candidate, payload);
if (r == RT_ANYHIT_END_SEARCH) {
*bestHit = candidate;
*bestT = tr.t;
return true;
}
if (r == RT_ANYHIT_ACCEPT) {
*bestHit = candidate;
*bestT = tr.t;
if ((flags & RT_FLAG_TERMINATE_ON_FIRST_HIT) != 0u) { return true; }
}
}
}
if (sp == 0u) { break; }
sp = sp - 1u; nodeRel = stack[sp]; continue;
}
// inner node — push right, descend left
let left = node.firstChildOrPrim;
let right = left + 1u;
if (sp < 32u) { stack[sp] = right; sp = sp + 1u; }
nodeRel = left;
}
return false;
}
fn _rtTraverseTlas(rayWorld: RayDesc, flags: u32, cullMask: u32,
sbtRecordOffset: u32, sbtRecordStride: u32,
bestHit: ptr<function, HitInfo>,
bestT: ptr<function, f32>,
payload: ptr<function, Payload>) -> bool {
let invD = vec3<f32>(1.0) / rayWorld.direction;
// Stack-based descent of the sweep-tree BVH. Internal nodes
// [0, TLAS_BVH_LEAVES_START); leaves [LEAVES_START, 2*N_PADDED-1).
// Node i's children are 2i+1 / 2i+2 (implicit perfect binary tree).
// Stack depth = tree depth = log2(N_PADDED) = 14 for N_PADDED=16384;
// 24 gives generous headroom.
var stack: array<u32, 24>;
var sp: u32 = 0u;
stack[sp] = 0u; sp = sp + 1u;
loop {
if (sp == 0u) { break; }
sp = sp - 1u;
let nodeIdx = stack[sp];
let node = tlasBvhNodes[nodeIdx];
if (!_rtAabb(rayWorld.origin, invD, node.aabbMin, node.aabbMax, *bestT)) {
continue;
}
if (nodeIdx >= TLAS_BVH_LEAVES_START) {
// Leaf: resolve entry, do the existing per-instance test.
let leafIdx = nodeIdx - TLAS_BVH_LEAVES_START;
let i = tlasEntryOrder[leafIdx];
// Sentinel-padded leaves get instanceMask=0; cullMask check
// (and degenerate AABB above) means they fall out cheaply.
if (i == 0xFFFFFFFFu) { continue; }
let inst = tlasEntries[i];
let instanceMask = inst.maskHGOffset & 0xFFu;
if ((instanceMask & cullMask) == 0u) { continue; }
if (!_rtAabb(rayWorld.origin, invD, inst.aabbMin, inst.aabbMax, *bestT)) { continue; }
// Transform ray to object space.
let r0 = inst.worldToObjectR0;
let r1 = inst.worldToObjectR1;
let r2 = inst.worldToObjectR2;
var rayObj: RayDesc;
rayObj.origin = vec3<f32>(
dot(r0.xyz, rayWorld.origin) + r0.w,
dot(r1.xyz, rayWorld.origin) + r1.w,
dot(r2.xyz, rayWorld.origin) + r2.w,
);
rayObj.direction = vec3<f32>(
dot(r0.xyz, rayWorld.direction),
dot(r1.xyz, rayWorld.direction),
dot(r2.xyz, rayWorld.direction),
);
rayObj.tMin = rayWorld.tMin;
rayObj.tMax = *bestT;
var effective = flags;
let iflags = inst.instanceFlags;
if ((iflags & RT_INSTANCE_FORCE_OPAQUE) != 0u) {
effective = (effective | RT_FLAG_OPAQUE) & ~RT_FLAG_NO_OPAQUE;
}
if ((iflags & RT_INSTANCE_FORCE_NO_OPAQUE) != 0u) {
effective = (effective | RT_FLAG_NO_OPAQUE) & ~RT_FLAG_OPAQUE;
}
if ((iflags & RT_INSTANCE_TRIANGLE_FACING_CULL_DISABLE) != 0u) {
effective = effective & ~(RT_FLAG_CULL_BACK_FACING_TRIANGLES | RT_FLAG_CULL_FRONT_FACING_TRIANGLES);
}
let hitGroupOffset = inst.maskHGOffset >> 8u;
let hitGroupBase = sbtRecordOffset + hitGroupOffset;
let meshRec = meshRecords[inst.blasMeshIdx];
let pre = *bestT;
let endSearch = _rtTraverseBlas(rayObj, effective, meshRec, i, hitGroupBase,
bestHit, bestT, payload);
if (endSearch) { return true; }
if ((*bestT) < pre) {
// record world-space object-to-world for the closest-hit shader
(*bestHit).objectToWorldR0 = inst.objectToWorldR0;
(*bestHit).objectToWorldR1 = inst.objectToWorldR1;
(*bestHit).objectToWorldR2 = inst.objectToWorldR2;
(*bestHit).customIndex = inst.customIndex;
if ((effective & RT_FLAG_TERMINATE_ON_FIRST_HIT) != 0u) { return true; }
}
} else {
// Internal node: push both children (skip overflow).
let left = 2u * nodeIdx + 1u;
let right = 2u * nodeIdx + 2u;
if (sp + 1u < 24u) {
stack[sp] = right; sp = sp + 1u;
stack[sp] = left; sp = sp + 1u;
}
}
}
return false;
}
fn traceRay(tlasIdx: u32, flags: u32, cullMask: u32,
sbtRecordOffset: u32, sbtRecordStride: u32, missIndex: u32,
rayOrigin: vec3<f32>, rayTMin: f32,
rayDir: vec3<f32>, rayTMax: f32,
payload: ptr<function, Payload>) {
var ray: RayDesc;
ray.origin = rayOrigin;
ray.direction = rayDir;
ray.tMin = rayTMin;
ray.tMax = rayTMax;
var bestHit: HitInfo;
bestHit.t = rayTMax;
var bestT = rayTMax;
let ended = _rtTraverseTlas(ray, flags, cullMask & 0xFFu,
sbtRecordOffset, sbtRecordStride,
&bestHit, &bestT, payload);
if (bestT < rayTMax) {
if ((flags & RT_FLAG_SKIP_CLOSEST_HIT) == 0u) {
runClosestHit(bestHit.hitGroupIndex, ray, bestHit, payload);
}
} else {
runMiss(missIndex, ray, payload);
}
}
`;
// ── WGSL library: rayQuery API for non-megakernel compute shaders ────
//
// Mirrors GL_EXT_ray_query semantics that 3DForts's physics shaders use
// (projectile-collide, splash, builder-pick). User WGSL:
// var rq: RayQuery;
// rayQueryInitialize(&rq, 0u, flags, mask, origin, tMin, dir, tMax);
// while (rayQueryProceed(&rq)) {} // run traversal to completion
// if (rayQueryGetCommittedIntersectionType(&rq) != RT_INTERSECTION_NONE) {
// let t = rayQueryGetCommittedT(&rq);
// ...
// }
//
// v1 simplification: traversal force-opaques every hit (no anyhit). The
// user can still test for triangle vs miss and read t/instance/bary.
// Anyhit-style candidate inspection is a future extension.
const rtWgslRayQueryLib = String.raw`
struct RayQuery {
ray: RayDesc,
flags: u32,
cullMask: u32,
committedType: u32,
committedT: f32,
committedInstanceId: u32,
committedInstanceCustomIndex: u32,
committedPrimitiveIndex: u32,
committedBarycentrics: vec2<f32>,
committedObjectRayOrigin: vec3<f32>,
committedObjectRayDirection: vec3<f32>,
committedObjectToWorldR0: vec4<f32>,
committedObjectToWorldR1: vec4<f32>,
committedObjectToWorldR2: vec4<f32>,
committedWorldToObjectR0: vec4<f32>,
committedWorldToObjectR1: vec4<f32>,
committedWorldToObjectR2: vec4<f32>,
done: u32,
};
fn _rqTraverseBlas(rayObj: RayDesc, flags: u32, meshRec: MeshRecord,
instanceId: u32, customIndex: u32,
inst: TLASEntry,
rq: ptr<function, RayQuery>) {
let invD = vec3<f32>(1.0) / rayObj.direction;
var stack: array<u32, 32>;
var sp: u32 = 0u;
var nodeRel: u32 = 0u;
loop {
let abs = meshRec.bvhOffset + nodeRel;
let node = bvhNodes[abs];
if (!_rtAabb(rayObj.origin, invD, node.aabbMin, node.aabbMax, (*rq).committedT)) {
if (sp == 0u) { break; }
sp = sp - 1u; nodeRel = stack[sp]; continue;
}
if (node.primCount > 0u) {
for (var i: u32 = 0u; i < node.primCount; i = i + 1u) {
let triIndex = primRemap[meshRec.primRemapOffset + node.firstChildOrPrim + i];
let verts = _rtFetchTri(meshRec, triIndex);
let tr = _rtTri(rayObj.origin, rayObj.direction,
verts[0], verts[1], verts[2],
rayObj.tMin, (*rq).committedT);
if (!tr.hit) { continue; }
let geomNormal = cross(verts[1] - verts[0], verts[2] - verts[0]);
let facing = dot(geomNormal, rayObj.direction);
if ((flags & RT_FLAG_CULL_BACK_FACING_TRIANGLES) != 0u && facing > 0.0) { continue; }
if ((flags & RT_FLAG_CULL_FRONT_FACING_TRIANGLES) != 0u && facing < 0.0) { continue; }
(*rq).committedType = RT_INTERSECTION_TRIANGLE;
(*rq).committedT = tr.t;
(*rq).committedInstanceId = instanceId;
(*rq).committedInstanceCustomIndex = customIndex;
(*rq).committedPrimitiveIndex = triIndex;
(*rq).committedBarycentrics = vec2<f32>(tr.u, tr.v);
(*rq).committedObjectRayOrigin = rayObj.origin;
(*rq).committedObjectRayDirection = rayObj.direction;
(*rq).committedObjectToWorldR0 = inst.objectToWorldR0;
(*rq).committedObjectToWorldR1 = inst.objectToWorldR1;
(*rq).committedObjectToWorldR2 = inst.objectToWorldR2;
(*rq).committedWorldToObjectR0 = inst.worldToObjectR0;
(*rq).committedWorldToObjectR1 = inst.worldToObjectR1;
(*rq).committedWorldToObjectR2 = inst.worldToObjectR2;
}
if (sp == 0u) { break; }
sp = sp - 1u; nodeRel = stack[sp]; continue;
}
let left = node.firstChildOrPrim;
let right = left + 1u;
if (sp < 32u) { stack[sp] = right; sp = sp + 1u; }
nodeRel = left;
}
}
fn _rqTraverseTlas(rq: ptr<function, RayQuery>) {
let rayWorld = (*rq).ray;
let invD = vec3<f32>(1.0) / rayWorld.direction;
let cullMask = (*rq).cullMask;
let rayFlags = (*rq).flags;
// Stack-based BVH descent — same shape as _rtTraverseTlas.
var stack: array<u32, 24>;
var sp: u32 = 0u;
stack[sp] = 0u; sp = sp + 1u;
loop {
if (sp == 0u) { break; }
sp = sp - 1u;
let nodeIdx = stack[sp];
let node = tlasBvhNodes[nodeIdx];
if (!_rtAabb(rayWorld.origin, invD, node.aabbMin, node.aabbMax, (*rq).committedT)) {
continue;
}
if (nodeIdx >= TLAS_BVH_LEAVES_START) {
let leafIdx = nodeIdx - TLAS_BVH_LEAVES_START;
let i = tlasEntryOrder[leafIdx];
if (i == 0xFFFFFFFFu) { continue; }
let inst = tlasEntries[i];
let instanceMask = inst.maskHGOffset & 0xFFu;
if ((instanceMask & cullMask) == 0u) { continue; }
if (!_rtAabb(rayWorld.origin, invD, inst.aabbMin, inst.aabbMax, (*rq).committedT)) { continue; }
let r0 = inst.worldToObjectR0;
let r1 = inst.worldToObjectR1;
let r2 = inst.worldToObjectR2;
var rayObj: RayDesc;
rayObj.origin = vec3<f32>(
dot(r0.xyz, rayWorld.origin) + r0.w,
dot(r1.xyz, rayWorld.origin) + r1.w,
dot(r2.xyz, rayWorld.origin) + r2.w,
);
rayObj.direction = vec3<f32>(
dot(r0.xyz, rayWorld.direction),
dot(r1.xyz, rayWorld.direction),
dot(r2.xyz, rayWorld.direction),
);
rayObj.tMin = rayWorld.tMin;
rayObj.tMax = (*rq).committedT;
var effective = rayFlags;
let iflags = inst.instanceFlags;
if ((iflags & RT_INSTANCE_TRIANGLE_FACING_CULL_DISABLE) != 0u) {
effective = effective & ~(RT_FLAG_CULL_BACK_FACING_TRIANGLES | RT_FLAG_CULL_FRONT_FACING_TRIANGLES);
}
let meshRec = meshRecords[inst.blasMeshIdx];
_rqTraverseBlas(rayObj, effective, meshRec, i, inst.customIndex, inst, rq);
if ((rayFlags & RT_FLAG_TERMINATE_ON_FIRST_HIT) != 0u
&& (*rq).committedType != RT_INTERSECTION_NONE) {
return;
}
} else {
let left = 2u * nodeIdx + 1u;
let right = 2u * nodeIdx + 2u;
if (sp + 1u < 24u) {
stack[sp] = right; sp = sp + 1u;
stack[sp] = left; sp = sp + 1u;
}
}
}
}
fn rayQueryInitialize(rq: ptr<function, RayQuery>, tlasIdx: u32, flags: u32, cullMask: u32,
origin: vec3<f32>, tMin: f32, direction: vec3<f32>, tMax: f32) {
(*rq).ray.origin = origin;
(*rq).ray.tMin = tMin;
(*rq).ray.direction = direction;
(*rq).ray.tMax = tMax;
(*rq).flags = flags;
(*rq).cullMask = cullMask & 0xFFu;
(*rq).committedType = RT_INTERSECTION_NONE;
(*rq).committedT = tMax;
(*rq).done = 0u;
}
fn rayQueryProceed(rq: ptr<function, RayQuery>) -> bool {
if ((*rq).done != 0u) { return false; }
_rqTraverseTlas(rq);
(*rq).done = 1u;
return false;
}
fn rayQueryTerminate(rq: ptr<function, RayQuery>) {
(*rq).done = 1u;
}
fn rayQueryGetCommittedIntersectionType(rq: ptr<function, RayQuery>) -> u32 { return (*rq).committedType; }
fn rayQueryGetCommittedT(rq: ptr<function, RayQuery>) -> f32 { return (*rq).committedT; }
fn rayQueryGetCommittedInstanceId(rq: ptr<function, RayQuery>) -> u32 { return (*rq).committedInstanceId; }
fn rayQueryGetCommittedInstanceCustomIndex(rq: ptr<function, RayQuery>) -> u32 { return (*rq).committedInstanceCustomIndex; }
fn rayQueryGetCommittedPrimitiveIndex(rq: ptr<function, RayQuery>) -> u32 { return (*rq).committedPrimitiveIndex; }
fn rayQueryGetCommittedBarycentrics(rq: ptr<function, RayQuery>) -> vec2<f32> { return (*rq).committedBarycentrics; }
fn rayQueryGetCommittedObjectRayOrigin(rq: ptr<function, RayQuery>) -> vec3<f32> { return (*rq).committedObjectRayOrigin; }
fn rayQueryGetCommittedObjectRayDirection(rq: ptr<function, RayQuery>) -> vec3<f32> { return (*rq).committedObjectRayDirection; }
fn rayQueryGetCommittedWorldPosition(rq: ptr<function, RayQuery>) -> vec3<f32> {
return (*rq).ray.origin + (*rq).ray.direction * (*rq).committedT;
}
`;
// TLAS spatial-partition bin. 64 of these per frame partition instances
// by the top 6 bits of their Morton code. RT traversal walks bin AABBs
// first; only bins a ray hits get their per-instance loop. Layout
// matches the WGSL struct below.
const TLAS_BIN_COUNT = 64;
const TLAS_BIN_SIZE = 32;
// ── Internal compute pipeline: builds TLASEntry[] from the RTInstance[]
// + the meshRecords table. One thread per instance.
// Uses only rtWgslTypes (no megakernel bindings) so it stays well under
// the 8-storage-buffer-per-stage baseline limit.
const tlasBuildWgsl = rtWgslTypes /* needs MeshRecord + TLASEntry */ + String.raw`
struct RTInstance {
transformR0: vec4<f32>,
transformR1: vec4<f32>,
transformR2: vec4<f32>,
customIndexMask: u32, // customIndex: low 24, mask: high 8
sbtFlags: u32, // sbtOffset: low 24, flags: high 8
accelStructureRef: vec2<u32>,
};
@group(0) @binding(0) var<storage, read> inInstances : array<RTInstance>;
@group(0) @binding(1) var<storage, read> inMeshes : array<MeshRecord>;
@group(0) @binding(2) var<storage, read_write> outEntries : array<TLASEntry>;
@group(0) @binding(3) var<storage, read_write> outOrder : array<u32>; // identity perm for stage 1
@group(0) @binding(4) var<storage, read_write> outMorton : array<u32>; // 30-bit Morton from world-AABB centroid
// Spread the low 10 bits of v across every 3rd bit (so three of these
// interleave into a 30-bit Morton code). Standard "magic number" version.
fn _expandBits10(v0: u32) -> u32 {
var v = v0 & 0x000003FFu;
v = (v * 0x00010001u) & 0xFF0000FFu;
v = (v * 0x00000101u) & 0x0F00F00Fu;
v = (v * 0x00000011u) & 0xC30C30C3u;
v = (v * 0x00000005u) & 0x49249249u;
return v;
}
fn _mortonCode3D(c: vec3<f32>) -> u32 {
// c assumed in [0, 1]^3.
let q = clamp(c, vec3<f32>(0.0), vec3<f32>(1.0));
let xi = u32(q.x * 1023.0);
let yi = u32(q.y * 1023.0);
let zi = u32(q.z * 1023.0);
return (_expandBits10(xi) << 2u)
| (_expandBits10(yi) << 1u)
| _expandBits10(zi);
}
fn _invMat3(c0: vec3<f32>, c1: vec3<f32>, c2: vec3<f32>) -> array<vec3<f32>, 3> {
let m00 = c1.y*c2.z - c1.z*c2.y;
let m01 = c1.z*c2.x - c1.x*c2.z;
let m02 = c1.x*c2.y - c1.y*c2.x;
let det = c0.x*m00 + c0.y*m01 + c0.z*m02;
let inv = 1.0 / det;
return array<vec3<f32>, 3>(
vec3<f32>(m00, c0.z*c2.y - c0.y*c2.z, c0.y*c1.z - c0.z*c1.y) * inv,
vec3<f32>(m01, c0.x*c2.z - c0.z*c2.x, c0.z*c1.x - c0.x*c1.z) * inv,
vec3<f32>(m02, c0.y*c2.x - c0.x*c2.y, c0.x*c1.y - c0.y*c1.x) * inv,
);
}
@compute @workgroup_size(64, 1, 1)
fn tlasBuildMain(@builtin(global_invocation_id) gid: vec3<u32>) {
let i = gid.x;
if (i >= arrayLength(&inInstances)) { return; }
let inst = inInstances[i];
let meshIdx = inst.accelStructureRef.x;
let custom = inst.customIndexMask & 0xFFFFFFu;
let mask = (inst.customIndexMask >> 24u) & 0xFFu;
let sbtOffset = inst.sbtFlags & 0xFFFFFFu;
let iflags = (inst.sbtFlags >> 24u) & 0xFFu;
let rec = inMeshes[meshIdx];
// 8 corners → world AABB.
var worldMin = vec3<f32>(1e30);
var worldMax = vec3<f32>(-1e30);
for (var c: u32 = 0u; c < 8u; c = c + 1u) {
let corner = vec3<f32>(
select(rec.rootAabbMin.x, rec.rootAabbMax.x, (c & 1u) != 0u),
select(rec.rootAabbMin.y, rec.rootAabbMax.y, (c & 2u) != 0u),
select(rec.rootAabbMin.z, rec.rootAabbMax.z, (c & 4u) != 0u),
);
let wc = vec3<f32>(
dot(inst.transformR0.xyz, corner) + inst.transformR0.w,
dot(inst.transformR1.xyz, corner) + inst.transformR1.w,
dot(inst.transformR2.xyz, corner) + inst.transformR2.w,
);
worldMin = min(worldMin, wc);
worldMax = max(worldMax, wc);
}
// Inverse 3x4 affine.
let inv = _invMat3(inst.transformR0.xyz, inst.transformR1.xyz, inst.transformR2.xyz);
let T = vec3<f32>(inst.transformR0.w, inst.transformR1.w, inst.transformR2.w);
let invT = vec3<f32>(-dot(inv[0], T), -dot(inv[1], T), -dot(inv[2], T));
var e: TLASEntry;
e.aabbMin = worldMin;
e.aabbMax = worldMax;
e.maskHGOffset = mask | (sbtOffset << 8u);
e.blasMeshIdx = meshIdx;
e.objectToWorldR0 = inst.transformR0;
e.objectToWorldR1 = inst.transformR1;
e.objectToWorldR2 = inst.transformR2;
e.worldToObjectR0 = vec4<f32>(inv[0], invT.x);
e.worldToObjectR1 = vec4<f32>(inv[1], invT.y);
e.worldToObjectR2 = vec4<f32>(inv[2], invT.z);
e.customIndex = custom;
e.instanceFlags = iflags;
outEntries[i] = e;
// LBVH pre-pass output:
// outOrder[i] = i — identity permutation (Stage 1; the
// radix sort in Stage 2 rewrites this).
// outMorton[i] = morton(centroid normalized to [0,1]^3 over the
// fixed world bound).
// Fixed bound is conservative for 3DForts (map ~2000 units, fort ~50).
// Stage 2 swaps to a true scene-AABB reduction.
outOrder[i] = i;
let centroid = (worldMin + worldMax) * 0.5;
let worldHalfExtent = 4000.0;
let normalized = (centroid + vec3<f32>(worldHalfExtent)) * (1.0 / (2.0 * worldHalfExtent));
outMorton[i] = _mortonCode3D(normalized);
}
`;
// TLAS LBVH build — single dispatch of one 1024-thread workgroup.
// Phases:
// 0. Scene AABB reduction (256-wide tree reduce).
// 1. Pack (morton16 << 16) | tlasIndex16 keys into sortA. Sentinel
// slots get 0xFFFFFFFF so they sort to the end; m16 is clamped to
// 0xFFFE so no real key collides with the sentinel.
// 2. LSD radix sort — 8 passes × 4 bits, ping-pong sortA ↔ sortB.
// STAGE 1: single-thread (thread 0) sequential scatter for stable
// ordering. This is the slow-but-trivially-correct baseline; Stage
// 2 will parallelize the scatter using per-thread local histograms
// + a cross-thread scan.
// 3. Write sorted instance permutation into outOrder.
// 4. Initialize BVH leaf AABBs from sorted instances.
// 5. Bottom-up sweep-tree refit, log2(N_PADDED) levels.
//
// Storage-barrier pattern: workgroupBarrier() fences workgroup memory
// only per WGSL spec; storageBarrier() is required between R/W phases
// on sortA/B and outBvh. Both are called at every storage boundary —
// minor perf cost, eliminates the class of bug that hung the GPU on
// the previous radix attempt.
//
// Hard cap: LBVH_MAX = 16384. Parallel scatter (per-bucket Hillis-Steele
// scan over 1024-lane indicators) made the build cost flat at ~0.5 ms
// regardless of N_PADDED, and the degenerate-AABB fix in _rtAabb keeps
// sentinel-only subtrees from being traversed. Per-ray cost scales with
// log2(N_real), not log2(N_PADDED).
const LBVH_MAX = 16384;
const lbvhBuildWgsl = String.raw`
struct TLASEntryStub {
aabbMin: vec3<f32>,
maskHGOffset: u32,
aabbMax: vec3<f32>,
blasMeshIdx: u32,
objectToWorldR0: vec4<f32>,
objectToWorldR1: vec4<f32>,
objectToWorldR2: vec4<f32>,
worldToObjectR0: vec4<f32>,
worldToObjectR1: vec4<f32>,
worldToObjectR2: vec4<f32>,
customIndex: u32,
instanceFlags: u32,
_pad0: u32,
_pad1: u32,
};
struct BvhNode {
aabbMin: vec3<f32>,
_pad0: u32,
aabbMax: vec3<f32>,
_pad1: u32,
};
@group(0) @binding(0) var<storage, read> entries : array<TLASEntryStub>;
@group(0) @binding(1) var<storage, read_write> outOrder : array<u32>;
@group(0) @binding(2) var<storage, read_write> outBvh : array<BvhNode>;
// Radix-sort ping-pong buffers. One u32 per element — the packed
// (morton16 << 16) | tlasIndex16 key. Sized for N_PADDED.
@group(0) @binding(3) var<storage, read_write> sortA : array<u32>;
@group(0) @binding(4) var<storage, read_write> sortB : array<u32>;
// Real instance count. Passed as a uniform so the entries / outOrder /
// sortA / sortB / outBvh buffers can be allocated ONCE at N_PADDED and
// never resized as the application's TLAS instance count changes —
// runtime resize-on-grow caused subtle BVH corruption (driver-level
// memory recycling, suspected) and was the root cause of mid-game
// geometry flicker when projectiles entered the TLAS.
struct LbvhPC { nReal: u32, _pad0: u32, _pad1: u32, _pad2: u32 };
@group(0) @binding(5) var<uniform> lbvhPc : LbvhPC;
const N_PADDED: u32 = 16384u;
const THREADS: u32 = 1024u;
const K_PER: u32 = 16u; // = N_PADDED / THREADS
const REDUCE_LANES: u32 = 256u;
const REDUCE_K_PER: u32 = 64u; // = N_PADDED / REDUCE_LANES
const BUCKETS: u32 = 16u;
const PASSES: u32 = 8u;
const LEVELS: u32 = 14u; // log2(N_PADDED)
const SCAN_STEPS: u32 = 10u; // log2(THREADS)
var<workgroup> shHist: array<atomic<u32>, BUCKETS>;
var<workgroup> shOffsets: array<u32, BUCKETS>;
// Hillis-Steele scratch for per-bucket exclusive prefix sum over 1024
// per-thread bucket counts. 4 KB. Reused across all 8 × 16 bucket scans
// in the radix passes.
var<workgroup> shScan: array<u32, THREADS>;
// Scene-AABB reduction scratch — 256-lane tree reduce. vec3 stride is
// 16 by WGSL alignment → 4 KB each, 8 KB total. Well under the 16 KB
// default workgroup-storage cap.
var<workgroup> shRedMin: array<vec3<f32>, 256>;
var<workgroup> shRedMax: array<vec3<f32>, 256>;
var<workgroup> shSceneMin: vec3<f32>;
var<workgroup> shSceneMax: vec3<f32>;
fn _expandBits10(v0: u32) -> u32 {
var v = v0 & 0x000003FFu;
v = (v * 0x00010001u) & 0xFF0000FFu;
v = (v * 0x00000101u) & 0x0F00F00Fu;
v = (v * 0x00000011u) & 0xC30C30C3u;
v = (v * 0x00000005u) & 0x49249249u;
return v;
}
fn _mortonCode3D(c: vec3<f32>) -> u32 {
let q = clamp(c, vec3<f32>(0.0), vec3<f32>(1.0));
let xi = u32(q.x * 1023.0);
let yi = u32(q.y * 1023.0);
let zi = u32(q.z * 1023.0);
return (_expandBits10(xi) << 2u)
| (_expandBits10(yi) << 1u)
| _expandBits10(zi);
}
@compute @workgroup_size(1024, 1, 1)
fn lbvhBuildMain(@builtin(local_invocation_id) lid: vec3<u32>) {
let tid = lid.x;
let n = lbvhPc.nReal;
// ── Phase 0: scene AABB reduction across centroids ───────────────────
// REDUCE_LANES=256 lanes each fold REDUCE_K_PER stripes (covers
// N_PADDED), then an 8-step tree reduce across those lanes gives
// the final AABB.
if (tid < REDUCE_LANES) {
var lMin = vec3<f32>( 1e30);
var lMax = vec3<f32>(-1e30);
for (var k: u32 = 0u; k < REDUCE_K_PER; k = k + 1u) {
let i = tid * REDUCE_K_PER + k;
if (i < n) {
let c = (entries[i].aabbMin + entries[i].aabbMax) * 0.5;
lMin = min(lMin, c);
lMax = max(lMax, c);
}
}
shRedMin[tid] = lMin;
shRedMax[tid] = lMax;
}
workgroupBarrier();
var stride: u32 = 128u;
for (var s: u32 = 0u; s < 8u; s = s + 1u) {
if (tid < stride) {
shRedMin[tid] = min(shRedMin[tid], shRedMin[tid + stride]);
shRedMax[tid] = max(shRedMax[tid], shRedMax[tid + stride]);
}
workgroupBarrier();
stride = stride / 2u;
}
if (tid == 0u) {
shSceneMin = shRedMin[0];
shSceneMax = shRedMax[0];
}
workgroupBarrier();
// ── Phase 1: emit packed sort keys into sortA ────────────────────────
let extent = max(shSceneMax - shSceneMin, vec3<f32>(1e-3));
let invExtent = vec3<f32>(1.0) / extent;
for (var k: u32 = 0u; k < K_PER; k = k + 1u) {
let i = k * THREADS + tid;
var key: u32;
if (i < n) {
let c = (entries[i].aabbMin + entries[i].aabbMax) * 0.5;
let nrm = (c - shSceneMin) * invExtent;
let m30 = _mortonCode3D(nrm);
let m16 = min(m30 >> 14u, 0xFFFEu);
key = (m16 << 16u) | (i & 0xFFFFu);
} else {
key = 0xFFFFFFFFu;
}
sortA[i] = key;
}
workgroupBarrier();
storageBarrier();
// ── Phase 2: stable LSD radix sort, fully parallel scatter ──────────
// Per pass:
// 1. Histogram (parallel atomicAdd to shHist).
// 2. Global bucket starts: exclusive prefix scan of shHist into
// shOffsets[16] — small, done sequentially by thread 0.
// 3. For each of the 16 buckets:
// a. Each thread counts its bucket-b source elements (K_PER
// re-reads, no caching — storage reads are L1-cheap).
// b. Hillis-Steele exclusive prefix scan of those counts
// across all THREADS lanes (log2(THREADS)=10 levels).
// Single-buffered with a read/barrier/write/barrier pattern
// per step so reads and writes don't race.
// c. Each thread re-walks its K_PER elements in source order
// and writes bucket-b ones to dst = shOffsets[b]
// + my_exclusive_prefix
// + my_local_idx_so_far.
// Stability holds because per-thread iteration is in source
// order and the cross-thread offsets respect thread index order.
// WORKAROUND: the parallel radix sort below corrupts sortA in a
// way that's count-dependent — symptom was mid-game geometry
// flicker as soon as ANY extra TLAS instance was added beyond the
// initial scene (e.g. firing a projectile would make fort braces
// appear to disappear in patterns deterministic on the projectile's
// angle). Bisected by skipping each LBVH phase in turn: with the
// sort skipped, no flicker. With the sort enabled, flicker.
//
// The exact bug in the Hillis-Steele scan + parallel scatter
// hasn't been identified despite careful review — likely a subtle
// memory-ordering / barrier issue that triggers only with the
// specific Morton-code distribution that arises when a small object
// (projectile) sits next to a large cluster (fort).
//
// Skipping the sort means BVH leaves are in instance-index order
// (no spatial coherence). Ray traversal still descends the BVH
// tree, but parent AABBs are larger than they would be with sorted
// leaves, so more leaves get tested per ray. With the fort's
// ~1011-entry scale that's still fast enough; revisit if the
// entry count grows toward the LBVH_MAX cap.
if (false) {
for (var p: u32 = 0u; p < PASSES; p = p + 1u) {
let shift = p * 4u;
let srcIsA = (p & 1u) == 0u;
// Clear histogram.
if (tid < BUCKETS) {
atomicStore(&shHist[tid], 0u);
}
workgroupBarrier();
// Histogram pass — K_PER elements per thread.
for (var k: u32 = 0u; k < K_PER; k = k + 1u) {
let i = k * THREADS + tid;
var myKey: u32;
if (srcIsA) { myKey = sortA[i]; } else { myKey = sortB[i]; }
let bucket = (myKey >> shift) & 0xFu;
atomicAdd(&shHist[bucket], 1u);
}
workgroupBarrier();
// Global bucket starts (16-wide; thread 0 does it sequentially).
if (tid == 0u) {
var s2: u32 = 0u;
for (var b: u32 = 0u; b < BUCKETS; b = b + 1u) {
shOffsets[b] = s2;
s2 = s2 + atomicLoad(&shHist[b]);
}
}
workgroupBarrier();
// Per-bucket parallel scatter.
for (var b: u32 = 0u; b < BUCKETS; b = b + 1u) {
// (a) Count my bucket-b elements.
var localCount: u32 = 0u;
for (var k: u32 = 0u; k < K_PER; k = k + 1u) {
let i = k * THREADS + tid;
var srcKey: u32;
if (srcIsA) { srcKey = sortA[i]; } else { srcKey = sortB[i]; }
let bk = (srcKey >> shift) & 0xFu;
if (bk == b) { localCount = localCount + 1u; }
}
shScan[tid] = localCount;
workgroupBarrier();
// (b) Hillis-Steele inclusive prefix scan across 1024 lanes.
// Single-buffered: read snapshot → barrier → write → barrier.
for (var step: u32 = 0u; step < SCAN_STEPS; step = step + 1u) {
let offset = 1u << step;
let v = shScan[tid];
var prev: u32 = 0u;
if (tid >= offset) { prev = shScan[tid - offset]; }
workgroupBarrier();
shScan[tid] = v + prev;
workgroupBarrier();
}
// Convert inclusive→exclusive by subtracting own contribution.
let myExclusivePrefix = shScan[tid] - localCount;
// (c) Scatter my bucket-b elements at the computed positions.
var localIdx: u32 = 0u;
for (var k: u32 = 0u; k < K_PER; k = k + 1u) {
let i = k * THREADS + tid;
var srcKey: u32;
if (srcIsA) { srcKey = sortA[i]; } else { srcKey = sortB[i]; }
let bk = (srcKey >> shift) & 0xFu;
if (bk == b) {
let dst = shOffsets[b] + myExclusivePrefix + localIdx;
if (srcIsA) { sortB[dst] = srcKey; } else { sortA[dst] = srcKey; }
localIdx = localIdx + 1u;
}
}
workgroupBarrier();
}
storageBarrier();
}
}
// After 8 ping-pongs (even count) the sorted keys live in sortA.
// ── Phase 3: write sorted instance permutation into outOrder ─────────
for (var k: u32 = 0u; k < K_PER; k = k + 1u) {
let i = k * THREADS + tid;
if (i < n) {
outOrder[i] = sortA[i] & 0xFFFFu;
}
}
workgroupBarrier();
storageBarrier();
// ── Phase 4: initialize BVH leaf AABBs ───────────────────────────────
for (var k: u32 = 0u; k < K_PER; k = k + 1u) {
let i = k * THREADS + tid;
let leafIdx = N_PADDED - 1u + i;
let leafKey = sortA[i];
if (leafKey == 0xFFFFFFFFu) {
outBvh[leafIdx].aabbMin = vec3<f32>( 1e30);
outBvh[leafIdx].aabbMax = vec3<f32>(-1e30);
} else {
let e = entries[leafKey & 0xFFFFu];
outBvh[leafIdx].aabbMin = e.aabbMin;
outBvh[leafIdx].aabbMax = e.aabbMax;
}
}
workgroupBarrier();
storageBarrier();
// ── Phase 5: bottom-up sweep-tree refit, LEVELS iterations ──────────
// Deepest internal level has N_PADDED/2 nodes; perThread = ceil of
// levelCount / THREADS is uniform per step, so workgroupBarrier
// stays in uniform control flow.
var levelCount: u32 = N_PADDED / 2u;
var levelStart: u32 = N_PADDED / 2u - 1u;
for (var step: u32 = 0u; step < LEVELS; step = step + 1u) {
let perThread = (levelCount + THREADS - 1u) / THREADS;
for (var k: u32 = 0u; k < perThread; k = k + 1u) {
let nodeOff = k * THREADS + tid;
if (nodeOff < levelCount) {
let nodeIdx = levelStart + nodeOff;
let leftIdx = 2u * nodeIdx + 1u;
let rightIdx = 2u * nodeIdx + 2u;
let lMin = outBvh[leftIdx].aabbMin;
let lMax = outBvh[leftIdx].aabbMax;
let rMin = outBvh[rightIdx].aabbMin;
let rMax = outBvh[rightIdx].aabbMax;
outBvh[nodeIdx].aabbMin = min(lMin, rMin);
outBvh[nodeIdx].aabbMax = max(lMax, rMax);
}
}
workgroupBarrier();
storageBarrier();
levelCount = levelCount / 2u;
levelStart = (levelStart - 1u) / 2u;
}
}
`;
// ── RT runtime state ──────────────────────────────────────────────────
// Mesh heaps grow geometrically; each Mesh::Build appends + records its
// offsets in meshRecordsCpu/Buffer. nextMeshHandle is what gets returned
// to C++ as RTInstance::accelerationStructureReference.
const RT_HEAP_INITIAL_BYTES = 64 * 1024;
function makeRtHeap() {
return {
gpu: device.createBuffer({
size: RT_HEAP_INITIAL_BYTES,
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST,
}),
capacity: RT_HEAP_INITIAL_BYTES,
cursor: 0,
};
}
function rtHeapEnsure(h, neededBytes) {
if (h.cursor + neededBytes <= h.capacity) return;
let cap = h.capacity;
while (h.cursor + neededBytes > cap) cap *= 2;
const ng = device.createBuffer({
size: cap,
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST,
});
const enc = device.createCommandEncoder();
if (h.cursor > 0) enc.copyBufferToBuffer(h.gpu, 0, ng, 0, h.cursor);
queue.submit([enc.finish()]);
h.gpu.destroy();
h.gpu = ng;
h.capacity = cap;
// Invalidate cached bind groups that referenced the heap.
rtState.bindGroupCache.clear();
}
const rtState = {
vertHeap: null, // f32 stream (3 floats per vertex)
indexHeap: null, // u32 stream
bvhHeap: null, // BVHNode stream (32 bytes per node)
primRemapHeap: null, // u32 stream
attribsHeap: null, // u32 stream (per-vertex attribute payload; example-defined stride)
meshRecordsBuffer: null, // GPUBuffer of MeshRecord[]
meshRecordsCapacity: 0,
nextMeshHandle: 1,
rtHeader: null, // uniform buffer for RTDispatchHeader (256 B aligned)
bindGroupCache: new Map(), // key → bind group
tlasBuildPipeline: null,
tlasBuildBgl: null,
// Latest TLAS buffer handle from wgpuBuildTLAS, used by rayQuery-capable
// compute shaders at dispatch time.
currentTlas: 0,
currentTlasInstanceCount: 0,
};
function rtInit() {
rtState.vertHeap = makeRtHeap();
rtState.indexHeap = makeRtHeap();
rtState.bvhHeap = makeRtHeap();
rtState.primRemapHeap = makeRtHeap();
rtState.attribsHeap = makeRtHeap();
rtState.meshRecordsCapacity = 16;
rtState.meshRecordsBuffer = device.createBuffer({
size: rtState.meshRecordsCapacity * 48,
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC,
});
rtState.rtHeader = device.createBuffer({
size: 256,
usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
});
// TLAS-build compute pipeline. group(0) carries:
// 0 inInstances (read-only-storage)
// 1 inMeshes (read-only-storage)
// 2 outEntries (read-write storage)
// 3 outOrder (read-write — identity perm, sort overwrites later)
// 4 outMorton (read-write — sort key, 30-bit Morton from centroid)
const mod = device.createShaderModule({ code: tlasBuildWgsl, label: "rt-tlas-build" });
const tlasBuildBgl = device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } },
{ binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } },
{ binding: 4, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } },
]});
rtState.tlasBuildBgl = tlasBuildBgl;
rtState.tlasBuildPipeline = device.createComputePipeline({
layout: device.createPipelineLayout({ bindGroupLayouts: [tlasBuildBgl] }),
compute: { module: mod, entryPoint: "tlasBuildMain" },
});
// LBVH-build follow-up pipeline. Single workgroup of 1024 threads
// sorts instances by packed (morton16:idx16) keys (LSD radix) and
// refits a sweep-tree BVH. N_PADDED = 1024 ceiling for Stage 1.
// 0 entries (read-only)
// 1 outOrder (read-write, sorted permutation)
// 2 outBvh (read-write, 2*N_PADDED - 1 nodes × 32 bytes)
// 3 sortA (read-write, N_PADDED u32 ping-pong)
// 4 sortB (read-write, N_PADDED u32 ping-pong)
const lbvhMod = device.createShaderModule({ code: lbvhBuildWgsl, label: "rt-tlas-lbvh-build" });
const lbvhBuildBgl = device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } },
{ binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } },
{ binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } },
{ binding: 4, visibility: GPUShaderStage.COMPUTE, buffer: { type: "storage" } },
{ binding: 5, visibility: GPUShaderStage.COMPUTE, buffer: { type: "uniform" } },
]});
rtState.lbvhBuildBgl = lbvhBuildBgl;
rtState.lbvhBuildPipeline = device.createComputePipeline({
layout: device.createPipelineLayout({ bindGroupLayouts: [lbvhBuildBgl] }),
compute: { module: lbvhMod, entryPoint: "lbvhBuildMain" },
});
// Tiny uniform buffer for the LBVH's `nReal` field. Written each
// wgpuBuildTLAS call before dispatch.
rtState.lbvhCountBuf = device.createBuffer({
size: 16,
usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
});
}
function rtMeshRecordsEnsure(meshCount) {
if (meshCount <= rtState.meshRecordsCapacity) return;
let cap = rtState.meshRecordsCapacity;
while (cap < meshCount) cap *= 2;
const ng = device.createBuffer({
size: cap * 48,
usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC,
});
const enc = device.createCommandEncoder();
enc.copyBufferToBuffer(rtState.meshRecordsBuffer, 0, ng, 0,
rtState.meshRecordsCapacity * 48);
queue.submit([enc.finish()]);
rtState.meshRecordsBuffer.destroy();
rtState.meshRecordsBuffer = ng;
rtState.meshRecordsCapacity = cap;
rtState.bindGroupCache.clear();
}
env.wgpuRegisterMeshBLAS = (minX, minY, minZ, maxX, maxY, maxZ,
verticesPtr, vertexCount,
indicesPtr, indexCount,
bvhNodesPtr, bvhNodeCount,
primRemapPtr, primRemapCount,
attribsPtr, attribsByteCount) => {
if (!rtState.vertHeap) rtInit();
console.log(`[crafter-wgpu] mesh BLAS: bbox=(${minX.toFixed(1)}..${maxX.toFixed(1)}, ${minY.toFixed(1)}..${maxY.toFixed(1)}, ${minZ.toFixed(1)}..${maxZ.toFixed(1)}), ${vertexCount} verts, ${indexCount/3} tris, attribs=${attribsByteCount}B`);
const vBytes = vertexCount * 12;
const iBytes = indexCount * 4;
const nBytes = bvhNodeCount * 32;
const rBytes = primRemapCount * 4;
// attribsByteCount must be a multiple of 4 (the heap is array<u32>).
// Round up the upload size; the in-MeshRecord offset is in u32 words.
const aBytes = (attribsByteCount + 3) & ~3;
rtHeapEnsure(rtState.vertHeap, vBytes);
rtHeapEnsure(rtState.indexHeap, iBytes);
rtHeapEnsure(rtState.bvhHeap, nBytes);
rtHeapEnsure(rtState.primRemapHeap, rBytes);
if (aBytes > 0) rtHeapEnsure(rtState.attribsHeap, aBytes);
const vOff = rtState.vertHeap.cursor / 12; // in vec3 units
const iOff = rtState.indexHeap.cursor / 4; // in u32 units
const nOff = rtState.bvhHeap.cursor / 32; // in BVHNode units
const rOff = rtState.primRemapHeap.cursor / 4;
const aOff = rtState.attribsHeap.cursor / 4; // in u32 units
// queue.writeBuffer requires multiple-of-4 sizes. Vertex byte count is
// already 12*n; index/bvh/remap are 4*n / 32*n / 4*n — all multiples of 4.
queue.writeBuffer(rtState.vertHeap.gpu, rtState.vertHeap.cursor,
memU8().buffer, verticesPtr, vBytes);
queue.writeBuffer(rtState.indexHeap.gpu, rtState.indexHeap.cursor,
memU8().buffer, indicesPtr, iBytes);
queue.writeBuffer(rtState.bvhHeap.gpu, rtState.bvhHeap.cursor,
memU8().buffer, bvhNodesPtr, nBytes);
queue.writeBuffer(rtState.primRemapHeap.gpu, rtState.primRemapHeap.cursor,
memU8().buffer, primRemapPtr, rBytes);
if (aBytes > 0) {
queue.writeBuffer(rtState.attribsHeap.gpu, rtState.attribsHeap.cursor,
memU8().buffer, attribsPtr, aBytes);
}
rtState.vertHeap.cursor += vBytes;
rtState.indexHeap.cursor += iBytes;
rtState.bvhHeap.cursor += nBytes;
rtState.primRemapHeap.cursor += rBytes;
rtState.attribsHeap.cursor += aBytes;
const handle = rtState.nextMeshHandle++;
rtMeshRecordsEnsure(handle + 1);
// Build the MeshRecord (48 bytes) and write it.
const rec = new ArrayBuffer(48);
const f32 = new Float32Array(rec);
const u32 = new Uint32Array(rec);
f32[0] = minX; f32[1] = minY; f32[2] = minZ;
u32[3] = vOff;
f32[4] = maxX; f32[5] = maxY; f32[6] = maxZ;
u32[7] = iOff;
u32[8] = nOff;
u32[9] = rOff;
u32[10] = (vertexCount > 0) ? (indexCount / 3) : 0;
u32[11] = aOff;
queue.writeBuffer(rtState.meshRecordsBuffer, handle * 48, rec);
return handle;
};
env.wgpuBuildTLAS = (instanceBufHandle, instanceCount, tlasOutBufHandle,
entryOrderHandle, mortonHandle, binsHandle,
bvhNodesHandle, sortABufHandle, sortBBufHandle) => {
if (!rtState.tlasBuildPipeline || !rtState.lbvhBuildPipeline) return;
const inst = buffers.get(instanceBufHandle);
const out = buffers.get(tlasOutBufHandle);
const order = buffers.get(entryOrderHandle);
const morton = buffers.get(mortonHandle);
const bvh = buffers.get(bvhNodesHandle);
const sortA = buffers.get(sortABufHandle);
const sortB = buffers.get(sortBBufHandle);
if (!inst || !out || !order || !morton || !bvh || !sortA || !sortB) {
console.error("[crafter-wgpu] wgpuBuildTLAS: unknown buffer handle");
return;
}
if (instanceCount > LBVH_MAX) {
console.error(`[crafter-wgpu] wgpuBuildTLAS: instance count ${instanceCount} > LBVH cap ${LBVH_MAX}`);
return;
}
const bg = device.createBindGroup({
layout: rtState.tlasBuildBgl,
entries: [
{ binding: 0, resource: { buffer: inst } },
{ binding: 1, resource: { buffer: rtState.meshRecordsBuffer } },
{ binding: 2, resource: { buffer: out } },
{ binding: 3, resource: { buffer: order } },
{ binding: 4, resource: { buffer: morton } },
],
});
// Write the real instance count to the LBVH count uniform so the
// shader can iterate exactly the right number of entries even
// though the storage buffers stay sized for N_PADDED.
const countBuf = new Uint32Array(4);
countBuf[0] = instanceCount;
queue.writeBuffer(rtState.lbvhCountBuf, 0, countBuf);
const lbvhBg = device.createBindGroup({
layout: rtState.lbvhBuildBgl,
entries: [
{ binding: 0, resource: { buffer: out } },
{ binding: 1, resource: { buffer: order } },
{ binding: 2, resource: { buffer: bvh } },
{ binding: 3, resource: { buffer: sortA } },
{ binding: 4, resource: { buffer: sortB } },
{ binding: 5, resource: { buffer: rtState.lbvhCountBuf } },
],
});
if (state.pass) {
// Mid-frame — close the user's compute pass, run our build pass
// on the same encoder, then reopen.
state.pass.end();
state.pass = null;
}
const enc = state.encoder || device.createCommandEncoder();
const ownEncoder = !state.encoder;
// Pass 1: TLAS entry build (existing).
{
const pass = enc.beginComputePass({ label: "tlas-build" });
pass.setPipeline(rtState.tlasBuildPipeline);
pass.setBindGroup(0, bg);
const groups = Math.ceil(instanceCount / 64);
pass.dispatchWorkgroups(groups, 1, 1);
pass.end();
}
// Pass 2: LBVH-build. Single workgroup of 1024 threads runs the
// entire BVH build (Morton, sort, sweep-tree refit) in shared
// memory. Same encoder → pipeline barrier sequences pass1 → pass2.
if (instanceCount > 0) {
const pass = enc.beginComputePass({ label: "tlas-lbvh-build" });
pass.setPipeline(rtState.lbvhBuildPipeline);
pass.setBindGroup(0, lbvhBg);
pass.dispatchWorkgroups(1, 1, 1);
pass.end();
}
if (ownEncoder) {
queue.submit([enc.finish()]);
} else {
state.pass = state.encoder.beginComputePass();
}
// Publish so rayQuery-capable compute pipelines pick up the latest TLAS
// without each dispatch having to thread the handle explicitly.
rtState.currentTlas = tlasOutBufHandle;
rtState.currentTlasInstanceCount = instanceCount;
rtState.currentEntryOrder = entryOrderHandle;
rtState.currentBvh = bvhNodesHandle;
};
// RT pipeline loader — wraps user-supplied WGSL (sources + generated mega
// switches + raygen + @compute entry) with the library prelude/helpers.
// `bindingsPtr` / `bindingsCount` are UICustomBinding entries (same 8-byte
// shape as wgpuLoadCustomShader) declaring extra @group(2)+ resources the
// closest-hit / miss / raygen WGSL touches (material SSBOs, albedo
// textures, samplers). Pass (0, 0) for a pipeline with no user bindings.
const rtPipelines = new Map(); // handle → { pipeline, bgls, byGroup, sortedGroups }
env.wgpuLoadRTPipeline = (wgslPtr, wgslLen, bindingsPtr, bindingsCount) => {
if (!rtState.vertHeap) rtInit();
const userPart = new TextDecoder().decode(memU8().subarray(wgslPtr, wgslPtr + wgslLen));
// Insert helpers at the marker; prepend prelude.
const marker = "// @CRAFTER_RT_LIBRARY_HELPERS_HERE";
let beforeHelpers = userPart;
let afterHelpers = "";
const mi = userPart.indexOf(marker);
if (mi >= 0) {
beforeHelpers = userPart.substring(0, mi);
afterHelpers = userPart.substring(mi + marker.length);
}
const fullWgsl = rtWgslPrelude + "\n" + beforeHelpers + "\n" + rtWgslPureHelpers + "\n" + rtWgslMegakernelHelpers + "\n" + afterHelpers;
// Parse user bindings (same wire format as wgpuLoadCustomShader).
const userBindings = [];
if (bindingsCount > 0) {
const dv = new DataView(memU8().buffer, bindingsPtr, bindingsCount * 8);
for (let i = 0; i < bindingsCount; i++) {
const g = dv.getUint8(i*8 + 0);
if (g < 2) {
console.error(`[crafter-wgpu] RT pipeline: @group(${g}) reserved; user bindings need group >= 2`);
return 0;
}
userBindings.push({
group: g,
binding: dv.getUint8(i*8 + 1),
kind: dv.getUint8(i*8 + 2),
pushOffset: dv.getUint32(i*8 + 4, true),
});
}
}
const byGroup = new Map();
for (const b of userBindings) {
if (!byGroup.has(b.group)) byGroup.set(b.group, []);
byGroup.get(b.group).push(b);
}
const sortedGroups = [...byGroup.keys()].sort((a, b) => a - b);
let pipeline;
try {
const mod = device.createShaderModule({ code: fullWgsl, label: "rt-megakernel" });
// RTDispatchHeader is 16 bytes; bind exactly that.
const headerBgl = device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE,
buffer: { type: "uniform", minBindingSize: 16 } },
]});
const dataBgl = device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 4, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 5, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 6, visibility: GPUShaderStage.COMPUTE,
storageTexture: { format: "rgba8unorm", access: "write-only", viewDimension: "2d" } },
{ binding: 7, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 8, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 9, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
]});
// User binding-group layouts. WebGPU pipeline layouts need a
// contiguous array up to the highest group used, so pad any gaps
// with empty bgls (same rule as wgpuLoadCustomShader).
const userBgls = [];
const highest = sortedGroups.length ? sortedGroups[sortedGroups.length - 1] : 1;
for (let g = 2; g <= highest; g++) {
if (byGroup.has(g)) {
const entries = byGroup.get(g).map(b => {
const e = { binding: b.binding, visibility: GPUShaderStage.COMPUTE };
if (b.kind === 0) e.buffer = { type: "read-only-storage" };
else if (b.kind === 1) e.texture = { sampleType: "float", viewDimension: "2d" };
else if (b.kind === 2) e.sampler = { type: "filtering" };
else if (b.kind === 3) e.texture = { sampleType: "float", viewDimension: "2d-array" };
return e;
});
userBgls.push(device.createBindGroupLayout({ entries }));
} else {
userBgls.push(device.createBindGroupLayout({ entries: [] }));
}
}
pipeline = device.createComputePipeline({
layout: device.createPipelineLayout({ bindGroupLayouts: [headerBgl, dataBgl, ...userBgls] }),
compute: { module: mod, entryPoint: "main" },
});
const handle = newHandle();
rtPipelines.set(handle, { pipeline, headerBgl, dataBgl, userBgls, byGroup, sortedGroups });
return handle;
} catch (e) {
console.error("[crafter-wgpu] RT pipeline compile failed:", e);
console.error("[crafter-wgpu] WGSL was:\n", fullWgsl);
return 0;
}
};
env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
tlasBufHandle, instanceCount, gx, gy,
handlesPtr, handlesCount) => {
if (!state.pass) return;
const pipe = rtPipelines.get(pipelineHandle);
const tlas = buffers.get(tlasBufHandle);
if (!pipe || !tlas) {
console.error("[crafter-wgpu] wgpuDispatchRT: unknown pipeline or tlas");
return;
}
// Write RT header from push data (first 16 bytes). Surface dims + instance count + flags.
const hdr32 = new Uint32Array(4);
hdr32[0] = state.width;
hdr32[1] = state.height;
hdr32[2] = instanceCount;
hdr32[3] = 0;
queue.writeBuffer(rtState.rtHeader, 0, hdr32);
const headerBg = device.createBindGroup({
layout: pipe.headerBgl,
entries: [{ binding: 0, resource: { buffer: rtState.rtHeader, offset: 0, size: 16 } }],
});
const outView = state.outIsPing ? state.pingView : state.pongView;
const entryOrderBuf = buffers.get(rtState.currentEntryOrder);
const bvhBuf = buffers.get(rtState.currentBvh);
if (!entryOrderBuf || !bvhBuf) {
console.error("[crafter-wgpu] wgpuDispatchRT: missing entryOrder/bins (no TLAS built yet?)");
return;
}
const dataBg = device.createBindGroup({
layout: pipe.dataBgl,
entries: [
{ binding: 0, resource: { buffer: tlas } },
{ binding: 1, resource: { buffer: rtState.bvhHeap.gpu } },
{ binding: 2, resource: { buffer: rtState.meshRecordsBuffer } },
{ binding: 3, resource: { buffer: rtState.vertHeap.gpu } },
{ binding: 4, resource: { buffer: rtState.indexHeap.gpu } },
{ binding: 5, resource: { buffer: rtState.primRemapHeap.gpu } },
{ binding: 6, resource: outView },
{ binding: 7, resource: { buffer: rtState.attribsHeap.gpu } },
{ binding: 8, resource: { buffer: entryOrderBuf } },
{ binding: 9, resource: { buffer: bvhBuf } },
],
});
state.pass.setPipeline(pipe.pipeline);
state.pass.setBindGroup(0, headerBg);
state.pass.setBindGroup(1, dataBg);
// User bindings: walk byGroup in the same sorted order the C++ side
// packed handles[], picking up indices linearly.
if (handlesCount > 0) {
const handles = new Uint32Array(memU8().buffer, handlesPtr, handlesCount);
let handleIdx = 0;
let bglIdx = 0;
for (let g = 2; g <= (pipe.sortedGroups[pipe.sortedGroups.length - 1] || 1); g++) {
if (pipe.byGroup.has(g)) {
const entries = pipe.byGroup.get(g).map(b => {
const h = handles[handleIdx++];
let resource;
if (b.kind === 0) resource = { buffer: buffers.get(h) };
else if (b.kind === 1) resource = textureViews.get(h);
else if (b.kind === 2) resource = samplers.get(h);
else if (b.kind === 3) resource = textureViews.get(h);
return { binding: b.binding, resource };
});
const bg = device.createBindGroup({
layout: pipe.userBgls[bglIdx],
entries,
});
state.pass.setBindGroup(g, bg);
}
bglIdx++;
}
}
state.pass.dispatchWorkgroups(gx, gy, 1);
state.outIsPing = !state.outIsPing;
};
// ── Standalone compute pipelines ────────────────────────────────────────
//
// Mirrors the native Vulkan `ComputeShader` API: user-authored compute
// pipelines that dispatch outside any UI render pass. Unlike
// wgpuLoadCustomShader (which lives inside the UI flow with ping/pong
// textures at @group(1)), these have NO library-supplied bindings —
// users declare all groups themselves.
//
// Layout contract:
// @group(0) @binding(0) uniform PushData // optional, only if pushUniformSize > 0
// @group(1+) @binding(N) // user bindings via UICustomBinding
//
// rayQuery: same flag as wgpuLoadCustomShader. When set, prepends the
// RT prelude + rayQuery library; user shader must NOT redeclare the
// resulting @group(1) layout (tlas/bvh/mesh heaps). The `bindings`
// list must start at @group(2) in that case.
const computePipelines = new Map(); // handle → { pipeline, bgls, byGroup, sortedGroups, pushUniformSize, rayQueryCapable }
env.wgpuLoadComputePipeline = (wgslPtr, wgslLen, pushUniformSize,
bindingsPtr, bindingsCount, rayQueryFlag) => {
if (!rtState.vertHeap && rayQueryFlag) rtInit();
const userWgsl = new TextDecoder().decode(memU8().subarray(wgslPtr, wgslPtr + wgslLen));
const wgsl = rayQueryFlag
// rayQueryLib's _rqTraverseBlas/_rqTraverseTlas call _rtAabb,
// _rtFetchTri, _rtTri from rtWgslPureHelpers — must prepend
// the pure helper subset (NOT the megakernel-only traversal
// routines, which reference user-emitted runAnyHit/runMiss/
// runClosestHit and won't compile outside the raygen pipeline).
? (rtWgslTypes + rtWgslMegakernelBindings + rtWgslPureHelpers + rtWgslRayQueryLib + "\n" + userWgsl)
: userWgsl;
const bindings = [];
if (bindingsCount > 0) {
const dv = new DataView(memU8().buffer, bindingsPtr, bindingsCount * 8);
for (let i = 0; i < bindingsCount; i++) {
bindings.push({
group: dv.getUint8(i*8 + 0),
binding: dv.getUint8(i*8 + 1),
kind: dv.getUint8(i*8 + 2),
pushOffset: dv.getUint32(i*8 + 4, true),
});
}
}
// Bind-group layouts. The push uniform sits at @group(0) if present;
// user bindings start at @group(1) (or @group(2) when rayQuery is on
// — see rtWgslMegakernelBindings).
const bgls = [];
if (pushUniformSize > 0) {
bgls.push(device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE,
buffer: { type: "uniform", minBindingSize: pushUniformSize } },
]}));
} else {
bgls.push(device.createBindGroupLayout({ entries: [] }));
}
if (rayQueryFlag) {
// group(1) is the megakernel RT bindings — same layout as the RT
// pipeline's group(1). The injected rayQuery library reads it.
bgls.push(device.createBindGroupLayout({ entries: [
{ binding: 0, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 1, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 2, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 3, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 4, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 5, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 6, visibility: GPUShaderStage.COMPUTE,
storageTexture: { format: "rgba8unorm", access: "write-only", viewDimension: "2d" } },
{ binding: 7, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 8, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
{ binding: 9, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
]}));
}
const byGroup = new Map();
const userGroupBase = rayQueryFlag ? 2 : 1;
for (const b of bindings) {
if (b.group < userGroupBase) {
console.error(`[crafter-wgpu] compute pipeline: @group(${b.group}) reserved; user groups must be >= ${userGroupBase}`);
return 0;
}
if (!byGroup.has(b.group)) byGroup.set(b.group, []);
byGroup.get(b.group).push(b);
}
const sortedGroups = [...byGroup.keys()].sort((a, b) => a - b);
const highest = sortedGroups.length ? sortedGroups[sortedGroups.length - 1] : userGroupBase - 1;
for (let g = userGroupBase; g <= highest; g++) {
if (byGroup.has(g)) {
const entries = byGroup.get(g).map(b => {
const e = { binding: b.binding, visibility: GPUShaderStage.COMPUTE };
if (b.kind === 0) e.buffer = { type: "read-only-storage" };
else if (b.kind === 1) e.texture = { sampleType: "float", viewDimension: "2d" };
else if (b.kind === 2) e.sampler = { type: "filtering" };
else if (b.kind === 3) e.texture = { sampleType: "float", viewDimension: "2d-array" };
else if (b.kind === 4) e.buffer = { type: "storage" }; // read-write storage
return e;
});
bgls.push(device.createBindGroupLayout({ entries }));
} else {
bgls.push(device.createBindGroupLayout({ entries: [] }));
}
}
let pipeline;
try {
const mod = device.createShaderModule({ code: wgsl, label: "compute-pipeline" });
// Async compile-info to surface parse errors with line numbers.
mod.getCompilationInfo().then(info => {
const issues = info.messages.filter(m => m.type === "error" || m.type === "warning");
if (issues.length === 0) return;
const lines = wgsl.split("\n");
for (const m of issues) {
const ln = m.lineNum || 0, col = m.linePos || 0;
console[m.type === "error" ? "error" : "warn"](
`[crafter-wgpu] compute-pipeline ${m.type} at ${ln}:${col}: ${m.message}`);
for (let i = Math.max(1, ln - 3); i <= Math.min(lines.length, ln + 3); i++) {
const marker = i === ln ? ">> " : " ";
console.log(`${marker}${String(i).padStart(4)}: ${lines[i - 1]}`);
}
}
});
const layout = device.createPipelineLayout({ bindGroupLayouts: bgls });
pipeline = device.createComputePipeline({ layout, compute: { module: mod, entryPoint: "main" } });
} catch (e) {
console.error("[crafter-wgpu] compute pipeline compile failed:", e);
return 0;
}
const handle = newHandle();
computePipelines.set(handle, {
pipeline, bgls, byGroup, sortedGroups,
pushUniformSize, userGroupBase,
rayQueryCapable: !!rayQueryFlag,
});
return handle;
};
// Dispatch a standalone compute pipeline. Works both inside the per-frame
// UI compute pass (mid-frame physics tick) and outside it (game-tick
// dispatch from update lambda) — matching the wgpuBuildTLAS pattern.
//
// Push data: if the pipeline declared a push uniform (pushUniformSize > 0
// at load), `pushPtr` points at exactly that many bytes. We allocate a
// transient uniform buffer per dispatch — fine for the physics rate
// (handful of dispatches per substep, single-digit substeps per frame).
//
// Handles: parallel to the UICustomBinding[] declaration order at load
// time. Each handle resolves through the engine's buffer/texture/
// sampler tables to the live WebGPU resource.
env.wgpuDispatchCompute = (pipelineHandle, pushPtr, pushBytes,
handlesPtr, handlesCount, gx, gy, gz) => {
const pipe = computePipelines.get(pipelineHandle);
if (!pipe) {
console.error("[crafter-wgpu] wgpuDispatchCompute: unknown pipeline", pipelineHandle);
return;
}
// ── Push uniform: transient buffer per dispatch ────────────────────
let pushBG = null;
if (pipe.pushUniformSize > 0) {
const buf = device.createBuffer({
size: Math.max(16, (pipe.pushUniformSize + 15) & ~15),
usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
});
if (pushBytes > 0) {
queue.writeBuffer(buf, 0, memU8().subarray(pushPtr, pushPtr + pushBytes));
}
pushBG = device.createBindGroup({
layout: pipe.bgls[0],
entries: [{ binding: 0, resource: { buffer: buf, offset: 0, size: pipe.pushUniformSize } }],
});
} else {
pushBG = device.createBindGroup({ layout: pipe.bgls[0], entries: [] });
}
// ── RT bindings (rayQuery only) ────────────────────────────────────
let rtBG = null;
if (pipe.rayQueryCapable) {
const tlasBuf = buffers.get(rtState.currentTlas);
const orderBuf = buffers.get(rtState.currentEntryOrder);
const bvhBuf = buffers.get(rtState.currentBvh);
if (!tlasBuf || !orderBuf || !bvhBuf) {
console.error("[crafter-wgpu] wgpuDispatchCompute rayQuery: no TLAS built yet");
return;
}
const outView = state.outIsPing ? state.pingView : state.pongView;
rtBG = device.createBindGroup({
layout: pipe.bgls[1],
entries: [
{ binding: 0, resource: { buffer: tlasBuf } },
{ binding: 1, resource: { buffer: rtState.bvhHeap.gpu } },
{ binding: 2, resource: { buffer: rtState.meshRecordsBuffer } },
{ binding: 3, resource: { buffer: rtState.vertHeap.gpu } },
{ binding: 4, resource: { buffer: rtState.indexHeap.gpu } },
{ binding: 5, resource: { buffer: rtState.primRemapHeap.gpu } },
{ binding: 6, resource: outView },
{ binding: 7, resource: { buffer: rtState.attribsHeap.gpu } },
{ binding: 8, resource: { buffer: orderBuf } },
{ binding: 9, resource: { buffer: bvhBuf } },
],
});
}
// ── User bind groups from handles ──────────────────────────────────
const handleArr = new Uint32Array(memU8().buffer, handlesPtr, handlesCount);
const userBGs = [];
let handleCursor = 0;
const userGroupBase = pipe.userGroupBase;
const userBglStart = pipe.bgls.length - (pipe.sortedGroups.length
? (pipe.sortedGroups[pipe.sortedGroups.length - 1] - userGroupBase + 1)
: 0);
let bglIdx = userBglStart;
if (pipe.sortedGroups.length > 0) {
for (let g = userGroupBase; g <= pipe.sortedGroups[pipe.sortedGroups.length - 1]; g++) {
if (pipe.byGroup.has(g)) {
const entries = pipe.byGroup.get(g).map(b => {
const h = handleArr[handleCursor++];
let resource;
if (b.kind === 0 || b.kind === 4) resource = { buffer: buffers.get(h) };
else if (b.kind === 1) resource = textureViews.get(h);
else if (b.kind === 2) resource = samplers.get(h);
else if (b.kind === 3) resource = textureViews.get(h);
return { binding: b.binding, resource };
});
userBGs.push(device.createBindGroup({ layout: pipe.bgls[bglIdx++], entries }));
} else {
userBGs.push(device.createBindGroup({ layout: pipe.bgls[bglIdx++], entries: [] }));
}
}
}
// ── Execute: attach to active pass, or create a standalone one. ────
const runDispatch = (pass) => {
pass.setPipeline(pipe.pipeline);
pass.setBindGroup(0, pushBG);
if (rtBG) pass.setBindGroup(1, rtBG);
for (let i = 0; i < userBGs.length; i++) {
pass.setBindGroup(userGroupBase + i, userBGs[i]);
}
pass.dispatchWorkgroups(gx, gy, gz);
};
if (state.pass) {
// Mid-frame: dispatch into the current UI compute pass.
runDispatch(state.pass);
} else {
// Standalone: build encoder + pass + submit. Mirrors wgpuBuildTLAS.
const enc = state.encoder || device.createCommandEncoder();
const ownEncoder = !state.encoder;
const pass = enc.beginComputePass({ label: "plain-compute" });
runDispatch(pass);
pass.end();
if (ownEncoder) queue.submit([enc.finish()]);
}
};
console.log("[crafter-wgpu] init complete; env handlers wired");
// Memory diagnostic. Logs handle-table sizes every 5s so a slow leak
// shows up as monotonic growth. Comment out for production builds.
setInterval(() => {
const m = (performance.memory && performance.memory.usedJSHeapSize) || 0;
console.log(`[crafter-wgpu] mem: js=${(m / 1024 / 1024).toFixed(1)}MB`
+ ` buffers=${buffers.size}`
+ ` textures=${textures.size}`
+ ` samplers=${samplers.size}`
+ ` customPipelines=${customPipelines.size}`
+ ` rtPipelines=${rtPipelines.size}`
+ ` computePipelines=${computePipelines.size}`
+ ` bindGroupCache=${state.bindGroupCache ? state.bindGroupCache.size : 0}`);
}, 5000);
} catch (e) {
// Capture any throw so the stub error messages name the real cause
// instead of "(no error captured)". Re-throw so runtime.js's catch
// also logs it.
initError = e instanceof Error ? e : new Error(String(e));
console.error("[crafter-wgpu] init failed:", initError);
throw initError;
}
})(); // end window.crafter_webbuild_env_ready