webgpu sponza

2026-05-19 00:27:09 +02:00 · 2026-05-19 00:27:09 +02:00 · b5d0f52da0
commit b5d0f52da0
parent 5553ded476
21 changed files with 1426 additions and 58 deletions
--- a/additional/dom-webgpu.js
+++ b/additional/dom-webgpu.js
@ -43,6 +43,8 @@ function stub(name) {
        "wgpuGetCanvasWidth", "wgpuGetCanvasHeight", "wgpuSurfaceWidth", "wgpuSurfaceHeight",
        "wgpuInit", "wgpuCreateBuffer", "wgpuWriteBuffer", "wgpuDestroyBuffer",
        "wgpuCreateAtlasTexture", "wgpuWriteAtlasRegion", "wgpuDestroyTexture",
        "wgpuCreateImage2D", "wgpuWriteImage2D",
        "wgpuCreateImage2DArray", "wgpuWriteImage2DLayer",
        "wgpuCreateLinearClampSampler", "wgpuFrameBegin", "wgpuFrameEnd",
        "wgpuDispatchQuads", "wgpuDispatchCircles", "wgpuDispatchImages", "wgpuDispatchText",
        "wgpuLoadCustomShader", "wgpuDispatchCustom",
@ -580,6 +582,99 @@ env.wgpuDestroyTexture = (handle) => {
    if (tex) { tex.destroy(); textures.delete(handle); textureViews.delete(handle); }
 };
 // Whole-image rgba8unorm 2D texture backing Image2D<RGBA8>. Unlike the
 // atlas path (r8unorm, written region by region) the caller is expected
 // to upload the full image in one go, so the texture is sized to exactly
 // the (w × h) pixel data it will receive.
 // Returns the new handle under which both the texture and its default
 // view are registered.
 env.wgpuCreateImage2D = (w, h) => {
    const id = newHandle();
    const usage = GPUTextureUsage.TEXTURE_BINDING | GPUTextureUsage.COPY_DST;
    const texture = device.createTexture({ size: [w, h], format: "rgba8unorm", usage });
    textures.set(id, texture);
    textureViews.set(id, texture.createView());
    return id;
 };
 // Layered variant of the image texture: `layerCount` slices of identical
 // (w × h) rgba8unorm. Backs Image2DArray<RGBA8>, one material albedo per
 // layer; shaders pick a slice with
 // `textureSampleLevel(tex, samp, uv, layerIdx, 0.0)`.
 // Returns the new handle under which the texture and its array view are
 // registered.
 env.wgpuCreateImage2DArray = (w, h, layerCount) => {
    const id = newHandle();
    const usage = GPUTextureUsage.TEXTURE_BINDING | GPUTextureUsage.COPY_DST;
    const descriptor = {
        size: [w, h, layerCount],
        dimension: "2d",
        format: "rgba8unorm",
        usage,
    };
    const texture = device.createTexture(descriptor);
    textures.set(id, texture);
    // The view is requested explicitly as "2d-array" spanning every layer.
    const arrayView = texture.createView({
        dimension: "2d-array",
        arrayLayerCount: layerCount,
    });
    textureViews.set(id, arrayView);
    return id;
 };
 // Uploads one (w × h) RGBA8 image from the memU8() byte view into array
 // layer `layer` of the texture registered under `handle`.
 //   handle   — key into `textures`; an unknown handle makes this a no-op
 //   layer    — destination array layer (z component of the copy origin)
 //   srcPtr   — byte offset of the tightly packed source rows
 //   byteSize — number of source bytes available at srcPtr (needs >= w * h * 4)
 //   w, h     — extent of the upload in texels
 env.wgpuWriteImage2DLayer = (handle, layer, srcPtr, byteSize, w, h) => {
    const tex = textures.get(handle);
    if (!tex) return;
    // RGBA8 = 4 bytes per texel, rows tightly packed in the source.
    const bytesPerRow = w * 4;
    const requiredBytes = bytesPerRow * h;
    if (byteSize < requiredBytes) {
        // Refuse to read past what the caller declared instead of silently
        // uploading whatever follows the source buffer.
        console.error(`[crafter-wgpu] wgpuWriteImage2DLayer: ${byteSize} source bytes < ${requiredBytes} needed for ${w}x${h} RGBA8`);
        return;
    }
    // GPUQueue.writeTexture — unlike copyBufferToTexture — puts no 256-byte
    // alignment rule on bytesPerRow, so the rows go up as-is with no
    // staging repack.
    queue.writeTexture(
        { texture: tex, origin: [0, 0, layer] },
        memU8().subarray(srcPtr, srcPtr + requiredBytes),
        { bytesPerRow, rowsPerImage: h },
        { width: w, height: h, depthOrArrayLayers: 1 }
    );
 };
 // Uploads a whole (w × h) RGBA8 image from the memU8() byte view into the
 // 2D texture registered under `handle`.
 //   handle   — key into `textures`; an unknown handle makes this a no-op
 //   srcPtr   — byte offset of the tightly packed source rows
 //   byteSize — number of source bytes available at srcPtr (needs >= w * h * 4)
 //   w, h     — extent of the upload in texels
 env.wgpuWriteImage2D = (handle, srcPtr, byteSize, w, h) => {
    const tex = textures.get(handle);
    if (!tex) return;
    // RGBA8 = 4 bytes per texel, rows tightly packed in the source.
    const bytesPerRow = w * 4;
    const requiredBytes = bytesPerRow * h;
    if (byteSize < requiredBytes) {
        // Refuse to read past what the caller declared instead of silently
        // uploading whatever follows the source buffer.
        console.error(`[crafter-wgpu] wgpuWriteImage2D: ${byteSize} source bytes < ${requiredBytes} needed for ${w}x${h} RGBA8`);
        return;
    }
    // GPUQueue.writeTexture — unlike copyBufferToTexture — puts no 256-byte
    // alignment rule on bytesPerRow, so the rows go up as-is with no
    // staging repack.
    queue.writeTexture(
        { texture: tex },
        memU8().subarray(srcPtr, srcPtr + requiredBytes),
        { bytesPerRow, rowsPerImage: h },
        { width: w, height: h }
    );
 };
 env.wgpuCreateLinearClampSampler = () => {
    const handle = newHandle();
    samplers.set(handle, device.createSampler({
@ -756,6 +851,7 @@ env.wgpuLoadCustomShader = (wgslPtr, wgslLen, bindingsPtr, bindingsCount, rayQue
                { binding: 5, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
                { binding: 6, visibility: GPUShaderStage.COMPUTE,
                  storageTexture: { format: "rgba8unorm", access: "write-only", viewDimension: "2d" } },
                { binding: 7, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
            ]})
            : device.createBindGroupLayout({ entries: [
                { binding: 0, visibility: GPUShaderStage.COMPUTE,
@ -773,9 +869,10 @@ env.wgpuLoadCustomShader = (wgslPtr, wgslLen, bindingsPtr, bindingsCount, rayQue
        if (byGroup.has(g)) {
            const entries = byGroup.get(g).map(b => {
                const e = { binding: b.binding, visibility: GPUShaderStage.COMPUTE };
-                if      (b.kind === 0) e.buffer = { type: "read-only-storage" };
+                if      (b.kind === 0) e.buffer  = { type: "read-only-storage" };
                else if (b.kind === 1) e.texture = { sampleType: "float", viewDimension: "2d" };
                else if (b.kind === 2) e.sampler = { type: "filtering" };
                else if (b.kind === 3) e.texture = { sampleType: "float", viewDimension: "2d-array" };
                return e;
            });
            bgls.push(device.createBindGroupLayout({ entries }));
@ -839,6 +936,7 @@ env.wgpuDispatchCustom = (pipelineHandle, pushPtr, pushBytes, handlesPtr, handle
                { binding: 4, resource: { buffer: rtState.indexHeap.gpu } },
                { binding: 5, resource: { buffer: rtState.primRemapHeap.gpu } },
                { binding: 6, resource: outView },
                { binding: 7, resource: { buffer: rtState.attribsHeap.gpu } },
            ],
        });
        state.pass.setBindGroup(1, rtBG);
@ -858,6 +956,7 @@ env.wgpuDispatchCustom = (pipelineHandle, pushPtr, pushBytes, handlesPtr, handle
            if      (b.kind === 0) resource = { buffer: buffers.get(h) };
            else if (b.kind === 1) resource = textureViews.get(h);
            else if (b.kind === 2) resource = samplers.get(h);
            else if (b.kind === 3) resource = textureViews.get(h);
            return { binding: b.binding, resource };
        });
        const bg = device.createBindGroup({ layout: pipe.bgls[g], entries });
@ -981,6 +1080,12 @@ struct BVHNode {
 };
 // Per-mesh record. Indexed by RTInstance::accelerationStructureReference.
 // attribsOffset is the per-mesh base index (in u32 words) into the
 // vertexAttribs heap; meshes registered without per-vertex attribs leave
 // it 0 (the heap entries at that range are also 0 / never touched). The
 // per-vertex stride lives in the user's WGSL — the library doesn't store
 // it because the layout is example-defined (Sponza uses 12 u32 / vertex
 // for VertexNormalTangentUVPacked — see ATTRIB_STRIDE_U32 in its WGSL).
 struct MeshRecord {
    rootAabbMin:     vec3<f32>,
    vertexOffset:    u32,
@ -989,7 +1094,7 @@ struct MeshRecord {
    bvhOffset:       u32,
    primRemapOffset: u32,
    triangleCount:   u32,
-    _pad:            u32,
+    attribsOffset:   u32,
 };
 // Per-instance TLAS record built by the TLAS-build compute pass.
@ -1048,6 +1153,7 @@ const rtWgslMegakernelBindings = String.raw`
@group(1) @binding(4) var<storage,read>  indices     : array<u32>;
@group(1) @binding(5) var<storage,read>  primRemap   : array<u32>;
@group(1) @binding(6) var outImage : texture_storage_2d<rgba8unorm, write>;
@group(1) @binding(7) var<storage,read>  vertexAttribs : array<u32>;
 `;
 const rtWgslPrelude = rtWgslTypes + rtWgslMegakernelBindings;
@ -1565,6 +1671,7 @@ const rtState = {
    indexHeap: null,       // u32 stream
    bvhHeap: null,         // BVHNode stream (32 bytes per node)
    primRemapHeap: null,   // u32 stream
    attribsHeap: null,     // u32 stream (per-vertex attribute payload; example-defined stride)
    meshRecordsBuffer: null,    // GPUBuffer of MeshRecord[]
    meshRecordsCapacity: 0,
@ -1588,6 +1695,7 @@ function rtInit() {
    rtState.indexHeap     = makeRtHeap();
    rtState.bvhHeap       = makeRtHeap();
    rtState.primRemapHeap = makeRtHeap();
    rtState.attribsHeap   = makeRtHeap();
    rtState.meshRecordsCapacity = 16;
    rtState.meshRecordsBuffer = device.createBuffer({
        size: rtState.meshRecordsCapacity * 48,
@ -1634,23 +1742,30 @@ env.wgpuRegisterMeshBLAS = (minX, minY, minZ, maxX, maxY, maxZ,
                            verticesPtr, vertexCount,
                            indicesPtr,  indexCount,
                            bvhNodesPtr, bvhNodeCount,
-                            primRemapPtr, primRemapCount) => {
+                            primRemapPtr, primRemapCount,
                            attribsPtr,  attribsByteCount) => {
    if (!rtState.vertHeap) rtInit();
    console.log(`[crafter-wgpu] mesh BLAS: bbox=(${minX.toFixed(1)}..${maxX.toFixed(1)}, ${minY.toFixed(1)}..${maxY.toFixed(1)}, ${minZ.toFixed(1)}..${maxZ.toFixed(1)}), ${vertexCount} verts, ${indexCount/3} tris, attribs=${attribsByteCount}B`);
    const vBytes   = vertexCount  * 12;
    const iBytes   = indexCount   * 4;
    const nBytes   = bvhNodeCount * 32;
    const rBytes   = primRemapCount * 4;
    // attribsByteCount must be a multiple of 4 (the heap is array<u32>).
    // Round up the upload size; the in-MeshRecord offset is in u32 words.
    const aBytes   = (attribsByteCount + 3) & ~3;
    rtHeapEnsure(rtState.vertHeap,      vBytes);
    rtHeapEnsure(rtState.indexHeap,     iBytes);
    rtHeapEnsure(rtState.bvhHeap,       nBytes);
    rtHeapEnsure(rtState.primRemapHeap, rBytes);
    if (aBytes > 0) rtHeapEnsure(rtState.attribsHeap, aBytes);
    const vOff = rtState.vertHeap.cursor      / 12;   // in vec3 units
    const iOff = rtState.indexHeap.cursor     / 4;    // in u32 units
    const nOff = rtState.bvhHeap.cursor       / 32;   // in BVHNode units
    const rOff = rtState.primRemapHeap.cursor / 4;
    const aOff = rtState.attribsHeap.cursor   / 4;    // in u32 units
    // queue.writeBuffer requires multiple-of-4 sizes. Vertex byte count is
    // already 12*n; index/bvh/remap are 4*n / 32*n / 4*n — all multiples of 4.
@ -1662,11 +1777,16 @@ env.wgpuRegisterMeshBLAS = (minX, minY, minZ, maxX, maxY, maxZ,
                      memU8().buffer, bvhNodesPtr, nBytes);
    queue.writeBuffer(rtState.primRemapHeap.gpu, rtState.primRemapHeap.cursor,
                      memU8().buffer, primRemapPtr, rBytes);
    if (aBytes > 0) {
        queue.writeBuffer(rtState.attribsHeap.gpu, rtState.attribsHeap.cursor,
                          memU8().buffer, attribsPtr, aBytes);
    }
    rtState.vertHeap.cursor      += vBytes;
    rtState.indexHeap.cursor     += iBytes;
    rtState.bvhHeap.cursor       += nBytes;
    rtState.primRemapHeap.cursor += rBytes;
    rtState.attribsHeap.cursor   += aBytes;
    const handle = rtState.nextMeshHandle++;
    rtMeshRecordsEnsure(handle + 1);
@ -1682,7 +1802,7 @@ env.wgpuRegisterMeshBLAS = (minX, minY, minZ, maxX, maxY, maxZ,
    u32[8] = nOff;
    u32[9] = rOff;
    u32[10] = (vertexCount > 0) ? (indexCount / 3) : 0;
-    u32[11] = 0;
+    u32[11] = aOff;
    queue.writeBuffer(rtState.meshRecordsBuffer, handle * 48, rec);
    return handle;
@ -1734,9 +1854,13 @@ env.wgpuBuildTLAS = (instanceBufHandle, instanceCount, tlasOutBufHandle) => {
 // RT pipeline loader — wraps user-supplied WGSL (sources + generated mega
 // switches + raygen + @compute entry) with the library prelude/helpers.
-const rtPipelines = new Map(); // handle → { pipeline, bgls }
+// `bindingsPtr` / `bindingsCount` are UICustomBinding entries (same 8-byte
 // shape as wgpuLoadCustomShader) declaring extra @group(2)+ resources the
 // closest-hit / miss / raygen WGSL touches (material SSBOs, albedo
 // textures, samplers). Pass (0, 0) for a pipeline with no user bindings.
 const rtPipelines = new Map(); // handle → { pipeline, bgls, byGroup, sortedGroups }
-env.wgpuLoadRTPipeline = (wgslPtr, wgslLen) => {
+env.wgpuLoadRTPipeline = (wgslPtr, wgslLen, bindingsPtr, bindingsCount) => {
    if (!rtState.vertHeap) rtInit();
    const userPart = new TextDecoder().decode(memU8().subarray(wgslPtr, wgslPtr + wgslLen));
@ -1751,6 +1875,31 @@ env.wgpuLoadRTPipeline = (wgslPtr, wgslLen) => {
    }
    const fullWgsl = rtWgslPrelude + "\n" + beforeHelpers + "\n" + rtWgslHelpers + "\n" + afterHelpers;
    // Parse user bindings (same wire format as wgpuLoadCustomShader).
    const userBindings = [];
    if (bindingsCount > 0) {
        const dv = new DataView(memU8().buffer, bindingsPtr, bindingsCount * 8);
        for (let i = 0; i < bindingsCount; i++) {
            const g = dv.getUint8(i*8 + 0);
            if (g < 2) {
                console.error(`[crafter-wgpu] RT pipeline: @group(${g}) reserved; user bindings need group >= 2`);
                return 0;
            }
            userBindings.push({
                group:      g,
                binding:    dv.getUint8(i*8 + 1),
                kind:       dv.getUint8(i*8 + 2),
                pushOffset: dv.getUint32(i*8 + 4, true),
            });
        }
    }
    const byGroup = new Map();
    for (const b of userBindings) {
        if (!byGroup.has(b.group)) byGroup.set(b.group, []);
        byGroup.get(b.group).push(b);
    }
    const sortedGroups = [...byGroup.keys()].sort((a, b) => a - b);
    let pipeline;
    try {
        const mod = device.createShaderModule({ code: fullWgsl, label: "rt-megakernel" });
@ -1768,13 +1917,34 @@ env.wgpuLoadRTPipeline = (wgslPtr, wgslLen) => {
            { binding: 5, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
            { binding: 6, visibility: GPUShaderStage.COMPUTE,
              storageTexture: { format: "rgba8unorm", access: "write-only", viewDimension: "2d" } },
            { binding: 7, visibility: GPUShaderStage.COMPUTE, buffer: { type: "read-only-storage" } },
        ]});
        // User binding-group layouts. WebGPU pipeline layouts need a
        // contiguous array up to the highest group used, so pad any gaps
        // with empty bgls (same rule as wgpuLoadCustomShader).
        const userBgls = [];
        const highest = sortedGroups.length ? sortedGroups[sortedGroups.length - 1] : 1;
        for (let g = 2; g <= highest; g++) {
            if (byGroup.has(g)) {
                const entries = byGroup.get(g).map(b => {
                    const e = { binding: b.binding, visibility: GPUShaderStage.COMPUTE };
                    if      (b.kind === 0) e.buffer  = { type: "read-only-storage" };
                    else if (b.kind === 1) e.texture = { sampleType: "float", viewDimension: "2d" };
                    else if (b.kind === 2) e.sampler = { type: "filtering" };
                    else if (b.kind === 3) e.texture = { sampleType: "float", viewDimension: "2d-array" };
                    return e;
                });
                userBgls.push(device.createBindGroupLayout({ entries }));
            } else {
                userBgls.push(device.createBindGroupLayout({ entries: [] }));
            }
        }
        pipeline = device.createComputePipeline({
-            layout: device.createPipelineLayout({ bindGroupLayouts: [headerBgl, dataBgl] }),
+            layout: device.createPipelineLayout({ bindGroupLayouts: [headerBgl, dataBgl, ...userBgls] }),
            compute: { module: mod, entryPoint: "main" },
        });
        const handle = newHandle();
-        rtPipelines.set(handle, { pipeline, headerBgl, dataBgl });
+        rtPipelines.set(handle, { pipeline, headerBgl, dataBgl, userBgls, byGroup, sortedGroups });
        return handle;
    } catch (e) {
        console.error("[crafter-wgpu] RT pipeline compile failed:", e);
@ -1784,7 +1954,8 @@ env.wgpuLoadRTPipeline = (wgslPtr, wgslLen) => {
 };
 env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
-                      tlasBufHandle, instanceCount, gx, gy) => {
+                      tlasBufHandle, instanceCount, gx, gy,
                      handlesPtr, handlesCount) => {
    if (!state.pass) return;
    const pipe = rtPipelines.get(pipelineHandle);
    const tlas = buffers.get(tlasBufHandle);
@ -1815,12 +1986,41 @@ env.wgpuDispatchRT = (pipelineHandle, pushPtr, pushBytes,
            { binding: 4, resource: { buffer: rtState.indexHeap.gpu } },
            { binding: 5, resource: { buffer: rtState.primRemapHeap.gpu } },
            { binding: 6, resource: outView },
            { binding: 7, resource: { buffer: rtState.attribsHeap.gpu } },
        ],
    });
    state.pass.setPipeline(pipe.pipeline);
    state.pass.setBindGroup(0, headerBg);
    state.pass.setBindGroup(1, dataBg);
    // User bindings: walk byGroup in the same sorted order the C++ side
    // packed handles[], picking up indices linearly.
    if (handlesCount > 0) {
        const handles = new Uint32Array(memU8().buffer, handlesPtr, handlesCount);
        let handleIdx = 0;
        let bglIdx = 0;
        for (let g = 2; g <= (pipe.sortedGroups[pipe.sortedGroups.length - 1] || 1); g++) {
            if (pipe.byGroup.has(g)) {
                const entries = pipe.byGroup.get(g).map(b => {
                    const h = handles[handleIdx++];
                    let resource;
                    if      (b.kind === 0) resource = { buffer: buffers.get(h) };
                    else if (b.kind === 1) resource = textureViews.get(h);
                    else if (b.kind === 2) resource = samplers.get(h);
                    else if (b.kind === 3) resource = textureViews.get(h);
                    return { binding: b.binding, resource };
                });
                const bg = device.createBindGroup({
                    layout: pipe.userBgls[bglIdx],
                    entries,
                });
                state.pass.setBindGroup(g, bg);
            }
            bglIdx++;
        }
    }
    state.pass.dispatchWorkgroups(gx, gy, 1);
    state.outIsPing = !state.outIsPing;
 };
--- a/examples/Sponza/README.md
+++ b/examples/Sponza/README.md
@ -0,0 +1,58 @@
 # Sponza example
 Loads the Sponza atrium as a `.cmesh` + one albedo `.ctex` and renders
 it via ray tracing on both Vulkan (native) and WebGPU (wasm). Same
 `main.cpp`, `#ifdef CRAFTER_GRAPHICS_WINDOW_DOM` selects the backend.
 ## What this example proves
 - `.cmesh` and `.ctex` decompression round-trip on both backends
  (GPU via `VK_EXT_memory_decompression` on Vulkan, CPU via
  `Compression::DecompressCPU` on WebGPU).
 - A single texture binding flowing from `Image2D<RGBA8>` through the
  RT pipeline's closest-hit on both backends. The Vulkan closest-hit
  (`closesthit.glsl`) samples at the barycentric attribs as UVs —
  proof-of-binding, not visually accurate. The WebGPU closest-hit
  (`closesthit.wgsl`) interpolates per-vertex UVs from the attribs
  heap; doing the same on Vulkan is the next step.
 ## Asset fetch
 `project.cpp` calls `Crafter::GitFetch(...)` on
 [https://github.com/jimmiebergmann/Sponza](https://github.com/jimmiebergmann/Sponza)
 (pinned to commit `222338979d32f4f4818466291bdbc29f192b86ba`). The
 clone lands in the per-user crafter-build cache; first build pulls
 ~280 MB once, subsequent builds reuse it.
 `cfg.assets` then picks two files out of that clone:
 | Source                                  | Compressed output       |
 |-----------------------------------------|-------------------------|
 | `sponza.obj`                            | `sponza.cmesh`          |
 | `textures/sponza_arch_diff.tga`         | `sponza_arch_diff.ctex` |
 Both land flat in the example's bin directory.
 ## Building
 ```
 crafter build                          # native Vulkan
 crafter build --target=wasm32-wasip1   # WebGPU / wasm
 ```
 ## License & attribution
 Sponza geometry, materials, and textures are licensed under
 [CC BY 3.0](https://creativecommons.org/licenses/by/3.0/).
 - **Original model:** Frank Meinl, Crytek (2010).
 - **OBJ packaging / cleanup:** Morgan McGuire, McGuire Computer
  Graphics Archive — https://casual-effects.com/data.
 - **GitHub mirror used here:** Jimmie Bergmann's roof-material fixup —
  https://github.com/jimmiebergmann/Sponza.
 When redistributing builds of this example that bundle the compressed
 Sponza outputs (`*.cmesh`, `*.ctex`), the CC BY 3.0 attribution
 requirement applies. Quoting the original credit somewhere visible to
 end users (about-screen, credits page, etc.) is enough.
 The Crafter.Graphics library code itself is LGPL-3.0; the two
 licenses are compatible for data + code distribution.
--- a/examples/Sponza/closesthit.glsl
+++ b/examples/Sponza/closesthit.glsl
@ -0,0 +1,23 @@
 #version 460
 #extension GL_EXT_ray_tracing : enable
 #extension GL_EXT_shader_image_load_formatted : enable
 #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
 #extension GL_EXT_descriptor_heap : enable
 #extension GL_EXT_nonuniform_qualifier : enable
 // Specialization constant: descriptor-heap slot of the albedo texture.
 // Set from descriptorHeap.bufferStartElement + the slot allocated for
 // the Image2D<RGBA8> on the host side. Sampling uses gl_HitAttributeEXT
 // barycentrics as UVs — proof-of-binding rather than UV-correct shading.
 // Per-vertex UV interpolation lands when Mesh on Vulkan exposes the
 // data-region buffer.
 layout(constant_id = 0) const uint16_t albedoSlot = 0us;
 layout(descriptor_heap) uniform sampler2D albedo[];
 hitAttributeEXT vec2 hitAttrs;
 layout(location = 0) rayPayloadInEXT vec3 hitValue;
 void main() {
    // The barycentric hit attributes are used directly as texture
    // coordinates; the sampled colour becomes the ray payload.
    vec2 uv = hitAttrs;
    vec4 texel = texture(albedo[albedoSlot], uv);
    hitValue = texel.rgb;
 }
--- a/examples/Sponza/closesthit.wgsl
+++ b/examples/Sponza/closesthit.wgsl
@ -0,0 +1,90 @@
 // Payload declared here so the WGSL assembler sees it before raygen
 // (the assembler concatenates closesthit/anyhit/miss BEFORE raygen).
 //
 // WGSL forbids cycles in the function call graph, so closesthit_main
 // CAN'T call traceRay (that would create closesthit → traceRay →
 // runClosestHit → closesthit). The lighting + shadow trace therefore
 // happens in raygen; closesthit's job is just to gather surface data
 // into the payload.
 //
 //   shadowRay = 0 (primary): closesthit fills albedo/worldPos/normal/hit.
 //   shadowRay = 1 (shadow):  closesthit is skipped (RT_FLAG_SKIP_CLOSEST_HIT),
 //                            miss flips color to white = "lit".
 struct Payload {
    // Surface albedo written by closesthit_main on a primary hit.
    color:       vec3<f32>,
    // 0 = primary ray, 1 = shadow ray (see the header comment above).
    shadowRay:   u32,
    // Hit position, computed in closesthit_main as origin + direction * t.
    worldPos:    vec3<f32>,
    // Set to 1u by closesthit_main when a surface was hit.
    hit:         u32,
    // Interpolated shading normal after the object-to-world transform.
    worldNormal: vec3<f32>,
    // Not written by closesthit_main — presumably layout padding.
    _pad:        f32,
 };
 // User-bound resources at group(2). Matches the UICustomBinding span the
 // host hands to PipelineRTWebGPU::Init.
 //   binding 0 — albedo texture_2d_array, one layer per Sponza material
 //   binding 1 — sampler (linear clamp)
 //   binding 2 — camera storage buffer (read by raygen only)
@group(2) @binding(0) var albedos : texture_2d_array<f32>;
@group(2) @binding(1) var samp    : sampler;
 // VertexNormalTangentUVPacked is `packed` on the outer struct but each
 // inner `Vector<float, N, 4>` is SIMD-aligned to a 16-byte stride. So
 // each vertex is 12 u32 words: normal at 0..2, tangent at 4..6, uv at 8..9.
 const ATTRIB_STRIDE_U32:    u32 = 12u;
 const ATTRIB_NORMAL_OFFSET: u32 = 0u;
 const ATTRIB_UV_OFFSET:     u32 = 8u;
 // Reads the two UV floats of vertex `vertexIdx` from the vertexAttribs
 // heap, starting at the mesh's attribsOffset (all offsets in u32 words).
 fn fetchUV(meshRec: MeshRecord, vertexIdx: u32) -> vec2<f32> {
    let firstWord = meshRec.attribsOffset + vertexIdx * ATTRIB_STRIDE_U32 + ATTRIB_UV_OFFSET;
    let texU = bitcast<f32>(vertexAttribs[firstWord + 0u]);
    let texV = bitcast<f32>(vertexAttribs[firstWord + 1u]);
    return vec2<f32>(texU, texV);
 }
 // Reads the three normal floats of vertex `vertexIdx` from the
 // vertexAttribs heap, starting at the mesh's attribsOffset (all offsets
 // in u32 words).
 fn fetchNormal(meshRec: MeshRecord, vertexIdx: u32) -> vec3<f32> {
    let firstWord = meshRec.attribsOffset + vertexIdx * ATTRIB_STRIDE_U32 + ATTRIB_NORMAL_OFFSET;
    let normX = bitcast<f32>(vertexAttribs[firstWord + 0u]);
    let normY = bitcast<f32>(vertexAttribs[firstWord + 1u]);
    let normZ = bitcast<f32>(vertexAttribs[firstWord + 2u]);
    return vec3<f32>(normX, normY, normZ);
 }
 // Gathers surface data for a primary hit into the payload: albedo sampled
 // at the interpolated UV, world-space hit position, world-space shading
 // normal, and the hit flag. It traces no rays itself.
 fn closesthit_main(ray: RayDesc, hit: HitInfo, payload: ptr<function, Payload>) {
    // Instance → mesh record → the hit triangle's three vertex indices.
    let hitMesh  = meshRecords[tlasEntries[hit.instanceId].blasMeshIdx];
    let triStart = hitMesh.indexOffset + hit.primitiveId * 3u;
    let idxA = indices[triStart + 0u];
    let idxB = indices[triStart + 1u];
    let idxC = indices[triStart + 2u];

    // Barycentric weights: attribs carry the weights of the 2nd and 3rd
    // vertex, the first is whatever remains.
    let weightA = 1.0 - hit.attribs.x - hit.attribs.y;
    let weightB = hit.attribs.x;
    let weightC = hit.attribs.y;

    // Interpolated texture coordinate.
    let texCoord = fetchUV(hitMesh, idxA) * weightA
                 + fetchUV(hitMesh, idxB) * weightB
                 + fetchUV(hitMesh, idxC) * weightC;
    // V is flipped (OBJ is bottom-up, the sampler top-down) and both axes
    // are wrapped with fract to tile by hand.
    let wrappedUV  = vec2<f32>(fract(texCoord.x), fract(1.0 - texCoord.y));
    let arrayLayer = i32(hit.customIndex);
    let texel      = textureSampleLevel(albedos, samp, wrappedUV, arrayLayer, 0.0).rgb;

    // Smooth shading normal in object space, then taken through the
    // object-to-world rows so rotated instances stay correct (Sponza's
    // own instances are all identity).
    let objNormal = normalize(fetchNormal(hitMesh, idxA) * weightA
                            + fetchNormal(hitMesh, idxB) * weightB
                            + fetchNormal(hitMesh, idxC) * weightC);
    let rotated = vec3<f32>(
        dot(hit.objectToWorldR0.xyz, objNormal),
        dot(hit.objectToWorldR1.xyz, objNormal),
        dot(hit.objectToWorldR2.xyz, objNormal));
    let shadingNormal = normalize(rotated);

    (*payload).color       = texel;
    (*payload).worldPos    = ray.origin + ray.direction * hit.t;
    (*payload).worldNormal = shadingNormal;
    (*payload).hit         = 1u;
 }
--- a/examples/Sponza/main.cpp
+++ b/examples/Sponza/main.cpp
@ -0,0 +1,445 @@
 // Sponza on Vulkan + WebGPU. Same example source, two backends — picked
 // by CRAFTER_GRAPHICS_WINDOW_DOM. Both paths:
 //   1. Load Sponza .cmesh geometry (positions + indices, optional
 //      per-vertex data region) and albedo .ctex textures from disk:
 //      only mesh_0 / tex_0 on Vulkan, the per-material meshes and
 //      albedos described by scene.txt on WebGPU. The source assets are
 //      fetched once by project.cpp (Crafter.Build::GitFetch) from
 //      https://github.com/jimmiebergmann/Sponza and compressed into the
 //      bin dir at build time — they don't live in this repo.
 //   2. Build BLAS + TLAS via the existing Mesh / RenderingElement3D
 //      flow. The on-disk format is identical between backends; only
 //      the decompression path differs (VK_EXT_memory_decompression
 //      on Vulkan, CPU GDeflate on WebGPU).
 //   3. Upload the albedo (a single Image2D<RGBA8> on Vulkan, one
 //      Image2DArray<RGBA8> layer per material on WebGPU), register it
 //      in the backend descriptor heap, and run the RT pipeline. The
 //      WebGPU closest-hit shader interpolates per-vertex UVs from the
 //      attribs heap. The Vulkan closest-hit shader samples the texture
 //      at the hit's barycentric coords — proof-of-binding rather than
 //      UV-correct shading; per-vertex UV interpolation there is
 //      follow-up work (the Vulkan side needs a sibling data buffer
 //      exposed off Mesh).
 //
 // Sponza model: CC BY 3.0 — Frank Meinl (Crytek), packaged by Jimmie
 // Bergmann and Morgan McGuire. https://casual-effects.com/data
 #ifndef CRAFTER_GRAPHICS_WINDOW_DOM
 #include "vulkan/vulkan.h"
 #endif
 import Crafter.Graphics;
 import Crafter.Asset;
 import Crafter.Math;
 import Crafter.Event;
 import std;
 using namespace Crafter;
 namespace fs = std::filesystem;
 namespace {
    // One 8-bit-per-channel RGBA texel; used as the pixel type parameter
    // of the image and sampler templates in this example.
    struct RGBA8 { std::uint8_t r, g, b, a; };
    // Aborts with a per-file OK/MISSING report unless both asset files
    // exist. Paths are checked exactly as given, so relative paths are
    // resolved against the current working directory.
    void RequireAssets(const fs::path& mesh, const fs::path& tex) {
        const bool haveMesh = fs::exists(mesh);
        const bool haveTex  = fs::exists(tex);
        if (haveMesh && haveTex) return;
        // project.cpp forwards the bundle through cfg.files, so that is
        // the field the message points the reader at.
        std::println(std::cerr,
            "[Sponza] missing asset(s):\n"
            "  mesh:    {} {}\n"
            "  albedo:  {} {}\n"
            "The build should have populated these via cfg.files +\n"
            "GitFetch (see examples/Sponza/project.cpp). If you ran\n"
            "the binary from outside its bin dir, cd into the bin dir\n"
            "first — asset paths are relative to cwd.",
            mesh.string(), haveMesh ? "OK" : "MISSING",
            tex.string(),  haveTex  ? "OK" : "MISSING");
        std::abort();
    }
 }
 #ifndef CRAFTER_GRAPHICS_WINDOW_DOM
 // Native Vulkan entry point: loads one mesh + one albedo, builds the RT
 // pipeline and acceleration structures, writes the per-frame descriptors
 // and hands control to the window loop.
 int main() {
    // Native Vulkan path is single-material for now (see file header) —
    // pick up just the first per-material output the build emits. The
    // WebGPU branch below uses every mesh + a texture array.
    const fs::path meshPath = "mesh_0.cmesh";
    const fs::path texPath  = "tex_0.ctex";
    RequireAssets(meshPath, texPath);
    CompressedMeshAsset    loadedMesh = LoadCompressedMesh(meshPath);
    CompressedTextureAsset loadedTex  = LoadCompressedTexture(texPath);
    std::println("[Sponza] loaded {} verts, {} idx, {}x{} albedo",
                 loadedMesh.vertexCount, loadedMesh.indexCount,
                 loadedTex.sizeX, loadedTex.sizeY);
    Device::Initialize();
    Window window(1280, 720, "Sponza");
    VkCommandBuffer cmd = window.StartInit();
    DescriptorHeapVulkan descriptorHeap;
    descriptorHeap.Initialize(/*images*/ 2, /*buffers*/ 1, /*samplers*/ 0);
    // Two specialization constants: the TLAS slot offset (shared with
    // VulkanTriangle pattern) and the albedo slot index for closesthit.
    VkSpecializationMapEntry raygenEntry = { .constantID = 0, .offset = 0, .size = sizeof(std::uint16_t) };
    VkSpecializationInfo raygenSpec = {
        .mapEntryCount = 1, .pMapEntries = &raygenEntry,
        .dataSize = sizeof(std::uint16_t), .pData = &descriptorHeap.bufferStartElement,
    };
    // Allocate the albedo slot first so its index is known when we
    // compile closesthit.spv.
    auto imgSlots    = descriptorHeap.AllocateImageSlots(2);
    auto bufSlots    = descriptorHeap.AllocateBufferSlots(1);
    std::uint16_t albedoHeapSlot = static_cast<std::uint16_t>(imgSlots.firstElement + 1);
    VkSpecializationMapEntry hitEntry = { .constantID = 0, .offset = 0, .size = sizeof(std::uint16_t) };
    VkSpecializationInfo hitSpec = {
        .mapEntryCount = 1, .pMapEntries = &hitEntry,
        .dataSize = sizeof(std::uint16_t), .pData = &albedoHeapSlot,
    };
    // Shader order matters: the group definitions below refer to these
    // entries by index (0 = raygen, 1 = miss, 2 = closest hit).
    std::array<VulkanShader, 3> shaders {{
        { "raygen.spv",     "main", VK_SHADER_STAGE_RAYGEN_BIT_KHR,      &raygenSpec },
        { "miss.spv",       "main", VK_SHADER_STAGE_MISS_BIT_KHR,        nullptr     },
        { "closesthit.spv", "main", VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR, &hitSpec    },
    }};
    ShaderBindingTableVulkan shaderTable;
    shaderTable.Init(shaders);
    std::array<VkRayTracingShaderGroupCreateInfoKHR, 1> raygenGroups {{ {
        .sType = VK_STRUCTURE_TYPE_RAY_TRACING_SHADER_GROUP_CREATE_INFO_KHR,
        .type = VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR,
        .generalShader = 0, .closestHitShader = VK_SHADER_UNUSED_KHR,
        .anyHitShader = VK_SHADER_UNUSED_KHR, .intersectionShader = VK_SHADER_UNUSED_KHR,
    } }};
    std::array<VkRayTracingShaderGroupCreateInfoKHR, 1> missGroups {{ {
        .sType = VK_STRUCTURE_TYPE_RAY_TRACING_SHADER_GROUP_CREATE_INFO_KHR,
        .type = VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR,
        .generalShader = 1, .closestHitShader = VK_SHADER_UNUSED_KHR,
        .anyHitShader = VK_SHADER_UNUSED_KHR, .intersectionShader = VK_SHADER_UNUSED_KHR,
    } }};
    std::array<VkRayTracingShaderGroupCreateInfoKHR, 1> hitGroups {{ {
        .sType = VK_STRUCTURE_TYPE_RAY_TRACING_SHADER_GROUP_CREATE_INFO_KHR,
        .type = VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR,
        .generalShader = VK_SHADER_UNUSED_KHR, .closestHitShader = 2,
        .anyHitShader = VK_SHADER_UNUSED_KHR, .intersectionShader = VK_SHADER_UNUSED_KHR,
    } }};
    PipelineRTVulkan pipeline;
    pipeline.Init(cmd, raygenGroups, missGroups, hitGroups, shaderTable);
    Mesh sponzaMesh;
    sponzaMesh.Build(loadedMesh, cmd);
    Image2D<RGBA8> albedo;
    albedo.Create(loadedTex.sizeX, loadedTex.sizeY, /*mipLevels*/ 1, cmd,
                  VK_FORMAT_R8G8B8A8_UNORM,
                  VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT,
                  VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
    albedo.Update(loadedTex, cmd, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
    SamplerVulkan<RGBA8> sampler;
    // Static storage: a pointer to this element is kept in the static
    // RenderingElement3D::elements list below.
    static RenderingElement3D renderer;
    renderer.instance = {
        .transform                              = {},
        .instanceCustomIndex                    = 0,
        .mask                                   = 0xFF,
        .instanceShaderBindingTableRecordOffset = 0,
        .flags                                  = VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR,
        .accelerationStructureReference         = sponzaMesh.blasAddr,
    };
    MatrixRowMajor<float, 4, 3, 1>::Identity()
        .Store(reinterpret_cast<float*>(renderer.instance.transform.matrix));
    RenderingElement3D::elements.emplace_back(&renderer);
    // One TLAS per in-flight frame. Driven by Window::numFrames rather
    // than a hard-coded 0/1/2 so it stays in step with the descriptor
    // loops below, which read RenderingElement3D::tlases[f] for every
    // f < Window::numFrames.
    for (std::uint32_t f = 0; f < Window::numFrames; ++f) {
        RenderingElement3D::BuildTLAS(cmd, f);
    }
    window.FinishInit();
    // Write descriptors: TLAS at bufSlots[0], output image at imgSlots[0],
    // albedo (combined image+sampler) at imgSlots[1]. Per-frame replicated.
    VkDeviceAddressRangeKHR tlasRanges[Window::numFrames];
    VkImageDescriptorInfoEXT outImgInfos[Window::numFrames];
    VkDescriptorImageInfo albedoInfo {
        .sampler = sampler.textureSampler,
        .imageView = albedo.imageView,
        .imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
    };
    for (std::uint32_t f = 0; f < Window::numFrames; ++f) {
        tlasRanges[f] = { .address = RenderingElement3D::tlases[f].address };
        outImgInfos[f] = {
            .sType = VK_STRUCTURE_TYPE_IMAGE_DESCRIPTOR_INFO_EXT,
            .pView = &window.imageViews[f],
            .layout = VK_IMAGE_LAYOUT_GENERAL,
        };
    }
    // resources[i] is written to destinations[i]; three entries per frame.
    std::vector<VkResourceDescriptorInfoEXT> resources;
    std::vector<VkHostAddressRangeEXT>       destinations;
    resources.reserve(Window::numFrames * 3);
    destinations.reserve(Window::numFrames * 3);
    for (std::uint32_t f = 0; f < Window::numFrames; ++f) {
        resources.push_back({
            .sType = VK_STRUCTURE_TYPE_RESOURCE_DESCRIPTOR_INFO_EXT,
            .type = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR,
            .data = { .pAddressRange = &tlasRanges[f] },
        });
        destinations.push_back({
            .address = descriptorHeap.resourceHeap[f].value
                     + descriptorHeap.BufferByteOffset(bufSlots.firstElement),
            .size = Device::descriptorHeapProperties.bufferDescriptorSize,
        });
        resources.push_back({
            .sType = VK_STRUCTURE_TYPE_RESOURCE_DESCRIPTOR_INFO_EXT,
            .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
            .data = { .pImage = &outImgInfos[f] },
        });
        destinations.push_back({
            .address = descriptorHeap.resourceHeap[f].value
                     + descriptorHeap.ImageByteOffset(imgSlots.firstElement),
            .size = Device::descriptorHeapProperties.imageDescriptorSize,
        });
        resources.push_back({
            .sType = VK_STRUCTURE_TYPE_RESOURCE_DESCRIPTOR_INFO_EXT,
            .type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
            .data = { .pCombinedImageSampler = &albedoInfo },
        });
        destinations.push_back({
            .address = descriptorHeap.resourceHeap[f].value
                     + descriptorHeap.ImageByteOffset(albedoHeapSlot),
            .size = Device::descriptorHeapProperties.imageDescriptorSize,
        });
    }
    Device::vkWriteResourceDescriptorsEXT(Device::device,
        static_cast<std::uint32_t>(resources.size()),
        resources.data(), destinations.data());
    for (std::uint32_t f = 0; f < Window::numFrames; ++f) {
        descriptorHeap.resourceHeap[f].FlushDevice();
    }
    window.descriptorHeap = &descriptorHeap;
    RTPass rtPass(&pipeline);
    window.passes.push_back(&rtPass);
    window.Render();
    window.StartSync();
    return 0;
 }
 #else
 // WebGPU entry point: reads the scene manifest, uploads every albedo as a
 // texture-array layer, builds one BLAS instance per textured mesh, and
 // drives a WASD + mouse-look camera through a per-frame storage buffer.
 int main() {
    // ── Read scene manifest (produced by project.cpp's BuildOBJBundle call).
    //
    //   line 1: albedoCount
    //   line 2: meshCount
    //   line 3..: per-mesh albedoIdx (-1 means "no albedo")
    const fs::path manifestPath = "scene.txt";
    if (!fs::exists(manifestPath)) {
        std::println(std::cerr,
            "[Sponza] missing scene.txt — the build should have produced "
            "it (see examples/Sponza/project.cpp). If you ran the binary "
            "from outside its bin dir, cd in first.");
        std::abort();
    }
    std::ifstream manifest(manifestPath);
    std::uint32_t albedoCount = 0, meshCount = 0;
    manifest >> albedoCount >> meshCount;
    // A failed extraction leaves zeros behind, so check the stream rather
    // than trusting the values. albedoCount must also fit the 16-bit layer
    // count handed to Image2DArray::Create below, and layer 0 is always
    // loaded, so zero albedos is not a usable scene.
    if (!manifest || albedoCount == 0
        || albedoCount > std::numeric_limits<std::uint16_t>::max()) {
        std::println(std::cerr,
            "[Sponza] scene.txt: unreadable or invalid header "
            "(albedoCount={}, meshCount={})", albedoCount, meshCount);
        std::abort();
    }
    std::vector<std::int32_t> meshAlbedo(meshCount);
    for (std::uint32_t i = 0; i < meshCount; ++i) manifest >> meshAlbedo[i];
    if (!manifest) {
        // Without this check a truncated manifest would silently map the
        // remaining meshes to albedo layer 0.
        std::println(std::cerr,
            "[Sponza] scene.txt: expected {} per-mesh albedo indices, "
            "file is truncated or malformed", meshCount);
        std::abort();
    }
    for (std::uint32_t i = 0; i < meshCount; ++i) {
        // Negative means "no albedo" (mesh is dropped later); anything at
        // or past albedoCount would address a layer that doesn't exist.
        if (meshAlbedo[i] >= static_cast<std::int32_t>(albedoCount)) {
            std::println(std::cerr,
                "[Sponza] scene.txt: mesh {} references albedo {} but only "
                "{} albedos exist", i, meshAlbedo[i], albedoCount);
            std::abort();
        }
    }
    std::println("[Sponza] scene: {} albedos, {} meshes", albedoCount, meshCount);
    Device::Initialize();
    static Window window(1280, 720, "Sponza");
    auto cmd = window.StartInit();
    DescriptorHeapWebGPU heap;
    heap.Initialize(/*images*/ 2, /*buffers*/ 2, /*samplers*/ 2);
    // Shader order matters: the groups below refer to these by index.
    std::array<WebGPUShader, 3> shaders {{
        WebGPUShader(fs::path("raygen.wgsl"),     "raygen_main",     WebGPURTStage::Raygen),
        WebGPUShader(fs::path("miss.wgsl"),       "miss_main",       WebGPURTStage::Miss),
        WebGPUShader(fs::path("closesthit.wgsl"), "closesthit_main", WebGPURTStage::ClosestHit),
    }};
    ShaderBindingTableWebGPU sbt;
    sbt.Init(shaders);
    std::array<RTShaderGroup, 1> raygenGroups {{
        { .type = RTShaderGroupType::General,           .generalShader = 0 },
    }};
    std::array<RTShaderGroup, 1> missGroups {{
        { .type = RTShaderGroupType::General,           .generalShader = 1 },
    }};
    std::array<RTShaderGroup, 1> hitGroups {{
        { .type = RTShaderGroupType::TrianglesHitGroup, .closestHitShader = 2 },
    }};
    // Three user bindings at @group(2):
    //   binding 0 — albedo texture_2d_array (one layer per material)
    //   binding 1 — sampler (linear clamp)
    //   binding 2 — Camera storage buffer (host-driven, updated per frame)
    std::array<UICustomBinding, 3> bindings {{
        { .group = 2, .binding = 0, .kind = UICustomBindingKind::SampledTextureArray, ._pad = 0, .pushOffset = 0 },
        { .group = 2, .binding = 1, .kind = UICustomBindingKind::Sampler,             ._pad = 0, .pushOffset = 0 },
        { .group = 2, .binding = 2, .kind = UICustomBindingKind::Buffer,              ._pad = 0, .pushOffset = 0 },
    }};
    PipelineRTWebGPU pipeline;
    pipeline.Init(cmd, raygenGroups, missGroups, hitGroups, sbt, bindings);
    // ── Albedo texture array — one rgba8unorm layer per material. ──────
    //
    // Probe layer 0 for the canonical layer dimensions; project.cpp
    // already resized every albedo to the same square so any tex_N.ctex
    // would do, layer 0 is just the first one we have.
    Image2DArray<RGBA8> albedoArray;
    {
        CompressedTextureAsset probe = LoadCompressedTexture("tex_0.ctex");
        albedoArray.Create(probe.sizeX, probe.sizeY, static_cast<std::uint16_t>(albedoCount));
        albedoArray.UpdateLayer(0, probe);
        for (std::uint32_t i = 1; i < albedoCount; ++i) {
            CompressedTextureAsset tex = LoadCompressedTexture(std::format("tex_{}.ctex", i));
            albedoArray.UpdateLayer(static_cast<std::uint16_t>(i), tex);
        }
    }
    auto albedoArraySlot = albedoArray.AllocateSlot(heap);
    SamplerSlot samplerSlot = AllocateLinearClampSampler(heap);
    // Camera storage buffer — host writes (origin, right, up, forward,
    // aspect, tanHalf) every frame from the input-driven free camera
    // below. Layout matches the WGSL Camera struct in raygen.wgsl
    // (vec3-aligned, std430). 64 bytes total.
    struct CameraGPU {
        float origin[3];   float pad0;
        float right[3];    float tanHalf;
        float up[3];       float aspect;
        float forward[3];  float pad1;
    };
    static_assert(sizeof(CameraGPU) == 64);
    WebGPUBuffer<CameraGPU, true> cameraBuf;
    cameraBuf.Create(1);
    // Handle array fed to RTPass — order matches the bindings declaration.
    static std::array<std::uint32_t, 3> userHandles {
        heap.imageTable  [albedoArraySlot.firstElement],
        heap.samplerTable[samplerSlot.firstElement],
        cameraBuf.handle,
    };
    // ── Meshes + scene instances ───────────────────────────────────────
    //
    // One Mesh + one RenderingElement3D per material group from
    // scene.txt. Meshes whose albedoIdx is -1 (the .obj's `usemtl` named
    // something without a map_Kd in .mtl) get dropped — they're rare in
    // Sponza and we'd have nothing to sample for them anyway.
    //
    // Vector capacity is reserved up-front: RenderingElement3D::Add
    // takes a pointer that's stored in the static elements[] vector, so
    // any later vector reallocation would dangle those pointers.
    static std::vector<Mesh> meshes;
    static std::vector<RenderingElement3D> renderers;
    meshes.reserve(meshCount);
    renderers.reserve(meshCount);
    for (std::uint32_t i = 0; i < meshCount; ++i) {
        if (meshAlbedo[i] < 0) continue;
        CompressedMeshAsset loaded = LoadCompressedMesh(std::format("mesh_{}.cmesh", i));
        meshes.emplace_back();
        meshes.back().Build(loaded, cmd);
        renderers.emplace_back();
        RenderingElement3D& r = renderers.back();
        // Identity 3x4 object-to-world transform.
        auto& tx = r.instance.transform.matrix;
        tx[0][0] = 1; tx[0][1] = 0; tx[0][2] = 0; tx[0][3] = 0;
        tx[1][0] = 0; tx[1][1] = 1; tx[1][2] = 0; tx[1][3] = 0;
        tx[2][0] = 0; tx[2][1] = 0; tx[2][2] = 1; tx[2][3] = 0;
        // 24-bit instanceCustomIndex carries the albedo array layer that
        // closesthit.wgsl reads as `hit.customIndex`.
        r.instance.instanceCustomIndex                    = static_cast<std::uint32_t>(meshAlbedo[i]);
        r.instance.mask                                   = 0xFF;
        r.instance.instanceShaderBindingTableRecordOffset = 0;
        r.instance.flags                                  = kRTGeometryInstanceForceOpaque;
        r.instance.accelerationStructureReference         = meshes.back().blasAddr;
        RenderingElement3D::Add(&r);
    }
    RenderingElement3D::BuildTLAS(cmd, 0);
    window.descriptorHeap = &heap;
    window.FinishInit();
    RTPass rtPass(&pipeline);
    rtPass.handlesPtr   = userHandles.data();
    rtPass.handlesCount = static_cast<std::uint32_t>(userHandles.size());
    window.passes.push_back(&rtPass);
    // ── Free camera: WASD + mouse-delta look ───────────────────────────
    //
    // Initial pose puts the camera near one end of the atrium at eye
    // height, looking +X down the long axis (bbox: X[-1921..1800],
    // Y[-126..1429], Z[-1183..1105]). The user can fine-tune from there.
    struct CamState {
        Vector<float, 3, 4> position{ -1500.0f, 200.0f, 0.0f };
        float yaw   = 0.0f;   // radians, around world +Y
        float pitch = 0.0f;   // radians, +pitch looks up
    } cam;
    Input::Map inputMap;
    Input::Action& moveAct = inputMap.AddAction("Move", Input::ActionType::Vector2);
    Input::Action& lookAct = inputMap.AddAction("Look", Input::ActionType::Vector2);
    moveAct.bindings = {
        Input::WASDBind{
            Key(CrafterKeys::W), Key(CrafterKeys::S),
            Key(CrafterKeys::A), Key(CrafterKeys::D),
        },
    };
    lookAct.bindings = {
        Input::MouseDeltaBind{ 1.0f },
    };
    inputMap.Attach(window);
    constexpr float kMoveSpeed = 1200.0f;  // Sponza units / second (room is ~3700 wide)
    constexpr float kLookSens  = 0.05f;   // radians per mouse pixel
    constexpr float kDt        = 1.0f / 60.0f;
    // tan(fov / 2) for a fixed 70° field of view — constant, so computed
    // once here instead of on every tick.
    const float kTanHalfFov = std::tan(70.0f * 3.14159265f / 360.0f);
    EventListener<void> camTick(&window.onBeforeUpdate, [&]() {
        inputMap.Tick();
        cam.yaw   += lookAct.vector2.x * kLookSens;
        cam.pitch -= lookAct.vector2.y * kLookSens;
        // Keep pitch just shy of straight up/down so the basis vectors
        // don't collapse (cross(forward, world_up) would go zero).
        cam.pitch = std::clamp(cam.pitch, -1.55f, 1.55f);
        const float cp = std::cos(cam.pitch), sp = std::sin(cam.pitch);
        const float cy = std::cos(cam.yaw),   sy = std::sin(cam.yaw);
        Vector<float, 3, 4> forward { cp * cy, sp, cp * sy };
        Vector<float, 3, 4> worldUp { 0.0f, 1.0f, 0.0f };
        // right = normalize(cross(forward, worldUp))
        Vector<float, 3, 4> right { forward.y * worldUp.z - forward.z * worldUp.y,
                                    forward.z * worldUp.x - forward.x * worldUp.z,
                                    forward.x * worldUp.y - forward.y * worldUp.x };
        const float rLen = std::sqrt(right.x*right.x + right.y*right.y + right.z*right.z);
        right.x /= rLen; right.y /= rLen; right.z /= rLen;
        // up = cross(right, forward)
        Vector<float, 3, 4> up { right.y * forward.z - right.z * forward.y,
                                 right.z * forward.x - right.x * forward.z,
                                 right.x * forward.y - right.y * forward.x };
        // Fixed-step integration: kDt is a constant, not a measured frame time.
        const float dx = moveAct.vector2.x * kMoveSpeed * kDt;
        const float dy = moveAct.vector2.y * kMoveSpeed * kDt;
        cam.position.x += right.x * dx + forward.x * dy;
        cam.position.y += right.y * dx + forward.y * dy;
        cam.position.z += right.z * dx + forward.z * dy;
        CameraGPU& g  = cameraBuf.value[0];
        g.origin[0]   = cam.position.x; g.origin[1]   = cam.position.y; g.origin[2]   = cam.position.z; g.pad0 = 0.0f;
        g.right[0]    = right.x;        g.right[1]    = right.y;        g.right[2]    = right.z;
        g.up[0]       = up.x;           g.up[1]       = up.y;           g.up[2]       = up.z;
        g.forward[0]  = forward.x;      g.forward[1]  = forward.y;      g.forward[2]  = forward.z;
        g.aspect      = float(window.width) / float(window.height);
        g.tanHalf     = kTanHalfFov;
        g.pad1        = 0.0f;
        cameraBuf.FlushDevice();
    });
    window.Render();
    window.StartUpdate();
    window.StartSync();
    return 0;
 }
 #endif
--- a/examples/Sponza/miss.glsl
+++ b/examples/Sponza/miss.glsl
@ -0,0 +1,11 @@
 #version 460
 #extension GL_EXT_ray_tracing : enable
 layout(location = 0) rayPayloadInEXT vec3 hitValue;
 void main() {
    // Flat warm tone chosen to match Sponza's interior lighting. This is
    // a constant, not a gradient: the ray direction is not consulted. A
    // direction-based sky would read gl_WorldRayDirectionEXT, which is
    // available to miss shaders without any extra payload field.
    const vec3 interiorTone = vec3(0.10, 0.08, 0.06);
    hitValue = interiorTone;
 }
--- a/examples/Sponza/miss.wgsl
+++ b/examples/Sponza/miss.wgsl
@ -0,0 +1,16 @@
 // Miss stage, shared by primary and shadow rays; payload.shadowRay tells
 // them apart.
 fn miss_main(ray: RayDesc, payload: ptr<function, Payload>) {
    let isShadowRay = (*payload).shadowRay == 1u;
    if (!isShadowRay) {
        // Primary miss: cheap two-stop gradient keyed on how far the ray
        // points up. (*payload).hit is left at 0 so raygen skips the
        // lighting path and uses this color directly.
        let blend     = clamp(ray.direction.y * 0.5 + 0.5, 0.0, 1.0);
        let downColor = vec3<f32>(0.45, 0.65, 0.95);
        let upColor   = vec3<f32>(0.95, 0.85, 0.65);
        (*payload).color = mix(downColor, upColor, blend);
        return;
    }
    // Shadow ray escaped without hitting anything — the sun is visible
    // from the origin. raygen reads color.x as the visibility factor.
    (*payload).color = vec3<f32>(1.0);
 }
--- a/examples/Sponza/project.cpp
+++ b/examples/Sponza/project.cpp
@ -0,0 +1,92 @@
 import std;
 import Crafter.Build;
 namespace fs = std::filesystem;
 using namespace Crafter;
 // Sponza geometry + albedo: CC BY 3.0, Frank Meinl (Crytek), packaged by
 // Jimmie Bergmann (https://github.com/jimmiebergmann/Sponza) and Morgan
 // McGuire (https://casual-effects.com/data). The full asset bundle is
 // ~280 MB — too large to live in this repo. GitFetch lands it in the
 // per-user crafter-build cache on first build and reuses thereafter.
 constexpr std::string_view kSponzaGitUrl    = "https://github.com/jimmiebergmann/Sponza.git";
 constexpr std::string_view kSponzaCommitSHA = "222338979d32f4f4818466291bdbc29f192b86ba";
 // Every albedo is normalized to this size so they can live as layers of
 // one texture_2d_array on the GPU (WebGPU array textures require
 // identical layer dimensions). 1024 matches the majority of Sponza's
 // textures; the few outliers (256×1024 chain, 512² thorn, 2048² curtains)
 // get bilinear-resized via stb_image_resize2.
 constexpr std::uint16_t    kAlbedoSize      = 1024u;
 // Build-script entry point: describes the Sponza example executable,
 // fetches and bundles the Sponza assets, and registers the bundle output
 // plus the per-backend shaders with the returned Configuration.
 extern "C" Configuration CrafterBuildProject(std::span<const std::string_view> args) {
    // A wasm build is any invocation with a `--target=` argument that
    // mentions "wasm".
    const bool isWasm = std::ranges::any_of(args, [](std::string_view arg) {
        return arg.starts_with("--target=") && arg.find("wasm") != std::string_view::npos;
    });
    // The graphics library is built from the repo root with the same args.
    std::vector<std::string> graphicsArgs(args.begin(), args.end());
    Configuration* graphics = LocalProject({
        .projectFile = "../../project.cpp",
        .args = graphicsArgs,
    });
    Configuration cfg;
    cfg.path       = "./";
    cfg.name       = "Sponza";
    cfg.outputName = "Sponza";
    cfg.type       = ConfigurationType::Executable;
    if (isWasm) {
        cfg.target = "wasm32-wasip1";
        cfg.defines.push_back({"CRAFTER_GRAPHICS_WINDOW_DOM", ""});
        cfg.compileFlags.push_back("-msimd128");
    }
    ApplyStandardArgs(cfg, args);
    cfg.dependencies = { graphics };
    std::array<fs::path, 0> ifaces = {};
    std::array<fs::path, 1> impls = { "main" };
    cfg.GetInterfacesAndImplementations(ifaces, impls);
    // Fetch Sponza into the shared crafter-build cache, then turn it into
    // a per-material bundle under build/sponza-bundle-<hash>/. The hash
    // covers (commit sha, albedo size), so changing either one yields a
    // fresh bundle directory and leaves the rest of the build tree alone.
    const fs::path sponzaRoot = GitFetch({
        .url    = std::string(kSponzaGitUrl),
        .commit = std::string(kSponzaCommitSHA),
    });
    const std::string bundleKey = std::format("{}|{}", kSponzaCommitSHA, kAlbedoSize);
    const auto bundleHash = std::hash<std::string>{}(bundleKey);
    const fs::path bundleDir = fs::path("build") / std::format("sponza-bundle-{:016x}", bundleHash);
    const auto bundleError = BuildOBJBundle(
        sponzaRoot / "sponza.obj",
        sponzaRoot / "sponza.mtl",
        bundleDir,
        kAlbedoSize);
    if (!bundleError.empty()) {
        std::println(std::cerr, "Sponza bundle error: {}", bundleError);
        std::exit(1);
    }
    // Every regular file the bundle step produced (.cmesh, .ctex,
    // scene.txt) is passed through untouched — already compressed by
    // Crafter.Asset. cfg.files copies them flat into the bin dir.
    for (const auto& entry : fs::directory_iterator(bundleDir)) {
        if (!entry.is_regular_file()) continue;
        cfg.files.push_back(entry.path());
    }
    if (!isWasm) {
        // Native build: GLSL sources go through the shader compiler.
        cfg.shaders.emplace_back(fs::path("raygen.glsl"),     std::string("main"), ShaderType::RayGen);
        cfg.shaders.emplace_back(fs::path("closesthit.glsl"), std::string("main"), ShaderType::ClosestHit);
        cfg.shaders.emplace_back(fs::path("miss.glsl"),       std::string("main"), ShaderType::Miss);
        return cfg;
    }
    // Wasm build: WGSL sources ship as plain files next to the binary.
    cfg.files.emplace_back(fs::path("raygen.wgsl"));
    cfg.files.emplace_back(fs::path("closesthit.wgsl"));
    cfg.files.emplace_back(fs::path("miss.wgsl"));
    EnableWasiBrowserRuntime(cfg);
    return cfg;
 }
--- a/examples/Sponza/raygen.glsl
+++ b/examples/Sponza/raygen.glsl
@ -0,0 +1,52 @@
 #version 460
 #extension GL_EXT_ray_tracing : enable
 #extension GL_EXT_shader_image_load_formatted : enable
 #extension GL_EXT_shader_explicit_arithmetic_types_int16 : enable
 #extension GL_EXT_descriptor_heap : enable
 #extension GL_EXT_nonuniform_qualifier : enable
 // Specialization constant set from descriptorHeap.bufferStartElement —
 // shared with closesthit.glsl. The TLAS lives at descriptor_heap slot
 // `bufferStart` (it's an SSBO-typed entry), the per-frame output image
 // at heap slot 0.
 layout(constant_id = 0) const uint16_t bufferStart = 0us;
 layout(descriptor_heap) uniform accelerationStructureEXT topLevelAS[];
 layout(descriptor_heap) uniform writeonly image2D image[];
 layout(location = 0) rayPayloadEXT vec3 hitValue;
 // Ray generation: one primary ray per launch pixel from a fixed pinhole
 // camera; whatever the hit/miss shader left in hitValue is stored as the
 // pixel color.
 void main() {
    uvec2 pixel      = gl_LaunchIDEXT.xy;
    uvec2 resolution = gl_LaunchSizeEXT.xy;
    vec2  uv         = (vec2(pixel) + 0.5) / vec2(resolution);
    vec2  ndc        = uv * 2.0 - 1.0;
    // Fixed camera looking +X down the long axis of the atrium.
    // NOTE(review): this origin was picked for a ~30-unit-wide Sponza,
    // but main.cpp lists the fetched model's bbox in the thousands of
    // units (X[-1921..1800]); at that scale the point sits near the
    // middle of the atrium — confirm the intended pose.
    vec3  origin    = vec3(-10.0, 5.0, 0.0);
    float aspect    = float(resolution.x) / float(resolution.y);
    float fov       = radians(70.0);
    float tanHalf   = tan(fov * 0.5);
    // Camera basis, same convention as the WebGPU path (main.cpp /
    // raygen.wgsl): right = cross(forward, up), i.e. +Z when looking +X.
    // The previous "rotate +Z forward to +X" form mapped screen-right to
    // -Z, which rendered the scene mirrored relative to WebGPU.
    const vec3 forward = vec3(1.0, 0.0, 0.0);
    const vec3 up      = vec3(0.0, 1.0, 0.0);
    const vec3 right   = vec3(0.0, 0.0, 1.0);
    // ndc.y runs top-down, so it is negated before applying +up.
    vec3  direction = normalize(
        right   * (ndc.x * aspect * tanHalf) +
        up      * (-ndc.y * tanHalf) +
        forward);
    traceRayEXT(
        topLevelAS[bufferStart],
        gl_RayFlagsNoneEXT,
        0xff,
        0, 0, 0,
        origin,
        0.001,
        direction,
        10000.0,
        0);
    imageStore(image[0], ivec2(pixel), vec4(hitValue, 1.0));
 }
--- a/examples/Sponza/raygen.wgsl
+++ b/examples/Sponza/raygen.wgsl
@ -0,0 +1,109 @@
 // WebGPU raygen. Camera state comes from the host every frame via a
 // storage buffer bound at @group(2) @binding(2); main.cpp drives that
 // from WASD + mouse-delta through Crafter::Input.
 //
 // The shading + shadow trace all happens here because WGSL forbids
 // recursive function call graphs — closesthit_main can't call traceRay
 // (that would loop closesthit → traceRay → runClosestHit → closesthit).
 // Raygen is the entry point and not called by anyone, so it can call
 // traceRay twice (once primary, once shadow) without forming a cycle.
 // Host-supplied camera basis, rewritten by main.cpp every frame. Field
 // order mirrors the 64-byte CameraGPU struct there: each vec3 is followed
 // by one f32 that fills out the vec3's 16-byte slot, so do not reorder.
 struct Camera {
    origin:  vec3<f32>,   // primary-ray origin (camera position)
    pad0:    f32,         // unused; host writes 0.0
    right:   vec3<f32>,   // screen-right basis vector
    tanHalf: f32,         // tan(fov / 2); host uses a 70° fov
    up:      vec3<f32>,   // screen-up basis vector
    aspect:  f32,         // surface width / height
    forward: vec3<f32>,   // view direction
    pad1:    f32,         // unused; host writes 0.0
 };
@group(2) @binding(2) var<storage, read> camera : Camera;
 // Sun coming through Sponza's open roof. Y is up; this points "down and
 // slightly along +X" so the light grazes the colonnades on one side.
 // SUN_DIR_TO_LIGHT is the direction from a surface toward the sun and is
 // not unit length; raygen_main normalizes it before use.
 const SUN_DIR_TO_LIGHT: vec3<f32> = vec3<f32>(-0.35,  1.00, -0.20);
 const SUN_COLOR:        vec3<f32> = vec3<f32>( 1.10,  1.00,  0.85);
 // Constant term added to the lighting of every hit, lit or shadowed.
 const AMBIENT_COLOR:    vec3<f32> = vec3<f32>( 0.18,  0.20,  0.28);
 // Ray generation for one pixel: primary ray, optional sun shadow ray,
 // ambient + direct shading, then tonemap and store.
 fn raygen_main(gid: vec3<u32>) {
    // Invocations past the surface edge have no pixel to write.
    if (gid.x >= hdr.surfaceW || gid.y >= hdr.surfaceH) { return; }
    let pixel      = vec2<f32>(f32(gid.x), f32(gid.y));
    let resolution = vec2<f32>(f32(hdr.surfaceW), f32(hdr.surfaceH));
    let ndc        = ((pixel + vec2<f32>(0.5)) / resolution) * 2.0 - vec2<f32>(1.0);
    // Pinhole ray from the host basis: ndc.x maps onto +right; ndc.y is
    // top-down on screen, so it is negated before scaling +up.
    let screenX   = ndc.x  * camera.aspect * camera.tanHalf;
    let screenY   = -ndc.y * camera.tanHalf;
    let direction = normalize(camera.right * screenX + camera.up * screenY + camera.forward);
    // ── Primary ray ────────────────────────────────────────────────────
    var primary: Payload;
    primary.color     = vec3<f32>(0.0);
    primary.shadowRay = 0u;
    primary.hit       = 0u;
    traceRay(
        0u, 0u, 0xFFu,
        0u, 0u, 0u,
        camera.origin, 0.001,
        direction,     10000.0,
        &primary);
    // On a miss, miss_main has already left the sky color in the payload;
    // that is the default unless the hit branch below overrides it.
    var shaded: vec3<f32> = primary.color;
    if (primary.hit == 1u) {
        // Two-sided shading: use the normal as-is when it faces the
        // viewer, otherwise flip it. Sponza's curtains have inconsistent
        // winding and would go half black without this.
        let facesViewer = dot(primary.worldNormal, direction) < 0.0;
        let surfaceN    = select(-primary.worldNormal, primary.worldNormal, facesViewer);
        let toSun       = normalize(SUN_DIR_TO_LIGHT);
        let nDotL       = max(0.0, dot(surfaceN, toSun));
        // ── Shadow ray ────────────────────────────────────────────────
        // Skipped entirely for surfaces facing away from the sun.
        var sunVisibility = 0.0;
        if (nDotL > 0.0) {
            var occlusion: Payload;
            occlusion.color     = vec3<f32>(0.0);  // stays 0 when blocked
            occlusion.shadowRay = 1u;
            occlusion.hit       = 0u;
            // Offset along the normal so the ray clears the triangle it
            // starts on; 0.5 is small against a ~3700-unit-wide atrium.
            traceRay(
                0u,
                RT_FLAG_SKIP_CLOSEST_HIT | RT_FLAG_TERMINATE_ON_FIRST_HIT,
                0xFFu,
                0u, 0u, 0u,
                primary.worldPos + surfaceN * 0.5, 0.001,
                toSun,                             10000.0,
                &occlusion);
            sunVisibility = occlusion.color.x;
        }
        shaded = primary.color * (AMBIENT_COLOR + SUN_COLOR * (nDotL * sunVisibility));
    }
    // Reinhard tonemap followed by gamma 2.2, so sun-lit albedos don't
    // clip and shadow detail stays readable.
    let tonemapped = shaded / (shaded + vec3<f32>(1.0));
    let encoded    = pow(tonemapped, vec3<f32>(1.0 / 2.2));
    textureStore(outImage,
                 vec2<i32>(i32(gid.x), i32(gid.y)),
                 vec4<f32>(encoded, 1.0));
 }
--- a/implementations/Crafter.Graphics-Mesh-WebGPU.cpp
+++ b/implementations/Crafter.Graphics-Mesh-WebGPU.cpp
@ -19,6 +19,7 @@ module Crafter.Graphics:Mesh_implWebGPU;
 import :Mesh;
 import :WebGPU;
 import Crafter.Asset;
 import Crafter.Math;
 import std;
@ -215,26 +216,59 @@ namespace {
    };
 }
 namespace {
    // Common back half of the Mesh::Build overloads (positions-only and
    // compressed-asset). Records the triangle count on `mesh`, builds a BVH
    // over the supplied geometry, and registers the result with the JS
    // bridge; the handle the bridge returns is stored in `mesh.blasAddr`.
    //
    //   vertices     - vertex positions handed to the BVH builder and bridge
    //   indices      - triangle index list (three entries per triangle)
    //   attribsBytes - optional per-vertex attribute payload; empty for
    //                  positions-only meshes, in which case the JS bridge
    //                  skips the attribs-heap append.
    void BuildBVHAndRegister(Mesh& mesh,
                             std::span<const Vector<float, 3, 3>> vertices,
                             std::span<const std::uint32_t>       indices,
                             std::span<const std::byte>           attribsBytes) {
        // The bridge takes every element count as a signed 32-bit integer.
        const auto count32 = [](std::size_t n) {
            return static_cast<std::int32_t>(n);
        };

        mesh.triangleCount = static_cast<std::uint32_t>(indices.size()) / 3;
        const std::uint32_t triCount = mesh.triangleCount;

        Builder bvh;
        bvh.Build(vertices, indices);

        // Copy each built primitive's triIndex into a flat u32 table, one
        // entry per triangle, in the builder's primitive order.
        std::vector<std::uint32_t> remapTable;
        remapTable.reserve(triCount);
        for (std::uint32_t tri = 0; tri != triCount; ++tri) {
            remapTable.push_back(bvh.prims[tri].triIndex);
        }

        // Node 0 supplies the bounds that are registered for the whole mesh.
        const BVHNode& rootNode = bvh.nodes[0];
        const std::uint32_t blasHandle = WebGPU::wgpuRegisterMeshBLAS(
            rootNode.aabbMin[0], rootNode.aabbMin[1], rootNode.aabbMin[2],
            rootNode.aabbMax[0], rootNode.aabbMax[1], rootNode.aabbMax[2],
            vertices.data(),     count32(vertices.size()),
            indices.data(),      count32(indices.size()),
            bvh.nodes.data(),    count32(bvh.nodes.size()),
            remapTable.data(),   count32(remapTable.size()),
            attribsBytes.data(), count32(attribsBytes.size()));
        mesh.blasAddr = blasHandle;
    }
 }
 void Mesh::Build(std::span<Vector<float, 3, 3>> vertices,
                 std::span<std::uint32_t>       indices,
                 WebGPUCommandEncoderRef        /*cmd*/) {
-    triangleCount = static_cast<std::uint32_t>(indices.size()) / 3;
+    BuildBVHAndRegister(*this, vertices, indices, {});
 }
-    Builder builder;
+void Mesh::Build(const CompressedMeshAsset& asset,
-    builder.Build(vertices, indices);
+                 WebGPUCommandEncoderRef    /*cmd*/) {
    std::vector<Vector<float, 3, 3>> vertices(asset.vertexCount);
    std::vector<std::uint32_t>       indices(asset.indexCount);
    std::vector<std::byte>           dataBytes(
        static_cast<std::size_t>(asset.dataCount) * asset.dataStride);
-    std::vector<std::uint32_t> primRemap(triangleCount);
+    // CompressedBlob always carries 3 regions for MeshAsset (the data region
-    for (std::uint32_t i = 0; i < triangleCount; ++i) {
+    // can have decompressedSize=0). DecompressCPU validates output sizes
-        primRemap[i] = builder.prims[i].triIndex;
+    // against region sizes, so the empty-data path needs the empty span.
-    }
+    std::array<std::span<std::byte>, 3> outputs = {
        std::as_writable_bytes(std::span(vertices)),
        std::as_writable_bytes(std::span(indices)),
        std::span<std::byte>(dataBytes),
    };
    Compression::DecompressCPU(asset.blob,
        std::span(outputs).first(asset.blob.regions.size()));
-    const BVHNode& root = builder.nodes[0];
+    BuildBVHAndRegister(*this, vertices, indices, std::span(dataBytes));
    std::uint32_t h = WebGPU::wgpuRegisterMeshBLAS(
        root.aabbMin[0], root.aabbMin[1], root.aabbMin[2],
        root.aabbMax[0], root.aabbMax[1], root.aabbMax[2],
        vertices.data(),       static_cast<std::int32_t>(vertices.size()),
        indices.data(),        static_cast<std::int32_t>(indices.size()),
        builder.nodes.data(),  static_cast<std::int32_t>(builder.nodes.size()),
        primRemap.data(),      static_cast<std::int32_t>(primRemap.size()));
    blasAddr = h;
 }
--- a/implementations/Crafter.Graphics-PipelineRTWebGPU.cpp
+++ b/implementations/Crafter.Graphics-PipelineRTWebGPU.cpp
@ -22,6 +22,7 @@ module Crafter.Graphics:PipelineRTWebGPU_impl;
 import :PipelineRTWebGPU;
 import :ShaderBindingTableWebGPU;
 import :WebGPUComputeShader;
 import :RT;
 import :WebGPU;
 import std;
@ -65,7 +66,9 @@ void PipelineRTWebGPU::Init(WebGPUCommandEncoderRef                 /*cmd*/,
                            std::span<const RTShaderGroup>          raygenGroups,
                            std::span<const RTShaderGroup>          missGroups,
                            std::span<const RTShaderGroup>          hitGroups,
-                            const ShaderBindingTableWebGPU&         sbt) {
+                            const ShaderBindingTableWebGPU&         sbt,
                            std::span<const UICustomBinding>        bindings) {
    userBindings.assign(bindings.begin(), bindings.end());
    std::string wgsl;
    wgsl.reserve(8 * 1024);
@ -183,5 +186,7 @@ void PipelineRTWebGPU::Init(WebGPUCommandEncoderRef                 /*cmd*/,
    pipelineHandle = WebGPU::wgpuLoadRTPipeline(
        wgsl.data(),
-        static_cast<std::int32_t>(wgsl.size()));
+        static_cast<std::int32_t>(wgsl.size()),
        userBindings.empty() ? nullptr : userBindings.data(),
        static_cast<std::int32_t>(userBindings.size()));
 }
--- a/interfaces/Crafter.Graphics-DescriptorHeapWebGPU.cppm
+++ b/interfaces/Crafter.Graphics-DescriptorHeapWebGPU.cppm
@ -181,5 +181,15 @@ export namespace Crafter {
        }
        return *this;
    }
    // Convenience helper: reserves a single sampler slot in `heap`, fills it
    // with the "standard" linear-filter clamp-to-edge sampler obtained from
    // the wgpu* bridge, and hands back a SamplerSlot referring to that slot.
    // The bridge call deliberately stays inside the library so example code
    // never has to reach into Crafter::WebGPU directly.
    inline SamplerSlot AllocateLinearClampSampler(DescriptorHeapWebGPU& heap) {
        const auto slotIndex = heap.AllocateSamplerSlots(1).firstElement;
        heap.samplerTable[slotIndex] = WebGPU::wgpuCreateLinearClampSampler();
        return SamplerSlot(&heap, slotIndex);
    }
 }
 #endif // CRAFTER_GRAPHICS_WINDOW_DOM
--- a/interfaces/Crafter.Graphics-Image2D.cppm
+++ b/interfaces/Crafter.Graphics-Image2D.cppm
@ -0,0 +1,166 @@
 /*
 Crafter®.Graphics
 Copyright (C) 2026 Catcrafts®
 catcrafts.net
 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License version 3.0 as published by the Free Software Foundation;
 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.
 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
 */
 // Image2D<T> — portable 2D image type whose API surface is intentionally
 // backend-specific via #ifdef. On Vulkan it aliases the existing
 // ImageVulkan<T> (full VkFormat / usage / layout control). On WebGPU it's
 // a thin handle around an rgba8unorm GPUTexture; sizes are u16 and the
 // only update path is from a CompressedTextureAsset.
 //
 // The "no shared no-op signatures" principle is deliberate: callers do
 // the same #ifdef the library does, and write the backend-specific
 // invocation. The unified type name Image2D<T> is the only thing
 // portable between the two — that's the whole point.
 export module Crafter.Graphics:Image2D;
 #ifndef CRAFTER_GRAPHICS_WINDOW_DOM
 import :ImageVulkan;
 export namespace Crafter {
    // Vulkan target: Image2D is just the existing ImageVulkan. New name,
    // same shape — keeps existing ImageVulkan callers (e.g. examples/
    // Decompression) working without a churn-rename.
    template <typename PixelType>
    using Image2D = ImageVulkan<PixelType>;
 }
 #endif // !CRAFTER_GRAPHICS_WINDOW_DOM
 #ifdef CRAFTER_GRAPHICS_WINDOW_DOM
 import std;
 import Crafter.Asset;
 import :DescriptorHeapWebGPU;
 import :WebGPU;
 export namespace Crafter {
    // WebGPU flavour of Image2D: a thin handle around a texture created
    // through the wgpu* bridge, plus the dimensions it was created with.
    // PixelType is the CPU-side texel type; its size is compared against
    // the asset's pixel stride before any upload.
    template <typename PixelType>
    class Image2D {
    public:
        WebGPUTextureRef handle = 0;   // 0 means "no texture" (see Destroy)
        std::uint16_t width  = 0;
        std::uint16_t height = 0;

        // Remember the requested dimensions and ask the bridge for a new
        // w x h texture.
        void Create(std::uint16_t w, std::uint16_t h) {
            width  = w;
            height = h;
            handle = WebGPU::wgpuCreateImage2D(w, h);
        }

        // Decompress the .ctex blob on the CPU (WebGPU has no GPU
        // decompression path) and push the whole image through
        // wgpuWriteImage2D. The scratch pixel buffer only has to outlive
        // this call, because queue.writeTexture on the JS side takes its
        // own copy. Aborts when the asset's pixel stride differs from
        // sizeof(PixelType).
        void Update(const CompressedTextureAsset& asset) {
            const bool strideMatches = (asset.pixelStride == sizeof(PixelType));
            if (!strideMatches) {
                std::println(std::cerr,
                    "Image2D::Update: pixel stride mismatch (got {}, expected {})",
                    asset.pixelStride, sizeof(PixelType));
                std::abort();
            }

            const std::size_t texelCount =
                static_cast<std::size_t>(asset.sizeX) * asset.sizeY;
            std::vector<PixelType> decoded(texelCount);

            // The texture blob decompresses into a single output region.
            std::array<std::span<std::byte>, 1> regions{
                std::as_writable_bytes(std::span(decoded)),
            };
            Compression::DecompressCPU(asset.blob, regions);

            const auto byteCount =
                static_cast<std::int32_t>(decoded.size() * sizeof(PixelType));
            WebGPU::wgpuWriteImage2D(handle, decoded.data(), byteCount,
                                     asset.sizeX, asset.sizeY);
        }

        // Publish the texture in one freshly allocated image slot of `heap`
        // so a custom RT pipeline can bind it via
        // UICustomBinding::SampledTexture.
        ImageSlot AllocateSlot(DescriptorHeapWebGPU& heap) {
            DescriptorRange range = heap.AllocateImageSlots(1);
            const auto slotIndex = range.firstElement;
            heap.imageTable[slotIndex] = handle;
            return ImageSlot(&heap, slotIndex);
        }

        // Release the texture if one is held; safe to call repeatedly.
        void Destroy() {
            if (handle == 0) {
                return;
            }
            WebGPU::wgpuDestroyTexture(handle);
            handle = 0;
        }
    };
    // 2D texture array — `layers` × (w × h) rgba8unorm. Each layer is
    // populated independently from a CompressedTextureAsset whose dims
    // must match the array's (w × h). Layer 0 is sampled at array
    // index 0 in WGSL; bind through UICustomBindingKind::SampledTextureArray.
    template <typename PixelType>
    class Image2DArray {
    public:
        WebGPUTextureRef handle = 0;   // 0 means "no texture" (see Destroy)
        std::uint16_t width  = 0;
        std::uint16_t height = 0;
        std::uint16_t layers = 0;      // layer count passed to Create

        // Record the requested shape and ask the bridge for a texture array
        // of `layerCount` layers, each w x h.
        void Create(std::uint16_t w, std::uint16_t h, std::uint16_t layerCount) {
            width  = w;
            height = h;
            layers = layerCount;
            handle = WebGPU::wgpuCreateImage2DArray(w, h, layerCount);
        }

        // Decompress `tex` and upload to `layer`. The asset's dims must
        // match the array's (w × h) — resize beforehand on the host with
        // TextureAsset<RGBA8>::Resize() if they don't.
        //
        // Aborts (after printing a diagnostic to stderr) when:
        //   - `layer` is not below `layers` — this also catches an array
        //     that was never Create()d, since `layers` is then 0;
        //   - the asset's pixel stride differs from sizeof(PixelType);
        //   - the asset's dimensions differ from the array's.
        void UpdateLayer(std::uint16_t layer, const CompressedTextureAsset& tex) {
            // Reject out-of-range layers here with a clear message instead
            // of handing an invalid layer index to the bridge.
            if (layer >= layers) {
                std::println(std::cerr,
                    "Image2DArray::UpdateLayer: layer {} out of range (array has {} layers)",
                    layer, layers);
                std::abort();
            }
            if (tex.pixelStride != sizeof(PixelType)) {
                std::println(std::cerr,
                    "Image2DArray::UpdateLayer: pixel stride mismatch (got {}, expected {})",
                    tex.pixelStride, sizeof(PixelType));
                std::abort();
            }
            if (tex.sizeX != width || tex.sizeY != height) {
                std::println(std::cerr,
                    "Image2DArray::UpdateLayer: layer {} dims {}x{} don't match array dims {}x{}",
                    layer, tex.sizeX, tex.sizeY, width, height);
                std::abort();
            }
            // Scratch buffer for one decompressed layer; it only needs to
            // live until the bridge call below returns.
            std::vector<PixelType> pixels(static_cast<std::size_t>(width) * height);
            std::array<std::span<std::byte>, 1> outputs = {
                std::as_writable_bytes(std::span(pixels)),
            };
            Compression::DecompressCPU(tex.blob, outputs);
            WebGPU::wgpuWriteImage2DLayer(
                handle, layer,
                pixels.data(),
                static_cast<std::int32_t>(pixels.size() * sizeof(PixelType)),
                width, height);
        }

        // Publish the texture array in one freshly allocated image slot of
        // `heap` and return an ImageSlot referring to it.
        ImageSlot AllocateSlot(DescriptorHeapWebGPU& heap) {
            DescriptorRange r = heap.AllocateImageSlots(1);
            heap.imageTable[r.firstElement] = handle;
            return ImageSlot(&heap, r.firstElement);
        }

        // Release the texture if one is held; safe to call repeatedly.
        void Destroy() {
            if (handle != 0) {
                WebGPU::wgpuDestroyTexture(handle);
                handle = 0;
            }
        }
    };
 }
 #endif // CRAFTER_GRAPHICS_WINDOW_DOM
--- a/interfaces/Crafter.Graphics-Mesh.cppm
+++ b/interfaces/Crafter.Graphics-Mesh.cppm
@ -64,6 +64,7 @@ export namespace Crafter {
 #ifdef CRAFTER_GRAPHICS_WINDOW_DOM
 import std;
 import Crafter.Math;
 import Crafter.Asset;
 import :WebGPU;
 export namespace Crafter {
@ -108,6 +109,15 @@ export namespace Crafter {
        void Build(std::span<Crafter::Vector<float, 3, 3>> vertices,
                   std::span<std::uint32_t>                indices,
                   WebGPUCommandEncoderRef                  cmd = 0);
        // CPU-decompress the .cmesh blob (no VK_EXT_memory_decompression
        // equivalent in WebGPU) and forward to the positions+indices path,
        // plus push the optional `data` region into the per-vertex attribs
        // heap so closest-hit shaders can sample UVs / normals / tangents.
        // The data layout is example-defined — the heap is exposed in WGSL
        // as `vertexAttribs : array<u32>` with a per-mesh u32-word offset.
        void Build(const ::Crafter::CompressedMeshAsset& asset,
                   WebGPUCommandEncoderRef                cmd = 0);
    };
 }
 #endif // CRAFTER_GRAPHICS_WINDOW_DOM
--- a/interfaces/Crafter.Graphics-PipelineRTWebGPU.cppm
+++ b/interfaces/Crafter.Graphics-PipelineRTWebGPU.cppm
@ -26,22 +26,31 @@ import std;
 import :RT;
 import :WebGPU;
 import :ShaderBindingTableWebGPU;
 import :WebGPUComputeShader;
 export namespace Crafter {
    class PipelineRTWebGPU {
    public:
        std::uint32_t pipelineHandle = 0;
        // Mirror of the bindings handed to Init. Kept for the example /
        // RTPass to consult when packing the handles[] array at dispatch
        // time (one resolved u32 handle per binding, in declaration order).
        std::vector<UICustomBinding> userBindings;
        // Build the megakernel pipeline. Groups carry indices into
        // `sbt.shaders`. The library generates one `case` per registered
        // group: closest-hit groups dispatch to their closestHitShader's
        // entryFn, miss groups to their generalShader's entryFn, etc.
        // The `cmd` parameter is unused on WebGPU; kept for API symmetry.
        // `userBindings` declares extra @group(2)+ resources the user's
        // closest-hit / miss / raygen WGSL touches (material SSBOs,
        // albedo textures, samplers).
        void Init(WebGPUCommandEncoderRef                 cmd,
                  std::span<const RTShaderGroup>          raygenGroups,
                  std::span<const RTShaderGroup>          missGroups,
                  std::span<const RTShaderGroup>          hitGroups,
-                  const ShaderBindingTableWebGPU&         sbt);
+                  const ShaderBindingTableWebGPU&         sbt,
                  std::span<const UICustomBinding>        bindings = {});
        PipelineRTWebGPU() = default;
        PipelineRTWebGPU(const PipelineRTWebGPU&) = delete;
--- a/interfaces/Crafter.Graphics-RTPass.cppm
+++ b/interfaces/Crafter.Graphics-RTPass.cppm
@ -66,6 +66,12 @@ export namespace Crafter {
        // RTDispatchHeader. Null means "no extra data".
        const void*       pushPtr   = nullptr;
        std::uint32_t     pushBytes = 0;
        // Resolved WebGPU resource handles for each user binding the
        // pipeline was loaded with, in declaration order. The example
        // owns the storage (typically a small std::array of u32). Null /
        // 0 means "no user bindings".
        const void*       handlesPtr   = nullptr;
        std::uint32_t     handlesCount = 0;
        RTPass(PipelineRTWebGPU* p) : pipeline(p) {}
@ -80,7 +86,9 @@ export namespace Crafter {
                tlas.buffer.handle,
                static_cast<std::int32_t>(tlas.builtInstanceCount),
                static_cast<std::int32_t>(gx),
-                static_cast<std::int32_t>(gy));
+                static_cast<std::int32_t>(gy),
                handlesPtr,
                static_cast<std::int32_t>(handlesCount));
        }
    };
 }
--- a/interfaces/Crafter.Graphics-WebGPU.cppm
+++ b/interfaces/Crafter.Graphics-WebGPU.cppm
@ -49,6 +49,27 @@ namespace Crafter::WebGPU {
    __attribute__((import_module("env"), import_name("wgpuDestroyTexture")))
    extern "C" void wgpuDestroyTexture(std::uint32_t handle);
    // General-purpose rgba8unorm 2D texture for material albedo etc.
    // Separate from the atlas path because atlas uses r8unorm + sub-region
    // writes; this one takes the whole image in one shot.
    //
    // wgpuCreateImage2D: creates a w x h texture on the JS side and returns
    // the handle it is registered under. Image2D::Destroy releases such
    // handles through wgpuDestroyTexture.
    __attribute__((import_module("env"), import_name("wgpuCreateImage2D")))
    extern "C" std::uint32_t wgpuCreateImage2D(std::int32_t w, std::int32_t h);
    // wgpuWriteImage2D: uploads pixel data into the texture behind `handle`.
    //   srcPtr   - start of the caller's pixel buffer
    //   byteSize - size of that buffer in bytes (Image2D::Update passes
    //              pixel count * sizeof(PixelType))
    //   w, h     - dimensions of the image being written, in pixels
    __attribute__((import_module("env"), import_name("wgpuWriteImage2D")))
    extern "C" void wgpuWriteImage2D(std::uint32_t handle, const void* srcPtr,
                                     std::int32_t byteSize,
                                     std::int32_t w, std::int32_t h);
    // 2D texture array — `layerCount` rgba8unorm layers of identical (w × h).
    // Sampled via `texture_2d_array<f32>` in WGSL (UICustomBindingKind 3).
    // Used by Image2DArray<RGBA8> to stack per-material albedos for one
    // multi-material scene.
    //
    // wgpuCreateImage2DArray: creates the array texture on the JS side and
    // returns the handle it is registered under. Image2DArray::Destroy
    // releases such handles through wgpuDestroyTexture.
    __attribute__((import_module("env"), import_name("wgpuCreateImage2DArray")))
    extern "C" std::uint32_t wgpuCreateImage2DArray(std::int32_t w, std::int32_t h, std::int32_t layerCount);
    // wgpuWriteImage2DLayer: uploads one layer's pixel data into the array
    // texture behind `handle`.
    //   layer    - index of the layer to write
    //   srcPtr   - start of the caller's pixel buffer for that layer
    //   byteSize - size of that buffer in bytes (Image2DArray::UpdateLayer
    //              passes width * height * sizeof(PixelType))
    //   w, h     - layer dimensions in pixels (UpdateLayer passes the
    //              array's own width / height)
    __attribute__((import_module("env"), import_name("wgpuWriteImage2DLayer")))
    extern "C" void wgpuWriteImage2DLayer(std::uint32_t handle, std::int32_t layer,
                                          const void* srcPtr, std::int32_t byteSize,
                                          std::int32_t w, std::int32_t h);
    __attribute__((import_module("env"), import_name("wgpuCreateLinearClampSampler")))
    extern "C" std::uint32_t wgpuCreateLinearClampSampler();
@ -96,6 +117,11 @@ namespace Crafter::WebGPU {
    // stores in RTInstance::accelerationStructureReference; the WebGPU
    // TLAS-build compute shader resolves it back to root AABB + heap
    // offsets at dispatch time. Returns 0 on failure.
    // The optional `attribsPtr` / `attribsByteCount` carry per-vertex
    // attribute payload (normals, UVs, etc. — layout is example-defined)
    // that gets appended to a global attribs heap and exposed to RT
    // closest-hit shaders as `vertexAttribs : array<u32>` at
    // @group(1) @binding(7). Pass (nullptr, 0) for positions-only meshes.
    __attribute__((import_module("env"), import_name("wgpuRegisterMeshBLAS")))
    extern "C" std::uint32_t wgpuRegisterMeshBLAS(
        float minX, float minY, float minZ,
@ -103,25 +129,34 @@ namespace Crafter::WebGPU {
        const void* verticesPtr, std::int32_t vertexCount,
        const void* indicesPtr,  std::int32_t indexCount,
        const void* bvhNodesPtr, std::int32_t bvhNodeCount,
-        const void* primRemapPtr, std::int32_t primRemapCount);
+        const void* primRemapPtr, std::int32_t primRemapCount,
        const void* attribsPtr,  std::int32_t attribsByteCount);
    // RT pipeline build. The library composes WGSL by concatenating the
    // traversal library, generated hit-group switches, and the user-
-    // supplied raygen / miss / closesthit / anyhit bodies. Returns an
+    // supplied raygen / miss / closesthit / anyhit bodies. `bindings` is
-    // opaque pipeline handle.
+    // UICustomBinding-shaped (8 bytes each) declaring extra @group(2)+
    // resources the user's closest-hit / miss / raygen WGSL references.
    // Pass (nullptr, 0) for a pipeline with no user-declared bindings.
    // Returns an opaque pipeline handle.
    __attribute__((import_module("env"), import_name("wgpuLoadRTPipeline")))
-    extern "C" std::uint32_t wgpuLoadRTPipeline(const void* wgslPtr, std::int32_t wgslLen);
+    extern "C" std::uint32_t wgpuLoadRTPipeline(const void* wgslPtr, std::int32_t wgslLen,
                                                const void* bindingsPtr, std::int32_t bindingsCount);
    // Dispatch a TraceRays-equivalent pass: the RT pipeline is dispatched
    // over a (gx, gy) tile grid; the library writes the push data (camera,
    // payload, etc. — opaque) into a uniform ring buffer, attaches the TLAS
    // + global mesh heap, and runs one workgroup per 8x8 screen tile.
    // `handles[]` carries resolved WebGPU resource handles for every user
    // binding declared at pipeline-load time, in the same order. Pass
    // (nullptr, 0) for a pipeline with no user bindings.
    __attribute__((import_module("env"), import_name("wgpuDispatchRT")))
    extern "C" void wgpuDispatchRT(std::uint32_t pipelineHandle,
                                   const void* pushPtr, std::int32_t pushBytes,
                                   std::uint32_t tlasBufHandle,
                                   std::int32_t  instanceCount,
-                                   std::int32_t  gx, std::int32_t gy);
+                                   std::int32_t  gx, std::int32_t gy,
                                   const void* handlesPtr, std::int32_t handlesCount);
    // GPU TLAS-build dispatch. Reads the instance buffer (host-uploaded or
    // GPU-written), produces per-instance world-space AABBs + per-instance
--- a/interfaces/Crafter.Graphics-WebGPUComputeShader.cppm
+++ b/interfaces/Crafter.Graphics-WebGPUComputeShader.cppm
@ -32,9 +32,10 @@ import :WebGPU;
 export namespace Crafter {
    enum class UICustomBindingKind : std::uint8_t {
-        Buffer         = 0,   // read-only-storage SSBO, handle is a slot into heap.bufferTable
+        Buffer              = 0,   // read-only-storage SSBO, handle is a slot into heap.bufferTable
-        SampledTexture = 1,   // sampled texture_2d<f32>, handle is a slot into heap.imageTable
+        SampledTexture      = 1,   // sampled texture_2d<f32>, handle is a slot into heap.imageTable
-        Sampler        = 2,   // filtering sampler, handle is a slot into heap.samplerTable
+        Sampler             = 2,   // filtering sampler, handle is a slot into heap.samplerTable
        SampledTextureArray = 3,   // sampled texture_2d_array<f32>, handle is a slot into heap.imageTable
    };
    struct UICustomBinding {
--- a/interfaces/Crafter.Graphics.cppm
+++ b/interfaces/Crafter.Graphics.cppm
@ -47,6 +47,7 @@ export import :ShaderBindingTableVulkan;
 export import :PipelineRTVulkan;
 export import :RenderingElement3D;
 export import :ImageVulkan;
 export import :Image2D;
 export import :SamplerVulkan;
 export import :DescriptorHeapVulkan;
 export import :RenderPass;
--- a/project.cpp
+++ b/project.cpp
@ -31,23 +31,9 @@ extern "C" Configuration CrafterBuildProject(std::span<const std::string_view> a
        });
    };
    // Sniff the requested target from args before any deps resolve — the
    // Crafter.Asset dependency is heavy and not wasm-ready (uses `throw`
    // under -fno-exceptions, references `_Float16`). The DOM build stubs
    // the renderer entirely so the dep doesn't apply anyway.
    bool isWasm = false;
    for (std::string_view a : args) {
        if (a.starts_with("--target=") && a.find("wasm") != std::string_view::npos) {
            isWasm = true;
            break;
        }
    }
    Configuration* event = resolveDep("Crafter.Event", "https://forgejo.catcrafts.net/Catcrafts/Crafter.Event.git");
    Configuration* math  = resolveDep("Crafter.Math",  "https://forgejo.catcrafts.net/Catcrafts/Crafter.Math.git");
-    Configuration* asset = isWasm
+    Configuration* asset = resolveDep("Crafter.Asset", "https://forgejo.catcrafts.net/Catcrafts/Crafter.Asset.git");
        ? nullptr
        : resolveDep("Crafter.Asset", "https://forgejo.catcrafts.net/Catcrafts/Crafter.Asset.git");
    Configuration cfg;
    cfg.path = "./";
@ -55,11 +41,7 @@ extern "C" Configuration CrafterBuildProject(std::span<const std::string_view> a
    cfg.outputName = "Crafter.Graphics";
    cfg.type = ConfigurationType::LibraryStatic;
    auto opts = ApplyStandardArgs(cfg, args);
-    if (asset) {
+    cfg.dependencies = { event, math, asset };
        cfg.dependencies = { event, math, asset };
    } else {
        cfg.dependencies = { event, math };
    }
    // Window backend follows the target triple. V1 had separate lib-wayland /
    // lib-win32 configurations; V2 picks the right one automatically based on
@ -78,6 +60,16 @@ extern "C" Configuration CrafterBuildProject(std::span<const std::string_view> a
        // strips -march/-mtune from the clang command line for any wasm32-*
        // triple, so cfg.march/mtune can stay at their defaults — keeping them
        // matches the VariantId of dependency PCMs.
        //
        // WasmAlloc / WasmFree live in Crafter.Graphics-Dom.cpp and back
        // dom-env.js's __writeUtf8 path (every keyboard / text-input event
        // routes through them). The TU defines no symbols main.cpp would
        // reference, so wasm-ld dead-strips it from libCrafter.Graphics.a
        // for examples that don't touch the `Dom::HtmlElement*` API (like
        // Sponza). `--export=` both forces the export AND pulls the
        // defining .o in — solving both halves of the dead-strip problem.
        cfg.linkFlags.push_back("-Wl,--export=WasmAlloc");
        cfg.linkFlags.push_back("-Wl,--export=WasmFree");
    } else if (windows) {
        cfg.defines.push_back({"CRAFTER_GRAPHICS_WINDOW_WIN32", ""});
        cfg.linkFlags.push_back("-lkernel32");
@ -131,7 +123,7 @@ extern "C" Configuration CrafterBuildProject(std::span<const std::string_view> a
    // when its body is gated out. Vulkan-typed partitions stub to empty
    // modules under CRAFTER_GRAPHICS_WINDOW_DOM; the Dom/DomEvents/Router
    // partitions stub to empty modules in the opposite direction.
-    std::array<fs::path, 40> ifaces = {
+    std::array<fs::path, 41> ifaces = {
        "interfaces/Crafter.Graphics",
        "interfaces/Crafter.Graphics-Animation",
        "interfaces/Crafter.Graphics-Clipboard",
@ -147,6 +139,7 @@ extern "C" Configuration CrafterBuildProject(std::span<const std::string_view> a
        "interfaces/Crafter.Graphics-ForwardDeclarations",
        "interfaces/Crafter.Graphics-Gamepad",
        "interfaces/Crafter.Graphics-GraphicsTypes",
        "interfaces/Crafter.Graphics-Image2D",
        "interfaces/Crafter.Graphics-ImageVulkan",
        "interfaces/Crafter.Graphics-Input",
        "interfaces/Crafter.Graphics-InputField",