webgpu improvements

2026-05-24 13:32:08 +02:00 · 2026-05-24 13:32:08 +02:00 · 8347467e1e
commit 8347467e1e
parent 5a75571ffd
18 changed files with 1932 additions and 153 deletions
--- a/interfaces/Crafter.Graphics-RenderingElement3D.cppm
+++ b/interfaces/Crafter.Graphics-RenderingElement3D.cppm
@ -121,6 +121,37 @@ export namespace Crafter {
        // customIndex (4) + _pad (12). Defined in the WGSL traversal
        // library; never directly read by C++.
        WebGPUBuffer<char, false>           buffer;
+        // GPU LBVH support — see additional/dom-webgpu.js's TLAS-build
+        // pipeline.
+        //
+        // entryOrder: per-frame permutation array of u32, indexing into
+        // `buffer` (the TLASEntry[] array). Populated by the radix-sort
+        // pass to spatially-coherent Morton order, then consumed by the
+        // BVH construction + traversal passes. In Stage 1 (this
+        // baseline) it's the identity permutation written by
+        // tlasBuildMain alongside the entries.
+        WebGPUBuffer<char, false>           entryOrder;
+        // mortonCodes: per-instance 32-bit Morton codes computed from the
+        // world-AABB centroid, used as the radix-sort key. Written by
+        // tlasBuildMain.
+        WebGPUBuffer<char, false>           mortonCodes;
+        // bvhNodes: 2*N_PADDED - 1 sweep-tree BVH nodes built per frame
+        // by the LBVH-build compute pass. Each node is 32 bytes (aabbMin
+        // + pad, aabbMax + pad). N_PADDED = 65536 (hardcoded in WGSL).
+        // Internal nodes [0, N_PADDED-1); leaves [N_PADDED-1, 2*N_PADDED-1).
+        // Node i's children are 2i+1, 2i+2 (implicit perfect binary
+        // tree). Cap: 65536 instances per scene.
+        WebGPUBuffer<char, false>           bvhNodes;
+        // tlasBins: dead, kept allocated as a 64-byte placeholder so the
+        // existing wgpuBuildTLAS C++ signature doesn't have to change.
+        // The pre-LBVH 64-bin partition was replaced by the full BVH.
+        WebGPUBuffer<char, false>           tlasBins;
+        // Sort ping-pong buffers for the radix sort. Each pass reads
+        // from one and writes to the other, then the roles swap. Layout
+        // per element: one u32 packed key = (morton16 << 16) | tlasIndex16.
+        // Sized for N_PADDED.
+        WebGPUBuffer<char, false>           sortTempA;
+        WebGPUBuffer<char, false>           sortTempB;

        std::uint32_t builtInstanceCount = 0;
    };
@ -141,6 +172,17 @@ export namespace Crafter {
        // a fresh build (no refit) — the GPU build pass is cheap at the
        // ~10–100 instance counts the design targets; LBVH-for-TLAS is a
        // future optimization for larger scenes.
+        //
+        // BuildTLAS is now split into Upload + Build so a physics
+        // compute pass (e.g. physics-tlas-transform) can run between the
+        // CPU mirror upload and the GPU LBVH build. The compute pass
+        // writes the per-instance transform bytes that BuildTLAS leaves
+        // intact for elements flagged transformOwnedByGpu, and those
+        // writes have to land before the LBVH reads them. The combined
+        // BuildTLAS is kept as a convenience for callers that don't
+        // interleave a compute pass (e.g. the ctor-time first build).
+        static void BuildTLASUpload(WebGPUCommandEncoderRef cmd, std::uint32_t index);
+        static void BuildTLASBuild(WebGPUCommandEncoderRef cmd, std::uint32_t index);
        static void BuildTLAS(WebGPUCommandEncoderRef cmd, std::uint32_t index);

        static void Add(RenderingElement3D* e);