webgpu improvements

2026-05-24 13:32:08 +02:00 · 2026-05-24 13:32:08 +02:00 · 8347467e1e
commit 8347467e1e
parent 5a75571ffd
18 changed files with 1932 additions and 153 deletions
--- a/implementations/Crafter.Graphics-RenderingElement3D-WebGPU.cpp
+++ b/implementations/Crafter.Graphics-RenderingElement3D-WebGPU.cpp
@ -4,12 +4,21 @@ Copyright (C) 2026 Catcrafts®
 catcrafts.net
 */

-// DOM-mode TLAS upkeep. BuildTLAS copies the per-element RTInstance into
-// the host-visible instance buffer (skipping the transform for elements
-// whose transform is GPU-owned), uploads it, then dispatches the JS-side
-// TLAS-build compute pass — which consults the per-BLAS records published
-// at Mesh::Build() time to produce world-space AABBs and inverse
-// transforms in the format `traceRay` / `rayQuery` consume.
+// DOM-mode TLAS upkeep. BuildTLAS is split in two phases so a physics
+// compute pass can run between them:
+//   - BuildTLASUpload mirrors the CPU-side RTInstance array into the
+//     host-visible instance buffer (with partial-write semantics that
+//     preserve the transform bytes for elements flagged
+//     transformOwnedByGpu, see notes in the body) and uploads the
+//     metadata buffer.
+//   - BuildTLASBuild dispatches the JS-side TLAS-build compute pass —
+//     which consults the per-BLAS records published at Mesh::Build()
+//     time to produce world-space AABBs and inverse transforms in the
+//     format `traceRay` / `rayQuery` consume.
+// The combined BuildTLAS calls both back-to-back; callers that want to
+// interleave a physics tlas-transform compute pass (which writes the
+// transform bytes BuildTLASUpload leaves intact) call Upload + their
+// compute pass + Build manually.

 module;
 module Crafter.Graphics:RenderingElement3D_implWebGPU;
@ -41,7 +50,7 @@ void RenderingElement3D::Remove(RenderingElement3D* e) {
    e->indexInElements = std::numeric_limits<std::uint32_t>::max();
 }

-void RenderingElement3D::BuildTLAS(WebGPUCommandEncoderRef /*cmd*/, std::uint32_t index) {
+void RenderingElement3D::BuildTLASUpload(WebGPUCommandEncoderRef /*cmd*/, std::uint32_t index) {
    auto& tlas = tlases[index];
    const std::uint32_t primitiveCount = static_cast<std::uint32_t>(elements.size());
    if (primitiveCount == 0) {
@ -49,19 +58,52 @@ void RenderingElement3D::BuildTLAS(WebGPUCommandEncoderRef /*cmd*/, std::uint32_
        return;
    }

-    // (Re)allocate instance + metadata + output TLAS buffers if the count
-    // changed. WebGPUBuffer::Resize destroys and recreates the GPU buffer;
-    // bind-group caches keyed on the buffer handle are invalidated in the
-    // JS bridge automatically.
-    if (primitiveCount != tlas.builtInstanceCount) {
-        tlas.instanceBuffer.Resize(primitiveCount);
-        tlas.metadataBuffer.Resize(primitiveCount);
-        // TLASEntry layout in WGSL is 144 bytes due to vec3 align/pad
-        // rules. Must match the struct declared in the rtWgslTypes
-        // block in additional/dom-webgpu.js.
-        tlas.buffer.Resize(primitiveCount * 144);
+    constexpr std::uint32_t kNPadded   = 65536u;     // size for instance / metadata mirrors
+    constexpr std::uint32_t kLbvhMax   = 16384u;     // matches N_PADDED in lbvhBuildWgsl
+    constexpr std::uint32_t kNodeCount = 2u * kNPadded - 1u;
+
+    // ALL TLAS-side GPU buffers get allocated ONCE and never resized.
+    // The LBVH-build shader takes the real instance count via a uniform
+    // (lbvhPc.nReal) instead of arrayLength(&entries), so the
+    // tlas.buffer / entryOrder / mortonCodes don't need to grow when
+    // the application's element count changes.
+    //
+    // Why this matters: an earlier version resized these per-frame on
+    // primitiveCount change. The destroy+recreate cycle on the GPU
+    // buffer caused subtle mid-game flicker as soon as any element was
+    // added (e.g. firing a projectile) — fort braces would appear to
+    // briefly vanish in patterns deterministic on the projectile's
+    // angle. Suspected driver-level memory recycling without proper
+    // zero-init; the fixed-size allocation sidesteps it entirely.
+    if (tlas.instanceBuffer.handle == 0) {
+        tlas.instanceBuffer.Resize(kNPadded);
+        tlas.metadataBuffer.Resize(kNPadded);
+        tlas.bvhNodes.Resize(kNodeCount * 32u);
+        tlas.sortTempA.Resize(kNPadded * 4u);
+        tlas.sortTempB.Resize(kNPadded * 4u);
+        tlas.tlasBins.Resize(64 * 32);
+        // TLAS-entry / order / morton-code buffers: sized for the LBVH
+        // cap (16384). lbvhBuildMain iterates `lbvhPc.nReal` real
+        // entries; the remainder stays zero / sentinel. Keep these
+        // stable across element-count changes so the renderer's bind
+        // group references the same buffer handle every frame.
+        tlas.buffer.Resize(kLbvhMax * 144u);
+        tlas.entryOrder.Resize(kLbvhMax * 4u);
+        tlas.mortonCodes.Resize(kLbvhMax * 4u);
    }

+    // NB: tlas.buffer / entryOrder / mortonCodes are allocated once above
+    // and never resized — not here and not in BuildTLASBuild. Resize
+    // destroys + recreates the GPU resource (and the JS-side handle); the
+    // rayQuery dispatches that run between BuildTLASUpload and
+    // BuildTLASBuild (projectile-collide, splash, builder-pick) still hold
+    // the previous frame's TLAS in rtState.current{Tlas,EntryOrder,Bvh}.
+    // After a resize those refs would point at destroyed buffers and the
+    // dispatches would log "no TLAS built yet" every frame the element
+    // count changed (e.g. every projectile fire).
+    // NOTE(review): primitiveCount is not checked against kLbvhMax /
+    // kNPadded in the visible code — confirm a cap is enforced elsewhere.
+
    for (std::uint32_t i = 0; i < primitiveCount; ++i) {
        auto& dst = tlas.instanceBuffer.value[i];
        const auto& src = elements[i]->instance;
@ -80,12 +122,73 @@ void RenderingElement3D::BuildTLAS(WebGPUCommandEncoderRef /*cmd*/, std::uint32_
        tlas.metadataBuffer.value[i] = elements[i]->userMetadata;
    }

-    tlas.instanceBuffer.FlushDevice();
+    // Upload the instance buffer with partial-write semantics: for runs
+    // of CPU-driven elements (transformOwnedByGpu=false) we push the
+    // whole 64-byte struct in one writeBuffer call; for GPU-driven runs
+    // we push only the trailing 16 bytes of each RTInstance (kMetaSize;
+    // unrelated to tlas.metadataBuffer), leaving the transform intact for
+    // the physics-tlas-transform compute shader to update. With every
+    // element CPU-driven the loop issues a single write covering all
+    // instances — a no-op refactor until 3DForts uses transformOwnedByGpu=true.
+    constexpr std::uint32_t kInstSize      = sizeof(RTInstance);          // 64
+    constexpr std::uint32_t kTransformSize = sizeof(RTTransformMatrix);   // 48
+    constexpr std::uint32_t kMetaSize      = kInstSize - kTransformSize;  // 16
+
+    std::uint32_t runStart = 0;
+    bool runOwned = elements[0]->transformOwnedByGpu;
+    for (std::uint32_t i = 1; i <= primitiveCount; ++i) {
+        const bool atEnd     = (i == primitiveCount);
+        const bool currOwned = atEnd ? !runOwned : elements[i]->transformOwnedByGpu;
+        if (currOwned == runOwned && !atEnd) continue;
+
+        if (runOwned) {
+            // GPU-driven run — metadata only, per element. Cannot batch
+            // because the metadata bytes are non-contiguous in the
+            // instance buffer (one 16-byte chunk per 64-byte slot).
+            for (std::uint32_t j = runStart; j < i; ++j) {
+                const std::uint32_t off = j * kInstSize + kTransformSize;
+                tlas.instanceBuffer.FlushDeviceRange(off, off, kMetaSize);
+            }
+        } else {
+            // CPU-driven run — one contiguous writeBuffer.
+            const std::uint32_t startOff = runStart * kInstSize;
+            const std::uint32_t bytes    = (i - runStart) * kInstSize;
+            tlas.instanceBuffer.FlushDeviceRange(startOff, startOff, bytes);
+        }
+        runStart = i;
+        runOwned = currOwned;
+    }
+
    tlas.metadataBuffer.FlushDevice();
+}
+
+void RenderingElement3D::BuildTLASBuild(WebGPUCommandEncoderRef /*cmd*/, std::uint32_t index) {
+    auto& tlas = tlases[index];
+    const std::uint32_t primitiveCount = static_cast<std::uint32_t>(elements.size());
+    if (primitiveCount == 0) {
+        // Upload already cleared builtInstanceCount; nothing to dispatch.
+        return;
+    }
+
+    // No per-count Resize. tlas.buffer / entryOrder / mortonCodes were
+    // allocated at kLbvhMax in BuildTLASUpload's first call and stay
+    // that size. The LBVH shader reads the real count from a uniform
+    // (lbvhPc.nReal) that wgpuBuildTLAS writes on every call.

    WebGPU::wgpuBuildTLAS(tlas.instanceBuffer.handle,
                          static_cast<std::int32_t>(primitiveCount),
-                          tlas.buffer.handle);
+                          tlas.buffer.handle,
+                          tlas.entryOrder.handle,
+                          tlas.mortonCodes.handle,
+                          tlas.tlasBins.handle,
+                          tlas.bvhNodes.handle,
+                          tlas.sortTempA.handle,
+                          tlas.sortTempB.handle);

    tlas.builtInstanceCount = primitiveCount;
 }
+
+void RenderingElement3D::BuildTLAS(WebGPUCommandEncoderRef cmd, std::uint32_t index) {
+    BuildTLASUpload(cmd, index);
+    BuildTLASBuild(cmd, index);
+}