/* Crafter®.Graphics Copyright (C) 2026 Catcrafts® catcrafts.net */ // DOM-mode TLAS upkeep. BuildTLAS is split in two phases so a physics // compute pass can run between them: // - BuildTLASUpload mirrors the CPU-side RTInstance array into the // host-visible instance buffer (with partial-write semantics that // preserve the transform bytes for elements flagged // transformOwnedByGpu, see notes in the body) and uploads the // metadata buffer. // - BuildTLASBuild dispatches the JS-side TLAS-build compute pass — // which consults the per-BLAS records published at Mesh::Build() // time to produce world-space AABBs and inverse transforms in the // format `traceRay` / `rayQuery` consume. // The combined BuildTLAS calls both back-to-back; callers that want to // interleave a physics tlas-transform compute pass (which writes the // transform bytes BuildTLASUpload leaves intact) call Upload + their // compute pass + Build manually. module; module Crafter.Graphics:RenderingElement3D_implWebGPU; import :RenderingElement3D; import :Mesh; import :WebGPU; import :WebGPUBuffer; import std; using namespace Crafter; std::vector RenderingElement3D::elements; void RenderingElement3D::Add(RenderingElement3D* e) { e->indexInElements = static_cast(elements.size()); elements.push_back(e); } void RenderingElement3D::Remove(RenderingElement3D* e) { std::uint32_t idx = e->indexInElements; if (idx == std::numeric_limits::max()) return; std::uint32_t last = static_cast(elements.size() - 1); if (idx != last) { elements[idx] = elements[last]; elements[idx]->indexInElements = idx; } elements.pop_back(); e->indexInElements = std::numeric_limits::max(); } void RenderingElement3D::BuildTLASUpload(WebGPUCommandEncoderRef /*cmd*/, std::uint32_t index) { auto& tlas = tlases[index]; const std::uint32_t primitiveCount = static_cast(elements.size()); if (primitiveCount == 0) { tlas.builtInstanceCount = 0; return; } constexpr std::uint32_t kNPadded = 65536u; // size for instance / metadata mirrors constexpr std::uint32_t kLbvhMax = 16384u; // matches N_PADDED in lbvhBuildWgsl constexpr std::uint32_t kNodeCount = 2u * kNPadded - 1u; // ALL TLAS-side GPU buffers get allocated ONCE and never resized. // The LBVH-build shader takes the real instance count via a uniform // (lbvhPc.nReal) instead of arrayLength(&entries), so the // tlas.buffer / entryOrder / mortonCodes don't need to grow when // the application's element count changes. // // Why this matters: an earlier version resized these per-frame on // primitiveCount change. The destroy+recreate cycle on the GPU // buffer caused subtle mid-game flicker as soon as any element was // added (e.g. firing a projectile) — fort braces would appear to // briefly vanish in patterns deterministic on the projectile's // angle. Suspected driver-level memory recycling without proper // zero-init; the fixed-size allocation sidesteps it entirely. if (tlas.instanceBuffer.handle == 0) { tlas.instanceBuffer.Resize(kNPadded); tlas.metadataBuffer.Resize(kNPadded); tlas.bvhNodes.Resize(kNodeCount * 32u); tlas.sortTempA.Resize(kNPadded * 4u); tlas.sortTempB.Resize(kNPadded * 4u); tlas.tlasBins.Resize(64 * 32); // TLAS-entry / order / morton-code buffers: sized for the LBVH // cap (16384). lbvhBuildMain iterates `lbvhPc.nReal` real // entries; the remainder stays zero / sentinel. Keep these // stable across element-count changes so the renderer's bind // group references the same buffer handle every frame. tlas.buffer.Resize(kLbvhMax * 144u); tlas.entryOrder.Resize(kLbvhMax * 4u); tlas.mortonCodes.Resize(kLbvhMax * 4u); } // NB: tlas.buffer / entryOrder / mortonCodes get resized in // BuildTLASBuild, NOT here. Resize destroys + recreates the GPU // resource (and the JS-side handle); the rayQuery dispatches that // run between BuildTLASUpload and BuildTLASBuild (projectile-collide, // splash, builder-pick) still hold the previous frame's TLAS in // rtState.current{Tlas,EntryOrder,Bvh}. If we resized here, those // handles would point at destroyed buffers and the dispatches would // log "no TLAS built yet" every frame the element count changed // (e.g. every projectile fire). Resizing inside BuildTLASBuild, // immediately before wgpuBuildTLAS publishes the new handles, keeps // the JS-side current* refs in sync with the GPU resources. for (std::uint32_t i = 0; i < primitiveCount; ++i) { auto& dst = tlas.instanceBuffer.value[i]; const auto& src = elements[i]->instance; if (elements[i]->transformOwnedByGpu) { // Preserve whatever the GPU compute shader most recently // wrote into dst.transform. Update only the non-transform // fields. dst.instanceCustomIndex = src.instanceCustomIndex; dst.mask = src.mask; dst.instanceShaderBindingTableRecordOffset = src.instanceShaderBindingTableRecordOffset; dst.flags = src.flags; dst.accelerationStructureReference = src.accelerationStructureReference; } else { dst = src; } tlas.metadataBuffer.value[i] = elements[i]->userMetadata; } // Upload the instance buffer with partial-write semantics: for runs // of CPU-driven elements (transformOwnedByGpu=false) we push the // whole 64-byte struct in one writeBuffer call; for GPU-driven runs // we push only the trailing 16 metadata bytes per element, leaving // the transform field intact for the physics-tlas-transform compute // shader to update. The two arms below produce identical GPU state // when every element is CPU-driven — this is a no-op refactor until // 3DForts flips its physics elements to transformOwnedByGpu=true. constexpr std::uint32_t kInstSize = sizeof(RTInstance); // 64 constexpr std::uint32_t kTransformSize = sizeof(RTTransformMatrix); // 48 constexpr std::uint32_t kMetaSize = kInstSize - kTransformSize; // 16 std::uint32_t runStart = 0; bool runOwned = elements[0]->transformOwnedByGpu; for (std::uint32_t i = 1; i <= primitiveCount; ++i) { const bool atEnd = (i == primitiveCount); const bool currOwned = atEnd ? !runOwned : elements[i]->transformOwnedByGpu; if (currOwned == runOwned && !atEnd) continue; if (runOwned) { // GPU-driven run — metadata only, per element. Cannot batch // because the metadata bytes are non-contiguous in the // instance buffer (one 16-byte chunk per 64-byte slot). for (std::uint32_t j = runStart; j < i; ++j) { const std::uint32_t off = j * kInstSize + kTransformSize; tlas.instanceBuffer.FlushDeviceRange(off, off, kMetaSize); } } else { // CPU-driven run — one contiguous writeBuffer. const std::uint32_t startOff = runStart * kInstSize; const std::uint32_t bytes = (i - runStart) * kInstSize; tlas.instanceBuffer.FlushDeviceRange(startOff, startOff, bytes); } runStart = i; runOwned = currOwned; } tlas.metadataBuffer.FlushDevice(); } void RenderingElement3D::BuildTLASBuild(WebGPUCommandEncoderRef /*cmd*/, std::uint32_t index) { auto& tlas = tlases[index]; const std::uint32_t primitiveCount = static_cast(elements.size()); if (primitiveCount == 0) { // Upload already cleared builtInstanceCount; nothing to dispatch. return; } // No per-count Resize. tlas.buffer / entryOrder / mortonCodes were // allocated at kLbvhMax in BuildTLASUpload's first call and stay // that size. The LBVH shader reads the real count from a uniform // (lbvhPc.nReal) wgpuBuildTLAS writes each call. WebGPU::wgpuBuildTLAS(tlas.instanceBuffer.handle, static_cast(primitiveCount), tlas.buffer.handle, tlas.entryOrder.handle, tlas.mortonCodes.handle, tlas.tlasBins.handle, tlas.bvhNodes.handle, tlas.sortTempA.handle, tlas.sortTempB.handle); tlas.builtInstanceCount = primitiveCount; } void RenderingElement3D::BuildTLAS(WebGPUCommandEncoderRef cmd, std::uint32_t index) { BuildTLASUpload(cmd, index); BuildTLASBuild(cmd, index); }