webgpu improvements

This commit is contained in:
Jorijn van der Graaf 2026-05-24 13:32:08 +02:00
commit 8347467e1e
18 changed files with 1932 additions and 153 deletions

View file

@ -225,6 +225,7 @@ namespace {
std::span<const std::uint32_t> indices,
std::span<const std::byte> attribsBytes) {
mesh.triangleCount = static_cast<std::uint32_t>(indices.size()) / 3;
mesh.vertexCount = static_cast<std::uint32_t>(vertices.size());
Builder builder;
builder.Build(vertices, indices);

View file

@ -4,12 +4,21 @@ Copyright (C) 2026 Catcrafts®
catcrafts.net
*/
// DOM-mode TLAS upkeep. BuildTLAS copies the per-element RTInstance into
// the host-visible instance buffer (skipping the transform for elements
// whose transform is GPU-owned), uploads it, then dispatches the JS-side
// TLAS-build compute pass — which consults the per-BLAS records published
// at Mesh::Build() time to produce world-space AABBs and inverse
// transforms in the format `traceRay` / `rayQuery` consume.
// DOM-mode TLAS upkeep. BuildTLAS is split in two phases so a physics
// compute pass can run between them:
// - BuildTLASUpload mirrors the CPU-side RTInstance array into the
// host-visible instance buffer (with partial-write semantics that
// preserve the transform bytes for elements flagged
// transformOwnedByGpu, see notes in the body) and uploads the
// metadata buffer.
// - BuildTLASBuild dispatches the JS-side TLAS-build compute pass —
// which consults the per-BLAS records published at Mesh::Build()
// time to produce world-space AABBs and inverse transforms in the
// format `traceRay` / `rayQuery` consume.
// The combined BuildTLAS calls both back-to-back; callers that want to
// interleave a physics tlas-transform compute pass (which writes the
// transform bytes BuildTLASUpload leaves intact) call Upload + their
// compute pass + Build manually.
module;
module Crafter.Graphics:RenderingElement3D_implWebGPU;
@ -41,7 +50,7 @@ void RenderingElement3D::Remove(RenderingElement3D* e) {
e->indexInElements = std::numeric_limits<std::uint32_t>::max();
}
void RenderingElement3D::BuildTLAS(WebGPUCommandEncoderRef /*cmd*/, std::uint32_t index) {
void RenderingElement3D::BuildTLASUpload(WebGPUCommandEncoderRef /*cmd*/, std::uint32_t index) {
auto& tlas = tlases[index];
const std::uint32_t primitiveCount = static_cast<std::uint32_t>(elements.size());
if (primitiveCount == 0) {
@ -49,19 +58,52 @@ void RenderingElement3D::BuildTLAS(WebGPUCommandEncoderRef /*cmd*/, std::uint32_
return;
}
// (Re)allocate instance + metadata + output TLAS buffers if the count
// changed. WebGPUBuffer::Resize destroys and recreates the GPU buffer;
// bind-group caches keyed on the buffer handle are invalidated in the
// JS bridge automatically.
if (primitiveCount != tlas.builtInstanceCount) {
tlas.instanceBuffer.Resize(primitiveCount);
tlas.metadataBuffer.Resize(primitiveCount);
// TLASEntry layout in WGSL is 144 bytes due to vec3 align/pad
// rules. Must match the struct declared in the rtWgslTypes
// block in additional/dom-webgpu.js.
tlas.buffer.Resize(primitiveCount * 144);
constexpr std::uint32_t kNPadded = 65536u; // size for instance / metadata mirrors
constexpr std::uint32_t kLbvhMax = 16384u; // matches N_PADDED in lbvhBuildWgsl
constexpr std::uint32_t kNodeCount = 2u * kNPadded - 1u;
// ALL TLAS-side GPU buffers get allocated ONCE and never resized.
// The LBVH-build shader takes the real instance count via a uniform
// (lbvhPc.nReal) instead of arrayLength(&entries), so the
// tlas.buffer / entryOrder / mortonCodes don't need to grow when
// the application's element count changes.
//
// Why this matters: an earlier version resized these per-frame on
// primitiveCount change. The destroy+recreate cycle on the GPU
// buffer caused subtle mid-game flicker as soon as any element was
// added (e.g. firing a projectile) — fort braces would appear to
// briefly vanish in patterns deterministic on the projectile's
// angle. Suspected driver-level memory recycling without proper
// zero-init; the fixed-size allocation sidesteps it entirely.
if (tlas.instanceBuffer.handle == 0) {
tlas.instanceBuffer.Resize(kNPadded);
tlas.metadataBuffer.Resize(kNPadded);
tlas.bvhNodes.Resize(kNodeCount * 32u);
tlas.sortTempA.Resize(kNPadded * 4u);
tlas.sortTempB.Resize(kNPadded * 4u);
tlas.tlasBins.Resize(64 * 32);
// TLAS-entry / order / morton-code buffers: sized for the LBVH
// cap (16384). lbvhBuildMain iterates `lbvhPc.nReal` real
// entries; the remainder stays zero / sentinel. Keep these
// stable across element-count changes so the renderer's bind
// group references the same buffer handle every frame.
tlas.buffer.Resize(kLbvhMax * 144u);
tlas.entryOrder.Resize(kLbvhMax * 4u);
tlas.mortonCodes.Resize(kLbvhMax * 4u);
}
// NB: tlas.buffer / entryOrder / mortonCodes are allocated once, in the
// first-call block above, and are never resized afterwards — neither
// here nor in BuildTLASBuild. Resize destroys + recreates the GPU
// resource (and the JS-side handle); the rayQuery dispatches that
// run between BuildTLASUpload and BuildTLASBuild (projectile-collide,
// splash, builder-pick) still hold the previous frame's TLAS in
// rtState.current{Tlas,EntryOrder,Bvh}. If we resized here on an
// element-count change, those handles would point at destroyed buffers
// and the dispatches would log "no TLAS built yet" every frame the
// element count changed (e.g. every projectile fire). Keeping the
// allocation fixed keeps the JS-side current* refs in sync with the
// GPU resources.
for (std::uint32_t i = 0; i < primitiveCount; ++i) {
auto& dst = tlas.instanceBuffer.value[i];
const auto& src = elements[i]->instance;
@ -80,12 +122,73 @@ void RenderingElement3D::BuildTLAS(WebGPUCommandEncoderRef /*cmd*/, std::uint32_
tlas.metadataBuffer.value[i] = elements[i]->userMetadata;
}
tlas.instanceBuffer.FlushDevice();
// Upload the instance buffer with partial-write semantics: for runs
// of CPU-driven elements (transformOwnedByGpu=false) we push the
// whole 64-byte struct in one writeBuffer call; for GPU-driven runs
// we push only the trailing 16 metadata bytes per element, leaving
// the transform field intact for the physics-tlas-transform compute
// shader to update. The two arms below produce identical GPU state
// when every element is CPU-driven — this is a no-op refactor until
// 3DForts flips its physics elements to transformOwnedByGpu=true.
constexpr std::uint32_t kInstSize = sizeof(RTInstance); // 64
constexpr std::uint32_t kTransformSize = sizeof(RTTransformMatrix); // 48
constexpr std::uint32_t kMetaSize = kInstSize - kTransformSize; // 16
std::uint32_t runStart = 0;
bool runOwned = elements[0]->transformOwnedByGpu;
for (std::uint32_t i = 1; i <= primitiveCount; ++i) {
const bool atEnd = (i == primitiveCount);
const bool currOwned = atEnd ? !runOwned : elements[i]->transformOwnedByGpu;
if (currOwned == runOwned && !atEnd) continue;
if (runOwned) {
// GPU-driven run — metadata only, per element. Cannot batch
// because the metadata bytes are non-contiguous in the
// instance buffer (one 16-byte chunk per 64-byte slot).
for (std::uint32_t j = runStart; j < i; ++j) {
const std::uint32_t off = j * kInstSize + kTransformSize;
tlas.instanceBuffer.FlushDeviceRange(off, off, kMetaSize);
}
} else {
// CPU-driven run — one contiguous writeBuffer.
const std::uint32_t startOff = runStart * kInstSize;
const std::uint32_t bytes = (i - runStart) * kInstSize;
tlas.instanceBuffer.FlushDeviceRange(startOff, startOff, bytes);
}
runStart = i;
runOwned = currOwned;
}
tlas.metadataBuffer.FlushDevice();
}
// Second phase of the DOM-mode TLAS update: hands the buffers of the TLAS
// at `index` to the JS-side build pass via WebGPU::wgpuBuildTLAS.
//
// The command-encoder parameter is unnamed and unused. `index` selects the
// entry in `tlases`. When `elements` is empty the function returns without
// dispatching; otherwise it records the element count in
// tlas.builtInstanceCount after the call.
void RenderingElement3D::BuildTLASBuild(WebGPUCommandEncoderRef /*cmd*/, std::uint32_t index) {
auto& tlas = tlases[index];
// One TLAS instance per registered element.
const std::uint32_t primitiveCount = static_cast<std::uint32_t>(elements.size());
if (primitiveCount == 0) {
// Upload already cleared builtInstanceCount; nothing to dispatch.
return;
}
// No per-count Resize. tlas.buffer / entryOrder / mortonCodes were
// allocated at kLbvhMax in BuildTLASUpload's first call and stay
// that size. The LBVH shader reads the real count from a uniform
// (lbvhPc.nReal) wgpuBuildTLAS writes each call.
WebGPU::wgpuBuildTLAS(tlas.instanceBuffer.handle,
static_cast<std::int32_t>(primitiveCount),
// NOTE(review): the next line closes the call after three arguments, yet
// seven more handle arguments follow it. It looks like the superseded
// 3-argument form of the call carried over by the diff view — confirm
// against the real file and drop it if so.
tlas.buffer.handle);
tlas.buffer.handle,
tlas.entryOrder.handle,
tlas.mortonCodes.handle,
tlas.tlasBins.handle,
tlas.bvhNodes.handle,
tlas.sortTempA.handle,
tlas.sortTempB.handle);
// Remember how many instances this build covered.
tlas.builtInstanceCount = primitiveCount;
}
// Combined entry point: runs both TLAS phases back-to-back for the TLAS at
// `index`, forwarding `cmd` to each. Callers that need to run their own
// compute pass between the two phases call BuildTLASUpload and
// BuildTLASBuild themselves instead of this wrapper.
void RenderingElement3D::BuildTLAS(WebGPUCommandEncoderRef cmd, std::uint32_t index) {
// Phase 1: mirror CPU-side instance + metadata state into the GPU buffers.
BuildTLASUpload(cmd, index);
// Phase 2: dispatch the TLAS build over the uploaded instances.
BuildTLASBuild(cmd, index);
}

View file

@ -98,13 +98,9 @@ void UIRenderer::DispatchImages(GraphicsCommandBuffer /*cmd*/, std::uint32_t buf
if (itemCount == 0) return;
UIDispatchHeader hdr = FillHeader(bufferSlot, itemCount, clipRectPx);
auto handle = heap_->bufferTable[bufferSlot];
// For DispatchImages, the WGSL expects a texture + sampler in group 3.
// The library v1 doesn't expose user-image registration on DOM (out of
// scope per plan). If the user calls DispatchImages without a registered
// image, fall back to using the font atlas binding — the user's items
// should reference texSlot/sampSlot but on DOM those are ignored. For
// now, route through the font atlas texture if available; otherwise
// skip the dispatch.
// Backward-compatible fallback: callers that don't pass a texture
// get the font atlas. Useful for tests, useless for real content.
// New code should use the 6-arg overload below.
if (fontAtlasImageSlot_) {
auto texHandle = heap_->imageTable[fontAtlasImageSlot_];
auto sampHandle = heap_->samplerTable[fontAtlasSamplerSlot_];
@ -115,6 +111,21 @@ void UIRenderer::DispatchImages(GraphicsCommandBuffer /*cmd*/, std::uint32_t buf
}
}
// Dispatches the UI image pass using a caller-selected texture and sampler.
//
// Parameters:
//   (cmd)        unused on this backend.
//   bufferSlot   index into heap_->bufferTable of the item buffer.
//   itemCount    number of items; zero makes the call a no-op.
//   imageSlot    index into heap_->imageTable of the texture to bind.
//   samplerSlot  index into heap_->samplerTable of the sampler to bind.
//   clipRectPx   clip rectangle, forwarded to FillHeader.
//
// The dispatch is skipped (nothing is submitted) when imageSlot or
// samplerSlot lies outside its table, so a stale or mistyped slot cannot
// read past the end of the heap tables. bufferSlot is indexed unchecked,
// same as in the other Dispatch* overloads.
void UIRenderer::DispatchImages(GraphicsCommandBuffer /*cmd*/, std::uint32_t bufferSlot,
                                std::uint32_t itemCount,
                                std::uint16_t imageSlot, std::uint16_t samplerSlot,
                                std::array<float,4> clipRectPx) {
    if (itemCount == 0) return;

    UIDispatchHeader hdr = FillHeader(bufferSlot, itemCount, clipRectPx);
    auto handle = heap_->bufferTable[bufferSlot];

    // Slots come straight from the caller; validate them before indexing,
    // matching the range check Dispatch() applies to custom bindings.
    if (imageSlot >= heap_->imageTable.size()
        || samplerSlot >= heap_->samplerTable.size()) {
        return;
    }
    auto texHandle  = heap_->imageTable[imageSlot];
    auto sampHandle = heap_->samplerTable[samplerSlot];

    // Grid dimensions are derived from the window size via TilesFor.
    WebGPU::wgpuDispatchImages(handle, &hdr,
                               static_cast<std::int32_t>(TilesFor(window_->width)),
                               static_cast<std::int32_t>(TilesFor(window_->height)),
                               texHandle, sampHandle);
}
void UIRenderer::DispatchText(GraphicsCommandBuffer /*cmd*/, std::uint32_t bufferSlot,
std::uint32_t itemCount,
std::array<float,4> clipRectPx) {
@ -168,6 +179,7 @@ void UIRenderer::Dispatch(GraphicsCommandBuffer /*cmd*/, const GraphicsComputeSh
case UICustomBindingKind::Sampler:
if (slot < heap_->samplerTable.size()) handle = heap_->samplerTable[slot];
break;
default: break;
}
handles.push_back(handle);
}