webgpu improvements

This commit is contained in:
Jorijn van der Graaf 2026-05-24 13:32:08 +02:00
commit 8347467e1e
18 changed files with 1932 additions and 153 deletions

View file

@@ -121,6 +121,37 @@ export namespace Crafter {
// customIndex (4) + _pad (12). Defined in the WGSL traversal
// library; never directly read by C++.
WebGPUBuffer<char, false> buffer;
// GPU LBVH support — see additional/dom-webgpu.js's TLAS-build
// pipeline.
//
// entryOrder: per-frame permutation array of u32, indexing into
// `buffer` (the TLASEntry[] array). Populated by the radix-sort
// pass to spatially-coherent Morton order, then consumed by the
// BVH construction + traversal passes. In Stage 1 (this
// baseline) it's the identity permutation written by
// tlasBuildMain alongside the entries.
WebGPUBuffer<char, false> entryOrder;
// mortonCodes: per-instance 32-bit Morton codes computed from the
// world-AABB centroid, used as the radix-sort key. Written by
// tlasBuildMain.
WebGPUBuffer<char, false> mortonCodes;
// bvhNodes: 2*N_PADDED - 1 sweep-tree BVH nodes built per frame
// by the LBVH-build compute pass. Each node is 32 bytes (aabbMin +
// pad, aabbMax + pad). N_PADDED = 65536 (hardcoded in WGSL).
// Internal nodes [0, N_PADDED-1); leaves [N_PADDED-1, 2*N_PADDED-1).
// Node i's children are 2i+1, 2i+2 (implicit perfect binary
// tree). Cap: 65536 instances per scene.
WebGPUBuffer<char, false> bvhNodes;
// tlasBins: dead, kept allocated as a 64-byte placeholder so the
// existing wgpuBuildTLAS C++ signature doesn't need to churn.
// The pre-LBVH 64-bin partition was replaced by the full BVH.
WebGPUBuffer<char, false> tlasBins;
// Sort ping-pong buffers for the radix sort. Each pass reads
// from one and writes to the other, swapping roles. Layout per
// element: 1 u32 packed key = (morton16 << 16) | tlasIndex16.
// Sized for N_PADDED.
WebGPUBuffer<char, false> sortTempA;
WebGPUBuffer<char, false> sortTempB;
std::uint32_t builtInstanceCount = 0;
};
@@ -141,6 +172,17 @@ export namespace Crafter {
// a fresh build (no refit) — the GPU build pass is cheap at the
// ~10-100 instance counts the design targets; LBVH-for-TLAS is a
// future optimization for larger scenes.
//
// BuildTLAS is now split into Upload + Build so a physics
// compute pass (e.g. physics-tlas-transform) can run between the
// CPU mirror upload and the GPU LBVH build. The compute pass
// writes the per-instance transform bytes that BuildTLAS leaves
// intact for elements flagged transformOwnedByGpu, and those
// writes have to land before the LBVH reads them. The combined
// BuildTLAS is kept as a convenience for callers that don't
// interleave a compute pass (e.g. the ctor-time first build).
// First half of the split BuildTLAS: the CPU mirror upload. Meant to be
// issued before any compute pass that writes GPU-owned transforms, and
// before BuildTLASBuild.
// NOTE(review): the meaning of `index` is not visible from this
// declaration — confirm against the definition.
static void BuildTLASUpload(WebGPUCommandEncoderRef cmd, std::uint32_t index);
// Second half of the split BuildTLAS: the GPU LBVH build. Any compute pass
// that writes transforms for elements flagged transformOwnedByGpu has to
// run before this call, because the LBVH build reads those transforms.
// NOTE(review): the meaning of `index` is not visible from this
// declaration — confirm against the definition.
static void BuildTLASBuild(WebGPUCommandEncoderRef cmd, std::uint32_t index);
// Combined convenience entry point for callers that do not interleave a
// compute pass between the upload and the build (e.g. the ctor-time first
// build).
// NOTE(review): presumably equivalent to BuildTLASUpload followed by
// BuildTLASBuild with the same arguments — confirm against the definition.
static void BuildTLAS(WebGPUCommandEncoderRef cmd, std::uint32_t index);
static void Add(RenderingElement3D* e);