webgpu improvements
This commit is contained in:
parent
5a75571ffd
commit
8347467e1e
18 changed files with 1932 additions and 153 deletions
|
|
@ -191,5 +191,13 @@ export namespace Crafter {
|
|||
heap.samplerTable[r.firstElement] = WebGPU::wgpuCreateLinearClampSampler();
|
||||
return SamplerSlot(&heap, r.firstElement);
|
||||
}
|
||||
|
||||
// Same as AllocateLinearClampSampler but the address modes are
|
||||
// `repeat` instead of `clamp-to-edge`. Mip filtering is also linear.
|
||||
inline SamplerSlot AllocateLinearRepeatSampler(DescriptorHeapWebGPU& heap) {
|
||||
DescriptorRange r = heap.AllocateSamplerSlots(1);
|
||||
heap.samplerTable[r.firstElement] = WebGPU::wgpuCreateLinearRepeatSampler();
|
||||
return SamplerSlot(&heap, r.firstElement);
|
||||
}
|
||||
}
|
||||
#endif // CRAFTER_GRAPHICS_WINDOW_DOM
|
||||
|
|
|
|||
|
|
@ -113,17 +113,30 @@ export namespace Crafter {
|
|||
std::uint16_t width = 0;
|
||||
std::uint16_t height = 0;
|
||||
std::uint16_t layers = 0;
|
||||
std::uint8_t mipLevels = 1;
|
||||
|
||||
void Create(std::uint16_t w, std::uint16_t h, std::uint16_t layerCount) {
|
||||
width = w;
|
||||
height = h;
|
||||
layers = layerCount;
|
||||
handle = WebGPU::wgpuCreateImage2DArray(w, h, layerCount);
|
||||
// Create an array with `layerCount` × (w × h) layers, each carrying
|
||||
// `mipLevels` mip levels. Pass mipLevels=1 (default) for a single
|
||||
// base level — matching the original no-mip behaviour. Caller is
|
||||
// responsible for uploading each level via UpdateLayer (which
|
||||
// handles CPU mip-chain generation when mipLevels > 1).
|
||||
void Create(std::uint16_t w, std::uint16_t h, std::uint16_t layerCount,
|
||||
std::uint8_t mipLevelCount = 1) {
|
||||
width = w;
|
||||
height = h;
|
||||
layers = layerCount;
|
||||
mipLevels = mipLevelCount;
|
||||
handle = WebGPU::wgpuCreateImage2DArray(w, h, layerCount, mipLevelCount);
|
||||
}
|
||||
|
||||
// Decompress `tex` and upload to `layer`. The asset's dims must
|
||||
// match the array's (w × h) — resize beforehand on the host with
|
||||
// TextureAsset<RGBA8>::Resize() if they don't.
|
||||
// Decompress `tex`, generate a CPU box-filter mip chain (if
|
||||
// mipLevels > 1), and upload each level into `layer`. The asset's
|
||||
// base-level dims must match the array's (w × h) — resize
|
||||
// beforehand on the host with TextureAsset<RGBA8>::Resize() if
|
||||
// they don't. Pixel data is treated as raw bytes per channel for
|
||||
// the box filter — for non-color data (normal maps) this gives
|
||||
// approximate but adequate results; for sRGB-encoded color data
|
||||
// it's also approximate but visually fine for game textures.
|
||||
void UpdateLayer(std::uint16_t layer, const CompressedTextureAsset& tex) {
|
||||
if (tex.pixelStride != sizeof(PixelType)) {
|
||||
std::println(std::cerr,
|
||||
|
|
@ -142,11 +155,56 @@ export namespace Crafter {
|
|||
std::as_writable_bytes(std::span(pixels)),
|
||||
};
|
||||
Compression::DecompressCPU(tex.blob, outputs);
|
||||
|
||||
// Upload level 0.
|
||||
WebGPU::wgpuWriteImage2DLayer(
|
||||
handle, layer,
|
||||
handle, layer, /*level*/ 0,
|
||||
pixels.data(),
|
||||
static_cast<std::int32_t>(pixels.size() * sizeof(PixelType)),
|
||||
width, height);
|
||||
|
||||
// Generate + upload subsequent mip levels via a 2x2 box filter
|
||||
// on the previous level's bytes. Each channel is averaged
|
||||
// independently across 4 source texels.
|
||||
std::uint16_t srcW = width;
|
||||
std::uint16_t srcH = height;
|
||||
std::vector<PixelType> prev = std::move(pixels);
|
||||
for (std::uint8_t lvl = 1; lvl < mipLevels; ++lvl) {
|
||||
std::uint16_t dstW = std::max<std::uint16_t>(1, srcW >> 1);
|
||||
std::uint16_t dstH = std::max<std::uint16_t>(1, srcH >> 1);
|
||||
std::vector<PixelType> next(static_cast<std::size_t>(dstW) * dstH);
|
||||
constexpr std::size_t kChannels = sizeof(PixelType);
|
||||
auto srcBytes = reinterpret_cast<const std::uint8_t*>(prev.data());
|
||||
auto dstBytes = reinterpret_cast<std::uint8_t*>(next.data());
|
||||
for (std::uint16_t y = 0; y < dstH; ++y) {
|
||||
std::uint16_t sy0 = static_cast<std::uint16_t>(y * 2);
|
||||
std::uint16_t sy1 = static_cast<std::uint16_t>(std::min<std::int32_t>(sy0 + 1, srcH - 1));
|
||||
for (std::uint16_t x = 0; x < dstW; ++x) {
|
||||
std::uint16_t sx0 = static_cast<std::uint16_t>(x * 2);
|
||||
std::uint16_t sx1 = static_cast<std::uint16_t>(std::min<std::int32_t>(sx0 + 1, srcW - 1));
|
||||
std::size_t a = (static_cast<std::size_t>(sy0) * srcW + sx0) * kChannels;
|
||||
std::size_t b = (static_cast<std::size_t>(sy0) * srcW + sx1) * kChannels;
|
||||
std::size_t c = (static_cast<std::size_t>(sy1) * srcW + sx0) * kChannels;
|
||||
std::size_t d = (static_cast<std::size_t>(sy1) * srcW + sx1) * kChannels;
|
||||
std::size_t out = (static_cast<std::size_t>(y) * dstW + x) * kChannels;
|
||||
for (std::size_t ch = 0; ch < kChannels; ++ch) {
|
||||
std::uint32_t sum = static_cast<std::uint32_t>(srcBytes[a + ch])
|
||||
+ static_cast<std::uint32_t>(srcBytes[b + ch])
|
||||
+ static_cast<std::uint32_t>(srcBytes[c + ch])
|
||||
+ static_cast<std::uint32_t>(srcBytes[d + ch]);
|
||||
dstBytes[out + ch] = static_cast<std::uint8_t>((sum + 2u) >> 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
WebGPU::wgpuWriteImage2DLayer(
|
||||
handle, layer, /*level*/ lvl,
|
||||
next.data(),
|
||||
static_cast<std::int32_t>(next.size() * sizeof(PixelType)),
|
||||
dstW, dstH);
|
||||
prev = std::move(next);
|
||||
srcW = dstW;
|
||||
srcH = dstH;
|
||||
}
|
||||
}
|
||||
|
||||
ImageSlot AllocateSlot(DescriptorHeapWebGPU& heap) {
|
||||
|
|
|
|||
|
|
@ -18,10 +18,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|||
*/
|
||||
module;
|
||||
|
||||
#ifndef CRAFTER_GRAPHICS_WINDOW_DOM
|
||||
#endif // !CRAFTER_GRAPHICS_WINDOW_DOM
|
||||
export module Crafter.Graphics:InputField;
|
||||
#ifndef CRAFTER_GRAPHICS_WINDOW_DOM
|
||||
import std;
|
||||
import :Types;
|
||||
import :Keys;
|
||||
|
|
@ -110,4 +107,3 @@ export namespace Crafter {
|
|||
const InputFieldColors& colors,
|
||||
bool caretVisible);
|
||||
}
|
||||
#endif // !CRAFTER_GRAPHICS_WINDOW_DOM
|
||||
|
|
|
|||
|
|
@ -97,6 +97,7 @@ export namespace Crafter {
|
|||
// sentinel; never returned by Build().
|
||||
std::uint64_t blasAddr = 0;
|
||||
std::uint32_t triangleCount = 0;
|
||||
std::uint32_t vertexCount = 0;
|
||||
|
||||
bool opaque = true;
|
||||
|
||||
|
|
|
|||
113
interfaces/Crafter.Graphics-PlainComputeShader.cppm
Normal file
113
interfaces/Crafter.Graphics-PlainComputeShader.cppm
Normal file
|
|
@ -0,0 +1,113 @@
|
|||
/*
|
||||
Crafter®.Graphics
|
||||
Copyright (C) 2026 Catcrafts®
|
||||
catcrafts.net
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License version 3.0 as published by the Free Software Foundation;
|
||||
*/
|
||||
|
||||
// Standalone compute pipeline. Dispatches at any point in the frame
|
||||
// (inside or outside the UI render pass) via the JS bridge's
|
||||
// wgpuDispatchCompute, which mirrors the wgpuBuildTLAS pattern of
|
||||
// attaching to the active encoder when one exists or creating an
|
||||
// ephemeral encoder+submit when not.
|
||||
//
|
||||
// This is the WebGPU counterpart to the Vulkan `:ComputeShader` partition.
|
||||
// They expose the same conceptual API — Load + Dispatch — but with
|
||||
// backend-specific binding plumbing. See `:GraphicsTypes` for the
|
||||
// `GraphicsComputeShader` alias picking the right one per target.
|
||||
//
|
||||
// WGSL contract:
|
||||
// @group(0) @binding(0) uniform PushData // optional; only if pushUniformSize>0
|
||||
// @group(1+) @binding(N) // user bindings via UICustomBinding
|
||||
// When rayQuery is on, @group(1) is reserved for the RT heap; user
|
||||
// bindings start at @group(2).
|
||||
|
||||
module;
|
||||
export module Crafter.Graphics:PlainComputeShader;
|
||||
#ifdef CRAFTER_GRAPHICS_WINDOW_DOM
|
||||
import std;
|
||||
import :WebGPU;
|
||||
import :WebGPUComputeShader; // for UICustomBinding + UICustomBindingKind
|
||||
|
||||
export namespace Crafter {
|
||||
// Holds the bridge handle and load-time metadata of one standalone
// WebGPU compute pipeline. Move-only: copying is deleted, and both move
// operations transfer the pipeline handle and zero the source's handle
// so the moved-from object's Dispatch becomes a no-op.
class PlainComputeShader {
public:
    // Handle returned by WebGPU::wgpuLoadComputePipeline. 0 means nothing
    // is loaded; Dispatch returns early in that case.
    std::uint32_t pipelineHandle = 0;
    // Byte size of the @group(0) @binding(0) push uniform given to Load
    // (0 = the shader declares none).
    std::uint32_t pushUniformSize = 0;
    // Whether Load was asked to enable rayQuery support.
    bool rayQueryCapable = false;
    // Copy of the binding descriptions passed to Load, in Load order.
    std::vector<UICustomBinding> customBindings;

    PlainComputeShader() = default;
    PlainComputeShader(const PlainComputeShader&) = delete;
    PlainComputeShader& operator=(const PlainComputeShader&) = delete;

    PlainComputeShader(PlainComputeShader&& o) noexcept
        : pipelineHandle(o.pipelineHandle),
          pushUniformSize(o.pushUniformSize),
          rayQueryCapable(o.rayQueryCapable),
          customBindings(std::move(o.customBindings)) {
        o.pipelineHandle = 0;
    }

    // Move assignment, mirroring the move constructor. Without it the
    // type is not assignable at all (copy assignment is deleted and a
    // user-declared move constructor suppresses the implicit move
    // assignment), which rules out re-seating a shader or keeping them
    // in containers that shift elements.
    // NOTE(review): a pipeline already held by *this is overwritten, not
    // released — no destroy entry point is visible from this partition,
    // and Load overwrites the handle the same way. Confirm whether the
    // bridge needs an explicit release.
    PlainComputeShader& operator=(PlainComputeShader&& o) noexcept {
        if (this != &o) {
            pipelineHandle = o.pipelineHandle;
            pushUniformSize = o.pushUniformSize;
            rayQueryCapable = o.rayQueryCapable;
            customBindings = std::move(o.customBindings);
            o.pipelineHandle = 0;
        }
        return *this;
    }

    // Compile + link a standalone compute shader.
    //   wgsl             — source.
    //   pushUniformSize_ — byte size of the @group(0)@binding(0) uniform
    //                      struct, or 0 if the shader doesn't declare one.
    //   bindings         — every user-declared resource the dispatch
    //                      should bind (groups 1+ if no rayQuery, 2+ if
    //                      rayQuery). Order MUST match `handles` at
    //                      Dispatch time. Copied into customBindings.
    //   rayQuery         — prepend the RT prelude + rayQuery library
    //                      so the shader can call `rayQuery*` helpers.
    // The returned pipeline handle is stored in pipelineHandle.
    // NOTE(review): a bare string-literal first argument likely cannot
    // choose between this overload and the path overload below; callers
    // probably need an explicit std::string_view or std::filesystem::path
    // — confirm.
    void Load(std::string_view wgsl,
              std::uint32_t pushUniformSize_,
              std::span<const UICustomBinding> bindings = {},
              bool rayQuery = false) {
        pushUniformSize = pushUniformSize_;
        rayQueryCapable = rayQuery;
        customBindings.assign(bindings.begin(), bindings.end());
        pipelineHandle = WebGPU::wgpuLoadComputePipeline(
            wgsl.data(), static_cast<std::int32_t>(wgsl.size()),
            static_cast<std::int32_t>(pushUniformSize),
            // Pass null rather than a pointer into an empty vector.
            customBindings.empty() ? nullptr : customBindings.data(),
            static_cast<std::int32_t>(customBindings.size()),
            rayQuery ? 1 : 0);
    }

    // Convenience overload: read the whole file at `wgslPath` (binary
    // mode) and forward its contents to the string_view overload. Prints
    // a message to stderr and aborts if the file cannot be opened.
    void Load(const std::filesystem::path& wgslPath,
              std::uint32_t pushUniformSize_,
              std::span<const UICustomBinding> bindings = {},
              bool rayQuery = false) {
        std::ifstream f(wgslPath, std::ios::binary);
        if (!f) {
            std::println(std::cerr,
                "PlainComputeShader::Load: cannot open {}", wgslPath.string());
            std::abort();
        }
        std::string wgsl((std::istreambuf_iterator<char>(f)),
                          std::istreambuf_iterator<char>());
        Load(std::string_view{wgsl}, pushUniformSize_, bindings, rayQuery);
    }

    // Bind, push, dispatch. `handles` is parallel to the
    // UICustomBinding[] passed at Load — order matches. `push` /
    // `pushBytes` are forwarded as the push-uniform payload, and
    // gx/gy/gz as the dispatch dimensions. Does nothing when no pipeline
    // is loaded (pipelineHandle == 0).
    void Dispatch(const void* push, std::uint32_t pushBytes,
                  std::span<const std::uint32_t> handles,
                  std::uint32_t gx,
                  std::uint32_t gy = 1,
                  std::uint32_t gz = 1) const {
        if (pipelineHandle == 0) return;
        WebGPU::wgpuDispatchCompute(
            pipelineHandle,
            push, static_cast<std::int32_t>(pushBytes),
            // Pass null rather than the data pointer of an empty span.
            handles.empty() ? nullptr : handles.data(),
            static_cast<std::int32_t>(handles.size()),
            static_cast<std::int32_t>(gx),
            static_cast<std::int32_t>(gy),
            static_cast<std::int32_t>(gz));
    }
};
|
||||
}
|
||||
#endif // CRAFTER_GRAPHICS_WINDOW_DOM
|
||||
|
|
@ -121,6 +121,37 @@ export namespace Crafter {
|
|||
// customIndex (4) + _pad (12). Defined in the WGSL traversal
|
||||
// library; never directly read by C++.
|
||||
WebGPUBuffer<char, false> buffer;
|
||||
// GPU LBVH support — see additional/dom-webgpu.js's TLAS-build
|
||||
// pipeline.
|
||||
//
|
||||
// entryOrder: per-frame permutation array of u32, indexing into
|
||||
// `buffer` (the TLASEntry[] array). Populated by the radix-sort
|
||||
// pass to spatially-coherent Morton order, then consumed by the
|
||||
// BVH construction + traversal passes. In Stage 1 (this
|
||||
// baseline) it's the identity permutation written by
|
||||
// tlasBuildMain alongside the entries.
|
||||
WebGPUBuffer<char, false> entryOrder;
|
||||
// mortonCodes: per-instance 32-bit Morton codes computed from the
|
||||
// world-AABB centroid, used as the radix-sort key. Written by
|
||||
// tlasBuildMain.
|
||||
WebGPUBuffer<char, false> mortonCodes;
|
||||
// bvhNodes: 2N_PADDED - 1 sweep-tree BVH nodes built per frame
|
||||
// by the LBVH-build compute pass. Each node 32 bytes (aabbMin +
|
||||
// pad, aabbMax + pad). N_PADDED = 65536 (hardcoded in WGSL).
|
||||
// Internal nodes [0, N_PADDED-1); leaves [N_PADDED-1, 2*N_PADDED-1).
|
||||
// Node i's children are 2i+1, 2i+2 (implicit perfect binary
|
||||
// tree). Cap: 65536 instances per scene.
|
||||
WebGPUBuffer<char, false> bvhNodes;
|
||||
// tlasBins: dead, kept allocated as a 64-byte placeholder so the
|
||||
// existing wgpuBuildTLAS C++ signature doesn't need a churn.
|
||||
// The pre-LBVH 64-bin partition was replaced by the full BVH.
|
||||
WebGPUBuffer<char, false> tlasBins;
|
||||
// Sort ping-pong buffers for the radix sort. Each pass reads
|
||||
// from one and writes to the other, swapping role. Layout per
|
||||
// element: 1 u32 packed key = (morton16 << 16) | tlasIndex16.
|
||||
// Sized for N_PADDED.
|
||||
WebGPUBuffer<char, false> sortTempA;
|
||||
WebGPUBuffer<char, false> sortTempB;
|
||||
|
||||
std::uint32_t builtInstanceCount = 0;
|
||||
};
|
||||
|
|
@ -141,6 +172,17 @@ export namespace Crafter {
|
|||
// a fresh build (no refit) — the GPU build pass is cheap at the
|
||||
// ~10–100 instance counts the design targets. The build now includes a
|
||||
// per-frame GPU LBVH pass (see BuildTLASBuild and the bvhNodes buffer).
|
||||
//
|
||||
// BuildTLAS is now split into Upload + Build so a physics
|
||||
// compute pass (e.g. physics-tlas-transform) can run between the
|
||||
// CPU mirror upload and the GPU LBVH build. The compute pass
|
||||
// writes the per-instance transform bytes that BuildTLAS leaves
|
||||
// intact for elements flagged transformOwnedByGpu, and those
|
||||
// writes have to land before the LBVH reads them. The combined
|
||||
// BuildTLAS is kept as a convenience for callers that don't
|
||||
// interleave a compute pass (e.g. the ctor-time first build).
|
||||
static void BuildTLASUpload(WebGPUCommandEncoderRef cmd, std::uint32_t index);
|
||||
static void BuildTLASBuild(WebGPUCommandEncoderRef cmd, std::uint32_t index);
|
||||
static void BuildTLAS(WebGPUCommandEncoderRef cmd, std::uint32_t index);
|
||||
|
||||
static void Add(RenderingElement3D* e);
|
||||
|
|
|
|||
|
|
@ -165,6 +165,18 @@ export namespace Crafter {
|
|||
std::array<float,4> clipRectPx = {0.0f, 0.0f, 1e9f, 1e9f});
|
||||
void DispatchImages(GraphicsCommandBuffer cmd, std::uint32_t bufferSlot, std::uint32_t itemCount,
|
||||
std::array<float,4> clipRectPx = {0.0f, 0.0f, 1e9f, 1e9f});
|
||||
#ifdef CRAFTER_GRAPHICS_WINDOW_DOM
|
||||
// WebGPU-only overload. WebGPU bind groups can only carry one
|
||||
// texture/sampler per dispatch, so all items in `bufferSlot`
|
||||
// share the same texture (`imageSlot`) and sampler (`samplerSlot`).
|
||||
// The per-item `slots` field in ImageItem is ignored on this
|
||||
// backend. On Vulkan the bindless heap resolves per-item slots,
|
||||
// so the cross-backend path is to call the 4-arg overload above
|
||||
// on native and this 6-arg overload on DOM.
|
||||
void DispatchImages(GraphicsCommandBuffer cmd, std::uint32_t bufferSlot, std::uint32_t itemCount,
|
||||
std::uint16_t imageSlot, std::uint16_t samplerSlot,
|
||||
std::array<float,4> clipRectPx = {0.0f, 0.0f, 1e9f, 1e9f});
|
||||
#endif
|
||||
void DispatchText(GraphicsCommandBuffer cmd, std::uint32_t bufferSlot, std::uint32_t itemCount,
|
||||
std::array<float,4> clipRectPx = {0.0f, 0.0f, 1e9f, 1e9f});
|
||||
|
||||
|
|
|
|||
|
|
@ -35,6 +35,40 @@ namespace Crafter::WebGPU {
|
|||
extern "C" std::uint32_t wgpuCreateBuffer(std::int32_t byteSize);
|
||||
__attribute__((import_module("env"), import_name("wgpuWriteBuffer")))
|
||||
extern "C" void wgpuWriteBuffer(std::uint32_t handle, const void* srcPtr, std::int32_t byteSize);
|
||||
__attribute__((import_module("env"), import_name("wgpuWriteBufferRange")))
|
||||
extern "C" void wgpuWriteBufferRange(std::uint32_t handle,
|
||||
std::uint32_t dstByteOffset,
|
||||
const void* srcPtr,
|
||||
std::int32_t byteSize);
|
||||
// Kick off a GPU→CPU readback for the entire `byteSize`-byte prefix
|
||||
// of the buffer at `handle`. Returns immediately; the actual map
|
||||
// resolves asynchronously. Successive Enqueues without a Poll in
|
||||
// between are no-ops until the previous map resolves.
|
||||
//
|
||||
// `resetBytes` ≥ 0 — if non-zero, the JS bridge encodes a
|
||||
// clearBuffer over the first `resetBytes` bytes of the source
|
||||
// buffer immediately after the copy, in the same command encoder.
|
||||
// Used by Forts3D's GPU event queues to zero the atomic-add count
|
||||
// for the next frame's substeps. The reset is TIED to a successful
|
||||
// enqueue: if the enqueue was skipped (previous map still pending),
|
||||
// the reset is skipped too — so events written by substeps during
|
||||
// the missed-drain window accumulate into the next successful
|
||||
// capture instead of being silently wiped.
|
||||
__attribute__((import_module("env"), import_name("wgpuReadbackEnqueue")))
|
||||
extern "C" void wgpuReadbackEnqueue(std::uint32_t handle,
|
||||
std::int32_t byteSize,
|
||||
std::int32_t resetBytes);
|
||||
// Poll a previously-enqueued readback. Returns 1 and writes the
|
||||
// bytes into `dstPtr` if the map resolved; returns 0 otherwise.
|
||||
__attribute__((import_module("env"), import_name("wgpuReadbackPoll")))
|
||||
extern "C" std::int32_t wgpuReadbackPoll(std::uint32_t handle, void* dstPtr, std::int32_t byteSize);
|
||||
// Non-consuming readiness probe. Returns 1 if the readback has
|
||||
// resolved and the next Poll would succeed; returns 0 otherwise.
|
||||
// Used to gate multi-buffer drains (header + array) so neither side
|
||||
// gets consumed until both are ready — otherwise the consumed side's
|
||||
// data is lost while the other side waits for its map to resolve.
|
||||
__attribute__((import_module("env"), import_name("wgpuReadbackReady")))
|
||||
extern "C" std::int32_t wgpuReadbackReady(std::uint32_t handle);
|
||||
__attribute__((import_module("env"), import_name("wgpuDestroyBuffer")))
|
||||
extern "C" void wgpuDestroyBuffer(std::uint32_t handle);
|
||||
|
||||
|
|
@ -64,15 +98,26 @@ namespace Crafter::WebGPU {
|
|||
// Used by Image2DArray<RGBA8> to stack per-material albedos for one
|
||||
// multi-material scene.
|
||||
__attribute__((import_module("env"), import_name("wgpuCreateImage2DArray")))
|
||||
extern "C" std::uint32_t wgpuCreateImage2DArray(std::int32_t w, std::int32_t h, std::int32_t layerCount);
|
||||
extern "C" std::uint32_t wgpuCreateImage2DArray(std::int32_t w, std::int32_t h,
|
||||
std::int32_t layerCount, std::int32_t mipLevels);
|
||||
// Upload a single mip level for one array layer. `level` indexes into
|
||||
// the texture's mip chain (0 = base); `w` / `h` must be the dimensions
|
||||
// at that level. Callers pass each level's pixels separately — mip
|
||||
// generation is host-side.
|
||||
__attribute__((import_module("env"), import_name("wgpuWriteImage2DLayer")))
|
||||
extern "C" void wgpuWriteImage2DLayer(std::uint32_t handle, std::int32_t layer,
|
||||
extern "C" void wgpuWriteImage2DLayer(std::uint32_t handle, std::int32_t layer, std::int32_t level,
|
||||
const void* srcPtr, std::int32_t byteSize,
|
||||
std::int32_t w, std::int32_t h);
|
||||
|
||||
__attribute__((import_module("env"), import_name("wgpuCreateLinearClampSampler")))
|
||||
extern "C" std::uint32_t wgpuCreateLinearClampSampler();
|
||||
|
||||
// Linear-filtered, repeat-addressed sampler with mipmap linear-filter.
|
||||
// The usual choice for tiled material textures (woodBrace, panel, etc.)
|
||||
// which expect UV > 1.0 to wrap.
|
||||
__attribute__((import_module("env"), import_name("wgpuCreateLinearRepeatSampler")))
|
||||
extern "C" std::uint32_t wgpuCreateLinearRepeatSampler();
|
||||
|
||||
__attribute__((import_module("env"), import_name("wgpuFrameBegin")))
|
||||
extern "C" void wgpuFrameBegin();
|
||||
__attribute__((import_module("env"), import_name("wgpuFrameEnd")))
|
||||
|
|
@ -158,12 +203,56 @@ namespace Crafter::WebGPU {
|
|||
std::int32_t gx, std::int32_t gy,
|
||||
const void* handlesPtr, std::int32_t handlesCount);
|
||||
|
||||
// GPU TLAS-build dispatch. Reads the instance buffer (host-uploaded or
|
||||
// GPU-written), produces per-instance world-space AABBs + per-instance
|
||||
// transform matrices in a flat tlasBuf SSBO consumed by traceRay / rayQuery.
|
||||
// GPU TLAS-build dispatch. Two sequential compute passes:
|
||||
// 1. tlasBuildMain — per-instance world AABB + identity permutation
|
||||
// + naive Morton (overwritten in pass 2). Outputs the flat
|
||||
// tlasBuf SSBO consumed by traceRay / rayQuery.
|
||||
// 2. lbvhBuildMain — single workgroup of 1024 threads; reduces
|
||||
// scene AABB, recomputes Morton with proper normalization,
|
||||
// bitonic-sorts (morton, instance_id), writes the sorted
|
||||
// permutation into `entryOrderBufHandle`, and refits a
|
||||
// sweep-tree BVH into `bvhNodesBufHandle` bottom-up.
|
||||
// Pre-LBVH bin-build is gone; `binsBufHandle` is kept in the
|
||||
// signature as a placeholder so the C++ side doesn't churn.
|
||||
__attribute__((import_module("env"), import_name("wgpuBuildTLAS")))
|
||||
extern "C" void wgpuBuildTLAS(std::uint32_t instanceBufHandle,
|
||||
std::int32_t instanceCount,
|
||||
std::uint32_t tlasOutBufHandle);
|
||||
std::uint32_t tlasOutBufHandle,
|
||||
std::uint32_t entryOrderBufHandle,
|
||||
std::uint32_t mortonBufHandle,
|
||||
std::uint32_t binsBufHandle,
|
||||
std::uint32_t bvhNodesBufHandle,
|
||||
std::uint32_t sortTempABufHandle,
|
||||
std::uint32_t sortTempBBufHandle);
|
||||
|
||||
// ── Standalone compute pipelines ───────────────────────────────────
|
||||
//
|
||||
// Mirror of the native ComputeShader API: load a user-authored
|
||||
// compute WGSL with arbitrary @group bindings, dispatch it at any
|
||||
// point in the frame (inside or outside the UI compute pass —
|
||||
// physics ticks dispatch from update lambdas, which fire outside
|
||||
// the per-frame render encoder).
|
||||
//
|
||||
// WGSL contract:
|
||||
// @group(0) @binding(0) — uniform PushData (optional; only if
|
||||
// pushUniformSize > 0 at load).
|
||||
// @group(1+) @binding(N) — user bindings declared via
|
||||
// UICustomBinding[]. When rayQuery is
|
||||
// on, @group(1) is reserved for the RT
|
||||
// heap and user bindings start at
|
||||
// @group(2).
|
||||
__attribute__((import_module("env"), import_name("wgpuLoadComputePipeline")))
|
||||
extern "C" std::uint32_t wgpuLoadComputePipeline(
|
||||
const void* wgslPtr, std::int32_t wgslLen,
|
||||
std::int32_t pushUniformSize,
|
||||
const void* bindingsPtr, std::int32_t bindingsCount,
|
||||
std::int32_t rayQueryFlag);
|
||||
|
||||
__attribute__((import_module("env"), import_name("wgpuDispatchCompute")))
|
||||
extern "C" void wgpuDispatchCompute(
|
||||
std::uint32_t pipelineHandle,
|
||||
const void* pushPtr, std::int32_t pushBytes,
|
||||
const void* handlesPtr, std::int32_t handlesCount,
|
||||
std::int32_t gx, std::int32_t gy, std::int32_t gz);
|
||||
}
|
||||
#endif // CRAFTER_GRAPHICS_WINDOW_DOM
|
||||
|
|
|
|||
|
|
@ -78,6 +78,60 @@ export namespace Crafter {
|
|||
void FlushDevice() requires(Mapped) {
|
||||
WebGPU::wgpuWriteBuffer(handle, this->value, static_cast<std::int32_t>(size));
|
||||
}
|
||||
// Partial upload — write the bytes [srcByteOffset, srcByteOffset+byteCount)
|
||||
// of the host mirror to GPU offset `dstByteOffset`. BuildTLAS uses
|
||||
// this to leave the GPU-owned transform field of an RTInstance
|
||||
// intact (the physics-tlas-transform compute shader is its sole
|
||||
// writer) while still pushing the CPU-side metadata fields.
|
||||
void FlushDeviceRange(std::uint32_t dstByteOffset,
|
||||
std::uint32_t srcByteOffset,
|
||||
std::uint32_t byteCount) requires(Mapped) {
|
||||
const auto* base = reinterpret_cast<const char*>(this->value);
|
||||
WebGPU::wgpuWriteBufferRange(handle, dstByteOffset,
|
||||
base + srcByteOffset,
|
||||
static_cast<std::int32_t>(byteCount));
|
||||
}
|
||||
|
||||
// Push one element's worth of bytes from the host mirror to GPU.
|
||||
// Use when a single SoA slot was mutated (body construction,
|
||||
// per-instance flag flip) and a full FlushDevice would clobber
|
||||
// the GPU-side updates the sim has applied to neighboring slots.
|
||||
void FlushDeviceSlot(std::uint32_t idx) requires(Mapped) {
|
||||
constexpr std::uint32_t kStride = sizeof(T);
|
||||
const std::uint32_t off = idx * kStride;
|
||||
FlushDeviceRange(off, off, kStride);
|
||||
}
|
||||
|
||||
// Schedule a GPU→CPU readback of this buffer's entire contents.
|
||||
// Asynchronous; data isn't ready until a later PollReadback
|
||||
// returns true. Successive Enqueues without a Poll are dropped
|
||||
// — they're a no-op while the previous map is in flight.
|
||||
//
|
||||
// `resetBytes` ≥ 0 — if non-zero, the first `resetBytes` bytes
|
||||
// of THIS buffer are clearBuffer-cleared on the GPU command
|
||||
// encoder immediately after the copy, so the readback captures
|
||||
// the pre-clear bytes and the next frame's writers see zeros.
|
||||
// The reset is tied to a successful enqueue (skipped enqueue =
|
||||
// skipped reset), preserving accumulated state across missed
|
||||
// drains.
|
||||
void EnqueueReadback(std::uint32_t resetBytes = 0) {
|
||||
WebGPU::wgpuReadbackEnqueue(handle,
|
||||
static_cast<std::int32_t>(size),
|
||||
static_cast<std::int32_t>(resetBytes));
|
||||
}
|
||||
// Try to copy the readback bytes into this->value. Returns true
|
||||
// if the previous EnqueueReadback resolved and the data is now
|
||||
// mirrored into .value; false if the map is still pending.
|
||||
bool PollReadback() requires(Mapped) {
|
||||
return WebGPU::wgpuReadbackPoll(handle, this->value,
|
||||
static_cast<std::int32_t>(size)) != 0;
|
||||
}
|
||||
// Non-consuming readiness probe. Returns true if a subsequent
|
||||
// PollReadback would succeed without changing state otherwise.
|
||||
// Use to verify a sibling buffer is also ready before consuming.
|
||||
bool IsReadbackReady() const {
|
||||
return WebGPU::wgpuReadbackReady(handle) != 0;
|
||||
}
|
||||
|
||||
~WebGPUBuffer() { Clear(); }
|
||||
};
|
||||
|
|
|
|||
|
|
@ -36,6 +36,11 @@ export namespace Crafter {
|
|||
SampledTexture = 1, // sampled texture_2d<f32>, handle is a slot into heap.imageTable
|
||||
Sampler = 2, // filtering sampler, handle is a slot into heap.samplerTable
|
||||
SampledTextureArray = 3, // sampled texture_2d_array<f32>, handle is a slot into heap.imageTable
|
||||
// read-write storage SSBO (var<storage, read_write> in WGSL). Use
|
||||
// for buffers shaders need to MUTATE — e.g. physics shaders that
|
||||
// integrate node momentum, write brace stress, or output TLAS
|
||||
// instance transforms.
|
||||
BufferReadWrite = 4,
|
||||
};
|
||||
|
||||
struct UICustomBinding {
|
||||
|
|
|
|||
|
|
@ -71,5 +71,6 @@ export import :WebGPU;
|
|||
export import :WebGPUBuffer;
|
||||
export import :DescriptorHeapWebGPU;
|
||||
export import :WebGPUComputeShader;
|
||||
export import :PlainComputeShader;
|
||||
export import :ShaderBindingTableWebGPU;
|
||||
export import :PipelineRTWebGPU;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue