Crafter.Graphics/implementations/Crafter.Graphics-Mesh-WebGPU.cpp

/*
Crafter®.Graphics
Copyright (C) 2026 Catcrafts®
catcrafts.net
*/

// DOM-mode Mesh implementation: SAH BVH2 built on the host, then
// forwarded to the JS bridge which appends the four data streams
// (vertices, indices, BVH nodes, primRemap) into the global RT mesh
// heaps. The handle returned by wgpuRegisterMeshBLAS goes into
// RTInstance::accelerationStructureReference and lets the TLAS-build
// compute pass and the traversal kernel find the BLAS data later.
//
// BVH layout must stay binary-identical to the WGSL `BVHNode` struct
// declared in additional/dom-webgpu.js (rtWgslPrelude).

module;
module Crafter.Graphics:Mesh_implWebGPU;

import :Mesh;
import :WebGPU;
import Crafter.Asset;
import Crafter.Math;
import std;

using namespace Crafter;

namespace {
    // ─── BVH builder (binned SAH, 8 bins, BVH2) ────────────────────────

    // Number of SAH bins evaluated along each axis when searching for a split.
    constexpr std::uint32_t kBinCount{8u};
    // Ranges with at most this many primitives become a leaf without trying
    // to split. Larger leaves can still be emitted when no split beats the
    // leaf cost.
    constexpr std::uint32_t kMaxLeafSize{4u};
    // SAH weights: fixed cost of descending into an interior node ...
    constexpr float         kTraversalCost{1.0f};
    // ... and per-primitive cost of an intersection test.
    constexpr float         kIntersectCost{1.0f};

    // Axis-aligned bounding box. A default-constructed box is "inverted"
    // (lo = +inf, hi = -inf) so the first Extend() snaps it to its argument
    // and extending by another default box is a no-op.
    struct AABB {
        float lo[3] { std::numeric_limits<float>::infinity(),
                      std::numeric_limits<float>::infinity(),
                      std::numeric_limits<float>::infinity() };
        float hi[3] {-std::numeric_limits<float>::infinity(),
                     -std::numeric_limits<float>::infinity(),
                     -std::numeric_limits<float>::infinity() };

        // Grows the box so that it contains the point `p`.
        void Extend(const float p[3]) noexcept {
            for (int axis = 0; axis < 3; ++axis) {
                lo[axis] = std::min(lo[axis], p[axis]);
                hi[axis] = std::max(hi[axis], p[axis]);
            }
        }

        // Grows the box so that it contains the box `o`.
        void Extend(const AABB& o) noexcept {
            for (int axis = 0; axis < 3; ++axis) {
                lo[axis] = std::min(lo[axis], o.lo[axis]);
                hi[axis] = std::max(hi[axis], o.hi[axis]);
            }
        }

        // Returns the surface area of the box, or 0 when any extent is
        // negative (i.e. the box is still inverted/empty).
        float SurfaceArea() const noexcept {
            const float dx = hi[0] - lo[0];
            const float dy = hi[1] - lo[1];
            const float dz = hi[2] - lo[2];
            const bool inverted = dx < 0.0f || dy < 0.0f || dz < 0.0f;
            return inverted ? 0.0f : 2.0f * (dx*dy + dx*dz + dy*dz);
        }
    };

    // Per-triangle build record. The builder reorders these in place, so
    // `triIndex` remembers which triangle of the original index buffer the
    // record was created from.
    struct PrimRef {
        AABB box;                                  // bounds of the triangle
        float centroid[3] { 0.0f, 0.0f, 0.0f };    // centre of `box`, used for binning
        std::uint32_t triIndex = 0;                // original triangle index
    };

    // One SAH bin: the union of the boxes of the primitives whose centroid
    // fell into it, and how many there were.
    struct Bin {
        AABB box;
        std::uint32_t count{0};
    };

    struct Builder {
        std::vector<PrimRef> prims;
        std::vector<BVHNode> nodes;

        std::pair<std::uint32_t, std::uint32_t> AllocateChildren() {
            std::uint32_t l = static_cast<std::uint32_t>(nodes.size());
            nodes.emplace_back();
            nodes.emplace_back();
            return { l, l + 1 };
        }

        void BuildRecursive(std::uint32_t nodeIdx,
                            std::uint32_t first,
                            std::uint32_t count) {
            AABB bounds, centroidBounds;
            for (std::uint32_t i = 0; i < count; ++i) {
                const auto& p = prims[first + i];
                bounds.Extend(p.box);
                centroidBounds.Extend(p.centroid);
            }

            auto emitLeaf = [&] {
                BVHNode& n = nodes[nodeIdx];
                std::memcpy(n.aabbMin, bounds.lo, sizeof(bounds.lo));
                std::memcpy(n.aabbMax, bounds.hi, sizeof(bounds.hi));
                n.firstChildOrPrim = first;
                n.primCount        = count;
            };

            if (count <= kMaxLeafSize) { emitLeaf(); return; }

            int   bestAxis  = -1;
            float bestCost  = std::numeric_limits<float>::infinity();
            std::uint32_t bestBin = 0;

            float parentArea = bounds.SurfaceArea();
            if (parentArea <= 0.0f) { emitLeaf(); return; }

            for (int axis = 0; axis < 3; ++axis) {
                float extent = centroidBounds.hi[axis] - centroidBounds.lo[axis];
                if (extent <= 0.0f) continue;
                float invExtent = static_cast<float>(kBinCount) / extent;

                std::array<Bin, kBinCount> bins{};
                for (std::uint32_t i = 0; i < count; ++i) {
                    const auto& p = prims[first + i];
                    float t = (p.centroid[axis] - centroidBounds.lo[axis]) * invExtent;
                    std::uint32_t b = static_cast<std::uint32_t>(t);
                    if (b >= kBinCount) b = kBinCount - 1;
                    bins[b].box.Extend(p.box);
                    bins[b].count += 1;
                }

                std::array<AABB,         kBinCount - 1> leftBox;
                std::array<std::uint32_t,kBinCount - 1> leftCount{};
                {
                    AABB acc; std::uint32_t cnt = 0;
                    for (std::uint32_t i = 0; i < kBinCount - 1; ++i) {
                        acc.Extend(bins[i].box);
                        cnt += bins[i].count;
                        leftBox[i]   = acc;
                        leftCount[i] = cnt;
                    }
                }
                {
                    AABB acc; std::uint32_t cnt = 0;
                    for (std::int32_t i = kBinCount - 1; i >= 1; --i) {
                        acc.Extend(bins[i].box);
                        cnt += bins[i].count;
                        std::uint32_t split = static_cast<std::uint32_t>(i - 1);
                        if (leftCount[split] == 0 || cnt == 0) continue;
                        float cost = kTraversalCost
                                   + (leftBox[split].SurfaceArea() * leftCount[split]
                                      + acc.SurfaceArea() * cnt) * kIntersectCost / parentArea;
                        if (cost < bestCost) {
                            bestCost = cost;
                            bestAxis = axis;
                            bestBin  = split;
                        }
                    }
                }
            }

            float leafCost = static_cast<float>(count) * kIntersectCost;
            if (bestAxis < 0 || bestCost >= leafCost) { emitLeaf(); return; }

            float invExtent = static_cast<float>(kBinCount)
                            / (centroidBounds.hi[bestAxis] - centroidBounds.lo[bestAxis]);
            float lo = centroidBounds.lo[bestAxis];
            auto mid = std::partition(
                prims.begin() + first, prims.begin() + first + count,
                [&](const PrimRef& p) {
                    float t = (p.centroid[bestAxis] - lo) * invExtent;
                    std::uint32_t b = static_cast<std::uint32_t>(t);
                    if (b >= kBinCount) b = kBinCount - 1;
                    return b <= bestBin;
                });
            std::uint32_t leftCount =
                static_cast<std::uint32_t>(mid - (prims.begin() + first));
            if (leftCount == 0 || leftCount == count) { emitLeaf(); return; }

            auto [leftIdx, rightIdx] = AllocateChildren();
            {
                BVHNode& n = nodes[nodeIdx];
                std::memcpy(n.aabbMin, bounds.lo, sizeof(bounds.lo));
                std::memcpy(n.aabbMax, bounds.hi, sizeof(bounds.hi));
                n.firstChildOrPrim = leftIdx;
                n.primCount        = 0;
            }
            BuildRecursive(leftIdx,  first,             leftCount);
            BuildRecursive(rightIdx, first + leftCount, count - leftCount);
        }

        void Build(std::span<const Vector<float, 3, 3>> vertices,
                   std::span<const std::uint32_t>       indices) {
            std::uint32_t triCount = static_cast<std::uint32_t>(indices.size()) / 3;
            prims.resize(triCount);
            for (std::uint32_t i = 0; i < triCount; ++i) {
                std::uint32_t i0 = indices[i*3 + 0];
                std::uint32_t i1 = indices[i*3 + 1];
                std::uint32_t i2 = indices[i*3 + 2];
                const auto& v0 = vertices[i0];
                const auto& v1 = vertices[i1];
                const auto& v2 = vertices[i2];
                float p0[3] { v0.v[0], v0.v[1], v0.v[2] };
                float p1[3] { v1.v[0], v1.v[1], v1.v[2] };
                float p2[3] { v2.v[0], v2.v[1], v2.v[2] };
                auto& pr = prims[i];
                pr.box.Extend(p0);
                pr.box.Extend(p1);
                pr.box.Extend(p2);
                pr.centroid[0] = (pr.box.lo[0] + pr.box.hi[0]) * 0.5f;
                pr.centroid[1] = (pr.box.lo[1] + pr.box.hi[1]) * 0.5f;
                pr.centroid[2] = (pr.box.lo[2] + pr.box.hi[2]) * 0.5f;
                pr.triIndex = i;
            }
            nodes.reserve(triCount * 2);
            nodes.emplace_back();
            BuildRecursive(0, 0, triCount);
        }
    };
}

namespace {
    // Common tail of both Mesh::Build overloads: records the mesh counts,
    // builds the BVH on the host, and hands everything to the JS bridge via
    // wgpuRegisterMeshBLAS, storing the returned handle in mesh.blasAddr.
    //
    // attribsBytes is empty for positions-only meshes; the JS bridge skips
    // the attribs-heap append in that case.
    void BuildBVHAndRegister(Mesh& mesh,
                             std::span<const Vector<float, 3, 3>> vertices,
                             std::span<const std::uint32_t>       indices,
                             std::span<const std::byte>           attribsBytes) {
        mesh.vertexCount   = static_cast<std::uint32_t>(vertices.size());
        mesh.triangleCount = static_cast<std::uint32_t>(indices.size()) / 3;

        Builder bvh;
        bvh.Build(vertices, indices);

        // The builder reorders its primitive list; primRemap[slot] is the
        // original triangle index now sitting at that slot.
        std::vector<std::uint32_t> primRemap(mesh.triangleCount);
        std::transform(bvh.prims.begin(),
                       bvh.prims.begin() + mesh.triangleCount,
                       primRemap.begin(),
                       [](const PrimRef& ref) { return ref.triIndex; });

        // The bridge takes element/byte counts as 32-bit signed integers.
        const auto asCount = [](std::size_t n) {
            return static_cast<std::int32_t>(n);
        };

        // The root node's bounds double as the bounds of the whole mesh.
        const BVHNode& root = bvh.nodes.front();
        mesh.blasAddr = WebGPU::wgpuRegisterMeshBLAS(
            root.aabbMin[0], root.aabbMin[1], root.aabbMin[2],
            root.aabbMax[0], root.aabbMax[1], root.aabbMax[2],
            vertices.data(),     asCount(vertices.size()),
            indices.data(),      asCount(indices.size()),
            bvh.nodes.data(),    asCount(bvh.nodes.size()),
            primRemap.data(),    asCount(primRemap.size()),
            attribsBytes.data(), asCount(attribsBytes.size()));
    }
}

// Positions-only build: constructs the BVH for the given vertex/index data
// and registers it with no per-vertex attribute payload. The command encoder
// argument is unused in this implementation.
void Mesh::Build(std::span<Vector<float, 3, 3>> vertices,
                 std::span<std::uint32_t>       indices,
                 WebGPUCommandEncoderRef        /*cmd*/) {
    const std::span<const std::byte> noAttribs{};
    BuildBVHAndRegister(*this, vertices, indices, noAttribs);
}

// Compressed-asset build: decompresses the asset's regions into temporary
// host buffers (vertices, indices, attribute bytes), then builds the BVH and
// registers the mesh. The command encoder argument is unused in this
// implementation.
void Mesh::Build(const CompressedMeshAsset& asset,
                 WebGPUCommandEncoderRef    /*cmd*/) {
    std::vector<Vector<float, 3, 3>> vertices(asset.vertexCount);
    std::vector<std::uint32_t>       indices(asset.indexCount);
    // dataCount * dataStride bytes; widened first so the product is computed
    // in size_t.
    std::vector<std::byte>           dataBytes(
        static_cast<std::size_t>(asset.dataCount) * asset.dataStride);

    // CompressedBlob always carries 3 regions for MeshAsset (the data region
    // can have decompressedSize=0). DecompressCPU validates output sizes
    // against region sizes, so the empty-data path needs the empty span.
    std::array<std::span<std::byte>, 3> outputs = {
        std::as_writable_bytes(std::span(vertices)),
        std::as_writable_bytes(std::span(indices)),
        std::span<std::byte>(dataBytes),
    };

    // span::first(n) is undefined for n > size(), so never ask for more
    // output slots than exist even if a malformed blob reports extra regions.
    const std::size_t regionCount =
        std::min<std::size_t>(asset.blob.regions.size(), outputs.size());
    Compression::DecompressCPU(asset.blob,
        std::span(outputs).first(regionCount));

    BuildBVHAndRegister(*this, vertices, indices, std::span(dataBytes));
}