/*
Crafter®.Graphics
Copyright (C) 2026 Catcrafts®
catcrafts.net
*/

// DOM-mode TLAS upkeep. BuildTLAS is split in two phases so a physics
// compute pass can run between them:
//   - BuildTLASUpload mirrors the CPU-side RTInstance array into the
//     host-visible instance buffer (with partial-write semantics that
//     preserve the transform bytes for elements flagged
//     transformOwnedByGpu, see notes in the body) and uploads the
//     metadata buffer.
//   - BuildTLASBuild dispatches the JS-side TLAS-build compute pass —
//     which consults the per-BLAS records published at Mesh::Build()
//     time to produce world-space AABBs and inverse transforms in the
//     format `traceRay` / `rayQuery` consume.
// The combined BuildTLAS calls both back-to-back; callers that want to
// interleave a physics tlas-transform compute pass (which writes the
// transform bytes BuildTLASUpload leaves intact) call Upload + their
// compute pass + Build manually.

module;
module Crafter.Graphics:RenderingElement3D_implWebGPU;

import :RenderingElement3D;
import :Mesh;
import :WebGPU;
import :WebGPUBuffer;
import std;

using namespace Crafter;

// Registry of every element currently registered through Add(). Slot i of
// this vector is mirrored into instance slot i by BuildTLASUpload, and each
// element caches its own slot in indexInElements. Remove() uses swap-and-pop,
// so the order of this list is not stable across removals.
std::vector<RenderingElement3D*> RenderingElement3D::elements;

/// Registers `e` in the global element list.
///
/// The element is appended at the back and its cached slot
/// (`indexInElements`) is set to that position so Remove() can locate it
/// without searching.
///
/// @param e  Element to register; must be non-null.
void RenderingElement3D::Add(RenderingElement3D* e) {
    // A new element always occupies the next free slot at the back.
    const std::size_t slot = elements.size();
    e->indexInElements = static_cast<std::uint32_t>(slot);
    elements.emplace_back(e);
}

/// Unregisters `e` from the global element list using swap-and-pop.
///
/// The last element is moved into the vacated slot (and its cached
/// `indexInElements` updated), so removal is O(1) but does not preserve
/// list order. Afterwards `e->indexInElements` holds the "not registered"
/// sentinel (`uint32_t` max). Calling Remove on an element whose slot is
/// already the sentinel is a no-op.
///
/// @param e  Element to unregister; must be non-null.
void RenderingElement3D::Remove(RenderingElement3D* e) {
    constexpr std::uint32_t kNotRegistered = std::numeric_limits<std::uint32_t>::max();

    const std::uint32_t idx = e->indexInElements;
    if (idx == kNotRegistered) return;

    // Defensive: only trust the cached slot if it really refers to `e`.
    // A stale index (e.g. on an object that was copied from a registered
    // element) would otherwise index out of range or evict a different,
    // still-live element from the list.
    if (idx >= elements.size() || elements[idx] != e) {
        e->indexInElements = kNotRegistered;
        return;
    }

    const std::uint32_t last = static_cast<std::uint32_t>(elements.size() - 1);
    if (idx != last) {
        // Fill the hole with the tail element and fix up its cached slot.
        elements[idx] = elements[last];
        elements[idx]->indexInElements = idx;
    }
    elements.pop_back();
    e->indexInElements = kNotRegistered;
}

/// Phase 1 of the TLAS rebuild: mirrors the CPU-side element list into the
/// TLAS's instance and metadata buffers and pushes them to the device.
///
/// - With no registered elements this only resets `builtInstanceCount`
///   to 0; nothing is allocated or uploaded.
/// - On the first non-empty call every TLAS-side buffer is allocated at a
///   fixed size; later calls never resize them.
/// - For elements flagged `transformOwnedByGpu` only the non-transform
///   fields are written and uploaded, so the transform bytes already on
///   the device are left untouched.
/// - At most `kNPadded` elements are mirrored; anything beyond the fixed
///   capacity is skipped instead of being written past the mirrors.
///
/// The command-encoder parameter is unused by this phase.
///
/// @param index  Which entry of `tlases` to update.
void RenderingElement3D::BuildTLASUpload(WebGPUCommandEncoderRef /*cmd*/, std::uint32_t index) {
    auto& tlas = tlases[index];
    const std::uint32_t primitiveCount = static_cast<std::uint32_t>(elements.size());
    if (primitiveCount == 0) {
        tlas.builtInstanceCount = 0;
        return;
    }

    constexpr std::uint32_t kNPadded   = 65536u;     // size for instance / metadata mirrors
    constexpr std::uint32_t kLbvhMax   = 16384u;     // matches N_PADDED in lbvhBuildWgsl
    constexpr std::uint32_t kNodeCount = 2u * kNPadded - 1u;

    // ALL TLAS-side GPU buffers get allocated ONCE and never resized.
    // The LBVH-build shader takes the real instance count via a uniform
    // (lbvhPc.nReal) instead of arrayLength(&entries), so the
    // tlas.buffer / entryOrder / mortonCodes don't need to grow when
    // the application's element count changes.
    //
    // Why this matters: an earlier version resized these per-frame on
    // primitiveCount change. The destroy+recreate cycle on the GPU
    // buffer caused subtle mid-game flicker as soon as any element was
    // added (e.g. firing a projectile) — fort braces would appear to
    // briefly vanish in patterns deterministic on the projectile's
    // angle. Suspected driver-level memory recycling without proper
    // zero-init; the fixed-size allocation sidesteps it entirely.
    if (tlas.instanceBuffer.handle == 0) {
        tlas.instanceBuffer.Resize(kNPadded);
        tlas.metadataBuffer.Resize(kNPadded);
        tlas.bvhNodes.Resize(kNodeCount * 32u);
        tlas.sortTempA.Resize(kNPadded * 4u);
        tlas.sortTempB.Resize(kNPadded * 4u);
        tlas.tlasBins.Resize(64 * 32);
        // TLAS-entry / order / morton-code buffers: sized for the LBVH
        // cap (16384). lbvhBuildMain iterates `lbvhPc.nReal` real
        // entries; the remainder stays zero / sentinel. Keep these
        // stable across element-count changes so the renderer's bind
        // group references the same buffer handle every frame.
        tlas.buffer.Resize(kLbvhMax * 144u);
        tlas.entryOrder.Resize(kLbvhMax * 4u);
        tlas.mortonCodes.Resize(kLbvhMax * 4u);
    }

    // NB: nothing is resized after the one-time allocation above — not
    // here and not in BuildTLASBuild. Resize destroys + recreates the GPU
    // resource (and the JS-side handle), while the rayQuery dispatches
    // that run between BuildTLASUpload and BuildTLASBuild
    // (projectile-collide, splash, builder-pick) still hold the previous
    // frame's TLAS in rtState.current{Tlas,EntryOrder,Bvh}. Keeping every
    // handle stable means those references never point at a destroyed
    // buffer when the element count changes (e.g. on every projectile
    // fire).

    // The mirrors hold a fixed number of slots, so never write past them.
    // Elements beyond the capacity are left out of this upload.
    // NOTE(review): assumes Resize(kNPadded) on the instance / metadata
    // mirrors takes an element count — confirm against WebGPUBuffer.
    const std::uint32_t uploadCount = std::min(primitiveCount, kNPadded);

    for (std::uint32_t i = 0; i < uploadCount; ++i) {
        auto& dst = tlas.instanceBuffer.value[i];
        const auto& src = elements[i]->instance;
        if (elements[i]->transformOwnedByGpu) {
            // Preserve whatever the GPU compute shader most recently
            // wrote into dst.transform. Update only the non-transform
            // fields.
            dst.instanceCustomIndex                    = src.instanceCustomIndex;
            dst.mask                                   = src.mask;
            dst.instanceShaderBindingTableRecordOffset = src.instanceShaderBindingTableRecordOffset;
            dst.flags                                  = src.flags;
            dst.accelerationStructureReference         = src.accelerationStructureReference;
        } else {
            dst = src;
        }
        tlas.metadataBuffer.value[i] = elements[i]->userMetadata;
    }

    // Upload the instance buffer with partial-write semantics: for runs
    // of CPU-driven elements (transformOwnedByGpu=false) we push the
    // whole 64-byte struct in one writeBuffer call; for GPU-driven runs
    // we push only the trailing 16 metadata bytes per element, leaving
    // the transform field intact for the physics-tlas-transform compute
    // shader to update. The two arms below produce identical GPU state
    // when every element is CPU-driven — this is a no-op refactor until
    // 3DForts flips its physics elements to transformOwnedByGpu=true.
    constexpr std::uint32_t kInstSize      = sizeof(RTInstance);          // 64
    constexpr std::uint32_t kTransformSize = sizeof(RTTransformMatrix);   // 48
    static_assert(kInstSize > kTransformSize,
                  "RTInstance must carry non-transform bytes after its transform");
    constexpr std::uint32_t kMetaSize      = kInstSize - kTransformSize;  // 16

    // Walk the elements as maximal runs sharing the same ownership flag.
    // At i == uploadCount the "current" flag is forced to differ from the
    // open run so that the final run is always flushed.
    std::uint32_t runStart = 0;
    bool runOwned = elements[0]->transformOwnedByGpu;
    for (std::uint32_t i = 1; i <= uploadCount; ++i) {
        const bool atEnd     = (i == uploadCount);
        const bool currOwned = atEnd ? !runOwned : elements[i]->transformOwnedByGpu;
        if (currOwned == runOwned && !atEnd) continue;

        if (runOwned) {
            // GPU-driven run — metadata only, per element. Cannot batch
            // because the metadata bytes are non-contiguous in the
            // instance buffer (one 16-byte chunk per 64-byte slot).
            for (std::uint32_t j = runStart; j < i; ++j) {
                const std::uint32_t off = j * kInstSize + kTransformSize;
                tlas.instanceBuffer.FlushDeviceRange(off, off, kMetaSize);
            }
        } else {
            // CPU-driven run — one contiguous writeBuffer.
            const std::uint32_t startOff = runStart * kInstSize;
            const std::uint32_t bytes    = (i - runStart) * kInstSize;
            tlas.instanceBuffer.FlushDeviceRange(startOff, startOff, bytes);
        }
        runStart = i;
        runOwned = currOwned;
    }

    tlas.metadataBuffer.FlushDevice();
}

/// Phase 2 of the TLAS rebuild: hands the instance buffer and all scratch /
/// output buffers to `WebGPU::wgpuBuildTLAS` and records how many instances
/// the resulting TLAS was built from.
///
/// If there is nothing to build — no registered elements, or the TLAS
/// buffers have not been allocated yet because no non-empty
/// BuildTLASUpload has run for this TLAS — `builtInstanceCount` is set to 0
/// and no build is dispatched.
///
/// The command-encoder parameter is unused by this phase.
///
/// @param index  Which entry of `tlases` to build.
void RenderingElement3D::BuildTLASBuild(WebGPUCommandEncoderRef /*cmd*/, std::uint32_t index) {
    auto& tlas = tlases[index];
    const std::uint32_t primitiveCount = static_cast<std::uint32_t>(elements.size());

    // Do not rely on the upload phase having observed the same element
    // list: elements may be added or removed between the two phases.
    //  - Empty list: make sure the published count is 0 even if the
    //    upload phase ran while elements were still registered.
    //  - Unallocated buffers (handle == 0): the upload phase allocates
    //    them on its first non-empty call; if the first elements were
    //    registered after it ran, every handle is still 0 and must not be
    //    passed to the build.
    if (primitiveCount == 0 || tlas.instanceBuffer.handle == 0) {
        tlas.builtInstanceCount = 0;
        return;
    }

    // No per-count Resize. tlas.buffer / entryOrder / mortonCodes were
    // allocated at kLbvhMax in BuildTLASUpload's first call and stay
    // that size. The LBVH shader reads the real count from a uniform
    // (lbvhPc.nReal) wgpuBuildTLAS writes each call.

    WebGPU::wgpuBuildTLAS(tlas.instanceBuffer.handle,
                          static_cast<std::int32_t>(primitiveCount),
                          tlas.buffer.handle,
                          tlas.entryOrder.handle,
                          tlas.mortonCodes.handle,
                          tlas.tlasBins.handle,
                          tlas.bvhNodes.handle,
                          tlas.sortTempA.handle,
                          tlas.sortTempB.handle);

    tlas.builtInstanceCount = primitiveCount;
}

/// Convenience wrapper that runs both TLAS phases back-to-back for the TLAS
/// at `index`: BuildTLASUpload followed immediately by BuildTLASBuild, with
/// the same arguments. Callers that need to run their own work between the
/// two phases call them individually instead.
void RenderingElement3D::BuildTLAS(WebGPUCommandEncoderRef cmd, std::uint32_t index) {
    BuildTLASUpload(cmd, index);
    BuildTLASBuild(cmd, index);
}