/*
Crafter®.Graphics
Copyright (C) 2026 Catcrafts®
catcrafts.net
*/

// JS bridge declarations for the DOM-mode WebGPU backend. Each function
// corresponds to one entry in `additional/dom-webgpu.js`. Handles are
// opaque uint32 cookies into the JS-side handle tables.

export module Crafter.Graphics:WebGPU;
#ifdef CRAFTER_GRAPHICS_WINDOW_DOM
import std;

// Exported handle aliases for the DOM-mode WebGPU backend. Every alias is
// a plain std::uint32_t, so they document intent only: the compiler will
// not reject a handle of one kind passed where another kind is expected.
export namespace Crafter {
    // Handle for a buffer object living on the JS side of the bridge.
    using WebGPUBufferRef         = std::uint32_t;
    // Handle for a texture object living on the JS side of the bridge.
    using WebGPUTextureRef        = std::uint32_t;
    // Handle for a sampler object living on the JS side of the bridge.
    using WebGPUSamplerRef        = std::uint32_t;
    using WebGPUCommandEncoderRef = std::uint32_t; // unused as a real handle; just a marker type for portability
}

// Host imports for the DOM-mode WebGPU backend. Every function here is
// implemented by the JS bridge and imported from the wasm `env` module
// under its own C symbol name.
//
// Declaration shape: `extern "C"` comes first and the import attributes
// follow it. A GNU attribute in front of a linkage-specification is
// outside the C++ grammar (`extern string-literal declaration`); Clang
// tolerates that ordering, but stricter front ends reject it. With the
// attribute placed after `extern "C"` it is an ordinary part of the
// declaration's specifier sequence.
//
// Handles are passed as raw std::uint32_t; byte sizes, counts and
// dimensions are std::int32_t.
namespace Crafter::WebGPU {
    // ─── canvas / surface queries and bridge start-up ───────────────────
    // NOTE(review): the difference between the "canvas" and "surface"
    // sizes (and their units) is defined by the JS bridge — confirm there
    // before relying on either pair.
    extern "C" __attribute__((import_module("env"), import_name("wgpuGetCanvasWidth")))
    std::int32_t wgpuGetCanvasWidth();
    extern "C" __attribute__((import_module("env"), import_name("wgpuGetCanvasHeight")))
    std::int32_t wgpuGetCanvasHeight();
    extern "C" __attribute__((import_module("env"), import_name("wgpuSurfaceWidth")))
    std::int32_t wgpuSurfaceWidth();
    extern "C" __attribute__((import_module("env"), import_name("wgpuSurfaceHeight")))
    std::int32_t wgpuSurfaceHeight();
    extern "C" __attribute__((import_module("env"), import_name("wgpuInit")))
    void wgpuInit();

    // ─── buffers ────────────────────────────────────────────────────────
    // Returns the handle of a new `byteSize`-byte buffer.
    // NOTE(review): usage flags and the failure value are chosen by the JS
    // bridge — confirm there.
    extern "C" __attribute__((import_module("env"), import_name("wgpuCreateBuffer")))
    std::uint32_t wgpuCreateBuffer(std::int32_t byteSize);
    // Uploads `byteSize` bytes starting at `srcPtr` into the buffer.
    // NOTE(review): presumably written at destination offset 0 — confirm.
    extern "C" __attribute__((import_module("env"), import_name("wgpuWriteBuffer")))
    void wgpuWriteBuffer(std::uint32_t handle, const void* srcPtr, std::int32_t byteSize);
    // Same as wgpuWriteBuffer but with an explicit destination byte
    // offset. Note that the offset is unsigned while the size is signed.
    extern "C" __attribute__((import_module("env"), import_name("wgpuWriteBufferRange")))
    void wgpuWriteBufferRange(
        std::uint32_t handle,
        std::uint32_t dstByteOffset,
        const void* srcPtr,
        std::int32_t byteSize);
    // Kick off a GPU→CPU readback for the entire `byteSize`-byte prefix
    // of the buffer at `handle`. Returns immediately; the actual map
    // resolves asynchronously. Successive Enqueues without a Poll in
    // between are no-ops until the previous map resolves.
    //
    // `resetBytes` ≥ 0 — if non-zero, the JS bridge encodes a
    // clearBuffer over the first `resetBytes` bytes of the source
    // buffer immediately after the copy, in the same command encoder.
    // Used by Forts3D's GPU event queues to zero the atomic-add count
    // for the next frame's substeps. The reset is TIED to a successful
    // enqueue: if the enqueue was skipped (previous map still pending),
    // the reset is skipped too — so events written by substeps during
    // the missed-drain window accumulate into the next successful
    // capture instead of being silently wiped.
    extern "C" __attribute__((import_module("env"), import_name("wgpuReadbackEnqueue")))
    void wgpuReadbackEnqueue(
        std::uint32_t handle,
        std::int32_t byteSize,
        std::int32_t resetBytes);
    // Poll a previously-enqueued readback. Returns 1 and writes the
    // bytes into `dstPtr` if the map resolved; returns 0 otherwise.
    extern "C" __attribute__((import_module("env"), import_name("wgpuReadbackPoll")))
    std::int32_t wgpuReadbackPoll(std::uint32_t handle, void* dstPtr, std::int32_t byteSize);
    // Non-consuming readiness probe. Returns 1 if the readback has
    // resolved and the next Poll would succeed; returns 0 otherwise.
    // Used to gate multi-buffer drains (header + array) so neither side
    // gets consumed until both are ready — otherwise the consumed side's
    // data is lost while the other side waits for its map to resolve.
    extern "C" __attribute__((import_module("env"), import_name("wgpuReadbackReady")))
    std::int32_t wgpuReadbackReady(std::uint32_t handle);
    extern "C" __attribute__((import_module("env"), import_name("wgpuDestroyBuffer")))
    void wgpuDestroyBuffer(std::uint32_t handle);

    // ─── atlas texture ──────────────────────────────────────────────────
    // The atlas path uses r8unorm texels and sub-region writes.
    extern "C" __attribute__((import_module("env"), import_name("wgpuCreateAtlasTexture")))
    std::uint32_t wgpuCreateAtlasTexture(std::int32_t w, std::int32_t h);
    // Copies a `copyW` × `copyH` region to (`dstX`, `dstY`) in the atlas.
    // The source image at `srcPtr` is `srcW` × `srcH` with
    // `srcBytesPerRow` bytes between consecutive rows.
    // NOTE(review): the copy presumably starts at the source's top-left
    // corner, since no source offset is passed — confirm.
    extern "C" __attribute__((import_module("env"), import_name("wgpuWriteAtlasRegion")))
    void wgpuWriteAtlasRegion(
        std::uint32_t handle, const void* srcPtr,
        std::int32_t srcW, std::int32_t srcH,
        std::int32_t srcBytesPerRow,
        std::int32_t dstX, std::int32_t dstY,
        std::int32_t copyW, std::int32_t copyH);
    // NOTE(review): presumably valid for every texture handle created in
    // this file (atlas, Image2D, Image2DArray) — confirm.
    extern "C" __attribute__((import_module("env"), import_name("wgpuDestroyTexture")))
    void wgpuDestroyTexture(std::uint32_t handle);

    // ─── 2D images ──────────────────────────────────────────────────────
    // General-purpose rgba8unorm 2D texture for material albedo etc.
    // Separate from the atlas path because atlas uses r8unorm + sub-region
    // writes; this one takes the whole image in one shot.
    extern "C" __attribute__((import_module("env"), import_name("wgpuCreateImage2D")))
    std::uint32_t wgpuCreateImage2D(std::int32_t w, std::int32_t h);
    extern "C" __attribute__((import_module("env"), import_name("wgpuWriteImage2D")))
    void wgpuWriteImage2D(
        std::uint32_t handle, const void* srcPtr,
        std::int32_t byteSize,
        std::int32_t w, std::int32_t h);

    // 2D texture array — `layerCount` rgba8unorm layers of identical (w × h).
    // Sampled via `texture_2d_array<f32>` in WGSL (UICustomBindingKind 3).
    // Used by Image2DArray<RGBA8> to stack per-material albedos for one
    // multi-material scene.
    // NOTE(review): `mipLevels` is presumably the length of the mip chain
    // that wgpuWriteImage2DLayer's `level` indexes into — confirm.
    extern "C" __attribute__((import_module("env"), import_name("wgpuCreateImage2DArray")))
    std::uint32_t wgpuCreateImage2DArray(
        std::int32_t w, std::int32_t h,
        std::int32_t layerCount, std::int32_t mipLevels);
    // Upload a single mip level for one array layer. `level` indexes into
    // the texture's mip chain (0 = base); `w` / `h` must be the dimensions
    // at that level. Callers pass each level's pixels separately — mip
    // generation is host-side.
    extern "C" __attribute__((import_module("env"), import_name("wgpuWriteImage2DLayer")))
    void wgpuWriteImage2DLayer(
        std::uint32_t handle, std::int32_t layer, std::int32_t level,
        const void* srcPtr, std::int32_t byteSize,
        std::int32_t w, std::int32_t h);

    // ─── samplers ───────────────────────────────────────────────────────
    extern "C" __attribute__((import_module("env"), import_name("wgpuCreateLinearClampSampler")))
    std::uint32_t wgpuCreateLinearClampSampler();

    // Linear-filtered, repeat-addressed sampler with mipmap linear-filter.
    // The usual choice for tiled material textures (woodBrace, panel, etc.)
    // which expect UV > 1.0 to wrap.
    extern "C" __attribute__((import_module("env"), import_name("wgpuCreateLinearRepeatSampler")))
    std::uint32_t wgpuCreateLinearRepeatSampler();

    // ─── frame bracketing ───────────────────────────────────────────────
    extern "C" __attribute__((import_module("env"), import_name("wgpuFrameBegin")))
    void wgpuFrameBegin();
    extern "C" __attribute__((import_module("env"), import_name("wgpuFrameEnd")))
    void wgpuFrameEnd();

    // ─── built-in UI dispatches ─────────────────────────────────────────
    // NOTE(review): `gx` / `gy` look like workgroup counts and `headerPtr`
    // like a per-dispatch header struct; the layout and units are defined
    // by the JS bridge and its shaders — confirm there.
    extern "C" __attribute__((import_module("env"), import_name("wgpuDispatchQuads")))
    void wgpuDispatchQuads(
        std::uint32_t itemsHandle, const void* headerPtr,
        std::int32_t gx, std::int32_t gy);
    extern "C" __attribute__((import_module("env"), import_name("wgpuDispatchCircles")))
    void wgpuDispatchCircles(
        std::uint32_t itemsHandle, const void* headerPtr,
        std::int32_t gx, std::int32_t gy);
    extern "C" __attribute__((import_module("env"), import_name("wgpuDispatchImages")))
    void wgpuDispatchImages(
        std::uint32_t itemsHandle, const void* headerPtr,
        std::int32_t gx, std::int32_t gy,
        std::uint32_t texHandle, std::uint32_t sampHandle);
    extern "C" __attribute__((import_module("env"), import_name("wgpuDispatchText")))
    void wgpuDispatchText(
        std::uint32_t itemsHandle, const void* headerPtr,
        std::int32_t gx, std::int32_t gy,
        std::uint32_t atlasHandle, std::uint32_t sampHandle);

    // ─── custom user-authored compute shaders ───────────────────────────
    // rayQueryFlag = 1 swaps group(1) from the UI ping-pong pair to the RT
    // data heaps (TLAS, BVH, meshRecs, verts, idx, primRemap, outImage) and
    // prepends a WGSL prelude exposing the rayQuery* API. Shaders that set
    // this MUST NOT declare their own @group(1) bindings.
    extern "C" __attribute__((import_module("env"), import_name("wgpuLoadCustomShader")))
    std::uint32_t wgpuLoadCustomShader(
        const void* wgslPtr, std::int32_t wgslLen,
        const void* bindingsPtr, std::int32_t bindingsCount,
        std::int32_t rayQueryFlag);
    // NOTE(review): `handlesPtr` presumably lists one resource handle per
    // binding declared at load time, in declaration order (as documented
    // for wgpuDispatchRT) — confirm.
    extern "C" __attribute__((import_module("env"), import_name("wgpuDispatchCustom")))
    void wgpuDispatchCustom(
        std::uint32_t pipelineHandle,
        const void* pushPtr, std::int32_t pushBytes,
        const void* handlesPtr, std::int32_t handlesCount,
        std::int32_t gx, std::int32_t gy, std::int32_t gz);

    // ─── software raytracing ───────────────────────────────────────────
    //
    // Mesh::Build forwards vertex / index / BVH-node / primRemap arrays
    // to the JS bridge, which queue.writeBuffers them into the global
    // RT mesh heaps (growing if needed) and records the per-mesh offsets
    // under a freshly-allocated u32 handle. The handle is what user code
    // stores in RTInstance::accelerationStructureReference; the WebGPU
    // TLAS-build compute shader resolves it back to root AABB + heap
    // offsets at dispatch time. Returns 0 on failure.
    // The optional `attribsPtr` / `attribsByteCount` carry per-vertex
    // attribute payload (normals, UVs, etc. — layout is example-defined)
    // that gets appended to a global attribs heap and exposed to RT
    // closest-hit shaders as `vertexAttribs : array<u32>` at
    // @group(1) @binding(7). Pass (nullptr, 0) for positions-only meshes.
    // `geomType` selects the primitive kind: 0 = triangles (the
    // verticesPtr/indicesPtr streams), 1 = AABBs (VK_GEOMETRY_TYPE_AABBS) —
    // then verticesPtr holds 2 vec3 per primitive [min, max], indexCount is
    // 0, and an intersection shader supplies the hit. `opaqueFlag` is the
    // geometry's opaque bit (0 lets any-hit run). `primCount` is the
    // triangle / AABB primitive count.
    // NOTE(review): min*/max* are presumably the mesh's root AABB in
    // object space — confirm against Mesh::Build.
    extern "C" __attribute__((import_module("env"), import_name("wgpuRegisterMeshBLAS")))
    std::uint32_t wgpuRegisterMeshBLAS(
        float minX, float minY, float minZ,
        float maxX, float maxY, float maxZ,
        const void* verticesPtr, std::int32_t vertexCount,
        const void* indicesPtr,  std::int32_t indexCount,
        const void* bvhNodesPtr, std::int32_t bvhNodeCount,
        const void* primRemapPtr, std::int32_t primRemapCount,
        const void* attribsPtr,  std::int32_t attribsByteCount,
        std::int32_t geomType, std::int32_t opaqueFlag, std::int32_t primCount);

    // RT pipeline build. The library composes WGSL by concatenating the
    // traversal library, generated hit-group switches, and the user-
    // supplied raygen / miss / closesthit / anyhit bodies. `bindings` is
    // UICustomBinding-shaped (8 bytes each) declaring extra @group(2)+
    // resources the user's closest-hit / miss / raygen WGSL references.
    // Pass (nullptr, 0) for a pipeline with no user-declared bindings.
    // Returns an opaque pipeline handle.
    extern "C" __attribute__((import_module("env"), import_name("wgpuLoadRTPipeline")))
    std::uint32_t wgpuLoadRTPipeline(
        const void* wgslPtr, std::int32_t wgslLen,
        const void* bindingsPtr, std::int32_t bindingsCount);

    // Dispatch a TraceRays-equivalent pass: the RT pipeline is dispatched
    // over a (gx, gy) tile grid; the library writes the push data (camera,
    // payload, etc. — opaque) into a uniform ring buffer, attaches the TLAS
    // + global mesh heap, and runs one workgroup per 8x8 screen tile.
    // `handles[]` carries resolved WebGPU resource handles for every user
    // binding declared at pipeline-load time, in the same order. Pass
    // (nullptr, 0) for a pipeline with no user bindings.
    // NOTE(review): `instanceCount` presumably matches the count given to
    // wgpuBuildTLAS for `tlasBufHandle`, and `maxDepth` presumably bounds
    // nested trace calls — confirm both against the JS bridge.
    extern "C" __attribute__((import_module("env"), import_name("wgpuDispatchRT")))
    void wgpuDispatchRT(
        std::uint32_t pipelineHandle,
        const void* pushPtr, std::int32_t pushBytes,
        std::uint32_t tlasBufHandle,
        std::int32_t  instanceCount,
        std::int32_t  gx, std::int32_t gy,
        const void* handlesPtr, std::int32_t handlesCount,
        std::int32_t maxDepth);

    // GPU TLAS-build dispatch. Two sequential compute passes:
    //   1. tlasBuildMain — per-instance world AABB + identity permutation
    //      + naive Morton (overwritten in pass 2). Outputs the flat
    //      tlasBuf SSBO consumed by traceRay / rayQuery.
    //   2. lbvhBuildMain — single workgroup of 1024 threads; reduces
    //      scene AABB, recomputes Morton with proper normalization,
    //      bitonic-sorts (morton, instance_id), writes the sorted
    //      permutation into `entryOrderBufHandle`, and refits a
    //      sweep-tree BVH into `bvhNodesBufHandle` bottom-up.
    // Pre-LBVH bin-build is gone; `binsBufHandle` is kept in the
    // signature as a placeholder so the C++ side doesn't churn.
    // NOTE(review): `mortonBufHandle` and the two `sortTemp*` buffers look
    // like scratch storage for the Morton codes and the bitonic sort;
    // their required sizes are not stated here — confirm.
    extern "C" __attribute__((import_module("env"), import_name("wgpuBuildTLAS")))
    void wgpuBuildTLAS(
        std::uint32_t instanceBufHandle,
        std::int32_t  instanceCount,
        std::uint32_t tlasOutBufHandle,
        std::uint32_t entryOrderBufHandle,
        std::uint32_t mortonBufHandle,
        std::uint32_t binsBufHandle,
        std::uint32_t bvhNodesBufHandle,
        std::uint32_t sortTempABufHandle,
        std::uint32_t sortTempBBufHandle);

    // ── Standalone compute pipelines ───────────────────────────────────
    //
    // Mirror of the native ComputeShader API: load a user-authored
    // compute WGSL with arbitrary @group bindings, dispatch it at any
    // point in the frame (inside or outside the UI compute pass —
    // physics ticks dispatch from update lambdas, which fire outside
    // the per-frame render encoder).
    //
    // WGSL contract:
    //   @group(0) @binding(0) — uniform PushData (optional; only if
    //                            pushUniformSize > 0 at load).
    //   @group(1+) @binding(N) — user bindings declared via
    //                            UICustomBinding[]. When rayQuery is
    //                            on, @group(1) is reserved for the RT
    //                            heap and user bindings start at
    //                            @group(2).
    extern "C" __attribute__((import_module("env"), import_name("wgpuLoadComputePipeline")))
    std::uint32_t wgpuLoadComputePipeline(
        const void* wgslPtr, std::int32_t wgslLen,
        std::int32_t pushUniformSize,
        const void* bindingsPtr, std::int32_t bindingsCount,
        std::int32_t rayQueryFlag);

    // NOTE(review): `pushBytes` presumably must not exceed the
    // `pushUniformSize` given at load time — confirm.
    extern "C" __attribute__((import_module("env"), import_name("wgpuDispatchCompute")))
    void wgpuDispatchCompute(
        std::uint32_t pipelineHandle,
        const void* pushPtr, std::int32_t pushBytes,
        const void* handlesPtr, std::int32_t handlesCount,
        std::int32_t gx, std::int32_t gy, std::int32_t gz);
}
#endif // CRAFTER_GRAPHICS_WINDOW_DOM