Crafter.Math/interfaces/Crafter.Math-Intersection.cppm

/*
Crafter®.Math
Copyright (C) 2026 Catcrafts®
catcrafts.net

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License version 3.0 as published by the Free Software Foundation;

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
*/

export module Crafter.Math:Intersection;
import :VectorF32;
import :MatrixRowMajor;
import std;

namespace Crafter {
    namespace detail {
        // Splat a single Len-vector into all Packing slots of the wider type
        // via a temporary float buffer. Performed once per intersection call;
        // the inner SIMD loop dominates so the round-trip is in the noise.
        template <std::uint8_t Packing, std::uint8_t Len>
        inline VectorF32<Len, Packing> SplatToPacking(VectorF32<Len, 1> v) {
            alignas(64) float buf[VectorF32<Len, Packing>::AlignmentElement] = {};
            std::array<float, VectorF32<Len, 1>::AlignmentElement> flat = v.template Store<float>();
            for (std::uint8_t p = 0; p < Packing; ++p) {
                for (std::uint8_t k = 0; k < Len; ++k) buf[p * Len + k] = flat[k];
            }
            return VectorF32<Len, Packing>(buf);
        }

        // Interleave two arrays of size N=BatchSize into the 2*N positional
        // argument list expected by the variadic Dot. Returns the packed
        // VectorF32<1, Packing*BatchSize> with one dot product per slot.
        template <std::uint8_t Len, std::uint8_t Packing, std::size_t N>
        inline auto DotArrays(
            std::array<VectorF32<Len, Packing>, N> const& a,
            std::array<VectorF32<Len, Packing>, N> const& b
        ) {
            return [&]<std::size_t... Is>(std::index_sequence<Is...>) {
                std::array<VectorF32<Len, Packing>, 2 * N> flat;
                ((flat[2 * Is] = a[Is], flat[2 * Is + 1] = b[Is]), ...);
                return std::apply([](auto... args) {
                    return VectorF32<Len, Packing>::Dot(args...);
                }, flat);
            }(std::make_index_sequence<N>{});
        }

        // Gather the `Component`-th lane of every sub-vector across an array
        // of N packed VectorF32<3, Packing> into a flat VectorF32<1, Packing*N>
        // with one scalar per pair. Used to materialize halfSize.x / .y / .z
        // alongside per-pair scalar projections in a single SIMD register.
        template <std::uint8_t Component, std::uint8_t Packing>
        inline auto ExtractComponent(
            std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& arr
        ) {
            constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
            constexpr std::uint8_t Total = Packing * N;
            using OutVec = VectorF32<1, Total>;
            alignas(64) float buf[OutVec::AlignmentElement] = {};
            for (std::uint8_t b = 0; b < N; ++b) {
                auto v = arr[b].template Store<float>();
                for (std::uint8_t p = 0; p < Packing; ++p) {
                    buf[b * Packing + p] = v[p * 3 + Component];
                }
            }
            return OutVec(buf);
        }

        // Lane-wise absolute value. Done via a flat float buffer because the
        // F32 module does not expose a SIMD Abs primitive. Only called O(15)
        // times per OBB-OBB call, so the round-trip is negligible compared to
        // the dot-product work.
        template <std::uint8_t Total>
        inline VectorF32<1, Total> AbsVec(VectorF32<1, Total> v) {
            alignas(64) float buf[VectorF32<1, Total>::AlignmentElement];
            v.Store(buf);
            for (std::uint8_t i = 0; i < Total; ++i) buf[i] = std::abs(buf[i]);
            return VectorF32<1, Total>(buf);
        }
    }

    // Packed batch of Packing * BatchSize OBBs, each described by world-space
    // origin, three orthonormal rotation axes (rows of the rotation matrix),
    // and per-axis half-extents. Each std::array element packs `Packing`
    // sub-OBBs; there are BatchSize such elements, so the struct holds
    // Packing * BatchSize OBBs total.
    //
    // Callers that have OBBs as MatrixRowMajor + halfSize need to extract the
    // three axes and the origin themselves — keeping the routines in terms of
    // packed VectorF32<3, Packing> lets every SIMD op stay in registers.
    export template <std::uint8_t Packing = VectorF32<3, 1>::OptimalPacking>
    struct PackedOBBs {
        static constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
        static constexpr std::uint8_t Total = Packing * N;

        std::array<VectorF32<3, Packing>, N> halfSize;
        std::array<VectorF32<3, Packing>, N> xAxis;
        std::array<VectorF32<3, Packing>, N> yAxis;
        std::array<VectorF32<3, Packing>, N> zAxis;
        std::array<VectorF32<3, Packing>, N> origin;
    };

    // All intersection tests are batched over Packing*BatchSize primitives at
    // a time, where `Packing = VectorF32<3,1>::OptimalPacking` for the current
    // ISA (5 on AVX-512, 2 on AVX2, 1 on SSE/WASM/scalar) and BatchSize is the
    // arity that fills one output register. Callers form the packed input by
    // laying out `Packing` sub-primitives consecutively per vertex slot, then
    // assemble `BatchSize` such packed slots into the std::array argument.
    // Result lane `i` corresponds to triangle/sphere/box index `i`.

    // Möller-Trumbore against Packing*BatchSize triangles sharing one ray.
    // Returns ray parameter t per triangle, or float max where the ray misses.
    export template <std::uint8_t Packing = VectorF32<3, 1>::OptimalPacking>
    inline VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)>
    IntersectionTestRayTriangle(
        VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir,
        std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& v0,
        std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& v1,
        std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& v2
    ) {
        constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
        constexpr std::uint8_t Total = Packing * N;
        using PVec = VectorF32<3, Packing>;

        PVec rayOriginP = detail::SplatToPacking<Packing>(rayOrigin);
        PVec rayDirP    = detail::SplatToPacking<Packing>(rayDir);

        std::array<PVec, N> E1, E2, H, S, Q, rayDirArr;
        for (std::uint8_t i = 0; i < N; ++i) {
            E1[i] = v1[i] - v0[i];
            E2[i] = v2[i] - v0[i];
            H[i]  = PVec::Cross(rayDirP, E2[i]);
            S[i]  = rayOriginP - v0[i];
            Q[i]  = PVec::Cross(S[i], E1[i]);
            rayDirArr[i] = rayDirP;
        }

        auto det  = detail::DotArrays(E1, H);
        auto uNum = detail::DotArrays(S, H);
        auto vNum = detail::DotArrays(rayDirArr, Q);
        auto tNum = detail::DotArrays(E2, Q);

        auto detArr = det.template Store<float>();
        auto uArr   = uNum.template Store<float>();
        auto vArr   = vNum.template Store<float>();
        auto tArr   = tNum.template Store<float>();

        constexpr float eps = std::numeric_limits<float>::epsilon();
        constexpr float maxF = std::numeric_limits<float>::max();
        alignas(64) std::array<float, VectorF32<1, Total>::AlignmentElement> out{};
        for (std::uint8_t i = 0; i < Total; ++i) {
            float d = detArr[i];
            if (d <= eps) { out[i] = maxF; continue; }
            float invD = 1.0f / d;
            float u = uArr[i] * invD;
            if (u < 0.0f || u > 1.0f) { out[i] = maxF; continue; }
            float v = vArr[i] * invD;
            if (v < 0.0f || u + v > 1.0f) { out[i] = maxF; continue; }
            out[i] = tArr[i] * invD;
        }
        return VectorF32<1, Total>(out.data());
    }

    // One ray against Packing*BatchSize spheres. `radii` holds one radius per
    // sphere in lane order matching the result.
    export template <std::uint8_t Packing = VectorF32<3, 1>::OptimalPacking>
    inline VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)>
    IntersectionTestRaySphere(
        VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir,
        std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& pos,
        VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)> radii
    ) {
        constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
        constexpr std::uint8_t Total = Packing * N;
        using PVec = VectorF32<3, Packing>;
        using OutVec = VectorF32<1, Total>;

        PVec rayOriginP = detail::SplatToPacking<Packing>(rayOrigin);
        PVec rayDirP = detail::SplatToPacking<Packing>(rayDir);

        std::array<PVec, N> s;
        std::array<PVec, N> rayDirArr;
        for (std::uint8_t i = 0; i < N; ++i) {
            s[i] = rayOriginP - pos[i];
            rayDirArr[i] = rayDirP;
        }

        // dirDotS_i = rayDir · (rayOrigin - pos_i)
        auto dirDotS = detail::DotArrays(rayDirArr, s);
        // sqDist_i = |rayOrigin - pos_i|² across all packed slots.
        auto sqDist  = std::apply([](auto... args) { return PVec::LengthSq(args...); }, s);
        // aScalar = rayDir · rayDir, broadcast across every lane.
        auto aScalar = std::apply([](auto... args) { return PVec::LengthSq(args...); }, rayDirArr);

        OutVec two(2.0f);
        OutVec four(4.0f);
        OutVec b = two * dirDotS;
        OutVec c = sqDist - radii * radii;
        // discriminant = b² - 4·a·c
        OutVec disc = b * b - four * aScalar * c;

        auto discArr = disc.template Store<float>();
        auto bArr = b.template Store<float>();
        auto aArr = aScalar.template Store<float>();

        constexpr float maxF = std::numeric_limits<float>::max();
        alignas(64) std::array<float, OutVec::AlignmentElement> out{};
        for (std::uint8_t i = 0; i < Total; ++i) {
            float d = discArr[i];
            if (d < 0.0f) { out[i] = maxF; continue; }
            float sqrtD = std::sqrt(d);
            float t = -0.5f * (bArr[i] + sqrtD) / aArr[i];
            out[i] = (t > 0.0f) ? t : maxF;
        }
        return OutVec(out.data());
    }

    // Packing that fits both Len=3 (positions, sizes) and Len=4 (quaternions)
    // in one SIMD register. Len=4's OptimalPacking is always ≤ Len=3's, so we
    // use the smaller of the two so a single Packing covers every type the
    // routine needs.
    inline constexpr std::uint8_t RayOBBPacking = std::min(
        VectorF32<3, 1>::OptimalPacking, VectorF32<4, 1>::OptimalPacking);

    // One ray against Packing*BatchSize OBBs. Each box is described by
    // world-space position, full-extent size, and a unit quaternion rotation.
    export template <std::uint8_t Packing = RayOBBPacking>
    inline VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)>
    IntersectionTestRayOrientedBox(
        VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir,
        std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& pos,
        std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& size,
        std::array<VectorF32<4, Packing>, VectorF32<3, Packing>::BatchSize> const& rot
    ) {
        constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
        constexpr std::uint8_t Total = Packing * N;
        using PVec3 = VectorF32<3, Packing>;
        using PVec4 = VectorF32<4, Packing>;
        using OutVec = VectorF32<1, Total>;

        PVec3 rayOriginP = detail::SplatToPacking<Packing>(rayOrigin);
        PVec3 rayDirP    = detail::SplatToPacking<Packing>(rayDir);

        // Conjugate quaternion: negate xyz, keep w. Constant-folded into one
        // XOR with a mask vector inside Negate.
        std::array<PVec3, N> localOrigin, localDir, half;
        for (std::uint8_t i = 0; i < N; ++i) {
            PVec4 invRot = rot[i].template Negate<{{true, true, true, false}}>();
            localOrigin[i] = PVec3::Rotate(rayOriginP - pos[i], invRot);
            localDir[i]    = PVec3::Rotate(rayDirP, invRot);
            half[i]        = size[i] * 0.5f;
        }

        std::array<std::array<float, PVec3::AlignmentElement>, N> origLanes, dirLanes, halfLanes;
        for (std::uint8_t i = 0; i < N; ++i) {
            origLanes[i] = localOrigin[i].template Store<float>();
            dirLanes[i]  = localDir[i].template Store<float>();
            halfLanes[i] = half[i].template Store<float>();
        }

        constexpr float eps = std::numeric_limits<float>::epsilon();
        constexpr float maxF = std::numeric_limits<float>::max();
        alignas(64) std::array<float, OutVec::AlignmentElement> out{};
        for (std::uint8_t b = 0; b < Total; ++b) {
            std::uint8_t batchIdx = b / Packing;
            std::uint8_t subIdx = b % Packing;
            float tMin = 0.0f;
            float tMax = maxF;
            bool miss = false;
            for (std::uint8_t i = 0; i < 3; ++i) {
                std::uint8_t lane = static_cast<std::uint8_t>(subIdx * 3 + i);
                float d = dirLanes[batchIdx][lane];
                float o = origLanes[batchIdx][lane];
                float h = halfLanes[batchIdx][lane];
                if (std::abs(d) < eps) {
                    if (o < -h || o > h) { miss = true; break; }
                } else {
                    float invD = 1.0f / d;
                    float t1 = (-h - o) * invD;
                    float t2 = ( h - o) * invD;
                    if (t1 > t2) std::swap(t1, t2);
                    tMin = std::max(tMin, t1);
                    tMax = std::min(tMax, t2);
                    if (tMin > tMax) { miss = true; break; }
                }
            }
            out[b] = miss ? maxF : (tMin >= 0.0f ? tMin : tMax);
        }
        return OutVec(out.data());
    }

    // One sphere against Packing*BatchSize OBBs described by a PackedOBBs.
    // Returns 0.0 per pair where the sphere intersects the box, max-float
    // otherwise. `radii` carries one sphere radius per pair in the same lane
    // order as the resulting test output.
    export template <std::uint8_t Packing = VectorF32<3, 1>::OptimalPacking>
    inline VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)>
    IntersectionTestSphereOrientedBox(
        VectorF32<3, 1> sphereCenter,
        VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)> radii,
        PackedOBBs<Packing> const& boxes
    ) {
        constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
        constexpr std::uint8_t Total = Packing * N;
        using PVec3 = VectorF32<3, Packing>;
        using OutVec = VectorF32<1, Total>;

        PVec3 sphereCenterP = detail::SplatToPacking<Packing>(sphereCenter);
        std::array<PVec3, N> delta;
        for (std::uint8_t i = 0; i < N; ++i) {
            delta[i] = sphereCenterP - boxes.origin[i];
        }

        // Project the world-space delta onto each box axis.
        auto locX = detail::DotArrays(delta, boxes.xAxis);
        auto locY = detail::DotArrays(delta, boxes.yAxis);
        auto locZ = detail::DotArrays(delta, boxes.zAxis);

        auto lxArr = locX.template Store<float>();
        auto lyArr = locY.template Store<float>();
        auto lzArr = locZ.template Store<float>();
        auto rArr  = radii.template Store<float>();
        std::array<std::array<float, PVec3::AlignmentElement>, N> sizeLanes;
        for (std::uint8_t i = 0; i < N; ++i) {
            sizeLanes[i] = boxes.halfSize[i].template Store<float>();
        }

        constexpr float maxF = std::numeric_limits<float>::max();
        alignas(64) std::array<float, OutVec::AlignmentElement> out{};
        for (std::uint8_t i = 0; i < Total; ++i) {
            std::uint8_t batchIdx = i / Packing;
            std::uint8_t subIdx = i % Packing;
            float lx = lxArr[i], ly = lyArr[i], lz = lzArr[i];
            float sx = sizeLanes[batchIdx][subIdx * 3 + 0];
            float sy = sizeLanes[batchIdx][subIdx * 3 + 1];
            float sz = sizeLanes[batchIdx][subIdx * 3 + 2];
            float cx = std::clamp(lx, -sx, sx);
            float cy = std::clamp(ly, -sy, sy);
            float cz = std::clamp(lz, -sz, sz);
            float dx = lx - cx, dy = ly - cy, dz = lz - cz;
            float distSq = dx * dx + dy * dy + dz * dz;
            float r = rArr[i];
            // Returns 0.0 on hit, max on miss — same "t-like" output signature
            // as the ray-vs-X tests.
            out[i] = (distSq <= r * r) ? 0.0f : maxF;
        }
        return OutVec(out.data());
    }

    // Eight local corners of a unit OBB transformed by `matrix`. Uses one
    // batched 4-pair Dot per result row (x, y, z), reproducing the corners in
    // two groups of four.
    export inline std::array<VectorF32<3, 1>, 8> GetOBBCorners(
        VectorF32<3, 1> size, MatrixRowMajor<float, 4, 3, 1> matrix
    ) {
        std::array<float, 4> sz = size.template Store<float>();
        const float sx = sz[0], sy = sz[1], sz_ = sz[2];

        VectorF32<4, 1> mx = matrix.rows[0];
        VectorF32<4, 1> my = matrix.rows[1];
        VectorF32<4, 1> mz = matrix.rows[2];

        // Eight homogeneous corner vectors with w=1 so the translation column
        // of `matrix` participates in the dot product.
        alignas(16) float c0[4] = { -sx, -sy, -sz_, 1.0f };
        alignas(16) float c1[4] = {  sx, -sy, -sz_, 1.0f };
        alignas(16) float c2[4] = { -sx,  sy, -sz_, 1.0f };
        alignas(16) float c3[4] = {  sx,  sy, -sz_, 1.0f };
        alignas(16) float c4[4] = { -sx, -sy,  sz_, 1.0f };
        alignas(16) float c5[4] = {  sx, -sy,  sz_, 1.0f };
        alignas(16) float c6[4] = { -sx,  sy,  sz_, 1.0f };
        alignas(16) float c7[4] = {  sx,  sy,  sz_, 1.0f };
        VectorF32<4, 1> v0(c0), v1(c1), v2(c2), v3(c3);
        VectorF32<4, 1> v4(c4), v5(c5), v6(c6), v7(c7);

        // First four corners (0..3): batch x, y, z dots.
        VectorF32<1, 4> xLo = VectorF32<4, 1>::Dot(mx, v0, mx, v1, mx, v2, mx, v3);
        VectorF32<1, 4> yLo = VectorF32<4, 1>::Dot(my, v0, my, v1, my, v2, my, v3);
        VectorF32<1, 4> zLo = VectorF32<4, 1>::Dot(mz, v0, mz, v1, mz, v2, mz, v3);
        // Second four corners (4..7).
        VectorF32<1, 4> xHi = VectorF32<4, 1>::Dot(mx, v4, mx, v5, mx, v6, mx, v7);
        VectorF32<1, 4> yHi = VectorF32<4, 1>::Dot(my, v4, my, v5, my, v6, my, v7);
        VectorF32<1, 4> zHi = VectorF32<4, 1>::Dot(mz, v4, mz, v5, mz, v6, mz, v7);

        std::array<float, 4> xLoA = xLo.template Store<float>();
        std::array<float, 4> yLoA = yLo.template Store<float>();
        std::array<float, 4> zLoA = zLo.template Store<float>();
        std::array<float, 4> xHiA = xHi.template Store<float>();
        std::array<float, 4> yHiA = yHi.template Store<float>();
        std::array<float, 4> zHiA = zHi.template Store<float>();

        std::array<VectorF32<3, 1>, 8> result;
        for (std::uint8_t i = 0; i < 4; ++i) {
            alignas(16) float buf[4] = { xLoA[i], yLoA[i], zLoA[i], 0.0f };
            result[i] = VectorF32<3, 1>(buf);
        }
        for (std::uint8_t i = 0; i < 4; ++i) {
            alignas(16) float buf[4] = { xHiA[i], yHiA[i], zHiA[i], 0.0f };
            result[4 + i] = VectorF32<3, 1>(buf);
        }
        return result;
    }

    // SAT against the 15 separating axis candidates (3 from box A, 3 from
    // box B, 9 cross products). Returns 0.0 per pair when the boxes overlap
    // and max-float when a separating axis was found, matching the
    // "smaller-is-closer" convention of the ray-vs-X tests.
    //
    // The corner-free formulation: for an OBB (origin O, unit axes X/Y/Z,
    // half-extents h) and a separating-axis candidate a, the projection
    // interval is centered at O·a with radius hx|X·a| + hy|Y·a| + hz|Z·a|.
    // Each axis therefore only needs four dot products per box (and a couple
    // of fused-multiply-adds) instead of eight corner projections — every
    // sub-pair runs in parallel inside the SIMD lanes.
    export template <std::uint8_t Packing = VectorF32<3, 1>::OptimalPacking>
    inline VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)>
    IntersectionTestOrientedBoxOrientedBox(
        PackedOBBs<Packing> const& a, PackedOBBs<Packing> const& b
    ) {
        using PVec = VectorF32<3, Packing>;
        constexpr std::uint8_t N = PVec::BatchSize;
        constexpr std::uint8_t Total = Packing * N;
        using OutVec = VectorF32<1, Total>;

        // Per-pair half-extents pulled out of each PackedOBBs into flat
        // VectorF32<1, Total> registers so they can multiply the projection
        // dots directly.
        OutVec halfA_x = detail::ExtractComponent<0, Packing>(a.halfSize);
        OutVec halfA_y = detail::ExtractComponent<1, Packing>(a.halfSize);
        OutVec halfA_z = detail::ExtractComponent<2, Packing>(a.halfSize);
        OutVec halfB_x = detail::ExtractComponent<0, Packing>(b.halfSize);
        OutVec halfB_y = detail::ExtractComponent<1, Packing>(b.halfSize);
        OutVec halfB_z = detail::ExtractComponent<2, Packing>(b.halfSize);

        constexpr float maxF = std::numeric_limits<float>::max();
        alignas(64) std::array<float, OutVec::AlignmentElement> out{};
        for (std::uint8_t i = 0; i < Total; ++i) out[i] = 0.0f; // start: overlap

        auto axesOfA = [&](std::uint8_t i) -> std::array<PVec, N> const& {
            return (i == 0) ? a.xAxis : (i == 1) ? a.yAxis : a.zAxis;
        };
        auto axesOfB = [&](std::uint8_t i) -> std::array<PVec, N> const& {
            return (i == 0) ? b.xAxis : (i == 1) ? b.yAxis : b.zAxis;
        };

        // For each separating-axis candidate, compute per-pair min/max for
        // both boxes and OR the "separating" condition into `out`.
        auto checkAxis = [&](std::array<PVec, N> const& axis) {
            OutVec cA = detail::DotArrays(a.origin, axis);
            OutVec dA_x = detail::DotArrays(a.xAxis, axis);
            OutVec dA_y = detail::DotArrays(a.yAxis, axis);
            OutVec dA_z = detail::DotArrays(a.zAxis, axis);
            OutVec rA = halfA_x * detail::AbsVec(dA_x)
                      + halfA_y * detail::AbsVec(dA_y)
                      + halfA_z * detail::AbsVec(dA_z);

            OutVec cB = detail::DotArrays(b.origin, axis);
            OutVec dB_x = detail::DotArrays(b.xAxis, axis);
            OutVec dB_y = detail::DotArrays(b.yAxis, axis);
            OutVec dB_z = detail::DotArrays(b.zAxis, axis);
            OutVec rB = halfB_x * detail::AbsVec(dB_x)
                      + halfB_y * detail::AbsVec(dB_y)
                      + halfB_z * detail::AbsVec(dB_z);

            OutVec minA = cA - rA;
            OutVec maxA = cA + rA;
            OutVec minB = cB - rB;
            OutVec maxB = cB + rB;

            auto minAArr = minA.template Store<float>();
            auto maxAArr = maxA.template Store<float>();
            auto minBArr = minB.template Store<float>();
            auto maxBArr = maxB.template Store<float>();
            for (std::uint8_t i = 0; i < Total; ++i) {
                // NaN comparisons (from degenerate cross axes) return false and
                // correctly leave `out[i]` untouched on this axis.
                if (maxAArr[i] < minBArr[i] || maxBArr[i] < minAArr[i]) {
                    out[i] = maxF;
                }
            }
        };

        checkAxis(a.xAxis); checkAxis(a.yAxis); checkAxis(a.zAxis);
        checkAxis(b.xAxis); checkAxis(b.yAxis); checkAxis(b.zAxis);

        // The 9 cross-product axes. Each batch slot's cross axes are computed
        // per-slot, then normalized together (one PVec::Normalize per cross
        // index processes N packed slots in parallel).
        for (std::uint8_t i = 0; i < 3; ++i) {
            auto const& aAx = axesOfA(i);
            for (std::uint8_t j = 0; j < 3; ++j) {
                auto const& bAx = axesOfB(j);
                std::array<PVec, N> crossAx;
                for (std::uint8_t k = 0; k < N; ++k) crossAx[k] = PVec::Cross(aAx[k], bAx[k]);
                auto normalized = std::apply([](auto... args) {
                    return PVec::Normalize(args...);
                }, crossAx);
                checkAxis(normalized);
            }
        }

        return OutVec(out.data());
    }
}