packed intersection and matrix

2026-05-18 19:57:40 +02:00 · 2026-05-18 19:57:40 +02:00 · f0becd1582
commit f0becd1582
parent 027947cae6
7 changed files with 948 additions and 557 deletions
--- a/interfaces/Crafter.Math-Intersection.cppm
+++ b/interfaces/Crafter.Math-Intersection.cppm
@ -23,64 +23,141 @@ import :MatrixRowMajor;
 import std;

 namespace Crafter {
-    // All intersection tests are batched over four primitives at a time so they
-    // feed the VectorF32<3,1>::Dot / Cross / Length / Normalize four-pair
-    // overloads directly. The single-primitive case is just "pass the same
-    // primitive four times and read lane 0" - there is no single-vector
-    // fast-path because the SIMD pipelines want full lanes.
+    namespace detail {
+        // Broadcast one Len-component vector into every one of the Packing
+        // slots of a VectorF32<Len, Packing>, staging it through a float buffer.
+        // Intended to run once per broadcast input, outside the per-batch loops.
+        template <std::uint8_t Packing, std::uint8_t Len>  // callers pass Packing only; Len is deduced from `v`
+        inline VectorF32<Len, Packing> SplatToPacking(VectorF32<Len, 1> v) {
+            alignas(64) float buf[VectorF32<Len, Packing>::AlignmentElement] = {};  // zero-filled, so elements the loop never writes stay 0
+            std::array<float, VectorF32<Len, 1>::AlignmentElement> flat = v.template Store<float>();
+            for (std::uint8_t p = 0; p < Packing; ++p) {  // slot p occupies buf[p*Len .. p*Len + Len - 1]
+                for (std::uint8_t k = 0; k < Len; ++k) buf[p * Len + k] = flat[k];
+            }
+            return VectorF32<Len, Packing>(buf);
+        }

-    // Möller-Trumbore against four triangles sharing one ray. Returns ray
-    // parameter t per triangle, or float max where the ray misses.
-    export inline VectorF32<1, 4> IntersectionTestRayTriangle(
+        // Pair a[i] with b[i] and hand the 2*N operands, interleaved as
+        // a0, b0, a1, b1, ..., to the variadic VectorF32<Len, Packing>::Dot.
+        // Returns that call's result; callers treat it as one dot product per lane.
+        template <std::uint8_t Len, std::uint8_t Packing, std::size_t N>  // all deduced from the array arguments at the call sites
+        inline auto DotArrays(
+            std::array<VectorF32<Len, Packing>, N> const& a,
+            std::array<VectorF32<Len, Packing>, N> const& b
+        ) {
+            return [&]<std::size_t... Is>(std::index_sequence<Is...>) {  // invoked immediately so Is... expands to 0..N-1
+                std::array<VectorF32<Len, Packing>, 2 * N> flat;  // even indices receive a[i], odd indices receive b[i]
+                ((flat[2 * Is] = a[Is], flat[2 * Is + 1] = b[Is]), ...);
+                return std::apply([](auto... args) {  // unpacks `flat` into positional arguments, passed by value
+                    return VectorF32<Len, Packing>::Dot(args...);
+                }, flat);
+            }(std::make_index_sequence<N>{});
+        }
+
+        // Collect component `Component` of every packed sub-vector in `arr`
+        // into one VectorF32<1, Packing*N>. Output lane b*Packing + p is read
+        // from sub-vector p of arr[b]. Per the original note this is meant for
+        // laying out halfSize.x / .y / .z next to per-pair scalar projections.
+        template <std::uint8_t Component, std::uint8_t Packing>
+        inline auto ExtractComponent(
+            std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& arr
+        ) {
+            constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
+            constexpr std::uint8_t Total = Packing * N;  // number of scalars gathered
+            using OutVec = VectorF32<1, Total>;
+            alignas(64) float buf[OutVec::AlignmentElement] = {};  // zero-filled staging buffer
+            for (std::uint8_t b = 0; b < N; ++b) {
+                auto v = arr[b].template Store<float>();  // arr[b] as plain floats
+                for (std::uint8_t p = 0; p < Packing; ++p) {
+                    buf[b * Packing + p] = v[p * 3 + Component];  // sub-vector p starts at float index p*3
+                }
+            }
+            return OutVec(buf);
+        }
+
+        // Absolute value of each of the first `Total` lanes of `v`, computed
+        // through a scalar float buffer. Per the original author's note, the F32
+        // module exposes no SIMD Abs primitive and the OBB-OBB test calls this
+        // only about 15 times, so the store/reload round-trip was judged cheap.
+        template <std::uint8_t Total>
+        inline VectorF32<1, Total> AbsVec(VectorF32<1, Total> v) {
+            alignas(64) float buf[VectorF32<1, Total>::AlignmentElement];  // no initializer; Store below is expected to fill it
+            v.Store(buf);
+            for (std::uint8_t i = 0; i < Total; ++i) buf[i] = std::abs(buf[i]);  // elements at index >= Total are passed through as stored
+            return VectorF32<1, Total>(buf);
+        }
+    }
+
+    // Batch of Packing * BatchSize oriented boxes. Each box is given by its
+    // world-space origin, three axes that are expected to be orthonormal
+    // (rows of the rotation matrix), and per-axis half-extents. Each
+    // std::array element packs `Packing` boxes; there are BatchSize such
+    // elements, so the struct holds Packing * BatchSize boxes in total.
+    //
+    // Callers that have OBBs as MatrixRowMajor + halfSize need to extract the
+    // three axes and the origin themselves; the fields are stored as packed
+    // VectorF32<3, Packing> so the tests can consume them without conversion.
+    export template <std::uint8_t Packing = VectorF32<3, 1>::OptimalPacking>
+    struct PackedOBBs {
+        static constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;  // packed elements per field
+        static constexpr std::uint8_t Total = Packing * N;  // boxes held by one PackedOBBs
+
+        std::array<VectorF32<3, Packing>, N> halfSize;  // half-extent along each box axis
+        std::array<VectorF32<3, Packing>, N> xAxis;  // box x axis
+        std::array<VectorF32<3, Packing>, N> yAxis;  // box y axis
+        std::array<VectorF32<3, Packing>, N> zAxis;  // box z axis
+        std::array<VectorF32<3, Packing>, N> origin;  // world-space box origin
+    };
+
+    // All intersection tests are batched over Packing*BatchSize primitives at
+    // a time. `Packing` defaults to `VectorF32<3,1>::OptimalPacking` for the
+    // current ISA (5 on AVX-512, 2 on AVX2, 1 on SSE/WASM/scalar), except for
+    // the ray-OBB test, which defaults to `RayOBBPacking`; BatchSize is the
+    // arity that fills one output register. Callers lay out `Packing`
+    // sub-primitives consecutively in each packed vector and pass `BatchSize`
+    // of them per std::array. Result lane `i` is sub-slot i % Packing of element i / Packing.
+
+    // Möller-Trumbore against Packing*BatchSize triangles sharing one ray.
+    // Returns ray parameter t per triangle, or float max on a miss (which includes any determinant <= epsilon).
+    export template <std::uint8_t Packing = VectorF32<3, 1>::OptimalPacking>
+    inline VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)>
+    IntersectionTestRayTriangle(
        VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir,
-        VectorF32<3, 1> aV0, VectorF32<3, 1> aV1, VectorF32<3, 1> aV2,
-        VectorF32<3, 1> bV0, VectorF32<3, 1> bV1, VectorF32<3, 1> bV2,
-        VectorF32<3, 1> cV0, VectorF32<3, 1> cV1, VectorF32<3, 1> cV2,
-        VectorF32<3, 1> dV0, VectorF32<3, 1> dV1, VectorF32<3, 1> dV2
+        std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& v0,
+        std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& v1,
+        std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& v2
    ) {
-        VectorF32<3, 1> aE1 = aV1 - aV0;
-        VectorF32<3, 1> aE2 = aV2 - aV0;
-        VectorF32<3, 1> bE1 = bV1 - bV0;
-        VectorF32<3, 1> bE2 = bV2 - bV0;
-        VectorF32<3, 1> cE1 = cV1 - cV0;
-        VectorF32<3, 1> cE2 = cV2 - cV0;
-        VectorF32<3, 1> dE1 = dV1 - dV0;
-        VectorF32<3, 1> dE2 = dV2 - dV0;
+        constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
+        constexpr std::uint8_t Total = Packing * N;
+        using PVec = VectorF32<3, Packing>;

-        VectorF32<3, 1> aH = VectorF32<3, 1>::Cross(rayDir, aE2);
-        VectorF32<3, 1> bH = VectorF32<3, 1>::Cross(rayDir, bE2);
-        VectorF32<3, 1> cH = VectorF32<3, 1>::Cross(rayDir, cE2);
-        VectorF32<3, 1> dH = VectorF32<3, 1>::Cross(rayDir, dE2);
+        PVec rayOriginP = detail::SplatToPacking<Packing>(rayOrigin);
+        PVec rayDirP    = detail::SplatToPacking<Packing>(rayDir);

-        VectorF32<3, 1> aS = rayOrigin - aV0;
-        VectorF32<3, 1> bS = rayOrigin - bV0;
-        VectorF32<3, 1> cS = rayOrigin - cV0;
-        VectorF32<3, 1> dS = rayOrigin - dV0;
+        std::array<PVec, N> E1, E2, H, S, Q, rayDirArr;
+        for (std::uint8_t i = 0; i < N; ++i) {
+            E1[i] = v1[i] - v0[i];
+            E2[i] = v2[i] - v0[i];
+            H[i]  = PVec::Cross(rayDirP, E2[i]);
+            S[i]  = rayOriginP - v0[i];
+            Q[i]  = PVec::Cross(S[i], E1[i]);
+            rayDirArr[i] = rayDirP;
+        }

-        VectorF32<3, 1> aQ = VectorF32<3, 1>::Cross(aS, aE1);
-        VectorF32<3, 1> bQ = VectorF32<3, 1>::Cross(bS, bE1);
-        VectorF32<3, 1> cQ = VectorF32<3, 1>::Cross(cS, cE1);
-        VectorF32<3, 1> dQ = VectorF32<3, 1>::Cross(dS, dE1);
+        auto det  = detail::DotArrays(E1, H);
+        auto uNum = detail::DotArrays(S, H);
+        auto vNum = detail::DotArrays(rayDirArr, Q);
+        auto tNum = detail::DotArrays(E2, Q);

-        // Four 3-component dots packed into one __m128 per call.
-        VectorF32<1, 4> det = VectorF32<3, 1>::Dot(
-            aE1, aH, bE1, bH, cE1, cH, dE1, dH);
-        VectorF32<1, 4> uNum = VectorF32<3, 1>::Dot(
-            aS, aH, bS, bH, cS, cH, dS, dH);
-        VectorF32<1, 4> vNum = VectorF32<3, 1>::Dot(
-            rayDir, aQ, rayDir, bQ, rayDir, cQ, rayDir, dQ);
-        VectorF32<1, 4> tNum = VectorF32<3, 1>::Dot(
-            aE2, aQ, bE2, bQ, cE2, cQ, dE2, dQ);
-
-        std::array<float, 4> detArr = det.template Store<float>();
-        std::array<float, 4> uArr = uNum.template Store<float>();
-        std::array<float, 4> vArr = vNum.template Store<float>();
-        std::array<float, 4> tArr = tNum.template Store<float>();
+        auto detArr = det.template Store<float>();
+        auto uArr   = uNum.template Store<float>();
+        auto vArr   = vNum.template Store<float>();
+        auto tArr   = tNum.template Store<float>();

        constexpr float eps = std::numeric_limits<float>::epsilon();
        constexpr float maxF = std::numeric_limits<float>::max();
-        alignas(16) std::array<float, 4> out{};
-        for (std::uint8_t i = 0; i < 4; ++i) {
+        alignas(64) std::array<float, VectorF32<1, Total>::AlignmentElement> out{};
+        for (std::uint8_t i = 0; i < Total; ++i) {
            float d = detArr[i];
            if (d <= eps) { out[i] = maxF; continue; }
            float invD = 1.0f / d;
@ -90,115 +167,120 @@ namespace Crafter {
            if (v < 0.0f || u + v > 1.0f) { out[i] = maxF; continue; }
            out[i] = tArr[i] * invD;
        }
-        return VectorF32<1, 4>(out.data());
+        return VectorF32<1, Total>(out.data());
    }

-    // One ray against four spheres. radii must hold {rA, rB, rC, rD} in lanes
-    // 0..3.
-    export inline VectorF32<1, 4> IntersectionTestRaySphere(
+    // One ray against Packing*BatchSize spheres; lane i of `radii` is the radius
+    // of sphere i. Each result lane is the nearer root t if t > 0, else float max.
+    export template <std::uint8_t Packing = VectorF32<3, 1>::OptimalPacking>
+    inline VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)>
+    IntersectionTestRaySphere(
        VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir,
-        VectorF32<3, 1> posA, VectorF32<3, 1> posB,
-        VectorF32<3, 1> posC, VectorF32<3, 1> posD,
-        VectorF32<1, 4> radii
+        std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& pos,
+        VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)> radii
    ) {
-        VectorF32<3, 1> sA = rayOrigin - posA;
-        VectorF32<3, 1> sB = rayOrigin - posB;
-        VectorF32<3, 1> sC = rayOrigin - posC;
-        VectorF32<3, 1> sD = rayOrigin - posD;
+        constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
+        constexpr std::uint8_t Total = Packing * N;  // spheres tested per call
+        using PVec = VectorF32<3, Packing>;
+        using OutVec = VectorF32<1, Total>;
+
+        PVec rayOriginP = detail::SplatToPacking<Packing>(rayOrigin);  // ray origin repeated in every packed slot
+        PVec rayDirP = detail::SplatToPacking<Packing>(rayDir);  // ray direction repeated in every packed slot
+
+        std::array<PVec, N> s;  // s[i] = rayOrigin - pos[i] for each packed element
+        std::array<PVec, N> rayDirArr;  // N copies of the direction so it can be paired with s[i]
+        for (std::uint8_t i = 0; i < N; ++i) {
+            s[i] = rayOriginP - pos[i];
+            rayDirArr[i] = rayDirP;
+        }

        // dirDotS_i = rayDir · (rayOrigin - pos_i)
-        VectorF32<1, 4> dirDotS = VectorF32<3, 1>::Dot(
-            rayDir, sA, rayDir, sB, rayDir, sC, rayDir, sD);
-        // sqDist_i = |rayOrigin - pos_i|² (a.k.a. LengthSq of the s vectors)
-        VectorF32<1, 4> sqDist = VectorF32<3, 1>::LengthSq(sA, sB, sC, sD);
-        // aScalar = rayDir · rayDir, broadcast across four lanes.
-        VectorF32<1, 4> aScalar = VectorF32<3, 1>::LengthSq(
-            rayDir, rayDir, rayDir, rayDir);
+        auto dirDotS = detail::DotArrays(rayDirArr, s);
+        // sqDist_i = |rayOrigin - pos_i|² for every sphere, via the variadic LengthSq.
+        auto sqDist  = std::apply([](auto... args) { return PVec::LengthSq(args...); }, s);
+        // aScalar = rayDir · rayDir; the same value ends up in every lane.
+        auto aScalar = std::apply([](auto... args) { return PVec::LengthSq(args...); }, rayDirArr);

-        VectorF32<1, 4> two(2.0f);
-        VectorF32<1, 4> four(4.0f);
-        VectorF32<1, 4> b = two * dirDotS;
-        VectorF32<1, 4> c = sqDist - radii * radii;
+        OutVec two(2.0f);
+        OutVec four(4.0f);
+        OutVec b = two * dirDotS;  // linear coefficient of the quadratic in t
+        OutVec c = sqDist - radii * radii;  // constant coefficient of the quadratic in t
        // discriminant = b² - 4·a·c
-        VectorF32<1, 4> disc = b * b - four * aScalar * c;
+        OutVec disc = b * b - four * aScalar * c;

-        std::array<float, 4> discArr = disc.template Store<float>();
-        std::array<float, 4> bArr = b.template Store<float>();
-        std::array<float, 4> aArr = aScalar.template Store<float>();
+        auto discArr = disc.template Store<float>();
+        auto bArr = b.template Store<float>();
+        auto aArr = aScalar.template Store<float>();

        constexpr float maxF = std::numeric_limits<float>::max();
-        alignas(16) std::array<float, 4> out{};
-        for (std::uint8_t i = 0; i < 4; ++i) {
+        alignas(64) std::array<float, OutVec::AlignmentElement> out{};  // value-initialized; lanes past Total stay 0
+        for (std::uint8_t i = 0; i < Total; ++i) {  // scalar root selection, one sphere per iteration
            float d = discArr[i];
            if (d < 0.0f) { out[i] = maxF; continue; }
            float sqrtD = std::sqrt(d);
            float t = -0.5f * (bArr[i] + sqrtD) / aArr[i];
            out[i] = (t > 0.0f) ? t : maxF;
        }
-        return VectorF32<1, 4>(out.data());
+        return OutVec(out.data());
    }

-    // One ray against four OBBs. Each box is described by world-space position,
-    // half-extent vector (per-axis sizes), and a unit quaternion rotation.
-    export inline VectorF32<1, 4> IntersectionTestRayOrientedBox(
+    // Default Packing for IntersectionTestRayOrientedBox, which needs both
+    // Len=3 vectors (positions, sizes) and Len=4 vectors (quaternions) at the
+    // same Packing. Taking the smaller OptimalPacking of the two keeps one
+    // value valid for both; the Len=4 value is expected to be that minimum.
+    inline constexpr std::uint8_t RayOBBPacking = std::min(
+        VectorF32<3, 1>::OptimalPacking, VectorF32<4, 1>::OptimalPacking);
+
+    // One ray against Packing*BatchSize OBBs. Each box is described by
+    // world-space position, full-extent size, and a unit quaternion rotation.
+    export template <std::uint8_t Packing = RayOBBPacking>
+    inline VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)>
+    IntersectionTestRayOrientedBox(
        VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir,
-        VectorF32<3, 1> posA, VectorF32<3, 1> sizeA, VectorF32<4, 1> rotA,
-        VectorF32<3, 1> posB, VectorF32<3, 1> sizeB, VectorF32<4, 1> rotB,
-        VectorF32<3, 1> posC, VectorF32<3, 1> sizeC, VectorF32<4, 1> rotC,
-        VectorF32<3, 1> posD, VectorF32<3, 1> sizeD, VectorF32<4, 1> rotD
+        std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& pos,
+        std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& size,
+        std::array<VectorF32<4, Packing>, VectorF32<3, Packing>::BatchSize> const& rot
    ) {
-        // Conjugate quaternion: negate xyz, keep w. Negate<{true,true,true,false}>
-        // is constant-folded into a single XOR with a mask vector.
-        VectorF32<4, 1> invRotA = rotA.template Negate<{{true, true, true, false}}>();
-        VectorF32<4, 1> invRotB = rotB.template Negate<{{true, true, true, false}}>();
-        VectorF32<4, 1> invRotC = rotC.template Negate<{{true, true, true, false}}>();
-        VectorF32<4, 1> invRotD = rotD.template Negate<{{true, true, true, false}}>();
+        constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
+        constexpr std::uint8_t Total = Packing * N;
+        using PVec3 = VectorF32<3, Packing>;
+        using PVec4 = VectorF32<4, Packing>;
+        using OutVec = VectorF32<1, Total>;

-        VectorF32<3, 1> localOriginA = VectorF32<3, 1>::Rotate(rayOrigin - posA, invRotA);
-        VectorF32<3, 1> localOriginB = VectorF32<3, 1>::Rotate(rayOrigin - posB, invRotB);
-        VectorF32<3, 1> localOriginC = VectorF32<3, 1>::Rotate(rayOrigin - posC, invRotC);
-        VectorF32<3, 1> localOriginD = VectorF32<3, 1>::Rotate(rayOrigin - posD, invRotD);
+        PVec3 rayOriginP = detail::SplatToPacking<Packing>(rayOrigin);
+        PVec3 rayDirP    = detail::SplatToPacking<Packing>(rayDir);

-        VectorF32<3, 1> localDirA = VectorF32<3, 1>::Rotate(rayDir, invRotA);
-        VectorF32<3, 1> localDirB = VectorF32<3, 1>::Rotate(rayDir, invRotB);
-        VectorF32<3, 1> localDirC = VectorF32<3, 1>::Rotate(rayDir, invRotC);
-        VectorF32<3, 1> localDirD = VectorF32<3, 1>::Rotate(rayDir, invRotD);
+        // Conjugate quaternion: negate xyz, keep w. Constant-folded into one
+        // XOR with a mask vector inside Negate.
+        std::array<PVec3, N> localOrigin, localDir, half;
+        for (std::uint8_t i = 0; i < N; ++i) {
+            PVec4 invRot = rot[i].template Negate<{{true, true, true, false}}>();
+            localOrigin[i] = PVec3::Rotate(rayOriginP - pos[i], invRot);
+            localDir[i]    = PVec3::Rotate(rayDirP, invRot);
+            half[i]        = size[i] * 0.5f;
+        }

-        VectorF32<3, 1> halfA = sizeA * 0.5f;
-        VectorF32<3, 1> halfB = sizeB * 0.5f;
-        VectorF32<3, 1> halfC = sizeC * 0.5f;
-        VectorF32<3, 1> halfD = sizeD * 0.5f;
-
-        std::array<std::array<float, 4>, 4> origLanes{
-            localOriginA.template Store<float>(),
-            localOriginB.template Store<float>(),
-            localOriginC.template Store<float>(),
-            localOriginD.template Store<float>(),
-        };
-        std::array<std::array<float, 4>, 4> dirLanes{
-            localDirA.template Store<float>(),
-            localDirB.template Store<float>(),
-            localDirC.template Store<float>(),
-            localDirD.template Store<float>(),
-        };
-        std::array<std::array<float, 4>, 4> halfLanes{
-            halfA.template Store<float>(),
-            halfB.template Store<float>(),
-            halfC.template Store<float>(),
-            halfD.template Store<float>(),
-        };
+        std::array<std::array<float, PVec3::AlignmentElement>, N> origLanes, dirLanes, halfLanes;
+        for (std::uint8_t i = 0; i < N; ++i) {
+            origLanes[i] = localOrigin[i].template Store<float>();
+            dirLanes[i]  = localDir[i].template Store<float>();
+            halfLanes[i] = half[i].template Store<float>();
+        }

        constexpr float eps = std::numeric_limits<float>::epsilon();
        constexpr float maxF = std::numeric_limits<float>::max();
-        alignas(16) std::array<float, 4> out{};
-        for (std::uint8_t b = 0; b < 4; ++b) {
+        alignas(64) std::array<float, OutVec::AlignmentElement> out{};
+        for (std::uint8_t b = 0; b < Total; ++b) {
+            std::uint8_t batchIdx = b / Packing;
+            std::uint8_t subIdx = b % Packing;
            float tMin = 0.0f;
            float tMax = maxF;
            bool miss = false;
            for (std::uint8_t i = 0; i < 3; ++i) {
-                float d = dirLanes[b][i];
-                float o = origLanes[b][i];
-                float h = halfLanes[b][i];
+                std::uint8_t lane = static_cast<std::uint8_t>(subIdx * 3 + i);
+                float d = dirLanes[batchIdx][lane];
+                float o = origLanes[batchIdx][lane];
+                float h = halfLanes[batchIdx][lane];
                if (std::abs(d) < eps) {
                    if (o < -h || o > h) { miss = true; break; }
                } else {
@ -213,87 +295,65 @@ namespace Crafter {
            }
            out[b] = miss ? maxF : (tMin >= 0.0f ? tMin : tMax);
        }
-        return VectorF32<1, 4>(out.data());
+        return OutVec(out.data());
    }

-    // One sphere against four OBBs. boxMatrix encodes rotation in m[r][0..2]
-    // and translation in m[r][3].
-    export inline VectorF32<1, 4> IntersectionTestSphereOrientedBox(
-        VectorF32<3, 1> sphereCenter, VectorF32<1, 4> radii,
-        VectorF32<3, 1> sizeA, MatrixRowMajor<float, 4, 3, 1> boxA,
-        VectorF32<3, 1> sizeB, MatrixRowMajor<float, 4, 3, 1> boxB,
-        VectorF32<3, 1> sizeC, MatrixRowMajor<float, 4, 3, 1> boxC,
-        VectorF32<3, 1> sizeD, MatrixRowMajor<float, 4, 3, 1> boxD
+    // One sphere center against Packing*BatchSize OBBs held in a PackedOBBs.
+    // Lane i is 0.0 when the box point closest to the center lies within
+    // radii[i] of it (touching counts), and float max otherwise. `radii`
+    // holds one radius per box, in the same lane order as the result.
+    export template <std::uint8_t Packing = VectorF32<3, 1>::OptimalPacking>
+    inline VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)>
+    IntersectionTestSphereOrientedBox(
+        VectorF32<3, 1> sphereCenter,
+        VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)> radii,
+        PackedOBBs<Packing> const& boxes
    ) {
-        auto perBox = [&](MatrixRowMajor<float, 4, 3, 1> const& m,
-                          VectorF32<3, 1> const& size,
-                          VectorF32<3, 1>& xAxis,
-                          VectorF32<3, 1>& yAxis,
-                          VectorF32<3, 1>& zAxis,
-                          VectorF32<3, 1>& delta) {
-            // Existing semantics: the OBB axes are read from the rows of the
-            // upper 3x3 block, and the translation column is gathered from the
-            // w lane of each row.
-            std::array<float, 4> r0 = m.rows[0].template Store<float>();
-            std::array<float, 4> r1 = m.rows[1].template Store<float>();
-            std::array<float, 4> r2 = m.rows[2].template Store<float>();
-            alignas(16) float xBuf[4] = { r0[0], r0[1], r0[2], 0.0f };
-            alignas(16) float yBuf[4] = { r1[0], r1[1], r1[2], 0.0f };
-            alignas(16) float zBuf[4] = { r2[0], r2[1], r2[2], 0.0f };
-            alignas(16) float oBuf[4] = { r0[3], r1[3], r2[3], 0.0f };
-            xAxis = VectorF32<3, 1>(xBuf);
-            yAxis = VectorF32<3, 1>(yBuf);
-            zAxis = VectorF32<3, 1>(zBuf);
-            VectorF32<3, 1> origin(oBuf);
-            delta = sphereCenter - origin;
-            (void)size;
-        };
+        constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
+        constexpr std::uint8_t Total = Packing * N;  // boxes tested per call
+        using PVec3 = VectorF32<3, Packing>;
+        using OutVec = VectorF32<1, Total>;

-        VectorF32<3, 1> xA, yA, zA, dA;
-        VectorF32<3, 1> xB, yB, zB, dB;
-        VectorF32<3, 1> xC, yC, zC, dC;
-        VectorF32<3, 1> xD, yD, zD, dD;
-        perBox(boxA, sizeA, xA, yA, zA, dA);
-        perBox(boxB, sizeB, xB, yB, zB, dB);
-        perBox(boxC, sizeC, xC, yC, zC, dC);
-        perBox(boxD, sizeD, xD, yD, zD, dD);
+        PVec3 sphereCenterP = detail::SplatToPacking<Packing>(sphereCenter);  // center repeated in every packed slot
+        std::array<PVec3, N> delta;  // delta[i] = sphereCenter - boxes.origin[i]
+        for (std::uint8_t i = 0; i < N; ++i) {
+            delta[i] = sphereCenterP - boxes.origin[i];
+        }

-        // Local sphere center per box: project delta onto each box axis. We
-        // produce {lx, ly, lz, lx, ly, lz, lx, ly, lz, lx, ly, lz} as three
-        // packed 4-wide Dot results (one Dot per axis).
-        VectorF32<1, 4> locX = VectorF32<3, 1>::Dot(
-            dA, xA, dB, xB, dC, xC, dD, xD);
-        VectorF32<1, 4> locY = VectorF32<3, 1>::Dot(
-            dA, yA, dB, yB, dC, yC, dD, yD);
-        VectorF32<1, 4> locZ = VectorF32<3, 1>::Dot(
-            dA, zA, dB, zB, dC, zC, dD, zD);
+        // Sphere center in each box's own axes: dot the delta with every axis.
+        auto locX = detail::DotArrays(delta, boxes.xAxis);
+        auto locY = detail::DotArrays(delta, boxes.yAxis);
+        auto locZ = detail::DotArrays(delta, boxes.zAxis);

-        std::array<float, 4> lxArr = locX.template Store<float>();
-        std::array<float, 4> lyArr = locY.template Store<float>();
-        std::array<float, 4> lzArr = locZ.template Store<float>();
-        std::array<float, 4> rArr  = radii.template Store<float>();
-        std::array<std::array<float, 4>, 4> sizeLanes{
-            sizeA.template Store<float>(),
-            sizeB.template Store<float>(),
-            sizeC.template Store<float>(),
-            sizeD.template Store<float>(),
-        };
+        auto lxArr = locX.template Store<float>();
+        auto lyArr = locY.template Store<float>();
+        auto lzArr = locZ.template Store<float>();
+        auto rArr  = radii.template Store<float>();
+        std::array<std::array<float, PVec3::AlignmentElement>, N> sizeLanes;  // holds half-extents, despite the name
+        for (std::uint8_t i = 0; i < N; ++i) {
+            sizeLanes[i] = boxes.halfSize[i].template Store<float>();
+        }

-        alignas(16) std::array<float, 4> out{};
-        for (std::uint8_t i = 0; i < 4; ++i) {
+        constexpr float maxF = std::numeric_limits<float>::max();
+        alignas(64) std::array<float, OutVec::AlignmentElement> out{};  // value-initialized; lanes past Total stay 0
+        for (std::uint8_t i = 0; i < Total; ++i) {
+            std::uint8_t batchIdx = i / Packing;  // which packed element holds box i
+            std::uint8_t subIdx = i % Packing;  // which sub-slot inside that element
            float lx = lxArr[i], ly = lyArr[i], lz = lzArr[i];
-            float sx = sizeLanes[i][0], sy = sizeLanes[i][1], sz = sizeLanes[i][2];
+            float sx = sizeLanes[batchIdx][subIdx * 3 + 0];
+            float sy = sizeLanes[batchIdx][subIdx * 3 + 1];
+            float sz = sizeLanes[batchIdx][subIdx * 3 + 2];
            float cx = std::clamp(lx, -sx, sx);
            float cy = std::clamp(ly, -sy, sy);
            float cz = std::clamp(lz, -sz, sz);
            float dx = lx - cx, dy = ly - cy, dz = lz - cz;
            float distSq = dx * dx + dy * dy + dz * dz;
            float r = rArr[i];
-            // Returns 0.0 on hit, max on miss - keeps a consistent
-            // "t-like" output signature with the other intersection tests.
-            out[i] = (distSq <= r * r) ? 0.0f : std::numeric_limits<float>::max();
+            // 0.0 on hit, float max on miss, so the lane reads like the t values
+            // of the ray-vs-X tests; distSq == r*r (just touching) is a hit.
+            out[i] = (distSq <= r * r) ? 0.0f : maxF;
        }
-        return VectorF32<1, 4>(out.data());
+        return OutVec(out.data());
    }

    // Eight local corners of a unit OBB transformed by `matrix`. Uses one
@ -350,100 +410,104 @@ namespace Crafter {
        return result;
    }

-    // SAT against fifteen separating axes (3 box-A, 3 box-B, 9 cross products).
-    // We compute every corner projection with batched 4-pair Dots: each axis
-    // projects four corners per call, two calls per axis covers the 8 corners.
-    export inline bool IntersectionTestOrientedBoxOrientedBox(
-        VectorF32<3, 1> sizeA, MatrixRowMajor<float, 4, 3, 1> boxA,
-        VectorF32<3, 1> sizeB, MatrixRowMajor<float, 4, 3, 1> boxB
+    // Separating-axis test over 15 candidate axes (3 from box A, 3 from box
+    // B, 9 pairwise cross products) for every OBB pair packed in `a` / `b`.
+    // Returns one float per pair (Packing * BatchSize slots): 0.0 when no
+    // candidate separates the pair, max-float when one does, matching the
+    // "smaller-is-closer" convention of the ray-vs-X tests.
+    //
+    // Corner-free formulation: for an OBB (origin O, unit axes X/Y/Z,
+    // half-extents h) and a candidate axis c, the projection interval is
+    // centered at O·c with radius hx|X·c| + hy|Y·c| + hz|Z·c|, so each box
+    // needs four dot products per axis instead of eight corner projections.
+    // Interval bounds are packed OutVec math; the compare is a scalar loop.
+    export template <std::uint8_t Packing = VectorF32<3, 1>::OptimalPacking>
+    inline VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)>
+    IntersectionTestOrientedBoxOrientedBox(
+        PackedOBBs<Packing> const& a, PackedOBBs<Packing> const& b
     ) {
-        std::array<VectorF32<3, 1>, 8> cornersA = GetOBBCorners(sizeA, boxA);
-        std::array<VectorF32<3, 1>, 8> cornersB = GetOBBCorners(sizeB, boxB);
+        using PVec = VectorF32<3, Packing>;
+        constexpr std::uint8_t N = PVec::BatchSize; // PVec elements per PackedOBBs member array
+        constexpr std::uint8_t Total = Packing * N; // number of result slots written below
+        using OutVec = VectorF32<1, Total>;

-        // Axes are the upper-3 lanes of each matrix row (same convention as
-        // SphereOrientedBox). ExtractLo<3> just retypes the SIMD register; the
-        // 4th lane is ignored by the Len=3 ops below.
-        std::array<VectorF32<3, 1>, 3> axesA = {
-            boxA.rows[0].template ExtractLo<3>(),
-            boxA.rows[1].template ExtractLo<3>(),
-            boxA.rows[2].template ExtractLo<3>(),
+        // Per-pair half-extents, one OutVec per component, so they can scale
+        // the |axis · candidate| dots directly. ExtractComponent<C> is
+        // presumably "component C of every packed half-size" — TODO confirm.
+        OutVec halfA_x = detail::ExtractComponent<0, Packing>(a.halfSize);
+        OutVec halfA_y = detail::ExtractComponent<1, Packing>(a.halfSize);
+        OutVec halfA_z = detail::ExtractComponent<2, Packing>(a.halfSize);
+        OutVec halfB_x = detail::ExtractComponent<0, Packing>(b.halfSize);
+        OutVec halfB_y = detail::ExtractComponent<1, Packing>(b.halfSize);
+        OutVec halfB_z = detail::ExtractComponent<2, Packing>(b.halfSize);
+
+        constexpr float maxF = std::numeric_limits<float>::max(); // "separated" sentinel
+        alignas(64) std::array<float, OutVec::AlignmentElement> out{};
+        for (std::uint8_t i = 0; i < Total; ++i) out[i] = 0.0f; // start: overlap (out{} already zeroed this; kept explicit)
+
+        auto axesOfA = [&](std::uint8_t i) -> std::array<PVec, N> const& {
+            return (i == 0) ? a.xAxis : (i == 1) ? a.yAxis : a.zAxis; // 0 -> x, 1 -> y, anything else -> z
         };
-        std::array<VectorF32<3, 1>, 3> axesB = {
-            boxB.rows[0].template ExtractLo<3>(),
-            boxB.rows[1].template ExtractLo<3>(),
-            boxB.rows[2].template ExtractLo<3>(),
+        auto axesOfB = [&](std::uint8_t i) -> std::array<PVec, N> const& {
+            return (i == 0) ? b.xAxis : (i == 1) ? b.yAxis : b.zAxis; // same index mapping, for box b
         };

-        std::array<VectorF32<3, 1>, 15> axes{};
-        axes[0] = axesA[0]; axes[1] = axesA[1]; axes[2] = axesA[2];
-        axes[3] = axesB[0]; axes[4] = axesB[1]; axes[5] = axesB[2];
-        // Normalize all nine cross axes together with a single batched
-        // Normalize call (Packing=3 not in the API, so two calls of four +
-        // one of one would be needed; for now just normalize in two batches
-        // of four and the trailing one inline).
-        std::array<VectorF32<3, 1>, 9> crossAxes{};
-        std::uint8_t k = 0;
+        // For one candidate axis: project both boxes to [min, max] per pair
+        // and latch maxF into `out` where the intervals are disjoint.
+        auto checkAxis = [&](std::array<PVec, N> const& axis) {
+            OutVec cA = detail::DotArrays(a.origin, axis); // interval center of box a
+            OutVec dA_x = detail::DotArrays(a.xAxis, axis);
+            OutVec dA_y = detail::DotArrays(a.yAxis, axis);
+            OutVec dA_z = detail::DotArrays(a.zAxis, axis);
+            OutVec rA = halfA_x * detail::AbsVec(dA_x)
+                      + halfA_y * detail::AbsVec(dA_y)
+                      + halfA_z * detail::AbsVec(dA_z); // interval radius of box a
+
+            OutVec cB = detail::DotArrays(b.origin, axis); // interval center of box b
+            OutVec dB_x = detail::DotArrays(b.xAxis, axis);
+            OutVec dB_y = detail::DotArrays(b.yAxis, axis);
+            OutVec dB_z = detail::DotArrays(b.zAxis, axis);
+            OutVec rB = halfB_x * detail::AbsVec(dB_x)
+                      + halfB_y * detail::AbsVec(dB_y)
+                      + halfB_z * detail::AbsVec(dB_z); // interval radius of box b
+
+            OutVec minA = cA - rA;
+            OutVec maxA = cA + rA;
+            OutVec minB = cB - rB;
+            OutVec maxB = cB + rB;
+
+            auto minAArr = minA.template Store<float>(); // spill to float arrays for the scalar compare
+            auto maxAArr = maxA.template Store<float>();
+            auto minBArr = minB.template Store<float>();
+            auto maxBArr = maxB.template Store<float>();
+            for (std::uint8_t i = 0; i < Total; ++i) {
+                // Comparisons involving NaN are false, so a NaN projection
+                // (e.g. from a degenerate cross axis) leaves `out[i]` untouched.
+                if (maxAArr[i] < minBArr[i] || maxBArr[i] < minAArr[i]) {
+                    out[i] = maxF; // never reset afterwards: result is the OR over all axes
+                }
+            }
+        };
+
+        checkAxis(a.xAxis); checkAxis(a.yAxis); checkAxis(a.zAxis); // the three axes of box a
+        checkAxis(b.xAxis); checkAxis(b.yAxis); checkAxis(b.zAxis); // the three axes of box b
+
+        // The 9 cross-product axes a_i × b_j. Crosses are taken per batch
+        // slot, then all N slots are normalized in one PVec::Normalize call,
+        // whose result is presumably std::array<PVec, N> (checkAxis takes it).
         for (std::uint8_t i = 0; i < 3; ++i) {
+            auto const& aAx = axesOfA(i);
             for (std::uint8_t j = 0; j < 3; ++j) {
-                crossAxes[k++] = VectorF32<3, 1>::Cross(axesA[i], axesB[j]);
+                auto const& bAx = axesOfB(j);
+                std::array<PVec, N> crossAx;
+                for (std::uint8_t k = 0; k < N; ++k) crossAx[k] = PVec::Cross(aAx[k], bAx[k]);
+                auto normalized = std::apply([](auto... args) {
+                    return PVec::Normalize(args...);
+                }, crossAx); // std::apply expands the N array elements into Normalize's arguments
+                checkAxis(normalized);
             }
         }
-        auto norm0 = VectorF32<3, 1>::Normalize(crossAxes[0], crossAxes[1], crossAxes[2], crossAxes[3]);
-        auto norm1 = VectorF32<3, 1>::Normalize(crossAxes[4], crossAxes[5], crossAxes[6], crossAxes[7]);
-        auto norm2 = VectorF32<3, 1>::Normalize(crossAxes[8], crossAxes[8], crossAxes[8], crossAxes[8]);
-        axes[6]  = std::get<0>(norm0);
-        axes[7]  = std::get<1>(norm0);
-        axes[8]  = std::get<2>(norm0);
-        axes[9]  = std::get<3>(norm0);
-        axes[10] = std::get<0>(norm1);
-        axes[11] = std::get<1>(norm1);
-        axes[12] = std::get<2>(norm1);
-        axes[13] = std::get<3>(norm1);
-        axes[14] = std::get<0>(norm2);

-        for (std::uint8_t axisIdx = 0; axisIdx < 15; ++axisIdx) {
-            VectorF32<3, 1> axis = axes[axisIdx];
-            // Project all 8 corners of each box onto `axis` using two batched
-            // 4-pair Dot calls (lo and hi corners).
-            VectorF32<1, 4> projA_lo = VectorF32<3, 1>::Dot(
-                cornersA[0], axis, cornersA[1], axis,
-                cornersA[2], axis, cornersA[3], axis);
-            VectorF32<1, 4> projA_hi = VectorF32<3, 1>::Dot(
-                cornersA[4], axis, cornersA[5], axis,
-                cornersA[6], axis, cornersA[7], axis);
-            VectorF32<1, 4> projB_lo = VectorF32<3, 1>::Dot(
-                cornersB[0], axis, cornersB[1], axis,
-                cornersB[2], axis, cornersB[3], axis);
-            VectorF32<1, 4> projB_hi = VectorF32<3, 1>::Dot(
-                cornersB[4], axis, cornersB[5], axis,
-                cornersB[6], axis, cornersB[7], axis);
-
-            std::array<float, 4> aLo = projA_lo.template Store<float>();
-            std::array<float, 4> aHi = projA_hi.template Store<float>();
-            std::array<float, 4> bLo = projB_lo.template Store<float>();
-            std::array<float, 4> bHi = projB_hi.template Store<float>();
-
-            float minA = aLo[0], maxA = aLo[0];
-            for (std::uint8_t i = 1; i < 4; ++i) {
-                minA = std::min(minA, aLo[i]);
-                maxA = std::max(maxA, aLo[i]);
-            }
-            for (std::uint8_t i = 0; i < 4; ++i) {
-                minA = std::min(minA, aHi[i]);
-                maxA = std::max(maxA, aHi[i]);
-            }
-            float minB = bLo[0], maxB = bLo[0];
-            for (std::uint8_t i = 1; i < 4; ++i) {
-                minB = std::min(minB, bLo[i]);
-                maxB = std::max(maxB, bLo[i]);
-            }
-            for (std::uint8_t i = 0; i < 4; ++i) {
-                minB = std::min(minB, bHi[i]);
-                maxB = std::max(maxB, bHi[i]);
-            }
-
-            if (maxA < minB || maxB < minA) return false;
-        }
-        return true;
+        return OutVec(out.data());
     }
 }