packed intersection and matrix

2026-05-18 19:57:40 +02:00 · 2026-05-18 19:57:40 +02:00 · f0becd1582
commit f0becd1582
parent 027947cae6
7 changed files with 948 additions and 557 deletions
--- a/interfaces/Crafter.Math-Common.cppm
+++ b/interfaces/Crafter.Math-Common.cppm
@ -87,6 +87,17 @@ namespace Crafter {
        static constexpr std::uint8_t AlignmentElement = GetAlingment()/sizeof(T);
        static constexpr std::uint8_t Alignment = GetAlingment();
        // Number of input vectors per batched Normalize/Dot/Length call that
        // exactly fills the output register on the current (Len, Packing, ISA).
        // Each input contributes `Packing` scalar results; an output register
        // holds `AlignmentElement` lanes, so optimal arity = lanes / packing.
        static constexpr std::uint8_t BatchSize = AlignmentElement / Packing;
        // Largest Packing that still fits a single SIMD register for this
        // (Len, T) on the current ISA. Independent of the current Packing
        // dimension — meant for higher-level batching code that wants to
        // process Packing sub-primitives at once (e.g. intersection tests).
        // Falls back to 1 in the pathological case Len > MaxElement.
        static constexpr std::uint8_t OptimalPacking = (MaxElement / Len) > 0 ? (MaxElement / Len) : 1;
        static_assert(Len * Packing <= MaxElement, "Len * Packing exceeds MaxElement");
        protected:
@ -97,6 +108,22 @@ namespace Crafter {
            return arr;
        }
        // True iff every per-Packing-slot shuffle (output, source) pair stays
        // within the same PerLane chunk. shuffle_epi32 / shuffle_epi8 are
        // applied per 128-bit lane, so any cross-lane move has to fall through
        // to a cross-lane permute path instead.
        template <std::array<std::uint8_t, Len> ShuffleValues>
        static consteval bool LaneSafeShuffle() {
            for (std::uint8_t p = 0; p < Packing; ++p) {
                for (std::uint8_t i = 0; i < Len; ++i) {
                    std::uint8_t outIdx = static_cast<std::uint8_t>(p * Len + i);
                    std::uint8_t srcIdx = static_cast<std::uint8_t>(p * Len + ShuffleValues[i]);
                    if (outIdx / PerLane != srcIdx / PerLane) return false;
                }
            }
            return true;
        }
        template <std::array<std::uint8_t, Len> ShuffleValues>
        static consteval bool CheckEpi32Shuffle() {
            if constexpr (PerLane == 8) {
@ -113,7 +140,7 @@ namespace Crafter {
                    }
                }
            }
-            return true;
+            return LaneSafeShuffle<ShuffleValues>();
        }
        template <std::array<std::uint8_t, Len> ShuffleValues>
@ -124,7 +151,7 @@ namespace Crafter {
                    return false;
                }
            }
-            return true;
+            return LaneSafeShuffle<ShuffleValues>();
        }
        template <std::array<std::uint8_t, Len> ShuffleValues>
--- a/interfaces/Crafter.Math-Intersection.cppm
+++ b/interfaces/Crafter.Math-Intersection.cppm
@ -23,64 +23,141 @@ import :MatrixRowMajor;
 import std;
 namespace Crafter {
-    // All intersection tests are batched over four primitives at a time so they
+    namespace detail {
-    // feed the VectorF32<3,1>::Dot / Cross / Length / Normalize four-pair
+        // Splat a single Len-vector into all Packing slots of the wider type
-    // overloads directly. The single-primitive case is just "pass the same
+        // via a temporary float buffer. Performed once per intersection call;
-    // primitive four times and read lane 0" - there is no single-vector
+        // the inner SIMD loop dominates so the round-trip is in the noise.
-    // fast-path because the SIMD pipelines want full lanes.
+        template <std::uint8_t Packing, std::uint8_t Len>
        inline VectorF32<Len, Packing> SplatToPacking(VectorF32<Len, 1> v) {
            alignas(64) float buf[VectorF32<Len, Packing>::AlignmentElement] = {};
            std::array<float, VectorF32<Len, 1>::AlignmentElement> flat = v.template Store<float>();
            for (std::uint8_t p = 0; p < Packing; ++p) {
                for (std::uint8_t k = 0; k < Len; ++k) buf[p * Len + k] = flat[k];
            }
            return VectorF32<Len, Packing>(buf);
        }
-    // Möller-Trumbore against four triangles sharing one ray. Returns ray
+        // Interleave two arrays of size N=BatchSize into the 2*N positional
-    // parameter t per triangle, or float max where the ray misses.
+        // argument list expected by the variadic Dot. Returns the packed
-    export inline VectorF32<1, 4> IntersectionTestRayTriangle(
+        // VectorF32<1, Packing*BatchSize> with one dot product per slot.
-        VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir,
+        template <std::uint8_t Len, std::uint8_t Packing, std::size_t N>
-        VectorF32<3, 1> aV0, VectorF32<3, 1> aV1, VectorF32<3, 1> aV2,
+        inline auto DotArrays(
-        VectorF32<3, 1> bV0, VectorF32<3, 1> bV1, VectorF32<3, 1> bV2,
+            std::array<VectorF32<Len, Packing>, N> const& a,
-        VectorF32<3, 1> cV0, VectorF32<3, 1> cV1, VectorF32<3, 1> cV2,
+            std::array<VectorF32<Len, Packing>, N> const& b
        VectorF32<3, 1> dV0, VectorF32<3, 1> dV1, VectorF32<3, 1> dV2
        ) {
-        VectorF32<3, 1> aE1 = aV1 - aV0;
+            return [&]<std::size_t... Is>(std::index_sequence<Is...>) {
-        VectorF32<3, 1> aE2 = aV2 - aV0;
+                std::array<VectorF32<Len, Packing>, 2 * N> flat;
-        VectorF32<3, 1> bE1 = bV1 - bV0;
+                ((flat[2 * Is] = a[Is], flat[2 * Is + 1] = b[Is]), ...);
-        VectorF32<3, 1> bE2 = bV2 - bV0;
+                return std::apply([](auto... args) {
-        VectorF32<3, 1> cE1 = cV1 - cV0;
+                    return VectorF32<Len, Packing>::Dot(args...);
-        VectorF32<3, 1> cE2 = cV2 - cV0;
+                }, flat);
-        VectorF32<3, 1> dE1 = dV1 - dV0;
+            }(std::make_index_sequence<N>{});
-        VectorF32<3, 1> dE2 = dV2 - dV0;
+        }
-        VectorF32<3, 1> aH = VectorF32<3, 1>::Cross(rayDir, aE2);
+        // Gather the `Component`-th lane of every sub-vector across an array
-        VectorF32<3, 1> bH = VectorF32<3, 1>::Cross(rayDir, bE2);
+        // of N packed VectorF32<3, Packing> into a flat VectorF32<1, Packing*N>
-        VectorF32<3, 1> cH = VectorF32<3, 1>::Cross(rayDir, cE2);
+        // with one scalar per pair. Used to materialize halfSize.x / .y / .z
-        VectorF32<3, 1> dH = VectorF32<3, 1>::Cross(rayDir, dE2);
+        // alongside per-pair scalar projections in a single SIMD register.
        template <std::uint8_t Component, std::uint8_t Packing>
        inline auto ExtractComponent(
            std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& arr
        ) {
            constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
            constexpr std::uint8_t Total = Packing * N;
            using OutVec = VectorF32<1, Total>;
            alignas(64) float buf[OutVec::AlignmentElement] = {};
            for (std::uint8_t b = 0; b < N; ++b) {
                auto v = arr[b].template Store<float>();
                for (std::uint8_t p = 0; p < Packing; ++p) {
                    buf[b * Packing + p] = v[p * 3 + Component];
                }
            }
            return OutVec(buf);
        }
-        VectorF32<3, 1> aS = rayOrigin - aV0;
+        // Lane-wise absolute value. Done via a flat float buffer because the
-        VectorF32<3, 1> bS = rayOrigin - bV0;
+        // F32 module does not expose a SIMD Abs primitive. Only called O(15)
-        VectorF32<3, 1> cS = rayOrigin - cV0;
+        // times per OBB-OBB call, so the round-trip is negligible compared to
-        VectorF32<3, 1> dS = rayOrigin - dV0;
+        // the dot-product work.
        template <std::uint8_t Total>
        inline VectorF32<1, Total> AbsVec(VectorF32<1, Total> v) {
            alignas(64) float buf[VectorF32<1, Total>::AlignmentElement];
            v.Store(buf);
            for (std::uint8_t i = 0; i < Total; ++i) buf[i] = std::abs(buf[i]);
            return VectorF32<1, Total>(buf);
        }
    }
-        VectorF32<3, 1> aQ = VectorF32<3, 1>::Cross(aS, aE1);
+    // Packed batch of Packing * BatchSize OBBs, each described by world-space
-        VectorF32<3, 1> bQ = VectorF32<3, 1>::Cross(bS, bE1);
+    // origin, three orthonormal rotation axes (rows of the rotation matrix),
-        VectorF32<3, 1> cQ = VectorF32<3, 1>::Cross(cS, cE1);
+    // and per-axis half-extents. Each std::array element packs `Packing`
-        VectorF32<3, 1> dQ = VectorF32<3, 1>::Cross(dS, dE1);
+    // sub-OBBs; there are BatchSize such elements, so the struct holds
    // Packing * BatchSize OBBs total.
    //
    // Callers that have OBBs as MatrixRowMajor + halfSize need to extract the
    // three axes and the origin themselves — keeping the routines in terms of
    // packed VectorF32<3, Packing> lets every SIMD op stay in registers.
    export template <std::uint8_t Packing = VectorF32<3, 1>::OptimalPacking>
    struct PackedOBBs {
        static constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
        static constexpr std::uint8_t Total = Packing * N;
-        // Four 3-component dots packed into one __m128 per call.
+        std::array<VectorF32<3, Packing>, N> halfSize;
-        VectorF32<1, 4> det = VectorF32<3, 1>::Dot(
+        std::array<VectorF32<3, Packing>, N> xAxis;
-            aE1, aH, bE1, bH, cE1, cH, dE1, dH);
+        std::array<VectorF32<3, Packing>, N> yAxis;
-        VectorF32<1, 4> uNum = VectorF32<3, 1>::Dot(
+        std::array<VectorF32<3, Packing>, N> zAxis;
-            aS, aH, bS, bH, cS, cH, dS, dH);
+        std::array<VectorF32<3, Packing>, N> origin;
-        VectorF32<1, 4> vNum = VectorF32<3, 1>::Dot(
+    };
            rayDir, aQ, rayDir, bQ, rayDir, cQ, rayDir, dQ);
        VectorF32<1, 4> tNum = VectorF32<3, 1>::Dot(
            aE2, aQ, bE2, bQ, cE2, cQ, dE2, dQ);
-        std::array<float, 4> detArr = det.template Store<float>();
+    // All intersection tests are batched over Packing*BatchSize primitives at
-        std::array<float, 4> uArr = uNum.template Store<float>();
+    // a time, where `Packing = VectorF32<3,1>::OptimalPacking` for the current
-        std::array<float, 4> vArr = vNum.template Store<float>();
+    // ISA (5 on AVX-512, 2 on AVX2, 1 on SSE/WASM/scalar) and BatchSize is the
-        std::array<float, 4> tArr = tNum.template Store<float>();
+    // arity that fills one output register. Callers form the packed input by
    // laying out `Packing` sub-primitives consecutively per vertex slot, then
    // assemble `BatchSize` such packed slots into the std::array argument.
    // Result lane `i` corresponds to triangle/sphere/box index `i`.
    // Möller-Trumbore against Packing*BatchSize triangles sharing one ray.
    // Returns ray parameter t per triangle, or float max where the ray misses.
    export template <std::uint8_t Packing = VectorF32<3, 1>::OptimalPacking>
    inline VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)>
    IntersectionTestRayTriangle(
        VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir,
        std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& v0,
        std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& v1,
        std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& v2
    ) {
        constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
        constexpr std::uint8_t Total = Packing * N;
        using PVec = VectorF32<3, Packing>;
        PVec rayOriginP = detail::SplatToPacking<Packing>(rayOrigin);
        PVec rayDirP    = detail::SplatToPacking<Packing>(rayDir);
        std::array<PVec, N> E1, E2, H, S, Q, rayDirArr;
        for (std::uint8_t i = 0; i < N; ++i) {
            E1[i] = v1[i] - v0[i];
            E2[i] = v2[i] - v0[i];
            H[i]  = PVec::Cross(rayDirP, E2[i]);
            S[i]  = rayOriginP - v0[i];
            Q[i]  = PVec::Cross(S[i], E1[i]);
            rayDirArr[i] = rayDirP;
        }
        auto det  = detail::DotArrays(E1, H);
        auto uNum = detail::DotArrays(S, H);
        auto vNum = detail::DotArrays(rayDirArr, Q);
        auto tNum = detail::DotArrays(E2, Q);
        auto detArr = det.template Store<float>();
        auto uArr   = uNum.template Store<float>();
        auto vArr   = vNum.template Store<float>();
        auto tArr   = tNum.template Store<float>();
        constexpr float eps = std::numeric_limits<float>::epsilon();
        constexpr float maxF = std::numeric_limits<float>::max();
-        alignas(16) std::array<float, 4> out{};
+        alignas(64) std::array<float, VectorF32<1, Total>::AlignmentElement> out{};
-        for (std::uint8_t i = 0; i < 4; ++i) {
+        for (std::uint8_t i = 0; i < Total; ++i) {
            float d = detArr[i];
            if (d <= eps) { out[i] = maxF; continue; }
            float invD = 1.0f / d;
@ -90,115 +167,120 @@ namespace Crafter {
            if (v < 0.0f || u + v > 1.0f) { out[i] = maxF; continue; }
            out[i] = tArr[i] * invD;
        }
-        return VectorF32<1, 4>(out.data());
+        return VectorF32<1, Total>(out.data());
    }
-    // One ray against four spheres. radii must hold {rA, rB, rC, rD} in lanes
+    // One ray against Packing*BatchSize spheres. `radii` holds one radius per
-    // 0..3.
+    // sphere in lane order matching the result.
-    export inline VectorF32<1, 4> IntersectionTestRaySphere(
+    export template <std::uint8_t Packing = VectorF32<3, 1>::OptimalPacking>
    inline VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)>
    IntersectionTestRaySphere(
        VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir,
-        VectorF32<3, 1> posA, VectorF32<3, 1> posB,
+        std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& pos,
-        VectorF32<3, 1> posC, VectorF32<3, 1> posD,
+        VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)> radii
        VectorF32<1, 4> radii
    ) {
-        VectorF32<3, 1> sA = rayOrigin - posA;
+        constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
-        VectorF32<3, 1> sB = rayOrigin - posB;
+        constexpr std::uint8_t Total = Packing * N;
-        VectorF32<3, 1> sC = rayOrigin - posC;
+        using PVec = VectorF32<3, Packing>;
-        VectorF32<3, 1> sD = rayOrigin - posD;
+        using OutVec = VectorF32<1, Total>;
        PVec rayOriginP = detail::SplatToPacking<Packing>(rayOrigin);
        PVec rayDirP = detail::SplatToPacking<Packing>(rayDir);
        std::array<PVec, N> s;
        std::array<PVec, N> rayDirArr;
        for (std::uint8_t i = 0; i < N; ++i) {
            s[i] = rayOriginP - pos[i];
            rayDirArr[i] = rayDirP;
        }
        // dirDotS_i = rayDir · (rayOrigin - pos_i)
-        VectorF32<1, 4> dirDotS = VectorF32<3, 1>::Dot(
+        auto dirDotS = detail::DotArrays(rayDirArr, s);
-            rayDir, sA, rayDir, sB, rayDir, sC, rayDir, sD);
+        // sqDist_i = |rayOrigin - pos_i|² across all packed slots.
-        // sqDist_i = |rayOrigin - pos_i|² (a.k.a. LengthSq of the s vectors)
+        auto sqDist  = std::apply([](auto... args) { return PVec::LengthSq(args...); }, s);
-        VectorF32<1, 4> sqDist = VectorF32<3, 1>::LengthSq(sA, sB, sC, sD);
+        // aScalar = rayDir · rayDir, broadcast across every lane.
-        // aScalar = rayDir · rayDir, broadcast across four lanes.
+        auto aScalar = std::apply([](auto... args) { return PVec::LengthSq(args...); }, rayDirArr);
        VectorF32<1, 4> aScalar = VectorF32<3, 1>::LengthSq(
            rayDir, rayDir, rayDir, rayDir);
-        VectorF32<1, 4> two(2.0f);
+        OutVec two(2.0f);
-        VectorF32<1, 4> four(4.0f);
+        OutVec four(4.0f);
-        VectorF32<1, 4> b = two * dirDotS;
+        OutVec b = two * dirDotS;
-        VectorF32<1, 4> c = sqDist - radii * radii;
+        OutVec c = sqDist - radii * radii;
        // discriminant = b² - 4·a·c
-        VectorF32<1, 4> disc = b * b - four * aScalar * c;
+        OutVec disc = b * b - four * aScalar * c;
-        std::array<float, 4> discArr = disc.template Store<float>();
+        auto discArr = disc.template Store<float>();
-        std::array<float, 4> bArr = b.template Store<float>();
+        auto bArr = b.template Store<float>();
-        std::array<float, 4> aArr = aScalar.template Store<float>();
+        auto aArr = aScalar.template Store<float>();
        constexpr float maxF = std::numeric_limits<float>::max();
-        alignas(16) std::array<float, 4> out{};
+        alignas(64) std::array<float, OutVec::AlignmentElement> out{};
-        for (std::uint8_t i = 0; i < 4; ++i) {
+        for (std::uint8_t i = 0; i < Total; ++i) {
            float d = discArr[i];
            if (d < 0.0f) { out[i] = maxF; continue; }
            float sqrtD = std::sqrt(d);
            float t = -0.5f * (bArr[i] + sqrtD) / aArr[i];
            out[i] = (t > 0.0f) ? t : maxF;
        }
-        return VectorF32<1, 4>(out.data());
+        return OutVec(out.data());
    }
-    // One ray against four OBBs. Each box is described by world-space position,
+    // Packing that fits both Len=3 (positions, sizes) and Len=4 (quaternions)
-    // half-extent vector (per-axis sizes), and a unit quaternion rotation.
+    // in one SIMD register. Len=4's OptimalPacking is always ≤ Len=3's, so we
-    export inline VectorF32<1, 4> IntersectionTestRayOrientedBox(
+    // use the smaller of the two so a single Packing covers every type the
    // routine needs.
    inline constexpr std::uint8_t RayOBBPacking = std::min(
        VectorF32<3, 1>::OptimalPacking, VectorF32<4, 1>::OptimalPacking);
    // One ray against Packing*BatchSize OBBs. Each box is described by
    // world-space position, full-extent size, and a unit quaternion rotation.
    export template <std::uint8_t Packing = RayOBBPacking>
    inline VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)>
    IntersectionTestRayOrientedBox(
        VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir,
-        VectorF32<3, 1> posA, VectorF32<3, 1> sizeA, VectorF32<4, 1> rotA,
+        std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& pos,
-        VectorF32<3, 1> posB, VectorF32<3, 1> sizeB, VectorF32<4, 1> rotB,
+        std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& size,
-        VectorF32<3, 1> posC, VectorF32<3, 1> sizeC, VectorF32<4, 1> rotC,
+        std::array<VectorF32<4, Packing>, VectorF32<3, Packing>::BatchSize> const& rot
        VectorF32<3, 1> posD, VectorF32<3, 1> sizeD, VectorF32<4, 1> rotD
    ) {
-        // Conjugate quaternion: negate xyz, keep w. Negate<{true,true,true,false}>
+        constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
-        // is constant-folded into a single XOR with a mask vector.
+        constexpr std::uint8_t Total = Packing * N;
-        VectorF32<4, 1> invRotA = rotA.template Negate<{{true, true, true, false}}>();
+        using PVec3 = VectorF32<3, Packing>;
-        VectorF32<4, 1> invRotB = rotB.template Negate<{{true, true, true, false}}>();
+        using PVec4 = VectorF32<4, Packing>;
-        VectorF32<4, 1> invRotC = rotC.template Negate<{{true, true, true, false}}>();
+        using OutVec = VectorF32<1, Total>;
        VectorF32<4, 1> invRotD = rotD.template Negate<{{true, true, true, false}}>();
-        VectorF32<3, 1> localOriginA = VectorF32<3, 1>::Rotate(rayOrigin - posA, invRotA);
+        PVec3 rayOriginP = detail::SplatToPacking<Packing>(rayOrigin);
-        VectorF32<3, 1> localOriginB = VectorF32<3, 1>::Rotate(rayOrigin - posB, invRotB);
+        PVec3 rayDirP    = detail::SplatToPacking<Packing>(rayDir);
        VectorF32<3, 1> localOriginC = VectorF32<3, 1>::Rotate(rayOrigin - posC, invRotC);
        VectorF32<3, 1> localOriginD = VectorF32<3, 1>::Rotate(rayOrigin - posD, invRotD);
-        VectorF32<3, 1> localDirA = VectorF32<3, 1>::Rotate(rayDir, invRotA);
+        // Conjugate quaternion: negate xyz, keep w. Constant-folded into one
-        VectorF32<3, 1> localDirB = VectorF32<3, 1>::Rotate(rayDir, invRotB);
+        // XOR with a mask vector inside Negate.
-        VectorF32<3, 1> localDirC = VectorF32<3, 1>::Rotate(rayDir, invRotC);
+        std::array<PVec3, N> localOrigin, localDir, half;
-        VectorF32<3, 1> localDirD = VectorF32<3, 1>::Rotate(rayDir, invRotD);
+        for (std::uint8_t i = 0; i < N; ++i) {
            PVec4 invRot = rot[i].template Negate<{{true, true, true, false}}>();
            localOrigin[i] = PVec3::Rotate(rayOriginP - pos[i], invRot);
            localDir[i]    = PVec3::Rotate(rayDirP, invRot);
            half[i]        = size[i] * 0.5f;
        }
-        VectorF32<3, 1> halfA = sizeA * 0.5f;
+        std::array<std::array<float, PVec3::AlignmentElement>, N> origLanes, dirLanes, halfLanes;
-        VectorF32<3, 1> halfB = sizeB * 0.5f;
+        for (std::uint8_t i = 0; i < N; ++i) {
-        VectorF32<3, 1> halfC = sizeC * 0.5f;
+            origLanes[i] = localOrigin[i].template Store<float>();
-        VectorF32<3, 1> halfD = sizeD * 0.5f;
+            dirLanes[i]  = localDir[i].template Store<float>();
-
+            halfLanes[i] = half[i].template Store<float>();
-        std::array<std::array<float, 4>, 4> origLanes{
+        }
            localOriginA.template Store<float>(),
            localOriginB.template Store<float>(),
            localOriginC.template Store<float>(),
            localOriginD.template Store<float>(),
        };
        std::array<std::array<float, 4>, 4> dirLanes{
            localDirA.template Store<float>(),
            localDirB.template Store<float>(),
            localDirC.template Store<float>(),
            localDirD.template Store<float>(),
        };
        std::array<std::array<float, 4>, 4> halfLanes{
            halfA.template Store<float>(),
            halfB.template Store<float>(),
            halfC.template Store<float>(),
            halfD.template Store<float>(),
        };
        constexpr float eps = std::numeric_limits<float>::epsilon();
        constexpr float maxF = std::numeric_limits<float>::max();
-        alignas(16) std::array<float, 4> out{};
+        alignas(64) std::array<float, OutVec::AlignmentElement> out{};
-        for (std::uint8_t b = 0; b < 4; ++b) {
+        for (std::uint8_t b = 0; b < Total; ++b) {
            std::uint8_t batchIdx = b / Packing;
            std::uint8_t subIdx = b % Packing;
            float tMin = 0.0f;
            float tMax = maxF;
            bool miss = false;
            for (std::uint8_t i = 0; i < 3; ++i) {
-                float d = dirLanes[b][i];
+                std::uint8_t lane = static_cast<std::uint8_t>(subIdx * 3 + i);
-                float o = origLanes[b][i];
+                float d = dirLanes[batchIdx][lane];
-                float h = halfLanes[b][i];
+                float o = origLanes[batchIdx][lane];
                float h = halfLanes[batchIdx][lane];
                if (std::abs(d) < eps) {
                    if (o < -h || o > h) { miss = true; break; }
                } else {
@ -213,87 +295,65 @@ namespace Crafter {
            }
            out[b] = miss ? maxF : (tMin >= 0.0f ? tMin : tMax);
        }
-        return VectorF32<1, 4>(out.data());
+        return OutVec(out.data());
    }
-    // One sphere against four OBBs. boxMatrix encodes rotation in m[r][0..2]
+    // One sphere against Packing*BatchSize OBBs described by a PackedOBBs.
-    // and translation in m[r][3].
+    // Returns 0.0 per pair where the sphere intersects the box, max-float
-    export inline VectorF32<1, 4> IntersectionTestSphereOrientedBox(
+    // otherwise. `radii` carries one sphere radius per pair in the same lane
-        VectorF32<3, 1> sphereCenter, VectorF32<1, 4> radii,
+    // order as the resulting test output.
-        VectorF32<3, 1> sizeA, MatrixRowMajor<float, 4, 3, 1> boxA,
+    export template <std::uint8_t Packing = VectorF32<3, 1>::OptimalPacking>
-        VectorF32<3, 1> sizeB, MatrixRowMajor<float, 4, 3, 1> boxB,
+    inline VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)>
-        VectorF32<3, 1> sizeC, MatrixRowMajor<float, 4, 3, 1> boxC,
+    IntersectionTestSphereOrientedBox(
-        VectorF32<3, 1> sizeD, MatrixRowMajor<float, 4, 3, 1> boxD
+        VectorF32<3, 1> sphereCenter,
        VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)> radii,
        PackedOBBs<Packing> const& boxes
    ) {
-        auto perBox = [&](MatrixRowMajor<float, 4, 3, 1> const& m,
+        constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
-                          VectorF32<3, 1> const& size,
+        constexpr std::uint8_t Total = Packing * N;
-                          VectorF32<3, 1>& xAxis,
+        using PVec3 = VectorF32<3, Packing>;
-                          VectorF32<3, 1>& yAxis,
+        using OutVec = VectorF32<1, Total>;
                          VectorF32<3, 1>& zAxis,
                          VectorF32<3, 1>& delta) {
            // Existing semantics: the OBB axes are read from the rows of the
            // upper 3x3 block, and the translation column is gathered from the
            // w lane of each row.
            std::array<float, 4> r0 = m.rows[0].template Store<float>();
            std::array<float, 4> r1 = m.rows[1].template Store<float>();
            std::array<float, 4> r2 = m.rows[2].template Store<float>();
            alignas(16) float xBuf[4] = { r0[0], r0[1], r0[2], 0.0f };
            alignas(16) float yBuf[4] = { r1[0], r1[1], r1[2], 0.0f };
            alignas(16) float zBuf[4] = { r2[0], r2[1], r2[2], 0.0f };
            alignas(16) float oBuf[4] = { r0[3], r1[3], r2[3], 0.0f };
            xAxis = VectorF32<3, 1>(xBuf);
            yAxis = VectorF32<3, 1>(yBuf);
            zAxis = VectorF32<3, 1>(zBuf);
            VectorF32<3, 1> origin(oBuf);
            delta = sphereCenter - origin;
            (void)size;
        };
-        VectorF32<3, 1> xA, yA, zA, dA;
+        PVec3 sphereCenterP = detail::SplatToPacking<Packing>(sphereCenter);
-        VectorF32<3, 1> xB, yB, zB, dB;
+        std::array<PVec3, N> delta;
-        VectorF32<3, 1> xC, yC, zC, dC;
+        for (std::uint8_t i = 0; i < N; ++i) {
-        VectorF32<3, 1> xD, yD, zD, dD;
+            delta[i] = sphereCenterP - boxes.origin[i];
-        perBox(boxA, sizeA, xA, yA, zA, dA);
+        }
        perBox(boxB, sizeB, xB, yB, zB, dB);
        perBox(boxC, sizeC, xC, yC, zC, dC);
        perBox(boxD, sizeD, xD, yD, zD, dD);
-        // Local sphere center per box: project delta onto each box axis. We
+        // Project the world-space delta onto each box axis.
-        // produce {lx, ly, lz, lx, ly, lz, lx, ly, lz, lx, ly, lz} as three
+        auto locX = detail::DotArrays(delta, boxes.xAxis);
-        // packed 4-wide Dot results (one Dot per axis).
+        auto locY = detail::DotArrays(delta, boxes.yAxis);
-        VectorF32<1, 4> locX = VectorF32<3, 1>::Dot(
+        auto locZ = detail::DotArrays(delta, boxes.zAxis);
            dA, xA, dB, xB, dC, xC, dD, xD);
        VectorF32<1, 4> locY = VectorF32<3, 1>::Dot(
            dA, yA, dB, yB, dC, yC, dD, yD);
        VectorF32<1, 4> locZ = VectorF32<3, 1>::Dot(
            dA, zA, dB, zB, dC, zC, dD, zD);
-        std::array<float, 4> lxArr = locX.template Store<float>();
+        auto lxArr = locX.template Store<float>();
-        std::array<float, 4> lyArr = locY.template Store<float>();
+        auto lyArr = locY.template Store<float>();
-        std::array<float, 4> lzArr = locZ.template Store<float>();
+        auto lzArr = locZ.template Store<float>();
-        std::array<float, 4> rArr  = radii.template Store<float>();
+        auto rArr  = radii.template Store<float>();
-        std::array<std::array<float, 4>, 4> sizeLanes{
+        std::array<std::array<float, PVec3::AlignmentElement>, N> sizeLanes;
-            sizeA.template Store<float>(),
+        for (std::uint8_t i = 0; i < N; ++i) {
-            sizeB.template Store<float>(),
+            sizeLanes[i] = boxes.halfSize[i].template Store<float>();
-            sizeC.template Store<float>(),
+        }
            sizeD.template Store<float>(),
        };
-        alignas(16) std::array<float, 4> out{};
+        constexpr float maxF = std::numeric_limits<float>::max();
-        for (std::uint8_t i = 0; i < 4; ++i) {
+        alignas(64) std::array<float, OutVec::AlignmentElement> out{};
        for (std::uint8_t i = 0; i < Total; ++i) {
            std::uint8_t batchIdx = i / Packing;
            std::uint8_t subIdx = i % Packing;
            float lx = lxArr[i], ly = lyArr[i], lz = lzArr[i];
-            float sx = sizeLanes[i][0], sy = sizeLanes[i][1], sz = sizeLanes[i][2];
+            float sx = sizeLanes[batchIdx][subIdx * 3 + 0];
            float sy = sizeLanes[batchIdx][subIdx * 3 + 1];
            float sz = sizeLanes[batchIdx][subIdx * 3 + 2];
            float cx = std::clamp(lx, -sx, sx);
            float cy = std::clamp(ly, -sy, sy);
            float cz = std::clamp(lz, -sz, sz);
            float dx = lx - cx, dy = ly - cy, dz = lz - cz;
            float distSq = dx * dx + dy * dy + dz * dz;
            float r = rArr[i];
-            // Returns 0.0 on hit, max on miss - keeps a consistent
+            // Returns 0.0 on hit, max on miss — same "t-like" output signature
-            // "t-like" output signature with the other intersection tests.
+            // as the ray-vs-X tests.
-            out[i] = (distSq <= r * r) ? 0.0f : std::numeric_limits<float>::max();
+            out[i] = (distSq <= r * r) ? 0.0f : maxF;
        }
-        return VectorF32<1, 4>(out.data());
+        return OutVec(out.data());
    }
    // Eight local corners of a unit OBB transformed by `matrix`. Uses one
@ -350,100 +410,104 @@ namespace Crafter {
        return result;
    }
-    // SAT against fifteen separating axes (3 box-A, 3 box-B, 9 cross products).
+    // SAT against the 15 separating axis candidates (3 from box A, 3 from
-    // We compute every corner projection with batched 4-pair Dots: each axis
+    // box B, 9 cross products). Returns 0.0 per pair when the boxes overlap
-    // projects four corners per call, two calls per axis covers the 8 corners.
+    // and max-float when a separating axis was found, matching the
-    export inline bool IntersectionTestOrientedBoxOrientedBox(
+    // "smaller-is-closer" convention of the ray-vs-X tests.
-        VectorF32<3, 1> sizeA, MatrixRowMajor<float, 4, 3, 1> boxA,
+    //
-        VectorF32<3, 1> sizeB, MatrixRowMajor<float, 4, 3, 1> boxB
+    // The corner-free formulation: for an OBB (origin O, unit axes X/Y/Z,
    // half-extents h) and a separating-axis candidate a, the projection
    // interval is centered at O·a with radius hx|X·a| + hy|Y·a| + hz|Z·a|.
    // Each axis therefore only needs four dot products per box (and a couple
    // of fused-multiply-adds) instead of eight corner projections — every
    // sub-pair runs in parallel inside the SIMD lanes.
    export template <std::uint8_t Packing = VectorF32<3, 1>::OptimalPacking>
    inline VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)>
    IntersectionTestOrientedBoxOrientedBox(
        PackedOBBs<Packing> const& a, PackedOBBs<Packing> const& b
    ) {
-        std::array<VectorF32<3, 1>, 8> cornersA = GetOBBCorners(sizeA, boxA);
+        using PVec = VectorF32<3, Packing>;
-        std::array<VectorF32<3, 1>, 8> cornersB = GetOBBCorners(sizeB, boxB);
+        constexpr std::uint8_t N = PVec::BatchSize;
        constexpr std::uint8_t Total = Packing * N;
        using OutVec = VectorF32<1, Total>;
-        // Axes are the upper-3 lanes of each matrix row (same convention as
+        // Per-pair half-extents pulled out of each PackedOBBs into flat
-        // SphereOrientedBox). ExtractLo<3> just retypes the SIMD register; the
+        // VectorF32<1, Total> registers so they can multiply the projection
-        // 4th lane is ignored by the Len=3 ops below.
+        // dots directly.
-        std::array<VectorF32<3, 1>, 3> axesA = {
+        OutVec halfA_x = detail::ExtractComponent<0, Packing>(a.halfSize);
-            boxA.rows[0].template ExtractLo<3>(),
+        OutVec halfA_y = detail::ExtractComponent<1, Packing>(a.halfSize);
-            boxA.rows[1].template ExtractLo<3>(),
+        OutVec halfA_z = detail::ExtractComponent<2, Packing>(a.halfSize);
-            boxA.rows[2].template ExtractLo<3>(),
+        OutVec halfB_x = detail::ExtractComponent<0, Packing>(b.halfSize);
        OutVec halfB_y = detail::ExtractComponent<1, Packing>(b.halfSize);
        OutVec halfB_z = detail::ExtractComponent<2, Packing>(b.halfSize);
        constexpr float maxF = std::numeric_limits<float>::max();
        alignas(64) std::array<float, OutVec::AlignmentElement> out{};
        for (std::uint8_t i = 0; i < Total; ++i) out[i] = 0.0f; // start: overlap
        auto axesOfA = [&](std::uint8_t i) -> std::array<PVec, N> const& {
            return (i == 0) ? a.xAxis : (i == 1) ? a.yAxis : a.zAxis;
        };
-        std::array<VectorF32<3, 1>, 3> axesB = {
+        auto axesOfB = [&](std::uint8_t i) -> std::array<PVec, N> const& {
-            boxB.rows[0].template ExtractLo<3>(),
+            return (i == 0) ? b.xAxis : (i == 1) ? b.yAxis : b.zAxis;
            boxB.rows[1].template ExtractLo<3>(),
            boxB.rows[2].template ExtractLo<3>(),
        };
-        std::array<VectorF32<3, 1>, 15> axes{};
+        // For each separating-axis candidate, compute per-pair min/max for
-        axes[0] = axesA[0]; axes[1] = axesA[1]; axes[2] = axesA[2];
+        // both boxes and OR the "separating" condition into `out`.
-        axes[3] = axesB[0]; axes[4] = axesB[1]; axes[5] = axesB[2];
+        auto checkAxis = [&](std::array<PVec, N> const& axis) {
-        // Normalize all nine cross axes together with a single batched
+            OutVec cA = detail::DotArrays(a.origin, axis);
-        // Normalize call (Packing=3 not in the API, so two calls of four +
+            OutVec dA_x = detail::DotArrays(a.xAxis, axis);
-        // one of one would be needed; for now just normalize in two batches
+            OutVec dA_y = detail::DotArrays(a.yAxis, axis);
-        // of four and the trailing one inline).
+            OutVec dA_z = detail::DotArrays(a.zAxis, axis);
-        std::array<VectorF32<3, 1>, 9> crossAxes{};
+            OutVec rA = halfA_x * detail::AbsVec(dA_x)
-        std::uint8_t k = 0;
+                      + halfA_y * detail::AbsVec(dA_y)
                      + halfA_z * detail::AbsVec(dA_z);
            OutVec cB = detail::DotArrays(b.origin, axis);
            OutVec dB_x = detail::DotArrays(b.xAxis, axis);
            OutVec dB_y = detail::DotArrays(b.yAxis, axis);
            OutVec dB_z = detail::DotArrays(b.zAxis, axis);
            OutVec rB = halfB_x * detail::AbsVec(dB_x)
                      + halfB_y * detail::AbsVec(dB_y)
                      + halfB_z * detail::AbsVec(dB_z);
            OutVec minA = cA - rA;
            OutVec maxA = cA + rA;
            OutVec minB = cB - rB;
            OutVec maxB = cB + rB;
            auto minAArr = minA.template Store<float>();
            auto maxAArr = maxA.template Store<float>();
            auto minBArr = minB.template Store<float>();
            auto maxBArr = maxB.template Store<float>();
            for (std::uint8_t i = 0; i < Total; ++i) {
                // NaN comparisons (from degenerate cross axes) return false and
                // correctly leave `out[i]` untouched on this axis.
                if (maxAArr[i] < minBArr[i] || maxBArr[i] < minAArr[i]) {
                    out[i] = maxF;
                }
            }
        };
        checkAxis(a.xAxis); checkAxis(a.yAxis); checkAxis(a.zAxis);
        checkAxis(b.xAxis); checkAxis(b.yAxis); checkAxis(b.zAxis);
        // The 9 cross-product axes. Each batch slot's cross axes are computed
        // per-slot, then normalized together (one PVec::Normalize per cross
        // index processes N packed slots in parallel).
        for (std::uint8_t i = 0; i < 3; ++i) {
            auto const& aAx = axesOfA(i);
            for (std::uint8_t j = 0; j < 3; ++j) {
-                crossAxes[k++] = VectorF32<3, 1>::Cross(axesA[i], axesB[j]);
+                auto const& bAx = axesOfB(j);
                std::array<PVec, N> crossAx;
                for (std::uint8_t k = 0; k < N; ++k) crossAx[k] = PVec::Cross(aAx[k], bAx[k]);
                auto normalized = std::apply([](auto... args) {
                    return PVec::Normalize(args...);
                }, crossAx);
                checkAxis(normalized);
            }
        }
        auto norm0 = VectorF32<3, 1>::Normalize(crossAxes[0], crossAxes[1], crossAxes[2], crossAxes[3]);
        auto norm1 = VectorF32<3, 1>::Normalize(crossAxes[4], crossAxes[5], crossAxes[6], crossAxes[7]);
        auto norm2 = VectorF32<3, 1>::Normalize(crossAxes[8], crossAxes[8], crossAxes[8], crossAxes[8]);
        axes[6]  = std::get<0>(norm0);
        axes[7]  = std::get<1>(norm0);
        axes[8]  = std::get<2>(norm0);
        axes[9]  = std::get<3>(norm0);
        axes[10] = std::get<0>(norm1);
        axes[11] = std::get<1>(norm1);
        axes[12] = std::get<2>(norm1);
        axes[13] = std::get<3>(norm1);
        axes[14] = std::get<0>(norm2);
        for (std::uint8_t axisIdx = 0; axisIdx < 15; ++axisIdx) {
            VectorF32<3, 1> axis = axes[axisIdx];
            // Project all 8 corners of each box onto `axis` using two batched
            // 4-pair Dot calls (lo and hi corners).
            VectorF32<1, 4> projA_lo = VectorF32<3, 1>::Dot(
                cornersA[0], axis, cornersA[1], axis,
                cornersA[2], axis, cornersA[3], axis);
            VectorF32<1, 4> projA_hi = VectorF32<3, 1>::Dot(
                cornersA[4], axis, cornersA[5], axis,
                cornersA[6], axis, cornersA[7], axis);
            VectorF32<1, 4> projB_lo = VectorF32<3, 1>::Dot(
                cornersB[0], axis, cornersB[1], axis,
                cornersB[2], axis, cornersB[3], axis);
            VectorF32<1, 4> projB_hi = VectorF32<3, 1>::Dot(
                cornersB[4], axis, cornersB[5], axis,
                cornersB[6], axis, cornersB[7], axis);
            std::array<float, 4> aLo = projA_lo.template Store<float>();
            std::array<float, 4> aHi = projA_hi.template Store<float>();
            std::array<float, 4> bLo = projB_lo.template Store<float>();
            std::array<float, 4> bHi = projB_hi.template Store<float>();
            float minA = aLo[0], maxA = aLo[0];
            for (std::uint8_t i = 1; i < 4; ++i) {
                minA = std::min(minA, aLo[i]);
                maxA = std::max(maxA, aLo[i]);
            }
            for (std::uint8_t i = 0; i < 4; ++i) {
                minA = std::min(minA, aHi[i]);
                maxA = std::max(maxA, aHi[i]);
            }
            float minB = bLo[0], maxB = bLo[0];
            for (std::uint8_t i = 1; i < 4; ++i) {
                minB = std::min(minB, bLo[i]);
                maxB = std::max(maxB, bLo[i]);
            }
            for (std::uint8_t i = 0; i < 4; ++i) {
                minB = std::min(minB, bHi[i]);
                maxB = std::max(maxB, bHi[i]);
            }
-            if (maxA < minB || maxB < minA) return false;
+        return OutVec(out.data());
        }
        return true;
    }
 }
--- a/interfaces/Crafter.Math-MatrixRowMajor.cppm
+++ b/interfaces/Crafter.Math-MatrixRowMajor.cppm
@ -270,12 +270,12 @@ namespace Crafter {
            // R0 = up × R2 is linear in R2, so its normalized direction does
            // not depend on whether we hand R2 in before or after its own
            // normalize. Computing R0_raw from the un-normalized R2 lets us
-            // satisfy the 4-input Normalize requirement with one batched call
+            // satisfy the VectorF32<3,1>::BatchSize Normalize requirement with
-            // (duplicating R2 and R0 in the padding slots).
+            // one batched call (duplicating R2 and R0 in the padding slots).
            VectorF32<3, 1> R0Raw = VectorF32<3, 1>::Cross(upDirection, eyeDirection);
            auto normalized = VectorF32<3, 1>::Normalize(eyeDirection, R0Raw, eyeDirection, R0Raw);
-            VectorF32<3, 1> R2 = std::get<0>(normalized);
+            VectorF32<3, 1> R2 = normalized[0];
-            VectorF32<3, 1> R0 = std::get<1>(normalized);
+            VectorF32<3, 1> R0 = normalized[1];
            VectorF32<3, 1> R1 = VectorF32<3, 1>::Cross(R2, R0);
            VectorF32<3, 1> negEye = -eyePosition;
--- a/interfaces/Crafter.Math-VectorF16.cppm
+++ b/interfaces/Crafter.Math-VectorF16.cppm
@ -554,7 +554,39 @@ namespace Crafter {
            }
 		}
-        constexpr static std::tuple<VectorF16<Len, Packing>, VectorF16<Len, Packing>, VectorF16<Len, Packing>, VectorF16<Len, Packing>> Normalize(
+        // Public variadic surface — one name per op, arity locked to BatchSize
        // (or 2*BatchSize for Dot). Forwards to the *Pack helpers below which
        // carry the SIMD bodies and per-(Len,Packing) requires clauses.
        template <typename... Rest>
            requires ((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...) &&
                      (1 + sizeof...(Rest) == VectorBase<Len, Packing, _Float16>::BatchSize))
        constexpr static auto Normalize(VectorF16<Len, Packing> first, Rest... rest) {
            return NormalizePack(first, rest...);
        }
        template <typename... Rest>
            requires ((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...) &&
                      (1 + sizeof...(Rest) == VectorBase<Len, Packing, _Float16>::BatchSize))
        constexpr static auto Length(VectorF16<Len, Packing> first, Rest... rest) {
            return LengthPack(first, rest...);
        }
        template <typename... Rest>
            requires ((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...) &&
                      (1 + sizeof...(Rest) == VectorBase<Len, Packing, _Float16>::BatchSize))
        constexpr static auto LengthSq(VectorF16<Len, Packing> first, Rest... rest) {
            return LengthSqPack(first, rest...);
        }
        template <typename... Rest>
            requires ((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...) &&
                      (1 + sizeof...(Rest) == 2 * VectorBase<Len, Packing, _Float16>::BatchSize))
        constexpr static auto Dot(VectorF16<Len, Packing> first, Rest... rest) {
            return DotPack(first, rest...);
        }
        private:
        constexpr static std::array<VectorF16<Len, Packing>, VectorBase<Len, Packing, _Float16>::BatchSize> NormalizePack(
                VectorF16<Len, Packing> A,
                VectorF16<Len, Packing> C,
                VectorF16<Len, Packing> E,
@ -616,7 +648,7 @@ namespace Crafter {
            }
 		}
-        constexpr static std::tuple<VectorF16<Len, Packing>, VectorF16<Len, Packing>> Normalize(
+        constexpr static std::array<VectorF16<Len, Packing>, VectorBase<Len, Packing, _Float16>::BatchSize> NormalizePack(
                VectorF16<Len, Packing> A,
                VectorF16<Len, Packing> E
            ) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, _Float16>::AlignmentElement) {
@ -662,13 +694,13 @@ namespace Crafter {
            }
 		}
-        constexpr static VectorF16<1, Packing*4> Length(
+        constexpr static VectorF16<1, Packing*4> LengthPack(
                VectorF16<Len, Packing> A,
                VectorF16<Len, Packing> C,
                VectorF16<Len, Packing> E,
                VectorF16<Len, Packing> G
            ) requires(Len == 4 && Packing*Len == VectorBase<Len, Packing, _Float16>::AlignmentElement) {
-            VectorF16<1, Packing*4> lenghtSq = LengthSq(A, C, E, G);
+            VectorF16<1, Packing*4> lenghtSq = LengthSqPack(A, C, E, G);
            if constexpr(std::is_same_v<typename VectorBase<Len, Packing, _Float16>::VectorType, __m128h>) {
                return VectorF16<1, Packing*4>(_mm_sqrt_ph(lenghtSq.v));
            } else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, _Float16>::VectorType, __m256h>) {
@ -678,11 +710,11 @@ namespace Crafter {
            }
 		}
-        constexpr static VectorF16<1, Packing*2> Length(
+        constexpr static VectorF16<1, Packing*2> LengthPack(
                VectorF16<Len, Packing> A,
                VectorF16<Len, Packing> E
            ) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, _Float16>::AlignmentElement) {
-            VectorF16<1, Packing*2> lenghtSq = LengthSq(A, E);
+            VectorF16<1, Packing*2> lenghtSq = LengthSqPack(A, E);
            if constexpr(std::is_same_v<typename VectorBase<Len, Packing, _Float16>::VectorType, __m128h>) {
                return VectorF16<1, Packing*2>(_mm_sqrt_ph(lenghtSq.v));
            } else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, _Float16>::VectorType, __m256h>) {
@ -692,23 +724,23 @@ namespace Crafter {
            }
 		}
-        constexpr static VectorF16<1, Packing*4> LengthSq(
+        constexpr static VectorF16<1, Packing*4> LengthSqPack(
                VectorF16<Len, Packing> A,
                VectorF16<Len, Packing> C,
                VectorF16<Len, Packing> E,
                VectorF16<Len, Packing> G
            ) requires(Len == 4 && Packing*Len == VectorBase<Len, Packing, _Float16>::AlignmentElement) {
-            return Dot(A, A, C, C, E, E, G, G);
+            return DotPack(A, A, C, C, E, E, G, G);
 		}
-        constexpr static VectorF16<1, Packing*2> LengthSq(
+        constexpr static VectorF16<1, Packing*2> LengthSqPack(
                VectorF16<Len, Packing> A,
                VectorF16<Len, Packing> E
            ) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, _Float16>::AlignmentElement) {
-            return Dot(A, A, E, E);
+            return DotPack(A, A, E, E);
 		}
-        constexpr static VectorF16<1, Packing*4> Dot(
+        constexpr static VectorF16<1, Packing*4> DotPack(
                VectorF16<Len, Packing> A0, VectorF16<Len, Packing> A1, 
                VectorF16<Len, Packing> C0, VectorF16<Len, Packing> C1, 
                VectorF16<Len, Packing> E0, VectorF16<Len, Packing> E1, 
@ -744,7 +776,7 @@ namespace Crafter {
            }
        }
-        constexpr static VectorF16<1, Packing*2> Dot(
+        constexpr static VectorF16<1, Packing*2> DotPack(
                VectorF16<Len, Packing> A0, VectorF16<Len, Packing> A1, 
                VectorF16<Len, Packing> E0, VectorF16<Len, Packing> E1
            ) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, _Float16>::AlignmentElement) {
@ -1200,9 +1232,10 @@ namespace Crafter {
        }
        template<typename... Rest>
-            requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...))
+            requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...) &&
                     (1 + sizeof...(Rest) == VectorBase<Len, Packing, _Float16>::BatchSize))
        constexpr static auto LengthSq(VectorF16<Len, Packing> first, Rest... rest) {
-            constexpr std::uint8_t N = 1 + sizeof...(Rest);
+            constexpr std::uint8_t N = VectorBase<Len, Packing, _Float16>::BatchSize;
            VectorF16<1, static_cast<std::uint8_t>(Packing * N)> r;
            std::array<VectorF16<Len, Packing>, N> args{ first, rest... };
            for (std::uint8_t i = 0; i < N; ++i)
@ -1218,7 +1251,8 @@ namespace Crafter {
        }
        template<typename... Rest>
-            requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...))
+            requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...) &&
                     (1 + sizeof...(Rest) == VectorBase<Len, Packing, _Float16>::BatchSize))
        constexpr static auto Length(VectorF16<Len, Packing> first, Rest... rest) {
            auto sq = LengthSq(first, rest...);
            for (std::uint8_t i = 0; i < decltype(sq)::NElems; ++i)
@ -1227,7 +1261,8 @@ namespace Crafter {
        }
        template<typename... Rest>
-            requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...))
+            requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...) &&
                     (1 + sizeof...(Rest) == VectorBase<Len, Packing, _Float16>::BatchSize))
        constexpr static auto Normalize(VectorF16<Len, Packing> first, Rest... rest) {
            auto normOne = [](VectorF16<Len, Packing> u) {
                VectorF16<Len, Packing> out;
@ -1243,7 +1278,7 @@ namespace Crafter {
                }
                return out;
            };
-            return std::make_tuple(normOne(first), normOne(rest)...);
+            return std::array<VectorF16<Len, Packing>, VectorBase<Len, Packing, _Float16>::BatchSize>{ normOne(first), normOne(rest)... };
        }
        constexpr static VectorF16<Len, Packing> Rotate(VectorF16<3, Packing> v, VectorF16<4, Packing> q) requires(Len == 3) {
--- a/interfaces/Crafter.Math-VectorF32.cppm
+++ b/interfaces/Crafter.Math-VectorF32.cppm
@ -449,7 +449,7 @@ namespace Crafter {
        }
        template <std::array<bool, Len> values>
-        constexpr VectorF32<Len, Packing> Negate() {
+        constexpr VectorF32<Len, Packing> Negate() const {
            std::array<float, VectorBase<Len, Packing, float>::AlignmentElement> mask = VectorBase<Len, Packing, float>::template GetNegateMask<values>();
            if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
                return VectorF32<Len, Packing>(_mm_castsi128_ps(_mm_xor_si128(_mm_castps_si128(this->v), _mm_loadu_si128(reinterpret_cast<__m128i*>(mask.data())))));
@ -549,7 +549,39 @@ namespace Crafter {
            }
        }
-        constexpr static std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>, VectorF32<Len, Packing>, VectorF32<Len, Packing>> Normalize(
+        // Public variadic surface — one name per op, arity locked to BatchSize.
        // The Pack helpers below carry the SIMD bodies and the per-(Len,Packing)
        // requires clauses; this wrapper just forwards once arity matches.
        template <typename... Rest>
            requires ((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
                      (1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
        constexpr static auto Normalize(VectorF32<Len, Packing> first, Rest... rest) {
            return NormalizePack(first, rest...);
        }
        template <typename... Rest>
            requires ((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
                      (1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
        constexpr static auto Length(VectorF32<Len, Packing> first, Rest... rest) {
            return LengthPack(first, rest...);
        }
        template <typename... Rest>
            requires ((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
                      (1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
        constexpr static auto LengthSq(VectorF32<Len, Packing> first, Rest... rest) {
            return LengthSqPack(first, rest...);
        }
        template <typename... Rest>
            requires ((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
                      (1 + sizeof...(Rest) == 2 * VectorBase<Len, Packing, float>::BatchSize))
        constexpr static auto Dot(VectorF32<Len, Packing> first, Rest... rest) {
            return DotPack(first, rest...);
        }
        private:
        constexpr static std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize> NormalizePack(
                VectorF32<Len, Packing> A,
                VectorF32<Len, Packing> B,
                VectorF32<Len, Packing> C,
@ -614,7 +646,7 @@ namespace Crafter {
            }
 		}
-        constexpr static std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>, VectorF32<Len, Packing>, VectorF32<Len, Packing>> Normalize(
+        constexpr static std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize> NormalizePack(
                VectorF32<Len, Packing> A,
                VectorF32<Len, Packing> B,
                VectorF32<Len, Packing> C,
@ -638,7 +670,7 @@ namespace Crafter {
            };
 		}
-        constexpr static std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>, VectorF32<Len, Packing>, VectorF32<Len, Packing>> Normalize(
+        constexpr static std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize> NormalizePack(
                VectorF32<Len, Packing> A,
                VectorF32<Len, Packing> B,
                VectorF32<Len, Packing> C,
@ -663,7 +695,7 @@ namespace Crafter {
 		}
        #ifdef __AVX512F__
-        constexpr static std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>, VectorF32<Len, Packing>> Normalize(
+        constexpr static std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize> NormalizePack(
                VectorF32<Len, Packing> A,
                VectorF32<Len, Packing> B,
                VectorF32<Len, Packing> C
@ -685,7 +717,7 @@ namespace Crafter {
 		}
        #endif
-        constexpr static std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>> Normalize(
+        constexpr static std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize> NormalizePack(
                VectorF32<Len, Packing> A,
                VectorF32<Len, Packing> B
            ) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
@ -733,13 +765,13 @@ namespace Crafter {
            }
 		}
-        constexpr static VectorF32<1, Packing*4> Length(
+        constexpr static VectorF32<1, Packing*4> LengthPack(
                VectorF32<Len, Packing> A,
                VectorF32<Len, Packing> B,
                VectorF32<Len, Packing> C,
                VectorF32<Len, Packing> D
            ) requires(Len == 4 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
-            VectorF32<1, Packing*4> lenghtSq = LengthSq(A, B, C, D);
+            VectorF32<1, Packing*4> lenghtSq = LengthSqPack(A, B, C, D);
            if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
                return VectorF32<1, Packing*4>(_mm_sqrt_ps(lenghtSq.v));
            } else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
@ -749,42 +781,42 @@ namespace Crafter {
            }
 		}
-        constexpr static VectorF32<1, 4> Length(
+        constexpr static VectorF32<1, 4> LengthPack(
            VectorF32<Len, Packing> A,
            VectorF32<Len, Packing> B,
            VectorF32<Len, Packing> C,
            VectorF32<Len, Packing> D
        ) requires(Len == 3 && Packing == 1) {
-            VectorF32<1, 4> lenghtSq = LengthSq(A, B, C, D);
+            VectorF32<1, 4> lenghtSq = LengthSqPack(A, B, C, D);
            return VectorF32<1, 4>(_mm_sqrt_ps(lenghtSq.v));
        }
-        constexpr static VectorF32<1, 8> Length(
+        constexpr static VectorF32<1, 8> LengthPack(
            VectorF32<Len, Packing> A,
            VectorF32<Len, Packing> B,
            VectorF32<Len, Packing> C,
            VectorF32<Len, Packing> D
        ) requires(Len == 3 && Packing == 2) {
-            VectorF32<1, 8> lenghtSq = LengthSq(A, B, C, D);
+            VectorF32<1, 8> lenghtSq = LengthSqPack(A, B, C, D);
            return VectorF32<1, Packing*4>(_mm256_sqrt_ps(lenghtSq.v));
        }
        #ifdef __AVX512F__
-        constexpr static VectorF32<1, 15> Length(
+        constexpr static VectorF32<1, 15> LengthPack(
            VectorF32<Len, Packing> A,
            VectorF32<Len, Packing> B,
            VectorF32<Len, Packing> C
        ) requires(Len == 3 && Packing == 5) {
-            VectorF32<1, 15> lenghtSq = LengthSq(A, B, C);
+            VectorF32<1, 15> lenghtSq = LengthSqPack(A, B, C);
            return VectorF32<1, 15>(_mm512_sqrt_ps(lenghtSq.v));
        }
        #endif
-        constexpr static VectorF32<1, Packing*2> Length(
+        constexpr static VectorF32<1, Packing*2> LengthPack(
                VectorF32<Len, Packing> A,
                VectorF32<Len, Packing> C
            ) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
-            VectorF32<1, Packing*2> lenghtSq = LengthSq(A, C);
+            VectorF32<1, Packing*2> lenghtSq = LengthSqPack(A, C);
            if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
                return VectorF32<1, Packing*2>(_mm_sqrt_ps(lenghtSq.v));
            } else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
@ -796,51 +828,51 @@ namespace Crafter {
            }
 		}
-        constexpr static VectorF32<1, Packing*4> LengthSq(
+        constexpr static VectorF32<1, Packing*4> LengthSqPack(
                VectorF32<Len, Packing> A,
                VectorF32<Len, Packing> B,
                VectorF32<Len, Packing> C,
                VectorF32<Len, Packing> D
            ) requires(Len == 4 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
-            return Dot(A, A, B, B, C, C, D, D);
+            return DotPack(A, A, B, B, C, C, D, D);
 		}
-        constexpr static VectorF32<1, 4> LengthSq(
+        constexpr static VectorF32<1, 4> LengthSqPack(
            VectorF32<Len, Packing> A,
            VectorF32<Len, Packing> B,
            VectorF32<Len, Packing> C,
            VectorF32<Len, Packing> D
        ) requires(Len == 3 && Packing == 1) {
-            return Dot(A, A, B, B, C, C, D, D);
+            return DotPack(A, A, B, B, C, C, D, D);
        }
-        constexpr static VectorF32<1, 8> LengthSq(
+        constexpr static VectorF32<1, 8> LengthSqPack(
            VectorF32<Len, Packing> A,
            VectorF32<Len, Packing> B,
            VectorF32<Len, Packing> C,
            VectorF32<Len, Packing> D
        ) requires(Len == 3 && Packing == 2) {
-            return Dot(A, A, B, B, C, C, D, D);
+            return DotPack(A, A, B, B, C, C, D, D);
        }
        #ifdef __AVX512F__
-        constexpr static VectorF32<1, 15> LengthSq(
+        constexpr static VectorF32<1, 15> LengthSqPack(
            VectorF32<Len, Packing> A,
            VectorF32<Len, Packing> B,
            VectorF32<Len, Packing> C
        ) requires(Len == 3 && Packing == 5) {
-            return Dot(A, A, B, B, C, C);
+            return DotPack(A, A, B, B, C, C);
        }
        #endif
-        constexpr static VectorF32<1, Packing*2> LengthSq(
+        constexpr static VectorF32<1, Packing*2> LengthSqPack(
                VectorF32<Len, Packing> A,
                VectorF32<Len, Packing> C
            ) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
-            return Dot(A, A, C, C);
+            return DotPack(A, A, C, C);
 		}
-        constexpr static VectorF32<1, Packing*4> Dot(
+        constexpr static VectorF32<1, Packing*4> DotPack(
                VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1, 
                VectorF32<Len, Packing> B0, VectorF32<Len, Packing> B1, 
                VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1, 
@ -869,7 +901,7 @@ namespace Crafter {
            }
        }
-        constexpr static VectorF32<1, 4> Dot(
+        constexpr static VectorF32<1, 4> DotPack(
            VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
            VectorF32<Len, Packing> B0, VectorF32<Len, Packing> B1,
            VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1,
@ -914,7 +946,7 @@ namespace Crafter {
            return row1;
        }
-        constexpr static VectorF32<1, 8> Dot(
+        constexpr static VectorF32<1, 8> DotPack(
            VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
            VectorF32<Len, Packing> B0, VectorF32<Len, Packing> B1,
            VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1,
@ -1021,7 +1053,7 @@ namespace Crafter {
        }
        #ifdef __AVX512F__
-        constexpr static VectorF32<1, 15> Dot(
+        constexpr static VectorF32<1, 15> DotPack(
            VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
            VectorF32<Len, Packing> B0, VectorF32<Len, Packing> B1,
            VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1
@ -1112,7 +1144,7 @@ namespace Crafter {
        }
        #endif
-        constexpr static VectorF32<1, Packing*2> Dot(
+        constexpr static VectorF32<1, Packing*2> DotPack(
                VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
                VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1
            ) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
@ -1548,9 +1580,10 @@ namespace Crafter {
        }
        template<typename... Rest>
-            requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
+            requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
                     (1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
        constexpr static auto LengthSq(VectorF32<Len, Packing> first, Rest... rest) {
-            constexpr std::uint8_t N = 1 + sizeof...(Rest);
+            constexpr std::uint8_t N = VectorBase<Len, Packing, float>::BatchSize;
            VectorF32<1, static_cast<std::uint8_t>(Packing * N)> r;
            std::array<VectorF32<Len, Packing>, N> args{ first, rest... };
            alignas(16) float buf[4] = {0,0,0,0};
@ -1571,41 +1604,39 @@ namespace Crafter {
        }
        template<typename... Rest>
-            requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
+            requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
                     (1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
        constexpr static auto Length(VectorF32<Len, Packing> first, Rest... rest) {
            auto sq = LengthSq(first, rest...);
            sq.v = wasm_f32x4_sqrt(sq.v);
            return sq;
        }
-        // Four pairwise dot products packed into one v128. Only the first Len
+        // Pairwise dot products packed into one v128. Only the first Len
        // lanes contribute, so the same routine handles 3- and 4-component
        // inputs — the 4th lane of Len==3 inputs may be garbage from Cross()
-        // and must not be summed.
+        // and must not be summed. Takes BatchSize pairs (== 4 here since
-        constexpr static VectorF32<1, 4> Dot(
+        // WASM AlignmentElement is always 4 and Packing must be 1).
-            VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
+        template<typename... Rest>
-            VectorF32<Len, Packing> B0, VectorF32<Len, Packing> B1,
+            requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
-            VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1,
+                     (1 + sizeof...(Rest) == 2 * VectorBase<Len, Packing, float>::BatchSize) &&
-            VectorF32<Len, Packing> D0, VectorF32<Len, Packing> D1
+                     (Len == 3 || Len == 4) && Packing == 1)
-        ) requires((Len == 3 || Len == 4) && Packing == 1) {
+        constexpr static VectorF32<1, 4> Dot(VectorF32<Len, Packing> first, Rest... rest) {
-            alignas(16) float a0[4], a1[4], b0[4], b1[4], c0[4], c1[4], d0[4], d1[4];
+            constexpr std::uint8_t N = VectorBase<Len, Packing, float>::BatchSize;
-            wasm_v128_store(a0, A0.v); wasm_v128_store(a1, A1.v);
+            std::array<VectorF32<Len, Packing>, 2 * N> args{ first, rest... };
            wasm_v128_store(b0, B0.v); wasm_v128_store(b1, B1.v);
            wasm_v128_store(c0, C0.v); wasm_v128_store(c1, C1.v);
            wasm_v128_store(d0, D0.v); wasm_v128_store(d1, D1.v);
            alignas(16) float out[4] = {0,0,0,0};
-            for (std::uint8_t k = 0; k < Len; ++k) {
+            for (std::uint8_t i = 0; i < N; ++i) {
-                out[0] += a0[k] * a1[k];
+                alignas(16) float a[4], b[4];
-                out[1] += b0[k] * b1[k];
+                wasm_v128_store(a, args[2 * i].v);
-                out[2] += c0[k] * c1[k];
+                wasm_v128_store(b, args[2 * i + 1].v);
-                out[3] += d0[k] * d1[k];
+                for (std::uint8_t k = 0; k < Len; ++k) out[i] += a[k] * b[k];
            }
            return VectorF32<1, 4>(wasm_v128_load(out));
        }
        template<typename... Rest>
-            requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
+            requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
                     (1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
        constexpr static auto Normalize(VectorF32<Len, Packing> first, Rest... rest) {
            auto normOne = [](VectorF32<Len, Packing> u) {
                alignas(16) float tmp[4]; wasm_v128_store(tmp, u.v);
@ -1622,7 +1653,7 @@ namespace Crafter {
                }
                return VectorF32<Len, Packing>(wasm_v128_load(out));
            };
-            return std::make_tuple(normOne(first), normOne(rest)...);
+            return std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize>{ normOne(first), normOne(rest)... };
        }
        constexpr static VectorF32<Len, Packing> Rotate(VectorF32<3, Packing> v, VectorF32<4, Packing> q) requires(Len == 3) {
@ -1842,9 +1873,10 @@ namespace Crafter {
        }
        template<typename... Rest>
-            requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
+            requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
                     (1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
        constexpr static auto LengthSq(VectorF32<Len, Packing> first, Rest... rest) {
-            constexpr std::uint8_t N = 1 + sizeof...(Rest);
+            constexpr std::uint8_t N = VectorBase<Len, Packing, float>::BatchSize;
            VectorF32<1, static_cast<std::uint8_t>(Packing * N)> r;
            std::array<VectorF32<Len, Packing>, N> args{ first, rest... };
            for (std::uint8_t i = 0; i < N; ++i)
@ -1860,7 +1892,8 @@ namespace Crafter {
        }
        template<typename... Rest>
-            requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
+            requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
                     (1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
        constexpr static auto Length(VectorF32<Len, Packing> first, Rest... rest) {
            auto sq = LengthSq(first, rest...);
            for (std::uint8_t i = 0; i < decltype(sq)::NElems; ++i) sq.v[i] = std::sqrt(sq.v[i]);
@ -1868,7 +1901,8 @@ namespace Crafter {
        }
        template<typename... Rest>
-            requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
+            requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
                     (1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
        constexpr static auto Normalize(VectorF32<Len, Packing> first, Rest... rest) {
            auto normOne = [](VectorF32<Len, Packing> u) {
                VectorF32<Len, Packing> out;
@ -1884,7 +1918,7 @@ namespace Crafter {
                }
                return out;
            };
-            return std::make_tuple(normOne(first), normOne(rest)...);
+            return std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize>{ normOne(first), normOne(rest)... };
        }
        constexpr static VectorF32<Len, Packing> Rotate(VectorF32<3, Packing> v, VectorF32<4, Packing> q) requires(Len == 3) {
--- a/tests/Intersection.cpp
+++ b/tests/Intersection.cpp
@ -40,134 +40,390 @@ VectorF32<4, 1> Vec4(float x, float y, float z, float w) {
    return VectorF32<4, 1>(buf);
 }
-VectorF32<1, 4> Vec1x4(float a, float b, float c, float d) {
+// Pack Total = Packing * N vec3 records into N packed VectorF32<3, Packing>s.
-    alignas(16) float buf[4] = { a, b, c, d };
+// `data[i]` is the i-th sub-primitive's three components in [x, y, z] order;
-    return VectorF32<1, 4>(buf);
+// the helper places `data[batch*Packing + sub]` into the `sub`-th slot of
 // `result[batch]`. Records beyond `data.size()` are left as zeros.
 template <std::uint8_t Packing>
 std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize>
 PackVec3(std::span<const std::array<float, 3>> data) {
    constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
    std::array<VectorF32<3, Packing>, N> result;
    for (std::uint8_t b = 0; b < N; ++b) {
        alignas(64) float buf[VectorF32<3, Packing>::AlignmentElement] = {};
        for (std::uint8_t s = 0; s < Packing; ++s) {
            std::size_t idx = static_cast<std::size_t>(b) * Packing + s;
            if (idx < data.size()) {
                buf[s * 3 + 0] = data[idx][0];
                buf[s * 3 + 1] = data[idx][1];
                buf[s * 3 + 2] = data[idx][2];
            }
        }
        result[b] = VectorF32<3, Packing>(buf);
    }
    return result;
 }
-// Möller-Trumbore in this codebase rejects det <= eps, so triangles must be
+// Same idea for vec4 records (quaternions).
-// wound so their geometric normal opposes the ray direction. For rays going +Z
+template <std::uint8_t Packing>
-// that means clockwise from a +Z viewer.
+std::array<VectorF32<4, Packing>, VectorF32<3, Packing>::BatchSize>
-std::string* TestRayTriangle() {
+PackVec4MatchingVec3Batch(std::span<const std::array<float, 4>> data) {
    constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
    std::array<VectorF32<4, Packing>, N> result;
    for (std::uint8_t b = 0; b < N; ++b) {
        alignas(64) float buf[VectorF32<4, Packing>::AlignmentElement] = {};
        for (std::uint8_t s = 0; s < Packing; ++s) {
            std::size_t idx = static_cast<std::size_t>(b) * Packing + s;
            if (idx < data.size()) {
                buf[s * 4 + 0] = data[idx][0];
                buf[s * 4 + 1] = data[idx][1];
                buf[s * 4 + 2] = data[idx][2];
                buf[s * 4 + 3] = data[idx][3];
            }
        }
        result[b] = VectorF32<4, Packing>(buf);
    }
    return result;
 }
 // Pack `Total` scalars into a VectorF32<1, Total>.
 template <std::uint8_t Total>
 VectorF32<1, Total> PackScalars(std::span<const float> data) {
    alignas(64) float buf[VectorF32<1, Total>::AlignmentElement] = {};
    for (std::size_t i = 0; i < data.size() && i < Total; ++i) buf[i] = data[i];
    return VectorF32<1, Total>(buf);
 }
 template <std::uint8_t Packing>
 std::string* TestRayTriangleN() {
    constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
    constexpr std::uint8_t Total = Packing * N;
    VectorF32<3, 1> rayOrigin = Vec3(0, 0, -5);
    VectorF32<3, 1> rayDir    = Vec3(0, 0, 1);
-    // A: hits at z=0, t=5 (front-facing).
+    // Cycle of four triangle patterns, repeated to fill Total slots:
-    VectorF32<3, 1> a0 = Vec3(-1, -1, 0), a1 = Vec3(0, 1, 0), a2 = Vec3(1, -1, 0);
+    //   0: hits at z=0 (t=5)
-    // B: hits at z=10, t=15.
+    //   1: hits at z=10 (t=15)
-    VectorF32<3, 1> b0 = Vec3(-1, -1, 10), b1 = Vec3(0, 1, 10), b2 = Vec3(1, -1, 10);
+    //   2: front-facing but off to the side - u/v rejected (miss)
-    // C: front-facing triangle far off to the side - u or v out of [0,1].
+    //   3: parallel to the ray (miss)
-    VectorF32<3, 1> c0 = Vec3(99, -1, 0), c1 = Vec3(100, 1, 0), c2 = Vec3(101, -1, 0);
+    constexpr std::array<std::array<float, 3>, 4> v0_pat = {{
-    // D: triangle parallel to the ray (all vertices share y=2; ray lives in y=0).
+        {-1, -1,  0}, {-1, -1, 10}, { 99, -1,  0}, {-1,  2, -1}
-    VectorF32<3, 1> d0 = Vec3(-1, 2, -1), d1 = Vec3(1, 2, 1), d2 = Vec3(0, 2, 2);
+    }};
    constexpr std::array<std::array<float, 3>, 4> v1_pat = {{
        { 0,  1,  0}, { 0,  1, 10}, {100,  1,  0}, { 1,  2,  1}
    }};
    constexpr std::array<std::array<float, 3>, 4> v2_pat = {{
        { 1, -1,  0}, { 1, -1, 10}, {101, -1,  0}, { 0,  2,  2}
    }};
    constexpr std::array<float, 4> expected_pat = { 5.0f, 15.0f, kMaxF, kMaxF };
-    VectorF32<1, 4> t = IntersectionTestRayTriangle(rayOrigin, rayDir,
+    std::array<std::array<float, 3>, Total> v0Data, v1Data, v2Data;
-        a0, a1, a2,
+    for (std::uint8_t i = 0; i < Total; ++i) {
-        b0, b1, b2,
+        v0Data[i] = v0_pat[i % 4];
-        c0, c1, c2,
+        v1Data[i] = v1_pat[i % 4];
-        d0, d1, d2);
+        v2Data[i] = v2_pat[i % 4];
-    std::array<float, 4> s = t.template Store<float>();
+    }
    auto v0 = PackVec3<Packing>(v0Data);
    auto v1 = PackVec3<Packing>(v1Data);
    auto v2 = PackVec3<Packing>(v2Data);
-    if (!FloatEquals(s[0], 5.0f))
+    auto t = IntersectionTestRayTriangle<Packing>(rayOrigin, rayDir, v0, v1, v2);
-        return new std::string(std::format("RayTriangle A: expected 5, got {}", s[0]));
+    auto stored = t.template Store<float>();
    if (!FloatEquals(s[1], 15.0f))
        return new std::string(std::format("RayTriangle B: expected 15, got {}", s[1]));
    if (s[2] != kMaxF)
        return new std::string(std::format("RayTriangle C: expected max (miss), got {}", s[2]));
    if (s[3] != kMaxF)
        return new std::string(std::format("RayTriangle D: expected max (parallel miss), got {}", s[3]));
    return nullptr;
 }
-std::string* TestRayTriangleBackFacing() {
+    for (std::uint8_t i = 0; i < Total; ++i) {
-    // Same A vertices but CCW from +Z viewer -> back-facing for +Z ray -> miss.
+        float expected = expected_pat[i % 4];
-    VectorF32<3, 1> rayOrigin = Vec3(0, 0, -5);
+        float got = stored[i];
-    VectorF32<3, 1> rayDir    = Vec3(0, 0, 1);
+        if (expected == kMaxF) {
-    VectorF32<3, 1> v0 = Vec3(-1, -1, 0), v1 = Vec3(1, -1, 0), v2 = Vec3(0, 1, 0);
+            if (got != kMaxF)
-
+                return new std::string(std::format(
-    VectorF32<1, 4> t = IntersectionTestRayTriangle(rayOrigin, rayDir,
+                    "RayTriangle<{}> tri {}: expected miss, got {}", Packing, i, got));
-        v0, v1, v2,
+        } else if (!FloatEquals(got, expected)) {
-        v0, v1, v2,
+            return new std::string(std::format(
-        v0, v1, v2,
+                "RayTriangle<{}> tri {}: expected {}, got {}", Packing, i, expected, got));
-        v0, v1, v2);
+        }
    std::array<float, 4> s = t.template Store<float>();
    for (std::uint8_t i = 0; i < 4; ++i) {
        if (s[i] != kMaxF)
            return new std::string(std::format("RayTriangle back-facing lane {}: expected max, got {}", i, s[i]));
    }
    return nullptr;
 }
-std::string* TestRaySphere() {
+template <std::uint8_t Packing>
-    VectorF32<3, 1> rayOrigin = Vec3(0, 0, -10);
+std::string* TestRayTriangleBackFacingN() {
    constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
    constexpr std::uint8_t Total = Packing * N;
    // Same vertices as the front-facing case but wound CCW from +Z (back-facing
    // for a +Z ray) - all sub-primitives should miss.
    std::array<std::array<float, 3>, Total> v0Data, v1Data, v2Data;
    for (std::uint8_t i = 0; i < Total; ++i) {
        v0Data[i] = {-1, -1, 0};
        v1Data[i] = { 1, -1, 0};
        v2Data[i] = { 0,  1, 0};
    }
    auto v0 = PackVec3<Packing>(v0Data);
    auto v1 = PackVec3<Packing>(v1Data);
    auto v2 = PackVec3<Packing>(v2Data);
    VectorF32<3, 1> rayOrigin = Vec3(0, 0, -5);
    VectorF32<3, 1> rayDir    = Vec3(0, 0, 1);
-
+    auto t = IntersectionTestRayTriangle<Packing>(rayOrigin, rayDir, v0, v1, v2);
-    // A: sphere at origin radius 2 - first hit at z=-2, t=8.
+    auto stored = t.template Store<float>();
-    VectorF32<3, 1> posA = Vec3(0, 0, 0);
+    for (std::uint8_t i = 0; i < Total; ++i) {
-    // B: sphere at (0,0,20) radius 1 - first hit at z=19, t=29.
+        if (stored[i] != kMaxF)
-    VectorF32<3, 1> posB = Vec3(0, 0, 20);
+            return new std::string(std::format(
-    // C: sphere off to the side, ray misses.
+                "RayTriangle back-facing<{}> tri {}: expected max, got {}",
-    VectorF32<3, 1> posC = Vec3(10, 10, 0);
+                Packing, i, stored[i]));
-    // D: sphere behind the ray origin.
+    }
    VectorF32<3, 1> posD = Vec3(0, 0, -50);
    VectorF32<1, 4> radii = Vec1x4(2.0f, 1.0f, 0.5f, 1.0f);
    VectorF32<1, 4> t = IntersectionTestRaySphere(rayOrigin, rayDir,
        posA, posB, posC, posD, radii);
    std::array<float, 4> s = t.template Store<float>();
    if (!FloatEquals(s[0], 8.0f))
        return new std::string(std::format("RaySphere A: expected 8, got {}", s[0]));
    if (!FloatEquals(s[1], 29.0f))
        return new std::string(std::format("RaySphere B: expected 29, got {}", s[1]));
    if (s[2] != kMaxF)
        return new std::string(std::format("RaySphere C: expected max (miss), got {}", s[2]));
    if (s[3] != kMaxF)
        return new std::string(std::format("RaySphere D: expected max (behind), got {}", s[3]));
    return nullptr;
 }
-std::string* TestRayOrientedBox() {
+template <std::uint8_t Packing>
 std::string* TestRaySphereN() {
    constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
    constexpr std::uint8_t Total = Packing * N;
    VectorF32<3, 1> rayOrigin = Vec3(0, 0, -10);
    VectorF32<3, 1> rayDir    = Vec3(0, 0, 1);
    // Cycle of four sphere patterns:
    //   0: at origin, r=2, first hit z=-2, t=8
    //   1: at (0,0,20), r=1, first hit z=19, t=29
    //   2: off-axis at (10,10,0), r=0.5, miss
    //   3: behind ray at (0,0,-50), r=1, miss
    constexpr std::array<std::array<float, 3>, 4> pos_pat = {{
        { 0,  0,   0}, { 0,  0,  20}, {10, 10,   0}, { 0,  0, -50}
    }};
    constexpr std::array<float, 4> radii_pat = { 2.0f, 1.0f, 0.5f, 1.0f };
    constexpr std::array<float, 4> expected_pat = { 8.0f, 29.0f, kMaxF, kMaxF };
    std::array<std::array<float, 3>, Total> posData;
    std::array<float, Total> radiiData;
    for (std::uint8_t i = 0; i < Total; ++i) {
        posData[i] = pos_pat[i % 4];
        radiiData[i] = radii_pat[i % 4];
    }
    auto pos = PackVec3<Packing>(posData);
    auto radii = PackScalars<Total>(radiiData);
    auto t = IntersectionTestRaySphere<Packing>(rayOrigin, rayDir, pos, radii);
    auto stored = t.template Store<float>();
    for (std::uint8_t i = 0; i < Total; ++i) {
        float expected = expected_pat[i % 4];
        float got = stored[i];
        if (expected == kMaxF) {
            if (got != kMaxF)
                return new std::string(std::format(
                    "RaySphere<{}> sph {}: expected miss, got {}", Packing, i, got));
        } else if (!FloatEquals(got, expected)) {
            return new std::string(std::format(
                "RaySphere<{}> sph {}: expected {}, got {}", Packing, i, expected, got));
        }
    }
    return nullptr;
 }
 template <std::uint8_t Packing>
 std::string* TestRayOrientedBoxN() {
    constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
    constexpr std::uint8_t Total = Packing * N;
    VectorF32<3, 1> rayOrigin = Vec3(0, 0, -5);
    VectorF32<3, 1> rayDir    = Vec3(0, 0, 1);
    // Identity quaternion (axis-aligned).
    VectorF32<4, 1> idQ = Vec4(0, 0, 0, 1);
-    // Note: RayOrientedBox treats `size` as the *full* extent (it computes
+    // Cycle of four AABB-as-OBB patterns (identity rotation):
-    // halfExtents = size * 0.5 internally). So size=2 means the box spans
+    //   0: at origin, full size 2, enters z=-1 -> t=4
-    // [-1, 1] in each axis. (SphereOrientedBox uses the opposite convention.)
+    //   1: at (0,0,10), full size 2, enters z=9 -> t=14
-    //
+    //   2: off-axis at (50,0,0) -> miss
-    // A: box at origin size 2 (half 1) -> ray enters at z=-1, t=4.
+    //   3: behind ray at (0,0,-50) -> miss
-    VectorF32<3, 1> posA = Vec3(0, 0, 0), sizeA = Vec3(2, 2, 2);
+    constexpr std::array<std::array<float, 3>, 4> pos_pat = {{
-    // B: box at (0,0,10) size 2 (half 1) -> ray enters at z=9, t=14.
+        { 0,  0,   0}, { 0,  0,  10}, {50,  0,   0}, { 0,  0, -50}
-    VectorF32<3, 1> posB = Vec3(0, 0, 10), sizeB = Vec3(2, 2, 2);
+    }};
-    // C: box off to the side - miss.
+    constexpr std::array<float, 3> size_one = { 2, 2, 2 };
-    VectorF32<3, 1> posC = Vec3(50, 0, 0), sizeC = Vec3(2, 2, 2);
+    constexpr std::array<float, 4> idQ = { 0, 0, 0, 1 };
-    // D: box behind ray - miss.
+    constexpr std::array<float, 4> expected_pat = { 4.0f, 14.0f, kMaxF, kMaxF };
    VectorF32<3, 1> posD = Vec3(0, 0, -50), sizeD = Vec3(2, 2, 2);
-    VectorF32<1, 4> t = IntersectionTestRayOrientedBox(rayOrigin, rayDir,
+    std::array<std::array<float, 3>, Total> posData;
-        posA, sizeA, idQ,
+    std::array<std::array<float, 3>, Total> sizeData;
-        posB, sizeB, idQ,
+    std::array<std::array<float, 4>, Total> rotData;
-        posC, sizeC, idQ,
+    for (std::uint8_t i = 0; i < Total; ++i) {
-        posD, sizeD, idQ);
+        posData[i]  = pos_pat[i % 4];
-    std::array<float, 4> s = t.template Store<float>();
+        sizeData[i] = size_one;
        rotData[i]  = idQ;
    }
    auto pos  = PackVec3<Packing>(posData);
    auto size = PackVec3<Packing>(sizeData);
    auto rot  = PackVec4MatchingVec3Batch<Packing>(rotData);
-    if (!FloatEquals(s[0], 4.0f))
+    auto t = IntersectionTestRayOrientedBox<Packing>(rayOrigin, rayDir, pos, size, rot);
-        return new std::string(std::format("RayOrientedBox A: expected 4, got {}", s[0]));
+    auto stored = t.template Store<float>();
-    if (!FloatEquals(s[1], 14.0f))
+
-        return new std::string(std::format("RayOrientedBox B: expected 14, got {}", s[1]));
+    for (std::uint8_t i = 0; i < Total; ++i) {
-    if (s[2] != kMaxF)
+        float expected = expected_pat[i % 4];
-        return new std::string(std::format("RayOrientedBox C: expected max (miss), got {}", s[2]));
+        float got = stored[i];
-    if (s[3] != kMaxF)
+        if (expected == kMaxF) {
-        return new std::string(std::format("RayOrientedBox D: expected max (behind), got {}", s[3]));
+            if (got != kMaxF)
                return new std::string(std::format(
                    "RayOrientedBox<{}> box {}: expected miss, got {}", Packing, i, got));
        } else if (!FloatEquals(got, expected)) {
            return new std::string(std::format(
                "RayOrientedBox<{}> box {}: expected {}, got {}", Packing, i, expected, got));
        }
    }
    return nullptr;
 }
 // Helper: pack a homogeneous array of OBB descriptors into a PackedOBBs<Packing>.
 template <std::uint8_t Packing>
 PackedOBBs<Packing> PackOBBs(
    std::span<const std::array<float, 3>> halfSizes,
    std::span<const std::array<float, 3>> xAxes,
    std::span<const std::array<float, 3>> yAxes,
    std::span<const std::array<float, 3>> zAxes,
    std::span<const std::array<float, 3>> origins
 ) {
    PackedOBBs<Packing> out;
    out.halfSize = PackVec3<Packing>(halfSizes);
    out.xAxis    = PackVec3<Packing>(xAxes);
    out.yAxis    = PackVec3<Packing>(yAxes);
    out.zAxis    = PackVec3<Packing>(zAxes);
    out.origin   = PackVec3<Packing>(origins);
    return out;
 }
 // SphereOrientedBox takes a PackedOBBs (half-extents, three rotation axes,
 // origin per sub-box). For axis-aligned boxes the axes are world x/y/z.
 template <std::uint8_t Packing>
 std::string* TestSphereOrientedBoxN() {
    constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
    constexpr std::uint8_t Total = Packing * N;
    VectorF32<3, 1> sphereCenter = Vec3(0, 0, 0);
    // Cycle of four box patterns (half-extent semantics, world-axis aligned):
    //   0: at origin half=2, r=1 -> sphere inside -> hit
    //   1: at (5,0,0) half=1, r=0.5 -> miss
    //   2: at (3,0,0) half=1, r=0.5 -> miss
    //   3: at origin half=0.5, r=1 -> sphere encloses box center -> hit
    constexpr std::array<std::array<float, 3>, 4> size_pat = {{
        { 2, 2, 2}, { 1, 1, 1}, { 1, 1, 1}, {0.5f, 0.5f, 0.5f}
    }};
    constexpr std::array<std::array<float, 3>, 4> origin_pat = {{
        { 0, 0, 0}, { 5, 0, 0}, { 3, 0, 0}, { 0, 0, 0}
    }};
    constexpr std::array<float, 4> radii_pat = { 1.0f, 0.5f, 0.5f, 1.0f };
    constexpr std::array<float, 4> expected_pat = { 0.0f, kMaxF, kMaxF, 0.0f };
    constexpr std::array<float, 3> ax_x = { 1, 0, 0 };
    constexpr std::array<float, 3> ax_y = { 0, 1, 0 };
    constexpr std::array<float, 3> ax_z = { 0, 0, 1 };
    std::array<std::array<float, 3>, Total> sizeData, originData;
    std::array<std::array<float, 3>, Total> xAxesData, yAxesData, zAxesData;
    std::array<float, Total> radiiData;
    for (std::uint8_t i = 0; i < Total; ++i) {
        sizeData[i]   = size_pat[i % 4];
        originData[i] = origin_pat[i % 4];
        xAxesData[i]  = ax_x;
        yAxesData[i]  = ax_y;
        zAxesData[i]  = ax_z;
        radiiData[i]  = radii_pat[i % 4];
    }
    auto boxes = PackOBBs<Packing>(sizeData, xAxesData, yAxesData, zAxesData, originData);
    auto radii = PackScalars<Total>(radiiData);
    auto t = IntersectionTestSphereOrientedBox<Packing>(sphereCenter, radii, boxes);
    auto stored = t.template Store<float>();
    for (std::uint8_t i = 0; i < Total; ++i) {
        float expected = expected_pat[i % 4];
        float got = stored[i];
        if (expected == kMaxF) {
            if (got != kMaxF)
                return new std::string(std::format(
                    "SphereOrientedBox<{}> box {}: expected miss, got {}", Packing, i, got));
        } else if (!FloatEquals(got, expected)) {
            return new std::string(std::format(
                "SphereOrientedBox<{}> box {}: expected {}, got {}", Packing, i, expected, got));
        }
    }
    return nullptr;
 }
 // OBB-vs-OBB test against the new templated SAT routine. Cycles through:
 //   0: identical unit boxes at (0,0,0) and (1,0,0) -> overlap on x
 //   1: identical unit boxes at (0,0,0) and (10,0,0) -> far apart, miss
 //   2: rotated-45° box at origin vs identity at (1,0,0). Both have half=1.
 //      The rotated box's projection along world-x is half=sqrt(2)≈1.414, so
 //      the boxes still overlap on the world-x axis.
 //   3: identical unit boxes at (0,0,0) and (3,0,0) -> miss
 template <std::uint8_t Packing>
 std::string* TestOBBOBBN() {
    constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
    constexpr std::uint8_t Total = Packing * N;
    constexpr float kRot45 = 0.70710678f; // cos(45°) = sin(45°)
    constexpr std::array<std::array<float, 3>, 4> halfA_pat = {{
        { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 }
    }};
    constexpr std::array<std::array<float, 3>, 4> halfB_pat = halfA_pat;
    constexpr std::array<std::array<float, 3>, 4> originA_pat = {{
        { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }
    }};
    constexpr std::array<std::array<float, 3>, 4> originB_pat = {{
        { 1, 0, 0 }, { 10, 0, 0 }, { 1, 0, 0 }, { 3, 0, 0 }
    }};
    // Box A axes: identity for patterns 0/1/3, rotated 45° around z for pattern 2.
    constexpr std::array<std::array<float, 3>, 4> xAxisA_pat = {{
        { 1, 0, 0 }, { 1, 0, 0 }, {  kRot45, kRot45, 0 }, { 1, 0, 0 }
    }};
    constexpr std::array<std::array<float, 3>, 4> yAxisA_pat = {{
        { 0, 1, 0 }, { 0, 1, 0 }, { -kRot45, kRot45, 0 }, { 0, 1, 0 }
    }};
    constexpr std::array<std::array<float, 3>, 4> zAxisA_pat = {{
        { 0, 0, 1 }, { 0, 0, 1 }, { 0, 0, 1 }, { 0, 0, 1 }
    }};
    constexpr std::array<std::array<float, 3>, 4> xAxisB_pat = {{
        { 1, 0, 0 }, { 1, 0, 0 }, { 1, 0, 0 }, { 1, 0, 0 }
    }};
    constexpr std::array<std::array<float, 3>, 4> yAxisB_pat = {{
        { 0, 1, 0 }, { 0, 1, 0 }, { 0, 1, 0 }, { 0, 1, 0 }
    }};
    constexpr std::array<std::array<float, 3>, 4> zAxisB_pat = zAxisA_pat;
    constexpr std::array<float, 4> expected_pat = { 0.0f, kMaxF, 0.0f, kMaxF };
    std::array<std::array<float, 3>, Total>
        halfA, halfB, originA, originB,
        xA, yA, zA, xB, yB, zB;
    for (std::uint8_t i = 0; i < Total; ++i) {
        halfA[i]    = halfA_pat[i % 4];
        halfB[i]    = halfB_pat[i % 4];
        originA[i]  = originA_pat[i % 4];
        originB[i]  = originB_pat[i % 4];
        xA[i] = xAxisA_pat[i % 4]; yA[i] = yAxisA_pat[i % 4]; zA[i] = zAxisA_pat[i % 4];
        xB[i] = xAxisB_pat[i % 4]; yB[i] = yAxisB_pat[i % 4]; zB[i] = zAxisB_pat[i % 4];
    }
    auto a = PackOBBs<Packing>(halfA, xA, yA, zA, originA);
    auto b = PackOBBs<Packing>(halfB, xB, yB, zB, originB);
    auto r = IntersectionTestOrientedBoxOrientedBox<Packing>(a, b);
    auto stored = r.template Store<float>();
    for (std::uint8_t i = 0; i < Total; ++i) {
        float expected = expected_pat[i % 4];
        float got = stored[i];
        if (expected == kMaxF) {
            if (got != kMaxF)
                return new std::string(std::format(
                    "OBBOBB<{}> pair {}: expected miss, got {}", Packing, i, got));
        } else if (got != expected) {
            return new std::string(std::format(
                "OBBOBB<{}> pair {}: expected {}, got {}", Packing, i, expected, got));
        }
    }
    return nullptr;
 }
 MatrixRowMajor<float, 4, 3, 1> MakeBoxMatrix(float tx, float ty, float tz) {
    // Box matrix the OBB intersection code expects: rows[i][0..2] is the i-th
    // axis (the existing semantics treat matrix rows as the OBB axes), and
    // rows[i][3] is the translation component along that axis.
    return MatrixRowMajor<float, 4, 3, 1>(
        1, 0, 0, tx,
        0, 1, 0, ty,
@ -175,42 +431,6 @@ MatrixRowMajor<float, 4, 3, 1> MakeBoxMatrix(float tx, float ty, float tz) {
    );
 }
 std::string* TestSphereOrientedBox() {
    // `size` is half-extents (the intersection code clamps to ±size).
    VectorF32<3, 1> sphereCenter = Vec3(0, 0, 0);
    VectorF32<1, 4> radii = Vec1x4(1.0f, 0.5f, 0.5f, 1.0f);
    // A: box at origin half-extent 2 -> sphere center inside -> hit.
    VectorF32<3, 1> sizeA = Vec3(2, 2, 2);
    auto boxA = MakeBoxMatrix(0, 0, 0);
    // B: box at (5,0,0) half-extent 1 -> box spans x in [4,6], sphere in [-0.5,0.5] -> miss.
    VectorF32<3, 1> sizeB = Vec3(1, 1, 1);
    auto boxB = MakeBoxMatrix(5, 0, 0);
    // C: box at (3,0,0) half-extent 1 -> box spans [2,4], sphere [-0.5,0.5] -> miss.
    VectorF32<3, 1> sizeC = Vec3(1, 1, 1);
    auto boxC = MakeBoxMatrix(3, 0, 0);
    // D: box at origin half-extent 0.5 -> sphere center inside the box -> hit.
    VectorF32<3, 1> sizeD = Vec3(0.5f, 0.5f, 0.5f);
    auto boxD = MakeBoxMatrix(0, 0, 0);
    VectorF32<1, 4> r = IntersectionTestSphereOrientedBox(sphereCenter, radii,
        sizeA, boxA,
        sizeB, boxB,
        sizeC, boxC,
        sizeD, boxD);
    std::array<float, 4> s = r.template Store<float>();
    if (s[0] != 0.0f)
        return new std::string(std::format("SphereOrientedBox A: expected hit (0), got {}", s[0]));
    if (s[1] != kMaxF)
        return new std::string(std::format("SphereOrientedBox B: expected max (miss), got {}", s[1]));
    if (s[2] != kMaxF)
        return new std::string(std::format("SphereOrientedBox C: expected max (miss), got {}", s[2]));
    if (s[3] != 0.0f)
        return new std::string(std::format("SphereOrientedBox D: expected hit (0), got {}", s[3]));
    return nullptr;
 }
 std::string* TestGetOBBCorners() {
    // Identity matrix - the 8 corners are exactly ±size on each axis.
    VectorF32<3, 1> size = Vec3(2, 3, 4);
@ -231,7 +451,6 @@ std::string* TestGetOBBCorners() {
        }
    }
    // Translated matrix - corners shift by the translation column.
    auto m2 = MakeBoxMatrix(10, 20, 30);
    std::array<VectorF32<3, 1>, 8> corners2 = GetOBBCorners(size, m2);
    for (std::uint8_t i = 0; i < 8; ++i) {
@ -251,31 +470,43 @@ std::string* TestGetOBBCorners() {
    return nullptr;
 }
-std::string* TestOBBOBBOverlapping() {
+// Top-level wrappers: exercise each refactored function at Packing=1 (always
-    VectorF32<3, 1> size = Vec3(1, 1, 1);
+// supported) and at its default Packing (OptimalPacking for the build target).
-    auto boxA = MakeBoxMatrix(0, 0, 0);
+std::string* TestRayTriangle()          { return TestRayTriangleN<1>(); }
-    auto boxB = MakeBoxMatrix(1, 0, 0); // overlap on x in [-1, 1] (B) and [-1, 1] (A) -> overlap
+std::string* TestRayTriangleOpt()       { return TestRayTriangleN<VectorF32<3, 1>::OptimalPacking>(); }
-    if (!IntersectionTestOrientedBoxOrientedBox(size, boxA, size, boxB))
+std::string* TestRayTriangleBackFacing(){ return TestRayTriangleBackFacingN<1>(); }
-        return new std::string("OBB-OBB overlapping: expected true");
+std::string* TestRayTriangleBackFacingOpt() { return TestRayTriangleBackFacingN<VectorF32<3, 1>::OptimalPacking>(); }
-
+std::string* TestRaySphere()            { return TestRaySphereN<1>(); }
-    auto boxFar = MakeBoxMatrix(10, 0, 0);
+std::string* TestRaySphereOpt()         { return TestRaySphereN<VectorF32<3, 1>::OptimalPacking>(); }
-    if (IntersectionTestOrientedBoxOrientedBox(size, boxA, size, boxFar))
+std::string* TestRayOrientedBox()       { return TestRayOrientedBoxN<1>(); }
-        return new std::string("OBB-OBB far apart: expected false");
+std::string* TestRayOrientedBoxOpt() {
-    return nullptr;
+    constexpr std::uint8_t P = std::min(
        VectorF32<3, 1>::OptimalPacking, VectorF32<4, 1>::OptimalPacking);
    return TestRayOrientedBoxN<P>();
 }
 std::string* TestSphereOrientedBox()    { return TestSphereOrientedBoxN<1>(); }
 std::string* TestSphereOrientedBoxOpt() { return TestSphereOrientedBoxN<VectorF32<3, 1>::OptimalPacking>(); }
 std::string* TestOBBOBB()               { return TestOBBOBBN<1>(); }
 std::string* TestOBBOBBOpt()            { return TestOBBOBBN<VectorF32<3, 1>::OptimalPacking>(); }
 } // namespace
 int main() {
    using Fn = std::string* (*)();
-    constexpr std::array<std::pair<const char*, Fn>, 7> tests = {{
+    constexpr std::array<std::pair<const char*, Fn>, 13> tests = {{
-        { "RayTriangle",            TestRayTriangle           },
+        { "RayTriangle<1>",               TestRayTriangle               },
-        { "RayTriangleBackFacing",  TestRayTriangleBackFacing },
+        { "RayTriangle<Opt>",             TestRayTriangleOpt            },
-        { "RaySphere",              TestRaySphere             },
+        { "RayTriangleBackFacing<1>",     TestRayTriangleBackFacing     },
-        { "RayOrientedBox",         TestRayOrientedBox        },
+        { "RayTriangleBackFacing<Opt>",   TestRayTriangleBackFacingOpt  },
-        { "SphereOrientedBox",      TestSphereOrientedBox     },
+        { "RaySphere<1>",                 TestRaySphere                 },
        { "RaySphere<Opt>",               TestRaySphereOpt              },
        { "RayOrientedBox<1>",            TestRayOrientedBox            },
        { "RayOrientedBox<Opt>",          TestRayOrientedBoxOpt         },
        { "SphereOrientedBox<1>",         TestSphereOrientedBox         },
        { "SphereOrientedBox<Opt>",       TestSphereOrientedBoxOpt      },
        { "GetOBBCorners",                TestGetOBBCorners             },
-        { "OBBOBB",                 TestOBBOBBOverlapping     },
+        { "OBBOBB<1>",                    TestOBBOBB                    },
        { "OBBOBB<Opt>",                  TestOBBOBBOpt                 },
    }};
    for (auto const& [name, fn] : tests) {
--- a/tests/Vector.cpp
+++ b/tests/Vector.cpp
@ -429,7 +429,7 @@ std::string* TestAllCombinations() {
                VectorType<Len, Packing> vecC = vecA * 3;
                VectorType<Len, Packing> vecD = vecA * 4;
                auto result = VectorType<Len, Packing>::Normalize(vecA, vecB, vecC, vecD);
-                VectorType<1, 4> result2 = VectorType<Len, Packing>::Length(std::get<0>(result), std::get<1>(result), std::get<2>(result), std::get<3>(result));
+                VectorType<1, 4> result2 = VectorType<Len, Packing>::Length(result[0], result[1], result[2], result[3]);
                std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result2.template Store<T>();
                for(std::uint8_t i = 0; i < Len*Packing; i++) {
@ -472,7 +472,7 @@ std::string* TestAllCombinations() {
                VectorType<Len, Packing> vecC = vecA * 3;
                VectorType<Len, Packing> vecD = vecA * 4;
                auto result = VectorType<Len, Packing>::Normalize(vecA, vecB, vecC, vecD);
-                VectorType<1, 8> result2 = VectorType<Len, Packing>::Length(std::get<0>(result), std::get<1>(result), std::get<2>(result), std::get<3>(result));
+                VectorType<1, 8> result2 = VectorType<Len, Packing>::Length(result[0], result[1], result[2], result[3]);
                std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result2.template Store<T>();
                for(std::uint8_t i = 0; i < Len*Packing; i++) {
@ -509,7 +509,7 @@ std::string* TestAllCombinations() {
                VectorType<Len, Packing> vecB = vecA * 2;
                VectorType<Len, Packing> vecC = vecA * 3;
                auto result = VectorType<Len, Packing>::Normalize(vecA, vecB, vecC);
-                VectorType<1, 15> result2 = VectorType<Len, Packing>::Length(std::get<0>(result), std::get<1>(result), std::get<2>(result));
+                VectorType<1, 15> result2 = VectorType<Len, Packing>::Length(result[0], result[1], result[2]);
                std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result2.template Store<T>();
                for(std::uint8_t i = 0; i < Len*Packing; i++) {
@ -540,7 +540,7 @@ std::string* TestAllCombinations() {
                VectorType<Len, Packing> vecA(floats);
                VectorType<Len, Packing> vecE = vecA * 2;
                auto result = VectorType<Len, Packing>::Normalize(vecA, vecE);
-                VectorType<1, Packing*2> result2 = VectorType<Len, Packing>::Length(std::get<0>(result), std::get<1>(result));
+                VectorType<1, Packing*2> result2 = VectorType<Len, Packing>::Length(result[0], result[1]);
                std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result2.template Store<T>();
                for(std::uint8_t i = 0; i < Len*Packing; i++) {
@ -583,7 +583,7 @@ std::string* TestAllCombinations() {
                VectorType<Len, Packing> vecE = vecA * 3;
                VectorType<Len, Packing> vecG = vecA * 4;
                auto result = VectorType<Len, Packing>::Normalize(vecA, vecC, vecE, vecG);
-                VectorType<1, Packing*4> result2 = VectorType<Len, Packing>::Length(std::get<0>(result), std::get<1>(result), std::get<2>(result), std::get<3>(result));
+                VectorType<1, Packing*4> result2 = VectorType<Len, Packing>::Length(result[0], result[1], result[2], result[3]);
                std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result2.template Store<T>();
                for(std::uint8_t i = 0; i < Len*Packing; i++) {