matrix operations

2026-05-18 18:03:20 +02:00 · 2026-05-18 18:03:20 +02:00 · ad5ba21b4d
commit ad5ba21b4d
parent 48e3b8e26c
6 changed files with 1433 additions and 613 deletions
--- a/interfaces/Crafter.Math-Intersection.cppm
+++ b/interfaces/Crafter.Math-Intersection.cppm
@ -18,207 +18,432 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
 */

 export module Crafter.Math:Intersection;
-import :Vector;
+import :VectorF32;
 import :MatrixRowMajor;
 import std;

 namespace Crafter {
-    export template<typename T>
-	constexpr T IntersectionTestRayTriangle(Vector<T, 3, 0> vert0, Vector<T, 3, 0> vert1, Vector<T, 3, 0> vert2, Vector<T, 3, 0> rayOrigin, Vector<T, 3, 0> rayDir) {
-        Vector<T, 3, 0> edge1 = vert1 - vert0;
-        Vector<T, 3, 0> edge2 = vert2 - vert0;
+    // All intersection tests are batched over four primitives at a time so they
+    // feed the VectorF32<3,1>::Dot / Cross / Length / Normalize four-pair
+    // overloads directly. The single-primitive case is just "pass the same
+    // primitive four times and read lane 0" - there is no single-vector
+    // fast-path because the SIMD pipelines want full lanes.

-        Vector<T, 3, 0> h = Vector<T, 3, 0>::Cross(rayDir, edge2);
-        T determinant = Vector<T, 3, 0>::Dot(edge1, h);
+    // Möller-Trumbore against four triangles sharing one ray. Returns ray
+    // parameter t per triangle, or float max where the ray misses.
+    export inline VectorF32<1, 4> IntersectionTestRayTriangle(
+        VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir,
+        VectorF32<3, 1> aV0, VectorF32<3, 1> aV1, VectorF32<3, 1> aV2,
+        VectorF32<3, 1> bV0, VectorF32<3, 1> bV1, VectorF32<3, 1> bV2,
+        VectorF32<3, 1> cV0, VectorF32<3, 1> cV1, VectorF32<3, 1> cV2,
+        VectorF32<3, 1> dV0, VectorF32<3, 1> dV1, VectorF32<3, 1> dV2
+    ) {
+        VectorF32<3, 1> aE1 = aV1 - aV0;
+        VectorF32<3, 1> aE2 = aV2 - aV0;
+        VectorF32<3, 1> bE1 = bV1 - bV0;
+        VectorF32<3, 1> bE2 = bV2 - bV0;
+        VectorF32<3, 1> cE1 = cV1 - cV0;
+        VectorF32<3, 1> cE2 = cV2 - cV0;
+        VectorF32<3, 1> dE1 = dV1 - dV0;
+        VectorF32<3, 1> dE2 = dV2 - dV0;

-        if (determinant <= std::numeric_limits<T>::epsilon())  {
-            return std::numeric_limits<T>::max();
+        VectorF32<3, 1> aH = VectorF32<3, 1>::Cross(rayDir, aE2);
+        VectorF32<3, 1> bH = VectorF32<3, 1>::Cross(rayDir, bE2);
+        VectorF32<3, 1> cH = VectorF32<3, 1>::Cross(rayDir, cE2);
+        VectorF32<3, 1> dH = VectorF32<3, 1>::Cross(rayDir, dE2);
+
+        VectorF32<3, 1> aS = rayOrigin - aV0;
+        VectorF32<3, 1> bS = rayOrigin - bV0;
+        VectorF32<3, 1> cS = rayOrigin - cV0;
+        VectorF32<3, 1> dS = rayOrigin - dV0;
+
+        VectorF32<3, 1> aQ = VectorF32<3, 1>::Cross(aS, aE1);
+        VectorF32<3, 1> bQ = VectorF32<3, 1>::Cross(bS, bE1);
+        VectorF32<3, 1> cQ = VectorF32<3, 1>::Cross(cS, cE1);
+        VectorF32<3, 1> dQ = VectorF32<3, 1>::Cross(dS, dE1);
+
+        // Four 3-component dots packed into one __m128 per call.
+        VectorF32<1, 4> det = VectorF32<3, 1>::Dot(
+            aE1, aH, bE1, bH, cE1, cH, dE1, dH);
+        VectorF32<1, 4> uNum = VectorF32<3, 1>::Dot(
+            aS, aH, bS, bH, cS, cH, dS, dH);
+        VectorF32<1, 4> vNum = VectorF32<3, 1>::Dot(
+            rayDir, aQ, rayDir, bQ, rayDir, cQ, rayDir, dQ);
+        VectorF32<1, 4> tNum = VectorF32<3, 1>::Dot(
+            aE2, aQ, bE2, bQ, cE2, cQ, dE2, dQ);
+
+        std::array<float, 4> detArr = det.template Store<float>();
+        std::array<float, 4> uArr = uNum.template Store<float>();
+        std::array<float, 4> vArr = vNum.template Store<float>();
+        std::array<float, 4> tArr = tNum.template Store<float>();
+
+        constexpr float eps = std::numeric_limits<float>::epsilon();
+        constexpr float maxF = std::numeric_limits<float>::max();
+        alignas(16) std::array<float, 4> out{};
+        for (std::uint8_t i = 0; i < 4; ++i) {
+            float d = detArr[i];
+            if (d <= eps) { out[i] = maxF; continue; }
+            float invD = 1.0f / d;
+            float u = uArr[i] * invD;
+            if (u < 0.0f || u > 1.0f) { out[i] = maxF; continue; }
+            float v = vArr[i] * invD;
+            if (v < 0.0f || u + v > 1.0f) { out[i] = maxF; continue; }
+            out[i] = tArr[i] * invD;
        }
-
-        T inverse_determinant = T(1) / determinant;
-
-        Vector<T, 3, 0> origins_diff_vector = rayOrigin - vert0;
-        T u =  Vector<T, 3, 0>::Dot(origins_diff_vector, h) * inverse_determinant;
-
-        if (u < 0.0 || u > 1.0)
-        {
-            return std::numeric_limits<T>::max();
-        }
-
-        Vector<T, 3, 0> q = Vector<T, 3, 0>::Cross(origins_diff_vector, edge1);
-        T v = inverse_determinant * Vector<T, 3, 0>::Dot(rayDir, q);
-
-        if (v < 0.0 || u + v > 1.0) {
-            return std::numeric_limits<T>::max();
-        }
-
-        return inverse_determinant * Vector<T, 3, 0>::Dot(edge2, q);
+        return VectorF32<1, 4>(out.data());
    }

-    export template<typename T>
-    constexpr T IntersectionTestRaySphere(Vector<T, 3, 0> position, T radius, Vector<T, 3, 0> rayOrigin, Vector<T, 3, 0> rayDir) {
-        T a = Vector<T, 3, 0>::Dot(rayDir, rayDir);
-        T b = Vector<T, 3, 0>::Dot(rayDir, (T(2) * (rayOrigin - position)));
-        T c = Vector<T, 3, 0>::Dot(position, position) + Vector<T, 3, 0>::Dot(rayOrigin, rayOrigin) - T(2) * Vector<T, 3, 0>::Dot(rayOrigin, position) - radius * radius;
-        T d = b * b + (T(-4)) * a * c;
+    // One ray against four spheres. radii must hold {rA, rB, rC, rD} in lanes
+    // 0..3.
+    export inline VectorF32<1, 4> IntersectionTestRaySphere(
+        VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir,
+        VectorF32<3, 1> posA, VectorF32<3, 1> posB,
+        VectorF32<3, 1> posC, VectorF32<3, 1> posD,
+        VectorF32<1, 4> radii
+    ) {
+        VectorF32<3, 1> sA = rayOrigin - posA;
+        VectorF32<3, 1> sB = rayOrigin - posB;
+        VectorF32<3, 1> sC = rayOrigin - posC;
+        VectorF32<3, 1> sD = rayOrigin - posD;

-        if (d < 0) {
-            return std::numeric_limits<T>::max();
-        }
+        // dirDotS_i = rayDir · (rayOrigin - pos_i)
+        VectorF32<1, 4> dirDotS = VectorF32<3, 1>::Dot(
+            rayDir, sA, rayDir, sB, rayDir, sC, rayDir, sD);
+        // sqDist_i = |rayOrigin - pos_i|² (a.k.a. LengthSq of the s vectors)
+        VectorF32<1, 4> sqDist = VectorF32<3, 1>::LengthSq(sA, sB, sC, sD);
+        // aScalar = rayDir · rayDir, broadcast across four lanes.
+        VectorF32<1, 4> aScalar = VectorF32<3, 1>::LengthSq(
+            rayDir, rayDir, rayDir, rayDir);

-        d = std::sqrt(d);
+        VectorF32<1, 4> two(2.0f);
+        VectorF32<1, 4> four(4.0f);
+        VectorF32<1, 4> b = two * dirDotS;
+        VectorF32<1, 4> c = sqDist - radii * radii;
+        // discriminant = b² - 4·a·c
+        VectorF32<1, 4> disc = b * b - four * aScalar * c;

-        T t = (T(-0.5)) * (b + d) / a;
-        if (t > T(0)) {
-            return t;
-        } else {
-            return std::numeric_limits<T>::max();
+        std::array<float, 4> discArr = disc.template Store<float>();
+        std::array<float, 4> bArr = b.template Store<float>();
+        std::array<float, 4> aArr = aScalar.template Store<float>();
+
+        constexpr float maxF = std::numeric_limits<float>::max();
+        alignas(16) std::array<float, 4> out{};
+        for (std::uint8_t i = 0; i < 4; ++i) {
+            float d = discArr[i];
+            if (d < 0.0f) { out[i] = maxF; continue; }
+            float sqrtD = std::sqrt(d);
+            float t = -0.5f * (bArr[i] + sqrtD) / aArr[i];
+            out[i] = (t > 0.0f) ? t : maxF;
        }
+        return VectorF32<1, 4>(out.data());
    }

-    export template<typename T>
-    constexpr T IntersectionTestRayOrientedBox(Vector<T, 3, 0> boxPosition, Vector<T, 3, 0> boxSize, Vector<T, 4, 0> boxRotation, Vector<T, 3, 0> rayOrigin, Vector<T, 3, 0> rayDir) {
-        Vector<T, 4, 0> invRot(
-            -boxRotation.x,
-            -boxRotation.y,
-            -boxRotation.z,
-            boxRotation.w
-        );
+    // One ray against four OBBs. Each box is described by world-space position,
+    // half-extent vector (per-axis sizes), and a unit quaternion rotation.
+    export inline VectorF32<1, 4> IntersectionTestRayOrientedBox(
+        VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir,
+        VectorF32<3, 1> posA, VectorF32<3, 1> sizeA, VectorF32<4, 1> rotA,
+        VectorF32<3, 1> posB, VectorF32<3, 1> sizeB, VectorF32<4, 1> rotB,
+        VectorF32<3, 1> posC, VectorF32<3, 1> sizeC, VectorF32<4, 1> rotC,
+        VectorF32<3, 1> posD, VectorF32<3, 1> sizeD, VectorF32<4, 1> rotD
+    ) {
+        // Conjugate quaternion: negate xyz, keep w. Negate<{true,true,true,false}>
+        // is constant-folded into a single XOR with a mask vector.
+        VectorF32<4, 1> invRotA = rotA.template Negate<{{true, true, true, false}}>();
+        VectorF32<4, 1> invRotB = rotB.template Negate<{{true, true, true, false}}>();
+        VectorF32<4, 1> invRotC = rotC.template Negate<{{true, true, true, false}}>();
+        VectorF32<4, 1> invRotD = rotD.template Negate<{{true, true, true, false}}>();

-        Vector<T, 3, 0> localOrigin = Vector<T, 3, 0>::Rotate(rayOrigin - boxPosition, invRot);
-        Vector<T, 3, 0> localDir    = Vector<T, 3, 0>::Rotate(rayDir, invRot);
+        VectorF32<3, 1> localOriginA = VectorF32<3, 1>::Rotate(rayOrigin - posA, invRotA);
+        VectorF32<3, 1> localOriginB = VectorF32<3, 1>::Rotate(rayOrigin - posB, invRotB);
+        VectorF32<3, 1> localOriginC = VectorF32<3, 1>::Rotate(rayOrigin - posC, invRotC);
+        VectorF32<3, 1> localOriginD = VectorF32<3, 1>::Rotate(rayOrigin - posD, invRotD);

-        Vector<T,3,0> halfExtents = boxSize * T(0.5);
+        VectorF32<3, 1> localDirA = VectorF32<3, 1>::Rotate(rayDir, invRotA);
+        VectorF32<3, 1> localDirB = VectorF32<3, 1>::Rotate(rayDir, invRotB);
+        VectorF32<3, 1> localDirC = VectorF32<3, 1>::Rotate(rayDir, invRotC);
+        VectorF32<3, 1> localDirD = VectorF32<3, 1>::Rotate(rayDir, invRotD);

-        T tMin = T(0);
-        T tMax = std::numeric_limits<T>::max();
+        VectorF32<3, 1> halfA = sizeA * 0.5f;
+        VectorF32<3, 1> halfB = sizeB * 0.5f;
+        VectorF32<3, 1> halfC = sizeC * 0.5f;
+        VectorF32<3, 1> halfD = sizeD * 0.5f;

-        for (std::uint32_t i = 0; i < 3; ++i)
-        {
-            if (std::abs(localDir.v[i]) < std::numeric_limits<T>::epsilon())
-            {
-                if (localOrigin.v[i] < -halfExtents.v[i] || localOrigin.v[i] >  halfExtents.v[i]) {
-                    return std::numeric_limits<T>::max();
-                }
-            }
-            else
-            {
-                T invD = T(1) / localDir.v[i];
-                T t1 = (-halfExtents.v[i] - localOrigin.v[i]) * invD;
-                T t2 = ( halfExtents.v[i] - localOrigin.v[i]) * invD;
-
-                if (t1 > t2) {
-                    std::swap(t1, t2);
-                }
-
-                tMin = std::max(tMin, t1);
-                tMax = std::min(tMax, t2);
-
-                if (tMin > tMax) {
-                    return std::numeric_limits<T>::max();
-                }
-            }
-        }
-
-        return (tMin >= T(0)) ? tMin : tMax;
-    }
-
-
-    export template<typename T>
-    std::vector<Vector<T, 3, 0>> getOBBCorners(Vector<T, 3, 0> size, MatrixRowMajor<T, 4, 3, 1> matrix) {
-        std::vector<Vector<T, 3, 0>> localCorners = {
-            Vector<T, 3, 0>(-size.x, -size.y, -size.z),
-            Vector<T, 3, 0>( size.x, -size.y, -size.z),
-            Vector<T, 3, 0>(-size.x,  size.y, -size.z),
-            Vector<T, 3, 0>( size.x,  size.y, -size.z),
-            Vector<T, 3, 0>(-size.x, -size.y,  size.z),
-            Vector<T, 3, 0>( size.x, -size.y,  size.z),
-            Vector<T, 3, 0>(-size.x,  size.y,  size.z),
-            Vector<T, 3, 0>( size.x,  size.y,  size.z)
+        std::array<std::array<float, 4>, 4> origLanes{
+            localOriginA.template Store<float>(),
+            localOriginB.template Store<float>(),
+            localOriginC.template Store<float>(),
+            localOriginD.template Store<float>(),
+        };
+        std::array<std::array<float, 4>, 4> dirLanes{
+            localDirA.template Store<float>(),
+            localDirB.template Store<float>(),
+            localDirC.template Store<float>(),
+            localDirD.template Store<float>(),
+        };
+        std::array<std::array<float, 4>, 4> halfLanes{
+            halfA.template Store<float>(),
+            halfB.template Store<float>(),
+            halfC.template Store<float>(),
+            halfD.template Store<float>(),
        };
-        
-        std::vector<Vector<T, 3, 0>> worldCorners;
-        
-        for (Vector<T, 3, 0> localCorner : localCorners) {
-            Vector<T, 3, 0> rotatedCorner = matrix * localCorner;
-            worldCorners.push_back(rotatedCorner);
-        }

-        return worldCorners;
+        constexpr float eps = std::numeric_limits<float>::epsilon();
+        constexpr float maxF = std::numeric_limits<float>::max();
+        alignas(16) std::array<float, 4> out{};
+        for (std::uint8_t b = 0; b < 4; ++b) {
+            float tMin = 0.0f;
+            float tMax = maxF;
+            bool miss = false;
+            for (std::uint8_t i = 0; i < 3; ++i) {
+                float d = dirLanes[b][i];
+                float o = origLanes[b][i];
+                float h = halfLanes[b][i];
+                if (std::abs(d) < eps) {
+                    if (o < -h || o > h) { miss = true; break; }
+                } else {
+                    float invD = 1.0f / d;
+                    float t1 = (-h - o) * invD;
+                    float t2 = ( h - o) * invD;
+                    if (t1 > t2) std::swap(t1, t2);
+                    tMin = std::max(tMin, t1);
+                    tMax = std::min(tMax, t2);
+                    if (tMin > tMax) { miss = true; break; }
+                }
+            }
+            out[b] = miss ? maxF : (tMin >= 0.0f ? tMin : tMax);
+        }
+        return VectorF32<1, 4>(out.data());
    }

-    export template<typename T>
-    constexpr bool IntersectionTestOrientedBoxOrientedBox(Vector<T, 3, 0> sizeA, MatrixRowMajor<T, 4, 3, 1> boxA, Vector<T, 3, 0> sizeB, MatrixRowMajor<T, 4, 3, 1> boxB) {
-        std::vector<Vector<T, 3, 0>> axes;
+    // One sphere against four OBBs. boxMatrix encodes rotation in m[r][0..2]
+    // and translation in m[r][3].
+    export inline VectorF32<1, 4> IntersectionTestSphereOrientedBox(
+        VectorF32<3, 1> sphereCenter, VectorF32<1, 4> radii,
+        VectorF32<3, 1> sizeA, MatrixRowMajor<float, 4, 3, 1> boxA,
+        VectorF32<3, 1> sizeB, MatrixRowMajor<float, 4, 3, 1> boxB,
+        VectorF32<3, 1> sizeC, MatrixRowMajor<float, 4, 3, 1> boxC,
+        VectorF32<3, 1> sizeD, MatrixRowMajor<float, 4, 3, 1> boxD
+    ) {
+        auto perBox = [&](MatrixRowMajor<float, 4, 3, 1> const& m,
+                          VectorF32<3, 1> const& size,
+                          VectorF32<3, 1>& xAxis,
+                          VectorF32<3, 1>& yAxis,
+                          VectorF32<3, 1>& zAxis,
+                          VectorF32<3, 1>& delta) {
+            // Existing semantics: the OBB axes are read from the rows of the
+            // upper 3x3 block, and the translation column is gathered from the
+            // w lane of each row.
+            std::array<float, 4> r0 = m.rows[0].template Store<float>();
+            std::array<float, 4> r1 = m.rows[1].template Store<float>();
+            std::array<float, 4> r2 = m.rows[2].template Store<float>();
+            alignas(16) float xBuf[4] = { r0[0], r0[1], r0[2], 0.0f };
+            alignas(16) float yBuf[4] = { r1[0], r1[1], r1[2], 0.0f };
+            alignas(16) float zBuf[4] = { r2[0], r2[1], r2[2], 0.0f };
+            alignas(16) float oBuf[4] = { r0[3], r1[3], r2[3], 0.0f };
+            xAxis = VectorF32<3, 1>(xBuf);
+            yAxis = VectorF32<3, 1>(yBuf);
+            zAxis = VectorF32<3, 1>(zBuf);
+            VectorF32<3, 1> origin(oBuf);
+            delta = sphereCenter - origin;
+            (void)size;
+        };

-        std::vector<Vector<T, 3, 0>> box1Corners = getOBBCorners(sizeA, boxA);
-        std::vector<Vector<T, 3, 0>> box2Corners = getOBBCorners(sizeB, boxB);
+        VectorF32<3, 1> xA, yA, zA, dA;
+        VectorF32<3, 1> xB, yB, zB, dB;
+        VectorF32<3, 1> xC, yC, zC, dC;
+        VectorF32<3, 1> xD, yD, zD, dD;
+        perBox(boxA, sizeA, xA, yA, zA, dA);
+        perBox(boxB, sizeB, xB, yB, zB, dB);
+        perBox(boxC, sizeC, xC, yC, zC, dC);
+        perBox(boxD, sizeD, xD, yD, zD, dD);

-        axes.push_back(Vector<T, 3, 0>(boxA.m[0][0], boxA.m[0][1], boxA.m[0][2]));
-        axes.push_back(Vector<T, 3, 0>(boxA.m[1][0], boxA.m[1][1], boxA.m[1][2]));
-        axes.push_back(Vector<T, 3, 0>(boxA.m[2][0], boxA.m[2][1], boxA.m[2][2]));
-        
-        axes.push_back(Vector<T, 3, 0>(boxB.m[0][0], boxB.m[0][1], boxB.m[0][2]));
-        axes.push_back(Vector<T, 3, 0>(boxB.m[1][0], boxB.m[1][1], boxB.m[1][2]));
-        axes.push_back(Vector<T, 3, 0>(boxB.m[2][0], boxB.m[2][1], boxB.m[2][2]));
-        
-        for (int i = 0; i < 3; ++i) {
-            for (int j = 0; j < 3; ++j) {
-                axes.push_back(Vector<T, 3, 0>::Normalize(Vector<T, 3, 0>::Cross(Vector<T, 3, 0>(boxA.m[i][0], boxA.m[i][1], boxA.m[i][2]), Vector<T, 3, 0>(boxB.m[j][0], boxB.m[j][1], boxB.m[j][2]))));
+        // Local sphere center per box: project delta onto each box axis. We
+        // produce {lx, ly, lz, lx, ly, lz, lx, ly, lz, lx, ly, lz} as three
+        // packed 4-wide Dot results (one Dot per axis).
+        VectorF32<1, 4> locX = VectorF32<3, 1>::Dot(
+            dA, xA, dB, xB, dC, xC, dD, xD);
+        VectorF32<1, 4> locY = VectorF32<3, 1>::Dot(
+            dA, yA, dB, yB, dC, yC, dD, yD);
+        VectorF32<1, 4> locZ = VectorF32<3, 1>::Dot(
+            dA, zA, dB, zB, dC, zC, dD, zD);
+
+        std::array<float, 4> lxArr = locX.template Store<float>();
+        std::array<float, 4> lyArr = locY.template Store<float>();
+        std::array<float, 4> lzArr = locZ.template Store<float>();
+        std::array<float, 4> rArr  = radii.template Store<float>();
+        std::array<std::array<float, 4>, 4> sizeLanes{
+            sizeA.template Store<float>(),
+            sizeB.template Store<float>(),
+            sizeC.template Store<float>(),
+            sizeD.template Store<float>(),
+        };
+
+        alignas(16) std::array<float, 4> out{};
+        for (std::uint8_t i = 0; i < 4; ++i) {
+            float lx = lxArr[i], ly = lyArr[i], lz = lzArr[i];
+            float sx = sizeLanes[i][0], sy = sizeLanes[i][1], sz = sizeLanes[i][2];
+            float cx = std::clamp(lx, -sx, sx);
+            float cy = std::clamp(ly, -sy, sy);
+            float cz = std::clamp(lz, -sz, sz);
+            float dx = lx - cx, dy = ly - cy, dz = lz - cz;
+            float distSq = dx * dx + dy * dy + dz * dz;
+            float r = rArr[i];
+            // Returns 0.0 on hit, max on miss - keeps a consistent
+            // "t-like" output signature with the other intersection tests.
+            out[i] = (distSq <= r * r) ? 0.0f : std::numeric_limits<float>::max();
+        }
+        return VectorF32<1, 4>(out.data());
+    }
+
+    // Eight local corners of a unit OBB transformed by `matrix`. Uses one
+    // batched 4-pair Dot per result row (x, y, z), reproducing the corners in
+    // two groups of four.
+    export inline std::array<VectorF32<3, 1>, 8> GetOBBCorners(
+        VectorF32<3, 1> size, MatrixRowMajor<float, 4, 3, 1> matrix
+    ) {
+        std::array<float, 4> sz = size.template Store<float>();
+        const float sx = sz[0], sy = sz[1], sz_ = sz[2];
+
+        VectorF32<4, 1> mx = matrix.rows[0];
+        VectorF32<4, 1> my = matrix.rows[1];
+        VectorF32<4, 1> mz = matrix.rows[2];
+
+        // Eight homogeneous corner vectors with w=1 so the translation column
+        // of `matrix` participates in the dot product.
+        alignas(16) float c0[4] = { -sx, -sy, -sz_, 1.0f };
+        alignas(16) float c1[4] = {  sx, -sy, -sz_, 1.0f };
+        alignas(16) float c2[4] = { -sx,  sy, -sz_, 1.0f };
+        alignas(16) float c3[4] = {  sx,  sy, -sz_, 1.0f };
+        alignas(16) float c4[4] = { -sx, -sy,  sz_, 1.0f };
+        alignas(16) float c5[4] = {  sx, -sy,  sz_, 1.0f };
+        alignas(16) float c6[4] = { -sx,  sy,  sz_, 1.0f };
+        alignas(16) float c7[4] = {  sx,  sy,  sz_, 1.0f };
+        VectorF32<4, 1> v0(c0), v1(c1), v2(c2), v3(c3);
+        VectorF32<4, 1> v4(c4), v5(c5), v6(c6), v7(c7);
+
+        // First four corners (0..3): batch x, y, z dots.
+        VectorF32<1, 4> xLo = VectorF32<4, 1>::Dot(mx, v0, mx, v1, mx, v2, mx, v3);
+        VectorF32<1, 4> yLo = VectorF32<4, 1>::Dot(my, v0, my, v1, my, v2, my, v3);
+        VectorF32<1, 4> zLo = VectorF32<4, 1>::Dot(mz, v0, mz, v1, mz, v2, mz, v3);
+        // Second four corners (4..7).
+        VectorF32<1, 4> xHi = VectorF32<4, 1>::Dot(mx, v4, mx, v5, mx, v6, mx, v7);
+        VectorF32<1, 4> yHi = VectorF32<4, 1>::Dot(my, v4, my, v5, my, v6, my, v7);
+        VectorF32<1, 4> zHi = VectorF32<4, 1>::Dot(mz, v4, mz, v5, mz, v6, mz, v7);
+
+        std::array<float, 4> xLoA = xLo.template Store<float>();
+        std::array<float, 4> yLoA = yLo.template Store<float>();
+        std::array<float, 4> zLoA = zLo.template Store<float>();
+        std::array<float, 4> xHiA = xHi.template Store<float>();
+        std::array<float, 4> yHiA = yHi.template Store<float>();
+        std::array<float, 4> zHiA = zHi.template Store<float>();
+
+        std::array<VectorF32<3, 1>, 8> result;
+        for (std::uint8_t i = 0; i < 4; ++i) {
+            alignas(16) float buf[4] = { xLoA[i], yLoA[i], zLoA[i], 0.0f };
+            result[i] = VectorF32<3, 1>(buf);
+        }
+        for (std::uint8_t i = 0; i < 4; ++i) {
+            alignas(16) float buf[4] = { xHiA[i], yHiA[i], zHiA[i], 0.0f };
+            result[4 + i] = VectorF32<3, 1>(buf);
+        }
+        return result;
+    }
+
+    // SAT against fifteen separating axes (3 box-A, 3 box-B, 9 cross products).
+    // We compute every corner projection with batched 4-pair Dots: each axis
+    // projects four corners per call, two calls per axis covers the 8 corners.
+    export inline bool IntersectionTestOrientedBoxOrientedBox(
+        VectorF32<3, 1> sizeA, MatrixRowMajor<float, 4, 3, 1> boxA,
+        VectorF32<3, 1> sizeB, MatrixRowMajor<float, 4, 3, 1> boxB
+    ) {
+        std::array<VectorF32<3, 1>, 8> cornersA = GetOBBCorners(sizeA, boxA);
+        std::array<VectorF32<3, 1>, 8> cornersB = GetOBBCorners(sizeB, boxB);
+
+        // Axes are the upper-3 lanes of each matrix row (same convention as
+        // SphereOrientedBox). ExtractLo<3> just retypes the SIMD register; the
+        // 4th lane is ignored by the Len=3 ops below.
+        std::array<VectorF32<3, 1>, 3> axesA = {
+            boxA.rows[0].template ExtractLo<3>(),
+            boxA.rows[1].template ExtractLo<3>(),
+            boxA.rows[2].template ExtractLo<3>(),
+        };
+        std::array<VectorF32<3, 1>, 3> axesB = {
+            boxB.rows[0].template ExtractLo<3>(),
+            boxB.rows[1].template ExtractLo<3>(),
+            boxB.rows[2].template ExtractLo<3>(),
+        };
+
+        std::array<VectorF32<3, 1>, 15> axes{};
+        axes[0] = axesA[0]; axes[1] = axesA[1]; axes[2] = axesA[2];
+        axes[3] = axesB[0]; axes[4] = axesB[1]; axes[5] = axesB[2];
+        // Normalize all nine cross axes together with a single batched
+        // Normalize call (Packing=3 not in the API, so two calls of four +
+        // one of one would be needed; for now just normalize in two batches
+        // of four and the trailing one inline).
+        std::array<VectorF32<3, 1>, 9> crossAxes{};
+        std::uint8_t k = 0;
+        for (std::uint8_t i = 0; i < 3; ++i) {
+            for (std::uint8_t j = 0; j < 3; ++j) {
+                crossAxes[k++] = VectorF32<3, 1>::Cross(axesA[i], axesB[j]);
            }
        }
+        auto norm0 = VectorF32<3, 1>::Normalize(crossAxes[0], crossAxes[1], crossAxes[2], crossAxes[3]);
+        auto norm1 = VectorF32<3, 1>::Normalize(crossAxes[4], crossAxes[5], crossAxes[6], crossAxes[7]);
+        auto norm2 = VectorF32<3, 1>::Normalize(crossAxes[8], crossAxes[8], crossAxes[8], crossAxes[8]);
+        axes[6]  = std::get<0>(norm0);
+        axes[7]  = std::get<1>(norm0);
+        axes[8]  = std::get<2>(norm0);
+        axes[9]  = std::get<3>(norm0);
+        axes[10] = std::get<0>(norm1);
+        axes[11] = std::get<1>(norm1);
+        axes[12] = std::get<2>(norm1);
+        axes[13] = std::get<3>(norm1);
+        axes[14] = std::get<0>(norm2);

-        for (Vector<T, 3, 0> axis : axes) {
-            T min1 = Vector<T, 3, 0>::Dot(box1Corners[0], axis);
-            T max1 = min1;
-            for (Vector<T, 3, 0> corner : box1Corners) {
-                T projection = Vector<T, 3, 0>::Dot(corner, axis);
-                min1 = std::min(min1, projection);
-                max1 = std::max(max1, projection);
+        for (std::uint8_t axisIdx = 0; axisIdx < 15; ++axisIdx) {
+            VectorF32<3, 1> axis = axes[axisIdx];
+            // Project all 8 corners of each box onto `axis` using two batched
+            // 4-pair Dot calls (lo and hi corners).
+            VectorF32<1, 4> projA_lo = VectorF32<3, 1>::Dot(
+                cornersA[0], axis, cornersA[1], axis,
+                cornersA[2], axis, cornersA[3], axis);
+            VectorF32<1, 4> projA_hi = VectorF32<3, 1>::Dot(
+                cornersA[4], axis, cornersA[5], axis,
+                cornersA[6], axis, cornersA[7], axis);
+            VectorF32<1, 4> projB_lo = VectorF32<3, 1>::Dot(
+                cornersB[0], axis, cornersB[1], axis,
+                cornersB[2], axis, cornersB[3], axis);
+            VectorF32<1, 4> projB_hi = VectorF32<3, 1>::Dot(
+                cornersB[4], axis, cornersB[5], axis,
+                cornersB[6], axis, cornersB[7], axis);
+
+            std::array<float, 4> aLo = projA_lo.template Store<float>();
+            std::array<float, 4> aHi = projA_hi.template Store<float>();
+            std::array<float, 4> bLo = projB_lo.template Store<float>();
+            std::array<float, 4> bHi = projB_hi.template Store<float>();
+
+            float minA = aLo[0], maxA = aLo[0];
+            for (std::uint8_t i = 1; i < 4; ++i) {
+                minA = std::min(minA, aLo[i]);
+                maxA = std::max(maxA, aLo[i]);
+            }
+            for (std::uint8_t i = 0; i < 4; ++i) {
+                minA = std::min(minA, aHi[i]);
+                maxA = std::max(maxA, aHi[i]);
+            }
+            float minB = bLo[0], maxB = bLo[0];
+            for (std::uint8_t i = 1; i < 4; ++i) {
+                minB = std::min(minB, bLo[i]);
+                maxB = std::max(maxB, bLo[i]);
+            }
+            for (std::uint8_t i = 0; i < 4; ++i) {
+                minB = std::min(minB, bHi[i]);
+                maxB = std::max(maxB, bHi[i]);
            }

-            T min2 = Vector<T, 3, 0>::Dot(box2Corners[0], axis);
-            T max2 = min2;
-            for (Vector<T, 3, 0> corner : box2Corners) {
-                T projection = Vector<T, 3, 0>::Dot(corner, axis);
-                min2 = std::min(min2, projection);
-                max2 = std::max(max2, projection);
-            }
-
-            if (max1 < min2 || max2 < min1) {
-                return false;
-            }
+            if (maxA < minB || maxB < minA) return false;
        }
-
        return true;
    }
-
-    export template<typename T>
-    constexpr bool IntersectionTestSphereOrientedBox(Vector<T, 3, 0> sphereCenter, T sphereRadius, Vector<T, 3, 0> boxSize, MatrixRowMajor<T, 4, 3, 1> boxMatrix) {
-        // Extract the OBB's axes (columns of the rotation matrix)
-        Vector<T, 3, 0> xAxis(boxMatrix.m[0][0], boxMatrix.m[0][1], boxMatrix.m[0][2]);
-        Vector<T, 3, 0> yAxis(boxMatrix.m[1][0], boxMatrix.m[1][1], boxMatrix.m[1][2]);
-        Vector<T, 3, 0> zAxis(boxMatrix.m[2][0], boxMatrix.m[2][1], boxMatrix.m[2][2]);
-
-        // Translate the sphere center into the OBB's local space
-        Vector<T, 3, 0> localCenter = Vector<T, 3, 0>(
-            Vector<T, 3, 0>::Dot(sphereCenter - Vector<T, 3, 0>(boxMatrix.m[0][3], boxMatrix.m[1][3], boxMatrix.m[2][3]), xAxis),
-            Vector<T, 3, 0>::Dot(sphereCenter - Vector<T, 3, 0>(boxMatrix.m[0][3], boxMatrix.m[1][3], boxMatrix.m[2][3]), yAxis),
-            Vector<T, 3, 0>::Dot(sphereCenter - Vector<T, 3, 0>(boxMatrix.m[0][3], boxMatrix.m[1][3], boxMatrix.m[2][3]), zAxis)
-        );
-
-        // Clamp the local center to the OBB's extents
-        Vector<T, 3, 0> closestPoint = Vector<T, 3, 0>(
-            std::max(-boxSize.x, std::min(localCenter.x, boxSize.x)),
-            std::max(-boxSize.y, std::min(localCenter.y, boxSize.y)),
-            std::max(-boxSize.z, std::min(localCenter.z, boxSize.z))
-        );
-
-        // Calculate the distance between the closest point and the local center
-        Vector<T, 3, 0> delta = localCenter - closestPoint;
-        T distanceSquared = Vector<T, 3, 0>::Dot(delta, delta);
-
-        // Check if the distance is less than or equal to the sphere's radius squared
-        return distanceSquared <= (sphereRadius * sphereRadius);
-    }
-}
+}
--- a/interfaces/Crafter.Math-MatrixRowMajor.cppm
+++ b/interfaces/Crafter.Math-MatrixRowMajor.cppm
@ -1,433 +1,330 @@
-/*
-Crafter®.Math
-Copyright (C) 2026 Catcrafts®
-catcrafts.net
-
-This library is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License version 3.0 as published by the Free Software Foundation;
-
-This library is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this library; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
-*/
-
-export module Crafter.Math:MatrixRowMajor;
-
-import :Basic;
-import :Vector;
-import std;
-
-namespace Crafter {
-	export template <typename T, std::uint32_t CollumSize, std::uint32_t RowSize, std::uint32_t Repeats>
-	class MatrixRowMajor {
-	public:
-		T m[RowSize][CollumSize*Repeats];
-
-		MatrixRowMajor() = default;
-
-		MatrixRowMajor(
-			float x0, float y0, float z0, float w0,
-			float x1, float y1, float z1, float w1,
-			float x2, float y2, float z2, float w2,
-			float x3, float y3, float z3, float w3
-		) requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>) {
-			m[0][0] = x0;
-			m[0][1] = y0;
-			m[0][2] = z0;
-			m[0][3] = w0;
-
-			m[1][0] = x1;
-			m[1][1] = y1;
-			m[1][2] = z1;
-			m[1][3] = w1;
-
-			m[2][0] = x2;
-			m[2][1] = y2;
-			m[2][2] = z2;
-			m[2][3] = w2;
-
-			m[3][0] = x3;
-			m[3][1] = y3;
-			m[3][2] = z3;
-			m[3][3] = w3;
-		}
-
-        MatrixRowMajor(
-			float x0, float y0, float z0, float w0,
-			float x1, float y1, float z1, float w1,
-			float x2, float y2, float z2, float w2
-		) requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
-			m[0][0] = x0;
-			m[0][1] = y0;
-			m[0][2] = z0;
-			m[0][3] = w0;
-
-			m[1][0] = x1;
-			m[1][1] = y1;
-			m[1][2] = z1;
-			m[1][3] = w1;
-
-			m[2][0] = x2;
-			m[2][1] = y2;
-			m[2][2] = z2;
-			m[2][3] = w2;
-		}
-
-        template <std::uint32_t VAligment>
-		Vector<T, 3, VAligment> operator*(Vector<T, 3, VAligment> b) const requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
-			return Vector<T, 3, VAligment>(
-				b.x * m[0][0] + b.y * m[0][1] + b.z * m[0][2] + m[0][3],
-				b.x * m[1][0] + b.y * m[1][1] + b.z * m[1][2] + m[1][3],
-				b.x * m[2][0] + b.y * m[2][1] + b.z * m[2][2] + m[2][3]
-			);
-		}
-
-		MatrixRowMajor<T, CollumSize, RowSize, Repeats> operator*(MatrixRowMajor<T, CollumSize, RowSize, Repeats> b) const requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>) {
-			MatrixRowMajor<T, CollumSize, RowSize, Repeats> result;
-
-			result.m[0][0] = b.m[0][0] * m[0][0] + b.m[0][1] * m[1][0] + b.m[0][2] * m[2][0] + b.m[0][3] * m[3][0];
-			result.m[1][0] = b.m[1][0] * m[0][0] + b.m[1][1] * m[1][0] + b.m[1][2] * m[2][0] + b.m[1][3] * m[3][0];
-			result.m[2][0] = b.m[2][0] * m[0][0] + b.m[2][1] * m[1][0] + b.m[2][2] * m[2][0] + b.m[2][3] * m[3][0];
-			result.m[3][0] = b.m[3][0] * m[0][0] + b.m[3][1] * m[1][0] + b.m[3][2] * m[2][0] + b.m[3][3] * m[3][0];
-			
-			result.m[0][1] = b.m[0][0] * m[0][1] + b.m[0][1] * m[1][1] + b.m[0][2] * m[2][1] + b.m[0][3] * m[3][1];
-			result.m[1][1] = b.m[1][0] * m[0][1] + b.m[1][1] * m[1][1] + b.m[1][2] * m[2][1] + b.m[1][3] * m[3][1];
-			result.m[2][1] = b.m[2][0] * m[0][1] + b.m[2][1] * m[1][1] + b.m[2][2] * m[2][1] + b.m[2][3] * m[3][1];
-			result.m[3][1] = b.m[3][0] * m[0][1] + b.m[3][1] * m[1][1] + b.m[3][2] * m[2][1] + b.m[3][3] * m[3][1];
-
-			result.m[0][2] = b.m[0][0] * m[0][2] + b.m[0][1] * m[1][2] + b.m[0][2] * m[2][2] + b.m[0][3] * m[3][2];
-			result.m[1][2] = b.m[1][0] * m[0][2] + b.m[1][1] * m[1][2] + b.m[1][2] * m[2][2] + b.m[1][3] * m[3][2];
-			result.m[2][2] = b.m[2][0] * m[0][2] + b.m[2][1] * m[1][2] + b.m[2][2] * m[2][2] + b.m[2][3] * m[3][2];
-			result.m[3][2] = b.m[3][0] * m[0][2] + b.m[3][1] * m[1][2] + b.m[3][2] * m[2][2] + b.m[3][3] * m[3][2];
-
-			result.m[0][3] = b.m[0][0] * m[0][3] + b.m[0][1] * m[1][3] + b.m[0][2] * m[2][3] + b.m[0][3] * m[3][3];
-			result.m[1][3] = b.m[1][0] * m[0][3] + b.m[1][1] * m[1][3] + b.m[1][2] * m[2][3] + b.m[1][3] * m[3][3];
-			result.m[2][3] = b.m[2][0] * m[0][3] + b.m[2][1] * m[1][3] + b.m[2][2] * m[2][3] + b.m[2][3] * m[3][3];
-			result.m[3][3] = b.m[3][0] * m[0][3] + b.m[3][1] * m[1][3] + b.m[3][2] * m[2][3] + b.m[3][3] * m[3][3];
-
-			return result;
-		}
-
-        MatrixRowMajor<T, CollumSize, RowSize, Repeats> operator*(MatrixRowMajor<T, CollumSize, RowSize, Repeats> b) const requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
-			MatrixRowMajor<T, CollumSize, RowSize, Repeats> result;
-
-            // Column 0
-            result.m[0][0] = b.m[0][0]*m[0][0] + b.m[0][1]*m[1][0] + b.m[0][2]*m[2][0];
-            result.m[1][0] = b.m[1][0]*m[0][0] + b.m[1][1]*m[1][0] + b.m[1][2]*m[2][0];
-            result.m[2][0] = b.m[2][0]*m[0][0] + b.m[2][1]*m[1][0] + b.m[2][2]*m[2][0];
-
-            // Column 1
-            result.m[0][1] = b.m[0][0]*m[0][1] + b.m[0][1]*m[1][1] + b.m[0][2]*m[2][1];
-            result.m[1][1] = b.m[1][0]*m[0][1] + b.m[1][1]*m[1][1] + b.m[1][2]*m[2][1];
-            result.m[2][1] = b.m[2][0]*m[0][1] + b.m[2][1]*m[1][1] + b.m[2][2]*m[2][1];
-
-            // Column 2
-            result.m[0][2] = b.m[0][0]*m[0][2] + b.m[0][1]*m[1][2] + b.m[0][2]*m[2][2];
-            result.m[1][2] = b.m[1][0]*m[0][2] + b.m[1][1]*m[1][2] + b.m[1][2]*m[2][2];
-            result.m[2][2] = b.m[2][0]*m[0][2] + b.m[2][1]*m[1][2] + b.m[2][2]*m[2][2];
-
-            // Translation column
-            result.m[0][3] = b.m[0][0]*m[0][3] + b.m[0][1]*m[1][3] + b.m[0][2]*m[2][3] + b.m[0][3];
-            result.m[1][3] = b.m[1][0]*m[0][3] + b.m[1][1]*m[1][3] + b.m[1][2]*m[2][3] + b.m[1][3];
-            result.m[2][3] = b.m[2][0]*m[0][3] + b.m[2][1]*m[1][3] + b.m[2][2]*m[2][3] + b.m[2][3];
-
-
-			return result;
-		}
-
-        static MatrixRowMajor<T, CollumSize, RowSize, Repeats> Identity() requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
-            return MatrixRowMajor<T, CollumSize, RowSize, Repeats>(
-                1, 0, 0, 0,
-                0, 1, 0, 0,
-                0, 0, 1, 0
-            );
-        }
-        
-		static MatrixRowMajor<T, CollumSize, RowSize, Repeats> Scaling(float x, float y, float z) requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>)  {
-			return  MatrixRowMajor<T, CollumSize, RowSize, Repeats>(
-				x, 0, 0, 0,
-				0, y, 0, 0,
-				0, 0, z, 0,
-				0, 0, 0, 1
-			);
-		}
-        static MatrixRowMajor<T, CollumSize, RowSize, Repeats> Scaling(float x, float y, float z) requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
-            return MatrixRowMajor<T, CollumSize, RowSize, Repeats>(
-                x, 0, 0, 0,
-                0, y, 0, 0,
-                0, 0, z, 0
-            );
-        }
-        template <std::uint32_t VAligment>
-		static MatrixRowMajor<T, CollumSize, RowSize, Repeats> Scaling(Vector<float, 3, VAligment> vector) requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>)  {
-			return Scaling(vector.x, vector.y, vector.z);
-		}
-
-		static MatrixRowMajor<T, CollumSize, RowSize, Repeats> Translation(float x, float y, float z) requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>)  {
-			return  MatrixRowMajor<T, CollumSize, RowSize, Repeats>(
-				1, 0, 0, 0,
-				0, 1, 0, 0,
-				0, 0, 1, 0,
-				x, y, z, 1
-			);
-		}
-        static MatrixRowMajor<T, CollumSize, RowSize, Repeats> Translation(float x, float y, float z) requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
-            return MatrixRowMajor<T, CollumSize, RowSize, Repeats>(
-                1, 0, 0, x,
-                0, 1, 0, y,
-                0, 0, 1, z
-            );
-        }
-
-        template <std::uint32_t VAligment>
-		static MatrixRowMajor<T, CollumSize, RowSize, Repeats> Translation(Vector<T, 3, VAligment> vector) requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>)  {
-			return Translation(vector.x, vector.y, vector.z);
-		}
-
-		static MatrixRowMajor<T, CollumSize, RowSize, Repeats> Rotation(float Pitch, float Yaw, float Roll) requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>)  {
-			float cp = std::cosf(Pitch);
-			float sp = std::sinf(Pitch);
-		
-			float cy = std::cosf(Yaw);
-			float sy = std::sinf(Yaw);
-		
-			float cr = std::cosf(Roll);
-			float sr = std::sinf(Roll);
-		
-			MatrixRowMajor<T, CollumSize, RowSize, Repeats> M;
-			M.m[0][0] = cr * cy + sr * sp * sy;
-			M.m[0][1] = sr * cp;
-			M.m[0][2] = sr * sp * cy - cr * sy;
-			M.m[0][3] = 0.0f;
-		
-			M.m[1][0] = cr * sp * sy - sr * cy;
-			M.m[1][1] = cr * cp;
-			M.m[1][2] = sr * sy + cr * sp * cy;
-			M.m[1][3] = 0.0f;
-		
-			M.m[2][0] = cp * sy;
-			M.m[2][1] = -sp;
-			M.m[2][2] = cp * cy;
-			M.m[2][3] = 0.0f;
-		
-			M.m[3][0] = 0.0f;
-			M.m[3][1] = 0.0f;
-			M.m[3][2] = 0.0f;
-			M.m[3][3] = 1.0f;
-
-			return M;
-		}
-
-        static MatrixRowMajor<T, CollumSize, RowSize, Repeats> Rotation(float Pitch, float Yaw, float Roll) requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
-            float cp = std::cosf(Pitch);
-            float sp = std::sinf(Pitch);
-
-            float cy = std::cosf(Yaw);
-            float sy = std::sinf(Yaw);
-
-            float cr = std::cosf(Roll);
-            float sr = std::sinf(Roll);
-
-            MatrixRowMajor<T, CollumSize, RowSize, Repeats> M;
-            M.m[0][0] = cr * cy + sr * sp * sy;
-            M.m[0][1] = sr * cp;
-            M.m[0][2] = sr * sp * cy - cr * sy;
-            M.m[0][3] = 0.0f;
-
-            M.m[1][0] = cr * sp * sy - sr * cy;
-            M.m[1][1] = cr * cp;
-            M.m[1][2] = sr * sy + cr * sp * cy;
-            M.m[1][3] = 0.0f;
-
-            M.m[2][0] = cp * sy;
-            M.m[2][1] = -sp;
-            M.m[2][2] = cp * cy;
-            M.m[2][3] = 0.0f;
-
-            return M;
-        }
-
-        template <std::uint32_t VAligment>
-		static MatrixRowMajor<T, CollumSize, RowSize, Repeats> LookAt(Vector<T, 3, VAligment> eyePosition, Vector<T, 3, VAligment> focusPosition, Vector<T, 3, VAligment> upDirection) requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>)  {
-			MatrixRowMajor<T, CollumSize, RowSize, Repeats> M;
-
-			Vector<T, 3, VAligment> negEyeDirection = eyePosition - focusPosition;
-			return LookTo(eyePosition, negEyeDirection, upDirection);
-
-			return M;
-		}
-
-        template <std::uint32_t VAligment>
-		static MatrixRowMajor<T, CollumSize, RowSize, Repeats> LookTo(Vector<T, 3, VAligment> eyePosition, Vector<T, 3, VAligment> eyeDirection, Vector<T, 3, VAligment> upDirection) requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>)  {
-			Vector<T, 3, 3> R2 = eyeDirection.Normalize();
-
-			Vector<T, 3, 3> R0 = upDirection.Cross(R2);
-			R0 = R0.Normalize();
-
-			Vector<T, 3, 3> R1 = R2.Cross(R0);
-
-			Vector<T, 3, 3> NegEyePosition = -eyePosition;
-
-			float D0 = R0.Dot(NegEyePosition);
-			float D1 = R1.Dot(NegEyePosition);
-			float D2 = R2.Dot(NegEyePosition);
-			
-			MatrixRowMajor<T, CollumSize, RowSize, Repeats> M;
-			M.m[0][0] = R0.v[0];
-			M.m[1][0] = R0.v[1];
-			M.m[2][0] = R0.v[2];
-			M.m[3][0] = D0;
-
-			M.m[0][1] = R1.v[0];
-			M.m[1][1] = R1.v[1];
-			M.m[2][1] = R1.v[2];
-			M.m[3][1] = D1;
-
-			M.m[0][2] = R2.v[0];
-			M.m[1][2] = R2.v[1];
-			M.m[2][2] = R2.v[2];
-			M.m[3][2] = D2;
-
-			M.m[0][3] = 0;
-			M.m[1][3] = 0;
-			M.m[2][3] = 0;
-			M.m[3][3] = 1;
-
-			return M;
-		}
-
-        template <std::uint32_t VAligment>
-		static MatrixRowMajor<T, CollumSize, RowSize, Repeats> Rotation(Vector<T, 3, VAligment> vector) requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>) {
-			return Rotation(vector.x, vector.y, vector.z);
-		}
-
-        template <std::uint32_t VAligment>
-		Vector<T, 3, VAligment> TransformNormal(Vector<T, 3, VAligment> V) requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>)  {
-			Vector<T, 3, 3> Z = Vector<T, 3, 3>(V.z, V.z, V.z);
-			Vector<T, 3, 3> Y = Vector<T, 3, 3>(V.y, V.y, V.y);
-			Vector<T, 3, 3> X = Vector<T, 3, 3>(V.x, V.x, V.x);
-		
-			Vector<T, 3, VAligment> Result = Z * Vector<T, 3, VAligment>(m[2][0], m[2][1], m[2][2]);
-			Result = Y * Vector<T, 3, VAligment>(m[1][0], m[1][1], m[1][2]) + Result;
-			Result = X * Vector<T, 3, VAligment>(m[0][0], m[0][1], m[0][2]) + Result;
-		
-			return Result;
-		}
-
-		// MatrixRowMajor<T, CollumSize, RowSize, Repeats> Inverse() requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>) {
-		// 	Vector<float, 4> V0[4], V1[4];
-		// 	V0[0] = Vector<float, 4>(m[0][2], m[0][2], m[1][2], m[1][2]);
-		// 	V1[0] = Vector<float, 4>(m[2][3], m[3][3], m[2][3], m[3][3]);
-		// 	V0[1] = Vector<float, 4>(m[0][0], m[0][0], m[1][0], m[1][0]);
-		// 	V1[1] = Vector<float, 4>(m[2][1], m[3][1], m[2][1], m[3][1]);
-		// 	V0[2] = Vector<float, 4>(m[0][2], m[2][2], m[0][0], m[2][0]);
-		// 	V1[2] = Vector<float, 4>(m[1][3], m[3][3], m[1][1], m[3][1]);
-
-		// 	Vector<float, 4> D0 = V0[0] * V1[0];
-		// 	Vector<float, 4> D1 = V0[1] * V1[1];
-		// 	Vector<float, 4> D2 = V0[2] * V1[2];
-
-		// 	V0[0] = Vector<float, 4>(m[2][2], m[3][2], m[2][2], m[3][2]);
-		// 	V1[0] = Vector<float, 4>(m[0][3], m[0][3], m[1][3], m[1][3]);
-		// 	V0[1] = Vector<float, 4>(m[2][0], m[3][0], m[2][0], m[3][0]);
-		// 	V1[1] = Vector<float, 4>(m[0][1], m[0][1], m[1][1], m[1][1]);
-		// 	V0[2] = Vector<float, 4>(m[1][2], m[3][2], m[1][0], m[3][0]);
-		// 	V1[2] = Vector<float, 4>(m[0][3], m[2][3], m[0][1], m[2][1]);
-
-		// 	D0 = Vector<float, 4>::NegativeMultiplySubtract(V0[0], V1[0], D0);
-		// 	D1 = Vector<float, 4>::NegativeMultiplySubtract(V0[1], V1[1], D1);
-		// 	D2 = Vector<float, 4>::NegativeMultiplySubtract(V0[2], V1[2], D2);
-
-		// 	V0[0] = Vector<float, 4>(m[1][1], m[2][1], m[0][1], m[1][1]);
-		// 	V1[0] = Vector<float, 4>(D2.v[1], D0.v[1], D0.v[3], D0.v[0]);
-		// 	V0[1] = Vector<float, 4>(m[2][0], m[0][0], m[1][0], m[0][0]);
-		// 	V1[1] = Vector<float, 4>(D0.v[3], D2.v[1], D0.v[1], D0.v[2]);
-		// 	V0[2] = Vector<float, 4>(m[1][3], m[2][3], m[0][3], m[1][3]);
-		// 	V1[2] = Vector<float, 4>(D2.v[3], D1.v[1], D1.v[3], D1.v[0]);
-		// 	V0[3] = Vector<float, 4>(m[2][2], m[0][2], m[1][2], m[0][2]);
-		// 	V1[3] = Vector<float, 4>(D1.v[3], D2.v[3], D1.v[1], D1.v[2]);
-
-		// 	Vector<float, 4> C0 = V0[0] * V1[0];
-		// 	Vector<float, 4> C2 = V0[1] * V1[1];
-		// 	Vector<float, 4> C4 = V0[2] * V1[2];
-		// 	Vector<float, 4> C6 = V0[3] * V1[3];
-
-		// 	V0[0] = Vector<float, 4>(m[2][1], m[3][1], m[1][1], m[2][1]);
-		// 	V1[0] = Vector<float, 4>(D0.v[3], D0.v[0], D0.v[1], D2.v[0]);
-		// 	V0[1] = Vector<float, 4>(m[3][0], m[2][0], m[3][0], m[1][0]);
-		// 	V1[1] = Vector<float, 4>(D0.v[2], D0.v[1], D2.v[0], D0.v[0]);
-		// 	V0[2] = Vector<float, 4>(m[2][3], m[3][3], m[1][3], m[2][3]);
-		// 	V1[2] = Vector<float, 4>(D1.v[3], D1.v[0], D1.v[1], D2.v[2]);
-		// 	V0[3] = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Y>(MT.r[2]);
-		// 	V1[3] = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_1Z, XM_PERMUTE_0X>(D1, D2);
-
-		// 	C0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0);
-		// 	C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2);
-		// 	C4 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4);
-		// 	C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6);
-
-		// 	V0[0] = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_X>(MT.r[1]);
-		// 	V1[0] = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_1Y, XM_PERMUTE_1X, XM_PERMUTE_0Z>(D0, D2);
-		// 	V0[1] = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Z>(MT.r[0]);
-		// 	V1[1] = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_1X>(D0, D2);
-		// 	V0[2] = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_X>(MT.r[3]);
-		// 	V1[2] = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_1W, XM_PERMUTE_1Z, XM_PERMUTE_0Z>(D1, D2);
-		// 	V0[3] = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Z>(MT.r[2]);
-		// 	V1[3] = XMVectorPermute<XM_PERMUTE_1W, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_1Z>(D1, D2);
-
-		// 	XMVECTOR C1 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0);
-		// 	C0 = XMVectorMultiplyAdd(V0[0], V1[0], C0);
-		// 	XMVECTOR C3 = XMVectorMultiplyAdd(V0[1], V1[1], C2);
-		// 	C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2);
-		// 	XMVECTOR C5 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4);
-		// 	C4 = XMVectorMultiplyAdd(V0[2], V1[2], C4);
-		// 	XMVECTOR C7 = XMVectorMultiplyAdd(V0[3], V1[3], C6);
-		// 	C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6);
-
-		// 	XMMATRIX R;
-		// 	R.r[0] = XMVectorSelect(C0, C1, g_XMSelect0101.v);
-		// 	R.r[1] = XMVectorSelect(C2, C3, g_XMSelect0101.v);
-		// 	R.r[2] = XMVectorSelect(C4, C5, g_XMSelect0101.v);
-		// 	R.r[3] = XMVectorSelect(C6, C7, g_XMSelect0101.v);
-
-		// 	XMVECTOR Determinant = XMVector4Dot(R.r[0], MT.r[0]);
-		// 	XMVECTOR Reciprocal = XMVectorReciprocal(Determinant);
-
-		// 	XMMATRIX Result;
-		// 	Result.r[0] = XMVectorMultiply(R.r[0], Reciprocal);
-		// 	Result.r[1] = XMVectorMultiply(R.r[1], Reciprocal);
-		// 	Result.r[2] = XMVectorMultiply(R.r[2], Reciprocal);
-		// 	Result.r[3] = XMVectorMultiply(R.r[3], Reciprocal);
-		// 	return Result;
-		// }
-	};
-}
-
-template <>
-struct std::formatter<Crafter::MatrixRowMajor<float, 4, 4, 1>> : std::formatter<std::string> {
-    auto format(const Crafter::MatrixRowMajor<float, 4, 4, 1>& obj, format_context& ctx) const {
-        return std::formatter<std::string>::format(std::format("{{{}, {}, {}, {}\n{}, {}, {}, {}\n{}, {}, {}, {}\n{}, {}, {}, {}}}", 
-			obj.m[0][0], obj.m[0][1], obj.m[0][2], obj.m[0][3], 
-			obj.m[1][0], obj.m[1][1], obj.m[1][2], obj.m[1][3], 
-			obj.m[2][0], obj.m[2][1], obj.m[2][2], obj.m[2][3], 
-			obj.m[3][0], obj.m[3][1], obj.m[3][2], obj.m[3][3]
-		), ctx);
-    }
-};
-
-template <>
-struct std::formatter<Crafter::MatrixRowMajor<float, 4, 3, 1>> : std::formatter<std::string> {
-    auto format(const Crafter::MatrixRowMajor<float, 4, 3, 1>& obj, format_context& ctx) const {
-        return std::formatter<std::string>::format(std::format("{{{}, {}, {}, {}\n{}, {}, {}, {}\n{}, {}, {}, {}}}", 
-			obj.m[0][0], obj.m[0][1], obj.m[0][2], obj.m[0][3], 
-			obj.m[1][0], obj.m[1][1], obj.m[1][2], obj.m[1][3], 
-			obj.m[2][0], obj.m[2][1], obj.m[2][2], obj.m[2][3]
-		), ctx);
-    }
-};
+/*
+Crafter®.Math
+Copyright (C) 2026 Catcrafts®
+catcrafts.net
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License version 3.0 as published by the Free Software Foundation;
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+*/
+
+export module Crafter.Math:MatrixRowMajor;
+
+import :Basic;
+import :VectorF32;
+import std;
+
+namespace Crafter {
+    // Row-major matrix whose rows live in optimized SIMD vectors. All
+    // multiplications are expressed as broadcast + fused multiply-add against
+    // these row vectors so the heavy work stays in __m128/__m256/__m512 land.
+    //
+    // CollumSize is the column count; only CollumSize == 4 is implemented (the
+    // matrix gets one SIMD row per row). Repeats is reserved for future
+    // SoA-style batching.
+    export template <typename T, std::uint32_t CollumSize, std::uint32_t RowSize, std::uint32_t Repeats>
+    class MatrixRowMajor {
+    public:
+        // Rows are exposed publicly so users can compose with VectorF32 ops
+        // directly without going through an accessor. Each row is a single
+        // SIMD vector covering all columns.
+        VectorF32<static_cast<std::uint8_t>(CollumSize), 1> rows[RowSize];
+
+        MatrixRowMajor() = default;
+
+        MatrixRowMajor(
+            float x0, float y0, float z0, float w0,
+            float x1, float y1, float z1, float w1,
+            float x2, float y2, float z2, float w2,
+            float x3, float y3, float z3, float w3
+        ) requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>) {
+            alignas(16) float r0[4] = { x0, y0, z0, w0 };
+            alignas(16) float r1[4] = { x1, y1, z1, w1 };
+            alignas(16) float r2[4] = { x2, y2, z2, w2 };
+            alignas(16) float r3[4] = { x3, y3, z3, w3 };
+            rows[0] = VectorF32<4, 1>(r0);
+            rows[1] = VectorF32<4, 1>(r1);
+            rows[2] = VectorF32<4, 1>(r2);
+            rows[3] = VectorF32<4, 1>(r3);
+        }
+
+        MatrixRowMajor(
+            float x0, float y0, float z0, float w0,
+            float x1, float y1, float z1, float w1,
+            float x2, float y2, float z2, float w2
+        ) requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
+            alignas(16) float r0[4] = { x0, y0, z0, w0 };
+            alignas(16) float r1[4] = { x1, y1, z1, w1 };
+            alignas(16) float r2[4] = { x2, y2, z2, w2 };
+            rows[0] = VectorF32<4, 1>(r0);
+            rows[1] = VectorF32<4, 1>(r1);
+            rows[2] = VectorF32<4, 1>(r2);
+        }
+
+        // Flatten to RowSize*CollumSize contiguous floats (row-major). Replaces
+        // the old `m[i][j]` raw array access for callers that need a packed
+        // float buffer (e.g. GPU upload via memcpy).
+        constexpr void Store(float* dst) const requires(CollumSize == 4 && Repeats == 1 && std::same_as<T, float>) {
+            for (std::uint32_t i = 0; i < RowSize; ++i) {
+                rows[i].Store(dst + i * 4);
+            }
+        }
+
+        constexpr std::array<float, RowSize * 4> Store() const requires(CollumSize == 4 && Repeats == 1 && std::same_as<T, float>) {
+            std::array<float, RowSize * 4> out{};
+            Store(out.data());
+            return out;
+        }
+
+        // Affine transform: extend `b` with implicit w=1 (translation) and dot
+        // each row against it. Three row-dots packed into one batched 4-pair
+        // Dot call (lane 3 of the result is the duplicated row 0 and gets
+        // discarded).
+        VectorF32<3, 1> operator*(VectorF32<3, 1> b) const requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
+            std::array<float, 4> bArr = b.template Store<float>();
+            alignas(16) float bhBuf[4] = { bArr[0], bArr[1], bArr[2], 1.0f };
+            VectorF32<4, 1> bh(bhBuf);
+
+            VectorF32<1, 4> dots = VectorF32<4, 1>::Dot(
+                rows[0], bh, rows[1], bh, rows[2], bh, rows[0], bh);
+
+            std::array<float, 4> dotsArr = dots.template Store<float>();
+            alignas(16) float outBuf[4] = { dotsArr[0], dotsArr[1], dotsArr[2], 0.0f };
+            return VectorF32<3, 1>(outBuf);
+        }
+
+        // Linear transform (no translation): same as the affine version but
+        // with bh.w = 0 so the translation column does not contribute. Useful
+        // for direction vectors and normals.
+        VectorF32<3, 1> TransformNormal(VectorF32<3, 1> b) const requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
+            std::array<float, 4> bArr = b.template Store<float>();
+            alignas(16) float bhBuf[4] = { bArr[0], bArr[1], bArr[2], 0.0f };
+            VectorF32<4, 1> bh(bhBuf);
+
+            VectorF32<1, 4> dots = VectorF32<4, 1>::Dot(
+                rows[0], bh, rows[1], bh, rows[2], bh, rows[0], bh);
+
+            std::array<float, 4> dotsArr = dots.template Store<float>();
+            alignas(16) float outBuf[4] = { dotsArr[0], dotsArr[1], dotsArr[2], 0.0f };
+            return VectorF32<3, 1>(outBuf);
+        }
+
+        // 4×4 matrix product via broadcast + FMA. Each result row is
+        //   b[i][0]·rows[0] + b[i][1]·rows[1] + b[i][2]·rows[2] + b[i][3]·rows[3]
+        // produced with four shuffle-broadcasts and three fused multiply-adds.
+        MatrixRowMajor operator*(MatrixRowMajor b) const requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>) {
+            MatrixRowMajor result;
+            for (std::uint32_t i = 0; i < 4; ++i) {
+                VectorF32<4, 1> bi = b.rows[i];
+                VectorF32<4, 1> bx = bi.template Shuffle<{{0, 0, 0, 0}}>();
+                VectorF32<4, 1> by = bi.template Shuffle<{{1, 1, 1, 1}}>();
+                VectorF32<4, 1> bz = bi.template Shuffle<{{2, 2, 2, 2}}>();
+                VectorF32<4, 1> bw = bi.template Shuffle<{{3, 3, 3, 3}}>();
+
+                VectorF32<4, 1> row = bx * rows[0];
+                row = VectorF32<4, 1>::MulitplyAdd(by, rows[1], row);
+                row = VectorF32<4, 1>::MulitplyAdd(bz, rows[2], row);
+                row = VectorF32<4, 1>::MulitplyAdd(bw, rows[3], row);
+                result.rows[i] = row;
+            }
+            return result;
+        }
+
+        // 4×3 affine product. Same broadcast + FMA pattern, but the implicit
+        // 4th row of both matrices is [0, 0, 0, 1] so the b.w · row3 term
+        // contributes only to the translation slot.
+        MatrixRowMajor operator*(MatrixRowMajor b) const requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
+            alignas(16) float wRowBuf[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
+            VectorF32<4, 1> wRow(wRowBuf);
+
+            MatrixRowMajor result;
+            for (std::uint32_t i = 0; i < 3; ++i) {
+                VectorF32<4, 1> bi = b.rows[i];
+                VectorF32<4, 1> bx = bi.template Shuffle<{{0, 0, 0, 0}}>();
+                VectorF32<4, 1> by = bi.template Shuffle<{{1, 1, 1, 1}}>();
+                VectorF32<4, 1> bz = bi.template Shuffle<{{2, 2, 2, 2}}>();
+                VectorF32<4, 1> bw = bi.template Shuffle<{{3, 3, 3, 3}}>();
+
+                VectorF32<4, 1> row = bx * rows[0];
+                row = VectorF32<4, 1>::MulitplyAdd(by, rows[1], row);
+                row = VectorF32<4, 1>::MulitplyAdd(bz, rows[2], row);
+                row = VectorF32<4, 1>::MulitplyAdd(bw, wRow, row);
+                result.rows[i] = row;
+            }
+            return result;
+        }
+
+        static MatrixRowMajor Identity() requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
+            return MatrixRowMajor(
+                1, 0, 0, 0,
+                0, 1, 0, 0,
+                0, 0, 1, 0
+            );
+        }
+
+        static MatrixRowMajor Identity() requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>) {
+            return MatrixRowMajor(
+                1, 0, 0, 0,
+                0, 1, 0, 0,
+                0, 0, 1, 0,
+                0, 0, 0, 1
+            );
+        }
+
+        static MatrixRowMajor Scaling(float x, float y, float z) requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>) {
+            return MatrixRowMajor(
+                x, 0, 0, 0,
+                0, y, 0, 0,
+                0, 0, z, 0,
+                0, 0, 0, 1
+            );
+        }
+        static MatrixRowMajor Scaling(float x, float y, float z) requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
+            return MatrixRowMajor(
+                x, 0, 0, 0,
+                0, y, 0, 0,
+                0, 0, z, 0
+            );
+        }
+        static MatrixRowMajor Scaling(VectorF32<3, 1> vector) requires(CollumSize == 4 && Repeats == 1 && std::same_as<T, float>) {
+            std::array<float, 4> a = vector.template Store<float>();
+            return Scaling(a[0], a[1], a[2]);
+        }
+
+        static MatrixRowMajor Translation(float x, float y, float z) requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>) {
+            return MatrixRowMajor(
+                1, 0, 0, 0,
+                0, 1, 0, 0,
+                0, 0, 1, 0,
+                x, y, z, 1
+            );
+        }
+        static MatrixRowMajor Translation(float x, float y, float z) requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
+            return MatrixRowMajor(
+                1, 0, 0, x,
+                0, 1, 0, y,
+                0, 0, 1, z
+            );
+        }
+        static MatrixRowMajor Translation(VectorF32<3, 1> vector) requires(CollumSize == 4 && Repeats == 1 && std::same_as<T, float>) {
+            std::array<float, 4> a = vector.template Store<float>();
+            return Translation(a[0], a[1], a[2]);
+        }
+
+        // Pitch/yaw/roll Euler rotation. Computes all three sin/cos pairs as a
+        // single batched SinCos on a VectorF32<3, 1>, then assembles the rows.
+        static MatrixRowMajor Rotation(float Pitch, float Yaw, float Roll) requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>) {
+            alignas(16) float angles[4] = { Pitch, Yaw, Roll, 0.0f };
+            VectorF32<3, 1> v(angles);
+            std::tuple<VectorF32<3, 1>, VectorF32<3, 1>> sc = v.SinCos();
+            std::array<float, 4> s = std::get<0>(sc).template Store<float>();
+            std::array<float, 4> c = std::get<1>(sc).template Store<float>();
+            const float sp = s[0], cp = c[0];
+            const float sy = s[1], cy = c[1];
+            const float sr = s[2], cr = c[2];
+
+            return MatrixRowMajor(
+                cr * cy + sr * sp * sy, sr * cp, sr * sp * cy - cr * sy, 0.0f,
+                cr * sp * sy - sr * cy, cr * cp, sr * sy + cr * sp * cy, 0.0f,
+                cp * sy,                -sp,     cp * cy,                0.0f,
+                0.0f,                   0.0f,    0.0f,                   1.0f
+            );
+        }
+
+        static MatrixRowMajor Rotation(float Pitch, float Yaw, float Roll) requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
+            alignas(16) float angles[4] = { Pitch, Yaw, Roll, 0.0f };
+            VectorF32<3, 1> v(angles);
+            std::tuple<VectorF32<3, 1>, VectorF32<3, 1>> sc = v.SinCos();
+            std::array<float, 4> s = std::get<0>(sc).template Store<float>();
+            std::array<float, 4> c = std::get<1>(sc).template Store<float>();
+            const float sp = s[0], cp = c[0];
+            const float sy = s[1], cy = c[1];
+            const float sr = s[2], cr = c[2];
+
+            return MatrixRowMajor(
+                cr * cy + sr * sp * sy, sr * cp, sr * sp * cy - cr * sy, 0.0f,
+                cr * sp * sy - sr * cy, cr * cp, sr * sy + cr * sp * cy, 0.0f,
+                cp * sy,                -sp,     cp * cy,                0.0f
+            );
+        }
+
+        static MatrixRowMajor Rotation(VectorF32<3, 1> v) requires(CollumSize == 4 && Repeats == 1 && std::same_as<T, float>) {
+            std::array<float, 4> a = v.template Store<float>();
+            return Rotation(a[0], a[1], a[2]);
+        }
+
+        // View matrix: builds the basis from a forward (negated) direction and
+        // an up reference, then dots each basis vector with -eye for the
+        // translation column. The four dots needed are produced by a single
+        // batched 4-pair Dot.
+        static MatrixRowMajor LookTo(VectorF32<3, 1> eyePosition, VectorF32<3, 1> eyeDirection, VectorF32<3, 1> upDirection) requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>) {
+            // R0 = up × R2 is linear in R2, so its normalized direction does
+            // not depend on whether we hand R2 in before or after its own
+            // normalize. Computing R0_raw from the un-normalized R2 lets us
+            // satisfy the 4-input Normalize requirement with one batched call
+            // (duplicating R2 and R0 in the padding slots).
+            VectorF32<3, 1> R0Raw = VectorF32<3, 1>::Cross(upDirection, eyeDirection);
+            auto normalized = VectorF32<3, 1>::Normalize(eyeDirection, R0Raw, eyeDirection, R0Raw);
+            VectorF32<3, 1> R2 = std::get<0>(normalized);
+            VectorF32<3, 1> R0 = std::get<1>(normalized);
+            VectorF32<3, 1> R1 = VectorF32<3, 1>::Cross(R2, R0);
+            VectorF32<3, 1> negEye = -eyePosition;
+
+            VectorF32<1, 4> dots = VectorF32<3, 1>::Dot(
+                R0, negEye, R1, negEye, R2, negEye, R0, negEye);
+            std::array<float, 4> d = dots.template Store<float>();
+            std::array<float, 4> r0a = R0.template Store<float>();
+            std::array<float, 4> r1a = R1.template Store<float>();
+            std::array<float, 4> r2a = R2.template Store<float>();
+
+            return MatrixRowMajor(
+                r0a[0], r1a[0], r2a[0], 0.0f,
+                r0a[1], r1a[1], r2a[1], 0.0f,
+                r0a[2], r1a[2], r2a[2], 0.0f,
+                d[0],   d[1],   d[2],   1.0f
+            );
+        }
+
+        static MatrixRowMajor LookAt(VectorF32<3, 1> eyePosition, VectorF32<3, 1> focusPosition, VectorF32<3, 1> upDirection) requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>) {
+            return LookTo(eyePosition, eyePosition - focusPosition, upDirection);
+        }
+    };
+}
+
+// Pretty printer using Store() so it does not depend on the legacy m[i][j]
+// access pattern.
+template <>
+struct std::formatter<Crafter::MatrixRowMajor<float, 4, 4, 1>> : std::formatter<std::string> {
+    auto format(const Crafter::MatrixRowMajor<float, 4, 4, 1>& obj, format_context& ctx) const {
+        std::array<float, 16> v = obj.Store();
+        return std::formatter<std::string>::format(std::format(
+            "{{{}, {}, {}, {}\n{}, {}, {}, {}\n{}, {}, {}, {}\n{}, {}, {}, {}}}",
+            v[0],  v[1],  v[2],  v[3],
+            v[4],  v[5],  v[6],  v[7],
+            v[8],  v[9],  v[10], v[11],
+            v[12], v[13], v[14], v[15]
+        ), ctx);
+    }
+};
+
+template <>
+struct std::formatter<Crafter::MatrixRowMajor<float, 4, 3, 1>> : std::formatter<std::string> {
+    auto format(const Crafter::MatrixRowMajor<float, 4, 3, 1>& obj, format_context& ctx) const {
+        std::array<float, 12> v = obj.Store();
+        return std::formatter<std::string>::format(std::format(
+            "{{{}, {}, {}, {}\n{}, {}, {}, {}\n{}, {}, {}, {}}}",
+            v[0], v[1], v[2],  v[3],
+            v[4], v[5], v[6],  v[7],
+            v[8], v[9], v[10], v[11]
+        ), ctx);
+    }
+};