/*
Crafter®.Math
Copyright (C) 2026 Catcrafts®
catcrafts.net

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License version 3.0 as published by the Free Software Foundation;

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
*/

export module Crafter.Math:MatrixRowMajor;

import :Basic;
import :VectorF32;
import std;

namespace Crafter {
    // Row-major matrix whose rows live in optimized SIMD vectors. All
    // multiplications are expressed as broadcast + fused multiply-add against
    // these row vectors so the heavy work stays in __m128/__m256/__m512 land.
    //
    // Template parameters:
    //   T          - element type; every operation below is constrained to
    //                float.
    //   CollumSize - column count; only CollumSize == 4 is implemented (the
    //                matrix gets one SIMD row per row).
    //   RowSize    - row count; operations are provided for RowSize == 4 and
    //                for RowSize == 3 (affine form whose 4th row is the
    //                implicit [0, 0, 0, 1]).
    //   Repeats    - reserved for future SoA-style batching; every operation
    //                below requires Repeats == 1.
    export template <typename T, std::uint32_t CollumSize, std::uint32_t RowSize, std::uint32_t Repeats>
    class MatrixRowMajor {
    public:
        // Rows are exposed publicly so users can compose with VectorF32 ops
        // directly without going through an accessor. Each row is a single
        // SIMD vector covering all columns.
        VectorF32<static_cast<std::uint8_t>(CollumSize), 1> rows[RowSize];

        // Leaves the rows in whatever state VectorF32's default constructor
        // produces.
        MatrixRowMajor() = default;

        // Builds a 4×4 matrix from 16 scalars given in row-major order
        // (x0..w0 is row 0, x1..w1 is row 1, ...).
        MatrixRowMajor(
            float x0, float y0, float z0, float w0,
            float x1, float y1, float z1, float w1,
            float x2, float y2, float z2, float w2,
            float x3, float y3, float z3, float w3
        ) requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>) {
            alignas(16) float r0[4] = { x0, y0, z0, w0 };
            alignas(16) float r1[4] = { x1, y1, z1, w1 };
            alignas(16) float r2[4] = { x2, y2, z2, w2 };
            alignas(16) float r3[4] = { x3, y3, z3, w3 };
            rows[0] = VectorF32<4, 1>(r0);
            rows[1] = VectorF32<4, 1>(r1);
            rows[2] = VectorF32<4, 1>(r2);
            rows[3] = VectorF32<4, 1>(r3);
        }

        // Builds a 3-row, 4-column matrix from 12 scalars given in row-major
        // order.
        MatrixRowMajor(
            float x0, float y0, float z0, float w0,
            float x1, float y1, float z1, float w1,
            float x2, float y2, float z2, float w2
        ) requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
            alignas(16) float r0[4] = { x0, y0, z0, w0 };
            alignas(16) float r1[4] = { x1, y1, z1, w1 };
            alignas(16) float r2[4] = { x2, y2, z2, w2 };
            rows[0] = VectorF32<4, 1>(r0);
            rows[1] = VectorF32<4, 1>(r1);
            rows[2] = VectorF32<4, 1>(r2);
        }

        // Flatten to RowSize*CollumSize contiguous floats (row-major). Replaces
        // the old `m[i][j]` raw array access for callers that need a packed
        // float buffer (e.g. GPU upload via memcpy).
        //
        // dst must have room for RowSize * 4 floats; row i is written at
        // dst + i * 4. Any alignment requirement on dst comes from
        // VectorF32::Store, which is not visible here -- TODO confirm.
        constexpr void Store(float* dst) const requires(CollumSize == 4 && Repeats == 1 && std::same_as<T, float>) {
            for (std::uint32_t i = 0; i < RowSize; ++i) {
                rows[i].Store(dst + i * 4);
            }
        }

        // Same flattening as Store(float*), returned by value.
        constexpr std::array<float, RowSize * 4> Store() const requires(CollumSize == 4 && Repeats == 1 && std::same_as<T, float>) {
            std::array<float, RowSize * 4> out{};
            Store(out.data());
            return out;
        }

        // Affine transform: extend `b` with implicit w=1 (translation) and dot
        // each row against it, so column 3 of the matrix is added to the
        // result as a translation.
        VectorF32<3, 1> operator*(VectorF32<3, 1> b) const requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
            return TransformHomogeneous(b, 1.0f);
        }

        // Linear transform (no translation): same as the affine version but
        // with w = 0 so the translation column does not contribute. Useful
        // for direction vectors and normals.
        VectorF32<3, 1> TransformNormal(VectorF32<3, 1> b) const requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
            return TransformHomogeneous(b, 0.0f);
        }

        // 4×4 matrix product via broadcast + FMA. Each result row is
        //   b[i][0]·rows[0] + b[i][1]·rows[1] + b[i][2]·rows[2] + b[i][3]·rows[3]
        // produced with four shuffle-broadcasts and three fused multiply-adds.
        //
        // Note the operand order this implies: result row i is row i of `b`
        // combined with the rows of *this, so `a * b` evaluates to the
        // conventional row-major product b × a.
        MatrixRowMajor operator*(MatrixRowMajor b) const requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>) {
            MatrixRowMajor result;
            for (std::uint32_t i = 0; i < 4; ++i) {
                result.rows[i] = CombineRows(b.rows[i], rows[3]);
            }
            return result;
        }

        // 4×3 affine product. Same broadcast + FMA pattern, but the implicit
        // 4th row of both matrices is [0, 0, 0, 1] so the b.w · row3 term
        // contributes only to the translation slot. Operand order matches the
        // 4×4 overload: `a * b` evaluates to b × a.
        MatrixRowMajor operator*(MatrixRowMajor b) const requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
            // Stand-in for the 4th row that a 3-row matrix does not store.
            alignas(16) float wRowBuf[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
            VectorF32<4, 1> wRow(wRowBuf);

            MatrixRowMajor result;
            for (std::uint32_t i = 0; i < 3; ++i) {
                result.rows[i] = CombineRows(b.rows[i], wRow);
            }
            return result;
        }

        // Identity for the 3-row layout (zero translation column).
        static MatrixRowMajor Identity() requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
            return MatrixRowMajor(
                1, 0, 0, 0,
                0, 1, 0, 0,
                0, 0, 1, 0
            );
        }

        // Identity for the 4×4 layout.
        static MatrixRowMajor Identity() requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>) {
            return MatrixRowMajor(
                1, 0, 0, 0,
                0, 1, 0, 0,
                0, 0, 1, 0,
                0, 0, 0, 1
            );
        }

        // Axis-aligned scale: x, y, z go on the diagonal.
        static MatrixRowMajor Scaling(float x, float y, float z) requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>) {
            return MatrixRowMajor(
                x, 0, 0, 0,
                0, y, 0, 0,
                0, 0, z, 0,
                0, 0, 0, 1
            );
        }
        static MatrixRowMajor Scaling(float x, float y, float z) requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
            return MatrixRowMajor(
                x, 0, 0, 0,
                0, y, 0, 0,
                0, 0, z, 0
            );
        }
        // Vector convenience overload: forwards the first three stored lanes
        // to Scaling(x, y, z) for whichever RowSize this matrix has.
        static MatrixRowMajor Scaling(VectorF32<3, 1> vector) requires(CollumSize == 4 && Repeats == 1 && std::same_as<T, float>) {
            std::array<float, 4> a = vector.template Store<float>();
            return Scaling(a[0], a[1], a[2]);
        }

        // Translation. The two layouts store it differently: the 4-row variant
        // puts (x, y, z) in the last ROW, the 3-row variant puts it in the
        // last COLUMN (which is what operator*(VectorF32<3, 1>) reads as the
        // translation).
        static MatrixRowMajor Translation(float x, float y, float z) requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>) {
            return MatrixRowMajor(
                1, 0, 0, 0,
                0, 1, 0, 0,
                0, 0, 1, 0,
                x, y, z, 1
            );
        }
        static MatrixRowMajor Translation(float x, float y, float z) requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
            return MatrixRowMajor(
                1, 0, 0, x,
                0, 1, 0, y,
                0, 0, 1, z
            );
        }
        // Vector convenience overload: forwards the first three stored lanes
        // to Translation(x, y, z) for whichever RowSize this matrix has.
        static MatrixRowMajor Translation(VectorF32<3, 1> vector) requires(CollumSize == 4 && Repeats == 1 && std::same_as<T, float>) {
            std::array<float, 4> a = vector.template Store<float>();
            return Translation(a[0], a[1], a[2]);
        }

        // Pitch/yaw/roll Euler rotation. The three rotation rows come from
        // EulerRotationRows; this overload appends the explicit [0, 0, 0, 1]
        // row.
        static MatrixRowMajor Rotation(float Pitch, float Yaw, float Roll) requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>) {
            const std::array<float, 12> r = EulerRotationRows(Pitch, Yaw, Roll);
            return MatrixRowMajor(
                r[0], r[1], r[2],  r[3],
                r[4], r[5], r[6],  r[7],
                r[8], r[9], r[10], r[11],
                0.0f, 0.0f, 0.0f,  1.0f
            );
        }

        // 3-row variant of the Euler rotation (zero translation column).
        // NOTE(review): this uses exactly the same 3×3 coefficients as the
        // 4-row overload even though the two layouts place translation
        // differently (last column here, last row there) -- confirm the
        // resulting rotation sense is the intended one for both.
        static MatrixRowMajor Rotation(float Pitch, float Yaw, float Roll) requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
            const std::array<float, 12> r = EulerRotationRows(Pitch, Yaw, Roll);
            return MatrixRowMajor(
                r[0], r[1], r[2],  r[3],
                r[4], r[5], r[6],  r[7],
                r[8], r[9], r[10], r[11]
            );
        }

        // Vector convenience overload: lanes 0, 1, 2 are taken as pitch, yaw
        // and roll respectively.
        static MatrixRowMajor Rotation(VectorF32<3, 1> v) requires(CollumSize == 4 && Repeats == 1 && std::same_as<T, float>) {
            std::array<float, 4> a = v.template Store<float>();
            return Rotation(a[0], a[1], a[2]);
        }

        // View matrix: builds the basis from a forward (negated) direction and
        // an up reference, then dots each basis vector with -eye for the
        // translation row. The four dots needed are produced by a single
        // batched 4-pair Dot. The basis vectors end up as the COLUMNS of the
        // upper 3×3 and the three dots fill the last row.
        static MatrixRowMajor LookTo(VectorF32<3, 1> eyePosition, VectorF32<3, 1> eyeDirection, VectorF32<3, 1> upDirection) requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>) {
            // R0 = up × R2 is linear in R2, so its normalized direction does
            // not depend on whether we hand R2 in before or after its own
            // normalize. Computing R0_raw from the un-normalized R2 lets us
            // satisfy the 4-input Normalize requirement with one batched call
            // (duplicating R2 and R0 in the padding slots).
            VectorF32<3, 1> R0Raw = VectorF32<3, 1>::Cross(upDirection, eyeDirection);
            auto normalized = VectorF32<3, 1>::Normalize(eyeDirection, R0Raw, eyeDirection, R0Raw);
            VectorF32<3, 1> R2 = std::get<0>(normalized);
            VectorF32<3, 1> R0 = std::get<1>(normalized);
            VectorF32<3, 1> R1 = VectorF32<3, 1>::Cross(R2, R0);
            VectorF32<3, 1> negEye = -eyePosition;

            // Fourth pair is padding (a repeat of R0); its lane is never read.
            VectorF32<1, 4> dots = VectorF32<3, 1>::Dot(
                R0, negEye, R1, negEye, R2, negEye, R0, negEye);
            std::array<float, 4> d = dots.template Store<float>();
            std::array<float, 4> r0a = R0.template Store<float>();
            std::array<float, 4> r1a = R1.template Store<float>();
            std::array<float, 4> r2a = R2.template Store<float>();

            return MatrixRowMajor(
                r0a[0], r1a[0], r2a[0], 0.0f,
                r0a[1], r1a[1], r2a[1], 0.0f,
                r0a[2], r1a[2], r2a[2], 0.0f,
                d[0],   d[1],   d[2],   1.0f
            );
        }

        // View matrix looking from eyePosition towards focusPosition. Passes
        // (eye - focus), i.e. the negated viewing direction, to LookTo.
        static MatrixRowMajor LookAt(VectorF32<3, 1> eyePosition, VectorF32<3, 1> focusPosition, VectorF32<3, 1> upDirection) requires(CollumSize == 4 && RowSize == 4 && Repeats == 1 && std::same_as<T, float>) {
            return LookTo(eyePosition, eyePosition - focusPosition, upDirection);
        }

    private:
        // Shared body of operator*(VectorF32<3, 1>) and TransformNormal:
        // extends `b` to (b.x, b.y, b.z, w) and dots it against the three
        // rows. The three row-dots are packed into one batched 4-pair Dot
        // call (lane 3 of the result is the duplicated row 0 and gets
        // discarded).
        VectorF32<3, 1> TransformHomogeneous(VectorF32<3, 1> b, float w) const requires(CollumSize == 4 && RowSize == 3 && Repeats == 1 && std::same_as<T, float>) {
            std::array<float, 4> bArr = b.template Store<float>();
            alignas(16) float bhBuf[4] = { bArr[0], bArr[1], bArr[2], w };
            VectorF32<4, 1> bh(bhBuf);

            VectorF32<1, 4> dots = VectorF32<4, 1>::Dot(
                rows[0], bh, rows[1], bh, rows[2], bh, rows[0], bh);

            std::array<float, 4> dotsArr = dots.template Store<float>();
            alignas(16) float outBuf[4] = { dotsArr[0], dotsArr[1], dotsArr[2], 0.0f };
            return VectorF32<3, 1>(outBuf);
        }

        // Shared inner step of both matrix products. Returns
        //   coeffs[0]·rows[0] + coeffs[1]·rows[1] + coeffs[2]·rows[2] + coeffs[3]·lastRow
        // using four shuffle-broadcasts and three fused multiply-adds.
        // `lastRow` is rows[3] for a 4-row matrix and the implicit
        // [0, 0, 0, 1] for a 3-row one.
        VectorF32<4, 1> CombineRows(VectorF32<4, 1> coeffs, const VectorF32<4, 1>& lastRow) const requires(CollumSize == 4 && (RowSize == 3 || RowSize == 4) && Repeats == 1 && std::same_as<T, float>) {
            VectorF32<4, 1> bx = coeffs.template Shuffle<{{0, 0, 0, 0}}>();
            VectorF32<4, 1> by = coeffs.template Shuffle<{{1, 1, 1, 1}}>();
            VectorF32<4, 1> bz = coeffs.template Shuffle<{{2, 2, 2, 2}}>();
            VectorF32<4, 1> bw = coeffs.template Shuffle<{{3, 3, 3, 3}}>();

            VectorF32<4, 1> row = bx * rows[0];
            row = VectorF32<4, 1>::MulitplyAdd(by, rows[1], row);
            row = VectorF32<4, 1>::MulitplyAdd(bz, rows[2], row);
            row = VectorF32<4, 1>::MulitplyAdd(bw, lastRow, row);
            return row;
        }

        // Shared body of both Rotation(float, float, float) overloads.
        // Computes all three sin/cos pairs as a single batched SinCos on a
        // VectorF32<3, 1>, then returns the three rotation rows as 12
        // row-major floats (4th column of every row is 0).
        static std::array<float, 12> EulerRotationRows(float Pitch, float Yaw, float Roll) requires(CollumSize == 4 && Repeats == 1 && std::same_as<T, float>) {
            alignas(16) float angles[4] = { Pitch, Yaw, Roll, 0.0f };
            VectorF32<3, 1> v(angles);
            std::tuple<VectorF32<3, 1>, VectorF32<3, 1>> sc = v.SinCos();
            std::array<float, 4> s = std::get<0>(sc).template Store<float>();
            std::array<float, 4> c = std::get<1>(sc).template Store<float>();
            const float sp = s[0], cp = c[0];
            const float sy = s[1], cy = c[1];
            const float sr = s[2], cr = c[2];

            return {
                cr * cy + sr * sp * sy, sr * cp, sr * sp * cy - cr * sy, 0.0f,
                cr * sp * sy - sr * cy, cr * cp, sr * sy + cr * sp * cy, 0.0f,
                cp * sy,                -sp,     cp * cy,                0.0f
            };
        }
    };
}

// Pretty printer using Store() so it does not depend on the legacy m[i][j]
// access pattern.
template <>
struct std::formatter<Crafter::MatrixRowMajor<float, 4, 4, 1>> : std::formatter<std::string> {
    // Renders a 4×4 matrix as one brace-wrapped block: four lines, one per
    // row, each holding that row's four values separated by ", ". The text
    // is built first and then handed to the std::string formatter this
    // struct derives from, which writes it into `ctx`.
    auto format(const Crafter::MatrixRowMajor<float, 4, 4, 1>& obj, format_context& ctx) const {
        using StringFormatter = std::formatter<std::string>;

        // Row-major flattening: cells[r * 4 + c] is row r, column c.
        const std::array<float, 16> cells = obj.Store();
        const std::string text = std::format(
            "{{{}, {}, {}, {}\n{}, {}, {}, {}\n{}, {}, {}, {}\n{}, {}, {}, {}}}",
            cells[0],  cells[1],  cells[2],  cells[3],
            cells[4],  cells[5],  cells[6],  cells[7],
            cells[8],  cells[9],  cells[10], cells[11],
            cells[12], cells[13], cells[14], cells[15]
        );
        return StringFormatter::format(text, ctx);
    }
};

template <>
struct std::formatter<Crafter::MatrixRowMajor<float, 4, 3, 1>> : std::formatter<std::string> {
    // Renders a 3-row, 4-column matrix as one brace-wrapped block: three
    // lines, one per row, each holding that row's four values separated by
    // ", ". The text is built first and then handed to the std::string
    // formatter this struct derives from, which writes it into `ctx`.
    auto format(const Crafter::MatrixRowMajor<float, 4, 3, 1>& obj, format_context& ctx) const {
        using StringFormatter = std::formatter<std::string>;

        // Row-major flattening: cells[r * 4 + c] is row r, column c.
        const std::array<float, 12> cells = obj.Store();
        const std::string text = std::format(
            "{{{}, {}, {}, {}\n{}, {}, {}, {}\n{}, {}, {}, {}}}",
            cells[0], cells[1], cells[2],  cells[3],
            cells[4], cells[5], cells[6],  cells[7],
            cells[8], cells[9], cells[10], cells[11]
        );
        return StringFormatter::format(text, ctx);
    }
};