vectorF16

2026-03-18 03:16:29 +01:00 · 2026-03-18 03:16:29 +01:00 · fe016adc18
commit fe016adc18
parent 4624f30f1f
4 changed files with 540 additions and 7 deletions
--- a/interfaces/Crafter.Math-VectorF16.cppm
+++ b/interfaces/Crafter.Math-VectorF16.cppm
@ -0,0 +1,489 @@
+// /*
+// Crafter®.Math
+// Copyright (C) 2026 Catcrafts®
+// catcrafts.net
+
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License version 3.0 as published by the Free Software Foundation;
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+// */
+// module;
+// #include <immintrin.h>
+// export module Crafter.Math:VectorF16;
+// import std;
+// import :Vector;
+
+// namespace Crafter {
+// 	export template <std::uint32_t Len, std::uint32_t Packing, std::uint32_t Repeats>
+// 	struct VectorF16 {
+//         static constexpr std::uint32_t MaxSize = 32;
+//         static constexpr std::uint32_t MaxElement = 8;
+//         static consteval std::uint32_t GetAlignment() { // Rounds Len * Packing up to the next of 8, 16 or 32.
+//             if constexpr (Len * Packing <= 8) {
+//                 return 8;
+//             }
+//             if constexpr (Len * Packing <= 16) {
+//                 return 16;
+//             }
+//             if constexpr (Len * Packing <= 32) {
+//                 return 32;
+//             }
+//             static_assert(Len * Packing <= 32, "Len * Packing is larger than supported max size of 32");
+//             static_assert(Len * Packing <= 8, "Len * Packing is larger than supported packed size of 8"); // NOTE(review): this fires for every Len * Packing > 8, so the 16 and 32 branches above can never be instantiated - confirm which limit is intended.
+//             static_assert(Len * Packing * Repeats <= 32, "Len * Packing * Repeats is larger than supported max of 32");
+//         }
+//         static consteval std::uint32_t GetTotalSize() {
+//             return GetAlignment() * Repeats;
+//         }
+
+//         using VectorType = std::conditional_t<
+//             (GetTotalSize() == 32), __m512h,
+//             std::conditional_t<(GetTotalSize() == 16), __m256h, __m128h>
+//         >;
+
+//         VectorType v;
+       
+// 		constexpr VectorF16() = default;
+//         constexpr VectorF16(VectorType v) : v(v) {}
+//         template <std::uint32_t VLen, std::uint32_t VAlign>
+//         constexpr VectorF16(const Vector<_Float16, VLen, VAlign>* vA) requires(VAlign != 0 || VLen >= GetTotalSize()) {
+//             if constexpr(std::is_same_v<VectorType, __m128h>) {
+//                 v = _mm_loadu_ph(vA->v);
+//             } else if constexpr(std::is_same_v<VectorType, __m256h>) {
+//                 v = _mm256_loadu_ph(vA->v);
+//             } else {
+//                 v = _mm512_loadu_ph(vA->v);
+//             }
+//         };
+
+//         template <std::uint32_t VLen, std::uint32_t VAlign>
+//         constexpr void Load(const Vector<_Float16, VLen, VAlign>* vA) {
+//             if constexpr(std::is_same_v<VectorType, __m128h>) {
+//                 v = _mm_loadu_ph(vA->v);
+//             } else if constexpr(std::is_same_v<VectorType, __m256h>) {
+//                 v = _mm256_loadu_ph(vA->v);
+//             } else {
+//                 v = _mm512_loadu_ph(vA->v);
+//             }
+//         }
+
+//         template <std::uint32_t VLen, std::uint32_t VAlign>
+//         constexpr void Store(Vector<_Float16, VLen, VAlign>* vA) const {
+//             if constexpr(std::is_same_v<VectorType, __m128h>) {
+//                 _mm_storeu_ph(vA->v, v);
+//             } else if constexpr(std::is_same_v<VectorType, __m256h>) {
+//                 _mm256_storeu_ph(vA->v, v);
+//             } else {
+//                 _mm512_storeu_ph(vA->v, v);
+//             }
+//         }
+
+//         template <std::uint32_t VLen, std::uint32_t VAlign>
+//         constexpr Vector<_Float16, VLen, VAlign> Store() const {
+//             Vector<_Float16, VLen, VAlign> returnVec;
+//             Store(&returnVec);
+//             return returnVec;
+//         }
+
+//         template <std::uint32_t BLen, std::uint32_t BPacking, std::uint32_t BRepeats>
+//         constexpr operator VectorF16<BLen, BPacking, BRepeats>() const {
+//             if constexpr(std::is_same_v<VectorType, __m256h> && std::is_same_v<typename VectorF16<BLen, BPacking, BRepeats>::VectorType, __m128h>) {
+//                 return VectorF16<BLen, BPacking, BRepeats>(_mm256_castph256_ph128(v));
+//             } else if constexpr(std::is_same_v<VectorType, __m512h> && std::is_same_v<typename VectorF16<BLen, BPacking, BRepeats>::VectorType, __m128h>) {
+//                 return VectorF16<BLen, BPacking, BRepeats>(_mm512_castph512_ph128(v));
+//             } else if constexpr(std::is_same_v<VectorType, __m512h> && std::is_same_v<typename VectorF16<BLen, BPacking, BRepeats>::VectorType, __m256h>) {
+//                 return VectorF16<BLen, BPacking, BRepeats>(_mm512_castph512_ph256(v));
+//             } else if constexpr(std::is_same_v<VectorType, __m128h> && std::is_same_v<typename VectorF16<BLen, BPacking, BRepeats>::VectorType, __m256h>) {
+//                 return VectorF16<BLen, BPacking, BRepeats>(_mm256_castph128_ph256(v));
+//             } else if constexpr(std::is_same_v<VectorType, __m128h> && std::is_same_v<typename VectorF16<BLen, BPacking, BRepeats>::VectorType, __m512h>) {
+//                 return VectorF16<BLen, BPacking, BRepeats>(_mm512_castph128_ph512(v));
+//             } else if constexpr(std::is_same_v<VectorType, __m256h> && std::is_same_v<typename VectorF16<BLen, BPacking, BRepeats>::VectorType, __m512h>) {
+//                 return VectorF16<BLen, BPacking, BRepeats>(_mm512_castph256_ph512(v));
+//             } else {
+//                 return VectorF16<BLen, BPacking, BRepeats>(v);
+//             }
+//         }
+
+// 		constexpr VectorF16<Len, Packing, Repeats> operator+(VectorF16<Len, Packing, Repeats> b) const {
+//             if constexpr(std::is_same_v<VectorType, __m128h>) {
+//                 return VectorF16<Len, Packing, Repeats>(_mm_add_ph(v, b.v));
+//             } else if constexpr(std::is_same_v<VectorType, __m256h>) {
+//                 return VectorF16<Len, Packing, Repeats>(_mm256_add_ph(v, b.v));
+//             } else {
+//                return VectorF16<Len, Packing, Repeats>(_mm512_add_ph(v, b.v));
+//             }
+// 		}
+
+//         constexpr VectorF16<Len, Packing, Repeats> operator-(VectorF16<Len, Packing, Repeats> b) const {
+//             if constexpr(std::is_same_v<VectorType, __m128h>) {
+//                 return VectorF16<Len, Packing, Repeats>(_mm_sub_ph(v, b.v));
+//             } else if constexpr(std::is_same_v<VectorType, __m256h>) {
+//                 return VectorF16<Len, Packing, Repeats>(_mm256_sub_ph(v, b.v));
+//             } else {
+//                return VectorF16<Len, Packing, Repeats>(_mm512_sub_ph(v, b.v));
+//             }
+// 		}
+
+//         constexpr VectorF16<Len, Packing, Repeats> operator*(VectorF16<Len, Packing, Repeats> b) const {
+//             if constexpr(std::is_same_v<VectorType, __m128h>) {
+//                 return VectorF16<Len, Packing, Repeats>(_mm_mul_ph(v, b.v));
+//             } else if constexpr(std::is_same_v<VectorType, __m256h>) {
+//                 return VectorF16<Len, Packing, Repeats>(_mm256_mul_ph(v, b.v));
+//             } else {
+//                return VectorF16<Len, Packing, Repeats>(_mm512_mul_ph(v, b.v));
+//             }
+// 		}
+
+//         constexpr VectorF16<Len, Packing, Repeats> operator/(VectorF16<Len, Packing, Repeats> b) const {
+//             if constexpr(std::is_same_v<VectorType, __m128h>) {
+//                 return VectorF16<Len, Packing, Repeats>(_mm_div_ph(v, b.v));
+//             } else if constexpr(std::is_same_v<VectorType, __m256h>) {
+//                 return VectorF16<Len, Packing, Repeats>(_mm256_div_ph(v, b.v));
+//             } else {
+//                return VectorF16<Len, Packing, Repeats>(_mm512_div_ph(v, b.v));
+//             }
+// 		}
+
+        
+// 		// Adds b lane-wise into this vector and returns a reference to the updated vector.
+// 		// Not const: the register member v is overwritten.
+// 		constexpr VectorF16<Len, Packing, Repeats>& operator+=(VectorF16<Len, Packing, Repeats> b) {
+//             if constexpr(std::is_same_v<VectorType, __m128h>) {
+//                 v = _mm_add_ph(v, b.v);
+//             } else if constexpr(std::is_same_v<VectorType, __m256h>) {
+//                 v = _mm256_add_ph(v, b.v);
+//             } else {
+//                 v = _mm512_add_ph(v, b.v);
+//             }
+//             return *this;
+// 		}
+
+//         // Subtracts b lane-wise from this vector and returns a reference to the updated vector.
+//         // Not const: the register member v is overwritten.
+//         constexpr VectorF16<Len, Packing, Repeats>& operator-=(VectorF16<Len, Packing, Repeats> b) {
+//             if constexpr(std::is_same_v<VectorType, __m128h>) {
+//                 v = _mm_sub_ph(v, b.v);
+//             } else if constexpr(std::is_same_v<VectorType, __m256h>) {
+//                 v = _mm256_sub_ph(v, b.v);
+//             } else {
+//                 v = _mm512_sub_ph(v, b.v);
+//             }
+//             return *this;
+// 		}
+
+//         // Multiplies this vector lane-wise by b and returns a reference to the updated vector.
+//         // Not const: the register member v is overwritten.
+//         constexpr VectorF16<Len, Packing, Repeats>& operator*=(VectorF16<Len, Packing, Repeats> b) {
+//             if constexpr(std::is_same_v<VectorType, __m128h>) {
+//                 v = _mm_mul_ph(v, b.v);
+//             } else if constexpr(std::is_same_v<VectorType, __m256h>) {
+//                 v = _mm256_mul_ph(v, b.v);
+//             } else {
+//                 v = _mm512_mul_ph(v, b.v);
+//             }
+//             return *this;
+// 		}
+
+//         // Divides this vector lane-wise by b and returns a reference to the updated vector.
+//         // Not const: the register member v is overwritten.
+//         constexpr VectorF16<Len, Packing, Repeats>& operator/=(VectorF16<Len, Packing, Repeats> b) {
+//             if constexpr(std::is_same_v<VectorType, __m128h>) {
+//                 v = _mm_div_ph(v, b.v);
+//             } else if constexpr(std::is_same_v<VectorType, __m256h>) {
+//                 v = _mm256_div_ph(v, b.v);
+//             } else {
+//                 v = _mm512_div_ph(v, b.v);
+//             }
+//             return *this;
+// 		}
+
+// 		// Returns a copy with every lane negated, by flipping the sign bit (bit 15) of each 16-bit element.
+// 		// The mask is broadcast from an immediate, so no aligned in-memory table or pointer cast is needed.
+// 		constexpr VectorF16<Len, Packing, Repeats> operator-() const {
+//             if constexpr(std::is_same_v<VectorType, __m128h>) {
+//                 const __m128i sign_mask = _mm_set1_epi16(static_cast<short>(0x8000));
+//                 return VectorF16<Len, Packing, Repeats>(_mm_castsi128_ph(_mm_xor_si128(sign_mask, _mm_castph_si128(v))));
+//             } else if constexpr(std::is_same_v<VectorType, __m256h>) {
+//                 const __m256i sign_mask = _mm256_set1_epi16(static_cast<short>(0x8000));
+//                 return VectorF16<Len, Packing, Repeats>(_mm256_castsi256_ph(_mm256_xor_si256(sign_mask, _mm256_castph_si256(v))));
+//             } else {
+//                 const __m512i sign_mask = _mm512_set1_epi16(static_cast<short>(0x8000));
+//                 return VectorF16<Len, Packing, Repeats>(_mm512_castsi512_ph(_mm512_xor_si512(sign_mask, _mm512_castph_si512(v))));
+//             }
+// 		}
+
+// 		constexpr bool operator==(VectorF16<Len, Packing, Repeats> b) const {
+//             if constexpr(std::is_same_v<VectorType, __m128h>) {
+//                 return _mm_cmp_ph_mask(v, b.v, _CMP_EQ_OQ) == 255;
+//             } else if constexpr(std::is_same_v<VectorType, __m256h>) {
+//                 return _mm256_cmp_ph_mask(v, b.v, _CMP_EQ_OQ) == 65535;
+//             } else {
+//                return _mm512_cmp_ph_mask(v, b.v, _CMP_EQ_OQ) == 4294967295;
+//             }
+// 		}
+
+//         // Compares this register against an unpacked Vector and returns true when any lane differs.
+//         // b holds a plain element array, not a SIMD register, so it is loaded into a VectorF16 first
+//         // and the result of operator== is negated. The load only compiles for Vector<_Float16, ...>.
+//         template <typename BT, std::uint32_t Blen, std::uint32_t BAlignment>
+// 		constexpr bool operator!=(Vector<BT, Blen, BAlignment> b) const {
+//             return !(*this == VectorF16<Len, Packing, Repeats>(&b));
+// 		}
+
+// 		// Scales this vector in place to unit length using the approximate reciprocal square root
+// 		// of its squared length. Only the 128-bit register case is implemented.
+// 		constexpr void Normalize() {
+//             if constexpr(std::is_same_v<VectorType, __m128h>) {
+//                 // Broadcast |v|^2 to every lane, then multiply by 1/sqrt(|v|^2).
+//                 const __m128h lengthSq = _mm_set1_ph(LengthSq());
+//                 const __m128h invLength = _mm_rsqrt_ph(lengthSq);
+//                 v = _mm_mul_ph(v, invLength);
+//             } else if constexpr(std::is_same_v<VectorType, __m256h>) {
+//                 // TODO: 256-bit normalization is not implemented; v is left unchanged.
+//             } else {
+//                 // TODO: 512-bit normalization is not implemented; v is left unchanged.
+//             }
+// 		}
+
+
+// 		constexpr _Float16 Length() const {
+// 			_Float16 Result = LengthSq();
+// 			return std::sqrtf(Result);
+// 		}
+
+// 		constexpr _Float16 LengthSq() const {
+//     		return Dot(*this, *this);
+// 		}
+
+//     //     template <typename AT, std::uint32_t Alen, std::uint32_t AAlignment, typename BT, std::uint32_t Blen, std::uint32_t BAlignment>
+//     //     constexpr static Vector<T, Len, Aligment> Cross(Vector<AT, Alen, AAlignment> a, Vector<BT, Blen, BAlignment> b) requires(Len == 3 && Alen >= 3 && Blen >= 3) {
+//     //         return Vector<T, Len, Aligment>(
+//     //         	(a.v[1] * b.v[2]) - (a.v[2] * b.v[1]),
+//     //         	(a.v[2] * b.v[0]) - (a.v[0] * b.v[2]),
+//     //         	(a.v[0] * b.v[1]) - (a.v[1] * b.v[0])
+// 	// 		);
+//     //     }
+
+//     //     template <typename AT, std::uint32_t Alen, std::uint32_t AAlignment>
+//     //     constexpr static Vector<T, Len, Aligment> Normalize(Vector<AT, Alen, AAlignment> a) requires(Len == Alen) {
+//     //         Vector<T, 3, 0> returned;
+//     //         T fLength = a.Length();
+		
+// 	// 		if (fLength > 0) {
+// 	// 			fLength = 1.0f / fLength;
+// 	// 		}
+
+//     //         for(std::uint32_t i = 0; i < Len; i++) {
+// 	// 			returned.v[i] = a.v[i] * fLength;
+// 	// 		}
+//     //         return returned;
+//     //     }
+
+//         constexpr static _Float16 Dot(VectorF16<Len, 1, 1> a, VectorF16<Len, 1, 1> b) {
+//             if constexpr(std::is_same_v<VectorType, __m128h>) {
+//                 __m128h mul = _mm_mul_ph(a.v, b.v);
+//                 return _mm_reduce_add_ph(mul);
+//             } else if constexpr(std::is_same_v<VectorType, __m256h>) {
+//                 __m256h mul = _mm256_mul_ph(a.v, b.v);
+//                 return _mm256_reduce_add_ph(mul);
+//             } else {
+//                 __m512h mul = _mm512_mul_ph(a.v, b.v);
+//                 return _mm512_reduce_add_ph(mul);
+//             }
+// 		}
+
+//         // Computes eight 8-element dot products at once (128-bit registers only).
+//         // Lane 0 of the result is dot(A0, A1), lane 1 is dot(B0, B1), ... lane 7 is dot(H0, H1).
+//         // The products are transposed as an 8x8 matrix of 16-bit lanes so that the eight
+//         // partial products of every pair end up in the same lane of eight rows, which are then summed.
+//         constexpr static VectorF16<8, 1, Repeats> Dot(
+//                 VectorF16<8, 1, Repeats> A0, VectorF16<8, 1, Repeats> A1,
+//                 VectorF16<8, 1, Repeats> B0, VectorF16<8, 1, Repeats> B1,
+//                 VectorF16<8, 1, Repeats> C0, VectorF16<8, 1, Repeats> C1,
+//                 VectorF16<8, 1, Repeats> D0, VectorF16<8, 1, Repeats> D1,
+//                 VectorF16<8, 1, Repeats> E0, VectorF16<8, 1, Repeats> E1,
+//                 VectorF16<8, 1, Repeats> F0, VectorF16<8, 1, Repeats> F1,
+//                 VectorF16<8, 1, Repeats> G0, VectorF16<8, 1, Repeats> G1,
+//                 VectorF16<8, 1, Repeats> H0, VectorF16<8, 1, Repeats> H1
+//             ) {
+//             if constexpr(std::is_same_v<VectorType, __m128h>) {
+//                 // Lane-wise products of each pair, reinterpreted as raw 16-bit lanes for the shuffles below.
+//                 const __m128i mulA = _mm_castph_si128(_mm_mul_ph(A0.v, A1.v));
+//                 const __m128i mulB = _mm_castph_si128(_mm_mul_ph(B0.v, B1.v));
+//                 const __m128i mulC = _mm_castph_si128(_mm_mul_ph(C0.v, C1.v));
+//                 const __m128i mulD = _mm_castph_si128(_mm_mul_ph(D0.v, D1.v));
+//                 const __m128i mulE = _mm_castph_si128(_mm_mul_ph(E0.v, E1.v));
+//                 const __m128i mulF = _mm_castph_si128(_mm_mul_ph(F0.v, F1.v));
+//                 const __m128i mulG = _mm_castph_si128(_mm_mul_ph(G0.v, G1.v));
+//                 const __m128i mulH = _mm_castph_si128(_mm_mul_ph(H0.v, H1.v));
+
+//                 // Step 1: interleave 16-bit lanes of neighbouring products.
+//                 const __m128i abLo = _mm_unpacklo_epi16(mulA, mulB); // A1 B1 A2 B2 A3 B3 A4 B4
+//                 const __m128i abHi = _mm_unpackhi_epi16(mulA, mulB); // A5 B5 A6 B6 A7 B7 A8 B8
+//                 const __m128i cdLo = _mm_unpacklo_epi16(mulC, mulD); // C1 D1 C2 D2 C3 D3 C4 D4
+//                 const __m128i cdHi = _mm_unpackhi_epi16(mulC, mulD); // C5 D5 C6 D6 C7 D7 C8 D8
+//                 const __m128i efLo = _mm_unpacklo_epi16(mulE, mulF); // E1 F1 E2 F2 E3 F3 E4 F4
+//                 const __m128i efHi = _mm_unpackhi_epi16(mulE, mulF); // E5 F5 E6 F6 E7 F7 E8 F8
+//                 const __m128i ghLo = _mm_unpacklo_epi16(mulG, mulH); // G1 H1 G2 H2 G3 H3 G4 H4
+//                 const __m128i ghHi = _mm_unpackhi_epi16(mulG, mulH); // G5 H5 G6 H6 G7 H7 G8 H8
+
+//                 // Step 2: interleave 32-bit pairs to gather four products per element index.
+//                 const __m128i abcd12 = _mm_unpacklo_epi32(abLo, cdLo); // A1 B1 C1 D1 A2 B2 C2 D2
+//                 const __m128i abcd34 = _mm_unpackhi_epi32(abLo, cdLo); // A3 B3 C3 D3 A4 B4 C4 D4
+//                 const __m128i abcd56 = _mm_unpacklo_epi32(abHi, cdHi); // A5 B5 C5 D5 A6 B6 C6 D6
+//                 const __m128i abcd78 = _mm_unpackhi_epi32(abHi, cdHi); // A7 B7 C7 D7 A8 B8 C8 D8
+//                 const __m128i efgh12 = _mm_unpacklo_epi32(efLo, ghLo); // E1 F1 G1 H1 E2 F2 G2 H2
+//                 const __m128i efgh34 = _mm_unpackhi_epi32(efLo, ghLo); // E3 F3 G3 H3 E4 F4 G4 H4
+//                 const __m128i efgh56 = _mm_unpacklo_epi32(efHi, ghHi); // E5 F5 G5 H5 E6 F6 G6 H6
+//                 const __m128i efgh78 = _mm_unpackhi_epi32(efHi, ghHi); // E7 F7 G7 H7 E8 F8 G8 H8
+
+//                 // Step 3: interleave 64-bit halves to finish the transpose.
+//                 const __m128h row1 = _mm_castsi128_ph(_mm_unpacklo_epi64(abcd12, efgh12)); // A1 B1 C1 D1 E1 F1 G1 H1
+//                 const __m128h row2 = _mm_castsi128_ph(_mm_unpackhi_epi64(abcd12, efgh12)); // A2 B2 C2 D2 E2 F2 G2 H2
+//                 const __m128h row3 = _mm_castsi128_ph(_mm_unpacklo_epi64(abcd34, efgh34)); // A3 B3 C3 D3 E3 F3 G3 H3
+//                 const __m128h row4 = _mm_castsi128_ph(_mm_unpackhi_epi64(abcd34, efgh34)); // A4 B4 C4 D4 E4 F4 G4 H4
+//                 const __m128h row5 = _mm_castsi128_ph(_mm_unpacklo_epi64(abcd56, efgh56)); // A5 B5 C5 D5 E5 F5 G5 H5
+//                 const __m128h row6 = _mm_castsi128_ph(_mm_unpackhi_epi64(abcd56, efgh56)); // A6 B6 C6 D6 E6 F6 G6 H6
+//                 const __m128h row7 = _mm_castsi128_ph(_mm_unpacklo_epi64(abcd78, efgh78)); // A7 B7 C7 D7 E7 F7 G7 H7
+//                 const __m128h row8 = _mm_castsi128_ph(_mm_unpackhi_epi64(abcd78, efgh78)); // A8 B8 C8 D8 E8 F8 G8 H8
+
+//                 // Sum the rows in order; each lane accumulates the eight products of one pair.
+//                 __m128h sum = _mm_add_ph(row1, row2);
+//                 sum = _mm_add_ph(sum, row3);
+//                 sum = _mm_add_ph(sum, row4);
+//                 sum = _mm_add_ph(sum, row5);
+//                 sum = _mm_add_ph(sum, row6);
+//                 sum = _mm_add_ph(sum, row7);
+//                 sum = _mm_add_ph(sum, row8);
+
+//                 return sum;
+//             } else if constexpr(std::is_same_v<VectorType, __m256h>) {
+//                 // TODO: 256-bit variant is not implemented; this branch currently returns nothing.
+//             } else {
+//                 // TODO: 512-bit variant is not implemented; this branch currently returns nothing.
+//             }
+// 		}
+
+//     //     template <typename AT, std::uint32_t AAlignment, typename BT, std::uint32_t BAlignment>
+//     //     constexpr static Vector<T, 3, Aligment> Rotate(Vector<AT, 3, AAlignment> v, Vector<BT, 4, BAlignment> q) requires(Len == 3) {
+//     //         Vector<T, 3, 0> qv(q.x, q.y, q.z);
+//     //         Vector<T, 3, 0> t = Vector<T, 3, Aligment>::Cross(qv, v) * T(2);
+//     //         return v + t * q.w + Vector<T, 3, Aligment>::Cross(qv, t);
+// 	// 	}
+
+//     //     template <typename AT, std::uint32_t AAlignment, typename BT, std::uint32_t BAlignment, typename PT, std::uint32_t PAlignment>
+//     //     constexpr static Vector<T, 3, Aligment> RotatePivot(Vector<AT, 3, AAlignment> v, Vector<BT, 4, BAlignment> q, Vector<PT, 3, PAlignment> pivot) requires(Len == 3) {
+//     //         Vector<T, 3, 0> translated = v - pivot;
+//     //         Vector<T, 3, 0> qv(q.x, q.y, q.z);
+//     //         Vector<T, 3, 0> t = Cross(qv, translated) * T(2);
+//     //         Vector<T, 3, 0> rotated = translated + t * q.w +Cross(qv, t);
+//     //         return rotated + pivot;
+//     //     }
+
+//     //     template <typename AT, std::uint32_t AAlignment, typename BT, std::uint32_t BAlignment, typename CT, std::uint32_t CAlignment>
+//     //     constexpr static Vector<T, 4, Aligment> QuanternionFromBasis(Vector<AT, 3, AAlignment> right, Vector<BT, 3, BAlignment> up, Vector<CT, 3, CAlignment> forward) requires(Len == 4) {
+//     //         T m00 = right.x;
+//     //         T m01 = up.x;
+//     //         T m02 = forward.x;
+
+//     //         T m10 = right.y;
+//     //         T m11 = up.y;
+//     //         T m12 = forward.y;
+
+//     //         T m20 = right.z;
+//     //         T m21 = up.z;
+//     //         T m22 = forward.z;
+
+//     //         T trace = m00 + m11 + m22;
+
+//     //         Vector<T, 4, Aligment> q;
+
+//     //         if (trace > std::numeric_limits<T>::epsilon()) {
+//     //             T s = std::sqrt(trace + T(1)) * T(2);
+//     //             q.w = T(0.25) * s;
+//     //             q.x = (m21 - m12) / s;
+//     //             q.y = (m02 - m20) / s;
+//     //             q.z = (m10 - m01) / s;
+//     //         }
+//     //         else if ((m00 > m11) && (m00 > m22)) {
+//     //             T s = std::sqrt(T(1) + m00 - m11 - m22) * T(2);
+//     //             q.w = (m21 - m12) / s;
+//     //             q.x = T(0.25) * s;
+//     //             q.y = (m01 + m10) / s;
+//     //             q.z = (m02 + m20) / s;
+//     //         }
+//     //         else if (m11 > m22) {
+//     //             T s = std::sqrt(T(1) + m11 - m00 - m22) * T(2);
+//     //             q.w = (m02 - m20) / s;
+//     //             q.x = (m01 + m10) / s;
+//     //             q.y = T(0.25) * s;
+//     //             q.z = (m12 + m21) / s;
+//     //         }
+//     //         else {
+//     //             T s = std::sqrt(T(1) + m22 - m00 - m11) * T(2);
+//     //             q.w = (m10 - m01) / s;
+//     //             q.x = (m02 + m20) / s;
+//     //             q.y = (m12 + m21) / s;
+//     //             q.z = T(0.25) * s;
+//     //         }
+
+//     //         q.Normalize();
+//     //         return q;
+// 	// 	}
+
+//     //     constexpr static Vector<T, 4, Aligment> QuanternionFromEuler(T roll, T pitch, T yaw) {
+//     //         T cr = std::cos(roll * 0.5);
+//     //         T sr = std::sin(roll * 0.5);
+//     //         T cp = std::cos(pitch * 0.5);
+//     //         T sp = std::sin(pitch * 0.5);
+//     //         T cy = std::cos(yaw * 0.5);
+//     //         T sy = std::sin(yaw * 0.5);
+
+//     //         return Vector<T, 4, Aligment>(
+//     //             sr * cp * cy - cr * sp * sy, 
+//     //             cr * sp * cy + sr * cp * sy, 
+//     //             cr * cp * sy - sr * sp * cy,
+//     //             cr * cp * cy + sr * sp * sy
+//     //         );
+//     //     }
+// 	};
+// }
+
+
+// export template <std::uint32_t Len, std::uint32_t Packing, std::uint32_t Repeats>
+// struct std::formatter<Crafter::VectorF16<Len, Packing, Repeats>> : std::formatter<std::string> {
+//     auto format(const Crafter::VectorF16<Len, Packing, Repeats>& obj, format_context& ctx) const {
+//         Crafter::Vector<_Float16, Len * Packing * Repeats, 0> vec = obj.template Store<Len * Packing * Repeats, 0>();
+//         std::string out;
+//         for(std::uint32_t i = 0; i < Repeats; i++) {
+//             out += "{";
+//             for(std::uint32_t i2 = 0; i2 < Packing; i2++) {
+//                 out += "{";
+//                 for(std::uint32_t i3 = 0; i3 < Len; i3++) {
+//                     out += std::format("{}", static_cast<float>(vec.v[i * Packing * Len + i2 * Len + i3]));
+//                     if (i3 + 1 < Len) out += ",";
+//                 }
+//                 out += "}";
+//             }
+//             out += "}";
+//         }
+//         return std::formatter<std::string>::format(out, ctx);
+//     }
+// };
--- a/interfaces/Crafter.Math.cppm
+++ b/interfaces/Crafter.Math.cppm
@ -22,4 +22,5 @@ export module Crafter.Math;
 export import :Basic;
 export import :Vector;
 export import :MatrixRowMajor;
-export import :Intersection;
+export import :Intersection;
+//export import :VectorF16;
--- a/interfaces/main.cpp
+++ b/interfaces/main.cpp
@ -5,9 +5,46 @@ import std;
 using namespace Crafter;

 int main() {
-    Vector<float, 3, 0> sphereCenter(-10,0,0);
-    float sphereRadius = 10;
-    Vector<float, 3, 0> boxSize(0,0,0); 
-    MatrixRowMajor<float, 4, 3, 1> boxMatrix = MatrixRowMajor<float, 4, 3, 1> ::Translation(10,0,0);
-    std::cout << IntersectionTestSphereOrientedBox(sphereCenter, sphereRadius, boxSize, boxMatrix) << std::endl;
+    // std::random_device rd;
+    // std::mt19937 gen(rd());
+    // std::uniform_real_distribution<float> dist(0, 100);
+
+    // Vector<_Float16, 8, 8> vA;
+    // for(std::uint32_t i = 0; i < 8; i++) {
+    //     vA.v[i] = dist(gen);
+    // }
+    // VectorF16<4, 2, 1> vfA(&vA);
+
+    // Vector<_Float16, 16, 16> vB;
+    // for(std::uint32_t i = 0; i < 16; i++) {
+    //     vB.v[i] = dist(gen);
+    // }
+    // VectorF16<4, 2, 2> vfB(&vB);
+
+    // VectorF16<4, 2, 1> vfC = vfA + vfB;
+    // auto start = std::chrono::high_resolution_clock::now();
+    // for(std::uint32_t i = 0; i < 90000000; i++) {
+    //     vfC = vfC + vfB;
+    // }
+    // auto end = std::chrono::high_resolution_clock::now();
+    // std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end-start) << std::endl;
+    // std::println("{}", vfC);
+
+
+   
+    // Vector<_Float16, 8, 8> vA;
+    // for(std::uint32_t i = 0; i < 8; i++) {
+    //     vA.v[i] = i;
+    // }
+    // VectorF16<8, 1, 1> vfA(&vA);
+    // VectorF16<8, 1, 1> dot = VectorF16<8, 1, 1>::Dot(vfA, vfA, vfA, vfA, vfA, vfA, vfA, vfA, vfA, vfA, vfA, vfA, vfA, vfA, vfA, vfA);
+    // std::println("{}", dot);
+
+    // Vector<float, 8, 8> vB;
+    // for(std::uint32_t i = 0; i < 8; i++) {
+    //     vB.v[i] = i;
+    // }
+    // float test = Vector<float, 8, 8>::Dot(vA, vA);
+
+    // std::println("{}", test);
 }
--- a/project.json
+++ b/project.json
@ -3,7 +3,13 @@
    "configurations": [
        {
            "name": "base",
-            "interfaces": ["interfaces/Crafter.Math-Vector", "interfaces/Crafter.Math-Basic", "interfaces/Crafter.Math-MatrixRowMajor", "interfaces/Crafter.Math", "interfaces/Crafter.Math-Intersection"],
+            "interfaces": [
+                "interfaces/Crafter.Math-Vector", 
+                "interfaces/Crafter.Math-Basic", 
+                "interfaces/Crafter.Math-MatrixRowMajor",
+                "interfaces/Crafter.Math", 
+                "interfaces/Crafter.Math-Intersection"
+            ],
            "implementations": []
        },
        {