From a6bf3ca572d872efecc88534c4e87b8077a886d2 Mon Sep 17 00:00:00 2001 From: Jorijn van der Graaf Date: Mon, 23 Mar 2026 21:42:40 +0100 Subject: [PATCH 1/2] fixed f16 shuffling --- interfaces/Crafter.Math-Basic.cppm | 8 +- interfaces/Crafter.Math-VectorF16.cppm | 2411 ++++++++++++------------ interfaces/main.cpp | 19 +- project.json | 3 +- 4 files changed, 1197 insertions(+), 1244 deletions(-) diff --git a/interfaces/Crafter.Math-Basic.cppm b/interfaces/Crafter.Math-Basic.cppm index a86234a..8757ee2 100755 --- a/interfaces/Crafter.Math-Basic.cppm +++ b/interfaces/Crafter.Math-Basic.cppm @@ -30,10 +30,10 @@ namespace Crafter { } #ifdef __AVX512FP16__ - export template - using VectorF16L = VectorF16; + export template + using VectorF16L = VectorF16; #else - export template - using VectorF16L = VectorF32; + export template + using VectorF16L = VectorF32; #endif } \ No newline at end of file diff --git a/interfaces/Crafter.Math-VectorF16.cppm b/interfaces/Crafter.Math-VectorF16.cppm index ec85f4d..7e0c649 100755 --- a/interfaces/Crafter.Math-VectorF16.cppm +++ b/interfaces/Crafter.Math-VectorF16.cppm @@ -26,31 +26,24 @@ import :Vector; #ifdef __AVX512FP16__ namespace Crafter { - export template + export template struct VectorF16 { - static constexpr std::uint32_t MaxSize = 32; - static constexpr std::uint32_t MaxElement = 8; - static consteval std::uint32_t GetAlignment() { - if constexpr (Len * Packing <= 8) { + static consteval std::uint8_t GetAlingment() { + if(Len * Packing <= 8) { return 8; - } - if constexpr (Len * Packing <= 16) { + } else if(Len * Packing <= 16) { return 16; - } - if constexpr (Len * Packing <= 32) { + } else if(Len * Packing <= 32) { return 32; } - static_assert(Len * Packing <= 32, "Len * Packing is larger than supported max size of 32"); - static_assert(Len * Packing <= 8, "Len * Packing is larger than supported packed size of 8"); - static_assert(Len * Packing * Repeats <= 32, "Len * Packing * Repeats is larger than supported max of 32"); - 
} - static consteval std::uint32_t GetTotalSize() { - return GetAlignment() * Repeats; } + static constexpr std::uint32_t MaxSize = 32; + static constexpr std::uint8_t Alignment = GetAlingment(); + static_assert(Len * Packing <= MaxSize, "Len * Packing exceeds MaxSize"); using VectorType = std::conditional_t< - (GetTotalSize() == 32), __m512h, - std::conditional_t<(GetTotalSize() == 16), __m256h, __m128h> + (Len * Packing > 16), __m512h, + std::conditional_t<(Len * Packing > 8), __m256h, __m128h> >; VectorType v; @@ -88,74 +81,73 @@ namespace Crafter { } } - template - constexpr Vector<_Float16, VLen, VAlign> Store() const { - Vector<_Float16, VLen, VAlign> returnVec; + constexpr Vector<_Float16, Len*Packing, Alignment> Store() const { + Vector<_Float16, Len*Packing, Alignment> returnVec; Store(returnVec.v); return returnVec; } - template - constexpr operator VectorF16() const { - if constexpr(std::is_same_v && std::is_same_v::VectorType, __m128h>) { - return VectorF16(_mm256_castph256_ph128(v)); - } else if constexpr(std::is_same_v && std::is_same_v::VectorType, __m128h>) { - return VectorF16(_mm512_castph512_ph128(v)); - } else if constexpr(std::is_same_v && std::is_same_v::VectorType, __m256h>) { - return VectorF16(_mm512_castph512_ph256(v)); - } else if constexpr(std::is_same_v && std::is_same_v::VectorType, __m256h>) { - return VectorF16(_mm256_castph128_ph256(v)); - } else if constexpr(std::is_same_v && std::is_same_v::VectorType, __m512h>) { - return VectorF16(_mm512_castph128_ph512(v)); - } else if constexpr(std::is_same_v && std::is_same_v::VectorType, __m512h>) { - return VectorF16(_mm512_castph256_ph512(v)); + template + constexpr operator VectorF16() const { + if constexpr(std::is_same_v && std::is_same_v::VectorType, __m128h>) { + return VectorF16(_mm256_castph256_ph128(v)); + } else if constexpr(std::is_same_v && std::is_same_v::VectorType, __m128h>) { + return VectorF16(_mm512_castph512_ph128(v)); + } else if constexpr(std::is_same_v && 
std::is_same_v::VectorType, __m256h>) { + return VectorF16(_mm512_castph512_ph256(v)); + } else if constexpr(std::is_same_v && std::is_same_v::VectorType, __m256h>) { + return VectorF16(_mm256_castph128_ph256(v)); + } else if constexpr(std::is_same_v && std::is_same_v::VectorType, __m512h>) { + return VectorF16(_mm512_castph128_ph512(v)); + } else if constexpr(std::is_same_v && std::is_same_v::VectorType, __m512h>) { + return VectorF16(_mm512_castph256_ph512(v)); } else { - return VectorF16(v); + return VectorF16(v); } } - constexpr VectorF16 operator+(VectorF16 b) const { + constexpr VectorF16 operator+(VectorF16 b) const { if constexpr(std::is_same_v) { - return VectorF16(_mm_add_ph(v, b.v)); + return VectorF16(_mm_add_ph(v, b.v)); } else if constexpr(std::is_same_v) { - return VectorF16(_mm256_add_ph(v, b.v)); + return VectorF16(_mm256_add_ph(v, b.v)); } else { - return VectorF16(_mm512_add_ph(v, b.v)); + return VectorF16(_mm512_add_ph(v, b.v)); } } - constexpr VectorF16 operator-(VectorF16 b) const { + constexpr VectorF16 operator-(VectorF16 b) const { if constexpr(std::is_same_v) { - return VectorF16(_mm_sub_ph(v, b.v)); + return VectorF16(_mm_sub_ph(v, b.v)); } else if constexpr(std::is_same_v) { - return VectorF16(_mm256_sub_ph(v, b.v)); + return VectorF16(_mm256_sub_ph(v, b.v)); } else { - return VectorF16(_mm512_sub_ph(v, b.v)); + return VectorF16(_mm512_sub_ph(v, b.v)); } } - constexpr VectorF16 operator*(VectorF16 b) const { + constexpr VectorF16 operator*(VectorF16 b) const { if constexpr(std::is_same_v) { - return VectorF16(_mm_mul_ph(v, b.v)); + return VectorF16(_mm_mul_ph(v, b.v)); } else if constexpr(std::is_same_v) { - return VectorF16(_mm256_mul_ph(v, b.v)); + return VectorF16(_mm256_mul_ph(v, b.v)); } else { - return VectorF16(_mm512_mul_ph(v, b.v)); + return VectorF16(_mm512_mul_ph(v, b.v)); } } - constexpr VectorF16 operator/(VectorF16 b) const { + constexpr VectorF16 operator/(VectorF16 b) const { if constexpr(std::is_same_v) { - return 
VectorF16(_mm_div_ph(v, b.v)); + return VectorF16(_mm_div_ph(v, b.v)); } else if constexpr(std::is_same_v) { - return VectorF16(_mm256_div_ph(v, b.v)); + return VectorF16(_mm256_div_ph(v, b.v)); } else { - return VectorF16(_mm512_div_ph(v, b.v)); + return VectorF16(_mm512_div_ph(v, b.v)); } } - constexpr void operator+=(VectorF16 b) const { + constexpr void operator+=(VectorF16 b) const { if constexpr(std::is_same_v) { v = _mm_add_ph(v, b.v); } else if constexpr(std::is_same_v) { @@ -165,7 +157,7 @@ namespace Crafter { } } - constexpr void operator-=(VectorF16 b) const { + constexpr void operator-=(VectorF16 b) const { if constexpr(std::is_same_v) { v = _mm_sub_ph(v, b.v); } else if constexpr(std::is_same_v) { @@ -175,7 +167,7 @@ namespace Crafter { } } - constexpr void operator*=(VectorF16 b) const { + constexpr void operator*=(VectorF16 b) const { if constexpr(std::is_same_v) { v = _mm_mul_ph(v, b.v); } else if constexpr(std::is_same_v) { @@ -185,7 +177,7 @@ namespace Crafter { } } - constexpr void operator/=(VectorF16 b) const { + constexpr void operator/=(VectorF16 b) const { if constexpr(std::is_same_v) { v = _mm_div_ph(v, b.v); } else if constexpr(std::is_same_v) { @@ -195,63 +187,63 @@ namespace Crafter { } } - constexpr VectorF16 operator+(_Float16 b) const { - VectorF16 vB(b); + constexpr VectorF16 operator+(_Float16 b) const { + VectorF16 vB(b); return this + vB; } - constexpr VectorF16 operator-(_Float16 b) const { - VectorF16 vB(b); + constexpr VectorF16 operator-(_Float16 b) const { + VectorF16 vB(b); return this - vB; } - constexpr VectorF16 operator*(_Float16 b) const { - VectorF16 vB(b); + constexpr VectorF16 operator*(_Float16 b) const { + VectorF16 vB(b); return this * vB; } - constexpr VectorF16 operator/(_Float16 b) const { - VectorF16 vB(b); + constexpr VectorF16 operator/(_Float16 b) const { + VectorF16 vB(b); return this / vB; } constexpr void operator+=(_Float16 b) const { - VectorF16 vB(b); + VectorF16 vB(b); this += vB; } constexpr void 
operator-=(_Float16 b) const { - VectorF16 vB(b); + VectorF16 vB(b); this -= vB; } constexpr void operator*=(_Float16 b) const { - VectorF16 vB(b); + VectorF16 vB(b); this *= vB; } constexpr void operator/=(_Float16 b) const { - VectorF16 vB(b); + VectorF16 vB(b); this /= vB; } - constexpr VectorF16 operator-(){ + constexpr VectorF16 operator-(){ if constexpr(std::is_same_v) { alignas(16) constexpr std::uint64_t mask[] {0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000}; __m128i sign_mask = _mm_load_si128(reinterpret_cast(mask)); - return VectorF16(_mm_castsi128_ph(_mm_xor_si128(sign_mask, _mm_castph_si128(v)))); + return VectorF16(_mm_castsi128_ph(_mm_xor_si128(sign_mask, _mm_castph_si128(v)))); } else if constexpr(std::is_same_v) { alignas(16) constexpr std::uint64_t mask[] {0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000}; __m256i sign_mask = _mm256_load_si256(reinterpret_cast(mask)); - return VectorF16(_mm256_castsi256_ph(_mm256_xor_si256(sign_mask, _mm256_castph_si256(v)))); + return VectorF16(_mm256_castsi256_ph(_mm256_xor_si256(sign_mask, _mm256_castph_si256(v)))); } else { alignas(16) constexpr std::uint64_t mask[] {0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000}; 
__m512i sign_mask = _mm512_load_si512(reinterpret_cast(mask)); - return VectorF16(_mm512_castsi512_ph(_mm512_xor_si512(sign_mask, _mm512_castph_si512(v)))); + return VectorF16(_mm512_castsi512_ph(_mm512_xor_si512(sign_mask, _mm512_castph_si512(v)))); } } - constexpr bool operator==(VectorF16 b) const { + constexpr bool operator==(VectorF16 b) const { if constexpr(std::is_same_v) { return _mm_cmp_ph_mask(v, b.v, _CMP_EQ_OQ) == 255; } else if constexpr(std::is_same_v) { @@ -261,7 +253,7 @@ namespace Crafter { } } - constexpr bool operator!=(VectorF16 b) const { + constexpr bool operator!=(VectorF16 b) const { if constexpr(std::is_same_v) { return _mm_cmp_ph_mask(v, b.v, _CMP_EQ_OQ) != 255; } else if constexpr(std::is_same_v) { @@ -299,186 +291,178 @@ namespace Crafter { return Dot(*this, *this); } - constexpr VectorF16 Cos() requires(Len == 3) { + constexpr VectorF16 Cos() { if constexpr(std::is_same_v) { - return VectorF16(_mm_cos_ph(v)); + return VectorF16(_mm_cos_ph(v)); } else if constexpr(std::is_same_v) { - return VectorF16(_mm256_cos_ph(v)); + return VectorF16(_mm256_cos_ph(v)); } else { - return VectorF16(_mm512_cos_ph(v)); + return VectorF16(_mm512_cos_ph(v)); } } - constexpr VectorF16 Sin() requires(Len == 3) { + constexpr VectorF16 Sin() { if constexpr(std::is_same_v) { - return VectorF16(_mm_sin_ph(v)); + return VectorF16(_mm_sin_ph(v)); } else if constexpr(std::is_same_v) { - return VectorF16(_mm256_sin_ph(v)); + return VectorF16(_mm256_sin_ph(v)); } else { - return VectorF16(_mm512_sin_ph(v)); + return VectorF16(_mm512_sin_ph(v)); } } - template - constexpr VectorF16 Shuffle() { - if constexpr(A == B-1 && C == D-1 && E == F-1 && G == H-1) { - constexpr std::uint32_t val = - (A & 0x3) | - ((B & 0x3) << 2) | - ((C & 0x3) << 4) | - ((D & 0x3) << 6) | - ((E & 0x3) << 8) | - ((F & 0x3) << 10) | - ((G & 0x3) << 12) | - ((H & 0x3) << 14); - if constexpr(std::is_same_v) { - return VectorF16(_mm_castsi128_ph(_mm_shuffle_epi32(_mm_castph_si128(v), val))); - } 
else if constexpr(std::is_same_v) { - return VectorF16(_mm256_castsi256_ph(_mm256_shuffle_epi32(_mm256_castph_si256(v), val))); - } else { - return VectorF16(_mm512_castsi512_ph(_mm512_shuffle_epi32(_mm_512castph_si512(v), val))); + template ShuffleValues> + static consteval bool CheckEpi32Shuffle() { + for(std::uint8_t i = 1; i < Len; i+=2) { + if(ShuffleValues[i-1] != ShuffleValues[i] - 1) { + return false; } - } else { + } + for(std::uint8_t i = 0; i < Len; i++) { + for(std::uint8_t i2 = 0; i2 < Len; i2 += 8) { + if(ShuffleValues[i] != ShuffleValues[i2]) { + return false; + } + } + } + return true; + } + + template ShuffleValues> + static consteval bool GetShuffleMaskEpi32() { + std::uint8_t mask = 0; + for(std::uint8_t i = 0; i < std::min(Len, std::uint32_t(8)); i+=2) { + mask = mask | (ShuffleValues[i] & 0b11) << i; + } + return mask; + } + + template ShuffleValues> + static consteval std::array GetShuffleMaskEpi8() requires (std::is_same_v){ + std::array shuffleMask {{0}}; + for(std::uint8_t i2 = 0; i2 < Packing; i2++) { + for(std::uint8_t i = 0; i < Len; i++) { + shuffleMask[(i2*Len*2)+(i*2)] = ShuffleValues[i]*2+(i2*Len*2); + shuffleMask[(i2*Len*2)+(i*2+1)] = ShuffleValues[i]*2+1+(i2*Len*2); + } + } + return shuffleMask; + } + + template ShuffleValues> + static consteval std::array GetShuffleMaskEpi8() requires (std::is_same_v){ + std::array shuffleMask {{0}}; + for(std::uint8_t i2 = 0; i2 < Packing; i2++) { + for(std::uint8_t i = 0; i < Len; i++) { + shuffleMask[(i2*Len*2)+(i*2)] = ShuffleValues[i]*2+(i2*Len*2); + shuffleMask[(i2*Len*2)+(i*2+1)] = ShuffleValues[i]*2+1+(i2*Len*2); + } + } + return shuffleMask; + } + + template ShuffleValues> + static consteval std::array GetShuffleMaskEpi8() requires (std::is_same_v){ + std::array shuffleMask {{0}}; + for(std::uint8_t i2 = 0; i2 < Packing; i2++) { + for(std::uint8_t i = 0; i < Len; i++) { + shuffleMask[(i2*Len*2)+(i*2)] = ShuffleValues[i]*2+(i2*Len*2); + shuffleMask[(i2*Len*2)+(i*2+1)] = 
ShuffleValues[i]*2+1+(i2*Len*2); + } + } + return shuffleMask; + } + + template ShuffleValues> + constexpr VectorF16 Shuffle() { + if constexpr(VectorF16::CheckEpi32Shuffle()) { if constexpr(std::is_same_v) { - constexpr std::uint8_t shuffleMask[] { - A,A,B,B,C,C,D,D,E,E,F,F,G,G,H,H - }; - __m128h shuffleVec = _mm_loadu_epi8(shuffleMask); - return VectorF16(_mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(v), shuffleVec))); + return VectorF16(_mm_castsi128_ph(_mm_shuffle_epi32(_mm_castph_si128(v), GetShuffleMaskEpi32()))); } else if constexpr(std::is_same_v) { - constexpr std::uint8_t shuffleMask[] { - A,A,B,B,C,C,D,D,E,E,F,F,G,G,H,H, - A+16,A+16,B+16,B+16,C+16,C+16,D+16,D+16,E+16,E+16,F+16,F+16,G+16,G+16,H+16,H+16, - }; - __m256h shuffleVec = _mm256_loadu_epi8(shuffleMask); - return VectorF16(_mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(v), shuffleVec))); + return VectorF16(_mm256_castsi256_ph(_mm256_shuffle_epi32(_mm256_castph_si256(v), GetShuffleMaskEpi32()))); } else { - constexpr std::uint8_t shuffleMask[] { - A,A,B,B,C,C,D,D,E,E,F,F,G,G,H,H, - A+16,A+16,B+16,B+16,C+16,C+16,D+16,D+16,E+16,E+16,F+16,F+16,G+16,G+16,H+16,H+16, - A+32,A+32,B+32,B+32,C+32,C+32,D+32,D+32,E+32,E+32,F+32,F+32,G+32,G+32,H+32,H+32, - A+48,A+48,B+48,B+48,C+48,C+48,D+48,D+48,E+48,E+48,F+48,F+48,G+48,G+48,H+48,H+48, - }; - __m512h shuffleVec = _mm512_loadu_epi8(shuffleMask); - return VectorF16(_mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(v), shuffleVec))); + return VectorF16(_mm512_castsi512_ph(_mm512_shuffle_epi32(_mm512_castph_si512(v), GetShuffleMaskEpi32()))); } - } - } - - template < - std::uint8_t A0, std::uint8_t B0, std::uint8_t C0, std::uint8_t D0, std::uint8_t E0, std::uint8_t F0, std::uint8_t G0, std::uint8_t H0, - std::uint8_t A1, std::uint8_t B1, std::uint8_t C1, std::uint8_t D1, std::uint8_t E1, std::uint8_t F1, std::uint8_t G1, std::uint8_t H1 - > - constexpr VectorF16 Shuffle() requires(Repeats == 2) { - constexpr std::uint8_t shuffleMask[] { 
- A0,A0,B0,B0,C0,C0,D0,D0,E0,E0,F0,F0,G0,G0,H0,H0, - A1,A1,B1,B1,C1,C1,D1,D1,E1,E1,F1,F1,G1,G1,H1,H1, - }; - __m256h shuffleVec = _mm256_loadu_epi8(shuffleMask); - return VectorF16(_mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(v), shuffleVec))); - } - - template < - std::uint8_t A0, std::uint8_t B0, std::uint8_t C0, std::uint8_t D0, std::uint8_t E0, std::uint8_t F0, std::uint8_t G0, std::uint8_t H0, - std::uint8_t A1, std::uint8_t B1, std::uint8_t C1, std::uint8_t D1, std::uint8_t E1, std::uint8_t F1, std::uint8_t G1, std::uint8_t H1, - std::uint8_t A2, std::uint8_t B2, std::uint8_t C2, std::uint8_t D2, std::uint8_t E2, std::uint8_t F2, std::uint8_t G2, std::uint8_t H2, - std::uint8_t A3, std::uint8_t B3, std::uint8_t C3, std::uint8_t D3, std::uint8_t E3, std::uint8_t F3, std::uint8_t G3, std::uint8_t H3 - > - constexpr VectorF16 Shuffle() requires(Repeats == 4) { - constexpr std::uint8_t shuffleMask[] { - A0,A0,B0,B0,C0,C0,D0,D0,E0,E0,F0,F0,G0,G0,H0,H0, - A1,A1,B1,B1,C1,C1,D1,D1,E1,E1,F1,F1,G1,G1,H1,H1, - A2,A2,B2,B2,C2,C2,D2,D2,E2,E2,F2,F2,G2,G2,H2,H2, - A3,A3,B3,B3,C3,C3,D3,D3,E3,E3,F3,F3,G3,G3,H3,H3, - }; - __m512h shuffleVec = _mm512_loadu_epi8(shuffleMask); - return VectorF16(_mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(v), shuffleVec))); - } - - static constexpr VectorF16 MulitplyAdd(VectorF16 a, VectorF16 b, VectorF16 add) { - if constexpr(std::is_same_v) { - return VectorF16(_mm_fmadd_ph(a, b, add)); - } else if constexpr(std::is_same_v) { - return VectorF16(_mm256_fmadd_ph(a, b, add)); } else { - return VectorF16(_mm512_fmadd_ph(a, b, add)); + if constexpr(std::is_same_v) { + constexpr std::array shuffleMask = GetShuffleMaskEpi8(); + __m128i shuffleVec = _mm_loadu_epi8(shuffleMask.data()); + return VectorF16(_mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(v), shuffleVec))); + } else if constexpr(std::is_same_v) { + constexpr std::array shuffleMask = GetShuffleMaskEpi8(); + __m256i shuffleVec = 
_mm256_loadu_epi8(shuffleMask.data()); + return VectorF16(_mm256_castsi256_ph(_mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castph_si256(v)), _mm512_castsi256_si512(shuffleVec))))); + } else { + constexpr std::array shuffleMask = GetShuffleMaskEpi8(); + __m512i shuffleVec = _mm512_loadu_epi8(shuffleMask.data()); + return VectorF16(_mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(v), shuffleVec))); + } } } - static constexpr VectorF16 MulitplySub(VectorF16 a, VectorF16 b, VectorF16 sub) { + static constexpr VectorF16 MulitplyAdd(VectorF16 a, VectorF16 b, VectorF16 add) { if constexpr(std::is_same_v) { - return VectorF16(_mm_fmsub_ph(a, b, sub)); + return VectorF16(_mm_fmadd_ph(a.v, b.v, add)); } else if constexpr(std::is_same_v) { - return VectorF16(_mm256_fmsub_ph(a, b, sub)); + return VectorF16(_mm256_fmadd_ph(a.v, b.v, add)); } else { - return VectorF16(_mm512_fmsub_ph(a, b, sub)); + return VectorF16(_mm512_fmadd_ph(a.v, b.v, add)); } } - constexpr static VectorF16 Cross(VectorF16 a, VectorF16 b) requires(Len == 3 && Packing == 2) { - if constexpr(Len == 3) { - if constexpr(Repeats == 1) { - constexpr std::uint8_t shuffleMask1[] { - 2,3,4,5,0,1,6,7,11,12,13,14,9,10,15,16 - }; - __m128i shuffleVec1 = _mm_loadu_epi8(shuffleMask1); - __m128h row1 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(a.v), shuffleVec1)); - __m128h row4 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(b.v), shuffleVec1)); + static constexpr VectorF16 MulitplySub(VectorF16 a, VectorF16 b, VectorF16 sub) { + if constexpr(std::is_same_v) { + return VectorF16(_mm_fmsub_ph(a.v, b.v, sub)); + } else if constexpr(std::is_same_v) { + return VectorF16(_mm256_fmsub_ph(a.v, b.v, sub)); + } else { + return VectorF16(_mm512_fmsub_ph(a.v, b.v, sub)); + } + } - constexpr std::uint8_t shuffleMask3[] { - 4,5,0,1,2,3,6,7,13,14,8,9,11,12,15,16 - }; - __m128i shuffleVec3 = _mm_loadu_epi8(shuffleMask3); - __m128h row3 = 
_mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(a.v), shuffleVec3)); - __m128h row2 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(b.v), shuffleVec3)); + constexpr static VectorF16 Cross(VectorF16 a, VectorF16 b) requires(Len == 3) { + if constexpr(std::is_same_v) { + constexpr std::array shuffleMask1 = GetShuffleMaskEpi8<{1,2,0}>(); + __m128i shuffleVec1 = _mm_loadu_epi8(shuffleMask1.data()); + __m128h row1 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(a.v), shuffleVec1)); + __m128h row4 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(b.v), shuffleVec1)); - __m128h result = _mm_mul_ph(row3, row4); - return _mm_fmsub_ph(row1,row2,result); - } - if constexpr(Repeats == 2) { - constexpr std::uint8_t shuffleMask1[] { - 2,3,4,5,0,1,6,7,11,12,13,14,9,10,15,16, - 2,3,4,5,0,1,6,7,11,12,13,14,9,10,15,16 - }; - __m256i shuffleVec1 = _mm256_loadu_epi8(shuffleMask1); - __m256h row1 = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(a.v), shuffleVec1)); - __m256h row4 = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(b.v), shuffleVec1)); + constexpr std::array shuffleMask3 = GetShuffleMaskEpi8<{2,0,1}>(); + __m128i shuffleVec3 = _mm_loadu_epi8(shuffleMask3.data()); + __m128h row3 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(a.v), shuffleVec3)); + __m128h row2 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(b.v), shuffleVec3)); - constexpr std::uint8_t shuffleMask3[] { - 4,5,0,1,2,3,6,7,13,14,8,9,11,12,15,16, - 4,5,0,1,2,3,6,7,13,14,8,9,11,12,15,16 - }; - __m256i shuffleVec3 = _mm256_loadu_epi8(shuffleMask3); - __m256h row3 = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(a.v), shuffleVec3)); - __m256h row2 = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(b.v), shuffleVec3)); + __m128h result = _mm_mul_ph(row3, row4); + return _mm_fmsub_ph(row1,row2,result); + } else if constexpr (std::is_same_v) { + constexpr std::array shuffleMask1 = GetShuffleMaskEpi8<{1,2,0}>(); + __m512i shuffleVec1 = 
_mm512_castsi256_si512(_mm256_loadu_epi8(shuffleMask1.data())); + __m256h row1 = _mm256_castsi256_ph(_mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castph_si256(a.v)), shuffleVec1))); + __m256h row4 = _mm256_castsi256_ph(_mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castph_si256(b.v)), shuffleVec1))); - __m256h result = _mm256_mul_ph(row3, row4); - return _mm256_fmsub_ph(row1,row2,result); - } - if constexpr(Repeats == 4) { - constexpr std::uint8_t shuffleMask1[] { - 2,3,4,5,0,1,6,7,11,12,13,14,9,10,15,16, - 2,3,4,5,0,1,6,7,11,12,13,14,9,10,15,16, - 2,3,4,5,0,1,6,7,11,12,13,14,9,10,15,16, - 2,3,4,5,0,1,6,7,11,12,13,14,9,10,15,16 - }; - __m512i shuffleVec1 = _mm512_loadu_epi8(shuffleMask1); - __m512h row1 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(a.v), shuffleVec1)); - __m512h row4 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(b.v), shuffleVec1)); + constexpr std::array shuffleMask3 = GetShuffleMaskEpi8<{2,0,1}>(); - constexpr std::uint8_t shuffleMask3[] { - 4,5,0,1,2,3,6,7,13,14,8,9,11,12,15,16, - 4,5,0,1,2,3,6,7,13,14,8,9,11,12,15,16, - 4,5,0,1,2,3,6,7,13,14,8,9,11,12,15,16, - 4,5,0,1,2,3,6,7,13,14,8,9,11,12,15,16 - }; - __m512i shuffleVec3 = _mm512_loadu_epi8(shuffleMask3); - __m512h row3 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(a.v), shuffleVec3)); - __m512h row2 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(b.v), shuffleVec3)); + __m512i shuffleVec3 = _mm512_castsi256_si512(_mm256_loadu_epi8(shuffleMask3.data())); + __m256h row3 = _mm256_castsi256_ph(_mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castph_si256(a.v)), shuffleVec3))); + __m256h row2 = _mm256_castsi256_ph(_mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castph_si256(b.v)), shuffleVec3))); - __m512h result = _mm512_mul_ph(row3, row4); - return _mm512_fmsub_ph(row1,row2,result); - } + __m256h result = _mm256_mul_ph(row3, row4); 
+ return _mm256_fmsub_ph(row1,row2,result); + } else { + constexpr std::array shuffleMask1 = GetShuffleMaskEpi8<{1,2,0}>(); + + __m512i shuffleVec1 = _mm512_loadu_epi8(shuffleMask1.data()); + __m512h row1 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(a.v), shuffleVec1)); + __m512h row4 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(b.v), shuffleVec1)); + + constexpr std::array shuffleMask3 = GetShuffleMaskEpi8<{2,0,1}>(); + + __m512i shuffleVec3 = _mm512_loadu_epi8(shuffleMask3.data()); + __m512h row3 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(a.v), shuffleVec3)); + __m512h row2 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(b.v), shuffleVec3)); + + __m512h result = _mm512_mul_ph(row3, row4); + return _mm512_fmsub_ph(row1,row2,result); } } @@ -496,1088 +480,1047 @@ namespace Crafter { } - constexpr static std::tuple, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16> Normalize( - VectorF16 A, - VectorF16 B, - VectorF16 C, - VectorF16 D, - VectorF16 E, - VectorF16 F, - VectorF16 G, - VectorF16 H - ) requires(Packing == 1) { - if constexpr(std::is_same_v) { - VectorF16 lenght = Length(A, B, C, D, E, F, G, H); - constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1}; - __m128h one = _mm_loadu_ph(oneArr); - __m128h fLenght = _mm_div_ph(one, lenght.v); - - constexpr std::uint8_t shuffleMaskA[] { - 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1 - }; - __m128i shuffleVecA = _mm_loadu_epi8(shuffleMaskA); - __m128h fLenghtA = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA)); - - constexpr std::uint8_t shuffleMaskB[] { - 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3 - }; - __m128i shuffleVecB = _mm_loadu_epi8(shuffleMaskB); - __m128h fLenghtB = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecB)); - - constexpr std::uint8_t shuffleMaskC[] { - 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5 - }; - __m128i shuffleVecC = _mm_loadu_epi8(shuffleMaskC); - __m128h fLenghtC = 
_mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecC)); - - constexpr std::uint8_t shuffleMaskD[] { - 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7 - }; - __m128i shuffleVecD = _mm_loadu_epi8(shuffleMaskD); - __m128h fLenghtD = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecD)); - - constexpr std::uint8_t shuffleMaskE[] { - 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9 - }; - __m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE); - __m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE)); - - constexpr std::uint8_t shuffleMaskF[] { - 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - }; - __m128i shuffleVecF = _mm_loadu_epi8(shuffleMaskF); - __m128h fLenghtF = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecF)); - - constexpr std::uint8_t shuffleMaskG[] { - 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, - }; - __m128i shuffleVecG = _mm_loadu_epi8(shuffleMaskG); - __m128h fLenghtG = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecG)); - - constexpr std::uint8_t shuffleMaskH[] { - 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, - }; - __m128i shuffleVecH = _mm_loadu_epi8(shuffleMaskH); - __m128h fLenghtH = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecH)); - - return { - _mm_mul_ph(A.v, fLenghtA), - _mm_mul_ph(B.v, fLenghtB), - _mm_mul_ph(C.v, fLenghtC), - _mm_mul_ph(D.v, fLenghtD), - _mm_mul_ph(E.v, fLenghtE), - _mm_mul_ph(F.v, fLenghtF), - _mm_mul_ph(G.v, fLenghtG), - _mm_mul_ph(H.v, fLenghtH) - }; - } else if constexpr(std::is_same_v) { - VectorF16 lenght = Length(A, B, C, D, E, F, G, H); - constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - __m256h one = _mm256_loadu_ph(oneArr); - __m256h fLenght = _mm256_div_ph(one, lenght.v); - - constexpr std::uint8_t shuffleMaskA[] { - 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, - 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1 - }; - __m256i shuffleVecA = _mm256_loadu_epi8(shuffleMaskA); - __m256h 
fLenghtA = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecA)); - - constexpr std::uint8_t shuffleMaskB[] { - 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3, - 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3 - }; - __m256i shuffleVecB = _mm256_loadu_epi8(shuffleMaskB); - __m256h fLenghtB = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecB)); - - constexpr std::uint8_t shuffleMaskC[] { - 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5, - 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5 - }; - __m256i shuffleVecC = _mm256_loadu_epi8(shuffleMaskC); - __m256h fLenghtC = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecC)); - - constexpr std::uint8_t shuffleMaskD[] { - 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7, - 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7 - }; - __m256i shuffleVecD = _mm256_loadu_epi8(shuffleMaskD); - __m256h fLenghtD = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecD)); - - constexpr std::uint8_t shuffleMaskE[] { - 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9, - 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9 - }; - __m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE); - __m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE)); - - constexpr std::uint8_t shuffleMaskF[] { - 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - }; - __m256i shuffleVecF = _mm256_loadu_epi8(shuffleMaskF); - __m256h fLenghtF = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecF)); - - constexpr std::uint8_t shuffleMaskG[] { - 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, - 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13 - }; - __m256i shuffleVecG = _mm256_loadu_epi8(shuffleMaskG); - __m256h fLenghtG = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecG)); - - constexpr std::uint8_t shuffleMaskH[] { - 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, - 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15 - }; - 
__m256i shuffleVecH = _mm256_loadu_epi8(shuffleMaskH); - __m256h fLenghtH = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecH)); - - return { - _mm256_mul_ph(A.v, fLenghtA), - _mm256_mul_ph(B.v, fLenghtB), - _mm256_mul_ph(C.v, fLenghtC), - _mm256_mul_ph(D.v, fLenghtD), - _mm256_mul_ph(E.v, fLenghtE), - _mm256_mul_ph(F.v, fLenghtF), - _mm256_mul_ph(G.v, fLenghtG), - _mm256_mul_ph(H.v, fLenghtH) - }; - } else { - VectorF16 lenght = Length(A, B, C, D, E, F, G, H); - constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - __m512h one = _mm512_loadu_ph(oneArr); - __m512h fLenght = _mm512_div_ph(one, lenght.v); - - constexpr std::uint8_t shuffleMaskA[] { - 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, - 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, - 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, - 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1 - }; - __m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA); - __m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA)); - - constexpr std::uint8_t shuffleMaskB[] { - 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3, - 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3, - 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3, - 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3 - }; - __m512i shuffleVecB = _mm512_loadu_epi8(shuffleMaskB); - __m512h fLenghtB = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecB)); - - constexpr std::uint8_t shuffleMaskC[] { - 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5, - 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5, - 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5, - 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5 - }; - __m512i shuffleVecC = _mm512_loadu_epi8(shuffleMaskC); - __m512h fLenghtC = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecC)); - - constexpr std::uint8_t shuffleMaskD[] { - 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7, - 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7, - 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7, - 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7 - }; - __m512i shuffleVecD = 
_mm512_loadu_epi8(shuffleMaskD); - __m512h fLenghtD = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecD)); - - constexpr std::uint8_t shuffleMaskE[] { - 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9, - 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9, - 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9, - 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9 - }; - __m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE); - __m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE)); - - constexpr std::uint8_t shuffleMaskF[] { - 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - }; - __m512i shuffleVecF = _mm512_loadu_epi8(shuffleMaskF); - __m512h fLenghtF = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecF)); - - constexpr std::uint8_t shuffleMaskG[] { - 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, - 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, - 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, - 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13 - }; - __m512i shuffleVecG = _mm512_loadu_epi8(shuffleMaskG); - __m512h fLenghtG = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecG)); - - constexpr std::uint8_t shuffleMaskH[] { - 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, - 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, - 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, - 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15 - }; - __m512i shuffleVecH = _mm512_loadu_epi8(shuffleMaskH); - __m512h fLenghtH = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecH)); - - return { - _mm512_mul_ph(A.v, fLenghtA), - _mm512_mul_ph(B.v, fLenghtB), - _mm512_mul_ph(C.v, fLenghtC), - _mm512_mul_ph(D.v, fLenghtD), - _mm512_mul_ph(E.v, fLenghtE), - _mm512_mul_ph(F.v, fLenghtF), - _mm512_mul_ph(G.v, fLenghtG), - _mm512_mul_ph(H.v, 
fLenghtH) - }; - } - } - - constexpr static std::tuple, VectorF16, VectorF16, VectorF16> Normalize( - VectorF16 A, - VectorF16 C, - VectorF16 E, - VectorF16 G - ) requires(Packing == 2) { - if constexpr(std::is_same_v) { - VectorF16 lenght = Length(A, C, E, G); - constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1}; - __m128h one = _mm_loadu_ph(oneArr); - __m128h fLenght = _mm_div_ph(one, lenght.v); - - constexpr std::uint8_t shuffleMaskA[] { - 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3 - }; - __m128i shuffleVecA = _mm_loadu_epi8(shuffleMaskA); - __m128h fLenghtA = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA)); - - constexpr std::uint8_t shuffleMaskC[] { - 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7 - }; - __m128i shuffleVecC = _mm_loadu_epi8(shuffleMaskC); - __m128h fLenghtC = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecC)); - - constexpr std::uint8_t shuffleMaskE[] { - 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11 - }; - __m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE); - __m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE)); - - constexpr std::uint8_t shuffleMaskG[] { - 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, - }; - __m128i shuffleVecG = _mm_loadu_epi8(shuffleMaskG); - __m128h fLenghtG = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecG)); - - return { - _mm_mul_ph(A.v, fLenghtA), - _mm_mul_ph(C.v, fLenghtC), - _mm_mul_ph(E.v, fLenghtE), - _mm_mul_ph(G.v, fLenghtG), - }; - } else if constexpr(std::is_same_v) { - VectorF16 lenght = Length(A, C, E, G); - constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - __m256h one = _mm256_loadu_ph(oneArr); - __m256h fLenght = _mm256_div_ph(one, lenght.v); - - constexpr std::uint8_t shuffleMaskA[] { - 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3, - 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3 - }; - __m256i shuffleVecA = _mm256_loadu_epi8(shuffleMaskA); - __m256h fLenghtA = 
_mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecA)); - - constexpr std::uint8_t shuffleMaskC[] { - 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7, - 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7 - }; - __m256i shuffleVecC = _mm256_loadu_epi8(shuffleMaskC); - __m256h fLenghtC = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecC)); - - constexpr std::uint8_t shuffleMaskE[] { - 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11, - 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11 - }; - __m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE); - __m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE)); - - constexpr std::uint8_t shuffleMaskG[] { - 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, - 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, - }; - __m256i shuffleVecG = _mm256_loadu_epi8(shuffleMaskG); - __m256h fLenghtG = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecG)); - - return { - _mm256_mul_ph(A.v, fLenghtA), - _mm256_mul_ph(C.v, fLenghtC), - _mm256_mul_ph(E.v, fLenghtE), - _mm256_mul_ph(G.v, fLenghtG), - }; - } else { - VectorF16 lenght = Length(A, C, E, G); - constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - __m512h one = _mm512_loadu_ph(oneArr); - __m512h fLenght = _mm512_div_ph(one, lenght.v); - - constexpr std::uint8_t shuffleMaskA[] { - 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3, - 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3, - 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3, - 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3 - }; - __m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA); - __m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA)); - - constexpr std::uint8_t shuffleMaskC[] { - 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7, - 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7, - 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7, - 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7 - }; - __m512i shuffleVecC = 
_mm512_loadu_epi8(shuffleMaskC); - __m512h fLenghtC = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecC)); - - constexpr std::uint8_t shuffleMaskE[] { - 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11, - 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11, - 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11, - 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11 - }; - __m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE); - __m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE)); - - constexpr std::uint8_t shuffleMaskG[] { - 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, - 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, - 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, - 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, - }; - __m512i shuffleVecG = _mm512_loadu_epi8(shuffleMaskG); - __m512h fLenghtG = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecG)); - - return { - VectorF16(_mm512_mul_ph(A.v, fLenghtA)), - VectorF16(_mm512_mul_ph(C.v, fLenghtC)), - VectorF16(_mm512_mul_ph(E.v, fLenghtE)), - VectorF16(_mm512_mul_ph(G.v, fLenghtG)), - }; - } - } - - constexpr static std::tuple, VectorF16> Normalize( - VectorF16 A, - VectorF16 E - ) requires(Packing == 4) { - if constexpr(std::is_same_v) { - VectorF16 lenght = Length(A, E); - constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1}; - __m128h one = _mm_loadu_ph(oneArr); - __m128h fLenght = _mm_div_ph(one, lenght.v); - - constexpr std::uint8_t shuffleMaskA[] { - 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7 - }; - __m128i shuffleVecA = _mm_loadu_epi8(shuffleMaskA); - __m128h fLenghtA = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA)); - - constexpr std::uint8_t shuffleMaskE[] { - 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15 - }; - __m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE); - __m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE)); - - return { - _mm_mul_ph(A.v, fLenghtA), - _mm_mul_ph(E.v, 
fLenghtE), - }; - } else if constexpr(std::is_same_v) { - VectorF16 lenght = Length(A, E); - constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - __m256h one = _mm256_loadu_ph(oneArr); - __m256h fLenght = _mm256_div_ph(one, lenght.v); - - constexpr std::uint8_t shuffleMaskA[] { - 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7, - 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7 - }; - __m256i shuffleVecA = _mm256_loadu_epi8(shuffleMaskA); - __m256h fLenghtA = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecA)); - - constexpr std::uint8_t shuffleMaskE[] { - 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15, - 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15 - }; - __m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE); - __m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE)); - - return { - _mm256_mul_ph(A.v, fLenghtA), - _mm256_mul_ph(E.v, fLenghtE), - }; - } else { - VectorF16 lenght = Length(A, E); - constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - __m512h one = _mm512_loadu_ph(oneArr); - __m512h fLenght = _mm512_div_ph(one, lenght.v); - - constexpr std::uint8_t shuffleMaskA[] { - 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7, - 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7, - 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7, - 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7 - }; - __m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA); - __m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA)); - - constexpr std::uint8_t shuffleMaskE[] { - 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15, - 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15, - 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15, - 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15 - }; - __m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE); - __m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE)); - - return { - _mm512_mul_ph(A.v, fLenghtA), - 
_mm512_mul_ph(E.v, fLenghtE), - }; - } - } - - constexpr static std::tuple, VectorF16> NormalizeRepeated( - VectorF16 A, - VectorF16 B, - VectorF16 C, - VectorF16 D, - VectorF16 E, - VectorF16 F, - VectorF16 G, - VectorF16 H - ) requires(Len == 8 && Packing == 1 && Repeats == 1) { - constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - VectorF16 lenght = Length(A, B, C, D, E, F, G, H); - __m128h one = _mm_loadu_ph(oneArr); - __m128h fLenght = _mm_div_ph(one, lenght.v); - - constexpr std::uint8_t shuffleMaskABCD[] { - 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, - 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3, - 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5, - 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7 - }; - __m512i shuffleVecABCD = _mm512_loadu_epi8(shuffleMaskABCD); //10 0.5 - __m512h fLenghtABCD = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(_mm512_castph128_ph512(fLenght)), shuffleVecABCD)); //1 1 - - __m512h vecABCD; - vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(A.v), 0)); //3 1 - vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(B.v), 1)); - vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(C.v), 2)); - vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(D.v), 3)); - vecABCD = _mm512_mul_ph(vecABCD, fLenghtABCD); //4 0.5 - - constexpr std::uint8_t shuffleMaskEFGH[] { - 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9, - 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, - 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, - }; - __m512h shuffleVecEFGH = _mm512_loadu_epi8(shuffleMaskEFGH); //10 0.5 - __m512h fLenghtEFGH = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(_mm512_castph128_ph512(fLenght)), _mm512_castph_si512(shuffleVecEFGH))); //1 1 - - __m512h vecEFGH; - vecEFGH = _mm512_castps_ph(_mm512_insertf32x4(vecEFGH, _mm_castph_ps(E.v), 0)); //3 1 - vecEFGH = 
_mm512_castps_ph(_mm512_insertf32x4(vecEFGH, _mm_castph_ps(F.v), 1)); - vecEFGH = _mm512_castps_ph(_mm512_insertf32x4(vecEFGH, _mm_castph_ps(G.v), 2)); - vecEFGH = _mm512_castps_ph(_mm512_insertf32x4(vecEFGH, _mm_castph_ps(H.v), 3)); - vecEFGH = _mm512_mul_ph(vecABCD, fLenghtEFGH); //4 0.5 - return { - vecABCD, - vecEFGH - }; - } - - constexpr static std::tuple, VectorF16> NormalizeRepeated( - VectorF16 A, - VectorF16 B, - VectorF16 C, - VectorF16 D, - VectorF16 E, - VectorF16 F, - VectorF16 G, - VectorF16 H - ) requires(Len == 4 && Packing == 2 && Repeats == 1) { - VectorF16 lenght = Length(A, B, C, D, E, F, G, H); - constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - __m128h one = _mm_loadu_ph(oneArr); - __m128h fLenght = _mm_div_ph(one, lenght.v); - - constexpr std::uint8_t shuffleMaskABCD[] { - 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3, - 4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7, - 8,9,8,9,8,9,8,9,10,11,10,11,10,11, - 12,13,12,13,12,13,14,15,14,15,14,15 - }; - __m512i shuffleVecABCD = _mm512_loadu_epi8(shuffleMaskABCD); - __m512h fLenghtABCD = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(_mm512_castph128_ph512(fLenght)), shuffleVecABCD)); - - __m512h vecABCD; - vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(A.v), 0)); - vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(B.v), 1)); - vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(C.v), 2)); - vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(D.v), 3)); - vecABCD = _mm512_mul_ph(vecABCD, fLenghtABCD); - return vecABCD; - } - - constexpr static std::tuple, VectorF16> NormalizeRepeated( - VectorF16 A, - VectorF16 B, - VectorF16 C, - VectorF16 D, - VectorF16 E, - VectorF16 F, - VectorF16 G, - VectorF16 H - ) requires(Len == 2 && Packing == 4 && Repeats == 1) { - VectorF16 lenght = Length(A, B, C, D, E, F, G, H); - constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - __m128h one = _mm_loadu_ph(oneArr); - __m128h fLenght = _mm_div_ph(one, lenght.v); - - constexpr std::uint8_t shuffleMaskABCD[] { - 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7, - 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15, - 16,17,16,17,18,19,18,19,20,21,20,21,22,23,22,23, - 24,25,24,25,26,27,26,27,28,29,28,29,30,31,30,31 - }; - __m512i shuffleVecABCD = _mm512_loadu_epi8(shuffleMaskABCD); - __m512h fLenghtABCD = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(_mm512_castph128_ph512(fLenght)), shuffleVecABCD)); - - __m512h vecABCD; - vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(A.v), 0)); - vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(B.v), 1)); - vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(C.v), 2)); - vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(D.v), 3)); - vecABCD = _mm512_mul_ph(vecABCD, fLenghtABCD); - return vecABCD; - } - - constexpr static VectorF16 Length( - VectorF16 A, - VectorF16 B, - VectorF16 C, - VectorF16 D, - VectorF16 E, - VectorF16 F, - VectorF16 G, - VectorF16 H - ) requires(Packing == 1) { - VectorF16 lenghtSq = LengthSq(A, B, C, D, E, F, G, H); - if constexpr(std::is_same_v) { - return VectorF16(_mm_sqrt_ph(lenghtSq.v)); - } else if constexpr(std::is_same_v) { - return VectorF16(_mm256_sqrt_ph(lenghtSq.v)); - } else { - return VectorF16(_mm512_sqrt_ph(lenghtSq.v)); - } - } - - constexpr static VectorF16 Length( - VectorF16 A, - VectorF16 C, - VectorF16 E, - VectorF16 G - ) requires(Packing == 2) { - VectorF16 lenghtSq = LengthSq(A, C, E, G); - if constexpr(std::is_same_v) { - return VectorF16(_mm_sqrt_ph(lenghtSq.v)); - } else if constexpr(std::is_same_v) { - return VectorF16(_mm256_sqrt_ph(lenghtSq.v)); - } else { - return VectorF16(_mm512_sqrt_ph(lenghtSq.v)); - } - } - - constexpr static VectorF16 Length( - VectorF16 A, - VectorF16 E - ) requires(Packing == 2) { - 
VectorF16 lenghtSq = LengthSq(A, E); - if constexpr(std::is_same_v) { - return VectorF16(_mm_sqrt_ph(lenghtSq.v)); - } else if constexpr(std::is_same_v) { - return VectorF16(_mm256_sqrt_ph(lenghtSq.v)); - } else { - return VectorF16(_mm512_sqrt_ph(lenghtSq.v)); - } - } - - constexpr static VectorF16 LengthSq( - VectorF16 A, - VectorF16 B, - VectorF16 C, - VectorF16 D, - VectorF16 E, - VectorF16 F, - VectorF16 G, - VectorF16 H - ) requires(Packing == 1) { - return Dot(A, A, B, B, C, C, D, D, E, E, F, F, G, G, H, H); - } - - constexpr static VectorF16 LengthSq( - VectorF16 A, - VectorF16 C, - VectorF16 E, - VectorF16 G - ) requires(Packing == 2) { - return Dot(A, A, C, C, E, E, G, G); - } - - constexpr static VectorF16 LengthSq( - VectorF16 A, - VectorF16 E - ) requires(Packing == 4) { - return Dot(A, A, E, E); - } - - constexpr static VectorF16 Dot( - VectorF16 A0, VectorF16 A1, - VectorF16 B0, VectorF16 B1, - VectorF16 C0, VectorF16 C1, - VectorF16 D0, VectorF16 D1, - VectorF16 E0, VectorF16 E1, - VectorF16 F0, VectorF16 F1, - VectorF16 G0, VectorF16 G1, - VectorF16 H0, VectorF16 H1 - ) requires(Packing == 1) { - if constexpr(std::is_same_v) { - __m128h mulA = _mm_mul_ph(A0.v, A1.v); - __m128h mulB = _mm_mul_ph(B0.v, B1.v); - __m128i row12Temp1 = _mm_unpacklo_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulB)); // A1 B1 A2 B2 A3 B3 A4 B4 - __m128i row56Temp1 = _mm_unpackhi_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulB)); // A5 B5 A6 B6 A7 B7 A8 B8 - __m128i row1TempTemp1 = row12Temp1; - __m128i row5TempTemp1 = row56Temp1; - - __m128h mulC = _mm_mul_ph(C0.v, C1.v); - __m128h mulD = _mm_mul_ph(D0.v, D1.v); - __m128i row34Temp1 = _mm_unpacklo_epi16(_mm_castph_si128(mulC), _mm_castph_si128(mulD)); // C1 D1 C2 D2 C3 D3 C4 D4 - __m128i row78Temp1 = _mm_unpackhi_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulB)); // C5 D5 C6 D6 C7 D7 C8 D8 - - row12Temp1 = _mm_unpacklo_epi16(row12Temp1, row34Temp1); // A1 C1 B1 D1 A2 C2 B2 D2 - row34Temp1 = 
_mm_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 C3 B3 D3 A4 C4 B4 D4 - row56Temp1 = _mm_unpacklo_epi16(row56Temp1, row78Temp1); // A5 C5 B5 D5 A6 C6 B6 D6 - row78Temp1 = _mm_unpackhi_epi16(row5TempTemp1, row78Temp1); // A7 C7 B7 D7 A8 C8 B8 D8 - - __m128h mulE = _mm_mul_ph(E0.v, E1.v); - __m128h mulF = _mm_mul_ph(F0.v, F1.v); - __m128i row12Temp2 = _mm_unpacklo_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulF)); //E1 F1 E2 F2 E3 F3 E4 F4 - __m128i row56Temp2 = _mm_unpackhi_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulF)); //E5 F5 E6 F6 E7 F7 E8 F8 - __m128i row1TempTemp2 = row12Temp2; - __m128i row5TempTemp2 = row56Temp2; - - __m128h mulG = _mm_mul_ph(G0.v, G1.v); - __m128h mulH = _mm_mul_ph(H0.v, H1.v); - __m128i row34Temp2 = _mm_unpacklo_epi16(_mm_castph_si128(mulG), _mm_castph_si128(mulH)); //G1 H1 G2 H2 G3 H3 G4 H4 - __m128i row78Temp2 = _mm_unpackhi_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulF)); //G5 H5 G6 H6 G7 H7 G8 H8 - - row12Temp2 = _mm_unpacklo_epi16(row12Temp2, row34Temp2); // E1 G1 F1 H1 E2 G2 F2 H2 - row34Temp2 = _mm_unpackhi_epi16(row1TempTemp2, row34Temp2); // E3 G3 F3 H3 E4 G4 F4 H4 - row56Temp2 = _mm_unpacklo_epi16(row56Temp2, row78Temp2); // E5 G5 F5 H5 E6 G6 F6 H6 - row78Temp2 = _mm_unpackhi_epi16(row5TempTemp2, row78Temp2); // E7 G7 F7 H7 E8 G8 F8 H8 - - __m128h row1 = _mm_castsi128_ph(_mm_unpackhi_epi16(row12Temp1, row12Temp2));// A1 E1 C1 G1 B1 F1 D1 H1 - __m128h row2 = _mm_castsi128_ph(_mm_unpacklo_epi16(row12Temp1, row12Temp2));// A2 E2 C2 G2 B2 F2 D2 H2 - __m128h row3 = _mm_castsi128_ph(_mm_unpackhi_epi16(row34Temp1, row34Temp2));// A3 E3 C3 G3 B3 F3 D3 H3 - __m128h row4 = _mm_castsi128_ph(_mm_unpacklo_epi16(row34Temp1, row34Temp2));// A4 E4 C4 G4 B4 F4 D4 H4 - __m128h row5 = _mm_castsi128_ph(_mm_unpackhi_epi16(row56Temp1, row56Temp2));// A5 E5 C5 G5 B5 F5 D5 H5 - __m128h row6 = _mm_castsi128_ph(_mm_unpacklo_epi16(row56Temp1, row56Temp2));// A6 E6 C6 G6 B6 F6 D6 H6 - __m128h row7 = 
_mm_castsi128_ph(_mm_unpackhi_epi16(row78Temp1, row78Temp2));// A7 E7 C7 G7 B7 F7 D7 H7 - __m128h row8 = _mm_castsi128_ph(_mm_unpacklo_epi16(row78Temp1, row78Temp2));// A8 E8 C8 G8 B8 F8 D8 H8 - - row1 = _mm_add_ph(row1, row2); - row1 = _mm_add_ph(row1, row3); - row1 = _mm_add_ph(row1, row4); - row1 = _mm_add_ph(row1, row5); - row1 = _mm_add_ph(row1, row6); - row1 = _mm_add_ph(row1, row7); - row1 = _mm_add_ph(row1, row8); + // constexpr static std::tuple, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16> Normalize( + // VectorF16 A, + // VectorF16 B, + // VectorF16 C, + // VectorF16 D, + // VectorF16 E, + // VectorF16 F, + // VectorF16 G, + // VectorF16 H + // ) requires(Packing == 1) { + // if constexpr(std::is_same_v) { + // VectorF16 lenght = Length(A, B, C, D, E, F, G, H); + // constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1}; + // __m128h one = _mm_loadu_ph(oneArr); + // __m128h fLenght = _mm_div_ph(one, lenght.v); + + // constexpr std::uint8_t shuffleMaskA[] { + // 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1 + // }; + // __m128i shuffleVecA = _mm_loadu_epi8(shuffleMaskA); + // __m128h fLenghtA = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA)); + + // constexpr std::uint8_t shuffleMaskB[] { + // 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3 + // }; + // __m128i shuffleVecB = _mm_loadu_epi8(shuffleMaskB); + // __m128h fLenghtB = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecB)); + + // constexpr std::uint8_t shuffleMaskC[] { + // 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5 + // }; + // __m128i shuffleVecC = _mm_loadu_epi8(shuffleMaskC); + // __m128h fLenghtC = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecC)); + + // constexpr std::uint8_t shuffleMaskD[] { + // 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7 + // }; + // __m128i shuffleVecD = _mm_loadu_epi8(shuffleMaskD); + // __m128h fLenghtD = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecD)); + + // constexpr std::uint8_t 
shuffleMaskE[] { + // 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9 + // }; + // __m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE); + // __m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE)); + + // constexpr std::uint8_t shuffleMaskF[] { + // 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + // }; + // __m128i shuffleVecF = _mm_loadu_epi8(shuffleMaskF); + // __m128h fLenghtF = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecF)); + + // constexpr std::uint8_t shuffleMaskG[] { + // 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, + // }; + // __m128i shuffleVecG = _mm_loadu_epi8(shuffleMaskG); + // __m128h fLenghtG = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecG)); + + // constexpr std::uint8_t shuffleMaskH[] { + // 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, + // }; + // __m128i shuffleVecH = _mm_loadu_epi8(shuffleMaskH); + // __m128h fLenghtH = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecH)); + + // return { + // _mm_mul_ph(A.v, fLenghtA), + // _mm_mul_ph(B.v, fLenghtB), + // _mm_mul_ph(C.v, fLenghtC), + // _mm_mul_ph(D.v, fLenghtD), + // _mm_mul_ph(E.v, fLenghtE), + // _mm_mul_ph(F.v, fLenghtF), + // _mm_mul_ph(G.v, fLenghtG), + // _mm_mul_ph(H.v, fLenghtH) + // }; + // } else if constexpr(std::is_same_v) { + // VectorF16 lenght = Length(A, B, C, D, E, F, G, H); + // constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + // __m256h one = _mm256_loadu_ph(oneArr); + // __m256h fLenght = _mm256_div_ph(one, lenght.v); + + // constexpr std::uint8_t shuffleMaskA[] { + // 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, + // 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1 + // }; + // __m256i shuffleVecA = _mm256_loadu_epi8(shuffleMaskA); + // __m256h fLenghtA = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecA)); + + // constexpr std::uint8_t shuffleMaskB[] { + // 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3, + // 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3 + 
// }; + // __m256i shuffleVecB = _mm256_loadu_epi8(shuffleMaskB); + // __m256h fLenghtB = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecB)); + + // constexpr std::uint8_t shuffleMaskC[] { + // 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5, + // 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5 + // }; + // __m256i shuffleVecC = _mm256_loadu_epi8(shuffleMaskC); + // __m256h fLenghtC = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecC)); + + // constexpr std::uint8_t shuffleMaskD[] { + // 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7, + // 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7 + // }; + // __m256i shuffleVecD = _mm256_loadu_epi8(shuffleMaskD); + // __m256h fLenghtD = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecD)); + + // constexpr std::uint8_t shuffleMaskE[] { + // 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9, + // 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9 + // }; + // __m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE); + // __m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE)); + + // constexpr std::uint8_t shuffleMaskF[] { + // 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + // 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + // }; + // __m256i shuffleVecF = _mm256_loadu_epi8(shuffleMaskF); + // __m256h fLenghtF = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecF)); + + // constexpr std::uint8_t shuffleMaskG[] { + // 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, + // 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13 + // }; + // __m256i shuffleVecG = _mm256_loadu_epi8(shuffleMaskG); + // __m256h fLenghtG = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecG)); + + // constexpr std::uint8_t shuffleMaskH[] { + // 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, + // 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15 + // }; + // __m256i shuffleVecH = _mm256_loadu_epi8(shuffleMaskH); + // __m256h fLenghtH = 
_mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecH)); + + // return { + // _mm256_mul_ph(A.v, fLenghtA), + // _mm256_mul_ph(B.v, fLenghtB), + // _mm256_mul_ph(C.v, fLenghtC), + // _mm256_mul_ph(D.v, fLenghtD), + // _mm256_mul_ph(E.v, fLenghtE), + // _mm256_mul_ph(F.v, fLenghtF), + // _mm256_mul_ph(G.v, fLenghtG), + // _mm256_mul_ph(H.v, fLenghtH) + // }; + // } else { + // VectorF16 lenght = Length(A, B, C, D, E, F, G, H); + // constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + // __m512h one = _mm512_loadu_ph(oneArr); + // __m512h fLenght = _mm512_div_ph(one, lenght.v); + + // constexpr std::uint8_t shuffleMaskA[] { + // 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, + // 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, + // 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, + // 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1 + // }; + // __m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA); + // __m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA)); + + // constexpr std::uint8_t shuffleMaskB[] { + // 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3, + // 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3, + // 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3, + // 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3 + // }; + // __m512i shuffleVecB = _mm512_loadu_epi8(shuffleMaskB); + // __m512h fLenghtB = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecB)); + + // constexpr std::uint8_t shuffleMaskC[] { + // 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5, + // 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5, + // 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5, + // 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5 + // }; + // __m512i shuffleVecC = _mm512_loadu_epi8(shuffleMaskC); + // __m512h fLenghtC = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecC)); + + // constexpr std::uint8_t shuffleMaskD[] { + // 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7, + // 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7, + // 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7, + // 
6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7 + // }; + // __m512i shuffleVecD = _mm512_loadu_epi8(shuffleMaskD); + // __m512h fLenghtD = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecD)); + + // constexpr std::uint8_t shuffleMaskE[] { + // 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9, + // 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9, + // 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9, + // 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9 + // }; + // __m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE); + // __m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE)); + + // constexpr std::uint8_t shuffleMaskF[] { + // 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + // 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + // 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + // 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + // }; + // __m512i shuffleVecF = _mm512_loadu_epi8(shuffleMaskF); + // __m512h fLenghtF = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecF)); + + // constexpr std::uint8_t shuffleMaskG[] { + // 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, + // 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, + // 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, + // 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13 + // }; + // __m512i shuffleVecG = _mm512_loadu_epi8(shuffleMaskG); + // __m512h fLenghtG = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecG)); + + // constexpr std::uint8_t shuffleMaskH[] { + // 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, + // 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, + // 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, + // 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15 + // }; + // __m512i shuffleVecH = _mm512_loadu_epi8(shuffleMaskH); + // __m512h fLenghtH = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecH)); + + // return { + // _mm512_mul_ph(A.v, fLenghtA), + // _mm512_mul_ph(B.v, fLenghtB), + // 
_mm512_mul_ph(C.v, fLenghtC), + // _mm512_mul_ph(D.v, fLenghtD), + // _mm512_mul_ph(E.v, fLenghtE), + // _mm512_mul_ph(F.v, fLenghtF), + // _mm512_mul_ph(G.v, fLenghtG), + // _mm512_mul_ph(H.v, fLenghtH) + // }; + // } + // } + + // constexpr static std::tuple, VectorF16, VectorF16, VectorF16> Normalize( + // VectorF16 A, + // VectorF16 C, + // VectorF16 E, + // VectorF16 G + // ) requires(Packing == 2) { + // if constexpr(std::is_same_v) { + // VectorF16 lenght = Length(A, C, E, G); + // constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1}; + // __m128h one = _mm_loadu_ph(oneArr); + // __m128h fLenght = _mm_div_ph(one, lenght.v); + + // constexpr std::uint8_t shuffleMaskA[] { + // 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3 + // }; + // __m128i shuffleVecA = _mm_loadu_epi8(shuffleMaskA); + // __m128h fLenghtA = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA)); + + // constexpr std::uint8_t shuffleMaskC[] { + // 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7 + // }; + // __m128i shuffleVecC = _mm_loadu_epi8(shuffleMaskC); + // __m128h fLenghtC = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecC)); + + // constexpr std::uint8_t shuffleMaskE[] { + // 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11 + // }; + // __m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE); + // __m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE)); + + // constexpr std::uint8_t shuffleMaskG[] { + // 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, + // }; + // __m128i shuffleVecG = _mm_loadu_epi8(shuffleMaskG); + // __m128h fLenghtG = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecG)); + + // return { + // _mm_mul_ph(A.v, fLenghtA), + // _mm_mul_ph(C.v, fLenghtC), + // _mm_mul_ph(E.v, fLenghtE), + // _mm_mul_ph(G.v, fLenghtG), + // }; + // } else if constexpr(std::is_same_v) { + // VectorF16 lenght = Length(A, C, E, G); + // constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + // __m256h 
one = _mm256_loadu_ph(oneArr); + // __m256h fLenght = _mm256_div_ph(one, lenght.v); + + // constexpr std::uint8_t shuffleMaskA[] { + // 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3, + // 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3 + // }; + // __m256i shuffleVecA = _mm256_loadu_epi8(shuffleMaskA); + // __m256h fLenghtA = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecA)); + + // constexpr std::uint8_t shuffleMaskC[] { + // 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7, + // 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7 + // }; + // __m256i shuffleVecC = _mm256_loadu_epi8(shuffleMaskC); + // __m256h fLenghtC = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecC)); + + // constexpr std::uint8_t shuffleMaskE[] { + // 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11, + // 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11 + // }; + // __m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE); + // __m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE)); + + // constexpr std::uint8_t shuffleMaskG[] { + // 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, + // 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, + // }; + // __m256i shuffleVecG = _mm256_loadu_epi8(shuffleMaskG); + // __m256h fLenghtG = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecG)); + + // return { + // _mm256_mul_ph(A.v, fLenghtA), + // _mm256_mul_ph(C.v, fLenghtC), + // _mm256_mul_ph(E.v, fLenghtE), + // _mm256_mul_ph(G.v, fLenghtG), + // }; + // } else { + // VectorF16 lenght = Length(A, C, E, G); + // constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + // __m512h one = _mm512_loadu_ph(oneArr); + // __m512h fLenght = _mm512_div_ph(one, lenght.v); + + // constexpr std::uint8_t shuffleMaskA[] { + // 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3, + // 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3, + // 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3, + // 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3 + // }; + // 
__m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA); + // __m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA)); + + // constexpr std::uint8_t shuffleMaskC[] { + // 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7, + // 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7, + // 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7, + // 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7 + // }; + // __m512i shuffleVecC = _mm512_loadu_epi8(shuffleMaskC); + // __m512h fLenghtC = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecC)); + + // constexpr std::uint8_t shuffleMaskE[] { + // 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11, + // 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11, + // 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11, + // 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11 + // }; + // __m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE); + // __m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE)); + + // constexpr std::uint8_t shuffleMaskG[] { + // 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, + // 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, + // 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, + // 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, + // }; + // __m512i shuffleVecG = _mm512_loadu_epi8(shuffleMaskG); + // __m512h fLenghtG = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecG)); + + // return { + // VectorF16(_mm512_mul_ph(A.v, fLenghtA)), + // VectorF16(_mm512_mul_ph(C.v, fLenghtC)), + // VectorF16(_mm512_mul_ph(E.v, fLenghtE)), + // VectorF16(_mm512_mul_ph(G.v, fLenghtG)), + // }; + // } + // } + + // constexpr static std::tuple, VectorF16> Normalize( + // VectorF16 A, + // VectorF16 E + // ) requires(Packing == 4) { + // if constexpr(std::is_same_v) { + // VectorF16 lenght = Length(A, E); + // constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1}; + // __m128h one = _mm_loadu_ph(oneArr); + // __m128h fLenght = _mm_div_ph(one, lenght.v); + + // constexpr std::uint8_t 
shuffleMaskA[] { + // 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7 + // }; + // __m128i shuffleVecA = _mm_loadu_epi8(shuffleMaskA); + // __m128h fLenghtA = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA)); + + // constexpr std::uint8_t shuffleMaskE[] { + // 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15 + // }; + // __m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE); + // __m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE)); + + // return { + // _mm_mul_ph(A.v, fLenghtA), + // _mm_mul_ph(E.v, fLenghtE), + // }; + // } else if constexpr(std::is_same_v) { + // VectorF16 lenght = Length(A, E); + // constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + // __m256h one = _mm256_loadu_ph(oneArr); + // __m256h fLenght = _mm256_div_ph(one, lenght.v); + + // constexpr std::uint8_t shuffleMaskA[] { + // 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7, + // 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7 + // }; + // __m256i shuffleVecA = _mm256_loadu_epi8(shuffleMaskA); + // __m256h fLenghtA = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecA)); + + // constexpr std::uint8_t shuffleMaskE[] { + // 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15, + // 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15 + // }; + // __m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE); + // __m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE)); + + // return { + // _mm256_mul_ph(A.v, fLenghtA), + // _mm256_mul_ph(E.v, fLenghtE), + // }; + // } else { + // VectorF16 lenght = Length(A, E); + // constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + // __m512h one = _mm512_loadu_ph(oneArr); + // __m512h fLenght = _mm512_div_ph(one, lenght.v); + + // constexpr std::uint8_t shuffleMaskA[] { + // 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7, + // 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7, + // 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7, + // 
0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7 + // }; + // __m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA); + // __m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA)); + + // constexpr std::uint8_t shuffleMaskE[] { + // 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15, + // 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15, + // 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15, + // 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15 + // }; + // __m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE); + // __m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE)); + + // return { + // _mm512_mul_ph(A.v, fLenghtA), + // _mm512_mul_ph(E.v, fLenghtE), + // }; + // } + // } + + // constexpr static VectorF16 Length( + // VectorF16 A, + // VectorF16 B, + // VectorF16 C, + // VectorF16 D, + // VectorF16 E, + // VectorF16 F, + // VectorF16 G, + // VectorF16 H + // ) requires(Packing == 1) { + // VectorF16 lenghtSq = LengthSq(A, B, C, D, E, F, G, H); + // if constexpr(std::is_same_v) { + // return VectorF16(_mm_sqrt_ph(lenghtSq.v)); + // } else if constexpr(std::is_same_v) { + // return VectorF16(_mm256_sqrt_ph(lenghtSq.v)); + // } else { + // return VectorF16(_mm512_sqrt_ph(lenghtSq.v)); + // } + // } + + // constexpr static VectorF16 Length( + // VectorF16 A, + // VectorF16 C, + // VectorF16 E, + // VectorF16 G + // ) requires(Packing == 2) { + // VectorF16 lenghtSq = LengthSq(A, C, E, G); + // if constexpr(std::is_same_v) { + // return VectorF16(_mm_sqrt_ph(lenghtSq.v)); + // } else if constexpr(std::is_same_v) { + // return VectorF16(_mm256_sqrt_ph(lenghtSq.v)); + // } else { + // return VectorF16(_mm512_sqrt_ph(lenghtSq.v)); + // } + // } + + // constexpr static VectorF16 Length( + // VectorF16 A, + // VectorF16 E + // ) requires(Packing == 4) { + // VectorF16 lenghtSq = LengthSq(A, E); + // if constexpr(std::is_same_v) { + // return VectorF16(_mm_sqrt_ph(lenghtSq.v)); + // } else if constexpr(std::is_same_v) 
{ + // return VectorF16(_mm256_sqrt_ph(lenghtSq.v)); + // } else { + // return VectorF16(_mm512_sqrt_ph(lenghtSq.v)); + // } + // } + + // constexpr static VectorF16 LengthSq( + // VectorF16 A, + // VectorF16 B, + // VectorF16 C, + // VectorF16 D, + // VectorF16 E, + // VectorF16 F, + // VectorF16 G, + // VectorF16 H + // ) requires(Packing == 1) { + // return Dot(A, A, B, B, C, C, D, D, E, E, F, F, G, G, H, H); + // } + + // constexpr static VectorF16 LengthSq( + // VectorF16 A, + // VectorF16 C, + // VectorF16 E, + // VectorF16 G + // ) requires(Packing == 2) { + // return Dot(A, A, C, C, E, E, G, G); + // } + + // constexpr static VectorF16 LengthSq( + // VectorF16 A, + // VectorF16 E + // ) requires(Packing == 4) { + // return Dot(A, A, E, E); + // } + + // constexpr static VectorF16 Dot( + // VectorF16 A0, VectorF16 A1, + // VectorF16 B0, VectorF16 B1, + // VectorF16 C0, VectorF16 C1, + // VectorF16 D0, VectorF16 D1, + // VectorF16 E0, VectorF16 E1, + // VectorF16 F0, VectorF16 F1, + // VectorF16 G0, VectorF16 G1, + // VectorF16 H0, VectorF16 H1 + // ) requires(Packing == 1) { + // if constexpr(std::is_same_v) { + // __m128h mulA = _mm_mul_ph(A0.v, A1.v); + // __m128h mulB = _mm_mul_ph(B0.v, B1.v); + // __m128i row12Temp1 = _mm_unpacklo_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulB)); // A1 B1 A2 B2 A3 B3 A4 B4 + // __m128i row56Temp1 = _mm_unpackhi_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulB)); // A5 B5 A6 B6 A7 B7 A8 B8 + // __m128i row1TempTemp1 = row12Temp1; + // __m128i row5TempTemp1 = row56Temp1; + + // __m128h mulC = _mm_mul_ph(C0.v, C1.v); + // __m128h mulD = _mm_mul_ph(D0.v, D1.v); + // __m128i row34Temp1 = _mm_unpacklo_epi16(_mm_castph_si128(mulC), _mm_castph_si128(mulD)); // C1 D1 C2 D2 C3 D3 C4 D4 + // __m128i row78Temp1 = _mm_unpackhi_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulB)); // C5 D5 C6 D6 C7 D7 C8 D8 + + // row12Temp1 = _mm_unpacklo_epi16(row12Temp1, row34Temp1); // A1 C1 B1 D1 A2 C2 B2 D2 + // row34Temp1 = 
_mm_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 C3 B3 D3 A4 C4 B4 D4 + // row56Temp1 = _mm_unpacklo_epi16(row56Temp1, row78Temp1); // A5 C5 B5 D5 A6 C6 B6 D6 + // row78Temp1 = _mm_unpackhi_epi16(row5TempTemp1, row78Temp1); // A7 C7 B7 D7 A8 C8 B8 D8 + + // __m128h mulE = _mm_mul_ph(E0.v, E1.v); + // __m128h mulF = _mm_mul_ph(F0.v, F1.v); + // __m128i row12Temp2 = _mm_unpacklo_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulF)); //E1 F1 E2 F2 E3 F3 E4 F4 + // __m128i row56Temp2 = _mm_unpackhi_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulF)); //E5 F5 E6 F6 E7 F7 E8 F8 + // __m128i row1TempTemp2 = row12Temp2; + // __m128i row5TempTemp2 = row56Temp2; + + // __m128h mulG = _mm_mul_ph(G0.v, G1.v); + // __m128h mulH = _mm_mul_ph(H0.v, H1.v); + // __m128i row34Temp2 = _mm_unpacklo_epi16(_mm_castph_si128(mulG), _mm_castph_si128(mulH)); //G1 H1 G2 H2 G3 H3 G4 H4 + // __m128i row78Temp2 = _mm_unpackhi_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulF)); //G5 H5 G6 H6 G7 H7 G8 H8 + + // row12Temp2 = _mm_unpacklo_epi16(row12Temp2, row34Temp2); // E1 G1 F1 H1 E2 G2 F2 H2 + // row34Temp2 = _mm_unpackhi_epi16(row1TempTemp2, row34Temp2); // E3 G3 F3 H3 E4 G4 F4 H4 + // row56Temp2 = _mm_unpacklo_epi16(row56Temp2, row78Temp2); // E5 G5 F5 H5 E6 G6 F6 H6 + // row78Temp2 = _mm_unpackhi_epi16(row5TempTemp2, row78Temp2); // E7 G7 F7 H7 E8 G8 F8 H8 + + // __m128h row1 = _mm_castsi128_ph(_mm_unpackhi_epi16(row12Temp1, row12Temp2));// A1 E1 C1 G1 B1 F1 D1 H1 + // __m128h row2 = _mm_castsi128_ph(_mm_unpacklo_epi16(row12Temp1, row12Temp2));// A2 E2 C2 G2 B2 F2 D2 H2 + // __m128h row3 = _mm_castsi128_ph(_mm_unpackhi_epi16(row34Temp1, row34Temp2));// A3 E3 C3 G3 B3 F3 D3 H3 + // __m128h row4 = _mm_castsi128_ph(_mm_unpacklo_epi16(row34Temp1, row34Temp2));// A4 E4 C4 G4 B4 F4 D4 H4 + // __m128h row5 = _mm_castsi128_ph(_mm_unpackhi_epi16(row56Temp1, row56Temp2));// A5 E5 C5 G5 B5 F5 D5 H5 + // __m128h row6 = _mm_castsi128_ph(_mm_unpacklo_epi16(row56Temp1, row56Temp2));// A6 E6 C6 G6 
B6 F6 D6 H6 + // __m128h row7 = _mm_castsi128_ph(_mm_unpackhi_epi16(row78Temp1, row78Temp2));// A7 E7 C7 G7 B7 F7 D7 H7 + // __m128h row8 = _mm_castsi128_ph(_mm_unpacklo_epi16(row78Temp1, row78Temp2));// A8 E8 C8 G8 B8 F8 D8 H8 + + // row1 = _mm_add_ph(row1, row2); + // row1 = _mm_add_ph(row1, row3); + // row1 = _mm_add_ph(row1, row4); + // row1 = _mm_add_ph(row1, row5); + // row1 = _mm_add_ph(row1, row6); + // row1 = _mm_add_ph(row1, row7); + // row1 = _mm_add_ph(row1, row8); - return row1; - } else if constexpr(std::is_same_v) { - __m256h mulA = _mm256_mul_ph(A0.v, A1.v); - __m256h mulB = _mm256_mul_ph(B0.v, B1.v); - __m256i row12Temp1 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulB)); // A1 B1 A2 B2 A3 B3 A4 B4 - __m256i row56Temp1 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulB)); // A5 B5 A6 B6 A7 B7 A8 B8 - __m256i row1TempTemp1 = row12Temp1; - __m256i row5TempTemp1 = row56Temp1; + // return row1; + // } else if constexpr(std::is_same_v) { + // __m256h mulA = _mm256_mul_ph(A0.v, A1.v); + // __m256h mulB = _mm256_mul_ph(B0.v, B1.v); + // __m256i row12Temp1 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulB)); // A1 B1 A2 B2 A3 B3 A4 B4 + // __m256i row56Temp1 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulB)); // A5 B5 A6 B6 A7 B7 A8 B8 + // __m256i row1TempTemp1 = row12Temp1; + // __m256i row5TempTemp1 = row56Temp1; - __m256h mulC = _mm256_mul_ph(C0.v, C1.v); - __m256h mulD = _mm256_mul_ph(D0.v, D1.v); - __m256i row34Temp1 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulC), _mm256_castph_si256(mulD)); // C1 D1 C2 D2 C3 D3 C4 D4 - __m256i row78Temp1 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulB)); // C5 D5 C6 D6 C7 D7 C8 D8 + // __m256h mulC = _mm256_mul_ph(C0.v, C1.v); + // __m256h mulD = _mm256_mul_ph(D0.v, D1.v); + // __m256i row34Temp1 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulC), _mm256_castph_si256(mulD)); // C1 D1 
C2 D2 C3 D3 C4 D4 + // __m256i row78Temp1 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulB)); // C5 D5 C6 D6 C7 D7 C8 D8 - row12Temp1 = _mm256_unpacklo_epi16(row12Temp1, row34Temp1); // A1 C1 B1 D1 A2 C2 B2 D2 - row34Temp1 = _mm256_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 C3 B3 D3 A4 C4 B4 D4 - row56Temp1 = _mm256_unpacklo_epi16(row56Temp1, row78Temp1); // A5 C5 B5 D5 A6 C6 B6 D6 - row78Temp1 = _mm256_unpackhi_epi16(row5TempTemp1, row78Temp1); // A7 C7 B7 D7 A8 C8 B8 D8 + // row12Temp1 = _mm256_unpacklo_epi16(row12Temp1, row34Temp1); // A1 C1 B1 D1 A2 C2 B2 D2 + // row34Temp1 = _mm256_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 C3 B3 D3 A4 C4 B4 D4 + // row56Temp1 = _mm256_unpacklo_epi16(row56Temp1, row78Temp1); // A5 C5 B5 D5 A6 C6 B6 D6 + // row78Temp1 = _mm256_unpackhi_epi16(row5TempTemp1, row78Temp1); // A7 C7 B7 D7 A8 C8 B8 D8 - __m256h mulE = _mm256_mul_ph(E0.v, E1.v); - __m256h mulF = _mm256_mul_ph(F0.v, F1.v); - __m256i row12Temp2 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulE), _mm256_castph_si256(mulF)); //E1 F1 E2 F2 E3 F3 E4 F4 - __m256i row56Temp2 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulE), _mm256_castph_si256(mulF)); //E5 F5 E6 F6 E7 F7 E8 F8 - __m256i row1TempTemp2 = row12Temp2; - __m256i row5TempTemp2 = row56Temp2; + // __m256h mulE = _mm256_mul_ph(E0.v, E1.v); + // __m256h mulF = _mm256_mul_ph(F0.v, F1.v); + // __m256i row12Temp2 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulE), _mm256_castph_si256(mulF)); //E1 F1 E2 F2 E3 F3 E4 F4 + // __m256i row56Temp2 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulE), _mm256_castph_si256(mulF)); //E5 F5 E6 F6 E7 F7 E8 F8 + // __m256i row1TempTemp2 = row12Temp2; + // __m256i row5TempTemp2 = row56Temp2; - __m256h mulG = _mm256_mul_ph(G0.v, G1.v); - __m256h mulH = _mm256_mul_ph(H0.v, H1.v); - __m256i row34Temp2 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulG), _mm256_castph_si256(mulH)); //G1 H1 G2 H2 G3 H3 G4 H4 - __m256i row78Temp2 = 
_mm256_unpackhi_epi16(_mm256_castph_si256(mulE), _mm256_castph_si256(mulF)); //G5 H5 G6 H6 G7 H7 G8 H8 + // __m256h mulG = _mm256_mul_ph(G0.v, G1.v); + // __m256h mulH = _mm256_mul_ph(H0.v, H1.v); + // __m256i row34Temp2 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulG), _mm256_castph_si256(mulH)); //G1 H1 G2 H2 G3 H3 G4 H4 + // __m256i row78Temp2 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulE), _mm256_castph_si256(mulF)); //G5 H5 G6 H6 G7 H7 G8 H8 - row12Temp2 = _mm256_unpacklo_epi16(row12Temp2, row34Temp2); // E1 G1 F1 H1 E2 G2 F2 H2 - row34Temp2 = _mm256_unpackhi_epi16(row1TempTemp2, row34Temp2); // E3 G3 F3 H3 E4 G4 F4 H4 - row56Temp2 = _mm256_unpacklo_epi16(row56Temp2, row78Temp2); // E5 G5 F5 H5 E6 G6 F6 H6 - row78Temp2 = _mm256_unpackhi_epi16(row5TempTemp2, row78Temp2); // E7 G7 F7 H7 E8 G8 F8 H8 + // row12Temp2 = _mm256_unpacklo_epi16(row12Temp2, row34Temp2); // E1 G1 F1 H1 E2 G2 F2 H2 + // row34Temp2 = _mm256_unpackhi_epi16(row1TempTemp2, row34Temp2); // E3 G3 F3 H3 E4 G4 F4 H4 + // row56Temp2 = _mm256_unpacklo_epi16(row56Temp2, row78Temp2); // E5 G5 F5 H5 E6 G6 F6 H6 + // row78Temp2 = _mm256_unpackhi_epi16(row5TempTemp2, row78Temp2); // E7 G7 F7 H7 E8 G8 F8 H8 - __m256h row1 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row12Temp1, row12Temp2));// A1 E1 C1 G1 B1 F1 D1 H1 - __m256h row2 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row12Temp1, row12Temp2));// A2 E2 C2 G2 B2 F2 D2 H2 - __m256h row3 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row34Temp1, row34Temp2));// A3 E3 C3 G3 B3 F3 D3 H3 - __m256h row4 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row34Temp1, row34Temp2));// A4 E4 C4 G4 B4 F4 D4 H4 - __m256h row5 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row56Temp1, row56Temp2));// A5 E5 C5 G5 B5 F5 D5 H5 - __m256h row6 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row56Temp1, row56Temp2));// A6 E6 C6 G6 B6 F6 D6 H6 - __m256h row7 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row78Temp1, row78Temp2));// A7 E7 C7 G7 B7 F7 D7 H7 - __m256h row8 = 
_mm256_castsi256_ph(_mm256_unpacklo_epi16(row78Temp1, row78Temp2));// A8 E8 C8 G8 B8 F8 D8 H8 + // __m256h row1 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row12Temp1, row12Temp2));// A1 E1 C1 G1 B1 F1 D1 H1 + // __m256h row2 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row12Temp1, row12Temp2));// A2 E2 C2 G2 B2 F2 D2 H2 + // __m256h row3 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row34Temp1, row34Temp2));// A3 E3 C3 G3 B3 F3 D3 H3 + // __m256h row4 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row34Temp1, row34Temp2));// A4 E4 C4 G4 B4 F4 D4 H4 + // __m256h row5 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row56Temp1, row56Temp2));// A5 E5 C5 G5 B5 F5 D5 H5 + // __m256h row6 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row56Temp1, row56Temp2));// A6 E6 C6 G6 B6 F6 D6 H6 + // __m256h row7 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row78Temp1, row78Temp2));// A7 E7 C7 G7 B7 F7 D7 H7 + // __m256h row8 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row78Temp1, row78Temp2));// A8 E8 C8 G8 B8 F8 D8 H8 - row1 = _mm256_add_ph(row1, row2); - row1 = _mm256_add_ph(row1, row3); - row1 = _mm256_add_ph(row1, row4); - row1 = _mm256_add_ph(row1, row5); - row1 = _mm256_add_ph(row1, row6); - row1 = _mm256_add_ph(row1, row7); - row1 = _mm256_add_ph(row1, row8); + // row1 = _mm256_add_ph(row1, row2); + // row1 = _mm256_add_ph(row1, row3); + // row1 = _mm256_add_ph(row1, row4); + // row1 = _mm256_add_ph(row1, row5); + // row1 = _mm256_add_ph(row1, row6); + // row1 = _mm256_add_ph(row1, row7); + // row1 = _mm256_add_ph(row1, row8); - return row1; - } else { - __m512h mulA = _mm512_mul_ph(A0.v, A1.v); - __m512h mulB = _mm512_mul_ph(B0.v, B1.v); - __m512i row12Temp1 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulB)); // A1 B1 A2 B2 A3 B3 A4 B4 - __m512i row56Temp1 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulB)); // A5 B5 A6 B6 A7 B7 A8 B8 - __m512i row1TempTemp1 = row12Temp1; - __m512i row5TempTemp1 = row56Temp1; + // return row1; + // } 
else { + // __m512h mulA = _mm512_mul_ph(A0.v, A1.v); + // __m512h mulB = _mm512_mul_ph(B0.v, B1.v); + // __m512i row12Temp1 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulB)); // A1 B1 A2 B2 A3 B3 A4 B4 + // __m512i row56Temp1 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulB)); // A5 B5 A6 B6 A7 B7 A8 B8 + // __m512i row1TempTemp1 = row12Temp1; + // __m512i row5TempTemp1 = row56Temp1; - __m512h mulC = _mm512_mul_ph(C0.v, C1.v); - __m512h mulD = _mm512_mul_ph(D0.v, D1.v); - __m512i row34Temp1 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulC), _mm512_castph_si512(mulD)); // C1 D1 C2 D2 C3 D3 C4 D4 - __m512i row78Temp1 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulB)); // C5 D5 C6 D6 C7 D7 C8 D8 + // __m512h mulC = _mm512_mul_ph(C0.v, C1.v); + // __m512h mulD = _mm512_mul_ph(D0.v, D1.v); + // __m512i row34Temp1 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulC), _mm512_castph_si512(mulD)); // C1 D1 C2 D2 C3 D3 C4 D4 + // __m512i row78Temp1 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulB)); // C5 D5 C6 D6 C7 D7 C8 D8 - row12Temp1 = _mm512_unpacklo_epi16(row12Temp1, row34Temp1); // A1 C1 B1 D1 A2 C2 B2 D2 - row34Temp1 = _mm512_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 C3 B3 D3 A4 C4 B4 D4 - row56Temp1 = _mm512_unpacklo_epi16(row56Temp1, row78Temp1); // A5 C5 B5 D5 A6 C6 B6 D6 - row78Temp1 = _mm512_unpackhi_epi16(row5TempTemp1, row78Temp1); // A7 C7 B7 D7 A8 C8 B8 D8 + // row12Temp1 = _mm512_unpacklo_epi16(row12Temp1, row34Temp1); // A1 C1 B1 D1 A2 C2 B2 D2 + // row34Temp1 = _mm512_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 C3 B3 D3 A4 C4 B4 D4 + // row56Temp1 = _mm512_unpacklo_epi16(row56Temp1, row78Temp1); // A5 C5 B5 D5 A6 C6 B6 D6 + // row78Temp1 = _mm512_unpackhi_epi16(row5TempTemp1, row78Temp1); // A7 C7 B7 D7 A8 C8 B8 D8 - __m512h mulE = _mm512_mul_ph(E0.v, E1.v); - __m512h mulF = _mm512_mul_ph(F0.v, F1.v); - __m512i row12Temp2 = 
_mm512_unpacklo_epi16(_mm512_castph_si512(mulE), _mm512_castph_si512(mulF)); //E1 F1 E2 F2 E3 F3 E4 F4 - __m512i row56Temp2 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulE), _mm512_castph_si512(mulF)); //E5 F5 E6 F6 E7 F7 E8 F8 - __m512i row1TempTemp2 = row12Temp2; - __m512i row5TempTemp2 = row56Temp2; + // __m512h mulE = _mm512_mul_ph(E0.v, E1.v); + // __m512h mulF = _mm512_mul_ph(F0.v, F1.v); + // __m512i row12Temp2 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulE), _mm512_castph_si512(mulF)); //E1 F1 E2 F2 E3 F3 E4 F4 + // __m512i row56Temp2 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulE), _mm512_castph_si512(mulF)); //E5 F5 E6 F6 E7 F7 E8 F8 + // __m512i row1TempTemp2 = row12Temp2; + // __m512i row5TempTemp2 = row56Temp2; - __m512h mulG = _mm512_mul_ph(G0.v, G1.v); - __m512h mulH = _mm512_mul_ph(H0.v, H1.v); - __m512i row34Temp2 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulG), _mm512_castph_si512(mulH)); //G1 H1 G2 H2 G3 H3 G4 H4 - __m512i row78Temp2 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulE), _mm512_castph_si512(mulF)); //G5 H5 G6 H6 G7 H7 G8 H8 + // __m512h mulG = _mm512_mul_ph(G0.v, G1.v); + // __m512h mulH = _mm512_mul_ph(H0.v, H1.v); + // __m512i row34Temp2 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulG), _mm512_castph_si512(mulH)); //G1 H1 G2 H2 G3 H3 G4 H4 + // __m512i row78Temp2 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulE), _mm512_castph_si512(mulF)); //G5 H5 G6 H6 G7 H7 G8 H8 - row12Temp2 = _mm512_unpacklo_epi16(row12Temp2, row34Temp2); // E1 G1 F1 H1 E2 G2 F2 H2 - row34Temp2 = _mm512_unpackhi_epi16(row1TempTemp2, row34Temp2); // E3 G3 F3 H3 E4 G4 F4 H4 - row56Temp2 = _mm512_unpacklo_epi16(row56Temp2, row78Temp2); // E5 G5 F5 H5 E6 G6 F6 H6 - row78Temp2 = _mm512_unpackhi_epi16(row5TempTemp2, row78Temp2); // E7 G7 F7 H7 E8 G8 F8 H8 + // row12Temp2 = _mm512_unpacklo_epi16(row12Temp2, row34Temp2); // E1 G1 F1 H1 E2 G2 F2 H2 + // row34Temp2 = _mm512_unpackhi_epi16(row1TempTemp2, row34Temp2); // E3 G3 F3 H3 E4 G4 F4 H4 + // row56Temp2 = 
_mm512_unpacklo_epi16(row56Temp2, row78Temp2); // E5 G5 F5 H5 E6 G6 F6 H6 + // row78Temp2 = _mm512_unpackhi_epi16(row5TempTemp2, row78Temp2); // E7 G7 F7 H7 E8 G8 F8 H8 - __m512h row1 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row12Temp1, row12Temp2));// A1 E1 C1 G1 B1 F1 D1 H1 - __m512h row2 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row12Temp1, row12Temp2));// A2 E2 C2 G2 B2 F2 D2 H2 - __m512h row3 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row34Temp1, row34Temp2));// A3 E3 C3 G3 B3 F3 D3 H3 - __m512h row4 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row34Temp1, row34Temp2));// A4 E4 C4 G4 B4 F4 D4 H4 - __m512h row5 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row56Temp1, row56Temp2));// A5 E5 C5 G5 B5 F5 D5 H5 - __m512h row6 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row56Temp1, row56Temp2));// A6 E6 C6 G6 B6 F6 D6 H6 - __m512h row7 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row78Temp1, row78Temp2));// A7 E7 C7 G7 B7 F7 D7 H7 - __m512h row8 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row78Temp1, row78Temp2));// A8 E8 C8 G8 B8 F8 D8 H8 + // __m512h row1 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row12Temp1, row12Temp2));// A1 E1 C1 G1 B1 F1 D1 H1 + // __m512h row2 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row12Temp1, row12Temp2));// A2 E2 C2 G2 B2 F2 D2 H2 + // __m512h row3 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row34Temp1, row34Temp2));// A3 E3 C3 G3 B3 F3 D3 H3 + // __m512h row4 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row34Temp1, row34Temp2));// A4 E4 C4 G4 B4 F4 D4 H4 + // __m512h row5 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row56Temp1, row56Temp2));// A5 E5 C5 G5 B5 F5 D5 H5 + // __m512h row6 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row56Temp1, row56Temp2));// A6 E6 C6 G6 B6 F6 D6 H6 + // __m512h row7 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row78Temp1, row78Temp2));// A7 E7 C7 G7 B7 F7 D7 H7 + // __m512h row8 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row78Temp1, row78Temp2));// A8 E8 C8 G8 B8 F8 D8 H8 - row1 = _mm512_add_ph(row1, 
row2); - row1 = _mm512_add_ph(row1, row3); - row1 = _mm512_add_ph(row1, row4); - row1 = _mm512_add_ph(row1, row5); - row1 = _mm512_add_ph(row1, row6); - row1 = _mm512_add_ph(row1, row7); - row1 = _mm512_add_ph(row1, row8); + // row1 = _mm512_add_ph(row1, row2); + // row1 = _mm512_add_ph(row1, row3); + // row1 = _mm512_add_ph(row1, row4); + // row1 = _mm512_add_ph(row1, row5); + // row1 = _mm512_add_ph(row1, row6); + // row1 = _mm512_add_ph(row1, row7); + // row1 = _mm512_add_ph(row1, row8); - return row1; - } - } + // return row1; + // } + // } - constexpr static VectorF16 Dot( - VectorF16 A0, VectorF16 A1, - VectorF16 C0, VectorF16 C1, - VectorF16 E0, VectorF16 E1, - VectorF16 G0, VectorF16 G1 - ) requires(Packing == 2) { - if constexpr(std::is_same_v) { - __m128h mulA = _mm_mul_ph(A0.v, A1.v); - __m128h mulC = _mm_mul_ph(C0.v, C1.v); - __m128i row12Temp1 = _mm_unpacklo_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulC)); // A1 C1 A2 C2 A3 C3 A4 C4 - __m128i row34Temp1 = _mm_unpackhi_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulC)); // B1 D1 B2 D2 B3 D3 B4 D4 - __m128i row1TempTemp1 = row12Temp1; - __m128i row5TempTemp1 = row34Temp1; + // constexpr static VectorF16 Dot( + // VectorF16 A0, VectorF16 A1, + // VectorF16 C0, VectorF16 C1, + // VectorF16 E0, VectorF16 E1, + // VectorF16 G0, VectorF16 G1 + // ) requires(Packing == 2) { + // if constexpr(std::is_same_v) { + // __m128h mulA = _mm_mul_ph(A0.v, A1.v); + // __m128h mulC = _mm_mul_ph(C0.v, C1.v); + // __m128i row12Temp1 = _mm_unpacklo_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulC)); // A1 C1 A2 C2 A3 C3 A4 C4 + // __m128i row34Temp1 = _mm_unpackhi_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulC)); // B1 D1 B2 D2 B3 D3 B4 D4 + // __m128i row1TempTemp1 = row12Temp1; + // __m128i row5TempTemp1 = row34Temp1; - __m128h mulE = _mm_mul_ph(E0.v, E1.v); - __m128h mulG = _mm_mul_ph(G0.v, G1.v); - __m128i row12Temp2 = _mm_unpacklo_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulG)); // E1 G1 E2 G2 E3 
G3 E4 G4 - __m128i row34Temp2 = _mm_unpackhi_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulG)); // F1 H1 F2 H2 F3 H3 F4 H4 + // __m128h mulE = _mm_mul_ph(E0.v, E1.v); + // __m128h mulG = _mm_mul_ph(G0.v, G1.v); + // __m128i row12Temp2 = _mm_unpacklo_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulG)); // E1 G1 E2 G2 E3 G3 E4 G4 + // __m128i row34Temp2 = _mm_unpackhi_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulG)); // F1 H1 F2 H2 F3 H3 F4 H4 - row12Temp1 = _mm_unpacklo_epi16(row12Temp1, row12Temp2); // A1 E1 C1 G1 A2 E2 C2 G2 - row12Temp2 = _mm_unpacklo_epi16(row34Temp1, row34Temp2); // B1 F1 D1 H1 B2 F2 D2 H2 - row34Temp1 = _mm_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 E3 C3 G3 A4 E4 C4 G4 - row34Temp2 = _mm_unpackhi_epi16(row5TempTemp1, row34Temp2); // B3 F3 D3 H3 B4 F4 D4 H4 + // row12Temp1 = _mm_unpacklo_epi16(row12Temp1, row12Temp2); // A1 E1 C1 G1 A2 E2 C2 G2 + // row12Temp2 = _mm_unpacklo_epi16(row34Temp1, row34Temp2); // B1 F1 D1 H1 B2 F2 D2 H2 + // row34Temp1 = _mm_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 E3 C3 G3 A4 E4 C4 G4 + // row34Temp2 = _mm_unpackhi_epi16(row5TempTemp1, row34Temp2); // B3 F3 D3 H3 B4 F4 D4 H4 - __m128h row1 = _mm_castsi128_ph(_mm_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 E1 F1 C1 D1 G1 H1 - __m128h row2 = _mm_castsi128_ph(_mm_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 E2 F2 C2 D2 G2 H2 - __m128h row3 = _mm_castsi128_ph(_mm_unpacklo_epi16(row34Temp1, row34Temp2));// A3 B3 E3 F3 C3 D3 G3 H3 - __m128h row4 = _mm_castsi128_ph(_mm_unpackhi_epi16(row34Temp1, row34Temp2));// A4 B4 E4 F4 C4 D4 G4 H4 + // __m128h row1 = _mm_castsi128_ph(_mm_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 E1 F1 C1 D1 G1 H1 + // __m128h row2 = _mm_castsi128_ph(_mm_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 E2 F2 C2 D2 G2 H2 + // __m128h row3 = _mm_castsi128_ph(_mm_unpacklo_epi16(row34Temp1, row34Temp2));// A3 B3 E3 F3 C3 D3 G3 H3 + // __m128h row4 = _mm_castsi128_ph(_mm_unpackhi_epi16(row34Temp1, row34Temp2));// A4 
B4 E4 F4 C4 D4 G4 H4 - row1 = _mm_add_ph(row1, row2); - row1 = _mm_add_ph(row1, row3); - row1 = _mm_add_ph(row1, row4); + // row1 = _mm_add_ph(row1, row2); + // row1 = _mm_add_ph(row1, row3); + // row1 = _mm_add_ph(row1, row4); - return row1; - } else if constexpr(std::is_same_v) { - __m256h mulA = _mm256_mul_ph(A0.v, A1.v); - __m256h mulC = _mm256_mul_ph(C0.v, C1.v); - __m256i row12Temp1 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulC)); // A1 C1 A2 C2 A3 C3 A4 C4 - __m256i row34Temp1 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulC)); // B1 D1 B2 D2 B3 D3 B4 D4 - __m256i row1TempTemp1 = row12Temp1; - __m256i row5TempTemp1 = row34Temp1; + // return row1; + // } else if constexpr(std::is_same_v) { + // __m256h mulA = _mm256_mul_ph(A0.v, A1.v); + // __m256h mulC = _mm256_mul_ph(C0.v, C1.v); + // __m256i row12Temp1 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulC)); // A1 C1 A2 C2 A3 C3 A4 C4 + // __m256i row34Temp1 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulC)); // B1 D1 B2 D2 B3 D3 B4 D4 + // __m256i row1TempTemp1 = row12Temp1; + // __m256i row5TempTemp1 = row34Temp1; - __m256h mulE = _mm256_mul_ph(E0.v, E1.v); - __m256h mulG = _mm256_mul_ph(G0.v, G1.v); - __m256i row12Temp2 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulE), _mm256_castph_si256(mulG)); // E1 G1 E2 G2 E3 G3 E4 G4 - __m256i row34Temp2 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulE), _mm256_castph_si256(mulG)); // F1 H1 F2 H2 F3 H3 F4 H4 + // __m256h mulE = _mm256_mul_ph(E0.v, E1.v); + // __m256h mulG = _mm256_mul_ph(G0.v, G1.v); + // __m256i row12Temp2 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulE), _mm256_castph_si256(mulG)); // E1 G1 E2 G2 E3 G3 E4 G4 + // __m256i row34Temp2 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulE), _mm256_castph_si256(mulG)); // F1 H1 F2 H2 F3 H3 F4 H4 - row12Temp1 = _mm256_unpacklo_epi16(row12Temp1, row12Temp2); // A1 E1 C1 G1 A2 E2 C2 G2 - row12Temp2 = 
_mm256_unpacklo_epi16(row34Temp1, row34Temp2); // B1 F1 D1 H1 B2 F2 D2 H2 - row34Temp1 = _mm256_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 E3 C3 G3 A4 E4 C4 G4 - row34Temp2 = _mm256_unpackhi_epi16(row5TempTemp1, row34Temp2); // B3 F3 D3 H3 B4 F4 D4 H4 + // row12Temp1 = _mm256_unpacklo_epi16(row12Temp1, row12Temp2); // A1 E1 C1 G1 A2 E2 C2 G2 + // row12Temp2 = _mm256_unpacklo_epi16(row34Temp1, row34Temp2); // B1 F1 D1 H1 B2 F2 D2 H2 + // row34Temp1 = _mm256_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 E3 C3 G3 A4 E4 C4 G4 + // row34Temp2 = _mm256_unpackhi_epi16(row5TempTemp1, row34Temp2); // B3 F3 D3 H3 B4 F4 D4 H4 - __m256h row1 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 E1 F1 C1 D1 G1 H1 - __m256h row2 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 E2 F2 C2 D2 G2 H2 - __m256h row3 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row34Temp1, row34Temp2));// A3 B3 E3 F3 C3 D3 G3 H3 - __m256h row4 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row34Temp1, row34Temp2));// A4 B4 E4 F4 C4 D4 G4 H4 + // __m256h row1 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 E1 F1 C1 D1 G1 H1 + // __m256h row2 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 E2 F2 C2 D2 G2 H2 + // __m256h row3 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row34Temp1, row34Temp2));// A3 B3 E3 F3 C3 D3 G3 H3 + // __m256h row4 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row34Temp1, row34Temp2));// A4 B4 E4 F4 C4 D4 G4 H4 - row1 = _mm256_add_ph(row1, row2); - row1 = _mm256_add_ph(row1, row3); - row1 = _mm256_add_ph(row1, row4); + // row1 = _mm256_add_ph(row1, row2); + // row1 = _mm256_add_ph(row1, row3); + // row1 = _mm256_add_ph(row1, row4); - return row1; - } else { - __m512h mulA = _mm512_mul_ph(A0.v, A1.v); - __m512h mulC = _mm512_mul_ph(C0.v, C1.v); - __m512i row12Temp1 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulC)); // A1 C1 A2 C2 A3 C3 A4 C4 - 
__m512i row34Temp1 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulC)); // B1 D1 B2 D2 B3 D3 B4 D4 - __m512i row1TempTemp1 = row12Temp1; - __m512i row5TempTemp1 = row34Temp1; + // return row1; + // } else { + // __m512h mulA = _mm512_mul_ph(A0.v, A1.v); + // __m512h mulC = _mm512_mul_ph(C0.v, C1.v); + // __m512i row12Temp1 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulC)); // A1 C1 A2 C2 A3 C3 A4 C4 + // __m512i row34Temp1 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulC)); // B1 D1 B2 D2 B3 D3 B4 D4 + // __m512i row1TempTemp1 = row12Temp1; + // __m512i row5TempTemp1 = row34Temp1; - __m512h mulE = _mm512_mul_ph(E0.v, E1.v); - __m512h mulG = _mm512_mul_ph(G0.v, G1.v); - __m512i row12Temp2 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulE), _mm512_castph_si512(mulG)); // E1 G1 E2 G2 E3 G3 E4 G4 - __m512i row34Temp2 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulE), _mm512_castph_si512(mulG)); // F1 H1 F2 H2 F3 H3 F4 H4 + // __m512h mulE = _mm512_mul_ph(E0.v, E1.v); + // __m512h mulG = _mm512_mul_ph(G0.v, G1.v); + // __m512i row12Temp2 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulE), _mm512_castph_si512(mulG)); // E1 G1 E2 G2 E3 G3 E4 G4 + // __m512i row34Temp2 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulE), _mm512_castph_si512(mulG)); // F1 H1 F2 H2 F3 H3 F4 H4 - row12Temp1 = _mm512_unpacklo_epi16(row12Temp1, row12Temp2); // A1 E1 C1 G1 A2 E2 C2 G2 - row12Temp2 = _mm512_unpacklo_epi16(row34Temp1, row34Temp2); // B1 F1 D1 H1 B2 F2 D2 H2 - row34Temp1 = _mm512_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 E3 C3 G3 A4 E4 C4 G4 - row34Temp2 = _mm512_unpackhi_epi16(row5TempTemp1, row34Temp2); // B3 F3 D3 H3 B4 F4 D4 H4 + // row12Temp1 = _mm512_unpacklo_epi16(row12Temp1, row12Temp2); // A1 E1 C1 G1 A2 E2 C2 G2 + // row12Temp2 = _mm512_unpacklo_epi16(row34Temp1, row34Temp2); // B1 F1 D1 H1 B2 F2 D2 H2 + // row34Temp1 = _mm512_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 E3 C3 G3 A4 E4 
C4 G4 + // row34Temp2 = _mm512_unpackhi_epi16(row5TempTemp1, row34Temp2); // B3 F3 D3 H3 B4 F4 D4 H4 - __m512h row1 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 E1 F1 C1 D1 G1 H1 - __m512h row2 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 E2 F2 C2 D2 G2 H2 - __m512h row3 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row34Temp1, row34Temp2));// A3 B3 E3 F3 C3 D3 G3 H3 - __m512h row4 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row34Temp1, row34Temp2));// A4 B4 E4 F4 C4 D4 G4 H4 + // __m512h row1 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 E1 F1 C1 D1 G1 H1 + // __m512h row2 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 E2 F2 C2 D2 G2 H2 + // __m512h row3 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row34Temp1, row34Temp2));// A3 B3 E3 F3 C3 D3 G3 H3 + // __m512h row4 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row34Temp1, row34Temp2));// A4 B4 E4 F4 C4 D4 G4 H4 - row1 = _mm512_add_ph(row1, row2); - row1 = _mm512_add_ph(row1, row3); - row1 = _mm512_add_ph(row1, row4); + // row1 = _mm512_add_ph(row1, row2); + // row1 = _mm512_add_ph(row1, row3); + // row1 = _mm512_add_ph(row1, row4); - return row1; - } - } + // return row1; + // } + // } - constexpr static VectorF16 Dot( - VectorF16 A0, VectorF16 A1, - VectorF16 E0, VectorF16 E1 - ) requires(Packing == 4) { - if constexpr(std::is_same_v) { - __m128h mulA = _mm_mul_ph(A0.v, A1.v); - __m128h mulE = _mm_mul_ph(E0.v, E1.v); - __m128i row12Temp1 = _mm_unpacklo_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulE)); // A1 E1 A2 E2 B1 F1 B2 F2 - __m128i row12Temp2 = _mm_unpackhi_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulE)); // C1 G1 C2 G2 D1 H1 D2 H2 - __m128i row12Temp1Temp = row12Temp1; + // constexpr static VectorF16 Dot( + // VectorF16 A0, VectorF16 A1, + // VectorF16 E0, VectorF16 E1 + // ) requires(Packing == 4) { + // if constexpr(std::is_same_v) { + // __m128h mulA = _mm_mul_ph(A0.v, 
A1.v); + // __m128h mulE = _mm_mul_ph(E0.v, E1.v); + // __m128i row12Temp1 = _mm_unpacklo_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulE)); // A1 E1 A2 E2 B1 F1 B2 F2 + // __m128i row12Temp2 = _mm_unpackhi_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulE)); // C1 G1 C2 G2 D1 H1 D2 H2 + // __m128i row12Temp1Temp = row12Temp1; - row12Temp1 = _mm_unpacklo_epi16(row12Temp1, row12Temp2); // A1 C1 E1 G1 A2 C2 E2 G2 - row12Temp2 = _mm_unpackhi_epi16(row12Temp1Temp, row12Temp2); // B1 D1 F1 H1 B2 D2 F2 H2 + // row12Temp1 = _mm_unpacklo_epi16(row12Temp1, row12Temp2); // A1 C1 E1 G1 A2 C2 E2 G2 + // row12Temp2 = _mm_unpackhi_epi16(row12Temp1Temp, row12Temp2); // B1 D1 F1 H1 B2 D2 F2 H2 - __m128h row1 = _mm_castsi128_ph(_mm_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 C1 D1 E1 F1 G1 H1 - __m128h row2 = _mm_castsi128_ph(_mm_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 C2 D2 E2 F2 G2 H2 + // __m128h row1 = _mm_castsi128_ph(_mm_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 C1 D1 E1 F1 G1 H1 + // __m128h row2 = _mm_castsi128_ph(_mm_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 C2 D2 E2 F2 G2 H2 - return _mm_add_ph(row1, row2); - } else if constexpr(std::is_same_v) { - __m256h mulA = _mm256_mul_ph(A0.v, A1.v); - __m256h mulE = _mm256_mul_ph(E0.v, E1.v); - __m256i row12Temp1 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulE)); // A1 E1 A2 E2 B1 F1 B2 F2 - __m256i row12Temp2 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulE)); // C1 G1 C2 G2 D1 H1 D2 H2 - __m256i row12Temp1Temp = row12Temp1; + // return _mm_add_ph(row1, row2); + // } else if constexpr(std::is_same_v) { + // __m256h mulA = _mm256_mul_ph(A0.v, A1.v); + // __m256h mulE = _mm256_mul_ph(E0.v, E1.v); + // __m256i row12Temp1 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulE)); // A1 E1 A2 E2 B1 F1 B2 F2 + // __m256i row12Temp2 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulE)); // C1 G1 C2 G2 D1 H1 
D2 H2 + // __m256i row12Temp1Temp = row12Temp1; - row12Temp1 = _mm256_unpacklo_epi16(row12Temp1, row12Temp2); // A1 C1 E1 G1 A2 C2 E2 G2 - row12Temp2 = _mm256_unpackhi_epi16(row12Temp1Temp, row12Temp2); // B1 D1 F1 H1 B2 D2 F2 H2 + // row12Temp1 = _mm256_unpacklo_epi16(row12Temp1, row12Temp2); // A1 C1 E1 G1 A2 C2 E2 G2 + // row12Temp2 = _mm256_unpackhi_epi16(row12Temp1Temp, row12Temp2); // B1 D1 F1 H1 B2 D2 F2 H2 - __m256h row1 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 C1 D1 E1 F1 G1 H1 - __m256h row2 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 C2 D2 E2 F2 G2 H2 + // __m256h row1 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 C1 D1 E1 F1 G1 H1 + // __m256h row2 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 C2 D2 E2 F2 G2 H2 - return _mm256_add_ph(row1, row2); - } else { - __m512h mulA = _mm512_mul_ph(A0.v, A1.v); - __m512h mulE = _mm512_mul_ph(E0.v, E1.v); - __m512i row12Temp1 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulE)); // A1 E1 A2 E2 B1 F1 B2 F2 - __m512i row12Temp2 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulE)); // C1 G1 C2 G2 D1 H1 D2 H2 - __m512i row12Temp1Temp = row12Temp1; + // return _mm256_add_ph(row1, row2); + // } else { + // __m512h mulA = _mm512_mul_ph(A0.v, A1.v); + // __m512h mulE = _mm512_mul_ph(E0.v, E1.v); + // __m512i row12Temp1 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulE)); // A1 E1 A2 E2 B1 F1 B2 F2 + // __m512i row12Temp2 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulE)); // C1 G1 C2 G2 D1 H1 D2 H2 + // __m512i row12Temp1Temp = row12Temp1; - row12Temp1 = _mm512_unpacklo_epi16(row12Temp1, row12Temp2); // A1 C1 E1 G1 A2 C2 E2 G2 - row12Temp2 = _mm512_unpackhi_epi16(row12Temp1Temp, row12Temp2); // B1 D1 F1 H1 B2 D2 F2 H2 + // row12Temp1 = _mm512_unpacklo_epi16(row12Temp1, row12Temp2); // A1 
C1 E1 G1 A2 C2 E2 G2 + // row12Temp2 = _mm512_unpackhi_epi16(row12Temp1Temp, row12Temp2); // B1 D1 F1 H1 B2 D2 F2 H2 - __m512h row1 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 C1 D1 E1 F1 G1 H1 - __m512h row2 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 C2 D2 E2 F2 G2 H2 + // __m512h row1 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 C1 D1 E1 F1 G1 H1 + // __m512h row2 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 C2 D2 E2 F2 G2 H2 - return _mm512_add_ph(row1, row2); - } - } + // return _mm512_add_ph(row1, row2); + // } + // } - template - constexpr static VectorF16 Blend(VectorF16 a, VectorF16 b) { - if constexpr(std::is_same_v) { - constexpr std::uint8_t val = - (A & 1) | - ((B & 1) << 1) | - ((C & 1) << 2) | - ((D & 1) << 3) | - ((E & 1) << 4) | - ((F & 1) << 5) | - ((G & 1) << 6) | - ((H & 1) << 7); - return _mm_castsi128_ph(_mm_blend_epi16(_mm_castph_si128(a.v), _mm_castph_si128(b), val)); - } else if constexpr(std::is_same_v) { - constexpr std::uint8_t val = - (A & 1) | - ((B & 1) << 1) | - ((C & 1) << 2) | - ((D & 1) << 3) | - ((E & 1) << 4) | - ((F & 1) << 5) | - ((G & 1) << 6) | - ((H & 1) << 7); - return _mm256_castsi256_ph(_mm256_blend_epi16(_mm256_castph_si256(a.v), _mm256_castph_si256(b), val)); - } else { - constexpr std::uint8_t byte = - (A & 1) | - ((B & 1) << 1) | - ((C & 1) << 2) | - ((D & 1) << 3) | - ((E & 1) << 4) | - ((F & 1) << 5) | - ((G & 1) << 6) | - ((H & 1) << 7); + // template + // constexpr static VectorF16 Blend(VectorF16 a, VectorF16 b) { + // if constexpr(std::is_same_v) { + // constexpr std::uint8_t val = + // (A & 1) | + // ((B & 1) << 1) | + // ((C & 1) << 2) | + // ((D & 1) << 3) | + // ((E & 1) << 4) | + // ((F & 1) << 5) | + // ((G & 1) << 6) | + // ((H & 1) << 7); + // return _mm_castsi128_ph(_mm_blend_epi16(_mm_castph_si128(a.v), _mm_castph_si128(b), val)); + // } else if 
constexpr(std::is_same_v) { + // constexpr std::uint8_t val = + // (A & 1) | + // ((B & 1) << 1) | + // ((C & 1) << 2) | + // ((D & 1) << 3) | + // ((E & 1) << 4) | + // ((F & 1) << 5) | + // ((G & 1) << 6) | + // ((H & 1) << 7); + // return _mm256_castsi256_ph(_mm256_blend_epi16(_mm256_castph_si256(a.v), _mm256_castph_si256(b), val)); + // } else { + // constexpr std::uint8_t byte = + // (A & 1) | + // ((B & 1) << 1) | + // ((C & 1) << 2) | + // ((D & 1) << 3) | + // ((E & 1) << 4) | + // ((F & 1) << 5) | + // ((G & 1) << 6) | + // ((H & 1) << 7); - constexpr std::uint32_t val = byte * 0x01010101u; - return _mm512_castsi512_ph(_mm512_mask_blend_epi16(val, _mm512_castph_si512(a.v), _mm512_castph_si512(b))); - } - } + // constexpr std::uint32_t val = byte * 0x01010101u; + // return _mm512_castsi512_ph(_mm512_mask_blend_epi16(val, _mm512_castph_si512(a.v), _mm512_castph_si512(b))); + // } + // } - constexpr static VectorF16 Rotate(VectorF16<3, 2, Repeats> v, VectorF16<4, 2, Repeats> q) requires(Len == 3 && Packing == 2) { - VectorF16<3, 2, Repeats> qv(q.v); - VectorF16 t = Cross(qv, v) * _Float16(2); - return v + t * q.template Shuffle<3,3,3,3,7,7,7,7>(); + Cross(qv, t); - } + // template + // constexpr static VectorF16 BlendPacked(VectorF16 a, VectorF16 b) requires(Packing == 2) { + // if constexpr(std::is_same_v) { + // constexpr std::uint8_t val = + // (A & 1) | + // ((B & 1) << 1) | + // ((C & 1) << 2) | + // ((D & 1) << 3) | + // ((A & 1) << 4) | + // ((B & 1) << 5) | + // ((C & 1) << 6) | + // ((D & 1) << 7); + // return _mm_castsi128_ph(_mm_blend_epi16(_mm_castph_si128(a.v), _mm_castph_si128(b), val)); + // } else if constexpr(std::is_same_v) { + // constexpr std::uint8_t val = + // (A & 1) | + // ((B & 1) << 1) | + // ((C & 1) << 2) | + // ((D & 1) << 3) | + // ((A & 1) << 4) | + // ((B & 1) << 5) | + // ((C & 1) << 6) | + // ((D & 1) << 7); + // return _mm256_castsi256_ph(_mm256_blend_epi16(_mm256_castph_si256(a.v), _mm256_castph_si256(b), val)); + // } else 
{ + // constexpr std::uint8_t val = + // (A & 1) | + // ((B & 1) << 1) | + // ((C & 1) << 2) | + // ((D & 1) << 3) | + // ((A & 1) << 4) | + // ((B & 1) << 5) | + // ((C & 1) << 6) | + // ((D & 1) << 7); - constexpr static VectorF16<4, 2, Repeats> RotatePivot(VectorF16<3, 2, Repeats> v, VectorF16<4, 2, Repeats> q, VectorF16<3, 2, Repeats> pivot) requires(Len == 3 && Packing == 2) { - VectorF16 translated = v - pivot; - VectorF16<3, 2, Repeats> qv(q.v); - VectorF16 t = Cross(qv, translated) * _Float16(2); - VectorF16 rotated = translated + t * q.template Shuffle<3,3,3,3,7,7,7,7>() + Cross(qv, t); - return rotated + pivot; - } + // constexpr std::uint32_t val = byte * 0x01010101u; + // return _mm512_castsi512_ph(_mm512_mask_blend_epi16(val, _mm512_castph_si512(a.v), _mm512_castph_si512(b))); + // } + // } - constexpr static VectorF16<4, 2, Repeats> QuanternionFromEuler(VectorF16<3, 2, Repeats> EulerHalf) requires(Len == 3 && Packing == 2) { - VectorF16<3, 2, Repeats> sin = EulerHalf.Sin(); - VectorF16<3, 2, Repeats> cos = EulerHalf.Cos(); + // template + // constexpr static VectorF16 BlendPacked(VectorF16 a, VectorF16 b) requires(Packing == 4) { + // if constexpr(std::is_same_v) { + // constexpr std::uint8_t val = + // (A & 1) | + // ((B & 1) << 1) | + // ((C & 1) << 2) | + // ((D & 1) << 3) | + // ((A & 1) << 4) | + // ((B & 1) << 5) | + // ((C & 1) << 6) | + // ((D & 1) << 7); + // return _mm_castsi128_ph(_mm_blend_epi16(_mm_castph_si128(a.v), _mm_castph_si128(b), val)); + // } else if constexpr(std::is_same_v) { + // constexpr std::uint8_t val = + // (A & 1) | + // ((B & 1) << 1) | + // ((C & 1) << 2) | + // ((D & 1) << 3) | + // ((A & 1) << 4) | + // ((B & 1) << 5) | + // ((C & 1) << 6) | + // ((D & 1) << 7); + // return _mm256_castsi256_ph(_mm256_blend_epi16(_mm256_castph_si256(a.v), _mm256_castph_si256(b), val)); + // } else { + // constexpr std::uint8_t val = + // (A & 1) | + // ((B & 1) << 1) | + // ((C & 1) << 2) | + // ((D & 1) << 3) | + // ((A & 1) << 4) | 
+ // ((B & 1) << 5) | + // ((C & 1) << 6) | + // ((D & 1) << 7); - VectorF16<3, 2, Repeats> row1 = cos.template Shuffle<0,0,0,0,4,4,4,4>(); - row1 = VectorF16<3, 2, Repeats>::Blend<0,1,1,1, 0,1,1,1>(sin, row1); + // constexpr std::uint32_t val = byte * 0x01010101u; + // return _mm512_castsi512_ph(_mm512_mask_blend_epi16(val, _mm512_castph_si512(a.v), _mm512_castph_si512(b))); + // } + // } - VectorF16<3, 2, Repeats> row2 = cos.template Shuffle<1,1,1,1,5,5,5,5>(); - row2 = VectorF16<3, 2, Repeats>::Blend<1,0,1,1, 1,0,1,1>(sin, row2); + // constexpr static VectorF16 Rotate(VectorF16<3, 2, Repeats> v, VectorF16<4, 2, Repeats> q) requires(Len == 3 && Packing == 2) { + // VectorF16<3, 2, Repeats> qv(q.v); + // VectorF16 t = Cross(qv, v) * _Float16(2); + // return v + t * q.template Shuffle<3,3,3,3,7,7,7,7>(); + Cross(qv, t); + // } - row1 = row2; + // constexpr static VectorF16<4, 2, Repeats> RotatePivot(VectorF16<3, 2, Repeats> v, VectorF16<4, 2, Repeats> q, VectorF16<3, 2, Repeats> pivot) requires(Len == 3 && Packing == 2) { + // VectorF16 translated = v - pivot; + // VectorF16<3, 2, Repeats> qv(q.v); + // VectorF16 t = Cross(qv, translated) * _Float16(2); + // VectorF16 rotated = translated + t * q.template Shuffle<3,3,3,3,7,7,7,7>() + Cross(qv, t); + // return rotated + pivot; + // } - VectorF16<3, 2, Repeats> row3 = cos.template Shuffle<2,2,2,2,6,6,6,6>(); - row3 = VectorF16<3, 2, Repeats>::Blend<1,1,0,1, 1,1,0,1>(sin, row3); + // constexpr static VectorF16<4, 2, Repeats> QuanternionFromEuler(VectorF16<3, 2, Repeats> EulerHalf) requires(Len == 3 && Packing == 2) { + // VectorF16<3, 2, Repeats> sin = EulerHalf.Sin(); + // VectorF16<3, 2, Repeats> cos = EulerHalf.Cos(); - VectorF16<3, 2, Repeats> row4 = sin.template Shuffle<0,0,0,0,4,4,4,4>(); - row4 = VectorF16<3, 2, Repeats>::Blend<1,0,0,0, 1,0,0,0>(sin, row4); + // VectorF16<3, 2, Repeats> row1 = cos.template Shuffle<0,0,0,0,4,4,4,4>(); + // row1 = VectorF16<3, 2, Repeats>::Blend<0,1,1,1, 0,1,1,1>(sin, row1); + + 
// VectorF16<3, 2, Repeats> row2 = cos.template Shuffle<1,1,1,1,5,5,5,5>(); + // row2 = VectorF16<3, 2, Repeats>::Blend<1,0,1,1, 1,0,1,1>(sin, row2); + + // row1 = row2; + + // VectorF16<3, 2, Repeats> row3 = cos.template Shuffle<2,2,2,2,6,6,6,6>(); + // row3 = VectorF16<3, 2, Repeats>::Blend<1,1,0,1, 1,1,0,1>(sin, row3); + + // VectorF16<3, 2, Repeats> row4 = sin.template Shuffle<0,0,0,0,4,4,4,4>(); + // row4 = VectorF16<3, 2, Repeats>::Blend<1,0,0,0, 1,0,0,0>(sin, row4); - if constexpr(std::is_same_v) { - constexpr std::uint64_t mask[] {0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000}; - __m128i sign_mask = _mm_load_si128(reinterpret_cast(mask)); - row4.v = (_mm_castsi128_ph(_mm_xor_si128(sign_mask, _mm_castph_si128(row4.v)))); - } else if constexpr(std::is_same_v) { - constexpr std::uint64_t mask[] {0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000}; - __m256i sign_mask = _mm256_load_si256(reinterpret_cast(mask)); - row4.v = (_mm256_castsi256_ph(_mm256_xor_si256(sign_mask, _mm256_castph_si256(row4.v)))); - } else { - constexpr std::uint64_t mask[] {0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000}; - __m512i sign_mask = _mm512_load_si512(reinterpret_cast(mask)); - row4.v = 
(_mm512_castsi512_ph(_mm512_xor_si512(sign_mask, _mm512_castph_si512(row4.v)))); - } + // if constexpr(std::is_same_v) { + // constexpr std::uint64_t mask[] {0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000}; + // __m128i sign_mask = _mm_load_si128(reinterpret_cast(mask)); + // row4.v = (_mm_castsi128_ph(_mm_xor_si128(sign_mask, _mm_castph_si128(row4.v)))); + // } else if constexpr(std::is_same_v) { + // constexpr std::uint64_t mask[] {0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000}; + // __m256i sign_mask = _mm256_load_si256(reinterpret_cast(mask)); + // row4.v = (_mm256_castsi256_ph(_mm256_xor_si256(sign_mask, _mm256_castph_si256(row4.v)))); + // } else { + // constexpr std::uint64_t mask[] {0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000}; + // __m512i sign_mask = _mm512_load_si512(reinterpret_cast(mask)); + // row4.v = (_mm512_castsi512_ph(_mm512_xor_si512(sign_mask, _mm512_castph_si512(row4.v)))); + // } - row1 = MulitplyAdd(row1, row3, row4); + // row1 = MulitplyAdd(row1, row3, row4); - VectorF16<3, 2, Repeats> row5 = sin.template Shuffle<1,1,1,1,5,5,5,5>(); - row5 = VectorF16<3, 2, Repeats>::Blend<0,1,0,0, 0,1,0,0>(sin, row5); + // VectorF16<3, 2, Repeats> row5 = 
sin.template Shuffle<1,1,1,1,5,5,5,5>(); + // row5 = VectorF16<3, 2, Repeats>::Blend<0,1,0,0, 0,1,0,0>(sin, row5); - row1 *= row5; + // row1 *= row5; - VectorF16<3, 2, Repeats> row6 = sin.template Shuffle<2,2,2,2,6,6,6,6>(); - row6 = VectorF16<3, 2, Repeats>::Blend<0,0,1,0, 0,0,1,0>(sin, row6); + // VectorF16<3, 2, Repeats> row6 = sin.template Shuffle<2,2,2,2,6,6,6,6>(); + // row6 = VectorF16<3, 2, Repeats>::Blend<0,0,1,0, 0,0,1,0>(sin, row6); - return row1 * row6; - } + // return row1 * row6; + // } }; } -export template -struct std::formatter> : std::formatter { - auto format(const Crafter::VectorF16& obj, format_context& ctx) const { - Crafter::Vector<_Float16, Len * Packing * Repeats, 0> vec = obj.template Store(); - std::string out; - for(std::uint32_t i = 0; i < Repeats; i++) { +export template +struct std::formatter> : std::formatter { + constexpr auto format(const Crafter::VectorF16& obj, format_context& ctx) const { + Crafter::Vector<_Float16, Len * Packing, 0> vec = obj.Store(); + std::string out = "{"; + for(std::uint32_t i = 0; i < Packing; i++) { out += "{"; - for(std::uint32_t i2 = 0; i2 < Packing; i2++) { - out += "{"; - for(std::uint32_t i3 = 0; i3 < Len; i3++) { - out += std::format("{}", static_cast(vec.v[i * Packing * Len + i2 * Len + i3])); - if (i3 + 1 < Len) out += ","; - } - out += "}"; + for(std::uint32_t i2 = 0; i2 < Len; i2++) { + out += std::format("{}", static_cast(vec.v[i * Len + i2])); + if (i2 + 1 < Len) out += ","; } out += "}"; } + out += "}"; return std::formatter::format(out, ctx); } }; diff --git a/interfaces/main.cpp b/interfaces/main.cpp index 210c452..7dfbdc1 100644 --- a/interfaces/main.cpp +++ b/interfaces/main.cpp @@ -5,10 +5,21 @@ import std; using namespace Crafter; int main() { - _Float16 test[] {0,1,2,3,0,1,2,3}; - VectorF16L<4,1,2> vec(test); - VectorF16L<4,1,2> vec2(test); - std::println("{}", vec+vec2); + // _Float16 test[] {2,1,2, 2,1,2, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + // _Float16 test2[] {2,3,3, 2,5,21, 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + // VectorF16L<3,2> vec(test); + // VectorF16L<3,2> vec2(test2); + // VectorF16L<3,2> result = VectorF16L<3,2>::Cross(vec, vec2); + + + // Vector test5(2,1,2); + // Vector test6(2,3,3); + // Vector test3 = Vector::Cross(test5, test6); + + + + //VectorF16L<3,2> result = vec + vec2; + //std::println("{}\n{}", result, test3); // std::random_device rd; // std::mt19937 gen(rd()); // std::uniform_real_distribution dist(0, 100); diff --git a/project.json b/project.json index 35d8e5e..adaeb23 100644 --- a/project.json +++ b/project.json @@ -29,8 +29,7 @@ "name": "test", "implementations": ["interfaces/main"], "extends": ["base"], - "debug": true, - "march": "raptorlake" + "debug": true } ] } \ No newline at end of file From 7bd67a2cb913b7b32729ad598e467ea72c89ad2e Mon Sep 17 00:00:00 2001 From: Jorijn van der Graaf Date: Tue, 24 Mar 2026 00:18:00 +0100 Subject: [PATCH 2/2] fully operational F16 --- interfaces/Crafter.Math-VectorF16.cppm | 1989 +++++++++++------------- interfaces/main.cpp | 75 +- project.json | 20 +- tests/VectorF16.cpp | 52 + 4 files changed, 954 insertions(+), 1182 deletions(-) create mode 100644 tests/VectorF16.cpp diff --git a/interfaces/Crafter.Math-VectorF16.cppm b/interfaces/Crafter.Math-VectorF16.cppm index 7e0c649..c334672 100755 --- a/interfaces/Crafter.Math-VectorF16.cppm +++ b/interfaces/Crafter.Math-VectorF16.cppm @@ -28,6 +28,7 @@ import :Vector; namespace Crafter { export template struct VectorF16 { + private: static consteval std::uint8_t GetAlingment() { if(Len * Packing <= 8) { return 8; @@ -37,16 +38,16 @@ namespace Crafter { return 32; } } - static constexpr std::uint32_t MaxSize = 32; - static constexpr std::uint8_t Alignment = GetAlingment(); - static_assert(Len * Packing <= MaxSize, "Len * Packing exceeds MaxSize"); - using VectorType = std::conditional_t< (Len * Packing > 16), __m512h, std::conditional_t<(Len * Packing > 8), __m256h, __m128h> >; VectorType v; + public: + static constexpr 
std::uint32_t MaxSize = 32; + static constexpr std::uint8_t Alignment = GetAlingment(); + static_assert(Len * Packing <= MaxSize, "Len * Packing exceeds MaxSize"); constexpr VectorF16() = default; constexpr VectorF16(VectorType v) : v(v) {} @@ -226,21 +227,9 @@ namespace Crafter { VectorF16 vB(b); this /= vB; } - + constexpr VectorF16 operator-(){ - if constexpr(std::is_same_v) { - alignas(16) constexpr std::uint64_t mask[] {0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000}; - __m128i sign_mask = _mm_load_si128(reinterpret_cast(mask)); - return VectorF16(_mm_castsi128_ph(_mm_xor_si128(sign_mask, _mm_castph_si128(v)))); - } else if constexpr(std::is_same_v) { - alignas(16) constexpr std::uint64_t mask[] {0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000}; - __m256i sign_mask = _mm256_load_si256(reinterpret_cast(mask)); - return VectorF16(_mm256_castsi256_ph(_mm256_xor_si256(sign_mask, _mm256_castph_si256(v)))); - } else { - alignas(16) constexpr std::uint64_t mask[] {0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000}; - __m512i sign_mask = _mm512_load_si512(reinterpret_cast(mask)); - return VectorF16(_mm512_castsi512_ph(_mm512_xor_si512(sign_mask, _mm512_castph_si512(v)))); - } + 
return Negate(); } constexpr bool operator==(VectorF16 b) const { @@ -311,66 +300,16 @@ namespace Crafter { } } - template ShuffleValues> - static consteval bool CheckEpi32Shuffle() { - for(std::uint8_t i = 1; i < Len; i+=2) { - if(ShuffleValues[i-1] != ShuffleValues[i] - 1) { - return false; - } + template values> + constexpr std::array Negate() { + std::array mask = GetShuffleMaskEpi32(); + if constexpr(std::is_same_v) { + return VectorF16(_mm_castsi128_ph(_mm_xor_si128(_mm_castph_si128(v), _mm_loadu_epi16(mask.data())))); + } else if constexpr(std::is_same_v) { + return VectorF16(_mm256_castsi2568_ph(_mm256_xor_si256(_mm256_castph_si256(v), _mm_loadu_epi16(mask.data())))); + } else { + return VectorF16(_mm512_castsi512_ph(_mm512_xor_si256(_mm512_castph_si512(v), _mm_loadu_epi16(mask.data())))); } - for(std::uint8_t i = 0; i < Len; i++) { - for(std::uint8_t i2 = 0; i2 < Len; i2 += 8) { - if(ShuffleValues[i] != ShuffleValues[i2]) { - return false; - } - } - } - return true; - } - - template ShuffleValues> - static consteval bool GetShuffleMaskEpi32() { - std::uint8_t mask = 0; - for(std::uint8_t i = 0; i < std::min(Len, std::uint32_t(8)); i+=2) { - mask = mask | (ShuffleValues[i] & 0b11) << i; - } - return mask; - } - - template ShuffleValues> - static consteval std::array GetShuffleMaskEpi8() requires (std::is_same_v){ - std::array shuffleMask {{0}}; - for(std::uint8_t i2 = 0; i2 < Packing; i2++) { - for(std::uint8_t i = 0; i < Len; i++) { - shuffleMask[(i2*Len*2)+(i*2)] = ShuffleValues[i]*2+(i2*Len*2); - shuffleMask[(i2*Len*2)+(i*2+1)] = ShuffleValues[i]*2+1+(i2*Len*2); - } - } - return shuffleMask; - } - - template ShuffleValues> - static consteval std::array GetShuffleMaskEpi8() requires (std::is_same_v){ - std::array shuffleMask {{0}}; - for(std::uint8_t i2 = 0; i2 < Packing; i2++) { - for(std::uint8_t i = 0; i < Len; i++) { - shuffleMask[(i2*Len*2)+(i*2)] = ShuffleValues[i]*2+(i2*Len*2); - shuffleMask[(i2*Len*2)+(i*2+1)] = ShuffleValues[i]*2+1+(i2*Len*2); - 
} - } - return shuffleMask; - } - - template ShuffleValues> - static consteval std::array GetShuffleMaskEpi8() requires (std::is_same_v){ - std::array shuffleMask {{0}}; - for(std::uint8_t i2 = 0; i2 < Packing; i2++) { - for(std::uint8_t i = 0; i < Len; i++) { - shuffleMask[(i2*Len*2)+(i*2)] = ShuffleValues[i]*2+(i2*Len*2); - shuffleMask[(i2*Len*2)+(i*2+1)] = ShuffleValues[i]*2+1+(i2*Len*2); - } - } - return shuffleMask; } template ShuffleValues> @@ -466,7 +405,7 @@ namespace Crafter { } } - constexpr static _Float16 Dot(VectorF16 a, VectorF16 b) { + constexpr static _Float16 Dot(VectorF16 a, VectorF16 b) { if constexpr(std::is_same_v) { __m128h mul = _mm_mul_ph(a.v, b.v); return _mm_reduce_add_ph(mul); @@ -478,1031 +417,877 @@ namespace Crafter { return _mm512_reduce_add_ph(mul); } } + + constexpr static std::tuple, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16> Normalize( + VectorF16 A, + VectorF16 B, + VectorF16 C, + VectorF16 D, + VectorF16 E, + VectorF16 F, + VectorF16 G, + VectorF16 H + ) requires(Len == 8) { + constexpr std::uint8_t shuffleMaskA[] = GetShuffleMaskEpi8<{0,0,0,0,0,0,0,0}>(); + constexpr std::uint8_t shuffleMaskB[] = GetShuffleMaskEpi8<{1,1,1,1,1,1,1,1}>(); + constexpr std::uint8_t shuffleMaskC[] = GetShuffleMaskEpi8<{2,2,2,2,2,2,2,2}>(); + constexpr std::uint8_t shuffleMaskD[] = GetShuffleMaskEpi8<{3,3,3,3,3,3,3,3}>(); + constexpr std::uint8_t shuffleMaskE[] = GetShuffleMaskEpi8<{4,4,4,4,4,4,4,4}>(); + constexpr std::uint8_t shuffleMaskF[] = GetShuffleMaskEpi8<{5,5,5,5,5,5,5,5}>(); + constexpr std::uint8_t shuffleMaskG[] = GetShuffleMaskEpi8<{6,6,6,6,6,6,6,6}>(); + constexpr std::uint8_t shuffleMaskH[] = GetShuffleMaskEpi8<{7,7,7,7,7,7,7,7}>(); + + if constexpr(std::is_same_v) { + VectorF16 lenght = Length(A, B, C, D, E, F, G, H); + constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1}; + __m128h one = _mm_loadu_ph(oneArr); + __m128h fLenght = _mm_div_ph(one, lenght.v); + + + __m128i shuffleVecA = 
_mm_loadu_epi8(shuffleMaskA); + __m128h fLenghtA = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA)); + + __m128i shuffleVecB = _mm_loadu_epi8(shuffleMaskB); + __m128h fLenghtB = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecB)); + + __m128i shuffleVecC = _mm_loadu_epi8(shuffleMaskC); + __m128h fLenghtC = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecC)); + + __m128i shuffleVecD = _mm_loadu_epi8(shuffleMaskD); + __m128h fLenghtD = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecD)); + + __m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE); + __m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE)); + + __m128i shuffleVecF = _mm_loadu_epi8(shuffleMaskF); + __m128h fLenghtF = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecF)); + + __m128i shuffleVecG = _mm_loadu_epi8(shuffleMaskG); + __m128h fLenghtG = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecG)); + + __m128i shuffleVecH = _mm_loadu_epi8(shuffleMaskH); + __m128h fLenghtH = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecH)); + + return { + _mm_mul_ph(A.v, fLenghtA), + _mm_mul_ph(B.v, fLenghtB), + _mm_mul_ph(C.v, fLenghtC), + _mm_mul_ph(D.v, fLenghtD), + _mm_mul_ph(E.v, fLenghtE), + _mm_mul_ph(F.v, fLenghtF), + _mm_mul_ph(G.v, fLenghtG), + _mm_mul_ph(H.v, fLenghtH) + }; + } else if constexpr(std::is_same_v) { + VectorF16 lenght = Length(A, B, C, D, E, F, G, H); + constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + __m256h one = _mm256_loadu_ph(oneArr); + __m256h fLenght = _mm256_div_ph(one, lenght.v); + + __m256i shuffleVecA = _mm256_loadu_epi8(shuffleMaskA); + __m256h fLenghtA = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecA)); + + __m256i shuffleVecB = _mm256_loadu_epi8(shuffleMaskB); + __m256h fLenghtB = 
_mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecB)); + + __m256i shuffleVecC = _mm256_loadu_epi8(shuffleMaskC); + __m256h fLenghtC = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecC)); + + __m256i shuffleVecD = _mm256_loadu_epi8(shuffleMaskD); + __m256h fLenghtD = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecD)); + + __m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE); + __m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE)); + + __m256i shuffleVecF = _mm256_loadu_epi8(shuffleMaskF); + __m256h fLenghtF = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecF)); + + __m256i shuffleVecG = _mm256_loadu_epi8(shuffleMaskG); + __m256h fLenghtG = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecG)); + + __m256i shuffleVecH = _mm256_loadu_epi8(shuffleMaskH); + __m256h fLenghtH = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecH)); + + return { + _mm256_mul_ph(A.v, fLenghtA), + _mm256_mul_ph(B.v, fLenghtB), + _mm256_mul_ph(C.v, fLenghtC), + _mm256_mul_ph(D.v, fLenghtD), + _mm256_mul_ph(E.v, fLenghtE), + _mm256_mul_ph(F.v, fLenghtF), + _mm256_mul_ph(G.v, fLenghtG), + _mm256_mul_ph(H.v, fLenghtH) + }; + } else { + VectorF16 lenght = Length(A, B, C, D, E, F, G, H); + constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + __m512h one = _mm512_loadu_ph(oneArr); + __m512h fLenght = _mm512_div_ph(one, lenght.v); + + __m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA); + __m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA)); + + __m512i shuffleVecB = _mm512_loadu_epi8(shuffleMaskB); + __m512h fLenghtB = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecB)); + + __m512i shuffleVecC = 
_mm512_loadu_epi8(shuffleMaskC); + __m512h fLenghtC = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecC)); + + __m512i shuffleVecD = _mm512_loadu_epi8(shuffleMaskD); + __m512h fLenghtD = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecD)); + + __m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE); + __m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE)); + + __m512i shuffleVecF = _mm512_loadu_epi8(shuffleMaskF); + __m512h fLenghtF = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecF)); + + __m512i shuffleVecG = _mm512_loadu_epi8(shuffleMaskG); + __m512h fLenghtG = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecG)); + + __m512i shuffleVecH = _mm512_loadu_epi8(shuffleMaskH); + __m512h fLenghtH = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecH)); + + return { + _mm512_mul_ph(A.v, fLenghtA), + _mm512_mul_ph(B.v, fLenghtB), + _mm512_mul_ph(C.v, fLenghtC), + _mm512_mul_ph(D.v, fLenghtD), + _mm512_mul_ph(E.v, fLenghtE), + _mm512_mul_ph(F.v, fLenghtF), + _mm512_mul_ph(G.v, fLenghtG), + _mm512_mul_ph(H.v, fLenghtH) + }; + } + } + + constexpr static std::tuple, VectorF16, VectorF16, VectorF16> Normalize( + VectorF16 A, + VectorF16 C, + VectorF16 E, + VectorF16 G + ) requires(Len == 4) { + constexpr std::uint8_t shuffleMaskA[] = GetShuffleMaskEpi8<{0,0,0,0}>(); + constexpr std::uint8_t shuffleMaskC[] = GetShuffleMaskEpi8<{1,1,1,1}>(); + constexpr std::uint8_t shuffleMaskE[] = GetShuffleMaskEpi8<{2,2,2,2}>(); + constexpr std::uint8_t shuffleMaskG[] = GetShuffleMaskEpi8<{3,3,3,3}>(); + + if constexpr(std::is_same_v) { + VectorF16 lenght = Length(A, C, E, G); + constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1}; + __m128h one = _mm_loadu_ph(oneArr); + __m128h fLenght = _mm_div_ph(one, lenght.v); + + __m128i shuffleVecA = _mm_loadu_epi8(shuffleMaskA); + __m128h fLenghtA = 
_mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA)); + + __m128i shuffleVecC = _mm_loadu_epi8(shuffleMaskC); + __m128h fLenghtC = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecC)); + + __m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE); + __m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE)); + + __m128i shuffleVecG = _mm_loadu_epi8(shuffleMaskG); + __m128h fLenghtG = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecG)); + + return { + _mm_mul_ph(A.v, fLenghtA), + _mm_mul_ph(C.v, fLenghtC), + _mm_mul_ph(E.v, fLenghtE), + _mm_mul_ph(G.v, fLenghtG), + }; + } else if constexpr(std::is_same_v) { + VectorF16 lenght = Length(A, C, E, G); + constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + __m256h one = _mm256_loadu_ph(oneArr); + __m256h fLenght = _mm256_div_ph(one, lenght.v); + + __m256i shuffleVecA = _mm256_loadu_epi8(shuffleMaskA); + __m256h fLenghtA = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecA)); + + __m256i shuffleVecC = _mm256_loadu_epi8(shuffleMaskC); + __m256h fLenghtC = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecC)); + + __m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE); + __m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE)); + + __m256i shuffleVecG = _mm256_loadu_epi8(shuffleMaskG); + __m256h fLenghtG = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecG)); + + return { + _mm256_mul_ph(A.v, fLenghtA), + _mm256_mul_ph(C.v, fLenghtC), + _mm256_mul_ph(E.v, fLenghtE), + _mm256_mul_ph(G.v, fLenghtG), + }; + } else { + VectorF16 lenght = Length(A, C, E, G); + constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + __m512h one = _mm512_loadu_ph(oneArr); + __m512h fLenght = _mm512_div_ph(one, lenght.v); + 
+ __m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA); + __m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA)); + + __m512i shuffleVecC = _mm512_loadu_epi8(shuffleMaskC); + __m512h fLenghtC = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecC)); + + __m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE); + __m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE)); + + __m512i shuffleVecG = _mm512_loadu_epi8(shuffleMaskG); + __m512h fLenghtG = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecG)); + + return { + VectorF16(_mm512_mul_ph(A.v, fLenghtA)), + VectorF16(_mm512_mul_ph(C.v, fLenghtC)), + VectorF16(_mm512_mul_ph(E.v, fLenghtE)), + VectorF16(_mm512_mul_ph(G.v, fLenghtG)), + }; + } + } + + constexpr static std::tuple, VectorF16> Normalize( + VectorF16 A, + VectorF16 E + ) requires(Len == 2) { + constexpr std::uint8_t shuffleMaskA[] = GetShuffleMaskEpi8<{0,0}>(); + constexpr std::uint8_t shuffleMaskE[] = GetShuffleMaskEpi8<{1,1}>(); + + if constexpr(std::is_same_v) { + VectorF16 lenght = Length(A, E); + constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1}; + __m128h one = _mm_loadu_ph(oneArr); + __m128h fLenght = _mm_div_ph(one, lenght.v); + + __m128i shuffleVecA = _mm_loadu_epi8(shuffleMaskA); + __m128h fLenghtA = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA)); + + __m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE); + __m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE)); + + return { + _mm_mul_ph(A.v, fLenghtA), + _mm_mul_ph(E.v, fLenghtE), + }; + } else if constexpr(std::is_same_v) { + VectorF16 lenght = Length(A, E); + constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + __m256h one = _mm256_loadu_ph(oneArr); + __m256h fLenght = _mm256_div_ph(one, lenght.v); + + __m256i shuffleVecA = 
_mm256_loadu_epi8(shuffleMaskA); + __m256h fLenghtA = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecA)); + + __m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE); + __m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE)); + + return { + _mm256_mul_ph(A.v, fLenghtA), + _mm256_mul_ph(E.v, fLenghtE), + }; + } else { + VectorF16 lenght = Length(A, E); + constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + __m512h one = _mm512_loadu_ph(oneArr); + __m512h fLenght = _mm512_div_ph(one, lenght.v); + + __m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA); + __m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA)); + + __m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE); + __m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE)); + + return { + _mm512_mul_ph(A.v, fLenghtA), + _mm512_mul_ph(E.v, fLenghtE), + }; + } + } + + constexpr static VectorF16 Length( + VectorF16 A, + VectorF16 B, + VectorF16 C, + VectorF16 D, + VectorF16 E, + VectorF16 F, + VectorF16 G, + VectorF16 H + ) requires(Len == 8) { + VectorF16 lenghtSq = LengthSq(A, B, C, D, E, F, G, H); + if constexpr(std::is_same_v) { + return VectorF16(_mm_sqrt_ph(lenghtSq.v)); + } else if constexpr(std::is_same_v) { + return VectorF16(_mm256_sqrt_ph(lenghtSq.v)); + } else { + return VectorF16(_mm512_sqrt_ph(lenghtSq.v)); + } + } + + constexpr static VectorF16 Length( + VectorF16 A, + VectorF16 C, + VectorF16 E, + VectorF16 G + ) requires(Len == 4) { + VectorF16 lenghtSq = LengthSq(A, C, E, G); + if constexpr(std::is_same_v) { + return VectorF16(_mm_sqrt_ph(lenghtSq.v)); + } else if constexpr(std::is_same_v) { + return VectorF16(_mm256_sqrt_ph(lenghtSq.v)); + } else { + return VectorF16(_mm512_sqrt_ph(lenghtSq.v)); + } + } + + constexpr static VectorF16 Length( + VectorF16 A, 
+ VectorF16 E + ) requires(Len == 2) { + VectorF16 lenghtSq = LengthSq(A, E); + if constexpr(std::is_same_v) { + return VectorF16(_mm_sqrt_ph(lenghtSq.v)); + } else if constexpr(std::is_same_v) { + return VectorF16(_mm256_sqrt_ph(lenghtSq.v)); + } else { + return VectorF16(_mm512_sqrt_ph(lenghtSq.v)); + } + } + + constexpr static VectorF16 LengthSq( + VectorF16 A, + VectorF16 B, + VectorF16 C, + VectorF16 D, + VectorF16 E, + VectorF16 F, + VectorF16 G, + VectorF16 H + ) requires(Len == 8) { + return Dot(A, A, B, B, C, C, D, D, E, E, F, F, G, G, H, H); + } + + constexpr static VectorF16 LengthSq( + VectorF16 A, + VectorF16 C, + VectorF16 E, + VectorF16 G + ) requires(Len == 4) { + return Dot(A, A, C, C, E, E, G, G); + } + + constexpr static VectorF16 LengthSq( + VectorF16 A, + VectorF16 E + ) requires(Len == 2) { + return Dot(A, A, E, E); + } + + constexpr static VectorF16 Dot( + VectorF16 A0, VectorF16 A1, + VectorF16 B0, VectorF16 B1, + VectorF16 C0, VectorF16 C1, + VectorF16 D0, VectorF16 D1, + VectorF16 E0, VectorF16 E1, + VectorF16 F0, VectorF16 F1, + VectorF16 G0, VectorF16 G1, + VectorF16 H0, VectorF16 H1 + ) requires(Len == 8) { + if constexpr(std::is_same_v) { + __m128h mulA = _mm_mul_ph(A0.v, A1.v); + __m128h mulB = _mm_mul_ph(B0.v, B1.v); + __m128i row12Temp1 = _mm_unpacklo_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulB)); // A1 B1 A2 B2 A3 B3 A4 B4 + __m128i row56Temp1 = _mm_unpackhi_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulB)); // A5 B5 A6 B6 A7 B7 A8 B8 + __m128i row1TempTemp1 = row12Temp1; + __m128i row5TempTemp1 = row56Temp1; + + __m128h mulC = _mm_mul_ph(C0.v, C1.v); + __m128h mulD = _mm_mul_ph(D0.v, D1.v); + __m128i row34Temp1 = _mm_unpacklo_epi16(_mm_castph_si128(mulC), _mm_castph_si128(mulD)); // C1 D1 C2 D2 C3 D3 C4 D4 + __m128i row78Temp1 = _mm_unpackhi_epi16(_mm_castph_si128(mulC), _mm_castph_si128(mulD)); // C5 D5 C6 D6 C7 D7 C8 D8 + + row12Temp1 = _mm_unpacklo_epi16(row12Temp1, row34Temp1); // A1 C1 B1 D1 A2 C2 B2 D2 + 
row34Temp1 = _mm_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 C3 B3 D3 A4 C4 B4 D4 + row56Temp1 = _mm_unpacklo_epi16(row56Temp1, row78Temp1); // A5 C5 B5 D5 A6 C6 B6 D6 + row78Temp1 = _mm_unpackhi_epi16(row5TempTemp1, row78Temp1); // A7 C7 B7 D7 A8 C8 B8 D8 + + __m128h mulE = _mm_mul_ph(E0.v, E1.v); + __m128h mulF = _mm_mul_ph(F0.v, F1.v); + __m128i row12Temp2 = _mm_unpacklo_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulF)); //E1 F1 E2 F2 E3 F3 E4 F4 + __m128i row56Temp2 = _mm_unpackhi_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulF)); //E5 F5 E6 F6 E7 F7 E8 F8 + __m128i row1TempTemp2 = row12Temp2; + __m128i row5TempTemp2 = row56Temp2; + + __m128h mulG = _mm_mul_ph(G0.v, G1.v); + __m128h mulH = _mm_mul_ph(H0.v, H1.v); + __m128i row34Temp2 = _mm_unpacklo_epi16(_mm_castph_si128(mulG), _mm_castph_si128(mulH)); //G1 H1 G2 H2 G3 H3 G4 H4 + __m128i row78Temp2 = _mm_unpackhi_epi16(_mm_castph_si128(mulG), _mm_castph_si128(mulH)); //G5 H5 G6 H6 G7 H7 G8 H8 + + row12Temp2 = _mm_unpacklo_epi16(row12Temp2, row34Temp2); // E1 G1 F1 H1 E2 G2 F2 H2 + row34Temp2 = _mm_unpackhi_epi16(row1TempTemp2, row34Temp2); // E3 G3 F3 H3 E4 G4 F4 H4 + row56Temp2 = _mm_unpacklo_epi16(row56Temp2, row78Temp2); // E5 G5 F5 H5 E6 G6 F6 H6 + row78Temp2 = _mm_unpackhi_epi16(row5TempTemp2, row78Temp2); // E7 G7 F7 H7 E8 G8 F8 H8 + + __m128h row1 = _mm_castsi128_ph(_mm_unpackhi_epi16(row12Temp1, row12Temp2));// A1 E1 C1 G1 B1 F1 D1 H1 + __m128h row2 = _mm_castsi128_ph(_mm_unpacklo_epi16(row12Temp1, row12Temp2));// A2 E2 C2 G2 B2 F2 D2 H2 + __m128h row3 = _mm_castsi128_ph(_mm_unpackhi_epi16(row34Temp1, row34Temp2));// A3 E3 C3 G3 B3 F3 D3 H3 + __m128h row4 = _mm_castsi128_ph(_mm_unpacklo_epi16(row34Temp1, row34Temp2));// A4 E4 C4 G4 B4 F4 D4 H4 + __m128h row5 = _mm_castsi128_ph(_mm_unpackhi_epi16(row56Temp1, row56Temp2));// A5 E5 C5 G5 B5 F5 D5 H5 + __m128h row6 = _mm_castsi128_ph(_mm_unpacklo_epi16(row56Temp1, row56Temp2));// A6 E6 C6 G6 B6 F6 D6 H6 + __m128h row7 = 
_mm_castsi128_ph(_mm_unpackhi_epi16(row78Temp1, row78Temp2));// A7 E7 C7 G7 B7 F7 D7 H7 + __m128h row8 = _mm_castsi128_ph(_mm_unpacklo_epi16(row78Temp1, row78Temp2));// A8 E8 C8 G8 B8 F8 D8 H8 + + row1 = _mm_add_ph(row1, row2); + row1 = _mm_add_ph(row1, row3); + row1 = _mm_add_ph(row1, row4); + row1 = _mm_add_ph(row1, row5); + row1 = _mm_add_ph(row1, row6); + row1 = _mm_add_ph(row1, row7); + row1 = _mm_add_ph(row1, row8); + + return row1; + } else if constexpr(std::is_same_v) { + __m256h mulA = _mm256_mul_ph(A0.v, A1.v); + __m256h mulB = _mm256_mul_ph(B0.v, B1.v); + __m256i row12Temp1 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulB)); // A1 B1 A2 B2 A3 B3 A4 B4 + __m256i row56Temp1 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulB)); // A5 B5 A6 B6 A7 B7 A8 B8 + __m256i row1TempTemp1 = row12Temp1; + __m256i row5TempTemp1 = row56Temp1; + + __m256h mulC = _mm256_mul_ph(C0.v, C1.v); + __m256h mulD = _mm256_mul_ph(D0.v, D1.v); + __m256i row34Temp1 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulC), _mm256_castph_si256(mulD)); // C1 D1 C2 D2 C3 D3 C4 D4 + __m256i row78Temp1 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulC), _mm256_castph_si256(mulD)); // C5 D5 C6 D6 C7 D7 C8 D8 + + row12Temp1 = _mm256_unpacklo_epi16(row12Temp1, row34Temp1); // A1 C1 B1 D1 A2 C2 B2 D2 + row34Temp1 = _mm256_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 C3 B3 D3 A4 C4 B4 D4 + row56Temp1 = _mm256_unpacklo_epi16(row56Temp1, row78Temp1); // A5 C5 B5 D5 A6 C6 B6 D6 + row78Temp1 = _mm256_unpackhi_epi16(row5TempTemp1, row78Temp1); // A7 C7 B7 D7 A8 C8 B8 D8 + + __m256h mulE = _mm256_mul_ph(E0.v, E1.v); + __m256h mulF = _mm256_mul_ph(F0.v, F1.v); + __m256i row12Temp2 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulE), _mm256_castph_si256(mulF)); //E1 F1 E2 F2 E3 F3 E4 F4 + __m256i row56Temp2 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulE), _mm256_castph_si256(mulF)); //E5 F5 E6 F6 E7 F7 E8 F8 + __m256i row1TempTemp2 = row12Temp2; + __m256i 
row5TempTemp2 = row56Temp2; + + __m256h mulG = _mm256_mul_ph(G0.v, G1.v); + __m256h mulH = _mm256_mul_ph(H0.v, H1.v); + __m256i row34Temp2 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulG), _mm256_castph_si256(mulH)); //G1 H1 G2 H2 G3 H3 G4 H4 + __m256i row78Temp2 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulG), _mm256_castph_si256(mulH)); //G5 H5 G6 H6 G7 H7 G8 H8 + + row12Temp2 = _mm256_unpacklo_epi16(row12Temp2, row34Temp2); // E1 G1 F1 H1 E2 G2 F2 H2 + row34Temp2 = _mm256_unpackhi_epi16(row1TempTemp2, row34Temp2); // E3 G3 F3 H3 E4 G4 F4 H4 + row56Temp2 = _mm256_unpacklo_epi16(row56Temp2, row78Temp2); // E5 G5 F5 H5 E6 G6 F6 H6 + row78Temp2 = _mm256_unpackhi_epi16(row5TempTemp2, row78Temp2); // E7 G7 F7 H7 E8 G8 F8 H8 + + __m256h row1 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row12Temp1, row12Temp2));// A1 E1 C1 G1 B1 F1 D1 H1 + __m256h row2 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row12Temp1, row12Temp2));// A2 E2 C2 G2 B2 F2 D2 H2 + __m256h row3 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row34Temp1, row34Temp2));// A3 E3 C3 G3 B3 F3 D3 H3 + __m256h row4 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row34Temp1, row34Temp2));// A4 E4 C4 G4 B4 F4 D4 H4 + __m256h row5 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row56Temp1, row56Temp2));// A5 E5 C5 G5 B5 F5 D5 H5 + __m256h row6 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row56Temp1, row56Temp2));// A6 E6 C6 G6 B6 F6 D6 H6 + __m256h row7 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row78Temp1, row78Temp2));// A7 E7 C7 G7 B7 F7 D7 H7 + __m256h row8 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row78Temp1, row78Temp2));// A8 E8 C8 G8 B8 F8 D8 H8 + + row1 = _mm256_add_ph(row1, row2); + row1 = _mm256_add_ph(row1, row3); + row1 = _mm256_add_ph(row1, row4); + row1 = _mm256_add_ph(row1, row5); + row1 = _mm256_add_ph(row1, row6); + row1 = _mm256_add_ph(row1, row7); + row1 = _mm256_add_ph(row1, row8); + + return row1; + } else { + __m512h mulA = _mm512_mul_ph(A0.v, A1.v); + __m512h mulB = _mm512_mul_ph(B0.v, B1.v); + 
__m512i row12Temp1 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulB)); // A1 B1 A2 B2 A3 B3 A4 B4 + __m512i row56Temp1 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulB)); // A5 B5 A6 B6 A7 B7 A8 B8 + __m512i row1TempTemp1 = row12Temp1; + __m512i row5TempTemp1 = row56Temp1; + + __m512h mulC = _mm512_mul_ph(C0.v, C1.v); + __m512h mulD = _mm512_mul_ph(D0.v, D1.v); + __m512i row34Temp1 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulC), _mm512_castph_si512(mulD)); // C1 D1 C2 D2 C3 D3 C4 D4 + __m512i row78Temp1 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulC), _mm512_castph_si512(mulD)); // C5 D5 C6 D6 C7 D7 C8 D8 + + row12Temp1 = _mm512_unpacklo_epi16(row12Temp1, row34Temp1); // A1 C1 B1 D1 A2 C2 B2 D2 + row34Temp1 = _mm512_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 C3 B3 D3 A4 C4 B4 D4 + row56Temp1 = _mm512_unpacklo_epi16(row56Temp1, row78Temp1); // A5 C5 B5 D5 A6 C6 B6 D6 + row78Temp1 = _mm512_unpackhi_epi16(row5TempTemp1, row78Temp1); // A7 C7 B7 D7 A8 C8 B8 D8 + + __m512h mulE = _mm512_mul_ph(E0.v, E1.v); + __m512h mulF = _mm512_mul_ph(F0.v, F1.v); + __m512i row12Temp2 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulE), _mm512_castph_si512(mulF)); //E1 F1 E2 F2 E3 F3 E4 F4 + __m512i row56Temp2 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulE), _mm512_castph_si512(mulF)); //E5 F5 E6 F6 E7 F7 E8 F8 + __m512i row1TempTemp2 = row12Temp2; + __m512i row5TempTemp2 = row56Temp2; + + __m512h mulG = _mm512_mul_ph(G0.v, G1.v); + __m512h mulH = _mm512_mul_ph(H0.v, H1.v); + __m512i row34Temp2 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulG), _mm512_castph_si512(mulH)); //G1 H1 G2 H2 G3 H3 G4 H4 + __m512i row78Temp2 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulG), _mm512_castph_si512(mulH)); //G5 H5 G6 H6 G7 H7 G8 H8 + + row12Temp2 = _mm512_unpacklo_epi16(row12Temp2, row34Temp2); // E1 G1 F1 H1 E2 G2 F2 H2 + row34Temp2 = _mm512_unpackhi_epi16(row1TempTemp2, row34Temp2); // E3 G3 F3 H3 E4 G4 F4 H4 + row56Temp2 = 
_mm512_unpacklo_epi16(row56Temp2, row78Temp2); // E5 G5 F5 H5 E6 G6 F6 H6 + row78Temp2 = _mm512_unpackhi_epi16(row5TempTemp2, row78Temp2); // E7 G7 F7 H7 E8 G8 F8 H8 + + __m512h row1 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row12Temp1, row12Temp2));// A1 E1 C1 G1 B1 F1 D1 H1 + __m512h row2 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row12Temp1, row12Temp2));// A2 E2 C2 G2 B2 F2 D2 H2 + __m512h row3 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row34Temp1, row34Temp2));// A3 E3 C3 G3 B3 F3 D3 H3 + __m512h row4 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row34Temp1, row34Temp2));// A4 E4 C4 G4 B4 F4 D4 H4 + __m512h row5 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row56Temp1, row56Temp2));// A5 E5 C5 G5 B5 F5 D5 H5 + __m512h row6 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row56Temp1, row56Temp2));// A6 E6 C6 G6 B6 F6 D6 H6 + __m512h row7 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row78Temp1, row78Temp2));// A7 E7 C7 G7 B7 F7 D7 H7 + __m512h row8 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row78Temp1, row78Temp2));// A8 E8 C8 G8 B8 F8 D8 H8 + + row1 = _mm512_add_ph(row1, row2); + row1 = _mm512_add_ph(row1, row3); + row1 = _mm512_add_ph(row1, row4); + row1 = _mm512_add_ph(row1, row5); + row1 = _mm512_add_ph(row1, row6); + row1 = _mm512_add_ph(row1, row7); + row1 = _mm512_add_ph(row1, row8); + + return row1; + } + } + + constexpr static VectorF16 Dot( + VectorF16 A0, VectorF16 A1, + VectorF16 C0, VectorF16 C1, + VectorF16 E0, VectorF16 E1, + VectorF16 G0, VectorF16 G1 + ) requires(Len == 4) { + if constexpr(std::is_same_v) { + __m128h mulA = _mm_mul_ph(A0.v, A1.v); + __m128h mulC = _mm_mul_ph(C0.v, C1.v); + __m128i row12Temp1 = _mm_unpacklo_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulC)); // A1 C1 A2 C2 A3 C3 A4 C4 + __m128i row34Temp1 = _mm_unpackhi_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulC)); // B1 D1 B2 D2 B3 D3 B4 D4 + __m128i row1TempTemp1 = row12Temp1; + __m128i row5TempTemp1 = row34Temp1; + + __m128h mulE = _mm_mul_ph(E0.v, E1.v); + __m128h mulG 
= _mm_mul_ph(G0.v, G1.v); + __m128i row12Temp2 = _mm_unpacklo_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulG)); // E1 G1 E2 G2 E3 G3 E4 G4 + __m128i row34Temp2 = _mm_unpackhi_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulG)); // F1 H1 F2 H2 F3 H3 F4 H4 + + row12Temp1 = _mm_unpacklo_epi16(row12Temp1, row12Temp2); // A1 E1 C1 G1 A2 E2 C2 G2 + row12Temp2 = _mm_unpacklo_epi16(row34Temp1, row34Temp2); // B1 F1 D1 H1 B2 F2 D2 H2 + row34Temp1 = _mm_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 E3 C3 G3 A4 E4 C4 G4 + row34Temp2 = _mm_unpackhi_epi16(row5TempTemp1, row34Temp2); // B3 F3 D3 H3 B4 F4 D4 H4 + + __m128h row1 = _mm_castsi128_ph(_mm_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 E1 F1 C1 D1 G1 H1 + __m128h row2 = _mm_castsi128_ph(_mm_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 E2 F2 C2 D2 G2 H2 + __m128h row3 = _mm_castsi128_ph(_mm_unpacklo_epi16(row34Temp1, row34Temp2));// A3 B3 E3 F3 C3 D3 G3 H3 + __m128h row4 = _mm_castsi128_ph(_mm_unpackhi_epi16(row34Temp1, row34Temp2));// A4 B4 E4 F4 C4 D4 G4 H4 + + row1 = _mm_add_ph(row1, row2); + row1 = _mm_add_ph(row1, row3); + row1 = _mm_add_ph(row1, row4); + + return row1; + } else if constexpr(std::is_same_v) { + __m256h mulA = _mm256_mul_ph(A0.v, A1.v); + __m256h mulC = _mm256_mul_ph(C0.v, C1.v); + __m256i row12Temp1 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulC)); // A1 C1 A2 C2 A3 C3 A4 C4 + __m256i row34Temp1 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulC)); // B1 D1 B2 D2 B3 D3 B4 D4 + __m256i row1TempTemp1 = row12Temp1; + __m256i row5TempTemp1 = row34Temp1; + + __m256h mulE = _mm256_mul_ph(E0.v, E1.v); + __m256h mulG = _mm256_mul_ph(G0.v, G1.v); + __m256i row12Temp2 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulE), _mm256_castph_si256(mulG)); // E1 G1 E2 G2 E3 G3 E4 G4 + __m256i row34Temp2 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulE), _mm256_castph_si256(mulG)); // F1 H1 F2 H2 F3 H3 F4 H4 + + row12Temp1 = 
_mm256_unpacklo_epi16(row12Temp1, row12Temp2); // A1 E1 C1 G1 A2 E2 C2 G2 + row12Temp2 = _mm256_unpacklo_epi16(row34Temp1, row34Temp2); // B1 F1 D1 H1 B2 F2 D2 H2 + row34Temp1 = _mm256_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 E3 C3 G3 A4 E4 C4 G4 + row34Temp2 = _mm256_unpackhi_epi16(row5TempTemp1, row34Temp2); // B3 F3 D3 H3 B4 F4 D4 H4 + + __m256h row1 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 E1 F1 C1 D1 G1 H1 + __m256h row2 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 E2 F2 C2 D2 G2 H2 + __m256h row3 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row34Temp1, row34Temp2));// A3 B3 E3 F3 C3 D3 G3 H3 + __m256h row4 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row34Temp1, row34Temp2));// A4 B4 E4 F4 C4 D4 G4 H4 + + row1 = _mm256_add_ph(row1, row2); + row1 = _mm256_add_ph(row1, row3); + row1 = _mm256_add_ph(row1, row4); + + return row1; + } else { + __m512h mulA = _mm512_mul_ph(A0.v, A1.v); + __m512h mulC = _mm512_mul_ph(C0.v, C1.v); + __m512i row12Temp1 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulC)); // A1 C1 A2 C2 A3 C3 A4 C4 + __m512i row34Temp1 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulC)); // B1 D1 B2 D2 B3 D3 B4 D4 + __m512i row1TempTemp1 = row12Temp1; + __m512i row5TempTemp1 = row34Temp1; + + __m512h mulE = _mm512_mul_ph(E0.v, E1.v); + __m512h mulG = _mm512_mul_ph(G0.v, G1.v); + __m512i row12Temp2 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulE), _mm512_castph_si512(mulG)); // E1 G1 E2 G2 E3 G3 E4 G4 + __m512i row34Temp2 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulE), _mm512_castph_si512(mulG)); // F1 H1 F2 H2 F3 H3 F4 H4 + + row12Temp1 = _mm512_unpacklo_epi16(row12Temp1, row12Temp2); // A1 E1 C1 G1 A2 E2 C2 G2 + row12Temp2 = _mm512_unpacklo_epi16(row34Temp1, row34Temp2); // B1 F1 D1 H1 B2 F2 D2 H2 + row34Temp1 = _mm512_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 E3 C3 G3 A4 E4 C4 G4 + row34Temp2 = 
_mm512_unpackhi_epi16(row5TempTemp1, row34Temp2); // B3 F3 D3 H3 B4 F4 D4 H4 + + __m512h row1 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 E1 F1 C1 D1 G1 H1 + __m512h row2 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 E2 F2 C2 D2 G2 H2 + __m512h row3 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row34Temp1, row34Temp2));// A3 B3 E3 F3 C3 D3 G3 H3 + __m512h row4 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row34Temp1, row34Temp2));// A4 B4 E4 F4 C4 D4 G4 H4 + + row1 = _mm512_add_ph(row1, row2); + row1 = _mm512_add_ph(row1, row3); + row1 = _mm512_add_ph(row1, row4); + + return row1; + } + } + + constexpr static VectorF16 Dot( + VectorF16 A0, VectorF16 A1, + VectorF16 E0, VectorF16 E1 + ) requires(Len == 2) { + if constexpr(std::is_same_v) { + __m128h mulA = _mm_mul_ph(A0.v, A1.v); + __m128h mulE = _mm_mul_ph(E0.v, E1.v); + __m128i row12Temp1 = _mm_unpacklo_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulE)); // A1 E1 A2 E2 B1 F1 B2 F2 + __m128i row12Temp2 = _mm_unpackhi_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulE)); // C1 G1 C2 G2 D1 H1 D2 H2 + __m128i row12Temp1Temp = row12Temp1; + + row12Temp1 = _mm_unpacklo_epi16(row12Temp1, row12Temp2); // A1 C1 E1 G1 A2 C2 E2 G2 + row12Temp2 = _mm_unpackhi_epi16(row12Temp1Temp, row12Temp2); // B1 D1 F1 H1 B2 D2 F2 H2 + + __m128h row1 = _mm_castsi128_ph(_mm_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 C1 D1 E1 F1 G1 H1 + __m128h row2 = _mm_castsi128_ph(_mm_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 C2 D2 E2 F2 G2 H2 + + return _mm_add_ph(row1, row2); + } else if constexpr(std::is_same_v) { + __m256h mulA = _mm256_mul_ph(A0.v, A1.v); + __m256h mulE = _mm256_mul_ph(E0.v, E1.v); + __m256i row12Temp1 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulE)); // A1 E1 A2 E2 B1 F1 B2 F2 + __m256i row12Temp2 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulE)); // C1 G1 C2 G2 D1 H1 D2 H2 + __m256i row12Temp1Temp 
= row12Temp1; + + row12Temp1 = _mm256_unpacklo_epi16(row12Temp1, row12Temp2); // A1 C1 E1 G1 A2 C2 E2 G2 + row12Temp2 = _mm256_unpackhi_epi16(row12Temp1Temp, row12Temp2); // B1 D1 F1 H1 B2 D2 F2 H2 + + __m256h row1 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 C1 D1 E1 F1 G1 H1 + __m256h row2 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 C2 D2 E2 F2 G2 H2 + + return _mm256_add_ph(row1, row2); + } else { + __m512h mulA = _mm512_mul_ph(A0.v, A1.v); + __m512h mulE = _mm512_mul_ph(E0.v, E1.v); + __m512i row12Temp1 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulE)); // A1 E1 A2 E2 B1 F1 B2 F2 + __m512i row12Temp2 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulE)); // C1 G1 C2 G2 D1 H1 D2 H2 + __m512i row12Temp1Temp = row12Temp1; + + row12Temp1 = _mm512_unpacklo_epi16(row12Temp1, row12Temp2); // A1 C1 E1 G1 A2 C2 E2 G2 + row12Temp2 = _mm512_unpackhi_epi16(row12Temp1Temp, row12Temp2); // B1 D1 F1 H1 B2 D2 F2 H2 + + __m512h row1 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 C1 D1 E1 F1 G1 H1 + __m512h row2 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 C2 D2 E2 F2 G2 H2 + + return _mm512_add_ph(row1, row2); + } + } - - // constexpr static std::tuple, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16> Normalize( - // VectorF16 A, - // VectorF16 B, - // VectorF16 C, - // VectorF16 D, - // VectorF16 E, - // VectorF16 F, - // VectorF16 G, - // VectorF16 H - // ) requires(Packing == 1) { - // if constexpr(std::is_same_v) { - // VectorF16 lenght = Length(A, B, C, D, E, F, G, H); - // constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1}; - // __m128h one = _mm_loadu_ph(oneArr); - // __m128h fLenght = _mm_div_ph(one, lenght.v); - - // constexpr std::uint8_t shuffleMaskA[] { - // 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1 - // }; - // __m128i shuffleVecA = _mm_loadu_epi8(shuffleMaskA); - 
// __m128h fLenghtA = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA)); - - // constexpr std::uint8_t shuffleMaskB[] { - // 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3 - // }; - // __m128i shuffleVecB = _mm_loadu_epi8(shuffleMaskB); - // __m128h fLenghtB = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecB)); - - // constexpr std::uint8_t shuffleMaskC[] { - // 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5 - // }; - // __m128i shuffleVecC = _mm_loadu_epi8(shuffleMaskC); - // __m128h fLenghtC = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecC)); - - // constexpr std::uint8_t shuffleMaskD[] { - // 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7 - // }; - // __m128i shuffleVecD = _mm_loadu_epi8(shuffleMaskD); - // __m128h fLenghtD = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecD)); - - // constexpr std::uint8_t shuffleMaskE[] { - // 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9 - // }; - // __m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE); - // __m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE)); - - // constexpr std::uint8_t shuffleMaskF[] { - // 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - // }; - // __m128i shuffleVecF = _mm_loadu_epi8(shuffleMaskF); - // __m128h fLenghtF = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecF)); - - // constexpr std::uint8_t shuffleMaskG[] { - // 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, - // }; - // __m128i shuffleVecG = _mm_loadu_epi8(shuffleMaskG); - // __m128h fLenghtG = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecG)); - - // constexpr std::uint8_t shuffleMaskH[] { - // 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, - // }; - // __m128i shuffleVecH = _mm_loadu_epi8(shuffleMaskH); - // __m128h fLenghtH = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecH)); - - // return { - // _mm_mul_ph(A.v, fLenghtA), - // _mm_mul_ph(B.v, fLenghtB), - // _mm_mul_ph(C.v, 
fLenghtC), - // _mm_mul_ph(D.v, fLenghtD), - // _mm_mul_ph(E.v, fLenghtE), - // _mm_mul_ph(F.v, fLenghtF), - // _mm_mul_ph(G.v, fLenghtG), - // _mm_mul_ph(H.v, fLenghtH) - // }; - // } else if constexpr(std::is_same_v) { - // VectorF16 lenght = Length(A, B, C, D, E, F, G, H); - // constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - // __m256h one = _mm256_loadu_ph(oneArr); - // __m256h fLenght = _mm256_div_ph(one, lenght.v); - - // constexpr std::uint8_t shuffleMaskA[] { - // 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, - // 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1 - // }; - // __m256i shuffleVecA = _mm256_loadu_epi8(shuffleMaskA); - // __m256h fLenghtA = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecA)); - - // constexpr std::uint8_t shuffleMaskB[] { - // 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3, - // 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3 - // }; - // __m256i shuffleVecB = _mm256_loadu_epi8(shuffleMaskB); - // __m256h fLenghtB = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecB)); - - // constexpr std::uint8_t shuffleMaskC[] { - // 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5, - // 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5 - // }; - // __m256i shuffleVecC = _mm256_loadu_epi8(shuffleMaskC); - // __m256h fLenghtC = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecC)); - - // constexpr std::uint8_t shuffleMaskD[] { - // 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7, - // 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7 - // }; - // __m256i shuffleVecD = _mm256_loadu_epi8(shuffleMaskD); - // __m256h fLenghtD = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecD)); - - // constexpr std::uint8_t shuffleMaskE[] { - // 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9, - // 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9 - // }; - // __m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE); - // __m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE)); - - // constexpr std::uint8_t shuffleMaskF[] { 
- // 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - // 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - // }; - // __m256i shuffleVecF = _mm256_loadu_epi8(shuffleMaskF); - // __m256h fLenghtF = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecF)); - - // constexpr std::uint8_t shuffleMaskG[] { - // 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, - // 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13 - // }; - // __m256i shuffleVecG = _mm256_loadu_epi8(shuffleMaskG); - // __m256h fLenghtG = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecG)); - - // constexpr std::uint8_t shuffleMaskH[] { - // 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, - // 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15 - // }; - // __m256i shuffleVecH = _mm256_loadu_epi8(shuffleMaskH); - // __m256h fLenghtH = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecH)); - - // return { - // _mm256_mul_ph(A.v, fLenghtA), - // _mm256_mul_ph(B.v, fLenghtB), - // _mm256_mul_ph(C.v, fLenghtC), - // _mm256_mul_ph(D.v, fLenghtD), - // _mm256_mul_ph(E.v, fLenghtE), - // _mm256_mul_ph(F.v, fLenghtF), - // _mm256_mul_ph(G.v, fLenghtG), - // _mm256_mul_ph(H.v, fLenghtH) - // }; - // } else { - // VectorF16 lenght = Length(A, B, C, D, E, F, G, H); - // constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - // __m512h one = _mm512_loadu_ph(oneArr); - // __m512h fLenght = _mm512_div_ph(one, lenght.v); - - // constexpr std::uint8_t shuffleMaskA[] { - // 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, - // 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, - // 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, - // 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1 - // }; - // __m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA); - // __m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA)); - - // constexpr std::uint8_t shuffleMaskB[] { - // 
2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3, - // 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3, - // 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3, - // 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3 - // }; - // __m512i shuffleVecB = _mm512_loadu_epi8(shuffleMaskB); - // __m512h fLenghtB = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecB)); - - // constexpr std::uint8_t shuffleMaskC[] { - // 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5, - // 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5, - // 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5, - // 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5 - // }; - // __m512i shuffleVecC = _mm512_loadu_epi8(shuffleMaskC); - // __m512h fLenghtC = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecC)); - - // constexpr std::uint8_t shuffleMaskD[] { - // 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7, - // 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7, - // 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7, - // 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7 - // }; - // __m512i shuffleVecD = _mm512_loadu_epi8(shuffleMaskD); - // __m512h fLenghtD = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecD)); - - // constexpr std::uint8_t shuffleMaskE[] { - // 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9, - // 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9, - // 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9, - // 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9 - // }; - // __m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE); - // __m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE)); - - // constexpr std::uint8_t shuffleMaskF[] { - // 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - // 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - // 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - // 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, - // }; - // __m512i shuffleVecF = _mm512_loadu_epi8(shuffleMaskF); - // __m512h fLenghtF = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecF)); - - // constexpr std::uint8_t shuffleMaskG[] { - // 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, - // 
12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, - // 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, - // 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13 - // }; - // __m512i shuffleVecG = _mm512_loadu_epi8(shuffleMaskG); - // __m512h fLenghtG = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecG)); - - // constexpr std::uint8_t shuffleMaskH[] { - // 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, - // 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, - // 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, - // 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15 - // }; - // __m512i shuffleVecH = _mm512_loadu_epi8(shuffleMaskH); - // __m512h fLenghtH = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecH)); - - // return { - // _mm512_mul_ph(A.v, fLenghtA), - // _mm512_mul_ph(B.v, fLenghtB), - // _mm512_mul_ph(C.v, fLenghtC), - // _mm512_mul_ph(D.v, fLenghtD), - // _mm512_mul_ph(E.v, fLenghtE), - // _mm512_mul_ph(F.v, fLenghtF), - // _mm512_mul_ph(G.v, fLenghtG), - // _mm512_mul_ph(H.v, fLenghtH) - // }; - // } - // } - - // constexpr static std::tuple, VectorF16, VectorF16, VectorF16> Normalize( - // VectorF16 A, - // VectorF16 C, - // VectorF16 E, - // VectorF16 G - // ) requires(Packing == 2) { - // if constexpr(std::is_same_v) { - // VectorF16 lenght = Length(A, C, E, G); - // constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1}; - // __m128h one = _mm_loadu_ph(oneArr); - // __m128h fLenght = _mm_div_ph(one, lenght.v); - - // constexpr std::uint8_t shuffleMaskA[] { - // 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3 - // }; - // __m128i shuffleVecA = _mm_loadu_epi8(shuffleMaskA); - // __m128h fLenghtA = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA)); - - // constexpr std::uint8_t shuffleMaskC[] { - // 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7 - // }; - // __m128i shuffleVecC = _mm_loadu_epi8(shuffleMaskC); - // __m128h fLenghtC = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), 
shuffleVecC)); - - // constexpr std::uint8_t shuffleMaskE[] { - // 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11 - // }; - // __m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE); - // __m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE)); - - // constexpr std::uint8_t shuffleMaskG[] { - // 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, - // }; - // __m128i shuffleVecG = _mm_loadu_epi8(shuffleMaskG); - // __m128h fLenghtG = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecG)); - - // return { - // _mm_mul_ph(A.v, fLenghtA), - // _mm_mul_ph(C.v, fLenghtC), - // _mm_mul_ph(E.v, fLenghtE), - // _mm_mul_ph(G.v, fLenghtG), - // }; - // } else if constexpr(std::is_same_v) { - // VectorF16 lenght = Length(A, C, E, G); - // constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - // __m256h one = _mm256_loadu_ph(oneArr); - // __m256h fLenght = _mm256_div_ph(one, lenght.v); - - // constexpr std::uint8_t shuffleMaskA[] { - // 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3, - // 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3 - // }; - // __m256i shuffleVecA = _mm256_loadu_epi8(shuffleMaskA); - // __m256h fLenghtA = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecA)); - - // constexpr std::uint8_t shuffleMaskC[] { - // 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7, - // 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7 - // }; - // __m256i shuffleVecC = _mm256_loadu_epi8(shuffleMaskC); - // __m256h fLenghtC = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecC)); - - // constexpr std::uint8_t shuffleMaskE[] { - // 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11, - // 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11 - // }; - // __m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE); - // __m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE)); - - // constexpr std::uint8_t shuffleMaskG[] { - // 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, - // 
12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, - // }; - // __m256i shuffleVecG = _mm256_loadu_epi8(shuffleMaskG); - // __m256h fLenghtG = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecG)); - - // return { - // _mm256_mul_ph(A.v, fLenghtA), - // _mm256_mul_ph(C.v, fLenghtC), - // _mm256_mul_ph(E.v, fLenghtE), - // _mm256_mul_ph(G.v, fLenghtG), - // }; - // } else { - // VectorF16 lenght = Length(A, C, E, G); - // constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - // __m512h one = _mm512_loadu_ph(oneArr); - // __m512h fLenght = _mm512_div_ph(one, lenght.v); - - // constexpr std::uint8_t shuffleMaskA[] { - // 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3, - // 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3, - // 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3, - // 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3 - // }; - // __m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA); - // __m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA)); - - // constexpr std::uint8_t shuffleMaskC[] { - // 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7, - // 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7, - // 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7, - // 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7 - // }; - // __m512i shuffleVecC = _mm512_loadu_epi8(shuffleMaskC); - // __m512h fLenghtC = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecC)); - - // constexpr std::uint8_t shuffleMaskE[] { - // 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11, - // 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11, - // 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11, - // 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11 - // }; - // __m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE); - // __m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE)); - - // constexpr std::uint8_t shuffleMaskG[] { - // 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, - // 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, - 
// 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, - // 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, - // }; - // __m512i shuffleVecG = _mm512_loadu_epi8(shuffleMaskG); - // __m512h fLenghtG = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecG)); - - // return { - // VectorF16(_mm512_mul_ph(A.v, fLenghtA)), - // VectorF16(_mm512_mul_ph(C.v, fLenghtC)), - // VectorF16(_mm512_mul_ph(E.v, fLenghtE)), - // VectorF16(_mm512_mul_ph(G.v, fLenghtG)), - // }; - // } - // } - - // constexpr static std::tuple, VectorF16> Normalize( - // VectorF16 A, - // VectorF16 E - // ) requires(Packing == 4) { - // if constexpr(std::is_same_v) { - // VectorF16 lenght = Length(A, E); - // constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1}; - // __m128h one = _mm_loadu_ph(oneArr); - // __m128h fLenght = _mm_div_ph(one, lenght.v); - - // constexpr std::uint8_t shuffleMaskA[] { - // 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7 - // }; - // __m128i shuffleVecA = _mm_loadu_epi8(shuffleMaskA); - // __m128h fLenghtA = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA)); - - // constexpr std::uint8_t shuffleMaskE[] { - // 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15 - // }; - // __m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE); - // __m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE)); - - // return { - // _mm_mul_ph(A.v, fLenghtA), - // _mm_mul_ph(E.v, fLenghtE), - // }; - // } else if constexpr(std::is_same_v) { - // VectorF16 lenght = Length(A, E); - // constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - // __m256h one = _mm256_loadu_ph(oneArr); - // __m256h fLenght = _mm256_div_ph(one, lenght.v); - - // constexpr std::uint8_t shuffleMaskA[] { - // 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7, - // 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7 - // }; - // __m256i shuffleVecA = _mm256_loadu_epi8(shuffleMaskA); - // __m256h fLenghtA = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), 
shuffleVecA)); - - // constexpr std::uint8_t shuffleMaskE[] { - // 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15, - // 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15 - // }; - // __m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE); - // __m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE)); - - // return { - // _mm256_mul_ph(A.v, fLenghtA), - // _mm256_mul_ph(E.v, fLenghtE), - // }; - // } else { - // VectorF16 lenght = Length(A, E); - // constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - // __m512h one = _mm512_loadu_ph(oneArr); - // __m512h fLenght = _mm512_div_ph(one, lenght.v); - - // constexpr std::uint8_t shuffleMaskA[] { - // 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7, - // 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7, - // 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7, - // 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7 - // }; - // __m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA); - // __m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA)); - - // constexpr std::uint8_t shuffleMaskE[] { - // 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15, - // 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15, - // 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15, - // 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15 - // }; - // __m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE); - // __m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE)); - - // return { - // _mm512_mul_ph(A.v, fLenghtA), - // _mm512_mul_ph(E.v, fLenghtE), - // }; - // } - // } - - // constexpr static VectorF16 Length( - // VectorF16 A, - // VectorF16 B, - // VectorF16 C, - // VectorF16 D, - // VectorF16 E, - // VectorF16 F, - // VectorF16 G, - // VectorF16 H - // ) requires(Packing == 1) { - // VectorF16 lenghtSq = LengthSq(A, B, C, D, E, F, G, H); - // if constexpr(std::is_same_v) { - // return VectorF16(_mm_sqrt_ph(lenghtSq.v)); - // } else if 
constexpr(std::is_same_v) { - // return VectorF16(_mm256_sqrt_ph(lenghtSq.v)); - // } else { - // return VectorF16(_mm512_sqrt_ph(lenghtSq.v)); - // } - // } - - // constexpr static VectorF16 Length( - // VectorF16 A, - // VectorF16 C, - // VectorF16 E, - // VectorF16 G - // ) requires(Packing == 2) { - // VectorF16 lenghtSq = LengthSq(A, C, E, G); - // if constexpr(std::is_same_v) { - // return VectorF16(_mm_sqrt_ph(lenghtSq.v)); - // } else if constexpr(std::is_same_v) { - // return VectorF16(_mm256_sqrt_ph(lenghtSq.v)); - // } else { - // return VectorF16(_mm512_sqrt_ph(lenghtSq.v)); - // } - // } - - // constexpr static VectorF16 Length( - // VectorF16 A, - // VectorF16 E - // ) requires(Packing == 4) { - // VectorF16 lenghtSq = LengthSq(A, E); - // if constexpr(std::is_same_v) { - // return VectorF16(_mm_sqrt_ph(lenghtSq.v)); - // } else if constexpr(std::is_same_v) { - // return VectorF16(_mm256_sqrt_ph(lenghtSq.v)); - // } else { - // return VectorF16(_mm512_sqrt_ph(lenghtSq.v)); - // } - // } - - // constexpr static VectorF16 LengthSq( - // VectorF16 A, - // VectorF16 B, - // VectorF16 C, - // VectorF16 D, - // VectorF16 E, - // VectorF16 F, - // VectorF16 G, - // VectorF16 H - // ) requires(Packing == 1) { - // return Dot(A, A, B, B, C, C, D, D, E, E, F, F, G, G, H, H); - // } - - // constexpr static VectorF16 LengthSq( - // VectorF16 A, - // VectorF16 C, - // VectorF16 E, - // VectorF16 G - // ) requires(Packing == 2) { - // return Dot(A, A, C, C, E, E, G, G); - // } - - // constexpr static VectorF16 LengthSq( - // VectorF16 A, - // VectorF16 E - // ) requires(Packing == 4) { - // return Dot(A, A, E, E); - // } - - // constexpr static VectorF16 Dot( - // VectorF16 A0, VectorF16 A1, - // VectorF16 B0, VectorF16 B1, - // VectorF16 C0, VectorF16 C1, - // VectorF16 D0, VectorF16 D1, - // VectorF16 E0, VectorF16 E1, - // VectorF16 F0, VectorF16 F1, - // VectorF16 G0, VectorF16 G1, - // VectorF16 H0, VectorF16 H1 - // ) requires(Packing == 1) { - // if 
constexpr(std::is_same_v) { - // __m128h mulA = _mm_mul_ph(A0.v, A1.v); - // __m128h mulB = _mm_mul_ph(B0.v, B1.v); - // __m128i row12Temp1 = _mm_unpacklo_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulB)); // A1 B1 A2 B2 A3 B3 A4 B4 - // __m128i row56Temp1 = _mm_unpackhi_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulB)); // A5 B5 A6 B6 A7 B7 A8 B8 - // __m128i row1TempTemp1 = row12Temp1; - // __m128i row5TempTemp1 = row56Temp1; - - // __m128h mulC = _mm_mul_ph(C0.v, C1.v); - // __m128h mulD = _mm_mul_ph(D0.v, D1.v); - // __m128i row34Temp1 = _mm_unpacklo_epi16(_mm_castph_si128(mulC), _mm_castph_si128(mulD)); // C1 D1 C2 D2 C3 D3 C4 D4 - // __m128i row78Temp1 = _mm_unpackhi_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulB)); // C5 D5 C6 D6 C7 D7 C8 D8 - - // row12Temp1 = _mm_unpacklo_epi16(row12Temp1, row34Temp1); // A1 C1 B1 D1 A2 C2 B2 D2 - // row34Temp1 = _mm_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 C3 B3 D3 A4 C4 B4 D4 - // row56Temp1 = _mm_unpacklo_epi16(row56Temp1, row78Temp1); // A5 C5 B5 D5 A6 C6 B6 D6 - // row78Temp1 = _mm_unpackhi_epi16(row5TempTemp1, row78Temp1); // A7 C7 B7 D7 A8 C8 B8 D8 - - // __m128h mulE = _mm_mul_ph(E0.v, E1.v); - // __m128h mulF = _mm_mul_ph(F0.v, F1.v); - // __m128i row12Temp2 = _mm_unpacklo_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulF)); //E1 F1 E2 F2 E3 F3 E4 F4 - // __m128i row56Temp2 = _mm_unpackhi_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulF)); //E5 F5 E6 F6 E7 F7 E8 F8 - // __m128i row1TempTemp2 = row12Temp2; - // __m128i row5TempTemp2 = row56Temp2; - - // __m128h mulG = _mm_mul_ph(G0.v, G1.v); - // __m128h mulH = _mm_mul_ph(H0.v, H1.v); - // __m128i row34Temp2 = _mm_unpacklo_epi16(_mm_castph_si128(mulG), _mm_castph_si128(mulH)); //G1 H1 G2 H2 G3 H3 G4 H4 - // __m128i row78Temp2 = _mm_unpackhi_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulF)); //G5 H5 G6 H6 G7 H7 G8 H8 - - // row12Temp2 = _mm_unpacklo_epi16(row12Temp2, row34Temp2); // E1 G1 F1 H1 E2 G2 F2 H2 - // row34Temp2 = 
_mm_unpackhi_epi16(row1TempTemp2, row34Temp2); // E3 G3 F3 H3 E4 G4 F4 H4 - // row56Temp2 = _mm_unpacklo_epi16(row56Temp2, row78Temp2); // E5 G5 F5 H5 E6 G6 F6 H6 - // row78Temp2 = _mm_unpackhi_epi16(row5TempTemp2, row78Temp2); // E7 G7 F7 H7 E8 G8 F8 H8 - - // __m128h row1 = _mm_castsi128_ph(_mm_unpackhi_epi16(row12Temp1, row12Temp2));// A1 E1 C1 G1 B1 F1 D1 H1 - // __m128h row2 = _mm_castsi128_ph(_mm_unpacklo_epi16(row12Temp1, row12Temp2));// A2 E2 C2 G2 B2 F2 D2 H2 - // __m128h row3 = _mm_castsi128_ph(_mm_unpackhi_epi16(row34Temp1, row34Temp2));// A3 E3 C3 G3 B3 F3 D3 H3 - // __m128h row4 = _mm_castsi128_ph(_mm_unpacklo_epi16(row34Temp1, row34Temp2));// A4 E4 C4 G4 B4 F4 D4 H4 - // __m128h row5 = _mm_castsi128_ph(_mm_unpackhi_epi16(row56Temp1, row56Temp2));// A5 E5 C5 G5 B5 F5 D5 H5 - // __m128h row6 = _mm_castsi128_ph(_mm_unpacklo_epi16(row56Temp1, row56Temp2));// A6 E6 C6 G6 B6 F6 D6 H6 - // __m128h row7 = _mm_castsi128_ph(_mm_unpackhi_epi16(row78Temp1, row78Temp2));// A7 E7 C7 G7 B7 F7 D7 H7 - // __m128h row8 = _mm_castsi128_ph(_mm_unpacklo_epi16(row78Temp1, row78Temp2));// A8 E8 C8 G8 B8 F8 D8 H8 - - // row1 = _mm_add_ph(row1, row2); - // row1 = _mm_add_ph(row1, row3); - // row1 = _mm_add_ph(row1, row4); - // row1 = _mm_add_ph(row1, row5); - // row1 = _mm_add_ph(row1, row6); - // row1 = _mm_add_ph(row1, row7); - // row1 = _mm_add_ph(row1, row8); - - // return row1; - // } else if constexpr(std::is_same_v) { - // __m256h mulA = _mm256_mul_ph(A0.v, A1.v); - // __m256h mulB = _mm256_mul_ph(B0.v, B1.v); - // __m256i row12Temp1 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulB)); // A1 B1 A2 B2 A3 B3 A4 B4 - // __m256i row56Temp1 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulB)); // A5 B5 A6 B6 A7 B7 A8 B8 - // __m256i row1TempTemp1 = row12Temp1; - // __m256i row5TempTemp1 = row56Temp1; - - // __m256h mulC = _mm256_mul_ph(C0.v, C1.v); - // __m256h mulD = _mm256_mul_ph(D0.v, D1.v); - // __m256i row34Temp1 = 
_mm256_unpacklo_epi16(_mm256_castph_si256(mulC), _mm256_castph_si256(mulD)); // C1 D1 C2 D2 C3 D3 C4 D4 - // __m256i row78Temp1 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulB)); // C5 D5 C6 D6 C7 D7 C8 D8 - - // row12Temp1 = _mm256_unpacklo_epi16(row12Temp1, row34Temp1); // A1 C1 B1 D1 A2 C2 B2 D2 - // row34Temp1 = _mm256_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 C3 B3 D3 A4 C4 B4 D4 - // row56Temp1 = _mm256_unpacklo_epi16(row56Temp1, row78Temp1); // A5 C5 B5 D5 A6 C6 B6 D6 - // row78Temp1 = _mm256_unpackhi_epi16(row5TempTemp1, row78Temp1); // A7 C7 B7 D7 A8 C8 B8 D8 - - // __m256h mulE = _mm256_mul_ph(E0.v, E1.v); - // __m256h mulF = _mm256_mul_ph(F0.v, F1.v); - // __m256i row12Temp2 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulE), _mm256_castph_si256(mulF)); //E1 F1 E2 F2 E3 F3 E4 F4 - // __m256i row56Temp2 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulE), _mm256_castph_si256(mulF)); //E5 F5 E6 F6 E7 F7 E8 F8 - // __m256i row1TempTemp2 = row12Temp2; - // __m256i row5TempTemp2 = row56Temp2; - - // __m256h mulG = _mm256_mul_ph(G0.v, G1.v); - // __m256h mulH = _mm256_mul_ph(H0.v, H1.v); - // __m256i row34Temp2 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulG), _mm256_castph_si256(mulH)); //G1 H1 G2 H2 G3 H3 G4 H4 - // __m256i row78Temp2 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulE), _mm256_castph_si256(mulF)); //G5 H5 G6 H6 G7 H7 G8 H8 - - // row12Temp2 = _mm256_unpacklo_epi16(row12Temp2, row34Temp2); // E1 G1 F1 H1 E2 G2 F2 H2 - // row34Temp2 = _mm256_unpackhi_epi16(row1TempTemp2, row34Temp2); // E3 G3 F3 H3 E4 G4 F4 H4 - // row56Temp2 = _mm256_unpacklo_epi16(row56Temp2, row78Temp2); // E5 G5 F5 H5 E6 G6 F6 H6 - // row78Temp2 = _mm256_unpackhi_epi16(row5TempTemp2, row78Temp2); // E7 G7 F7 H7 E8 G8 F8 H8 - - // __m256h row1 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row12Temp1, row12Temp2));// A1 E1 C1 G1 B1 F1 D1 H1 - // __m256h row2 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row12Temp1, row12Temp2));// A2 E2 C2 G2 B2 F2 
D2 H2 - // __m256h row3 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row34Temp1, row34Temp2));// A3 E3 C3 G3 B3 F3 D3 H3 - // __m256h row4 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row34Temp1, row34Temp2));// A4 E4 C4 G4 B4 F4 D4 H4 - // __m256h row5 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row56Temp1, row56Temp2));// A5 E5 C5 G5 B5 F5 D5 H5 - // __m256h row6 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row56Temp1, row56Temp2));// A6 E6 C6 G6 B6 F6 D6 H6 - // __m256h row7 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row78Temp1, row78Temp2));// A7 E7 C7 G7 B7 F7 D7 H7 - // __m256h row8 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row78Temp1, row78Temp2));// A8 E8 C8 G8 B8 F8 D8 H8 - - // row1 = _mm256_add_ph(row1, row2); - // row1 = _mm256_add_ph(row1, row3); - // row1 = _mm256_add_ph(row1, row4); - // row1 = _mm256_add_ph(row1, row5); - // row1 = _mm256_add_ph(row1, row6); - // row1 = _mm256_add_ph(row1, row7); - // row1 = _mm256_add_ph(row1, row8); - - // return row1; - // } else { - // __m512h mulA = _mm512_mul_ph(A0.v, A1.v); - // __m512h mulB = _mm512_mul_ph(B0.v, B1.v); - // __m512i row12Temp1 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulB)); // A1 B1 A2 B2 A3 B3 A4 B4 - // __m512i row56Temp1 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulB)); // A5 B5 A6 B6 A7 B7 A8 B8 - // __m512i row1TempTemp1 = row12Temp1; - // __m512i row5TempTemp1 = row56Temp1; - - // __m512h mulC = _mm512_mul_ph(C0.v, C1.v); - // __m512h mulD = _mm512_mul_ph(D0.v, D1.v); - // __m512i row34Temp1 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulC), _mm512_castph_si512(mulD)); // C1 D1 C2 D2 C3 D3 C4 D4 - // __m512i row78Temp1 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulB)); // C5 D5 C6 D6 C7 D7 C8 D8 - - // row12Temp1 = _mm512_unpacklo_epi16(row12Temp1, row34Temp1); // A1 C1 B1 D1 A2 C2 B2 D2 - // row34Temp1 = _mm512_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 C3 B3 D3 A4 C4 B4 D4 - // row56Temp1 = 
_mm512_unpacklo_epi16(row56Temp1, row78Temp1); // A5 C5 B5 D5 A6 C6 B6 D6 - // row78Temp1 = _mm512_unpackhi_epi16(row5TempTemp1, row78Temp1); // A7 C7 B7 D7 A8 C8 B8 D8 - - // __m512h mulE = _mm512_mul_ph(E0.v, E1.v); - // __m512h mulF = _mm512_mul_ph(F0.v, F1.v); - // __m512i row12Temp2 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulE), _mm512_castph_si512(mulF)); //E1 F1 E2 F2 E3 F3 E4 F4 - // __m512i row56Temp2 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulE), _mm512_castph_si512(mulF)); //E5 F5 E6 F6 E7 F7 E8 F8 - // __m512i row1TempTemp2 = row12Temp2; - // __m512i row5TempTemp2 = row56Temp2; - - // __m512h mulG = _mm512_mul_ph(G0.v, G1.v); - // __m512h mulH = _mm512_mul_ph(H0.v, H1.v); - // __m512i row34Temp2 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulG), _mm512_castph_si512(mulH)); //G1 H1 G2 H2 G3 H3 G4 H4 - // __m512i row78Temp2 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulE), _mm512_castph_si512(mulF)); //G5 H5 G6 H6 G7 H7 G8 H8 - - // row12Temp2 = _mm512_unpacklo_epi16(row12Temp2, row34Temp2); // E1 G1 F1 H1 E2 G2 F2 H2 - // row34Temp2 = _mm512_unpackhi_epi16(row1TempTemp2, row34Temp2); // E3 G3 F3 H3 E4 G4 F4 H4 - // row56Temp2 = _mm512_unpacklo_epi16(row56Temp2, row78Temp2); // E5 G5 F5 H5 E6 G6 F6 H6 - // row78Temp2 = _mm512_unpackhi_epi16(row5TempTemp2, row78Temp2); // E7 G7 F7 H7 E8 G8 F8 H8 - - // __m512h row1 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row12Temp1, row12Temp2));// A1 E1 C1 G1 B1 F1 D1 H1 - // __m512h row2 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row12Temp1, row12Temp2));// A2 E2 C2 G2 B2 F2 D2 H2 - // __m512h row3 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row34Temp1, row34Temp2));// A3 E3 C3 G3 B3 F3 D3 H3 - // __m512h row4 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row34Temp1, row34Temp2));// A4 E4 C4 G4 B4 F4 D4 H4 - // __m512h row5 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row56Temp1, row56Temp2));// A5 E5 C5 G5 B5 F5 D5 H5 - // __m512h row6 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row56Temp1, row56Temp2));// A6 
E6 C6 G6 B6 F6 D6 H6 - // __m512h row7 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row78Temp1, row78Temp2));// A7 E7 C7 G7 B7 F7 D7 H7 - // __m512h row8 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row78Temp1, row78Temp2));// A8 E8 C8 G8 B8 F8 D8 H8 - - // row1 = _mm512_add_ph(row1, row2); - // row1 = _mm512_add_ph(row1, row3); - // row1 = _mm512_add_ph(row1, row4); - // row1 = _mm512_add_ph(row1, row5); - // row1 = _mm512_add_ph(row1, row6); - // row1 = _mm512_add_ph(row1, row7); - // row1 = _mm512_add_ph(row1, row8); - - // return row1; - // } - // } - - // constexpr static VectorF16 Dot( - // VectorF16 A0, VectorF16 A1, - // VectorF16 C0, VectorF16 C1, - // VectorF16 E0, VectorF16 E1, - // VectorF16 G0, VectorF16 G1 - // ) requires(Packing == 2) { - // if constexpr(std::is_same_v) { - // __m128h mulA = _mm_mul_ph(A0.v, A1.v); - // __m128h mulC = _mm_mul_ph(C0.v, C1.v); - // __m128i row12Temp1 = _mm_unpacklo_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulC)); // A1 C1 A2 C2 A3 C3 A4 C4 - // __m128i row34Temp1 = _mm_unpackhi_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulC)); // B1 D1 B2 D2 B3 D3 B4 D4 - // __m128i row1TempTemp1 = row12Temp1; - // __m128i row5TempTemp1 = row34Temp1; - - // __m128h mulE = _mm_mul_ph(E0.v, E1.v); - // __m128h mulG = _mm_mul_ph(G0.v, G1.v); - // __m128i row12Temp2 = _mm_unpacklo_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulG)); // E1 G1 E2 G2 E3 G3 E4 G4 - // __m128i row34Temp2 = _mm_unpackhi_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulG)); // F1 H1 F2 H2 F3 H3 F4 H4 - - // row12Temp1 = _mm_unpacklo_epi16(row12Temp1, row12Temp2); // A1 E1 C1 G1 A2 E2 C2 G2 - // row12Temp2 = _mm_unpacklo_epi16(row34Temp1, row34Temp2); // B1 F1 D1 H1 B2 F2 D2 H2 - // row34Temp1 = _mm_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 E3 C3 G3 A4 E4 C4 G4 - // row34Temp2 = _mm_unpackhi_epi16(row5TempTemp1, row34Temp2); // B3 F3 D3 H3 B4 F4 D4 H4 - - // __m128h row1 = _mm_castsi128_ph(_mm_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 E1 
F1 C1 D1 G1 H1 - // __m128h row2 = _mm_castsi128_ph(_mm_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 E2 F2 C2 D2 G2 H2 - // __m128h row3 = _mm_castsi128_ph(_mm_unpacklo_epi16(row34Temp1, row34Temp2));// A3 B3 E3 F3 C3 D3 G3 H3 - // __m128h row4 = _mm_castsi128_ph(_mm_unpackhi_epi16(row34Temp1, row34Temp2));// A4 B4 E4 F4 C4 D4 G4 H4 - - // row1 = _mm_add_ph(row1, row2); - // row1 = _mm_add_ph(row1, row3); - // row1 = _mm_add_ph(row1, row4); - - // return row1; - // } else if constexpr(std::is_same_v) { - // __m256h mulA = _mm256_mul_ph(A0.v, A1.v); - // __m256h mulC = _mm256_mul_ph(C0.v, C1.v); - // __m256i row12Temp1 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulC)); // A1 C1 A2 C2 A3 C3 A4 C4 - // __m256i row34Temp1 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulC)); // B1 D1 B2 D2 B3 D3 B4 D4 - // __m256i row1TempTemp1 = row12Temp1; - // __m256i row5TempTemp1 = row34Temp1; - - // __m256h mulE = _mm256_mul_ph(E0.v, E1.v); - // __m256h mulG = _mm256_mul_ph(G0.v, G1.v); - // __m256i row12Temp2 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulE), _mm256_castph_si256(mulG)); // E1 G1 E2 G2 E3 G3 E4 G4 - // __m256i row34Temp2 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulE), _mm256_castph_si256(mulG)); // F1 H1 F2 H2 F3 H3 F4 H4 - - // row12Temp1 = _mm256_unpacklo_epi16(row12Temp1, row12Temp2); // A1 E1 C1 G1 A2 E2 C2 G2 - // row12Temp2 = _mm256_unpacklo_epi16(row34Temp1, row34Temp2); // B1 F1 D1 H1 B2 F2 D2 H2 - // row34Temp1 = _mm256_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 E3 C3 G3 A4 E4 C4 G4 - // row34Temp2 = _mm256_unpackhi_epi16(row5TempTemp1, row34Temp2); // B3 F3 D3 H3 B4 F4 D4 H4 - - // __m256h row1 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 E1 F1 C1 D1 G1 H1 - // __m256h row2 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 E2 F2 C2 D2 G2 H2 - // __m256h row3 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row34Temp1, row34Temp2));// 
A3 B3 E3 F3 C3 D3 G3 H3 - // __m256h row4 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row34Temp1, row34Temp2));// A4 B4 E4 F4 C4 D4 G4 H4 - - // row1 = _mm256_add_ph(row1, row2); - // row1 = _mm256_add_ph(row1, row3); - // row1 = _mm256_add_ph(row1, row4); - - // return row1; - // } else { - // __m512h mulA = _mm512_mul_ph(A0.v, A1.v); - // __m512h mulC = _mm512_mul_ph(C0.v, C1.v); - // __m512i row12Temp1 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulC)); // A1 C1 A2 C2 A3 C3 A4 C4 - // __m512i row34Temp1 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulC)); // B1 D1 B2 D2 B3 D3 B4 D4 - // __m512i row1TempTemp1 = row12Temp1; - // __m512i row5TempTemp1 = row34Temp1; - - // __m512h mulE = _mm512_mul_ph(E0.v, E1.v); - // __m512h mulG = _mm512_mul_ph(G0.v, G1.v); - // __m512i row12Temp2 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulE), _mm512_castph_si512(mulG)); // E1 G1 E2 G2 E3 G3 E4 G4 - // __m512i row34Temp2 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulE), _mm512_castph_si512(mulG)); // F1 H1 F2 H2 F3 H3 F4 H4 - - // row12Temp1 = _mm512_unpacklo_epi16(row12Temp1, row12Temp2); // A1 E1 C1 G1 A2 E2 C2 G2 - // row12Temp2 = _mm512_unpacklo_epi16(row34Temp1, row34Temp2); // B1 F1 D1 H1 B2 F2 D2 H2 - // row34Temp1 = _mm512_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 E3 C3 G3 A4 E4 C4 G4 - // row34Temp2 = _mm512_unpackhi_epi16(row5TempTemp1, row34Temp2); // B3 F3 D3 H3 B4 F4 D4 H4 - - // __m512h row1 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 E1 F1 C1 D1 G1 H1 - // __m512h row2 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 E2 F2 C2 D2 G2 H2 - // __m512h row3 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row34Temp1, row34Temp2));// A3 B3 E3 F3 C3 D3 G3 H3 - // __m512h row4 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row34Temp1, row34Temp2));// A4 B4 E4 F4 C4 D4 G4 H4 - - // row1 = _mm512_add_ph(row1, row2); - // row1 = _mm512_add_ph(row1, row3); - 
// row1 = _mm512_add_ph(row1, row4); - - // return row1; - // } - // } - - // constexpr static VectorF16 Dot( - // VectorF16 A0, VectorF16 A1, - // VectorF16 E0, VectorF16 E1 - // ) requires(Packing == 4) { - // if constexpr(std::is_same_v) { - // __m128h mulA = _mm_mul_ph(A0.v, A1.v); - // __m128h mulE = _mm_mul_ph(E0.v, E1.v); - // __m128i row12Temp1 = _mm_unpacklo_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulE)); // A1 E1 A2 E2 B1 F1 B2 F2 - // __m128i row12Temp2 = _mm_unpackhi_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulE)); // C1 G1 C2 G2 D1 H1 D2 H2 - // __m128i row12Temp1Temp = row12Temp1; - - // row12Temp1 = _mm_unpacklo_epi16(row12Temp1, row12Temp2); // A1 C1 E1 G1 A2 C2 E2 G2 - // row12Temp2 = _mm_unpackhi_epi16(row12Temp1Temp, row12Temp2); // B1 D1 F1 H1 B2 D2 F2 H2 - - // __m128h row1 = _mm_castsi128_ph(_mm_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 C1 D1 E1 F1 G1 H1 - // __m128h row2 = _mm_castsi128_ph(_mm_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 C2 D2 E2 F2 G2 H2 - - // return _mm_add_ph(row1, row2); - // } else if constexpr(std::is_same_v) { - // __m256h mulA = _mm256_mul_ph(A0.v, A1.v); - // __m256h mulE = _mm256_mul_ph(E0.v, E1.v); - // __m256i row12Temp1 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulE)); // A1 E1 A2 E2 B1 F1 B2 F2 - // __m256i row12Temp2 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulE)); // C1 G1 C2 G2 D1 H1 D2 H2 - // __m256i row12Temp1Temp = row12Temp1; - - // row12Temp1 = _mm256_unpacklo_epi16(row12Temp1, row12Temp2); // A1 C1 E1 G1 A2 C2 E2 G2 - // row12Temp2 = _mm256_unpackhi_epi16(row12Temp1Temp, row12Temp2); // B1 D1 F1 H1 B2 D2 F2 H2 - - // __m256h row1 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 C1 D1 E1 F1 G1 H1 - // __m256h row2 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 C2 D2 E2 F2 G2 H2 - - // return _mm256_add_ph(row1, row2); - // } else { - // __m512h mulA = 
_mm512_mul_ph(A0.v, A1.v); - // __m512h mulE = _mm512_mul_ph(E0.v, E1.v); - // __m512i row12Temp1 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulE)); // A1 E1 A2 E2 B1 F1 B2 F2 - // __m512i row12Temp2 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulE)); // C1 G1 C2 G2 D1 H1 D2 H2 - // __m512i row12Temp1Temp = row12Temp1; - - // row12Temp1 = _mm512_unpacklo_epi16(row12Temp1, row12Temp2); // A1 C1 E1 G1 A2 C2 E2 G2 - // row12Temp2 = _mm512_unpackhi_epi16(row12Temp1Temp, row12Temp2); // B1 D1 F1 H1 B2 D2 F2 H2 - - // __m512h row1 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 C1 D1 E1 F1 G1 H1 - // __m512h row2 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 C2 D2 E2 F2 G2 H2 - - // return _mm512_add_ph(row1, row2); - // } - // } - - // template - // constexpr static VectorF16 Blend(VectorF16 a, VectorF16 b) { - // if constexpr(std::is_same_v) { - // constexpr std::uint8_t val = - // (A & 1) | - // ((B & 1) << 1) | - // ((C & 1) << 2) | - // ((D & 1) << 3) | - // ((E & 1) << 4) | - // ((F & 1) << 5) | - // ((G & 1) << 6) | - // ((H & 1) << 7); - // return _mm_castsi128_ph(_mm_blend_epi16(_mm_castph_si128(a.v), _mm_castph_si128(b), val)); - // } else if constexpr(std::is_same_v) { - // constexpr std::uint8_t val = - // (A & 1) | - // ((B & 1) << 1) | - // ((C & 1) << 2) | - // ((D & 1) << 3) | - // ((E & 1) << 4) | - // ((F & 1) << 5) | - // ((G & 1) << 6) | - // ((H & 1) << 7); - // return _mm256_castsi256_ph(_mm256_blend_epi16(_mm256_castph_si256(a.v), _mm256_castph_si256(b), val)); - // } else { - // constexpr std::uint8_t byte = - // (A & 1) | - // ((B & 1) << 1) | - // ((C & 1) << 2) | - // ((D & 1) << 3) | - // ((E & 1) << 4) | - // ((F & 1) << 5) | - // ((G & 1) << 6) | - // ((H & 1) << 7); - - // constexpr std::uint32_t val = byte * 0x01010101u; - // return _mm512_castsi512_ph(_mm512_mask_blend_epi16(val, _mm512_castph_si512(a.v), 
_mm512_castph_si512(b))); - // } - // } - - // template - // constexpr static VectorF16 BlendPacked(VectorF16 a, VectorF16 b) requires(Packing == 2) { - // if constexpr(std::is_same_v) { - // constexpr std::uint8_t val = - // (A & 1) | - // ((B & 1) << 1) | - // ((C & 1) << 2) | - // ((D & 1) << 3) | - // ((A & 1) << 4) | - // ((B & 1) << 5) | - // ((C & 1) << 6) | - // ((D & 1) << 7); - // return _mm_castsi128_ph(_mm_blend_epi16(_mm_castph_si128(a.v), _mm_castph_si128(b), val)); - // } else if constexpr(std::is_same_v) { - // constexpr std::uint8_t val = - // (A & 1) | - // ((B & 1) << 1) | - // ((C & 1) << 2) | - // ((D & 1) << 3) | - // ((A & 1) << 4) | - // ((B & 1) << 5) | - // ((C & 1) << 6) | - // ((D & 1) << 7); - // return _mm256_castsi256_ph(_mm256_blend_epi16(_mm256_castph_si256(a.v), _mm256_castph_si256(b), val)); - // } else { - // constexpr std::uint8_t val = - // (A & 1) | - // ((B & 1) << 1) | - // ((C & 1) << 2) | - // ((D & 1) << 3) | - // ((A & 1) << 4) | - // ((B & 1) << 5) | - // ((C & 1) << 6) | - // ((D & 1) << 7); - - // constexpr std::uint32_t val = byte * 0x01010101u; - // return _mm512_castsi512_ph(_mm512_mask_blend_epi16(val, _mm512_castph_si512(a.v), _mm512_castph_si512(b))); - // } - // } - - // template - // constexpr static VectorF16 BlendPacked(VectorF16 a, VectorF16 b) requires(Packing == 4) { - // if constexpr(std::is_same_v) { - // constexpr std::uint8_t val = - // (A & 1) | - // ((B & 1) << 1) | - // ((C & 1) << 2) | - // ((D & 1) << 3) | - // ((A & 1) << 4) | - // ((B & 1) << 5) | - // ((C & 1) << 6) | - // ((D & 1) << 7); - // return _mm_castsi128_ph(_mm_blend_epi16(_mm_castph_si128(a.v), _mm_castph_si128(b), val)); - // } else if constexpr(std::is_same_v) { - // constexpr std::uint8_t val = - // (A & 1) | - // ((B & 1) << 1) | - // ((C & 1) << 2) | - // ((D & 1) << 3) | - // ((A & 1) << 4) | - // ((B & 1) << 5) | - // ((C & 1) << 6) | - // ((D & 1) << 7); - // return 
_mm256_castsi256_ph(_mm256_blend_epi16(_mm256_castph_si256(a.v), _mm256_castph_si256(b), val)); - // } else { - // constexpr std::uint8_t val = - // (A & 1) | - // ((B & 1) << 1) | - // ((C & 1) << 2) | - // ((D & 1) << 3) | - // ((A & 1) << 4) | - // ((B & 1) << 5) | - // ((C & 1) << 6) | - // ((D & 1) << 7); - - // constexpr std::uint32_t val = byte * 0x01010101u; - // return _mm512_castsi512_ph(_mm512_mask_blend_epi16(val, _mm512_castph_si512(a.v), _mm512_castph_si512(b))); - // } - // } - - // constexpr static VectorF16 Rotate(VectorF16<3, 2, Repeats> v, VectorF16<4, 2, Repeats> q) requires(Len == 3 && Packing == 2) { - // VectorF16<3, 2, Repeats> qv(q.v); - // VectorF16 t = Cross(qv, v) * _Float16(2); - // return v + t * q.template Shuffle<3,3,3,3,7,7,7,7>(); + Cross(qv, t); - // } - - // constexpr static VectorF16<4, 2, Repeats> RotatePivot(VectorF16<3, 2, Repeats> v, VectorF16<4, 2, Repeats> q, VectorF16<3, 2, Repeats> pivot) requires(Len == 3 && Packing == 2) { - // VectorF16 translated = v - pivot; - // VectorF16<3, 2, Repeats> qv(q.v); - // VectorF16 t = Cross(qv, translated) * _Float16(2); - // VectorF16 rotated = translated + t * q.template Shuffle<3,3,3,3,7,7,7,7>() + Cross(qv, t); - // return rotated + pivot; - // } - - // constexpr static VectorF16<4, 2, Repeats> QuanternionFromEuler(VectorF16<3, 2, Repeats> EulerHalf) requires(Len == 3 && Packing == 2) { - // VectorF16<3, 2, Repeats> sin = EulerHalf.Sin(); - // VectorF16<3, 2, Repeats> cos = EulerHalf.Cos(); - - // VectorF16<3, 2, Repeats> row1 = cos.template Shuffle<0,0,0,0,4,4,4,4>(); - // row1 = VectorF16<3, 2, Repeats>::Blend<0,1,1,1, 0,1,1,1>(sin, row1); - - // VectorF16<3, 2, Repeats> row2 = cos.template Shuffle<1,1,1,1,5,5,5,5>(); - // row2 = VectorF16<3, 2, Repeats>::Blend<1,0,1,1, 1,0,1,1>(sin, row2); - - // row1 = row2; - - // VectorF16<3, 2, Repeats> row3 = cos.template Shuffle<2,2,2,2,6,6,6,6>(); - // row3 = VectorF16<3, 2, Repeats>::Blend<1,1,0,1, 1,1,0,1>(sin, row3); - - // 
VectorF16<3, 2, Repeats> row4 = sin.template Shuffle<0,0,0,0,4,4,4,4>(); - // row4 = VectorF16<3, 2, Repeats>::Blend<1,0,0,0, 1,0,0,0>(sin, row4); - - - // if constexpr(std::is_same_v) { - // constexpr std::uint64_t mask[] {0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000}; - // __m128i sign_mask = _mm_load_si128(reinterpret_cast(mask)); - // row4.v = (_mm_castsi128_ph(_mm_xor_si128(sign_mask, _mm_castph_si128(row4.v)))); - // } else if constexpr(std::is_same_v) { - // constexpr std::uint64_t mask[] {0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000}; - // __m256i sign_mask = _mm256_load_si256(reinterpret_cast(mask)); - // row4.v = (_mm256_castsi256_ph(_mm256_xor_si256(sign_mask, _mm256_castph_si256(row4.v)))); - // } else { - // constexpr std::uint64_t mask[] {0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000, 0b1000000000000000000000000000000010000000000000000000000000000000}; - // __m512i sign_mask = _mm512_load_si512(reinterpret_cast(mask)); - // row4.v = (_mm512_castsi512_ph(_mm512_xor_si512(sign_mask, _mm512_castph_si512(row4.v)))); - // } - - // row1 = MulitplyAdd(row1, row3, row4); - - // VectorF16<3, 2, Repeats> row5 = sin.template Shuffle<1,1,1,1,5,5,5,5>(); - // row5 = VectorF16<3, 2, Repeats>::Blend<0,1,0,0, 0,1,0,0>(sin, row5); - - // 
row1 *= row5; - - // VectorF16<3, 2, Repeats> row6 = sin.template Shuffle<2,2,2,2,6,6,6,6>(); - // row6 = VectorF16<3, 2, Repeats>::Blend<0,0,1,0, 0,0,1,0>(sin, row6); - - // return row1 * row6; - // } + template ShuffleValues> + constexpr static VectorF16 Blend(VectorF16 a, VectorF16 b) { + if constexpr(std::is_same_v) { + return _mm_castsi128_ph(_mm_blend_epi16(GetBlendMaskEpi16(), _mm_castph_si128(a.v), _mm_castph_si128(b))); + } else if constexpr(std::is_same_v) { + #ifndef __AVX512BW__ + #ifndef __AVX512VL__ + static_assert(false, "No __AVX512BW__ and __AVX512VL__ support"); + #endif + #endif + return _mm256_castsi256_ph(_mm256_mask_blend_epi16(_mm256_castph_si256(a.v), _mm256_castph_si256(b), GetBlendMaskEpi16())); + } else { + return _mm512_castsi512_ph(_mm512_blend_epi16(GetBlendMaskEpi16(), _mm512_castph_si512(a.v), _mm512_castph_si512(b))); + } + } + + constexpr static VectorF16 Rotate(VectorF16<3, Packing> v, VectorF16<4, Packing> q) requires(Len == 3) { + VectorF16<3, Packing> qv(q.v); + VectorF16 t = Cross(qv, v) * _Float16(2); + return v + t * q.template Shuffle<{{3,3,3,3}}>() + Cross(qv, t); + } + + constexpr static VectorF16<4, 2> RotatePivot(VectorF16<3, Packing> v, VectorF16<4, Packing> q, VectorF16<3, Packing> pivot) requires(Len == 3) { + VectorF16 translated = v - pivot; + VectorF16<3, Packing> qv(q.v); + VectorF16 t = Cross(qv, translated) * _Float16(2); + VectorF16 rotated = translated + t * q.template Shuffle<{{3,3,3,3}}>() + Cross(qv, t); + return rotated + pivot; + } + + constexpr static VectorF16<4, Packing> QuanternionFromEuler(VectorF16<3, Packing> EulerHalf) requires(Len == 4) { + VectorF16<4, Packing> sin = EulerHalf.Sin(); + VectorF16<4, Packing> cos = EulerHalf.Cos(); + + VectorF16<4, Packing> row1 = cos.template Shuffle<{{0,0,0,0}}>(); + row1 = Blend<{{0,1,1,1}}>(sin, row1); + + VectorF16<4, Packing> row2 = cos.template Shuffle<{{1,1,1,1}}>(); + row2 = Blend<{{1,0,1,1}}>(sin, row2); + + row1 = row2; + + VectorF16<4, Packing> row3 = 
cos.template Shuffle<{{2,2,2,2}}>(); + row3 = Blend<{{1,1,0,1}}>(sin, row3); + + row1 *= row3; + + VectorF16<4, Packing> row4 = sin.template Shuffle<{{0,0,0,0}}>(); + row4 = Blend<{{1,0,0,0}}>(sin, row4); + row1 *= row4; + row1 = row1.template Negate<{{true,false,true}}>(); + + VectorF16<4, Packing> row5 = sin.template Shuffle<{{1,1,1,1}}>(); + row5 = Blend<{{0,1,0,0}}>(sin, row5); + + VectorF16<4, Packing> row6 = sin.template Shuffle<{{2,2,2,2}}>(); + row6 = Blend<{{0,0,1,0}}>(sin, row6); + + row1 = MulitplyAdd(row5, row6, row1); + + return row1; + } + private: + template values> + static consteval std::array GetNegateMask() { + std::array mask; + for(std::uint8_t i = 0; i < Len; i++) { + if(values[i]) { + mask[i] = 0b1000000000000000; + } else { + mask[i] = 0; + } + } + return mask; + } + + static consteval std::array GetNegateMaskAll() { + std::array mask; + for(std::uint8_t i = 0; i < Len; i++) { + mask[i] = 0b1000000000000000; + } + return mask; + } + + template ShuffleValues> + static consteval bool GetShuffleMaskEpi32() { + std::uint8_t mask = 0; + for(std::uint8_t i = 0; i < std::min(Len, std::uint32_t(8)); i+=2) { + mask = mask | (ShuffleValues[i] & 0b11) << i; + } + return mask; + } + + template ShuffleValues> + static consteval std::array GetShuffleMaskEpi8() requires (std::is_same_v){ + std::array shuffleMask {{0}}; + for(std::uint8_t i2 = 0; i2 < Packing; i2++) { + for(std::uint8_t i = 0; i < Len; i++) { + shuffleMask[(i2*Len*2)+(i*2)] = ShuffleValues[i]*2+(i2*Len*2); + shuffleMask[(i2*Len*2)+(i*2+1)] = ShuffleValues[i]*2+1+(i2*Len*2); + } + } + return shuffleMask; + } + + template ShuffleValues> + static consteval std::array GetShuffleMaskEpi8() requires (std::is_same_v){ + std::array shuffleMask {{0}}; + for(std::uint8_t i2 = 0; i2 < Packing; i2++) { + for(std::uint8_t i = 0; i < Len; i++) { + shuffleMask[(i2*Len*2)+(i*2)] = ShuffleValues[i]*2+(i2*Len*2); + shuffleMask[(i2*Len*2)+(i*2+1)] = ShuffleValues[i]*2+1+(i2*Len*2); + } + } + return 
shuffleMask; + } + + template ShuffleValues> + static consteval std::array GetShuffleMaskEpi8() requires (std::is_same_v){ + std::array shuffleMask {{0}}; + for(std::uint8_t i2 = 0; i2 < Packing; i2++) { + for(std::uint8_t i = 0; i < Len; i++) { + shuffleMask[(i2*Len*2)+(i*2)] = ShuffleValues[i]*2+(i2*Len*2); + shuffleMask[(i2*Len*2)+(i*2+1)] = ShuffleValues[i]*2+1+(i2*Len*2); + } + } + return shuffleMask; + } + + consteval std::array GetAllTrue() { + std::array arr{}; + arr.fill(true); + return arr; + } + + template ShuffleValues> + static consteval bool CheckEpi32Shuffle() { + for(std::uint8_t i = 1; i < Len; i+=2) { + if(ShuffleValues[i-1] != ShuffleValues[i] - 1) { + return false; + } + } + for(std::uint8_t i = 0; i < Len; i++) { + for(std::uint8_t i2 = 0; i2 < Len; i2 += 8) { + if(ShuffleValues[i] != ShuffleValues[i2]) { + return false; + } + } + } + return true; + } + + template ShuffleValues> + static consteval std::uint8_t GetBlendMaskEpi16() requires (std::is_same_v){ + std::uint8_t mask = 0; + for (std::uint8_t i2 = 0; i2 < Packing; i2++) { + for (std::uint8_t i = 0; i < Len; i++) { + if (ShuffleValues[i]) { + mask |= (1u << (i2 * Len + i)); + } + } + } + return mask; + } + + template ShuffleValues> + static consteval std::uint16_t GetBlendMaskEpi16() requires (std::is_same_v){ + std::uint16_t mask = 0; + for (std::uint8_t i2 = 0; i2 < Packing; i2++) { + for (std::uint8_t i = 0; i < Len; i++) { + if (ShuffleValues[i]) { + mask |= (1u << (i2 * Len + i)); + } + } + } + return mask; + } + + template ShuffleValues> + static consteval std::uint32_t GetBlendMaskEpi16() requires (std::is_same_v){ + std::uint32_t mask = 0; + for (std::uint8_t i2 = 0; i2 < Packing; i2++) { + for (std::uint8_t i = 0; i < Len; i++) { + if (ShuffleValues[i]) { + mask |= (1u << (i2 * Len + i)); + } + } + } + return mask; + } }; } diff --git a/interfaces/main.cpp b/interfaces/main.cpp index 7dfbdc1..3e0d8b7 100644 --- a/interfaces/main.cpp +++ b/interfaces/main.cpp @@ -5,78 +5,5 @@ 
import std; using namespace Crafter; int main() { - // _Float16 test[] {2,1,2, 2,1,2, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - // _Float16 test2[] {2,3,3, 2,5,21, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - // VectorF16L<3,2> vec(test); - // VectorF16L<3,2> vec2(test2); - // VectorF16L<3,2> result = VectorF16L<3,2>::Cross(vec, vec2); - - - // Vector test5(2,1,2); - // Vector test6(2,3,3); - // Vector test3 = Vector::Cross(test5, test6); - - - - //VectorF16L<3,2> result = vec + vec2; - //std::println("{}\n{}", result, test3); - // std::random_device rd; - // std::mt19937 gen(rd()); - // std::uniform_real_distribution dist(0, 100); - - // Vector<_Float16, 8, 8> vA; - // for(std::uint32_t i = 0; i < 8; i++) { - // vA.v[i] = dist(gen); - // } - // VectorF16<4, 2, 1> vfA(&vA); - - // Vector<_Float16, 16, 16> vB; - // for(std::uint32_t i = 0; i < 16; i++) { - // vB.v[i] = dist(gen); - // } - // VectorF16<4, 2, 2> vfB(&vB); - - // VectorF16<4, 2, 1> vfC = vfA + vfB; - // auto start = std::chrono::high_resolution_clock::now(); - // for(std::uint32_t i = 0; i < 90000000; i++) { - // vfC = vfC + vfB; - // } - // auto end = std::chrono::high_resolution_clock::now(); - // std::cout << std::chrono::duration_cast(end-start) << std::endl; - // std::println("{}", vfC); - - // std::random_device rd; - // std::mt19937 gen(rd()); - // std::uniform_real_distribution dist(0, 100); - - // Vector<_Float16, 32, 32> vA; - // for(std::uint32_t i = 0; i < 32; i++) { - // vA.v[i] = dist(gen); - // } - - // std::string log; - // std::chrono::duration totalVector(0); - // std::tuple, VectorF16<4, 2, 4>, VectorF16<4, 2, 4>, VectorF16<4, 2, 4>> vfA {VectorF16<4, 2, 4>(&vA), VectorF16<4, 2, 4>(&vA), VectorF16<4, 2, 4>(&vA), VectorF16<4, 2, 4>(&vA)}; - // for(std::uint32_t i = 0; i < 1000000; i++) { - // auto start = std::chrono::high_resolution_clock::now(); - // vfA = VectorF16<4, 2, 4>::Normalize(std::get<0>(vfA), std::get<1>(vfA), std::get<2>(vfA), std::get<3>(vfA)); - // auto end = 
std::chrono::high_resolution_clock::now(); - // totalVector += end-start; - // } - - // std::chrono::duration totalScalar(0); - // Vector<_Float16, 4, 4> vB; - // for(std::uint32_t i = 0; i < 4; i++) { - // vB.v[i] = dist(gen); - // } - // for(std::uint32_t i = 0; i < 1000000; i++) { - // auto start2 = std::chrono::high_resolution_clock::now(); - // vB.Normalize(); - // auto end2 = std::chrono::high_resolution_clock::now(); - // totalScalar += end2-start2; - // } - - // std::println("{} {} {} {}", std::get<0>(vfA), std::get<1>(vfA), std::get<2>(vfA), std::get<3>(vfA)); - // std::println("{}", vB); - // std::println("Vector: {}, Scalar: {}", std::chrono::duration_cast(totalVector), std::chrono::duration_cast(totalScalar*8)); + } \ No newline at end of file diff --git a/project.json b/project.json index adaeb23..263f1dc 100644 --- a/project.json +++ b/project.json @@ -1,5 +1,5 @@ { - "name": "crafter-match", + "name": "crafter-math", "configurations": [ { "name": "base", @@ -20,16 +20,24 @@ "type":"library", "dependencies": [] }, + { + "name": "lib-shared", + "extends": ["base"], + "type":"shared-library", + "dependencies": [] + }, { "name": "lib-debug", "extends": ["lib"], "debug": true - }, + } + ], + "tests":[ { - "name": "test", - "implementations": ["interfaces/main"], - "extends": ["base"], - "debug": true + "name": "F16x86", + "implementations": ["tests/VectorF16"], + "march": "sapphirerapids", + "extends": ["lib-shared"] } ] } \ No newline at end of file diff --git a/tests/VectorF16.cpp b/tests/VectorF16.cpp new file mode 100644 index 0000000..30ac645 --- /dev/null +++ b/tests/VectorF16.cpp @@ -0,0 +1,52 @@ +/* +Crafter® Build +Copyright (C) 2026 Catcrafts® +Catcrafts.net + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License version 3.0 as published by the Free Software Foundation; + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; 
without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ +import Crafter.Math; +import std; +using namespace Crafter; + +extern "C" { + std::string* RunTest() { + { + _Float16 floats[] {0,1,2,3,4,5,6,7,8}; + VectorF16<8, 1> vec1(floats); + + Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = vec1.Store(); + for(std::uint8_t i = 0; i < 8; i++) { + if(stored.v[i] != floats[i]) { + return new std::string("Load Store does not match"); + } + } + } + + { + _Float16 floats[] {0,1,2,3,4,5,6,7,8}; + VectorF16<8, 1> vec1(floats); + VectorF16<8, 1> result = vec1 + vec1; + Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = result.Store(); + for(std::uint8_t i = 0; i < 8; i++) { + if(stored.v[i] != floats[i] + floats[i]) { + return new std::string("Add does not match"); + } + } + } + return nullptr; + } +} + +