From c54ff6228c66b602dc39eb722e4f8c2c5cf8d510 Mon Sep 17 00:00:00 2001 From: Jorijn van der Graaf Date: Thu, 19 Mar 2026 03:22:22 +0100 Subject: [PATCH] more F16 math --- interfaces/Crafter.Math-VectorF16.cppm | 418 +++++++++++++++++++++++-- 1 file changed, 397 insertions(+), 21 deletions(-) diff --git a/interfaces/Crafter.Math-VectorF16.cppm b/interfaces/Crafter.Math-VectorF16.cppm index 814ea5e..b1f9e9a 100755 --- a/interfaces/Crafter.Math-VectorF16.cppm +++ b/interfaces/Crafter.Math-VectorF16.cppm @@ -263,27 +263,72 @@ namespace Crafter { return Dot(*this, *this); } - // template - // constexpr static Vector Cross(Vector a, Vector b) requires(Len == 3 && Alen >= 3 && Blen >= 3) { - // return Vector( - // (a.v[1] * b.v[2]) - (a.v[2] * b.v[1]), - // (a.v[2] * b.v[0]) - (a.v[0] * b.v[2]), - // (a.v[0] * b.v[1]) - (a.v[1] * b.v[0]) - // ); - // } + constexpr static VectorF16 Cross(VectorF16 a, VectorF16 b) requires(Len == 3 && Packing == 2) { + if constexpr(Len == 3) { + if constexpr(Repeats == 1) { + constexpr std::uint8_t shuffleMask1[] { + 2,3,4,5,0,1,6,7,11,12,13,14,9,10,15,16 + }; + __m128i shuffleVec1 = _mm_loadu_epi8(shuffleMask1); + __m128h row1 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(a.v), shuffleVec1)); + __m128h row4 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(b.v), shuffleVec1)); - // template - // constexpr static Vector Normalize(Vector a) requires(Len == Alen) { - // Vector returned; - // T fLength = a.Length(); - - // fLength = 1.0f / fLength; + constexpr std::uint8_t shuffleMask3[] { + 4,5,0,1,2,3,6,7,13,14,8,9,11,12,15,16 + }; + __m128i shuffleVec3 = _mm_loadu_epi8(shuffleMask3); + __m128h row3 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(a.v), shuffleVec3)); + __m128h row2 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(b.v), shuffleVec3)); - // for(std::uint32_t i = 0; i < Len; i++) { - // returned.v[i] = a.v[i] * fLength; - // } - // return returned; - // } + __m128h result = _mm_mul_ph(row3, row4); + return _mm_fmsub_ph(row1,row2,result); + } + if constexpr(Repeats == 2) { + constexpr std::uint8_t shuffleMask1[] { + 2,3,4,5,0,1,6,7,11,12,13,14,9,10,15,16, + 2,3,4,5,0,1,6,7,11,12,13,14,9,10,15,16 + }; + __m256i shuffleVec1 = _mm256_loadu_epi8(shuffleMask1); + __m256h row1 = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(a.v), shuffleVec1)); + __m256h row4 = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(b.v), shuffleVec1)); + + constexpr std::uint8_t shuffleMask3[] { + 4,5,0,1,2,3,6,7,13,14,8,9,11,12,15,16, + 4,5,0,1,2,3,6,7,13,14,8,9,11,12,15,16 + }; + __m256i shuffleVec3 = _mm256_loadu_epi8(shuffleMask3); + __m256h row3 = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(a.v), shuffleVec3)); + __m256h row2 = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(b.v), shuffleVec3)); + + __m256h result = _mm256_mul_ph(row3, row4); + return _mm256_fmsub_ph(row1,row2,result); + } + if constexpr(Repeats == 4) { + constexpr std::uint8_t shuffleMask1[] { + 2,3,4,5,0,1,6,7,11,12,13,14,9,10,15,16, + 2,3,4,5,0,1,6,7,11,12,13,14,9,10,15,16, + 2,3,4,5,0,1,6,7,11,12,13,14,9,10,15,16, + 2,3,4,5,0,1,6,7,11,12,13,14,9,10,15,16 + }; + __m512i shuffleVec1 = _mm512_loadu_epi8(shuffleMask1); + __m512h row1 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(a.v), shuffleVec1)); + __m512h row4 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(b.v), shuffleVec1)); + + constexpr std::uint8_t shuffleMask3[] { + 4,5,0,1,2,3,6,7,13,14,8,9,11,12,15,16, + 4,5,0,1,2,3,6,7,13,14,8,9,11,12,15,16, + 4,5,0,1,2,3,6,7,13,14,8,9,11,12,15,16, + 4,5,0,1,2,3,6,7,13,14,8,9,11,12,15,16 + }; + __m512i shuffleVec3 = _mm512_loadu_epi8(shuffleMask3); + __m512h row3 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(a.v), shuffleVec3)); + __m512h row2 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(b.v), shuffleVec3)); + + __m512h result = _mm512_mul_ph(row3, row4); + return _mm512_fmsub_ph(row1,row2,result); + } + } + } constexpr static _Float16 Dot(VectorF16 a, VectorF16 b) { if constexpr(std::is_same_v) { @@ -535,6 +580,338 @@ namespace Crafter { } } + constexpr static std::tuple, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16> Normalize( + VectorF16 A, + VectorF16 C, + VectorF16 E, + VectorF16 G + ) requires(Packing == 2) { + if constexpr(std::is_same_v) { + VectorF16 lenght = Length(A, C, E, G); + constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1}; + __m128h one = _mm_loadu_ph(oneArr); + __m128h fLenght = _mm_div_ph(one, lenght.v); + + constexpr std::uint8_t shuffleMaskA[] { + 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3 + }; + __m128i shuffleVecA = _mm_loadu_epi8(shuffleMaskA); + __m128h fLenghtA = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA)); + + constexpr std::uint8_t shuffleMaskC[] { + 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7 + }; + __m128i shuffleVecC = _mm_loadu_epi8(shuffleMaskC); + __m128h fLenghtC = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecC)); + + constexpr std::uint8_t shuffleMaskE[] { + 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11 + }; + __m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE); + __m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE)); + + constexpr std::uint8_t shuffleMaskG[] { + 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, + }; + __m128i shuffleVecG = _mm_loadu_epi8(shuffleMaskG); + __m128h fLenghtG = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecG)); + + return { + _mm_mul_ph(A.v, fLenghtA), + _mm_mul_ph(C.v, fLenghtC), + _mm_mul_ph(E.v, fLenghtE), + _mm_mul_ph(G.v, fLenghtG), + }; + } else if constexpr(std::is_same_v) { + VectorF16 lenght = Length(A, C, E, G); + constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + __m256h one = _mm256_loadu_ph(oneArr); + __m256h fLenght = _mm256_div_ph(one, lenght.v); + + constexpr std::uint8_t shuffleMaskA[] { + 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3, + 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3 + }; + __m256i shuffleVecA = _mm256_loadu_epi8(shuffleMaskA); + __m256h fLenghtA = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecA)); + + constexpr std::uint8_t shuffleMaskC[] { + 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7, + 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7 + }; + __m256i shuffleVecC = _mm256_loadu_epi8(shuffleMaskC); + __m256h fLenghtC = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecC)); + + constexpr std::uint8_t shuffleMaskE[] { + 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11, + 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11 + }; + __m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE); + __m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE)); + + constexpr std::uint8_t shuffleMaskG[] { + 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, + 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, + }; + __m256i shuffleVecG = _mm256_loadu_epi8(shuffleMaskG); + __m256h fLenghtG = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecG)); + + return { + _mm256_mul_ph(A.v, fLenghtA), + _mm256_mul_ph(C.v, fLenghtC), + _mm256_mul_ph(E.v, fLenghtE), + _mm256_mul_ph(G.v, fLenghtG), + }; + } else { + VectorF16 lenght = Length(A, C, E, G); + constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + __m512h one = _mm512_loadu_ph(oneArr); + __m512h fLenght = _mm512_div_ph(one, lenght.v); + + constexpr std::uint8_t shuffleMaskA[] { + 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3, + 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3, + 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3, + 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3 + }; + __m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA); + __m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA)); + + constexpr std::uint8_t shuffleMaskC[] { + 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7, + 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7, + 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7, + 4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7 + }; + __m512i shuffleVecC = _mm512_loadu_epi8(shuffleMaskC); + __m512h fLenghtC = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecC)); + + constexpr std::uint8_t shuffleMaskE[] { + 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11, + 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11, + 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11, + 8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11 + }; + __m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE); + __m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE)); + + constexpr std::uint8_t shuffleMaskG[] { + 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, + 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, + 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, + 12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15, + }; + __m512i shuffleVecG = _mm512_loadu_epi8(shuffleMaskG); + __m512h fLenghtG = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecG)); + + return { + _mm512_mul_ph(A.v, fLenghtA), + _mm512_mul_ph(C.v, fLenghtC), + _mm512_mul_ph(E.v, fLenghtE), + _mm512_mul_ph(G.v, fLenghtG), + }; + } + } + + constexpr static std::tuple, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16> Normalize( + VectorF16 A, + VectorF16 E + ) requires(Packing == 4) { + if constexpr(std::is_same_v) { + VectorF16 lenght = Length(A, E); + constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1}; + __m128h one = _mm_loadu_ph(oneArr); + __m128h fLenght = _mm_div_ph(one, lenght.v); + + constexpr std::uint8_t shuffleMaskA[] { + 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7 + }; + __m128i shuffleVecA = _mm_loadu_epi8(shuffleMaskA); + __m128h fLenghtA = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA)); + + constexpr std::uint8_t shuffleMaskE[] { + 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15 + }; + __m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE); + __m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE)); + + return { + _mm_mul_ph(A.v, fLenghtA), + _mm_mul_ph(E.v, fLenghtE), + }; + } else if constexpr(std::is_same_v) { + VectorF16 lenght = Length(A, E); + constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + __m256h one = _mm256_loadu_ph(oneArr); + __m256h fLenght = _mm256_div_ph(one, lenght.v); + + constexpr std::uint8_t shuffleMaskA[] { + 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7, + 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7 + }; + __m256i shuffleVecA = _mm256_loadu_epi8(shuffleMaskA); + __m256h fLenghtA = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecA)); + + constexpr std::uint8_t shuffleMaskE[] { + 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15, + 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15 + }; + __m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE); + __m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE)); + + return { + _mm256_mul_ph(A.v, fLenghtA), + _mm256_mul_ph(E.v, fLenghtE), + }; + } else { + VectorF16 lenght = Length(A, E); + constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + __m512h one = _mm512_loadu_ph(oneArr); + __m512h fLenght = _mm512_div_ph(one, lenght.v); + + constexpr std::uint8_t shuffleMaskA[] { + 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7, + 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7, + 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7, + 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7 + }; + __m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA); + __m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA)); + + constexpr std::uint8_t shuffleMaskE[] { + 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15, + 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15, + 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15, + 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15 + }; + __m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE); + __m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE)); + + return { + _mm512_mul_ph(A.v, fLenghtA), + _mm512_mul_ph(E.v, fLenghtE), + }; + } + } + + constexpr static std::tuple, VectorF16> NormalizeRepeated( + VectorF16 A, + VectorF16 B, + VectorF16 C, + VectorF16 D, + VectorF16 E, + VectorF16 F, + VectorF16 G, + VectorF16 H + ) requires(Len == 8 && Packing == 1 && Repeats == 1) { + constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + VectorF16 lenght = Length(A, B, C, D, E, F, G, H); + __m128h one = _mm_loadu_ph(oneArr); + __m128h fLenght = _mm_div_ph(one, lenght.v); + + constexpr std::uint8_t shuffleMaskABCD[] { + 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, + 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3, + 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5, + 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7 + }; + __m512i shuffleVecABCD = _mm512_loadu_epi8(shuffleMaskABCD); //10 0.5 + __m512h fLenghtABCD = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(_mm512_castph128_ph512(fLenght)), shuffleVecABCD)); //1 1 + + __m512h vecABCD; + vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(A.v), 0)); //3 1 + vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(B.v), 1)); + vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(C.v), 2)); + vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(D.v), 3)); + vecABCD = _mm512_mul_ph(vecABCD, fLenghtABCD); //4 0.5 + + constexpr std::uint8_t shuffleMaskEFGH[] { + 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9, + 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, + 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, + }; + __m512h shuffleVecEFGH = _mm512_loadu_epi8(shuffleMaskEFGH); //10 0.5 + __m512h fLenghtEFGH = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(_mm512_castph128_ph512(fLenght)), _mm512_castph_si512(shuffleVecEFGH))); //1 1 + + __m512h vecEFGH; + vecEFGH = _mm512_castps_ph(_mm512_insertf32x4(vecEFGH, _mm_castph_ps(E.v), 0)); //3 1 + vecEFGH = _mm512_castps_ph(_mm512_insertf32x4(vecEFGH, _mm_castph_ps(F.v), 1)); + vecEFGH = _mm512_castps_ph(_mm512_insertf32x4(vecEFGH, _mm_castph_ps(G.v), 2)); + vecEFGH = _mm512_castps_ph(_mm512_insertf32x4(vecEFGH, _mm_castph_ps(H.v), 3)); + vecEFGH = _mm512_mul_ph(vecABCD, fLenghtEFGH); //4 0.5 + return { + vecABCD, + vecEFGH + }; + } + + constexpr static std::tuple, VectorF16> NormalizeRepeated( + VectorF16 A, + VectorF16 B, + VectorF16 C, + VectorF16 D, + VectorF16 E, + VectorF16 F, + VectorF16 G, + VectorF16 H + ) requires(Len == 4 && Packing == 2 && Repeats == 1) { + VectorF16 lenght = Length(A, B, C, D, E, F, G, H); + constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + __m128h one = _mm_loadu_ph(oneArr); + __m128h fLenght = _mm_div_ph(one, lenght.v); + + constexpr std::uint8_t shuffleMaskABCD[] { + 0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3, + 4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7, + 8,9,8,9,8,9,8,9,10,11,10,11,10,11, + 12,13,12,13,12,13,14,15,14,15,14,15 + }; + __m512i shuffleVecABCD = _mm512_loadu_epi8(shuffleMaskABCD); + __m512h fLenghtABCD = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(_mm512_castph128_ph512(fLenght)), shuffleVecABCD)); + + __m512h vecABCD; + vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(A.v), 0)); + vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(B.v), 1)); + vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(C.v), 2)); + vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(D.v), 3)); + vecABCD = _mm512_mul_ph(vecABCD, fLenghtABCD); + return vecABCD; + } + + constexpr static std::tuple, VectorF16> NormalizeRepeated( + VectorF16 A, + VectorF16 B, + VectorF16 C, + VectorF16 D, + VectorF16 E, + VectorF16 F, + VectorF16 G, + VectorF16 H + ) requires(Len == 2 && Packing == 4 && Repeats == 1) { + VectorF16 lenght = Length(A, B, C, D, E, F, G, H); + constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + __m128h one = _mm_loadu_ph(oneArr); + __m128h fLenght = _mm_div_ph(one, lenght.v); + + constexpr std::uint8_t shuffleMaskABCD[] { + 0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7, + 8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15, + 16,17,16,17,18,19,18,19,20,21,20,21,22,23,22,23, + 24,25,24,25,26,27,26,27,28,29,28,29,30,31,30,31 + }; + __m512i shuffleVecABCD = _mm512_loadu_epi8(shuffleMaskABCD); + __m512h fLenghtABCD = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(_mm512_castph128_ph512(fLenght)), shuffleVecABCD)); + + __m512h vecABCD; + vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(A.v), 0)); + vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(B.v), 1)); + vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(C.v), 2)); + vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(D.v), 3)); + vecABCD = _mm512_mul_ph(vecABCD, fLenghtABCD); + return vecABCD; + } + constexpr static VectorF16 Length( VectorF16 A, VectorF16 B, @@ -882,8 +1259,7 @@ namespace Crafter { } } - // template - // constexpr static Vector Rotate(Vector v, Vector q) requires(Len == 3) { + // constexpr static VectorF16 Rotate(VectorF16<3, Packing, Repeats> v, VectorF16<4, Packing, Repeats> q) requires(Len == 3) { // Vector qv(q.x, q.y, q.z); // Vector t = Vector::Cross(qv, v) * T(2); // return v + t * q.w + Vector::Cross(qv, t);