diff --git a/interfaces/Crafter.Math-VectorF16.cppm b/interfaces/Crafter.Math-VectorF16.cppm index 901c864..81b8557 100755 --- a/interfaces/Crafter.Math-VectorF16.cppm +++ b/interfaces/Crafter.Math-VectorF16.cppm @@ -45,11 +45,141 @@ namespace Crafter { VectorType v; public: - template - friend class VectorF16; static constexpr std::uint32_t MaxSize = 32; static constexpr std::uint8_t Alignment = GetAlingment(); static_assert(Len * Packing <= MaxSize, "Len * Packing exceeds MaxSize"); + private: + + template values> + static consteval std::array GetNegateMask() { + std::array mask{0}; + for(std::uint8_t i2 = 0; i2 < Packing; i2++) { + for(std::uint8_t i = 0; i < Len; i++) { + if(values[i]) { + mask[i2*Len+i] = 0b1000000000000000; + } else { + mask[i2*Len+i] = 0; + } + } + } + return mask; + } + + static consteval std::array GetNegateMaskAll() { + std::array mask{0}; + for(std::uint8_t i = 0; i < Packing*Len; i++) { + mask[i] = 0b1000000000000000; + } + return mask; + } + + template ShuffleValues> + static consteval bool GetShuffleMaskEpi32() { + std::uint8_t mask = 0; + for(std::uint8_t i = 0; i < std::min(Len, std::uint32_t(8)); i+=2) { + mask = mask | (ShuffleValues[i] & 0b11) << i; + } + return mask; + } + + template ShuffleValues> + static consteval std::array::Alignment> GetPermuteMaskEpi16() { + std::array::Alignment> shuffleMask {{0}}; + for(std::uint8_t i2 = 0; i2 < Packing; i2++) { + for(std::uint8_t i = 0; i < Len; i++) { + shuffleMask[i2*Len+i] = ShuffleValues[i]+i2*Len; + } + } + return shuffleMask; + } + + static consteval std::array GetAllTrue() { + std::array arr{}; + arr.fill(true); + return arr; + } + + template ShuffleValues> + static consteval bool CheckEpi32Shuffle() { + for(std::uint8_t i = 1; i < Len; i+=2) { + if(ShuffleValues[i-1] != ShuffleValues[i] - 1) { + return false; + } + } + for(std::uint8_t i = 0; i < Len; i++) { + for(std::uint8_t i2 = 0; i2 < Len; i2 += 8) { + if(ShuffleValues[i] != ShuffleValues[i2]) { + return false; + } + } + } + return true; + } + + template ShuffleValues> + static consteval bool CheckEpi8Shuffle() { + for(std::uint8_t i = 0; i < Len; i++) { + std::uint8_t lane = i / 8; + if(ShuffleValues[i] < lane * 8 || ShuffleValues[i] > lane * 8 + 7) { + return false; + } + } + return true; + } + + template ShuffleValues> + static consteval std::uint8_t GetBlendMaskEpi16() requires (std::is_same_v){ + std::uint8_t mask = 0; + for (std::uint8_t i2 = 0; i2 < Packing; i2++) { + for (std::uint8_t i = 0; i < Len; i++) { + if (ShuffleValues[i]) { + mask |= (1u << (i2 * Len + i)); + } + } + } + return mask; + } + + template ShuffleValues> + static consteval std::uint16_t GetBlendMaskEpi16() requires (std::is_same_v){ + std::uint16_t mask = 0; + for (std::uint8_t i2 = 0; i2 < Packing; i2++) { + for (std::uint8_t i = 0; i < Len; i++) { + if (ShuffleValues[i]) { + mask |= (1u << (i2 * Len + i)); + } + } + } + return mask; + } + + template ShuffleValues> + static consteval std::uint32_t GetBlendMaskEpi16() requires (std::is_same_v){ + std::uint32_t mask = 0; + for (std::uint8_t i2 = 0; i2 < Packing; i2++) { + for (std::uint8_t i = 0; i < Len; i++) { + if (ShuffleValues[i]) { + mask |= (1u << (i2 * Len + i)); + } + } + } + return mask; + } + + template ShuffleValues> + static consteval std::array::Alignment*2> GetShuffleMaskEpi8() { + std::array::Alignment*2> shuffleMask {{0}}; + for(std::uint8_t i2 = 0; i2 < Packing; i2++) { + for(std::uint8_t i = 0; i < Len; i++) { + shuffleMask[(i2*Len*2)+(i*2)] = ShuffleValues[i]*2+(i2*Len*2); + shuffleMask[(i2*Len*2)+(i*2+1)] = ShuffleValues[i]*2+1+(i2*Len*2); + } + } + return shuffleMask; + } + public: + template + friend class VectorF16; constexpr VectorF16() = default; constexpr VectorF16(VectorType v) : v(v) {} @@ -108,8 +238,60 @@ namespace Crafter { } else { return VectorF16(v); } - } else { + } else if constexpr (BLen <= Len) { return this->template ExtractLo(); + } else { + if constexpr(std::is_same_v::VectorType, __m128h>) { + if constexpr(std::is_same_v) { + constexpr std::array::Alignment*2> shuffleMask = GetExtractLoMaskEpi8(); + __m128i shuffleVec = _mm_loadu_epi8(shuffleMask.data()); + return VectorF16(_mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(v), shuffleVec))); + } else if constexpr(std::is_same_v) { + constexpr std::array::Alignment> permMask = GetExtractLoMaskEpi16(); + __m256i permIdx = _mm256_loadu_epi16(permMask.data()); + __m256i result = _mm256_permutexvar_epi16(permIdx, _mm_castph_si256(v)); + return VectorF16(_mm_castsi128_ph(_mm256_castsi256_si128(result))); + } else { + constexpr std::array::Alignment> permMask = GetExtractLoMaskEpi16(); + __m512i permIdx = _mm512_loadu_epi16(permMask.data()); + __m512i result = _mm512_permutexvar_epi16(permIdx, _mm512_castph_si512(v)); + return VectorF16(_mm_castsi128_ph(_mm512_castsi512_si128(result))); + } + } else if constexpr(std::is_same_v::VectorType, __m256h>) { + if constexpr(std::is_same_v) { + constexpr std::array::Alignment> permMask = GetExtractLoMaskEpi16(); + __m256i permIdx = _mm256_loadu_epi16(permMask.data()); + __m256i result = _mm256_permutexvar_epi16(permIdx, _mm256_castsi128_si256(_mm_castph_si128(v))); + return VectorF16(_mm256_castsi256_ph(result)); + } else if constexpr(std::is_same_v) { + constexpr std::array::Alignment> permMask = GetExtractLoMaskEpi16(); + __m256i permIdx = _mm256_loadu_epi16(permMask.data()); + __m256i result = _mm256_permutexvar_epi16(permIdx, _mm256_castph_si256(v)); + return VectorF16(_mm256_castsi256_ph(result)); + } else { + constexpr std::array::Alignment> permMask = GetExtractLoMaskEpi16(); + __m256i permIdx = _mm512_loadu_epi16(permMask.data()); + __m256i result = _mm512_permutexvar_epi16(permIdx, _mm512_castsi512_si256(_mm512_castph_si512(v))); + return VectorF16(_mm256_castsi256_ph(result)); + } + } else { + if constexpr(std::is_same_v) { + constexpr std::array::Alignment> permMask = GetExtractLoMaskEpi16(); + __m512i permIdx = _mm512_loadu_epi16(permMask.data()); + __m512i result = _mm512_permutexvar_epi16(permIdx, _mm512_castsi128_si512(_mm_castph_si128(v))); + return VectorF16(_mm512_castsi512_ph(result)); + } else if constexpr(std::is_same_v) { + constexpr std::array::Alignment> permMask = GetExtractLoMaskEpi16(); + __m512i permIdx = _mm512_loadu_epi16(permMask.data()); + __m512i result = _mm512_permutexvar_epi16(permIdx, _mm512_castsi256_si512(_mm256_castph_si256(v))); + return VectorF16(_mm512_castsi512_ph(result)); + } else { + constexpr std::array::Alignment> permMask = GetExtractLoMaskEpi16(); + __m512i permIdx = _mm512_loadu_epi16(permMask.data()); + __m512i result = _mm512_permutexvar_epi16(permIdx, _mm512_castph_si512(v)); + return VectorF16(_mm512_castsi512_ph(result)); + } + } } } @@ -442,33 +624,6 @@ namespace Crafter { } } - template ShuffleValues> - constexpr VectorF16 Shuffle() { - if constexpr(VectorF16::CheckEpi32Shuffle()) { - if constexpr(std::is_same_v) { - return VectorF16(_mm_castsi128_ph(_mm_shuffle_epi32(_mm_castph_si128(v), GetShuffleMaskEpi32()))); - } else if constexpr(std::is_same_v) { - return VectorF16(_mm256_castsi256_ph(_mm256_shuffle_epi32(_mm256_castph_si256(v), GetShuffleMaskEpi32()))); - } else { - return VectorF16(_mm512_castsi512_ph(_mm512_shuffle_epi32(_mm512_castph_si512(v), GetShuffleMaskEpi32()))); - } - } else { - if constexpr(std::is_same_v) { - constexpr std::array::Alignment*2> shuffleMask = GetShuffleMaskEpi8(); - __m128i shuffleVec = _mm_loadu_epi8(shuffleMask.data()); - return VectorF16(_mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(v), shuffleVec))); - } else if constexpr(std::is_same_v) { - constexpr std::array::Alignment*2> shuffleMask = GetShuffleMaskEpi8(); - __m256i shuffleVec = _mm256_loadu_epi8(shuffleMask.data()); - return VectorF16(_mm256_castsi256_ph(_mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castph_si256(v)), _mm512_castsi256_si512(shuffleVec))))); - } else { - constexpr std::array::Alignment*2> shuffleMask = GetShuffleMaskEpi8(); - __m512i shuffleVec = _mm512_loadu_epi8(shuffleMask.data()); - return VectorF16(_mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(v), shuffleVec))); - } - } - } - static constexpr VectorF16 MulitplyAdd(VectorF16 a, VectorF16 b, VectorF16 add) { if constexpr(std::is_same_v) { return VectorF16(_mm_fmadd_ph(a.v, b.v, add.v)); @@ -549,6 +704,47 @@ namespace Crafter { } } + template ShuffleValues> + constexpr VectorF16 Shuffle() { + if constexpr(CheckEpi32Shuffle()) { + if constexpr(std::is_same_v) { + return VectorF16(_mm_castsi128_ph(_mm_shuffle_epi32(_mm_castph_si128(v), GetShuffleMaskEpi32()))); + } else if constexpr(std::is_same_v) { + return VectorF16(_mm256_castsi256_ph(_mm256_shuffle_epi32(_mm256_castph_si256(v), GetShuffleMaskEpi32()))); + } else { + return VectorF16(_mm512_castsi512_ph(_mm512_shuffle_epi32(_mm512_castph_si512(v), GetShuffleMaskEpi32()))); + } + } else if constexpr(CheckEpi8Shuffle()){ + if constexpr(std::is_same_v) { + constexpr std::array::Alignment*2> shuffleMask = GetShuffleMaskEpi8(); + __m128i shuffleVec = _mm_loadu_epi8(shuffleMask.data()); + return VectorF16(_mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(v), shuffleVec))); + } else if constexpr(std::is_same_v) { + constexpr std::array::Alignment*2> shuffleMask = GetShuffleMaskEpi8(); + __m256i shuffleVec = _mm256_loadu_epi8(shuffleMask.data()); + return VectorF16(_mm256_castsi256_ph(_mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castph_si256(v)), _mm512_castsi256_si512(shuffleVec))))); + } else { + constexpr std::array::Alignment*2> shuffleMask = GetShuffleMaskEpi8(); + __m512i shuffleVec = _mm512_loadu_epi8(shuffleMask.data()); + return VectorF16(_mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(v), shuffleVec))); + } + } else { + if constexpr(std::is_same_v) { + constexpr std::array::Alignment*2> shuffleMask = GetShuffleMaskEpi8(); + __m128i shuffleVec = _mm_loadu_epi8(shuffleMask.data()); + return VectorF16(_mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(v), shuffleVec))); + } else if constexpr(std::is_same_v) { + constexpr std::array::Alignment> permMask = GetPermuteMaskEpi16(); + __m256i permIdx = _mm256_loadu_epi16(permMask.data()); + return VectorF16(_mm256_castsi256_ph(_mm256_permutexvar_epi16(permIdx, _mm256_castph_si256(v)))); + } else { + constexpr std::array::Alignment> permMask = GetPermuteMaskEpi16(); + __m512i permIdx = _mm512_loadu_epi16(permMask.data()); + return VectorF16(_mm512_castsi512_ph(_mm512_permutexvar_epi16(permIdx, _mm512_castph_si512(v)))); + } + } + } + constexpr static std::tuple, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16> Normalize( VectorF16 A, VectorF16 B, @@ -558,7 +754,7 @@ namespace Crafter { VectorF16 F, VectorF16 G, VectorF16 H - ) requires(Len == 8) { + ) requires(Len == 8 && Packing*Len == Alignment) { constexpr std::array shuffleMaskA = GetShuffleMaskEpi8<{{0,0,0,0,0,0,0,0}}>(); constexpr std::array shuffleMaskB = GetShuffleMaskEpi8<{{1,1,1,1,1,1,1,1}}>(); constexpr std::array shuffleMaskC = GetShuffleMaskEpi8<{{2,2,2,2,2,2,2,2}}>(); @@ -696,7 +892,7 @@ namespace Crafter { VectorF16 C, VectorF16 E, VectorF16 G - ) requires(Len == 4) { + ) requires(Len == 4 && Packing*Len == Alignment) { constexpr std::array shuffleMaskA = GetShuffleMaskEpi8<{{0,0,0,0}}>(); constexpr std::array shuffleMaskC = GetShuffleMaskEpi8<{{1,1,1,1}}>(); constexpr std::array shuffleMaskE = GetShuffleMaskEpi8<{{2,2,2,2}}>(); @@ -780,62 +976,50 @@ namespace Crafter { constexpr static std::tuple, VectorF16> Normalize( VectorF16 A, VectorF16 E - ) requires(Len == 2) { - constexpr std::array shuffleMaskA = GetShuffleMaskEpi8<{{0,0}}>(); - constexpr std::array shuffleMaskE = GetShuffleMaskEpi8<{{1,1}}>(); - + ) requires(Len == 2 && Packing*Len == Alignment) { if constexpr(std::is_same_v) { - VectorF16 lenght = Length(A, E); + VectorF16<1, 8> lenght = Length(A, E); constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1}; __m128h one = _mm_loadu_ph(oneArr); - __m128h fLenght = _mm_div_ph(one, lenght.v); + VectorF16<8, 1> fLenght(_mm_div_ph(one, lenght.v)); - __m128i shuffleVecA = _mm_loadu_epi8(shuffleMaskA.data()); - __m128h fLenghtA = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA)); - - __m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE.data()); - __m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE)); + VectorF16<8, 1> fLenghtA = fLenght.template Shuffle<{{0,0,1,1,2,2,3,3}}>(); + VectorF16<8, 1> fLenghtE = fLenght.template Shuffle<{{4,4,5,5,6,6,7,7}}>(); return { - _mm_mul_ph(A.v, fLenghtA), - _mm_mul_ph(E.v, fLenghtE), + _mm_mul_ph(A.v, fLenghtA.v), + _mm_mul_ph(E.v, fLenghtE.v), }; } else if constexpr(std::is_same_v) { - VectorF16 lenght = Length(A, E); + VectorF16<1, 16> lenght = Length(A, E); constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; __m256h one = _mm256_loadu_ph(oneArr); - __m256h fLenght = _mm256_div_ph(one, lenght.v); + VectorF16<16, 1> fLenght(_mm256_div_ph(one, lenght.v)); - __m256i shuffleVecA = _mm256_loadu_epi8(shuffleMaskA.data()); - __m256h fLenghtA = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecA)); - - __m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE.data()); - __m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE)); + VectorF16<16, 1> fLenghtA = fLenght.template Shuffle<{{0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7}}>(); + VectorF16<16, 1> fLenghtE = fLenght.template Shuffle<{{8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15}}>(); return { - _mm256_mul_ph(A.v, fLenghtA), - _mm256_mul_ph(E.v, fLenghtE), + _mm256_mul_ph(A.v, fLenghtA.v), + _mm256_mul_ph(E.v, fLenghtE.v), }; } else { - VectorF16 lenght = Length(A, E); + VectorF16<1, 32> lenght = Length(A, E); constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; __m512h one = _mm512_loadu_ph(oneArr); - __m512h fLenght = _mm512_div_ph(one, lenght.v); + VectorF16<32, 1> fLenght(_mm512_div_ph(one, lenght.v)); - __m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA.data()); - __m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA)); - - __m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE.data()); - __m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE)); + VectorF16<32, 1> fLenghtA = fLenght.template Shuffle<{{0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15}}>(); + VectorF16<32, 1> fLenghtE = fLenght.template Shuffle<{{16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31}}>(); return { - _mm512_mul_ph(A.v, fLenghtA), - _mm512_mul_ph(E.v, fLenghtE), + _mm512_mul_ph(A.v, fLenghtA.v), + _mm512_mul_ph(E.v, fLenghtE.v), }; } } - constexpr static VectorF16 Length( + constexpr static VectorF16<1, Packing*8> Length( VectorF16 A, VectorF16 B, VectorF16 C, @@ -844,48 +1028,48 @@ namespace Crafter { VectorF16 F, VectorF16 G, VectorF16 H - ) requires(Len == 8) { - VectorF16 lenghtSq = LengthSq(A, B, C, D, E, F, G, H); + ) requires(Len == 8 && Packing*Len == Alignment) { + VectorF16<1, Packing*8> lenghtSq = LengthSq(A, B, C, D, E, F, G, H); if constexpr(std::is_same_v) { - return VectorF16(_mm_sqrt_ph(lenghtSq.v)); + return VectorF16<1, Packing*8>(_mm_sqrt_ph(lenghtSq.v)); } else if constexpr(std::is_same_v) { - return VectorF16(_mm256_sqrt_ph(lenghtSq.v)); + return VectorF16<1, Packing*8>(_mm256_sqrt_ph(lenghtSq.v)); } else { - return VectorF16(_mm512_sqrt_ph(lenghtSq.v)); + return VectorF16<1, Packing*8>(_mm512_sqrt_ph(lenghtSq.v)); } } - constexpr static VectorF16 Length( + constexpr static VectorF16<1, Packing*4> Length( VectorF16 A, VectorF16 C, VectorF16 E, VectorF16 G - ) requires(Len == 4) { - VectorF16 lenghtSq = LengthSq(A, C, E, G); + ) requires(Len == 4 && Packing*Len == Alignment) { + VectorF16<1, Packing*4> lenghtSq = LengthSq(A, C, E, G); if constexpr(std::is_same_v) { - return VectorF16(_mm_sqrt_ph(lenghtSq.v)); + return VectorF16<1, Packing*4>(_mm_sqrt_ph(lenghtSq.v)); } else if constexpr(std::is_same_v) { - return VectorF16(_mm256_sqrt_ph(lenghtSq.v)); + return VectorF16<1, Packing*4>(_mm256_sqrt_ph(lenghtSq.v)); } else { - return VectorF16(_mm512_sqrt_ph(lenghtSq.v)); + return VectorF16<1, Packing*4>(_mm512_sqrt_ph(lenghtSq.v)); } } - constexpr static VectorF16 Length( + constexpr static VectorF16<1, Packing*2> Length( VectorF16 A, VectorF16 E - ) requires(Len == 2) { - VectorF16 lenghtSq = LengthSq(A, E); + ) requires(Len == 2 && Packing*Len == Alignment) { + VectorF16<1, Packing*2> lenghtSq = LengthSq(A, E); if constexpr(std::is_same_v) { - return VectorF16(_mm_sqrt_ph(lenghtSq.v)); + return VectorF16<1, Packing*2>(_mm_sqrt_ph(lenghtSq.v)); } else if constexpr(std::is_same_v) { - return VectorF16(_mm256_sqrt_ph(lenghtSq.v)); + return VectorF16<1, Packing*2>(_mm256_sqrt_ph(lenghtSq.v)); } else { - return VectorF16(_mm512_sqrt_ph(lenghtSq.v)); + return VectorF16<1, Packing*2>(_mm512_sqrt_ph(lenghtSq.v)); } } - constexpr static VectorF16 LengthSq( + constexpr static VectorF16<1, Packing*8> LengthSq( VectorF16 A, VectorF16 B, VectorF16 C, @@ -894,27 +1078,27 @@ namespace Crafter { VectorF16 F, VectorF16 G, VectorF16 H - ) requires(Len == 8) { + ) requires(Len == 8 && Packing*Len == Alignment) { return Dot(A, A, B, B, C, C, D, D, E, E, F, F, G, G, H, H); } - constexpr static VectorF16 LengthSq( + constexpr static VectorF16<1, Packing*4> LengthSq( VectorF16 A, VectorF16 C, VectorF16 E, VectorF16 G - ) requires(Len == 4) { + ) requires(Len == 4 && Packing*Len == Alignment) { return Dot(A, A, C, C, E, E, G, G); } - constexpr static VectorF16 LengthSq( + constexpr static VectorF16<1, Packing*2> LengthSq( VectorF16 A, VectorF16 E - ) requires(Len == 2) { + ) requires(Len == 2 && Packing*Len == Alignment) { return Dot(A, A, E, E); } - constexpr static VectorF16 Dot( + constexpr static VectorF16<1, Packing*8> Dot( VectorF16 A0, VectorF16 A1, VectorF16 B0, VectorF16 B1, VectorF16 C0, VectorF16 C1, @@ -923,7 +1107,7 @@ namespace Crafter { VectorF16 F0, VectorF16 F1, VectorF16 G0, VectorF16 G1, VectorF16 H0, VectorF16 H1 - ) requires(Len == 8) { + ) requires(Len == 8 && Packing*Len == Alignment) { if constexpr(std::is_same_v) { __m128h mulA = _mm_mul_ph(A0.v, A1.v); __m128h mulB = _mm_mul_ph(B0.v, B1.v); @@ -1086,12 +1270,12 @@ namespace Crafter { } } - constexpr static VectorF16 Dot( + constexpr static VectorF16<1, Packing*4> Dot( VectorF16 A0, VectorF16 A1, VectorF16 C0, VectorF16 C1, VectorF16 E0, VectorF16 E1, VectorF16 G0, VectorF16 G1 - ) requires(Len == 4) { + ) requires(Len == 4 && Packing*Len == Alignment) { if constexpr(std::is_same_v) { __m128h mulA = _mm_mul_ph(A0.v, A1.v); __m128h mulC = _mm_mul_ph(C0.v, C1.v); @@ -1179,10 +1363,10 @@ namespace Crafter { } } - constexpr static VectorF16 Dot( + constexpr static VectorF16<1, Packing*2> Dot( VectorF16 A0, VectorF16 A1, VectorF16 E0, VectorF16 E1 - ) requires(Len == 2) { + ) requires(Len == 2 && Packing*Len == Alignment) { if constexpr(std::is_same_v) { __m128h mulA = _mm_mul_ph(A0.v, A1.v); __m128h mulE = _mm_mul_ph(E0.v, E1.v); @@ -1200,7 +1384,9 @@ namespace Crafter { } else if constexpr(std::is_same_v) { __m256h mulA = _mm256_mul_ph(A0.v, A1.v); __m256h mulE = _mm256_mul_ph(E0.v, E1.v); + __m256i row12Temp1 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulE)); // A1 E1 A2 E2 B1 F1 B2 F2 + __m256i row12Temp2 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulE)); // C1 G1 C2 G2 D1 H1 D2 H2 __m256i row12Temp1Temp = row12Temp1; @@ -1209,8 +1395,12 @@ namespace Crafter { __m256h row1 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 C1 D1 E1 F1 G1 H1 __m256h row2 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 C2 D2 E2 F2 G2 H2 - - return _mm256_add_ph(row1, row2); + __m256h result = _mm256_add_ph(row1, row2); + + VectorF16<16, 1> vec(result); + vec = vec.template Shuffle<{{0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15}}>(); + + return VectorF16<1, 16>(vec.v); } else { __m512h mulA = _mm512_mul_ph(A0.v, A1.v); __m512h mulE = _mm512_mul_ph(E0.v, E1.v); @@ -1223,8 +1413,11 @@ namespace Crafter { __m512h row1 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 C1 D1 E1 F1 G1 H1 __m512h row2 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 C2 D2 E2 F2 G2 H2 + __m512h result = _mm512_add_ph(row1, row2); - return _mm512_add_ph(row1, row2); + VectorF16<32, 1> vec(result); + vec = vec.template Shuffle<{{0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27,4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31}}>(); + return VectorF16<1, 32>(vec.v); } } @@ -1292,113 +1485,6 @@ namespace Crafter { return row1; } - private: - template values> - static consteval std::array GetNegateMask() { - std::array mask{0}; - for(std::uint8_t i2 = 0; i2 < Packing; i2++) { - for(std::uint8_t i = 0; i < Len; i++) { - if(values[i]) { - mask[i2*Len+i] = 0b1000000000000000; - } else { - mask[i2*Len+i] = 0; - } - } - } - return mask; - } - - static consteval std::array GetNegateMaskAll() { - std::array mask{0}; - for(std::uint8_t i = 0; i < Packing*Len; i++) { - mask[i] = 0b1000000000000000; - } - return mask; - } - - template ShuffleValues> - static consteval bool GetShuffleMaskEpi32() { - std::uint8_t mask = 0; - for(std::uint8_t i = 0; i < std::min(Len, std::uint32_t(8)); i+=2) { - mask = mask | (ShuffleValues[i] & 0b11) << i; - } - return mask; - } - - template ShuffleValues> - static consteval std::array::Alignment*2> GetShuffleMaskEpi8() { - std::array::Alignment*2> shuffleMask {{0}}; - for(std::uint8_t i2 = 0; i2 < Packing; i2++) { - for(std::uint8_t i = 0; i < Len; i++) { - shuffleMask[(i2*Len*2)+(i*2)] = ShuffleValues[i]*2+(i2*Len*2); - shuffleMask[(i2*Len*2)+(i*2+1)] = ShuffleValues[i]*2+1+(i2*Len*2); - } - } - return shuffleMask; - } - - static consteval std::array GetAllTrue() { - std::array arr{}; - arr.fill(true); - return arr; - } - - template ShuffleValues> - static consteval bool CheckEpi32Shuffle() { - for(std::uint8_t i = 1; i < Len; i+=2) { - if(ShuffleValues[i-1] != ShuffleValues[i] - 1) { - return false; - } - } - for(std::uint8_t i = 0; i < Len; i++) { - for(std::uint8_t i2 = 0; i2 < Len; i2 += 8) { - if(ShuffleValues[i] != ShuffleValues[i2]) { - return false; - } - } - } - return true; - } - - template ShuffleValues> - static consteval std::uint8_t GetBlendMaskEpi16() requires (std::is_same_v){ - std::uint8_t mask = 0; - for (std::uint8_t i2 = 0; i2 < Packing; i2++) { - for (std::uint8_t i = 0; i < Len; i++) { - if (ShuffleValues[i]) { - mask |= (1u << (i2 * Len + i)); - } - } - } - return mask; - } - - template ShuffleValues> - static consteval std::uint16_t GetBlendMaskEpi16() requires (std::is_same_v){ - std::uint16_t mask = 0; - for (std::uint8_t i2 = 0; i2 < Packing; i2++) { - for (std::uint8_t i = 0; i < Len; i++) { - if (ShuffleValues[i]) { - mask |= (1u << (i2 * Len + i)); - } - } - } - return mask; - } - - template ShuffleValues> - static consteval std::uint32_t GetBlendMaskEpi16() requires (std::is_same_v){ - std::uint32_t mask = 0; - for (std::uint8_t i2 = 0; i2 < Packing; i2++) { - for (std::uint8_t i = 0; i < Len; i++) { - if (ShuffleValues[i]) { - mask |= (1u << (i2 * Len + i)); - } - } - } - return mask; - } - static constexpr float two_over_pi = 0.6366197723675814f; static constexpr float pi_over_2_hi = 1.5707963267341256f; static constexpr float pi_over_2_lo = 6.077100506506192e-11f; diff --git a/project.json b/project.json index 63c7728..b233dc6 100644 --- a/project.json +++ b/project.json @@ -34,7 +34,7 @@ ], "tests":[ { - "name": "F16-x86-64-sapphirerapids", + "name": "Vector-x86-64-sapphirerapids", "implementations": ["tests/Vector"], "march": "sapphirerapids", "extends": ["lib-shared"] diff --git a/tests/Vector.cpp b/tests/Vector.cpp index 7f396d0..3566f05 100644 --- a/tests/Vector.cpp +++ b/tests/Vector.cpp @@ -36,6 +36,15 @@ consteval std::array AlternateTrueFalse() { return result; } +template +consteval std::array GetCountReverse() { + std::array result = {}; + for (std::uint8_t i = 0; i < Len; ++i) { + result[Len - 1 - i] = i; + } + return result; +} + template class VectorType, std::uint32_t MaxSize, std::uint32_t Len = 1, std::uint32_t Packing = 1> std::string* TestAllCombinations() { if constexpr (Len > MaxSize) { @@ -59,6 +68,13 @@ std::string* TestAllCombinations() { floats1[i] = 0; floats2[i] = 0; } + T expectedLength[Packing] = {0}; + for (std::uint32_t i2 = 0; i2 < Packing; i2++) { + for (std::uint32_t i = 0; i < Len; i++) { + expectedLength[i2] += floats[i2*Len+i] * floats[i2*Len+i]; + } + expectedLength[i2] = T(std::sqrt(float(expectedLength[i2]))); + } std::string* result = nullptr; constexpr auto total = Len * Packing; @@ -304,113 +320,97 @@ std::string* TestAllCombinations() { return new std::string(std::format("Normalize mismatch at Len={} Packing={}, Expected: {}, Got: {}", Len, Packing, 1, (float)length)); } } + + { + VectorType vec(floats); + VectorType result = vec.template Shuffle()>(); + Vector::Alignment> stored = result.Store(); + for (std::uint32_t i = 0; i < Len; i++) { + T expected = floats[Len - 1 - i]; + if (!FloatEquals(stored.v[i], expected)) { + return new std::string(std::format("Shuffle mismatch at Len={} Packing={}, Index={}, Expected: {}, Got: {}", Len, Packing, i, (float)expected, (float)stored.v[i])); + } + } + } } - // if constexpr(Len == 3) { - // { - // VectorType vec1(floats1); - // VectorType vec2(floats2); - // VectorType result = VectorType::Cross(vec1, vec2); - // Vector::Alignment> stored = result.Store(); - // if (!FloatEquals(stored.v[0], T(-3)) || !FloatEquals(stored.v[1], T(6)) || !FloatEquals(stored.v[2], T(-3))) { - // return new std::string(std::format("Cross mismatch at Len={} Packing={}, Expected: -3,6,-3, Got: {},{},{}", Len, Packing, (float)stored.v[0], (float)stored.v[1], (float)stored.v[2])); - // } - // } - // // if constexpr(4 * Packing < VectorType<1, 1>::MaxSize) { - // // T qData[VectorType<4, Packing>::Alignment]; - // // qData[0] = T(1); - // // qData[1] = T(0); - // // qData[2] = T(0); - // // qData[3] = T(0); + if constexpr(Len == 3) { + { + VectorType vec1(floats1); + VectorType vec2(floats2); + VectorType result = VectorType::Cross(vec1, vec2); + Vector::Alignment> stored = result.Store(); + if (!FloatEquals(stored.v[0], T(-3)) || !FloatEquals(stored.v[1], T(6)) || !FloatEquals(stored.v[2], T(-3))) { + return new std::string(std::format("Cross mismatch at Len={} Packing={}, Expected: -3,6,-3, Got: {},{},{}", Len, Packing, (float)stored.v[0], (float)stored.v[1], (float)stored.v[2])); + } + } + if constexpr(4 * Packing < VectorType<1, 1>::MaxSize) { + T qData[VectorType<4, Packing>::Alignment]; + qData[0] = T(0); + qData[1] = T(0); + qData[2] = T(0); + qData[3] = T(1); - // // VectorType<3, Packing> vecV(floats); - // // VectorType<4, Packing> vecQ(qData); - // // VectorType<3, Packing> result = VectorType<3, Packing>::Rotate(vecV, vecQ); - // // Vector::Alignment> stored = result.Store(); + VectorType<3, Packing> vecV(floats); + VectorType<4, Packing> vecQ(qData); + VectorType<3, Packing> result = VectorType<3, Packing>::Rotate(vecV, vecQ); + Vector::Alignment> stored = result.Store(); - // // for (std::uint32_t i = 0; i < 3; i++) { - // // if (!FloatEquals(stored.v[i], floats[i])) { - // // return new std::string(std::format("Rotate mismatch at Len={} Packing={}, Index={}, Expected: {}, Got: {}", Len, Packing, i, (float)floats[i], (float)stored.v[i])); - // // } - // // } - // // } - // } + for (std::uint32_t i = 0; i < 3; i++) { + if (!FloatEquals(stored.v[i], floats[i])) { + return new std::string(std::format("Rotate mismatch at Len={} Packing={}, Index={}, Expected: {}, Got: {}", Len, Packing, i, (float)floats[i], (float)stored.v[i])); + } + } + } + } + if constexpr(Len == 4) { + T eulerData[VectorType<3, Packing>::Alignment]; + for(std::uint8_t i = 0; i < Packing; i++) { + eulerData[i*3] = T(0.7853981); + eulerData[i*3+1] = T(0.1243412); + eulerData[i*3+2] = T(0.3245312); + } + VectorType<3, Packing> eulerVec(eulerData); + VectorType<4, Packing> result = VectorType<4, Packing>::QuanternionFromEuler(eulerVec); + Vector::Alignment> stored = result.Store(); - // // Test QuanternionFromEuler() static method (Len == 4 only) - // if constexpr(Len == 4) { - // T eulerData[3] = {T(0), T(0), T(0)}; // Zero rotation - // VectorType<3, 1> eulerVec(eulerData); - // VectorType<4, 1> result = VectorType<4, 1>::QuanternionFromEuler(eulerVec); - // Vector stored = result.Store(); - - // // Identity quaternion should be (1, 0, 0, 0) - // if (!FloatEquals(stored.v[0], T(1)) || !FloatEquals(stored.v[1], T(0)) || - // !FloatEquals(stored.v[2], T(0)) || !FloatEquals(stored.v[3], T(0))) { - // return new std::string(std::format("QuanternionFromEuler mismatch at Len={} Packing={}, Expected: 1,0,0,0, Got: {},{},{},{}", Len, Packing, (float)stored.v[0], (float)stored.v[1], (float)stored.v[2], (float)stored.v[3])); - // } - // } + if (!FloatEquals(stored.v[0], T(0.63720703)) || !FloatEquals(stored.v[1], T(0.30688477)) || + !FloatEquals(stored.v[2], T(0.14074707)) || !FloatEquals(stored.v[3], T(0.6933594))) { + return new std::string(std::format("QuanternionFromEuler mismatch at Len={} Packing={}, Expected: 0.63720703,0.30688477,0.14074707,0.6933594, Got: {},{},{},{}", Len, Packing, (float)stored.v[0], (float)stored.v[1], (float)stored.v[2], (float)stored.v[3])); + } + } - // // Test batch Normalize() for 2 vectors (Len == 2) - // if constexpr(Len == 2) { - // T aData[2] = {T(3), T(4)}; - // T eData[2] = {T(6), T(8)}; - // VectorType<2, 1> vecA(aData); - // VectorType<2, 1> vecE(eData); - // auto result = VectorType<2, 1>::Normalize(vecA, vecE); - // Vector storedA = std::get<0>(result).Store(); - // Vector storedE = std::get<1>(result).Store(); - - // // Normalize (3,4) -> (0.6, 0.8) - // for (std::uint32_t i = 0; i < 2; i++) { - // if (!FloatEquals(storedA.v[i], static_cast(0.6f + i * 0.2f))) { - // return new std::string(std::format("Normalize 2 vec test failed (A) at index {}, Expected: {}, Got: {}", i, (float)(0.6f + i * 0.2f), (float)storedA.v[i])); - // } - // } - - // // Normalize (6,8) -> (0.6, 0.8) - // for (std::uint32_t i = 0; i < 2; i++) { - // if (!FloatEquals(storedE.v[i], static_cast(0.6f + i * 0.2f))) { - // return new std::string(std::format("Normalize 2 vec test failed (E) at index {}, Expected: {}, Got: {}", i, (float)(0.6f + i * 0.2f), (float)storedE.v[i])); - // } - // } - // } + if constexpr(Len == 2 && Packing*Len == VectorType::Alignment) { + { + VectorType vecA(floats); + VectorType vecE = vecA *2; + VectorType<1, Packing*2> result = VectorType::Length(vecA, vecE); + Vector::Alignment> stored = result.Store(); - // // Test batch LengthSq() for 2 vectors (Len == 2) - // if constexpr(Len == 2) { - // T aData[2] = {T(3), T(4)}; - // T eData[2] = {T(5), T(12)}; - // VectorType<2, 1> vecA(aData); - // VectorType<2, 1> vecE(eData); - // VectorType<2, 1> result = VectorType<2, 1>::LengthSq(vecA, vecE); - // Vector stored = result.Store(); - - // // LengthSq of (3,4) = 9+16 = 25 - // // LengthSq of (5,12) = 25+144 = 169 - // if (!FloatEquals(stored.v[0], T(25)) || !FloatEquals(stored.v[1], T(169))) { - // return new std::string(std::format("LengthSq 2 vec test failed at Len={} Packing={}, Expected: 25,169, Got: {},{}", Len, Packing, (float)stored.v[0], (float)stored.v[1])); - // } - // } + if (!FloatEquals(stored.v[0], expectedLength[0])) { + return new std::string(std::format("Length 2 vecA test failed at Len={} Packing={} Expected: {}, Got: {}", Len, Packing, (float)expectedLength[0], (float)stored.v[0])); + } + + if (!FloatEquals(stored.v[(Len*Packing)/2], expectedLength[0] * 2)) { + return new std::string(std::format("Length 2 vecE test failed at Len={} Packing={} Expected: {}, Got: {}", Len, Packing, (float)expectedLength[0] * 2, (float)stored.v[(Len*Packing)/2])); + } + } - // // Test batch Dot() for 2 vectors (Len == 2) - // if constexpr(Len == 2) { - // T a0Data[2] = {T(1), T(2)}; - // T a1Data[2] = {T(3), T(4)}; - // T e0Data[2] = {T(5), T(6)}; - // T e1Data[2] = {T(7), T(8)}; - // VectorType<2, 1> vecA0(a0Data); - // VectorType<2, 1> vecA1(a1Data); - // VectorType<2, 1> vecE0(e0Data); - // VectorType<2, 1> vecE1(e1Data); - // VectorType<2, 1> result = VectorType<2, 1>::Dot(vecA0, vecA1, vecE0, vecE1); - // Vector stored = result.Store(); - - // // Dot (1,2) with (3,4) = 3+8=11 - // // Dot (5,6) with (7,8) = 35+48=83 - // if (!FloatEquals(stored.v[0], T(11)) || !FloatEquals(stored.v[1], T(83))) { - // return new std::string(std::format("Dot 2 vec test failed at Len={} Packing={}, Expected: 11,83, Got: {},{}", Len, Packing, (float)stored.v[0], (float)stored.v[1])); - // } - // } + { + VectorType vecA(floats); + VectorType vecE = vecA * 2; + auto result = VectorType::Normalize(vecA, vecE); + VectorType<1, Packing*2> result2 = VectorType::Length(std::get<0>(result), std::get<1>(result)); + Vector::Alignment> stored = result2.Store(); + + for(std::uint8_t i = 0; i < Len*Packing; i++) { + if (!FloatEquals(stored.v[i], T(1))) { + return new std::string(std::format("Normalize {} test failed at Len={} Packing={} Expected: {}, Got: {}", i, Len, Packing, 1, (float)stored.v[i])); + } + } + } + } return TestAllCombinations(); }