diff --git a/interfaces/Crafter.Math-Vector.cppm b/interfaces/Crafter.Math-Vector.cppm index 6e113b2..4934e46 100755 --- a/interfaces/Crafter.Math-Vector.cppm +++ b/interfaces/Crafter.Math-Vector.cppm @@ -317,10 +317,8 @@ namespace Crafter { constexpr void Normalize() { T fLength = Length(); - if (fLength > 0) { - fLength = 1.0f / fLength; - } - + fLength = 1.0f / fLength; + for(std::uint32_t i = 0; i < Len; i++) { this->v[i] *= fLength; } diff --git a/interfaces/Crafter.Math-VectorF16.cppm b/interfaces/Crafter.Math-VectorF16.cppm index 5ccddc7..814ea5e 100755 --- a/interfaces/Crafter.Math-VectorF16.cppm +++ b/interfaces/Crafter.Math-VectorF16.cppm @@ -1,489 +1,991 @@ -// /* -// Crafter®.Math -// Copyright (C) 2026 Catcrafts® -// catcrafts.net +/* +Crafter®.Math +Copyright (C) 2026 Catcrafts® +catcrafts.net -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License version 3.0 as published by the Free Software Foundation; +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License version 3.0 as published by the Free Software Foundation; -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
-// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -// */ -// module; -// #include -// export module Crafter.Math:VectorF16; -// import std; -// import :Vector; +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ +module; +#ifdef __x86_64 +#include +#endif +export module Crafter.Math:VectorF16; +import std; +import :Vector; -// namespace Crafter { -// export template -// struct VectorF16 { -// static constexpr std::uint32_t MaxSize = 32; -// static constexpr std::uint32_t MaxElement = 8; -// static consteval std::uint32_t GetAlignment() { -// if constexpr (Len * Packing <= 8) { -// return 8; -// } -// if constexpr (Len * Packing <= 16) { -// return 16; -// } -// if constexpr (Len * Packing <= 32) { -// return 32; -// } -// static_assert(Len * Packing <= 32, "Len * Packing is larger than supported max size of 32"); -// static_assert(Len * Packing <= 8, "Len * Packing is larger than supported packed size of 8"); -// static_assert(Len * Packing * Repeats <= 32, "Len * Packing * Repeats is larger than supported max of 32"); -// } -// static consteval std::uint32_t GetTotalSize() { -// return GetAlignment() * Repeats; -// } +#ifdef __AVX512FP16__ +namespace Crafter { + export template + struct VectorF16 { + static constexpr std::uint32_t MaxSize = 32; + static constexpr std::uint32_t MaxElement = 8; + static consteval std::uint32_t GetAlignment() { + if constexpr (Len * Packing <= 8) { + return 8; + } + if constexpr (Len * Packing <= 16) { + return 16; + } + if constexpr (Len * Packing <= 32) { + return 32; + } + static_assert(Len * Packing <= 32, "Len * Packing is larger than supported max size of 32"); + static_assert(Len * Packing <= 8, 
"Len * Packing is larger than supported packed size of 8"); + static_assert(Len * Packing * Repeats <= 32, "Len * Packing * Repeats is larger than supported max of 32"); + } + static consteval std::uint32_t GetTotalSize() { + return GetAlignment() * Repeats; + } -// using VectorType = std::conditional_t< -// (GetTotalSize() == 32), __m512h, -// std::conditional_t<(GetTotalSize() == 16), __m256h, __m128h> -// >; + using VectorType = std::conditional_t< + (GetTotalSize() == 32), __m512h, + std::conditional_t<(GetTotalSize() == 16), __m256h, __m128h> + >; -// VectorType v; + VectorType v; -// constexpr VectorF16() = default; -// constexpr VectorF16(VectorType v) : v(v) {} -// template -// constexpr VectorF16(const Vector<_Float16, VLen, VAlign>* vA) requires(VAlign != 0 || VLen >= GetTotalSize()) { -// if constexpr(std::is_same_v) { -// v = _mm_loadu_ph(vA->v); -// } else if constexpr(std::is_same_v) { -// v = _mm256_loadu_ph(vA->v); -// } else { -// v = _mm512_loadu_ph(vA->v); -// } -// }; + constexpr VectorF16() = default; + constexpr VectorF16(VectorType v) : v(v) {} + template + constexpr VectorF16(const Vector<_Float16, VLen, VAlign>* vA) requires(VAlign != 0 || VLen >= GetTotalSize()) { + if constexpr(std::is_same_v) { + v = _mm_loadu_ph(vA->v); + } else if constexpr(std::is_same_v) { + v = _mm256_loadu_ph(vA->v); + } else { + v = _mm512_loadu_ph(vA->v); + } + }; -// template -// constexpr void Load(const Vector<_Float16, VLen, VAlign>* vA) { -// if constexpr(std::is_same_v) { -// v = _mm_loadu_ph(vA->v); -// } else if constexpr(std::is_same_v) { -// v = _mm256_loadu_ph(vA->v); -// } else { -// v = _mm512_loadu_ph(vA->v); -// } -// } + template + constexpr void Load(const Vector<_Float16, VLen, VAlign>* vA) { + if constexpr(std::is_same_v) { + v = _mm_loadu_ph(vA->v); + } else if constexpr(std::is_same_v) { + v = _mm256_loadu_ph(vA->v); + } else { + v = _mm512_loadu_ph(vA->v); + } + } -// template -// constexpr void Store(Vector<_Float16, VLen, VAlign>* vA) 
const { -// if constexpr(std::is_same_v) { -// _mm_storeu_ph(vA->v, v); -// } else if constexpr(std::is_same_v) { -// _mm256_storeu_ph(vA->v, v); -// } else { -// _mm512_storeu_ph(vA->v, v); -// } -// } + template + constexpr void Store(Vector<_Float16, VLen, VAlign>* vA) const { + if constexpr(std::is_same_v) { + _mm_storeu_ph(vA->v, v); + } else if constexpr(std::is_same_v) { + _mm256_storeu_ph(vA->v, v); + } else { + _mm512_storeu_ph(vA->v, v); + } + } -// template -// constexpr Vector<_Float16, VLen, VAlign> Store() const { -// Vector<_Float16, VLen, VAlign> returnVec; -// Store(&returnVec); -// return returnVec; -// } + template + constexpr Vector<_Float16, VLen, VAlign> Store() const { + Vector<_Float16, VLen, VAlign> returnVec; + Store(&returnVec); + return returnVec; + } -// template -// constexpr operator VectorF16() const { -// if constexpr(std::is_same_v && std::is_same_v::VectorType, __m128h>) { -// return VectorF16(_mm256_castph256_ph128(v)); -// } else if constexpr(std::is_same_v && std::is_same_v::VectorType, __m128h>) { -// return VectorF16(_mm512_castph512_ph128(v)); -// } else if constexpr(std::is_same_v && std::is_same_v::VectorType, __m256h>) { -// return VectorF16(_mm512_castph512_ph256(v)); -// } else if constexpr(std::is_same_v && std::is_same_v::VectorType, __m256h>) { -// return VectorF16(_mm256_castph128_ph256(v)); -// } else if constexpr(std::is_same_v && std::is_same_v::VectorType, __m512h>) { -// return VectorF16(_mm512_castph128_ph512(v)); -// } else if constexpr(std::is_same_v && std::is_same_v::VectorType, __m512h>) { -// return VectorF16(_mm512_castph256_ph512(v)); -// } else { -// return VectorF16(v); -// } -// } + template + constexpr operator VectorF16() const { + if constexpr(std::is_same_v && std::is_same_v::VectorType, __m128h>) { + return VectorF16(_mm256_castph256_ph128(v)); + } else if constexpr(std::is_same_v && std::is_same_v::VectorType, __m128h>) { + return VectorF16(_mm512_castph512_ph128(v)); + } else if 
constexpr(std::is_same_v && std::is_same_v::VectorType, __m256h>) { + return VectorF16(_mm512_castph512_ph256(v)); + } else if constexpr(std::is_same_v && std::is_same_v::VectorType, __m256h>) { + return VectorF16(_mm256_castph128_ph256(v)); + } else if constexpr(std::is_same_v && std::is_same_v::VectorType, __m512h>) { + return VectorF16(_mm512_castph128_ph512(v)); + } else if constexpr(std::is_same_v && std::is_same_v::VectorType, __m512h>) { + return VectorF16(_mm512_castph256_ph512(v)); + } else { + return VectorF16(v); + } + } -// constexpr VectorF16 operator+(VectorF16 b) const { -// if constexpr(std::is_same_v) { -// return VectorF16(_mm_add_ph(v, b.v)); -// } else if constexpr(std::is_same_v) { -// return VectorF16(_mm256_add_ph(v, b.v)); -// } else { -// return VectorF16(_mm512_add_ph(v, b.v)); -// } -// } + constexpr VectorF16 operator+(VectorF16 b) const { + if constexpr(std::is_same_v) { + return VectorF16(_mm_add_ph(v, b.v)); + } else if constexpr(std::is_same_v) { + return VectorF16(_mm256_add_ph(v, b.v)); + } else { + return VectorF16(_mm512_add_ph(v, b.v)); + } + } -// constexpr VectorF16 operator-(VectorF16 b) const { -// if constexpr(std::is_same_v) { -// return VectorF16(_mm_sub_ph(v, b.v)); -// } else if constexpr(std::is_same_v) { -// return VectorF16(_mm256_sub_ph(v, b.v)); -// } else { -// return VectorF16(_mm512_sub_ph(v, b.v)); -// } -// } + constexpr VectorF16 operator-(VectorF16 b) const { + if constexpr(std::is_same_v) { + return VectorF16(_mm_sub_ph(v, b.v)); + } else if constexpr(std::is_same_v) { + return VectorF16(_mm256_sub_ph(v, b.v)); + } else { + return VectorF16(_mm512_sub_ph(v, b.v)); + } + } -// constexpr VectorF16 operator*(VectorF16 b) const { -// if constexpr(std::is_same_v) { -// return VectorF16(_mm_mul_ph(v, b.v)); -// } else if constexpr(std::is_same_v) { -// return VectorF16(_mm256_mul_ph(v, b.v)); -// } else { -// return VectorF16(_mm512_mul_ph(v, b.v)); -// } -// } + constexpr VectorF16 operator*(VectorF16 b) const { 
+ if constexpr(std::is_same_v) { + return VectorF16(_mm_mul_ph(v, b.v)); + } else if constexpr(std::is_same_v) { + return VectorF16(_mm256_mul_ph(v, b.v)); + } else { + return VectorF16(_mm512_mul_ph(v, b.v)); + } + } -// constexpr VectorF16 operator/(VectorF16 b) const { -// if constexpr(std::is_same_v) { -// return VectorF16(_mm_div_ph(v, b.v)); -// } else if constexpr(std::is_same_v) { -// return VectorF16(_mm256_div_ph(v, b.v)); -// } else { -// return VectorF16(_mm512_div_ph(v, b.v)); -// } -// } + constexpr VectorF16 operator/(VectorF16 b) const { + if constexpr(std::is_same_v) { + return VectorF16(_mm_div_ph(v, b.v)); + } else if constexpr(std::is_same_v) { + return VectorF16(_mm256_div_ph(v, b.v)); + } else { + return VectorF16(_mm512_div_ph(v, b.v)); + } + } -// constexpr VectorF16 operator+=(VectorF16 b) const { -// if constexpr(std::is_same_v) { -// v = _mm_add_ph(v, b.v); -// } else if constexpr(std::is_same_v) { -// v = _mm256_add_ph(v, b.v); -// } else { -// v = _mm512_add_ph(v, b.v); -// } -// } + constexpr VectorF16 operator+=(VectorF16 b) const { + if constexpr(std::is_same_v) { + v = _mm_add_ph(v, b.v); + } else if constexpr(std::is_same_v) { + v = _mm256_add_ph(v, b.v); + } else { + v = _mm512_add_ph(v, b.v); + } + } -// constexpr VectorF16 operator-=(VectorF16 b) const { -// if constexpr(std::is_same_v) { -// v = _mm_sub_ph(v, b.v); -// } else if constexpr(std::is_same_v) { -// v = _mm256_sub_ph(v, b.v); -// } else { -// v = _mm512_sub_ph(v, b.v); -// } -// } + constexpr VectorF16 operator-=(VectorF16 b) const { + if constexpr(std::is_same_v) { + v = _mm_sub_ph(v, b.v); + } else if constexpr(std::is_same_v) { + v = _mm256_sub_ph(v, b.v); + } else { + v = _mm512_sub_ph(v, b.v); + } + } -// constexpr VectorF16 operator*=(VectorF16 b) const { -// if constexpr(std::is_same_v) { -// v = _mm_mul_ph(v, b.v); -// } else if constexpr(std::is_same_v) { -// v = _mm256_mul_ph(v, b.v); -// } else { -// v = _mm512_mul_ph(v, b.v); -// } -// } + constexpr 
VectorF16 operator*=(VectorF16 b) const { + if constexpr(std::is_same_v) { + v = _mm_mul_ph(v, b.v); + } else if constexpr(std::is_same_v) { + v = _mm256_mul_ph(v, b.v); + } else { + v = _mm512_mul_ph(v, b.v); + } + } -// constexpr VectorF16 operator/=(VectorF16 b) const { -// if constexpr(std::is_same_v) { -// v = _mm_div_ph(v, b.v); -// } else if constexpr(std::is_same_v) { -// v = _mm256_div_ph(v, b.v); -// } else { -// v = _mm512_div_ph(v, b.v); -// } -// } + constexpr VectorF16 operator/=(VectorF16 b) const { + if constexpr(std::is_same_v) { + v = _mm_div_ph(v, b.v); + } else if constexpr(std::is_same_v) { + v = _mm256_div_ph(v, b.v); + } else { + v = _mm512_div_ph(v, b.v); + } + } -// constexpr VectorF16 operator-(){ -// if constexpr(std::is_same_v) { -// alignas(16) constexpr std::uint64_t mask[] {0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000}; -// __m128i sign_mask = _mm_load_si128(reinterpret_cast(mask)); -// return VectorF16(_mm_castsi128_ph(_mm_xor_si128(sign_mask, _mm_castph_si128(v)))); -// } else if constexpr(std::is_same_v) { -// alignas(16) constexpr std::uint64_t mask[] {0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000}; -// __m256i sign_mask = _mm256_load_si256(reinterpret_cast(mask)); -// return VectorF16(_mm256_castsi256_ph(_mm256_xor_si256(sign_mask, _mm256_castph_si256(v)))); -// } else { -// alignas(16) constexpr std::uint64_t mask[] {0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 
0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000}; -// __m512i sign_mask = _mm512_load_si512(reinterpret_cast(mask)); -// return VectorF16(_mm512_castsi512_ph(_mm512_xor_si512(sign_mask, _mm512_castph_si512(v)))); -// } -// } + constexpr VectorF16 operator-(){ + if constexpr(std::is_same_v) { + alignas(16) constexpr std::uint64_t mask[] {0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000}; + __m128i sign_mask = _mm_load_si128(reinterpret_cast(mask)); + return VectorF16(_mm_castsi128_ph(_mm_xor_si128(sign_mask, _mm_castph_si128(v)))); + } else if constexpr(std::is_same_v) { + alignas(16) constexpr std::uint64_t mask[] {0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000}; + __m256i sign_mask = _mm256_load_si256(reinterpret_cast(mask)); + return VectorF16(_mm256_castsi256_ph(_mm256_xor_si256(sign_mask, _mm256_castph_si256(v)))); + } else { + alignas(16) constexpr std::uint64_t mask[] {0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000, 0b1000000000000000100000000000000010000000000000001000000000000000}; + __m512i sign_mask = 
_mm512_load_si512(reinterpret_cast(mask)); + return VectorF16(_mm512_castsi512_ph(_mm512_xor_si512(sign_mask, _mm512_castph_si512(v)))); + } + } -// constexpr bool operator==(VectorF16 b) const { -// if constexpr(std::is_same_v) { -// return _mm_cmp_ph_mask(v, b.v, _CMP_EQ_OQ) == 255; -// } else if constexpr(std::is_same_v) { -// return _mm256_cmp_ph_mask(v, b.v, _CMP_EQ_OQ) == 65535; -// } else { -// return _mm512_cmp_ph_mask(v, b.v, _CMP_EQ_OQ) == 4294967295; -// } -// } + constexpr bool operator==(VectorF16 b) const { + if constexpr(std::is_same_v) { + return _mm_cmp_ph_mask(v, b.v, _CMP_EQ_OQ) == 255; + } else if constexpr(std::is_same_v) { + return _mm256_cmp_ph_mask(v, b.v, _CMP_EQ_OQ) == 65535; + } else { + return _mm512_cmp_ph_mask(v, b.v, _CMP_EQ_OQ) == 4294967295; + } + } -// template -// constexpr bool operator!=(Vector b) const { -// if constexpr(std::is_same_v) { -// return _mm_cmp_ph_mask(v, b.v, _CMP_EQ_OQ) != 255; -// } else if constexpr(std::is_same_v) { -// return _mm256_cmp_ph_mask(v, b.v, _CMP_EQ_OQ) != 65535; -// } else { -// return _mm512_cmp_ph_mask(v, b.v, _CMP_EQ_OQ) != 4294967295; -// } -// } + template + constexpr bool operator!=(Vector b) const { + if constexpr(std::is_same_v) { + return _mm_cmp_ph_mask(v, b.v, _CMP_EQ_OQ) != 255; + } else if constexpr(std::is_same_v) { + return _mm256_cmp_ph_mask(v, b.v, _CMP_EQ_OQ) != 65535; + } else { + return _mm512_cmp_ph_mask(v, b.v, _CMP_EQ_OQ) != 4294967295; + } + } -// constexpr void Normalize() { -// if constexpr(std::is_same_v) { -// _Float16 dot = LengthSq(); -// __m128h vec = _mm_set1_ph(dot); -// __m128h sqrt = _mm_rsqrt_ph(vec); -// v = _mm_div_ps(v, sqrt); -// } else if constexpr(std::is_same_v) { -// // __m256h mul = _mm256_mul_ph(a.v, b.v); -// // return _mm256_reduce_add_ph(mul); -// } else { -// // __m512h mul = _mm512_mul_ph(a.v, b.v); -// // return _mm512_reduce_add_ph(mul); -// } -// } + constexpr void Normalize() { + if constexpr(std::is_same_v) { + _Float16 dot = LengthSq(); + 
__m128h vec = _mm_set1_ph(dot); + __m128h sqrt = _mm_rsqrt_ph(vec); + v = _mm_div_ps(v, sqrt); + } else if constexpr(std::is_same_v) { + _Float16 dot = LengthSq(); + __m256h vec = _mm256_set1_ph(dot); + __m256h sqrt = _mm256_rsqrt_ph(vec); + v = _mm256_div_ps(v, sqrt); + } else { + _Float16 dot = LengthSq(); + __m512h vec = _mm512_set1_ph(dot); + __m512h sqrt = _mm512_rsqrt_ph(vec); + v = _mm512_div_ps(v, sqrt); + } + } -// constexpr _Float16 Length() const { -// _Float16 Result = LengthSq(); -// return std::sqrtf(Result); -// } + constexpr _Float16 Length() const { + _Float16 Result = LengthSq(); + return std::sqrtf(Result); + } -// constexpr _Float16 LengthSq() const { -// return Dot(*this, *this); -// } + constexpr _Float16 LengthSq() const { + return Dot(*this, *this); + } -// // template -// // constexpr static Vector Cross(Vector a, Vector b) requires(Len == 3 && Alen >= 3 && Blen >= 3) { -// // return Vector( -// // (a.v[1] * b.v[2]) - (a.v[2] * b.v[1]), -// // (a.v[2] * b.v[0]) - (a.v[0] * b.v[2]), -// // (a.v[0] * b.v[1]) - (a.v[1] * b.v[0]) -// // ); -// // } + // template + // constexpr static Vector Cross(Vector a, Vector b) requires(Len == 3 && Alen >= 3 && Blen >= 3) { + // return Vector( + // (a.v[1] * b.v[2]) - (a.v[2] * b.v[1]), + // (a.v[2] * b.v[0]) - (a.v[0] * b.v[2]), + // (a.v[0] * b.v[1]) - (a.v[1] * b.v[0]) + // ); + // } -// // template -// // constexpr static Vector Normalize(Vector a) requires(Len == Alen) { -// // Vector returned; -// // T fLength = a.Length(); + // template + // constexpr static Vector Normalize(Vector a) requires(Len == Alen) { + // Vector returned; + // T fLength = a.Length(); -// // if (fLength > 0) { -// // fLength = 1.0f / fLength; -// // } + // fLength = 1.0f / fLength; -// // for(std::uint32_t i = 0; i < Len; i++) { -// // returned.v[i] = a.v[i] * fLength; -// // } -// // return returned; -// // } + // for(std::uint32_t i = 0; i < Len; i++) { + // returned.v[i] = a.v[i] * fLength; + // } + // return returned; + 
// } -// constexpr static _Float16 Dot(VectorF16 a, VectorF16 b) { -// if constexpr(std::is_same_v) { -// __m128h mul = _mm_mul_ph(a.v, b.v); -// return _mm_reduce_add_ph(mul); -// } else if constexpr(std::is_same_v) { -// __m256h mul = _mm256_mul_ph(a.v, b.v); -// return _mm256_reduce_add_ph(mul); -// } else { -// __m512h mul = _mm512_mul_ph(a.v, b.v); -// return _mm512_reduce_add_ph(mul); -// } -// } + constexpr static _Float16 Dot(VectorF16 a, VectorF16 b) { + if constexpr(std::is_same_v) { + __m128h mul = _mm_mul_ph(a.v, b.v); + return _mm_reduce_add_ph(mul); + } else if constexpr(std::is_same_v) { + __m256h mul = _mm256_mul_ph(a.v, b.v); + return _mm256_reduce_add_ph(mul); + } else { + __m512h mul = _mm512_mul_ph(a.v, b.v); + return _mm512_reduce_add_ph(mul); + } + } -// constexpr static VectorF16<8, 1, Repeats> Dot( -// VectorF16<8, 1, Repeats> A0, VectorF16<8, 1, Repeats> A1, -// VectorF16<8, 1, Repeats> B0, VectorF16<8, 1, Repeats> B1, -// VectorF16<8, 1, Repeats> C0, VectorF16<8, 1, Repeats> C1, -// VectorF16<8, 1, Repeats> D0, VectorF16<8, 1, Repeats> D1, -// VectorF16<8, 1, Repeats> E0, VectorF16<8, 1, Repeats> E1, -// VectorF16<8, 1, Repeats> F0, VectorF16<8, 1, Repeats> F1, -// VectorF16<8, 1, Repeats> G0, VectorF16<8, 1, Repeats> G1, -// VectorF16<8, 1, Repeats> H0, VectorF16<8, 1, Repeats> H1 -// ) { -// if constexpr(std::is_same_v) { -// __m128h mulA = _mm_mul_ph(A0.v, A1.v); -// __m128h mulB = _mm_mul_ph(B0.v, B1.v); -// __m128i row12Temp1 = _mm_unpacklo_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulB)); // A1 B1 A2 B2 A3 B3 A4 B4 -// __m128i row56Temp1 = _mm_unpackhi_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulB)); // A5 B5 A6 B6 A7 B7 A8 B8 -// __m128i row1TempTemp1 = row12Temp1; -// __m128i row5TempTemp1 = row56Temp1; + constexpr static std::tuple, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16, VectorF16> Normalize( + VectorF16 A, + VectorF16 B, + VectorF16 C, + VectorF16 D, + VectorF16 E, + VectorF16 F, + VectorF16 
G, + VectorF16 H + ) requires(Packing == 1) { + if constexpr(std::is_same_v) { + VectorF16 lenght = Length(A, B, C, D, E, F, G, H); + constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1}; + __m128h one = _mm_loadu_ph(oneArr); + __m128h fLenght = _mm_div_ph(one, lenght.v); -// __m128h mulC = _mm_mul_ph(C0.v, C1.v); -// __m128h mulD = _mm_mul_ph(D0.v, D1.v); -// __m128i row34Temp1 = _mm_unpacklo_epi16(_mm_castph_si128(mulC), _mm_castph_si128(mulD)); // C1 D1 C2 D2 C3 D3 C4 D4 -// __m128i row78Temp1 = _mm_unpackhi_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulB)); // C5 D5 C6 D6 C7 D7 C8 D8 + constexpr std::uint8_t shuffleMaskA[] { + 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1 + }; + __m128i shuffleVecA = _mm_loadu_epi8(shuffleMaskA); + __m128h fLenghtA = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA)); -// row12Temp1 = _mm_unpacklo_epi16(row12Temp1, row34Temp1); // A1 B1 A2 B2 C1 D1 C2 D2 -// row12Temp1 = _mm_shuffle_epi32(row12Temp1, 0b01'00'11'10); // A1 B1 C1 D1 A2 B2 C2 D2 + constexpr std::uint8_t shuffleMaskB[] { + 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3 + }; + __m128i shuffleVecB = _mm_loadu_epi8(shuffleMaskB); + __m128h fLenghtB = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecB)); -// row34Temp1 = _mm_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 B3 A4 B4 C3 D3 C4 D4 -// row34Temp1 = _mm_shuffle_epi32(row34Temp1, 0b01'00'11'10); // A3 B3 C3 D3 A4 B4 C4 D4 + constexpr std::uint8_t shuffleMaskC[] { + 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5 + }; + __m128i shuffleVecC = _mm_loadu_epi8(shuffleMaskC); + __m128h fLenghtC = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecC)); -// row56Temp1 = _mm_unpacklo_epi16(row56Temp1, row56Temp1); // A5 B5 A6 B6 C7 D7 C8 D8 -// row56Temp1 = _mm_shuffle_epi32(row56Temp1, 0b01'00'11'10); // A5 B5 C5 D5 A6 B6 C6 D6 + constexpr std::uint8_t shuffleMaskD[] { + 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7 + }; + __m128i shuffleVecD = _mm_loadu_epi8(shuffleMaskD); + __m128h fLenghtD = 
_mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecD)); -// row78Temp1 = _mm_unpackhi_epi16(row5TempTemp1, row78Temp1); // A7 B7 A8 B8 C7 D7 C8 D8 -// row78Temp1 = _mm_shuffle_epi32(row78Temp1, 0b01'00'11'10); // A7 B7 C7 D7 A8 B8 C8 D8 + constexpr std::uint8_t shuffleMaskE[] { + 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9 + }; + __m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE); + __m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE)); + constexpr std::uint8_t shuffleMaskF[] { + 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + }; + __m128i shuffleVecF = _mm_loadu_epi8(shuffleMaskF); + __m128h fLenghtF = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecF)); -// __m128h mulE = _mm_mul_ph(E0.v, E1.v); -// __m128h mulF = _mm_mul_ph(F0.v, F1.v); -// __m128i row12Temp2 = _mm_unpacklo_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulF)); //E1 F1 E2 F2 E3 F3 E4 F4 -// __m128i row56Temp2 = _mm_unpackhi_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulF)); //E5 F5 E6 F6 E7 F7 E8 F8 -// __m128i row1TempTemp2 = row12Temp2; -// __m128i row5TempTemp2 = row56Temp2; + constexpr std::uint8_t shuffleMaskG[] { + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, + }; + __m128i shuffleVecG = _mm_loadu_epi8(shuffleMaskG); + __m128h fLenghtG = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecG)); -// __m128h mulG = _mm_mul_ph(G0.v, G1.v); -// __m128h mulH = _mm_mul_ph(H0.v, H1.v); -// __m128i row34Temp2 = _mm_unpacklo_epi16(_mm_castph_si128(mulG), _mm_castph_si128(mulH)); //G1 H1 G2 H2 G3 H3 G4 H4 -// __m128i row78Temp2 = _mm_unpackhi_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulF)); //G5 H5 G6 H6 G7 H7 G8 H8 + constexpr std::uint8_t shuffleMaskH[] { + 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, + }; + __m128i shuffleVecH = _mm_loadu_epi8(shuffleMaskH); + __m128h fLenghtH = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecH)); -// row12Temp2 = 
_mm_unpacklo_epi16(row12Temp2, row34Temp2); // E1 F1 E2 F2 G1 H1 G2 H2 -// row12Temp2 = _mm_shuffle_epi32(row12Temp2, 0b01'00'11'10); // E1 F1 G1 H1 E2 F2 G2 H2 + return { + _mm_mul_ph(A.v, fLenghtA), + _mm_mul_ph(B.v, fLenghtB), + _mm_mul_ph(C.v, fLenghtC), + _mm_mul_ph(D.v, fLenghtD), + _mm_mul_ph(E.v, fLenghtE), + _mm_mul_ph(F.v, fLenghtF), + _mm_mul_ph(G.v, fLenghtG), + _mm_mul_ph(H.v, fLenghtH) + }; + } else if constexpr(std::is_same_v) { + VectorF16 lenght = Length(A, B, C, D, E, F, G, H); + constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + __m256h one = _mm256_loadu_ph(oneArr); + __m256h fLenght = _mm256_div_ph(one, lenght.v); -// row34Temp2 = _mm_unpackhi_epi16(row1TempTemp2, row34Temp2); // E3 F3 E4 F4 G3 H3 G4 H4 -// row34Temp2 = _mm_shuffle_epi32(row34Temp2, 0b01'00'11'10); // E3 F3 G3 H3 E4 F4 G4 H4 + constexpr std::uint8_t shuffleMaskA[] { + 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, + 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1 + }; + __m256i shuffleVecA = _mm256_loadu_epi8(shuffleMaskA); + __m256h fLenghtA = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecA)); -// row56Temp2 = _mm_unpacklo_epi16(row56Temp2, row56Temp2); // E5 F5 E6 F6 G7 H7 G8 H8 -// row56Temp2 = _mm_shuffle_epi32(row56Temp2, 0b01'00'11'10); // E5 F5 G5 H5 E6 F6 G6 H6 + constexpr std::uint8_t shuffleMaskB[] { + 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3, + 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3 + }; + __m256i shuffleVecB = _mm256_loadu_epi8(shuffleMaskB); + __m256h fLenghtB = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecB)); -// row78Temp2 = _mm_unpackhi_epi16(row5TempTemp2, row78Temp2); // E7 F7 E8 F8 G7 H7 G8 H8 -// row78Temp2 = _mm_shuffle_epi32(row78Temp2, 0b01'00'11'10); // E7 F7 G7 H7 E8 F8 G8 H8 + constexpr std::uint8_t shuffleMaskC[] { + 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5, + 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5 + }; + __m256i shuffleVecC = _mm256_loadu_epi8(shuffleMaskC); + __m256h fLenghtC = 
_mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecC)); -// __m128h row1 = _mm_castsi128_ph(_mm_unpackhi_epi16(row12Temp1, row12Temp2));// A1 B1 C1 D1 E1 F1 G1 H1 -// __m128h row2 = _mm_castsi128_ph(_mm_unpacklo_epi16(row12Temp1, row12Temp2));// A2 B2 C2 D2 E2 F2 G2 H2 -// __m128h row3 = _mm_castsi128_ph(_mm_unpackhi_epi16(row34Temp1, row34Temp2));// A3 B3 C3 D3 E3 F3 G3 H3 -// __m128h row4 = _mm_castsi128_ph(_mm_unpacklo_epi16(row34Temp1, row34Temp2));// A4 B4 C4 D4 E4 F4 G4 H4 -// __m128h row5 = _mm_castsi128_ph(_mm_unpackhi_epi16(row56Temp1, row56Temp2));// A5 B5 C5 D5 E5 F5 G5 H5 -// __m128h row6 = _mm_castsi128_ph(_mm_unpacklo_epi16(row56Temp1, row56Temp2));// A6 B6 C6 D6 E6 F6 G6 H6 -// __m128h row7 = _mm_castsi128_ph(_mm_unpackhi_epi16(row78Temp1, row78Temp2));// A7 B7 C7 D7 E7 F7 G7 H7 -// __m128h row8 = _mm_castsi128_ph(_mm_unpacklo_epi16(row78Temp1, row78Temp2));// A8 B8 C8 D8 E8 F8 G8 H8 + constexpr std::uint8_t shuffleMaskD[] { + 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7, + 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7 + }; + __m256i shuffleVecD = _mm256_loadu_epi8(shuffleMaskD); + __m256h fLenghtD = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecD)); + constexpr std::uint8_t shuffleMaskE[] { + 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9, + 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9 + }; + __m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE); + __m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE)); -// row1 = _mm_add_ph(row1, row2); -// row1 = _mm_add_ph(row1, row3); -// row1 = _mm_add_ph(row1, row4); -// row1 = _mm_add_ph(row1, row5); -// row1 = _mm_add_ph(row1, row6); -// row1 = _mm_add_ph(row1, row7); -// row1 = _mm_add_ph(row1, row8); + constexpr std::uint8_t shuffleMaskF[] { + 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + }; + __m256i shuffleVecF = _mm256_loadu_epi8(shuffleMaskF); + __m256h fLenghtF = 
_mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecF)); + + constexpr std::uint8_t shuffleMaskG[] { + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13 + }; + __m256i shuffleVecG = _mm256_loadu_epi8(shuffleMaskG); + __m256h fLenghtG = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecG)); + + constexpr std::uint8_t shuffleMaskH[] { + 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, + 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15 + }; + __m256i shuffleVecH = _mm256_loadu_epi8(shuffleMaskH); + __m256h fLenghtH = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecH)); + + return { + _mm256_mul_ph(A.v, fLenghtA), + _mm256_mul_ph(B.v, fLenghtB), + _mm256_mul_ph(C.v, fLenghtC), + _mm256_mul_ph(D.v, fLenghtD), + _mm256_mul_ph(E.v, fLenghtE), + _mm256_mul_ph(F.v, fLenghtF), + _mm256_mul_ph(G.v, fLenghtG), + _mm256_mul_ph(H.v, fLenghtH) + }; + } else { + VectorF16 lenght = Length(A, B, C, D, E, F, G, H); + constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + __m512h one = _mm512_loadu_ph(oneArr); + __m512h fLenght = _mm512_div_ph(one, lenght.v); + + constexpr std::uint8_t shuffleMaskA[] { + 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, + 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, + 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, + 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1 + }; + __m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA); + __m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA)); + + constexpr std::uint8_t shuffleMaskB[] { + 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3, + 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3, + 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3, + 2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3 + }; + __m512i shuffleVecB = _mm512_loadu_epi8(shuffleMaskB); + __m512h fLenghtB = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecB)); + + constexpr std::uint8_t 
shuffleMaskC[] { + 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5, + 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5, + 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5, + 4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5 + }; + __m512i shuffleVecC = _mm512_loadu_epi8(shuffleMaskC); + __m512h fLenghtC = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecC)); + + constexpr std::uint8_t shuffleMaskD[] { + 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7, + 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7, + 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7, + 6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7 + }; + __m512i shuffleVecD = _mm512_loadu_epi8(shuffleMaskD); + __m512h fLenghtD = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecD)); + + constexpr std::uint8_t shuffleMaskE[] { + 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9, + 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9, + 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9, + 8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9 + }; + __m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE); + __m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE)); + + constexpr std::uint8_t shuffleMaskF[] { + 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11, + }; + __m512i shuffleVecF = _mm512_loadu_epi8(shuffleMaskF); + __m512h fLenghtF = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecF)); + + constexpr std::uint8_t shuffleMaskG[] { + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, + 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13 + }; + __m512i shuffleVecG = _mm512_loadu_epi8(shuffleMaskG); + __m512h fLenghtG = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecG)); + + constexpr std::uint8_t shuffleMaskH[] { + 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, + 
14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, + 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15, + 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15 + }; + __m512i shuffleVecH = _mm512_loadu_epi8(shuffleMaskH); + __m512h fLenghtH = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecH)); + + return { + _mm512_mul_ph(A.v, fLenghtA), + _mm512_mul_ph(B.v, fLenghtB), + _mm512_mul_ph(C.v, fLenghtC), + _mm512_mul_ph(D.v, fLenghtD), + _mm512_mul_ph(E.v, fLenghtE), + _mm512_mul_ph(F.v, fLenghtF), + _mm512_mul_ph(G.v, fLenghtG), + _mm512_mul_ph(H.v, fLenghtH) + }; + } + } + + constexpr static VectorF16 Length( + VectorF16 A, + VectorF16 B, + VectorF16 C, + VectorF16 D, + VectorF16 E, + VectorF16 F, + VectorF16 G, + VectorF16 H + ) requires(Packing == 1) { + VectorF16 lenghtSq = LengthSq(A, B, C, D, E, F, G, H); + if constexpr(std::is_same_v) { + return VectorF16(_mm_sqrt_ph(lenghtSq.v)); + } else if constexpr(std::is_same_v) { + return VectorF16(_mm256_sqrt_ph(lenghtSq.v)); + } else { + return VectorF16(_mm512_sqrt_ph(lenghtSq.v)); + } + } + + constexpr static VectorF16 LengthSq( + VectorF16 A, + VectorF16 B, + VectorF16 C, + VectorF16 D, + VectorF16 E, + VectorF16 F, + VectorF16 G, + VectorF16 H + ) requires(Packing == 1) { + return Dot(A, A, B, B, C, C, D, D, E, E, F, F, G, G, H, H); + } + + constexpr static VectorF16 Dot( + VectorF16 A0, VectorF16 A1, + VectorF16 B0, VectorF16 B1, + VectorF16 C0, VectorF16 C1, + VectorF16 D0, VectorF16 D1, + VectorF16 E0, VectorF16 E1, + VectorF16 F0, VectorF16 F1, + VectorF16 G0, VectorF16 G1, + VectorF16 H0, VectorF16 H1 + ) requires(Packing == 1) { + if constexpr(std::is_same_v) { + __m128h mulA = _mm_mul_ph(A0.v, A1.v); + __m128h mulB = _mm_mul_ph(B0.v, B1.v); + __m128i row12Temp1 = _mm_unpacklo_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulB)); // A1 B1 A2 B2 A3 B3 A4 B4 + __m128i row56Temp1 = _mm_unpackhi_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulB)); // A5 B5 A6 B6 A7 B7 A8 B8 + __m128i 
row1TempTemp1 = row12Temp1; + __m128i row5TempTemp1 = row56Temp1; + + __m128h mulC = _mm_mul_ph(C0.v, C1.v); + __m128h mulD = _mm_mul_ph(D0.v, D1.v); + __m128i row34Temp1 = _mm_unpacklo_epi16(_mm_castph_si128(mulC), _mm_castph_si128(mulD)); // C1 D1 C2 D2 C3 D3 C4 D4 + __m128i row78Temp1 = _mm_unpackhi_epi16(_mm_castph_si128(mulC), _mm_castph_si128(mulD)); // C5 D5 C6 D6 C7 D7 C8 D8 (high halves of mulC/mulD) + + row12Temp1 = _mm_unpacklo_epi16(row12Temp1, row34Temp1); // A1 C1 B1 D1 A2 C2 B2 D2 + row34Temp1 = _mm_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 C3 B3 D3 A4 C4 B4 D4 + row56Temp1 = _mm_unpacklo_epi16(row56Temp1, row78Temp1); // A5 C5 B5 D5 A6 C6 B6 D6 + row78Temp1 = _mm_unpackhi_epi16(row5TempTemp1, row78Temp1); // A7 C7 B7 D7 A8 C8 B8 D8 + + __m128h mulE = _mm_mul_ph(E0.v, E1.v); + __m128h mulF = _mm_mul_ph(F0.v, F1.v); + __m128i row12Temp2 = _mm_unpacklo_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulF)); //E1 F1 E2 F2 E3 F3 E4 F4 + __m128i row56Temp2 = _mm_unpackhi_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulF)); //E5 F5 E6 F6 E7 F7 E8 F8 + __m128i row1TempTemp2 = row12Temp2; + __m128i row5TempTemp2 = row56Temp2; + + __m128h mulG = _mm_mul_ph(G0.v, G1.v); + __m128h mulH = _mm_mul_ph(H0.v, H1.v); + __m128i row34Temp2 = _mm_unpacklo_epi16(_mm_castph_si128(mulG), _mm_castph_si128(mulH)); //G1 H1 G2 H2 G3 H3 G4 H4 + __m128i row78Temp2 = _mm_unpackhi_epi16(_mm_castph_si128(mulG), _mm_castph_si128(mulH)); //G5 H5 G6 H6 G7 H7 G8 H8 (high halves of mulG/mulH) + + row12Temp2 = _mm_unpacklo_epi16(row12Temp2, row34Temp2); // E1 G1 F1 H1 E2 G2 F2 H2 + row34Temp2 = _mm_unpackhi_epi16(row1TempTemp2, row34Temp2); // E3 G3 F3 H3 E4 G4 F4 H4 + row56Temp2 = _mm_unpacklo_epi16(row56Temp2, row78Temp2); // E5 G5 F5 H5 E6 G6 F6 H6 + row78Temp2 = _mm_unpackhi_epi16(row5TempTemp2, row78Temp2); // E7 G7 F7 H7 E8 G8 F8 H8 + + __m128h row1 = _mm_castsi128_ph(_mm_unpackhi_epi16(row12Temp1, row12Temp2));// A1 E1 C1 G1 B1 F1 D1 H1 NOTE(review): output lanes are ordered A E C G B F D H rather than A..H, while Normalize appears to pair lane k with its k-th argument; confirm the intended order + __m128h row2 = _mm_castsi128_ph(_mm_unpacklo_epi16(row12Temp1, row12Temp2));// A2 E2 C2 
G2 B2 F2 D2 H2 + __m128h row3 = _mm_castsi128_ph(_mm_unpackhi_epi16(row34Temp1, row34Temp2));// A3 E3 C3 G3 B3 F3 D3 H3 + __m128h row4 = _mm_castsi128_ph(_mm_unpacklo_epi16(row34Temp1, row34Temp2));// A4 E4 C4 G4 B4 F4 D4 H4 + __m128h row5 = _mm_castsi128_ph(_mm_unpackhi_epi16(row56Temp1, row56Temp2));// A5 E5 C5 G5 B5 F5 D5 H5 + __m128h row6 = _mm_castsi128_ph(_mm_unpacklo_epi16(row56Temp1, row56Temp2));// A6 E6 C6 G6 B6 F6 D6 H6 + __m128h row7 = _mm_castsi128_ph(_mm_unpackhi_epi16(row78Temp1, row78Temp2));// A7 E7 C7 G7 B7 F7 D7 H7 + __m128h row8 = _mm_castsi128_ph(_mm_unpacklo_epi16(row78Temp1, row78Temp2));// A8 E8 C8 G8 B8 F8 D8 H8 + + row1 = _mm_add_ph(row1, row2); + row1 = _mm_add_ph(row1, row3); + row1 = _mm_add_ph(row1, row4); + row1 = _mm_add_ph(row1, row5); + row1 = _mm_add_ph(row1, row6); + row1 = _mm_add_ph(row1, row7); + row1 = _mm_add_ph(row1, row8); -// return row1; -// } else if constexpr(std::is_same_v) { -// // __m256h mul = _mm256_mul_ph(a.v, b.v); -// // return _mm256_reduce_add_ph(mul); -// } else { -// // __m512h mul = _mm512_mul_ph(a.v, b.v); -// // return _mm512_reduce_add_ph(mul); -// } -// } + return row1; + } else if constexpr(std::is_same_v) { + __m256h mulA = _mm256_mul_ph(A0.v, A1.v); + __m256h mulB = _mm256_mul_ph(B0.v, B1.v); + __m256i row12Temp1 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulB)); // A1 B1 A2 B2 A3 B3 A4 B4 + __m256i row56Temp1 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulB)); // A5 B5 A6 B6 A7 B7 A8 B8 + __m256i row1TempTemp1 = row12Temp1; + __m256i row5TempTemp1 = row56Temp1; -// // template -// // constexpr static Vector Rotate(Vector v, Vector q) requires(Len == 3) { -// // Vector qv(q.x, q.y, q.z); -// // Vector t = Vector::Cross(qv, v) * T(2); -// // return v + t * q.w + Vector::Cross(qv, t); -// // } + __m256h mulC = _mm256_mul_ph(C0.v, C1.v); + __m256h mulD = _mm256_mul_ph(D0.v, D1.v); + __m256i row34Temp1 = 
_mm256_unpacklo_epi16(_mm256_castph_si256(mulC), _mm256_castph_si256(mulD)); // C1 D1 C2 D2 C3 D3 C4 D4 + __m256i row78Temp1 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulC), _mm256_castph_si256(mulD)); // C5 D5 C6 D6 C7 D7 C8 D8 (high halves of mulC/mulD) -// // template -// // constexpr static Vector RotatePivot(Vector v, Vector q, Vector pivot) requires(Len == 3) { -// // Vector translated = v - pivot; -// // Vector qv(q.x, q.y, q.z); -// // Vector t = Cross(qv, translated) * T(2); -// // Vector rotated = translated + t * q.w +Cross(qv, t); -// // return rotated + pivot; -// // } + row12Temp1 = _mm256_unpacklo_epi16(row12Temp1, row34Temp1); // A1 C1 B1 D1 A2 C2 B2 D2 + row34Temp1 = _mm256_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 C3 B3 D3 A4 C4 B4 D4 + row56Temp1 = _mm256_unpacklo_epi16(row56Temp1, row78Temp1); // A5 C5 B5 D5 A6 C6 B6 D6 + row78Temp1 = _mm256_unpackhi_epi16(row5TempTemp1, row78Temp1); // A7 C7 B7 D7 A8 C8 B8 D8 -// // template -// // constexpr static Vector QuanternionFromBasis(Vector right, Vector up, Vector forward) requires(Len == 4) { -// // T m00 = right.x; -// // T m01 = up.x; -// // T m02 = forward.x; + __m256h mulE = _mm256_mul_ph(E0.v, E1.v); + __m256h mulF = _mm256_mul_ph(F0.v, F1.v); + __m256i row12Temp2 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulE), _mm256_castph_si256(mulF)); //E1 F1 E2 F2 E3 F3 E4 F4 + __m256i row56Temp2 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulE), _mm256_castph_si256(mulF)); //E5 F5 E6 F6 E7 F7 E8 F8 + __m256i row1TempTemp2 = row12Temp2; + __m256i row5TempTemp2 = row56Temp2; -// // T m10 = right.y; -// // T m11 = up.y; -// // T m12 = forward.y; + __m256h mulG = _mm256_mul_ph(G0.v, G1.v); + __m256h mulH = _mm256_mul_ph(H0.v, H1.v); + __m256i row34Temp2 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulG), _mm256_castph_si256(mulH)); //G1 H1 G2 H2 G3 H3 G4 H4 + __m256i row78Temp2 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulG), _mm256_castph_si256(mulH)); //G5 H5 G6 H6 G7 H7 G8 H8 (high halves of mulG/mulH) -// // T m20 = right.z; -// // T m21 = up.z; 
-// // T m22 = forward.z; + row12Temp2 = _mm256_unpacklo_epi16(row12Temp2, row34Temp2); // E1 G1 F1 H1 E2 G2 F2 H2 + row34Temp2 = _mm256_unpackhi_epi16(row1TempTemp2, row34Temp2); // E3 G3 F3 H3 E4 G4 F4 H4 + row56Temp2 = _mm256_unpacklo_epi16(row56Temp2, row78Temp2); // E5 G5 F5 H5 E6 G6 F6 H6 + row78Temp2 = _mm256_unpackhi_epi16(row5TempTemp2, row78Temp2); // E7 G7 F7 H7 E8 G8 F8 H8 -// // T trace = m00 + m11 + m22; + __m256h row1 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row12Temp1, row12Temp2));// A1 E1 C1 G1 B1 F1 D1 H1 + __m256h row2 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row12Temp1, row12Temp2));// A2 E2 C2 G2 B2 F2 D2 H2 + __m256h row3 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row34Temp1, row34Temp2));// A3 E3 C3 G3 B3 F3 D3 H3 + __m256h row4 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row34Temp1, row34Temp2));// A4 E4 C4 G4 B4 F4 D4 H4 + __m256h row5 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row56Temp1, row56Temp2));// A5 E5 C5 G5 B5 F5 D5 H5 + __m256h row6 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row56Temp1, row56Temp2));// A6 E6 C6 G6 B6 F6 D6 H6 + __m256h row7 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row78Temp1, row78Temp2));// A7 E7 C7 G7 B7 F7 D7 H7 + __m256h row8 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row78Temp1, row78Temp2));// A8 E8 C8 G8 B8 F8 D8 H8 -// // Vector q; + row1 = _mm256_add_ph(row1, row2); + row1 = _mm256_add_ph(row1, row3); + row1 = _mm256_add_ph(row1, row4); + row1 = _mm256_add_ph(row1, row5); + row1 = _mm256_add_ph(row1, row6); + row1 = _mm256_add_ph(row1, row7); + row1 = _mm256_add_ph(row1, row8); + + return row1; + } else { + __m512h mulA = _mm512_mul_ph(A0.v, A1.v); + __m512h mulB = _mm512_mul_ph(B0.v, B1.v); + __m512i row12Temp1 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulB)); // A1 B1 A2 B2 A3 B3 A4 B4 + __m512i row56Temp1 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulB)); // A5 B5 A6 B6 A7 B7 A8 B8 + __m512i row1TempTemp1 = row12Temp1; + __m512i 
row5TempTemp1 = row56Temp1; -// // if (trace > std::numeric_limits::epsilon()) { -// // T s = std::sqrt(trace + T(1)) * T(2); -// // q.w = T(0.25) * s; -// // q.x = (m21 - m12) / s; -// // q.y = (m02 - m20) / s; -// // q.z = (m10 - m01) / s; -// // } -// // else if ((m00 > m11) && (m00 > m22)) { -// // T s = std::sqrt(T(1) + m00 - m11 - m22) * T(2); -// // q.w = (m21 - m12) / s; -// // q.x = T(0.25) * s; -// // q.y = (m01 + m10) / s; -// // q.z = (m02 + m20) / s; -// // } -// // else if (m11 > m22) { -// // T s = std::sqrt(T(1) + m11 - m00 - m22) * T(2); -// // q.w = (m02 - m20) / s; -// // q.x = (m01 + m10) / s; -// // q.y = T(0.25) * s; -// // q.z = (m12 + m21) / s; -// // } -// // else { -// // T s = std::sqrt(T(1) + m22 - m00 - m11) * T(2); -// // q.w = (m10 - m01) / s; -// // q.x = (m02 + m20) / s; -// // q.y = (m12 + m21) / s; -// // q.z = T(0.25) * s; -// // } + __m512h mulC = _mm512_mul_ph(C0.v, C1.v); + __m512h mulD = _mm512_mul_ph(D0.v, D1.v); + __m512i row34Temp1 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulC), _mm512_castph_si512(mulD)); // C1 D1 C2 D2 C3 D3 C4 D4 + __m512i row78Temp1 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulC), _mm512_castph_si512(mulD)); // C5 D5 C6 D6 C7 D7 C8 D8 (high halves of mulC/mulD) -// // q.Normalize(); -// // return q; -// // } + row12Temp1 = _mm512_unpacklo_epi16(row12Temp1, row34Temp1); // A1 C1 B1 D1 A2 C2 B2 D2 + row34Temp1 = _mm512_unpackhi_epi16(row1TempTemp1, row34Temp1); // A3 C3 B3 D3 A4 C4 B4 D4 + row56Temp1 = _mm512_unpacklo_epi16(row56Temp1, row78Temp1); // A5 C5 B5 D5 A6 C6 B6 D6 + row78Temp1 = _mm512_unpackhi_epi16(row5TempTemp1, row78Temp1); // A7 C7 B7 D7 A8 C8 B8 D8 -// // constexpr static Vector QuanternionFromEuler(T roll, T pitch, T yaw) { -// // T cr = std::cos(roll * 0.5); -// // T sr = std::sin(roll * 0.5); -// // T cp = std::cos(pitch * 0.5); -// // T sp = std::sin(pitch * 0.5); -// // T cy = std::cos(yaw * 0.5); -// // T sy = std::sin(yaw * 0.5); + __m512h mulE = _mm512_mul_ph(E0.v, E1.v); + __m512h mulF = 
_mm512_mul_ph(F0.v, F1.v); + __m512i row12Temp2 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulE), _mm512_castph_si512(mulF)); //E1 F1 E2 F2 E3 F3 E4 F4 + __m512i row56Temp2 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulE), _mm512_castph_si512(mulF)); //E5 F5 E6 F6 E7 F7 E8 F8 + __m512i row1TempTemp2 = row12Temp2; + __m512i row5TempTemp2 = row56Temp2; -// // return Vector( -// // sr * cp * cy - cr * sp * sy, -// // cr * sp * cy + sr * cp * sy, -// // cr * cp * sy - sr * sp * cy, -// // cr * cp * cy + sr * sp * sy -// // ); -// // } -// }; -// } + __m512h mulG = _mm512_mul_ph(G0.v, G1.v); + __m512h mulH = _mm512_mul_ph(H0.v, H1.v); + __m512i row34Temp2 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulG), _mm512_castph_si512(mulH)); //G1 H1 G2 H2 G3 H3 G4 H4 + __m512i row78Temp2 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulG), _mm512_castph_si512(mulH)); //G5 H5 G6 H6 G7 H7 G8 H8 (high halves of mulG/mulH) + + row12Temp2 = _mm512_unpacklo_epi16(row12Temp2, row34Temp2); // E1 G1 F1 H1 E2 G2 F2 H2 + row34Temp2 = _mm512_unpackhi_epi16(row1TempTemp2, row34Temp2); // E3 G3 F3 H3 E4 G4 F4 H4 + row56Temp2 = _mm512_unpacklo_epi16(row56Temp2, row78Temp2); // E5 G5 F5 H5 E6 G6 F6 H6 + row78Temp2 = _mm512_unpackhi_epi16(row5TempTemp2, row78Temp2); // E7 G7 F7 H7 E8 G8 F8 H8 + + __m512h row1 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row12Temp1, row12Temp2));// A1 E1 C1 G1 B1 F1 D1 H1 + __m512h row2 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row12Temp1, row12Temp2));// A2 E2 C2 G2 B2 F2 D2 H2 + __m512h row3 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row34Temp1, row34Temp2));// A3 E3 C3 G3 B3 F3 D3 H3 + __m512h row4 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row34Temp1, row34Temp2));// A4 E4 C4 G4 B4 F4 D4 H4 + __m512h row5 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row56Temp1, row56Temp2));// A5 E5 C5 G5 B5 F5 D5 H5 + __m512h row6 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row56Temp1, row56Temp2));// A6 E6 C6 G6 B6 F6 D6 H6 + __m512h row7 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row78Temp1, 
row78Temp2));// A7 E7 C7 G7 B7 F7 D7 H7 + __m512h row8 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row78Temp1, row78Temp2));// A8 E8 C8 G8 B8 F8 D8 H8 + + row1 = _mm512_add_ph(row1, row2); + row1 = _mm512_add_ph(row1, row3); + row1 = _mm512_add_ph(row1, row4); + row1 = _mm512_add_ph(row1, row5); + row1 = _mm512_add_ph(row1, row6); + row1 = _mm512_add_ph(row1, row7); + row1 = _mm512_add_ph(row1, row8); + + return row1; + } + } + + constexpr static VectorF16 Dot( + VectorF16 A0, VectorF16 A1, + VectorF16 C0, VectorF16 C1, + VectorF16 E0, VectorF16 E1, + VectorF16 G0, VectorF16 G1 + ) requires(Packing == 2) { + if constexpr(std::is_same_v) { + __m128h mulA = _mm_mul_ph(A0.v, A1.v); + __m128h mulC = _mm_mul_ph(C0.v, C1.v); + __m128i row12Temp1 = _mm_unpacklo_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulC)); // A1 C1 A2 C2 A3 C3 A4 C4 + __m128i row34Temp1 = _mm_unpackhi_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulC)); // B1 D1 B2 D2 B3 D3 B4 D4 + __m128i row1TempTemp1 = row12Temp1; + __m128i row5TempTemp1 = row34Temp1; + + __m128h mulE = _mm_mul_ph(E0.v, E1.v); + __m128h mulG = _mm_mul_ph(G0.v, G1.v); + __m128i row12Temp2 = _mm_unpacklo_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulG)); // E1 G1 E2 G2 E3 G3 E4 G4 + __m128i row34Temp2 = _mm_unpackhi_epi16(_mm_castph_si128(mulE), _mm_castph_si128(mulG)); // F1 H1 F2 H2 F3 H3 F4 H4 + + row12Temp1 = _mm_unpacklo_epi16(row12Temp1, row12Temp2); // A1 E1 C1 G1 A2 E2 C2 G2 + row34Temp1 = _mm_unpackhi_epi16(row1TempTemp1, row12Temp2); // A3 E3 C3 G3 A4 E4 C4 G4 (must read row12Temp2 before it is overwritten) + row12Temp2 = _mm_unpacklo_epi16(row5TempTemp1, row34Temp2); // B1 F1 D1 H1 B2 F2 D2 H2 (row5TempTemp1 is the saved B/D interleave) + row34Temp2 = _mm_unpackhi_epi16(row5TempTemp1, row34Temp2); // B3 F3 D3 H3 B4 F4 D4 H4 + + __m128h row1 = _mm_castsi128_ph(_mm_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 E1 F1 C1 D1 G1 H1 + __m128h row2 = _mm_castsi128_ph(_mm_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 E2 F2 C2 D2 G2 H2 + __m128h row3 = _mm_castsi128_ph(_mm_unpacklo_epi16(row34Temp1, 
row34Temp2));// A3 B3 E3 F3 C3 D3 G3 H3 + __m128h row4 = _mm_castsi128_ph(_mm_unpackhi_epi16(row34Temp1, row34Temp2));// A4 B4 E4 F4 C4 D4 G4 H4 + + row1 = _mm_add_ph(row1, row2); + row1 = _mm_add_ph(row1, row3); + row1 = _mm_add_ph(row1, row4); + + return row1; + } else if constexpr(std::is_same_v) { + __m256h mulA = _mm256_mul_ph(A0.v, A1.v); + __m256h mulC = _mm256_mul_ph(C0.v, C1.v); + __m256i row12Temp1 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulC)); // A1 C1 A2 C2 A3 C3 A4 C4 + __m256i row34Temp1 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulC)); // B1 D1 B2 D2 B3 D3 B4 D4 + __m256i row1TempTemp1 = row12Temp1; + __m256i row5TempTemp1 = row34Temp1; + + __m256h mulE = _mm256_mul_ph(E0.v, E1.v); + __m256h mulG = _mm256_mul_ph(G0.v, G1.v); + __m256i row12Temp2 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulE), _mm256_castph_si256(mulG)); // E1 G1 E2 G2 E3 G3 E4 G4 + __m256i row34Temp2 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulE), _mm256_castph_si256(mulG)); // F1 H1 F2 H2 F3 H3 F4 H4 + + row12Temp1 = _mm256_unpacklo_epi16(row12Temp1, row12Temp2); // A1 E1 C1 G1 A2 E2 C2 G2 + row34Temp1 = _mm256_unpackhi_epi16(row1TempTemp1, row12Temp2); // A3 E3 C3 G3 A4 E4 C4 G4 (must read row12Temp2 before it is overwritten) + row12Temp2 = _mm256_unpacklo_epi16(row5TempTemp1, row34Temp2); // B1 F1 D1 H1 B2 F2 D2 H2 (row5TempTemp1 is the saved B/D interleave) + row34Temp2 = _mm256_unpackhi_epi16(row5TempTemp1, row34Temp2); // B3 F3 D3 H3 B4 F4 D4 H4 + + __m256h row1 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 E1 F1 C1 D1 G1 H1 + __m256h row2 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 E2 F2 C2 D2 G2 H2 + __m256h row3 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row34Temp1, row34Temp2));// A3 B3 E3 F3 C3 D3 G3 H3 + __m256h row4 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row34Temp1, row34Temp2));// A4 B4 E4 F4 C4 D4 G4 H4 + + row1 = _mm256_add_ph(row1, row2); + row1 = _mm256_add_ph(row1, row3); + row1 = _mm256_add_ph(row1, row4); + + return 
row1; + } else { + __m512h mulA = _mm512_mul_ph(A0.v, A1.v); + __m512h mulC = _mm512_mul_ph(C0.v, C1.v); + __m512i row12Temp1 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulC)); // A1 C1 A2 C2 A3 C3 A4 C4 + __m512i row34Temp1 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulC)); // B1 D1 B2 D2 B3 D3 B4 D4 + __m512i row1TempTemp1 = row12Temp1; + __m512i row5TempTemp1 = row34Temp1; + + __m512h mulE = _mm512_mul_ph(E0.v, E1.v); + __m512h mulG = _mm512_mul_ph(G0.v, G1.v); + __m512i row12Temp2 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulE), _mm512_castph_si512(mulG)); // E1 G1 E2 G2 E3 G3 E4 G4 + __m512i row34Temp2 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulE), _mm512_castph_si512(mulG)); // F1 H1 F2 H2 F3 H3 F4 H4 + + row12Temp1 = _mm512_unpacklo_epi16(row12Temp1, row12Temp2); // A1 E1 C1 G1 A2 E2 C2 G2 + row34Temp1 = _mm512_unpackhi_epi16(row1TempTemp1, row12Temp2); // A3 E3 C3 G3 A4 E4 C4 G4 (must read row12Temp2 before it is overwritten) + row12Temp2 = _mm512_unpacklo_epi16(row5TempTemp1, row34Temp2); // B1 F1 D1 H1 B2 F2 D2 H2 (row5TempTemp1 is the saved B/D interleave) + row34Temp2 = _mm512_unpackhi_epi16(row5TempTemp1, row34Temp2); // B3 F3 D3 H3 B4 F4 D4 H4 + + __m512h row1 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 E1 F1 C1 D1 G1 H1 + __m512h row2 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 E2 F2 C2 D2 G2 H2 + __m512h row3 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row34Temp1, row34Temp2));// A3 B3 E3 F3 C3 D3 G3 H3 + __m512h row4 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row34Temp1, row34Temp2));// A4 B4 E4 F4 C4 D4 G4 H4 + + row1 = _mm512_add_ph(row1, row2); + row1 = _mm512_add_ph(row1, row3); + row1 = _mm512_add_ph(row1, row4); + + return row1; + } + } + + constexpr static VectorF16 Dot( + VectorF16 A0, VectorF16 A1, + VectorF16 E0, VectorF16 E1 + ) requires(Packing == 4) { + if constexpr(std::is_same_v) { + __m128h mulA = _mm_mul_ph(A0.v, A1.v); + __m128h mulE = _mm_mul_ph(E0.v, E1.v); + __m128i row12Temp1 = 
_mm_unpacklo_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulE)); // A1 E1 A2 E2 B1 F1 B2 F2 + __m128i row12Temp2 = _mm_unpackhi_epi16(_mm_castph_si128(mulA), _mm_castph_si128(mulE)); // C1 G1 C2 G2 D1 H1 D2 H2 + __m128i row12Temp1Temp = row12Temp1; + + row12Temp1 = _mm_unpacklo_epi16(row12Temp1, row12Temp2); // A1 C1 E1 G1 A2 C2 E2 G2 + row12Temp2 = _mm_unpackhi_epi16(row12Temp1Temp, row12Temp2); // B1 D1 F1 H1 B2 D2 F2 H2 + + __m128h row1 = _mm_castsi128_ph(_mm_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 C1 D1 E1 F1 G1 H1 + __m128h row2 = _mm_castsi128_ph(_mm_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 C2 D2 E2 F2 G2 H2 + + return _mm_add_ph(row1, row2); + } else if constexpr(std::is_same_v) { + __m256h mulA = _mm256_mul_ph(A0.v, A1.v); + __m256h mulE = _mm256_mul_ph(E0.v, E1.v); + __m256i row12Temp1 = _mm256_unpacklo_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulE)); // A1 E1 A2 E2 B1 F1 B2 F2 + __m256i row12Temp2 = _mm256_unpackhi_epi16(_mm256_castph_si256(mulA), _mm256_castph_si256(mulE)); // C1 G1 C2 G2 D1 H1 D2 H2 + __m256i row12Temp1Temp = row12Temp1; + + row12Temp1 = _mm256_unpacklo_epi16(row12Temp1, row12Temp2); // A1 C1 E1 G1 A2 C2 E2 G2 + row12Temp2 = _mm256_unpackhi_epi16(row12Temp1Temp, row12Temp2); // B1 D1 F1 H1 B2 D2 F2 H2 + + __m256h row1 = _mm256_castsi256_ph(_mm256_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 C1 D1 E1 F1 G1 H1 + __m256h row2 = _mm256_castsi256_ph(_mm256_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 C2 D2 E2 F2 G2 H2 + + return _mm256_add_ph(row1, row2); + } else { + __m512h mulA = _mm512_mul_ph(A0.v, A1.v); + __m512h mulE = _mm512_mul_ph(E0.v, E1.v); + __m512i row12Temp1 = _mm512_unpacklo_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulE)); // A1 E1 A2 E2 B1 F1 B2 F2 + __m512i row12Temp2 = _mm512_unpackhi_epi16(_mm512_castph_si512(mulA), _mm512_castph_si512(mulE)); // C1 G1 C2 G2 D1 H1 D2 H2 + __m512i row12Temp1Temp = row12Temp1; + + row12Temp1 = _mm512_unpacklo_epi16(row12Temp1, row12Temp2); 
// A1 C1 E1 G1 A2 C2 E2 G2 + row12Temp2 = _mm512_unpackhi_epi16(row12Temp1Temp, row12Temp2); // B1 D1 F1 H1 B2 D2 F2 H2 + + __m512h row1 = _mm512_castsi512_ph(_mm512_unpacklo_epi16(row12Temp1, row12Temp2));// A1 B1 C1 D1 E1 F1 G1 H1 + __m512h row2 = _mm512_castsi512_ph(_mm512_unpackhi_epi16(row12Temp1, row12Temp2));// A2 B2 C2 D2 E2 F2 G2 H2 + + return _mm512_add_ph(row1, row2); + } + } + + // template + // constexpr static Vector Rotate(Vector v, Vector q) requires(Len == 3) { + // Vector qv(q.x, q.y, q.z); + // Vector t = Vector::Cross(qv, v) * T(2); + // return v + t * q.w + Vector::Cross(qv, t); + // } + + // template + // constexpr static Vector RotatePivot(Vector v, Vector q, Vector pivot) requires(Len == 3) { + // Vector translated = v - pivot; + // Vector qv(q.x, q.y, q.z); + // Vector t = Cross(qv, translated) * T(2); + // Vector rotated = translated + t * q.w +Cross(qv, t); + // return rotated + pivot; + // } + + // template + // constexpr static Vector QuanternionFromBasis(Vector right, Vector up, Vector forward) requires(Len == 4) { + // T m00 = right.x; + // T m01 = up.x; + // T m02 = forward.x; + + // T m10 = right.y; + // T m11 = up.y; + // T m12 = forward.y; + + // T m20 = right.z; + // T m21 = up.z; + // T m22 = forward.z; + + // T trace = m00 + m11 + m22; + + // Vector q; + + // if (trace > std::numeric_limits::epsilon()) { + // T s = std::sqrt(trace + T(1)) * T(2); + // q.w = T(0.25) * s; + // q.x = (m21 - m12) / s; + // q.y = (m02 - m20) / s; + // q.z = (m10 - m01) / s; + // } + // else if ((m00 > m11) && (m00 > m22)) { + // T s = std::sqrt(T(1) + m00 - m11 - m22) * T(2); + // q.w = (m21 - m12) / s; + // q.x = T(0.25) * s; + // q.y = (m01 + m10) / s; + // q.z = (m02 + m20) / s; + // } + // else if (m11 > m22) { + // T s = std::sqrt(T(1) + m11 - m00 - m22) * T(2); + // q.w = (m02 - m20) / s; + // q.x = (m01 + m10) / s; + // q.y = T(0.25) * s; + // q.z = (m12 + m21) / s; + // } + // else { + // T s = std::sqrt(T(1) + m22 - m00 - m11) * T(2); + // 
q.w = (m10 - m01) / s; + // q.x = (m02 + m20) / s; + // q.y = (m12 + m21) / s; + // q.z = T(0.25) * s; + // } + + // q.Normalize(); + // return q; + // } + + // constexpr static Vector QuanternionFromEuler(T roll, T pitch, T yaw) { + // T cr = std::cos(roll * 0.5); + // T sr = std::sin(roll * 0.5); + // T cp = std::cos(pitch * 0.5); + // T sp = std::sin(pitch * 0.5); + // T cy = std::cos(yaw * 0.5); + // T sy = std::sin(yaw * 0.5); + + // return Vector( + // sr * cp * cy - cr * sp * sy, + // cr * sp * cy + sr * cp * sy, + // cr * cp * sy - sr * sp * cy, + // cr * cp * cy + sr * sp * sy + // ); + // } + }; +} -// export template -// struct std::formatter> : std::formatter { -// auto format(const Crafter::VectorF16& obj, format_context& ctx) const { -// Crafter::Vector<_Float16, Len * Packing * Repeats, 0> vec = obj.template Store(); -// std::string out; -// for(std::uint32_t i = 0; i < Repeats; i++) { -// out += "{"; -// for(std::uint32_t i2 = 0; i2 < Packing; i2++) { -// out += "{"; -// for(std::uint32_t i3 = 0; i3 < Len; i3++) { -// out += std::format("{}", static_cast(vec.v[i * Packing * Len + i2 * Len + i3])); -// if (i3 + 1 < Len) out += ","; -// } -// out += "}"; -// } -// out += "}"; -// } -// return std::formatter::format(out, ctx); -// } -// }; \ No newline at end of file +export template +struct std::formatter> : std::formatter { + auto format(const Crafter::VectorF16& obj, format_context& ctx) const { + Crafter::Vector<_Float16, Len * Packing * Repeats, 0> vec = obj.template Store(); + std::string out; + for(std::uint32_t i = 0; i < Repeats; i++) { + out += "{"; + for(std::uint32_t i2 = 0; i2 < Packing; i2++) { + out += "{"; + for(std::uint32_t i3 = 0; i3 < Len; i3++) { + out += std::format("{}", static_cast(vec.v[i * Packing * Len + i2 * Len + i3])); + if (i3 + 1 < Len) out += ","; + } + out += "}"; + } + out += "}"; + } + return std::formatter::format(out, ctx); + } +}; +#endif \ No newline at end of file diff --git a/interfaces/Crafter.Math.cppm 
b/interfaces/Crafter.Math.cppm index 32ff0ea..e32e750 100644 --- a/interfaces/Crafter.Math.cppm +++ b/interfaces/Crafter.Math.cppm @@ -23,4 +23,4 @@ export import :Basic; export import :Vector; export import :MatrixRowMajor; export import :Intersection; -//export import :VectorF16; \ No newline at end of file +export import :VectorF16; \ No newline at end of file diff --git a/interfaces/main.cpp b/interfaces/main.cpp index ab32cf4..c9ae69b 100644 --- a/interfaces/main.cpp +++ b/interfaces/main.cpp @@ -32,19 +32,42 @@ int main() { - // Vector<_Float16, 8, 8> vA; - // for(std::uint32_t i = 0; i < 8; i++) { + Vector<_Float16, 1326, 32> vA; + // for(std::uint32_t i = 0; i < 2; i++) { // vA.v[i] = i; // } - // VectorF16<8, 1, 1> vfA(&vA); - // VectorF16<8, 1, 1> dot = VectorF16<8, 1, 1>::Dot(vfA, vfA, vfA, vfA, vfA, vfA, vfA, vfA, vfA, vfA, vfA, vfA, vfA, vfA, vfA, vfA); - // std::println("{}", dot); - - // Vector vB; - // for(std::uint32_t i = 0; i < 8; i++) { - // vB.v[i] = i; + // for(std::uint32_t i = 2; i < 4; i++) { + // vA.v[i] = i-2; // } - // float test = Vector::Dot(vA, vA); + // for(std::uint32_t i = 4; i < 6; i++) { + // vA.v[i] = i-4; + // } + // for(std::uint32_t i = 6; i < 8; i++) { + // vA.v[i] = i-6; + // } + for(std::uint32_t i = 0; i < 8; i++) { + vA.v[i] = i; + } + for(std::uint32_t i = 8; i < 16; i++) { + vA.v[i] = i-8; + } + for(std::uint32_t i = 16; i < 24; i++) { + vA.v[i] = i-16; + } + for(std::uint32_t i = 24; i < 32; i++) { + } + VectorF16<8, 1, 4> vfA(&vA); + std::tuple, VectorF16<8, 1, 4>, VectorF16<8, 1, 4>, VectorF16<8, 1, 4>, VectorF16<8, 1, 4>, VectorF16<8, 1, 4>, VectorF16<8, 1, 4>, VectorF16<8, 1, 4>> dot = VectorF16<8, 1, 4>::Normalize(vfA, vfA, vfA, vfA, vfA, vfA, vfA, vfA); + std::println("{}", std::get<0>(dot)); - // std::println("{}", test); + Vector vB; + for(std::uint32_t i = 0; i < 8; i++) { + vB.v[i] = i; + } + vB.Normalize(); + std::string log; + for(std::uint32_t i = 0; i < 8; i++) { + log += std::format("{} ", 
(float)vB.v[i]); + } + std::println("{{{}}}", log); } \ No newline at end of file diff --git a/project.json b/project.json index a609d90..f972172 100644 --- a/project.json +++ b/project.json @@ -8,7 +8,8 @@ "interfaces/Crafter.Math-Basic", "interfaces/Crafter.Math-MatrixRowMajor", "interfaces/Crafter.Math", - "interfaces/Crafter.Math-Intersection" + "interfaces/Crafter.Math-Intersection", + "interfaces/Crafter.Math-VectorF16" ], "implementations": [] }, @@ -27,7 +28,7 @@ "name": "test", "implementations": ["interfaces/main"], "extends": ["base"], - "debug": true + "debug": false } ] } \ No newline at end of file