/* Crafter®.Math Copyright (C) 2026 Catcrafts® catcrafts.net This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 3.0 as published by the Free Software Foundation; This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */
module;
// Global module fragment: one intrinsic header per supported target.
// NOTE(review): the header name after each #include is missing in this view of the file - confirm against the original.
#ifdef __x86_64
#include
#endif
#ifdef __wasm_simd128__
#include
#endif
#ifdef __riscv_vector
#include
#endif
export module Crafter.Math:VectorF32;
import std;
import :Common;
namespace Crafter {
#ifdef __x86_64
// x86-64 float vector built on VectorBase. Each operation selects its intrinsic family at
// compile time by comparing VectorType against __m128 / __m256; the final else uses _mm512_*.
export template struct VectorF32 : public VectorBase {
    // Lets other VectorF32 instantiations access this one's register `v`.
    template friend struct VectorF32;
    constexpr VectorF32() = default;
    // Wraps an existing SIMD register.
    constexpr VectorF32(VectorBase::VectorType v) { this->v = v; }
    // Construct by loading from memory; see the matching Load overloads.
    constexpr VectorF32(const float* vB) { Load(vB); };
    constexpr VectorF32(const _Float16* vB) { Load(vB); };
    // Broadcasts `val` into every lane of the register.
    constexpr VectorF32(float val) {
        if constexpr(std::is_same_v::VectorType, __m128>) {
            this->v = _mm_set1_ps(val);
        } else if constexpr(std::is_same_v::VectorType, __m256>) {
            this->v = _mm256_set1_ps(val);
        } else {
            this->v = _mm512_set1_ps(val);
        }
    };
    // Unaligned load of one full register of floats starting at vB.
    constexpr void Load(const float* vB) {
        if constexpr(std::is_same_v::VectorType, __m128>) {
            this->v = _mm_loadu_ps(vB);
        } else if constexpr(std::is_same_v::VectorType, __m256>) {
            this->v = _mm256_loadu_ps(vB);
        } else {
            this->v = _mm512_loadu_ps(vB);
        }
    }
    // Unaligned store of the full register to vB.
    constexpr void Store(float* vB) const {
        if constexpr(std::is_same_v::VectorType, __m128>) {
            _mm_storeu_ps(vB, this->v);
        } else if constexpr(std::is_same_v::VectorType, __m256>) {
            _mm256_storeu_ps(vB, this->v);
        } else {
            _mm512_storeu_ps(vB, this->v);
        }
    }
    // Half-precision load; the parameter list and body continue on the next source line.
    constexpr void Load(const
_Float16* vB) { #ifdef __F16C__ if constexpr (std::is_same_v::VectorType, __m128>) { this->v = _mm_cvtph_ps(_mm_loadl_epi64(reinterpret_cast(vB))); } else if constexpr (std::is_same_v::VectorType, __m256>) { this->v = _mm256_cvtph_ps(_mm_loadu_si128(reinterpret_cast(vB))); } else { this->v = _mm512_cvtph_ps(_mm256_loadu_si256(reinterpret_cast(vB))); } #else alignas(64) float tmp[Len]; for (int i = 0; i < Len; ++i) tmp[i] = static_cast(vB[i]); if constexpr (std::is_same_v::VectorType, __m128>) { this->v = _mm_load_ps(tmp); } else if constexpr (std::is_same_v::VectorType, __m256>) { this->v = _mm256_load_ps(tmp); } else { this->v = _mm512_load_ps(tmp); } #endif } constexpr void Store(_Float16* vB) const { #ifdef __F16C__ if constexpr (std::is_same_v::VectorType, __m128>) { _mm_storel_epi64(reinterpret_cast<__m128i*>(vB), _mm_cvtps_ph(this->v, _MM_FROUND_TO_NEAREST_INT)); } else if constexpr (std::is_same_v::VectorType, __m256>) { _mm_storeu_si128(reinterpret_cast<__m128i*>(vB), _mm256_cvtps_ph(this->v, _MM_FROUND_TO_NEAREST_INT)); } else { _mm256_storeu_si256(reinterpret_cast<__m256i*>(vB), _mm512_cvtps_ph(this->v, _MM_FROUND_TO_NEAREST_INT)); } #else alignas(64) float tmp[Len]; if constexpr (std::is_same_v::VectorType, __m128>) { _mm_store_ps(tmp, this->v); } else if constexpr (std::is_same_v::VectorType, __m256>) { _mm256_store_ps(tmp, this->v); } else { _mm512_store_ps(tmp, this->v); } for (int i = 0; i < Len; ++i) vB[i] = static_cast<_Float16>(tmp[i]); #endif } template constexpr std::array::AlignmentElement> Store() const { std::array::AlignmentElement> returnArray; Store(returnArray.data()); return returnArray; } template constexpr operator VectorF32() const { if constexpr (Len == BLen) { if constexpr(std::is_same_v::VectorType, __m256> && std::is_same_v::VectorType, __m128>) { return VectorF32(_mm256_castps256_ps128(this->v)); } else if constexpr(std::is_same_v::VectorType, __m512> && std::is_same_v::VectorType, __m128>) { return 
VectorF32(_mm512_castps512_ps128(this->v)); } else if constexpr(std::is_same_v::VectorType, __m512> && std::is_same_v::VectorType, __m256>) { return VectorF32(_mm512_castps512_ps256(this->v)); } else if constexpr(std::is_same_v::VectorType, __m128> && std::is_same_v::VectorType, __m256>) { return VectorF32(_mm256_castps128_ps256(this->v)); } else if constexpr(std::is_same_v::VectorType, __m128> && std::is_same_v::VectorType, __m512>) { return VectorF32(_mm512_castps128_ps512(this->v)); } else if constexpr(std::is_same_v::VectorType, __m256> && std::is_same_v::VectorType, __m512>) { return VectorF32(_mm512_castps256_ps512(this->v)); } else { return VectorF32(this->v); } } else if constexpr (BLen <= Len) { return this->template ExtractLo(); } else { if constexpr(std::is_same_v::VectorType, __m128>) { if constexpr(std::is_same_v::VectorType, __m128>) { constexpr std::array::Alignment> shuffleMask = VectorBase::template GetExtractLoMaskEpi8(); __m128i shuffleVec = _mm_loadu_si128(reinterpret_cast(shuffleMask.data())); return VectorF32(_mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(this->v), shuffleVec))); } else if constexpr(std::is_same_v::VectorType, __m256>) { constexpr std::array::AlignmentElement> permMask =VectorBase::template GetExtractLoMaskepi32(); __m256i permIdx = _mm256_loadu_si256(reinterpret_cast(permMask.data())); __m256i result = _mm256_permutexvar_epi32(permIdx, _mm_castps_si256(this->v)); return VectorF32(_mm_castsi128_ps(_mm256_castsi256_si128(result))); #ifdef __AVX512F__ } else { constexpr std::array::AlignmentElement> permMask = VectorBase::template GetExtractLoMaskEpi32(); __m512i permIdx = _mm512_loadu_epi32(permMask.data()); __m512i result = _mm512_permutexvar_epi32(permIdx, _mm512_castps_si512(this->v)); return VectorF32(_mm_castsi128_ps(_mm512_castsi512_si128(result))); #endif } } else if constexpr(std::is_same_v::VectorType, __m256>) { if constexpr(std::is_same_v::VectorType, __m128>) { constexpr std::array::AlignmentElement> permMask = 
VectorBase::template GetExtractLoMaskEpi32(); __m256i permIdx = _mm256_loadu_si256(reinterpret_cast(permMask.data())); __m256i result = _mm256_permutexvar_epi32(permIdx, _mm256_castsi128_si256(_mm_castps_si128(this->v))); return VectorF32(_mm256_castsi256_ps(result)); } else if constexpr(std::is_same_v::VectorType, __m256>) { constexpr std::array::AlignmentElement> permMask = VectorBase::template GetExtractLoMaskEpi32(); __m256i permIdx = _mm256_loadu_si256(reinterpret_cast(permMask.data())); __m256i result = _mm256_permutexvar_epi32(permIdx, _mm256_castps_si256(this->v)); return VectorF32(_mm256_castsi256_ps(result)); #ifdef __AVX512F__ } else { constexpr std::array::AlignmentElement> permMask = VectorBase::template GetExtractLoMaskEpi32(); __m256i permIdx = _mm512_loadu_epi32(permMask.data()); __m256i result = _mm512_permutexvar_epi32(permIdx, _mm512_castsi512_si256(_mm512_castps_si512(this->v))); return VectorF32(_mm256_castsi256_ps(result)); #endif } #ifdef __AVX512F__ } else { if constexpr(std::is_same_v::VectorType, __m128>) { constexpr std::array::AlignmentElement> permMask = VectorBase::template GetExtractLoMaskEpi32(); __m512i permIdx = _mm512_loadu_epi32(permMask.data()); __m512i result = _mm512_permutexvar_epi32(permIdx, _mm512_castsi128_si512(_mm_castps_si128(this->v))); return VectorF32(_mm512_castsi512_ps(result)); } else if constexpr(std::is_same_v::VectorType, __m256>) { constexpr std::array::AlignmentElement> permMask = VectorBase::template GetExtractLoMaskEpi32(); __m512i permIdx = _mm512_loadu_epi32(permMask.data()); __m512i result = _mm512_permutexvar_epi32(permIdx, _mm512_castsi256_si512(_mm256_castps_si256(this->v))); return VectorF32(_mm512_castsi512_ps(result)); } else { constexpr std::array::AlignmentElement> permMask = VectorBase::template GetExtractLoMaskEpi32(); __m512i permIdx = _mm512_loadu_epi32(permMask.data()); __m512i result = _mm512_permutexvar_epi32(permIdx, _mm512_castps_si512(this->v)); return 
VectorF32(_mm512_castsi512_ps(result));
                }
            #endif
            }
        }
    }
    // Lane-wise addition; returns a new vector.
    constexpr VectorF32 operator+(VectorF32 b) const {
        if constexpr(std::is_same_v::VectorType, __m128>) {
            return VectorF32(_mm_add_ps(this->v, b.v));
        } else if constexpr(std::is_same_v::VectorType, __m256>) {
            return VectorF32(_mm256_add_ps(this->v, b.v));
        } else {
            return VectorF32(_mm512_add_ps(this->v, b.v));
        }
    }
    // Lane-wise subtraction (this - b).
    constexpr VectorF32 operator-(VectorF32 b) const {
        if constexpr(std::is_same_v::VectorType, __m128>) {
            return VectorF32(_mm_sub_ps(this->v, b.v));
        } else if constexpr(std::is_same_v::VectorType, __m256>) {
            return VectorF32(_mm256_sub_ps(this->v, b.v));
        } else {
            return VectorF32(_mm512_sub_ps(this->v, b.v));
        }
    }
    // Lane-wise multiplication.
    constexpr VectorF32 operator*(VectorF32 b) const {
        if constexpr(std::is_same_v::VectorType, __m128>) {
            return VectorF32(_mm_mul_ps(this->v, b.v));
        } else if constexpr(std::is_same_v::VectorType, __m256>) {
            return VectorF32(_mm256_mul_ps(this->v, b.v));
        } else {
            return VectorF32(_mm512_mul_ps(this->v, b.v));
        }
    }
    // Lane-wise division (this / b).
    constexpr VectorF32 operator/(VectorF32 b) const {
        if constexpr(std::is_same_v::VectorType, __m128>) {
            return VectorF32(_mm_div_ps(this->v, b.v));
        } else if constexpr(std::is_same_v::VectorType, __m256>) {
            return VectorF32(_mm256_div_ps(this->v, b.v));
        } else {
            return VectorF32(_mm512_div_ps(this->v, b.v));
        }
    }
    // In-place lane-wise addition; the compound operators return void.
    constexpr void operator+=(VectorF32 b) {
        if constexpr(std::is_same_v::VectorType, __m128>) {
            this->v = _mm_add_ps(this->v, b.v);
        } else if constexpr(std::is_same_v::VectorType, __m256>) {
            this->v = _mm256_add_ps(this->v, b.v);
        } else {
            this->v = _mm512_add_ps(this->v, b.v);
        }
    }
    // In-place lane-wise subtraction.
    constexpr void operator-=(VectorF32 b) {
        if constexpr(std::is_same_v::VectorType, __m128>) {
            this->v = _mm_sub_ps(this->v, b.v);
        } else if constexpr(std::is_same_v::VectorType, __m256>) {
            this->v = _mm256_sub_ps(this->v, b.v);
        } else {
            this->v = _mm512_sub_ps(this->v, b.v);
        }
    }
    // In-place lane-wise multiplication; the remaining branches continue on the next source line.
    constexpr void operator*=(VectorF32 b) {
        if constexpr(std::is_same_v::VectorType, __m128>) {
            this->v = _mm_mul_ps(this->v, b.v);
        }
else if constexpr(std::is_same_v::VectorType, __m256>) { this->v = _mm256_mul_ps(this->v, b.v); } else { this->v = _mm512_mul_ps(this->v, b.v); } } constexpr void operator/=(VectorF32 b) { if constexpr(std::is_same_v::VectorType, __m128>) { this->v = _mm_div_ps(this->v, b.v); } else if constexpr(std::is_same_v::VectorType, __m256>) { this->v = _mm256_div_ps(this->v, b.v); } else { this->v = _mm512_div_ps(this->v, b.v); } } constexpr VectorF32 operator+(float b) { VectorF32 vB(b); return *this + vB; } constexpr VectorF32 operator-(float b) { VectorF32 vB(b); return *this - vB; } constexpr VectorF32 operator*(float b) { VectorF32 vB(b); return *this * vB; } constexpr VectorF32 operator/(float b) { VectorF32 vB(b); return *this / vB; } constexpr void operator+=(float b) { VectorF32 vB(b); *this += vB; } constexpr void operator-=(float b) { VectorF32 vB(b); *this -= vB; } constexpr void operator*=(float b) { VectorF32 vB(b); *this *= vB; } constexpr void operator/=(float b) { VectorF32 vB(b); *this /= vB; } constexpr VectorF32 operator-(){ return Negate::GetAllTrue()>(); } constexpr bool operator==(VectorF32 b) const { if constexpr(std::is_same_v::VectorType, __m128>) { #ifdef __AVX512VL__ return _mm_cmp_ps_mask(this->v, b.v, _CMP_EQ_OQ) == 0xF; #else return _mm_movemask_ps(_mm_cmpeq_ps(this->v, b.v)) == 0xF; #endif } else if constexpr(std::is_same_v::VectorType, __m256>) { #ifdef __AVX512VL__ return _mm256_cmp_ps_mask(this->v, b.v, _CMP_EQ_OQ) == 0xFF; #else return _mm256_movemask_ps(_mm256_cmp_ps(this->v, b.v, _CMP_EQ_OQ)) == 0xFF; #endif } else { return _mm512_cmp_ps_mask(this->v, b.v, _CMP_EQ_OQ) == 0xFFFF; } } constexpr bool operator!=(VectorF32 b) const { return !(*this == b); } template constexpr VectorF32 ExtractLo() const { if constexpr(Packing > 1) { if constexpr(std::is_same_v::VectorType, __m128>) { constexpr std::array::Alignment> shuffleMask = VectorBase::template GetExtractLoMaskEpi8(); __m128i shuffleVec = _mm_loadu_epi8(shuffleMask.data()); return 
VectorF32(_mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(this->v), shuffleVec))); } else if constexpr(std::is_same_v::VectorType, __m256>) { constexpr std::array::AlignmentElement> permMask = VectorBase::template GetExtractLoMaskEpi32(); __m256i permIdx = _mm256_loadu_si256(reinterpret_cast(permMask.data())); __m256i result = _mm256_permutexvar_epi32(permIdx, _mm256_castps_si256(this->v)); if constexpr(std::is_same_v::VectorType, __m128>) { return VectorF32(_mm256_castps256_ps128(_mm256_castsi256_ps(result))); } else { return VectorF32(_mm256_castsi256_ps(result)); } } else { constexpr std::array::AlignmentElement> permMask = VectorBase::template GetExtractLoMaskEpi32(); __m512i permIdx = _mm512_loadu_epi32(permMask.data()); __m512i result = _mm512_permutexvar_epi32(permIdx, _mm512_castps_si512(this->v)); if constexpr(std::is_same_v::VectorType, __m128>) { return VectorF32(_mm512_castps512_ps128(_mm512_castsi512_ps(result))); } else if constexpr(std::is_same_v::VectorType, __m256>) { return VectorF32(_mm512_castps512_ps256(_mm512_castsi512_ps(result))); } else { return VectorF32(_mm512_castsi512_ps(result)); } } } else { if constexpr(std::is_same_v::VectorType, __m256> && std::is_same_v::VectorType, __m128>) { return VectorF32(_mm256_castps256_ps128(this->v)); #ifdef __AVX512F__ } else if constexpr(std::is_same_v::VectorType, __m512> && std::is_same_v::VectorType, __m128>) { return VectorF32(_mm512_castps512_ps128(this->v)); } else if constexpr(std::is_same_v::VectorType, __m512> && std::is_same_v::VectorType, __m256>) { return VectorF32(_mm512_castps512_ps256(this->v)); #endif } else { return VectorF32(this->v); } } } constexpr VectorF32 Cos() { if constexpr (std::is_same_v::VectorType, __m128>) { return VectorF32(VectorBase::cos_f32x4(this->v)); } else if constexpr (std::is_same_v::VectorType, __m256>) { return VectorF32(VectorBase::cos_f32x8(this->v)); #ifdef __AVX512F__ } else { return VectorF32(VectorBase::cos_f32x16(this->v)); #endif } } constexpr 
VectorF32 Sin() { if constexpr (std::is_same_v::VectorType, __m128>) { return VectorF32(VectorBase::sin_f32x4(this->v)); } else if constexpr (std::is_same_v::VectorType, __m256>) { return VectorF32(VectorBase::sin_f32x8(this->v)); #ifdef __AVX512F__ } else { return VectorF32(VectorBase::sin_f32x16(this->v)); #endif } } std::tuple, VectorF32> SinCos() { if constexpr (std::is_same_v::VectorType, __m128>) { __m128 s, c; VectorBase::sincos_f32x4(this->v, s, c); return { VectorF32(s), VectorF32(c) }; } else if constexpr (std::is_same_v::VectorType, __m256>) { __m256 s, c; VectorBase::sincos_f32x8(this->v, s, c); return { VectorF32(s), VectorF32(c) }; #ifdef __AVX512F__ } else { __m512 s, c; VectorBase::sincos_f32x16(this->v, s, c); return { VectorF32(s), VectorF32(c) }; #endif } } template values> constexpr VectorF32 Negate() const { std::array::AlignmentElement> mask = VectorBase::template GetNegateMask(); if constexpr(std::is_same_v::VectorType, __m128>) { return VectorF32(_mm_castsi128_ps(_mm_xor_si128(_mm_castps_si128(this->v), _mm_loadu_si128(reinterpret_cast<__m128i*>(mask.data()))))); } else if constexpr(std::is_same_v::VectorType, __m256>) { return VectorF32(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(this->v), _mm256_loadu_si256(reinterpret_cast<__m256i*>(mask.data()))))); #ifdef __AVX512F__ } else { return VectorF32(_mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(this->v), _mm512_loadu_epi32(mask.data())))); #endif } } static constexpr VectorF32 MulitplyAdd(VectorF32 a, VectorF32 b, VectorF32 add) { if constexpr(std::is_same_v::VectorType, __m128>) { return VectorF32(_mm_fmadd_ps(a.v, b.v, add.v)); } else if constexpr(std::is_same_v::VectorType, __m256>) { return VectorF32(_mm256_fmadd_ps(a.v, b.v, add.v)); #ifdef __AVX512F__ } else { return VectorF32(_mm512_fmadd_ps(a.v, b.v, add.v)); #endif } } static constexpr VectorF32 MulitplySub(VectorF32 a, VectorF32 b, VectorF32 sub) { if constexpr(std::is_same_v::VectorType, __m128>) { return 
VectorF32(_mm_fmsub_ps(a.v, b.v, sub.v)); } else if constexpr(std::is_same_v::VectorType, __m256>) { return VectorF32(_mm256_fmsub_ps(a.v, b.v, sub.v)); #ifdef __AVX512F__ } else { return VectorF32(_mm512_fmsub_ps(a.v, b.v, sub.v)); #endif } } constexpr static VectorF32 Cross(VectorF32 a, VectorF32 b) requires(Len == 3) { VectorF32 row1 = a.template Shuffle<{{1,2,0}}>(); VectorF32 row4 = b.template Shuffle<{{1,2,0}}>(); VectorF32 row3 = a.template Shuffle<{{2,0,1}}>(); VectorF32 row2 = b.template Shuffle<{{2,0,1}}>(); VectorF32 result = row3 * row4; return VectorF32::MulitplySub(row1, row2, result); } template ShuffleValues> constexpr VectorF32 Shuffle() { if constexpr(VectorBase::template CheckEpi32Shuffle()) { constexpr std::uint8_t imm = VectorBase::template GetShuffleMaskEpi32(); if constexpr(std::is_same_v::VectorType, __m128>) { return VectorF32(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(this->v), imm))); } else if constexpr(std::is_same_v::VectorType, __m256>) { return VectorF32(_mm256_castsi256_ps(_mm256_shuffle_epi32(_mm256_castps_si256(this->v), imm))); #ifdef __AVX512F__ } else { return VectorF32(_mm512_castsi512_ps(_mm512_shuffle_epi32(_mm512_castps_si512(this->v), imm))); #endif } } else if constexpr(VectorBase::template CheckEpi8Shuffle()) { constexpr std::array::Alignment> shuffleMask = VectorBase::template GetShuffleMaskEpi8(); if constexpr(std::is_same_v::VectorType, __m128>) { __m128i shuffleVec = _mm_loadu_epi8(shuffleMask.data()); return VectorF32(_mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(this->v), shuffleVec))); } else if constexpr(std::is_same_v::VectorType, __m256>) { #ifdef __AVX512BW__ __m256i shuffleVec = _mm256_loadu_si256(reinterpret_cast(shuffleMask.data())); return VectorF32(_mm256_castsi256_ps( _mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castps_si256(this->v)),_mm512_castsi256_si512(shuffleVec))))); #else constexpr std::array::AlignmentElement> permMask = VectorBase::template 
GetPermuteMaskEpi32(); __m256i permIdx = _mm256_loadu_si256(reinterpret_cast(permMask.data())); return VectorF32(_mm256_castsi256_ps(_mm256_permutevar8x32_epi32(_mm256_castps_si256(this->v), permIdx))); #endif #ifdef __AVX512F__ } else { __m512i shuffleVec = _mm512_loadu_si512(reinterpret_cast(shuffleMask.data())); return VectorF32(_mm512_castsi512_ps(_mm512_shuffle_epi8(_mm512_castps_si512(this->v), shuffleVec))); #endif } } else { if constexpr(std::is_same_v::VectorType, __m128>) { constexpr std::array::Alignment> shuffleMask = VectorBase::template GetShuffleMaskEpi8(); __m128i shuffleVec = _mm_loadu_epi8(shuffleMask.data()); return VectorF32(_mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(this->v), shuffleVec))); } else if constexpr(std::is_same_v::VectorType, __m256>) { constexpr std::array::AlignmentElement> permMask = VectorBase::template GetPermuteMaskEpi32(); __m256i permIdx = _mm256_loadu_si256(reinterpret_cast(permMask.data())); return VectorF32(_mm256_castsi256_ps(_mm256_permutevar8x32_epi32(_mm256_castps_si256(this->v), permIdx))); #ifdef __AVX512F__ } else { constexpr std::array::AlignmentElement> permMask = VectorBase::template GetPermuteMaskEpi32(); __m512i permIdx = _mm512_loadu_epi32(permMask.data()); return VectorF32(_mm512_castsi512_ps(_mm512_permutexvar_epi32(permIdx, _mm512_castps_si512(this->v)))); #endif } } } // Public variadic surface — one name per op, arity locked to BatchSize. // The Pack helpers below carry the SIMD bodies and the per-(Len,Packing) // requires clauses; this wrapper just forwards once arity matches. template requires ((std::is_same_v> && ...) && (1 + sizeof...(Rest) == VectorBase::BatchSize)) constexpr static auto Normalize(VectorF32 first, Rest... rest) { return NormalizePack(first, rest...); } template requires ((std::is_same_v> && ...) && (1 + sizeof...(Rest) == VectorBase::BatchSize)) constexpr static auto Length(VectorF32 first, Rest... 
rest) { return LengthPack(first, rest...); }
    // Squared length of a batch; forwards to the LengthSqPack overload that matches (Len, Packing).
    template requires ((std::is_same_v> && ...) && (1 + sizeof...(Rest) == VectorBase::BatchSize))
    constexpr static auto LengthSq(VectorF32 first, Rest... rest) { return LengthSqPack(first, rest...); }
    // Batched dot product; takes twice BatchSize vectors and forwards them to DotPack.
    template requires ((std::is_same_v> && ...) && (1 + sizeof...(Rest) == 2 * VectorBase::BatchSize))
    constexpr static auto Dot(VectorF32 first, Rest... rest) { return DotPack(first, rest...); }
    private:
    // Normalizes four Len == 4 vectors that fill the register: computes the lengths,
    // takes 1 / length, broadcasts each reciprocal across its vector's lanes and multiplies.
    // NOTE(review): LengthNoShuffle receives (A, C, B, D), not (A, B, C, D) - presumably to
    // match its output lane order; confirm against its definition.
    constexpr static std::array, VectorBase::BatchSize> NormalizePack(
        VectorF32 A,
        VectorF32 B,
        VectorF32 C,
        VectorF32 D
    ) requires(Len == 4 && Packing*Len == VectorBase::AlignmentElement) {
        if constexpr(std::is_same_v::VectorType, __m128>) {
            VectorF32<1, 4> lenght = LengthNoShuffle(A, C, B, D);
            constexpr float oneArr[] {1, 1, 1, 1};
            __m128 one = _mm_loadu_ps(oneArr);
            // Reciprocal lengths, one per lane.
            VectorF32<4, 1> fLenght(_mm_div_ps(one, lenght.v));
            // Broadcast lane k to all four lanes for the k-th input.
            VectorF32<4, 1> fLenghtA = fLenght.template Shuffle<{{0,0,0,0}}>();
            VectorF32<4, 1> fLenghtB = fLenght.template Shuffle<{{1,1,1,1}}>();
            VectorF32<4, 1> fLenghtC = fLenght.template Shuffle<{{2,2,2,2}}>();
            VectorF32<4, 1> fLenghtD = fLenght.template Shuffle<{{3,3,3,3}}>();
            return {
                _mm_mul_ps(A.v, fLenghtA.v),
                _mm_mul_ps(B.v, fLenghtB.v),
                _mm_mul_ps(C.v, fLenghtC.v),
                _mm_mul_ps(D.v, fLenghtD.v)
            };
        } else if constexpr(std::is_same_v::VectorType, __m256>) {
            VectorF32<1, 8> lenght = LengthNoShuffle(A, C, B, D);
            constexpr float oneArr[] {1, 1, 1, 1, 1, 1, 1, 1};
            __m256 one = _mm256_loadu_ps(oneArr);
            VectorF32<8, 1> fLenght(_mm256_div_ps(one, lenght.v));
            // Same broadcast, repeated for the second group of four lanes.
            VectorF32<8, 1> fLenghtA = fLenght.template Shuffle<{{0,0,0,0,4,4,4,4}}>();
            VectorF32<8, 1> fLenghtB = fLenght.template Shuffle<{{1,1,1,1,5,5,5,5}}>();
            VectorF32<8, 1> fLenghtC = fLenght.template Shuffle<{{2,2,2,2,6,6,6,6}}>();
            VectorF32<8, 1> fLenghtD = fLenght.template Shuffle<{{3,3,3,3,7,7,7,7}}>();
            return {
                _mm256_mul_ps(A.v, fLenghtA.v),
                _mm256_mul_ps(B.v, fLenghtB.v),
                _mm256_mul_ps(C.v, fLenghtC.v),
                _mm256_mul_ps(D.v, fLenghtD.v)
            };
        #if defined(__AVX512F__)
        } else {
VectorF32<1, 16> lenght = LengthNoShuffle(A, C, B, D); constexpr float oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; __m512 one = _mm512_loadu_ps(oneArr); VectorF32<16, 1> fLenght(_mm512_div_ps(one, lenght.v)); VectorF32<16, 1> fLenght2(lenght.v); VectorF32<16, 1> fLenghtA = fLenght.template Shuffle<{{0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12}}>(); VectorF32<16, 1> fLenghtB = fLenght.template Shuffle<{{1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13}}>(); VectorF32<16, 1> fLenghtC = fLenght.template Shuffle<{{2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14}}>(); VectorF32<16, 1> fLenghtD = fLenght.template Shuffle<{{3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15}}>(); return { VectorF32(_mm512_mul_ps(A.v, fLenghtA.v)), VectorF32(_mm512_mul_ps(B.v, fLenghtB.v)), VectorF32(_mm512_mul_ps(C.v, fLenghtC.v)), VectorF32(_mm512_mul_ps(D.v, fLenghtD.v)), }; #endif } } constexpr static std::array, VectorBase::BatchSize> NormalizePack( VectorF32 A, VectorF32 B, VectorF32 C, VectorF32 D ) requires(Len == 3 && Packing == 1) { VectorF32<1, 4> lenght = Length(A, B, C, D); constexpr float oneArr[] {1, 1, 1, 1}; __m128 one = _mm_loadu_ps(oneArr); VectorF32<4, 1> fLenght(_mm_div_ps(one, lenght.v)); VectorF32<4, 1> fLenghtA = fLenght.template Shuffle<{{0,0,0,0}}>(); VectorF32<4, 1> fLenghtB = fLenght.template Shuffle<{{1,1,1,1}}>(); VectorF32<4, 1> fLenghtC = fLenght.template Shuffle<{{2,2,2,2}}>(); VectorF32<4, 1> fLenghtD = fLenght.template Shuffle<{{3,3,3,3}}>(); return { _mm_mul_ps(A.v, fLenghtA.v), _mm_mul_ps(B.v, fLenghtB.v), _mm_mul_ps(C.v, fLenghtC.v), _mm_mul_ps(D.v, fLenghtD.v) }; } constexpr static std::array, VectorBase::BatchSize> NormalizePack( VectorF32 A, VectorF32 B, VectorF32 C, VectorF32 D ) requires(Len == 3 && Packing == 2) { VectorF32<1, 8> lenght = Length(A, B, C, D); constexpr float oneArr[] {1, 1, 1, 1, 1, 1, 1, 1}; __m256 one = _mm256_loadu_ps(oneArr); VectorF32<8, 1> fLenght(_mm256_div_ps(one, lenght.v)); VectorF32<8, 1> fLenghtA = fLenght.template Shuffle<{{0,0,0, 1,1,1}}>(); 
VectorF32<8, 1> fLenghtB = fLenght.template Shuffle<{{2,2,2, 3,3,3}}>();
        VectorF32<8, 1> fLenghtC = fLenght.template Shuffle<{{4,4,4, 5,5,5}}>();
        VectorF32<8, 1> fLenghtD = fLenght.template Shuffle<{{6,6,6, 7,7,7}}>();
        return {
            _mm256_mul_ps(A.v, fLenghtA.v),
            _mm256_mul_ps(B.v, fLenghtB.v),
            _mm256_mul_ps(C.v, fLenghtC.v),
            _mm256_mul_ps(D.v, fLenghtD.v)
        };
    }
    #ifdef __AVX512F__
    // Normalizes three registers that each pack five 3-component vectors (512-bit registers).
    // The 15 reciprocal lengths are each repeated three times, once per component.
    constexpr static std::array, VectorBase::BatchSize> NormalizePack(
        VectorF32 A,
        VectorF32 B,
        VectorF32 C
    ) requires(Len == 3 && Packing == 5) {
        VectorF32<1, 15> lenght = Length(A, B, C);
        constexpr float oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
        __m512 one = _mm512_loadu_ps(oneArr);
        VectorF32<15, 1> fLenght(_mm512_div_ps(one, lenght.v));
        VectorF32<15, 1> fLenghtA = fLenght.template Shuffle<{{0,0,0, 1,1,1, 2,2,2, 3,3,3, 4,4,4}}>();
        VectorF32<15, 1> fLenghtB = fLenght.template Shuffle<{{5,5,5, 6,6,6, 7,7,7, 8,8,8, 9,9,9}}>();
        VectorF32<15, 1> fLenghtC = fLenght.template Shuffle<{{10,10,10, 11,11,11, 12,12,12, 13,13,13, 14,14,14}}>();
        return {
            _mm512_mul_ps(A.v, fLenghtA.v),
            _mm512_mul_ps(B.v, fLenghtB.v),
            _mm512_mul_ps(C.v, fLenghtC.v),
        };
    }
    #endif
    // Normalizes two registers of packed 2-component vectors that fill the register.
    // Each reciprocal length is duplicated across the two components of its vector.
    constexpr static std::array, VectorBase::BatchSize> NormalizePack(
        VectorF32 A,
        VectorF32 B
    ) requires(Len == 2 && Packing*Len == VectorBase::AlignmentElement) {
        if constexpr(std::is_same_v::VectorType, __m128>) {
            VectorF32<1, 4> lenght = LengthNoShuffle(A, B);
            constexpr float oneArr[] {1, 1, 1, 1};
            __m128 one = _mm_loadu_ps(oneArr);
            VectorF32<4, 1> fLenght(_mm_div_ps(one, lenght.v));
            VectorF32<4, 1> fLenghtA = fLenght.template Shuffle<{{0,0,1,1}}>();
            VectorF32<4, 1> fLenghtB = fLenght.template Shuffle<{{2,2,3,3}}>();
            return {
                _mm_mul_ps(A.v, fLenghtA.v),
                _mm_mul_ps(B.v, fLenghtB.v),
            };
        } else if constexpr(std::is_same_v::VectorType, __m256>) {
            VectorF32<1, 8> lenght = LengthNoShuffle(A, B);
            constexpr float oneArr[] {1, 1, 1, 1, 1, 1, 1, 1};
            __m256 one = _mm256_loadu_ps(oneArr);
            VectorF32<8, 1> fLenght(_mm256_div_ps(one, lenght.v));
VectorF32<8, 1> fLenghtA = fLenght.template Shuffle<{{0,0,1,1,4,4,5,5}}>();
            VectorF32<8, 1> fLenghtB = fLenght.template Shuffle<{{2,2,3,3,6,6,7,7}}>();
            return {
                _mm256_mul_ps(A.v, fLenghtA.v),
                _mm256_mul_ps(B.v, fLenghtB.v),
            };
        #ifdef __AVX512F__
        } else {
            VectorF32<1, 16> lenght = LengthNoShuffle(A, B);
            constexpr float oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
            __m512 one = _mm512_loadu_ps(oneArr);
            VectorF32<16, 1> fLenght(_mm512_div_ps(one, lenght.v));
            VectorF32<16, 1> fLenghtA = fLenght.template Shuffle<{{0,0,1,1,4,4,5,5,8,8,9,9,12,12,13,13}}>();
            VectorF32<16, 1> fLenghtB = fLenght.template Shuffle<{{2,2,3,3,6,6,7,7,10,10,11,11,14,14,15,15}}>();
            return {
                _mm512_mul_ps(A.v, fLenghtA.v),
                _mm512_mul_ps(B.v, fLenghtB.v),
            };
        #endif
        }
    }
    // Lengths of four Len == 4 registers: lane-wise square root of LengthSqPack's result.
    constexpr static VectorF32<1, Packing*4> LengthPack(
        VectorF32 A,
        VectorF32 B,
        VectorF32 C,
        VectorF32 D
    ) requires(Len == 4 && Packing*Len == VectorBase::AlignmentElement) {
        VectorF32<1, Packing*4> lenghtSq = LengthSqPack(A, B, C, D);
        if constexpr(std::is_same_v::VectorType, __m128>) {
            return VectorF32<1, Packing*4>(_mm_sqrt_ps(lenghtSq.v));
        } else if constexpr(std::is_same_v::VectorType, __m256>) {
            return VectorF32<1, Packing*4>(_mm256_sqrt_ps(lenghtSq.v));
        } else {
            return VectorF32<1, Packing*4>(_mm512_sqrt_ps(lenghtSq.v));
        }
    }
    // Lengths of four unpacked 3-component vectors.
    constexpr static VectorF32<1, 4> LengthPack(
        VectorF32 A,
        VectorF32 B,
        VectorF32 C,
        VectorF32 D
    ) requires(Len == 3 && Packing == 1) {
        VectorF32<1, 4> lenghtSq = LengthSqPack(A, B, C, D);
        return VectorF32<1, 4>(_mm_sqrt_ps(lenghtSq.v));
    }
    // Lengths of eight 3-component vectors packed two per register.
    constexpr static VectorF32<1, 8> LengthPack(
        VectorF32 A,
        VectorF32 B,
        VectorF32 C,
        VectorF32 D
    ) requires(Len == 3 && Packing == 2) {
        VectorF32<1, 8> lenghtSq = LengthSqPack(A, B, C, D);
        // Packing*4 equals 8 under this overload's requires clause.
        return VectorF32<1, Packing*4>(_mm256_sqrt_ps(lenghtSq.v));
    }
    #ifdef __AVX512F__
    // Lengths of fifteen 3-component vectors packed five per register.
    constexpr static VectorF32<1, 15> LengthPack(
        VectorF32 A,
        VectorF32 B,
        VectorF32 C
    ) requires(Len == 3 && Packing == 5) {
        VectorF32<1, 15> lenghtSq = LengthSqPack(A, B, C);
        return VectorF32<1,
15>(_mm512_sqrt_ps(lenghtSq.v));
    }
    #endif
    // Lengths of two registers of packed 2-component vectors: square root of LengthSqPack.
    constexpr static VectorF32<1, Packing*2> LengthPack(
        VectorF32 A,
        VectorF32 C
    ) requires(Len == 2 && Packing*Len == VectorBase::AlignmentElement) {
        VectorF32<1, Packing*2> lenghtSq = LengthSqPack(A, C);
        if constexpr(std::is_same_v::VectorType, __m128>) {
            return VectorF32<1, Packing*2>(_mm_sqrt_ps(lenghtSq.v));
        } else if constexpr(std::is_same_v::VectorType, __m256>) {
            return VectorF32<1, Packing*2>(_mm256_sqrt_ps(lenghtSq.v));
        #ifdef __AVX512F__
        } else {
            return VectorF32<1, Packing*2>(_mm512_sqrt_ps(lenghtSq.v));
        #endif
        }
    }
    // Squared lengths: every LengthSqPack overload is the dot product of each input with itself.
    constexpr static VectorF32<1, Packing*4> LengthSqPack(
        VectorF32 A,
        VectorF32 B,
        VectorF32 C,
        VectorF32 D
    ) requires(Len == 4 && Packing*Len == VectorBase::AlignmentElement) {
        return DotPack(A, A, B, B, C, C, D, D);
    }
    constexpr static VectorF32<1, 4> LengthSqPack(
        VectorF32 A,
        VectorF32 B,
        VectorF32 C,
        VectorF32 D
    ) requires(Len == 3 && Packing == 1) {
        return DotPack(A, A, B, B, C, C, D, D);
    }
    constexpr static VectorF32<1, 8> LengthSqPack(
        VectorF32 A,
        VectorF32 B,
        VectorF32 C,
        VectorF32 D
    ) requires(Len == 3 && Packing == 2) {
        return DotPack(A, A, B, B, C, C, D, D);
    }
    #ifdef __AVX512F__
    constexpr static VectorF32<1, 15> LengthSqPack(
        VectorF32 A,
        VectorF32 B,
        VectorF32 C
    ) requires(Len == 3 && Packing == 5) {
        return DotPack(A, A, B, B, C, C);
    }
    #endif
    constexpr static VectorF32<1, Packing*2> LengthSqPack(
        VectorF32 A,
        VectorF32 C
    ) requires(Len == 2 && Packing*Len == VectorBase::AlignmentElement) {
        return DotPack(A, A, C, C);
    }
    // DotPack: the parameter list continues past the end of this view.
    constexpr static VectorF32<1, Packing*4> DotPack(
        VectorF32 A0, VectorF32 A1,
        VectorF32 B0, VectorF32 B1,
        VectorF32 C0, VectorF32