/* Crafter®.Math Copyright (C) 2026 Catcrafts® catcrafts.net This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 3.0 as published by the Free Software Foundation; This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ module; #ifdef __x86_64 #include #endif #ifdef __wasm_simd128__ #include #endif #ifdef __riscv_vector #include #endif export module Crafter.Math:VectorF32; import std; import :Common; namespace Crafter { #ifdef __x86_64 export template struct VectorF32 : public VectorBase { template friend struct VectorF32; constexpr VectorF32() = default; constexpr VectorF32(VectorBase::VectorType v) { this->v = v; } constexpr VectorF32(const float* vB) { Load(vB); }; constexpr VectorF32(const _Float16* vB) { Load(vB); }; constexpr VectorF32(float val) { if constexpr(std::is_same_v::VectorType, __m128>) { this->v = _mm_set1_ps(val); } else if constexpr(std::is_same_v::VectorType, __m256>) { this->v = _mm256_set1_ps(val); } else { this->v = _mm512_set1_ps(val); } }; constexpr void Load(const float* vB) { if constexpr(std::is_same_v::VectorType, __m128>) { this->v = _mm_loadu_ps(vB); } else if constexpr(std::is_same_v::VectorType, __m256>) { this->v = _mm256_loadu_ps(vB); } else { this->v = _mm512_loadu_ps(vB); } } constexpr void Store(float* vB) const { if constexpr(std::is_same_v::VectorType, __m128>) { _mm_storeu_ps(vB, this->v); } else if constexpr(std::is_same_v::VectorType, __m256>) { _mm256_storeu_ps(vB, this->v); } else { _mm512_storeu_ps(vB, this->v); } } constexpr void Load(const 
_Float16* vB) { #ifdef __F16C__ if constexpr (std::is_same_v::VectorType, __m128>) { this->v = _mm_cvtph_ps(_mm_loadl_epi64(reinterpret_cast(vB))); } else if constexpr (std::is_same_v::VectorType, __m256>) { this->v = _mm256_cvtph_ps(_mm_loadu_si128(reinterpret_cast(vB))); } else { this->v = _mm512_cvtph_ps(_mm256_loadu_si256(reinterpret_cast(vB))); } #else alignas(64) float tmp[Len]; for (int i = 0; i < Len; ++i) tmp[i] = static_cast(vB[i]); if constexpr (std::is_same_v::VectorType, __m128>) { this->v = _mm_load_ps(tmp); } else if constexpr (std::is_same_v::VectorType, __m256>) { this->v = _mm256_load_ps(tmp); } else { this->v = _mm512_load_ps(tmp); } #endif } constexpr void Store(_Float16* vB) const { #ifdef __F16C__ if constexpr (std::is_same_v::VectorType, __m128>) { _mm_storel_epi64(reinterpret_cast<__m128i*>(vB), _mm_cvtps_ph(this->v, _MM_FROUND_TO_NEAREST_INT)); } else if constexpr (std::is_same_v::VectorType, __m256>) { _mm_storeu_si128(reinterpret_cast<__m128i*>(vB), _mm256_cvtps_ph(this->v, _MM_FROUND_TO_NEAREST_INT)); } else { _mm256_storeu_si256(reinterpret_cast<__m256i*>(vB), _mm512_cvtps_ph(this->v, _MM_FROUND_TO_NEAREST_INT)); } #else alignas(64) float tmp[Len]; if constexpr (std::is_same_v::VectorType, __m128>) { _mm_store_ps(tmp, this->v); } else if constexpr (std::is_same_v::VectorType, __m256>) { _mm256_store_ps(tmp, this->v); } else { _mm512_store_ps(tmp, this->v); } for (int i = 0; i < Len; ++i) vB[i] = static_cast<_Float16>(tmp[i]); #endif } template constexpr std::array::AlignmentElement> Store() const { std::array::AlignmentElement> returnArray; Store(returnArray.data()); return returnArray; } template constexpr operator VectorF32() const { if constexpr (Len == BLen) { if constexpr(std::is_same_v::VectorType, __m256> && std::is_same_v::VectorType, __m128>) { return VectorF32(_mm256_castps256_ps128(this->v)); } else if constexpr(std::is_same_v::VectorType, __m512> && std::is_same_v::VectorType, __m128>) { return 
VectorF32(_mm512_castps512_ps128(this->v)); } else if constexpr(std::is_same_v::VectorType, __m512> && std::is_same_v::VectorType, __m256>) { return VectorF32(_mm512_castps512_ps256(this->v)); } else if constexpr(std::is_same_v::VectorType, __m128> && std::is_same_v::VectorType, __m256>) { return VectorF32(_mm256_castps128_ps256(this->v)); } else if constexpr(std::is_same_v::VectorType, __m128> && std::is_same_v::VectorType, __m512>) { return VectorF32(_mm512_castps128_ps512(this->v)); } else if constexpr(std::is_same_v::VectorType, __m256> && std::is_same_v::VectorType, __m512>) { return VectorF32(_mm512_castps256_ps512(this->v)); } else { return VectorF32(this->v); } } else if constexpr (BLen <= Len) { return this->template ExtractLo(); } else { if constexpr(std::is_same_v::VectorType, __m128>) { if constexpr(std::is_same_v::VectorType, __m128>) { constexpr std::array::Alignment> shuffleMask = VectorBase::template GetExtractLoMaskEpi8(); __m128i shuffleVec = _mm_loadu_si128(reinterpret_cast(shuffleMask.data())); return VectorF32(_mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(this->v), shuffleVec))); } else if constexpr(std::is_same_v::VectorType, __m256>) { constexpr std::array::AlignmentElement> permMask =VectorBase::template GetExtractLoMaskepi32(); __m256i permIdx = _mm256_loadu_si256(reinterpret_cast(permMask.data())); __m256i result = _mm256_permutexvar_epi32(permIdx, _mm_castps_si256(this->v)); return VectorF32(_mm_castsi128_ps(_mm256_castsi256_si128(result))); #ifdef __AVX512F__ } else { constexpr std::array::AlignmentElement> permMask = VectorBase::template GetExtractLoMaskEpi32(); __m512i permIdx = _mm512_loadu_epi32(permMask.data()); __m512i result = _mm512_permutexvar_epi32(permIdx, _mm512_castps_si512(this->v)); return VectorF32(_mm_castsi128_ps(_mm512_castsi512_si128(result))); #endif } } else if constexpr(std::is_same_v::VectorType, __m256>) { if constexpr(std::is_same_v::VectorType, __m128>) { constexpr std::array::AlignmentElement> permMask = 
VectorBase::template GetExtractLoMaskEpi32(); __m256i permIdx = _mm256_loadu_si256(reinterpret_cast(permMask.data())); __m256i result = _mm256_permutexvar_epi32(permIdx, _mm256_castsi128_si256(_mm_castps_si128(this->v))); return VectorF32(_mm256_castsi256_ps(result)); } else if constexpr(std::is_same_v::VectorType, __m256>) { constexpr std::array::AlignmentElement> permMask = VectorBase::template GetExtractLoMaskEpi32(); __m256i permIdx = _mm256_loadu_si256(reinterpret_cast(permMask.data())); __m256i result = _mm256_permutexvar_epi32(permIdx, _mm256_castps_si256(this->v)); return VectorF32(_mm256_castsi256_ps(result)); #ifdef __AVX512F__ } else { constexpr std::array::AlignmentElement> permMask = VectorBase::template GetExtractLoMaskEpi32(); __m256i permIdx = _mm512_loadu_epi32(permMask.data()); __m256i result = _mm512_permutexvar_epi32(permIdx, _mm512_castsi512_si256(_mm512_castps_si512(this->v))); return VectorF32(_mm256_castsi256_ps(result)); #endif } #ifdef __AVX512F__ } else { if constexpr(std::is_same_v::VectorType, __m128>) { constexpr std::array::AlignmentElement> permMask = VectorBase::template GetExtractLoMaskEpi32(); __m512i permIdx = _mm512_loadu_epi32(permMask.data()); __m512i result = _mm512_permutexvar_epi32(permIdx, _mm512_castsi128_si512(_mm_castps_si128(this->v))); return VectorF32(_mm512_castsi512_ps(result)); } else if constexpr(std::is_same_v::VectorType, __m256>) { constexpr std::array::AlignmentElement> permMask = VectorBase::template GetExtractLoMaskEpi32(); __m512i permIdx = _mm512_loadu_epi32(permMask.data()); __m512i result = _mm512_permutexvar_epi32(permIdx, _mm512_castsi256_si512(_mm256_castps_si256(this->v))); return VectorF32(_mm512_castsi512_ps(result)); } else { constexpr std::array::AlignmentElement> permMask = VectorBase::template GetExtractLoMaskEpi32(); __m512i permIdx = _mm512_loadu_epi32(permMask.data()); __m512i result = _mm512_permutexvar_epi32(permIdx, _mm512_castps_si512(this->v)); return 
VectorF32(_mm512_castsi512_ps(result)); } #endif } } } constexpr VectorF32 operator+(VectorF32 b) const { if constexpr(std::is_same_v::VectorType, __m128>) { return VectorF32(_mm_add_ps(this->v, b.v)); } else if constexpr(std::is_same_v::VectorType, __m256>) { return VectorF32(_mm256_add_ps(this->v, b.v)); } else { return VectorF32(_mm512_add_ps(this->v, b.v)); } } constexpr VectorF32 operator-(VectorF32 b) const { if constexpr(std::is_same_v::VectorType, __m128>) { return VectorF32(_mm_sub_ps(this->v, b.v)); } else if constexpr(std::is_same_v::VectorType, __m256>) { return VectorF32(_mm256_sub_ps(this->v, b.v)); } else { return VectorF32(_mm512_sub_ps(this->v, b.v)); } } constexpr VectorF32 operator*(VectorF32 b) const { if constexpr(std::is_same_v::VectorType, __m128>) { return VectorF32(_mm_mul_ps(this->v, b.v)); } else if constexpr(std::is_same_v::VectorType, __m256>) { return VectorF32(_mm256_mul_ps(this->v, b.v)); } else { return VectorF32(_mm512_mul_ps(this->v, b.v)); } } constexpr VectorF32 operator/(VectorF32 b) const { if constexpr(std::is_same_v::VectorType, __m128>) { return VectorF32(_mm_div_ps(this->v, b.v)); } else if constexpr(std::is_same_v::VectorType, __m256>) { return VectorF32(_mm256_div_ps(this->v, b.v)); } else { return VectorF32(_mm512_div_ps(this->v, b.v)); } } constexpr void operator+=(VectorF32 b) { if constexpr(std::is_same_v::VectorType, __m128>) { this->v = _mm_add_ps(this->v, b.v); } else if constexpr(std::is_same_v::VectorType, __m256>) { this->v = _mm256_add_ps(this->v, b.v); } else { this->v = _mm512_add_ps(this->v, b.v); } } constexpr void operator-=(VectorF32 b) { if constexpr(std::is_same_v::VectorType, __m128>) { this->v = _mm_sub_ps(this->v, b.v); } else if constexpr(std::is_same_v::VectorType, __m256>) { this->v = _mm256_sub_ps(this->v, b.v); } else { this->v = _mm512_sub_ps(this->v, b.v); } } constexpr void operator*=(VectorF32 b) { if constexpr(std::is_same_v::VectorType, __m128>) { this->v = _mm_mul_ps(this->v, b.v); } 
else if constexpr(std::is_same_v::VectorType, __m256>) { this->v = _mm256_mul_ps(this->v, b.v); } else { this->v = _mm512_mul_ps(this->v, b.v); } } constexpr void operator/=(VectorF32 b) { if constexpr(std::is_same_v::VectorType, __m128>) { this->v = _mm_div_ps(this->v, b.v); } else if constexpr(std::is_same_v::VectorType, __m256>) { this->v = _mm256_div_ps(this->v, b.v); } else { this->v = _mm512_div_ps(this->v, b.v); } } constexpr VectorF32 operator+(float b) { VectorF32 vB(b); return *this + vB; } constexpr VectorF32 operator-(float b) { VectorF32 vB(b); return *this - vB; } constexpr VectorF32 operator*(float b) { VectorF32 vB(b); return *this * vB; } constexpr VectorF32 operator/(float b) { VectorF32 vB(b); return *this / vB; } constexpr void operator+=(float b) { VectorF32 vB(b); *this += vB; } constexpr void operator-=(float b) { VectorF32 vB(b); *this -= vB; } constexpr void operator*=(float b) { VectorF32 vB(b); *this *= vB; } constexpr void operator/=(float b) { VectorF32 vB(b); *this /= vB; } constexpr VectorF32 operator-(){ return Negate::GetAllTrue()>(); } constexpr bool operator==(VectorF32 b) const { if constexpr(std::is_same_v::VectorType, __m128>) { #ifdef __AVX512VL__ return _mm_cmp_ps_mask(this->v, b.v, _CMP_EQ_OQ) == 0xF; #else return _mm_movemask_ps(_mm_cmpeq_ps(this->v, b.v)) == 0xF; #endif } else if constexpr(std::is_same_v::VectorType, __m256>) { #ifdef __AVX512VL__ return _mm256_cmp_ps_mask(this->v, b.v, _CMP_EQ_OQ) == 0xFF; #else return _mm256_movemask_ps(_mm256_cmp_ps(this->v, b.v, _CMP_EQ_OQ)) == 0xFF; #endif } else { return _mm512_cmp_ps_mask(this->v, b.v, _CMP_EQ_OQ) == 0xFFFF; } } constexpr bool operator!=(VectorF32 b) const { return !(*this == b); } template constexpr VectorF32 ExtractLo() const { if constexpr(Packing > 1) { if constexpr(std::is_same_v::VectorType, __m128>) { constexpr std::array::Alignment> shuffleMask = VectorBase::template GetExtractLoMaskEpi8(); __m128i shuffleVec = _mm_loadu_epi8(shuffleMask.data()); return 
VectorF32(_mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(this->v), shuffleVec))); } else if constexpr(std::is_same_v::VectorType, __m256>) { constexpr std::array::AlignmentElement> permMask = VectorBase::template GetExtractLoMaskEpi32(); __m256i permIdx = _mm256_loadu_si256(reinterpret_cast(permMask.data())); __m256i result = _mm256_permutexvar_epi32(permIdx, _mm256_castps_si256(this->v)); if constexpr(std::is_same_v::VectorType, __m128>) { return VectorF32(_mm256_castps256_ps128(_mm256_castsi256_ps(result))); } else { return VectorF32(_mm256_castsi256_ps(result)); } } else { constexpr std::array::AlignmentElement> permMask = VectorBase::template GetExtractLoMaskEpi32(); __m512i permIdx = _mm512_loadu_epi32(permMask.data()); __m512i result = _mm512_permutexvar_epi32(permIdx, _mm512_castps_si512(this->v)); if constexpr(std::is_same_v::VectorType, __m128>) { return VectorF32(_mm512_castps512_ps128(_mm512_castsi512_ps(result))); } else if constexpr(std::is_same_v::VectorType, __m256>) { return VectorF32(_mm512_castps512_ps256(_mm512_castsi512_ps(result))); } else { return VectorF32(_mm512_castsi512_ps(result)); } } } else { if constexpr(std::is_same_v::VectorType, __m256> && std::is_same_v::VectorType, __m128>) { return VectorF32(_mm256_castps256_ps128(this->v)); #ifdef __AVX512F__ } else if constexpr(std::is_same_v::VectorType, __m512> && std::is_same_v::VectorType, __m128>) { return VectorF32(_mm512_castps512_ps128(this->v)); } else if constexpr(std::is_same_v::VectorType, __m512> && std::is_same_v::VectorType, __m256>) { return VectorF32(_mm512_castps512_ps256(this->v)); #endif } else { return VectorF32(this->v); } } } constexpr VectorF32 Cos() { if constexpr (std::is_same_v::VectorType, __m128>) { return VectorF32(VectorBase::cos_f32x4(this->v)); } else if constexpr (std::is_same_v::VectorType, __m256>) { return VectorF32(VectorBase::cos_f32x8(this->v)); #ifdef __AVX512F__ } else { return VectorF32(VectorBase::cos_f32x16(this->v)); #endif } } constexpr 
VectorF32 Sin() { if constexpr (std::is_same_v::VectorType, __m128>) { return VectorF32(VectorBase::sin_f32x4(this->v)); } else if constexpr (std::is_same_v::VectorType, __m256>) { return VectorF32(VectorBase::sin_f32x8(this->v)); #ifdef __AVX512F__ } else { return VectorF32(VectorBase::sin_f32x16(this->v)); #endif } } std::tuple, VectorF32> SinCos() { if constexpr (std::is_same_v::VectorType, __m128>) { __m128 s, c; VectorBase::sincos_f32x4(this->v, s, c); return { VectorF32(s), VectorF32(c) }; } else if constexpr (std::is_same_v::VectorType, __m256>) { __m256 s, c; VectorBase::sincos_f32x8(this->v, s, c); return { VectorF32(s), VectorF32(c) }; #ifdef __AVX512F__ } else { __m512 s, c; VectorBase::sincos_f32x16(this->v, s, c); return { VectorF32(s), VectorF32(c) }; #endif } } template values> constexpr VectorF32 Negate() const { std::array::AlignmentElement> mask = VectorBase::template GetNegateMask(); if constexpr(std::is_same_v::VectorType, __m128>) { return VectorF32(_mm_castsi128_ps(_mm_xor_si128(_mm_castps_si128(this->v), _mm_loadu_si128(reinterpret_cast<__m128i*>(mask.data()))))); } else if constexpr(std::is_same_v::VectorType, __m256>) { return VectorF32(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(this->v), _mm256_loadu_si256(reinterpret_cast<__m256i*>(mask.data()))))); #ifdef __AVX512F__ } else { return VectorF32(_mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(this->v), _mm512_loadu_epi32(mask.data())))); #endif } } static constexpr VectorF32 MulitplyAdd(VectorF32 a, VectorF32 b, VectorF32 add) { if constexpr(std::is_same_v::VectorType, __m128>) { return VectorF32(_mm_fmadd_ps(a.v, b.v, add.v)); } else if constexpr(std::is_same_v::VectorType, __m256>) { return VectorF32(_mm256_fmadd_ps(a.v, b.v, add.v)); #ifdef __AVX512F__ } else { return VectorF32(_mm512_fmadd_ps(a.v, b.v, add.v)); #endif } } static constexpr VectorF32 MulitplySub(VectorF32 a, VectorF32 b, VectorF32 sub) { if constexpr(std::is_same_v::VectorType, __m128>) { return 
VectorF32(_mm_fmsub_ps(a.v, b.v, sub.v)); } else if constexpr(std::is_same_v::VectorType, __m256>) { return VectorF32(_mm256_fmsub_ps(a.v, b.v, sub.v)); #ifdef __AVX512F__ } else { return VectorF32(_mm512_fmsub_ps(a.v, b.v, sub.v)); #endif } } constexpr static VectorF32 Cross(VectorF32 a, VectorF32 b) requires(Len == 3) { VectorF32 row1 = a.template Shuffle<{{1,2,0}}>(); VectorF32 row4 = b.template Shuffle<{{1,2,0}}>(); VectorF32 row3 = a.template Shuffle<{{2,0,1}}>(); VectorF32 row2 = b.template Shuffle<{{2,0,1}}>(); VectorF32 result = row3 * row4; return VectorF32::MulitplySub(row1, row2, result); } template ShuffleValues> constexpr VectorF32 Shuffle() { if constexpr(VectorBase::template CheckEpi32Shuffle()) { constexpr std::uint8_t imm = VectorBase::template GetShuffleMaskEpi32(); if constexpr(std::is_same_v::VectorType, __m128>) { return VectorF32(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(this->v), imm))); } else if constexpr(std::is_same_v::VectorType, __m256>) { return VectorF32(_mm256_castsi256_ps(_mm256_shuffle_epi32(_mm256_castps_si256(this->v), imm))); #ifdef __AVX512F__ } else { return VectorF32(_mm512_castsi512_ps(_mm512_shuffle_epi32(_mm512_castps_si512(this->v), imm))); #endif } } else if constexpr(VectorBase::template CheckEpi8Shuffle()) { constexpr std::array::Alignment> shuffleMask = VectorBase::template GetShuffleMaskEpi8(); if constexpr(std::is_same_v::VectorType, __m128>) { __m128i shuffleVec = _mm_loadu_epi8(shuffleMask.data()); return VectorF32(_mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(this->v), shuffleVec))); } else if constexpr(std::is_same_v::VectorType, __m256>) { #ifdef __AVX512BW__ __m256i shuffleVec = _mm256_loadu_si256(reinterpret_cast(shuffleMask.data())); return VectorF32(_mm256_castsi256_ps( _mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castps_si256(this->v)),_mm512_castsi256_si512(shuffleVec))))); #else constexpr std::array::AlignmentElement> permMask = VectorBase::template 
GetPermuteMaskEpi32(); __m256i permIdx = _mm256_loadu_si256(reinterpret_cast(permMask.data())); return VectorF32(_mm256_castsi256_ps(_mm256_permutevar8x32_epi32(_mm256_castps_si256(this->v), permIdx))); #endif #ifdef __AVX512F__ } else { __m512i shuffleVec = _mm512_loadu_si512(reinterpret_cast(shuffleMask.data())); return VectorF32(_mm512_castsi512_ps(_mm512_shuffle_epi8(_mm512_castps_si512(this->v), shuffleVec))); #endif } } else { if constexpr(std::is_same_v::VectorType, __m128>) { constexpr std::array::Alignment> shuffleMask = VectorBase::template GetShuffleMaskEpi8(); __m128i shuffleVec = _mm_loadu_epi8(shuffleMask.data()); return VectorF32(_mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(this->v), shuffleVec))); } else if constexpr(std::is_same_v::VectorType, __m256>) { constexpr std::array::AlignmentElement> permMask = VectorBase::template GetPermuteMaskEpi32(); __m256i permIdx = _mm256_loadu_si256(reinterpret_cast(permMask.data())); return VectorF32(_mm256_castsi256_ps(_mm256_permutevar8x32_epi32(_mm256_castps_si256(this->v), permIdx))); #ifdef __AVX512F__ } else { constexpr std::array::AlignmentElement> permMask = VectorBase::template GetPermuteMaskEpi32(); __m512i permIdx = _mm512_loadu_epi32(permMask.data()); return VectorF32(_mm512_castsi512_ps(_mm512_permutexvar_epi32(permIdx, _mm512_castps_si512(this->v)))); #endif } } } // Public variadic surface — one name per op, arity locked to BatchSize. // The Pack helpers below carry the SIMD bodies and the per-(Len,Packing) // requires clauses; this wrapper just forwards once arity matches. template requires ((std::is_same_v> && ...) && (1 + sizeof...(Rest) == VectorBase::BatchSize)) constexpr static auto Normalize(VectorF32 first, Rest... rest) { return NormalizePack(first, rest...); } template requires ((std::is_same_v> && ...) && (1 + sizeof...(Rest) == VectorBase::BatchSize)) constexpr static auto Length(VectorF32 first, Rest... 
rest) { return LengthPack(first, rest...); } template requires ((std::is_same_v> && ...) && (1 + sizeof...(Rest) == VectorBase::BatchSize)) constexpr static auto LengthSq(VectorF32 first, Rest... rest) { return LengthSqPack(first, rest...); } template requires ((std::is_same_v> && ...) && (1 + sizeof...(Rest) == 2 * VectorBase::BatchSize)) constexpr static auto Dot(VectorF32 first, Rest... rest) { return DotPack(first, rest...); } private: constexpr static std::array, VectorBase::BatchSize> NormalizePack( VectorF32 A, VectorF32 B, VectorF32 C, VectorF32 D ) requires(Len == 4 && Packing*Len == VectorBase::AlignmentElement) { if constexpr(std::is_same_v::VectorType, __m128>) { VectorF32<1, 4> lenght = LengthNoShuffle(A, C, B, D); constexpr float oneArr[] {1, 1, 1, 1}; __m128 one = _mm_loadu_ps(oneArr); VectorF32<4, 1> fLenght(_mm_div_ps(one, lenght.v)); VectorF32<4, 1> fLenghtA = fLenght.template Shuffle<{{0,0,0,0}}>(); VectorF32<4, 1> fLenghtB = fLenght.template Shuffle<{{1,1,1,1}}>(); VectorF32<4, 1> fLenghtC = fLenght.template Shuffle<{{2,2,2,2}}>(); VectorF32<4, 1> fLenghtD = fLenght.template Shuffle<{{3,3,3,3}}>(); return { _mm_mul_ps(A.v, fLenghtA.v), _mm_mul_ps(B.v, fLenghtB.v), _mm_mul_ps(C.v, fLenghtC.v), _mm_mul_ps(D.v, fLenghtD.v) }; } else if constexpr(std::is_same_v::VectorType, __m256>) { VectorF32<1, 8> lenght = LengthNoShuffle(A, C, B, D); constexpr float oneArr[] {1, 1, 1, 1, 1, 1, 1, 1}; __m256 one = _mm256_loadu_ps(oneArr); VectorF32<8, 1> fLenght(_mm256_div_ps(one, lenght.v)); VectorF32<8, 1> fLenghtA = fLenght.template Shuffle<{{0,0,0,0,4,4,4,4}}>(); VectorF32<8, 1> fLenghtB = fLenght.template Shuffle<{{1,1,1,1,5,5,5,5}}>(); VectorF32<8, 1> fLenghtC = fLenght.template Shuffle<{{2,2,2,2,6,6,6,6}}>(); VectorF32<8, 1> fLenghtD = fLenght.template Shuffle<{{3,3,3,3,7,7,7,7}}>(); return { _mm256_mul_ps(A.v, fLenghtA.v), _mm256_mul_ps(B.v, fLenghtB.v), _mm256_mul_ps(C.v, fLenghtC.v), _mm256_mul_ps(D.v, fLenghtD.v) }; #if defined(__AVX512F__) } else { 
VectorF32<1, 16> lenght = LengthNoShuffle(A, C, B, D); constexpr float oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; __m512 one = _mm512_loadu_ps(oneArr); VectorF32<16, 1> fLenght(_mm512_div_ps(one, lenght.v)); VectorF32<16, 1> fLenght2(lenght.v); VectorF32<16, 1> fLenghtA = fLenght.template Shuffle<{{0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12}}>(); VectorF32<16, 1> fLenghtB = fLenght.template Shuffle<{{1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13}}>(); VectorF32<16, 1> fLenghtC = fLenght.template Shuffle<{{2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14}}>(); VectorF32<16, 1> fLenghtD = fLenght.template Shuffle<{{3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15}}>(); return { VectorF32(_mm512_mul_ps(A.v, fLenghtA.v)), VectorF32(_mm512_mul_ps(B.v, fLenghtB.v)), VectorF32(_mm512_mul_ps(C.v, fLenghtC.v)), VectorF32(_mm512_mul_ps(D.v, fLenghtD.v)), }; #endif } } constexpr static std::array, VectorBase::BatchSize> NormalizePack( VectorF32 A, VectorF32 B, VectorF32 C, VectorF32 D ) requires(Len == 3 && Packing == 1) { VectorF32<1, 4> lenght = Length(A, B, C, D); constexpr float oneArr[] {1, 1, 1, 1}; __m128 one = _mm_loadu_ps(oneArr); VectorF32<4, 1> fLenght(_mm_div_ps(one, lenght.v)); VectorF32<4, 1> fLenghtA = fLenght.template Shuffle<{{0,0,0,0}}>(); VectorF32<4, 1> fLenghtB = fLenght.template Shuffle<{{1,1,1,1}}>(); VectorF32<4, 1> fLenghtC = fLenght.template Shuffle<{{2,2,2,2}}>(); VectorF32<4, 1> fLenghtD = fLenght.template Shuffle<{{3,3,3,3}}>(); return { _mm_mul_ps(A.v, fLenghtA.v), _mm_mul_ps(B.v, fLenghtB.v), _mm_mul_ps(C.v, fLenghtC.v), _mm_mul_ps(D.v, fLenghtD.v) }; } constexpr static std::array, VectorBase::BatchSize> NormalizePack( VectorF32 A, VectorF32 B, VectorF32 C, VectorF32 D ) requires(Len == 3 && Packing == 2) { VectorF32<1, 8> lenght = Length(A, B, C, D); constexpr float oneArr[] {1, 1, 1, 1, 1, 1, 1, 1}; __m256 one = _mm256_loadu_ps(oneArr); VectorF32<8, 1> fLenght(_mm256_div_ps(one, lenght.v)); VectorF32<8, 1> fLenghtA = fLenght.template Shuffle<{{0,0,0, 1,1,1}}>(); 
VectorF32<8, 1> fLenghtB = fLenght.template Shuffle<{{2,2,2, 3,3,3}}>(); VectorF32<8, 1> fLenghtC = fLenght.template Shuffle<{{4,4,4, 5,5,5}}>(); VectorF32<8, 1> fLenghtD = fLenght.template Shuffle<{{6,6,6, 7,7,7}}>(); return { _mm256_mul_ps(A.v, fLenghtA.v), _mm256_mul_ps(B.v, fLenghtB.v), _mm256_mul_ps(C.v, fLenghtC.v), _mm256_mul_ps(D.v, fLenghtD.v) }; } #ifdef __AVX512F__ constexpr static std::array, VectorBase::BatchSize> NormalizePack( VectorF32 A, VectorF32 B, VectorF32 C ) requires(Len == 3 && Packing == 5) { VectorF32<1, 15> lenght = Length(A, B, C); constexpr float oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; __m512 one = _mm512_loadu_ps(oneArr); VectorF32<15, 1> fLenght(_mm512_div_ps(one, lenght.v)); VectorF32<15, 1> fLenghtA = fLenght.template Shuffle<{{0,0,0, 1,1,1, 2,2,2, 3,3,3, 4,4,4}}>(); VectorF32<15, 1> fLenghtB = fLenght.template Shuffle<{{5,5,5, 6,6,6, 7,7,7, 8,8,8, 9,9,9}}>(); VectorF32<15, 1> fLenghtC = fLenght.template Shuffle<{{10,10,10, 11,11,11, 12,12,12, 13,13,13, 14,14,14}}>(); return { _mm512_mul_ps(A.v, fLenghtA.v), _mm512_mul_ps(B.v, fLenghtB.v), _mm512_mul_ps(C.v, fLenghtC.v), }; } #endif constexpr static std::array, VectorBase::BatchSize> NormalizePack( VectorF32 A, VectorF32 B ) requires(Len == 2 && Packing*Len == VectorBase::AlignmentElement) { if constexpr(std::is_same_v::VectorType, __m128>) { VectorF32<1, 4> lenght = LengthNoShuffle(A, B); constexpr float oneArr[] {1, 1, 1, 1}; __m128 one = _mm_loadu_ps(oneArr); VectorF32<4, 1> fLenght(_mm_div_ps(one, lenght.v)); VectorF32<4, 1> fLenghtA = fLenght.template Shuffle<{{0,0,1,1}}>(); VectorF32<4, 1> fLenghtB = fLenght.template Shuffle<{{2,2,3,3}}>(); return { _mm_mul_ps(A.v, fLenghtA.v), _mm_mul_ps(B.v, fLenghtB.v), }; } else if constexpr(std::is_same_v::VectorType, __m256>) { VectorF32<1, 8> lenght = LengthNoShuffle(A, B); constexpr float oneArr[] {1, 1, 1, 1, 1, 1, 1, 1}; __m256 one = _mm256_loadu_ps(oneArr); VectorF32<8, 1> fLenght(_mm256_div_ps(one, lenght.v)); 
VectorF32<8, 1> fLenghtA = fLenght.template Shuffle<{{0,0,1,1,4,4,5,5}}>(); VectorF32<8, 1> fLenghtB = fLenght.template Shuffle<{{2,2,3,3,6,6,7,7}}>(); return { _mm256_mul_ps(A.v, fLenghtA.v), _mm256_mul_ps(B.v, fLenghtB.v), }; #ifdef __AVX512F__ } else { VectorF32<1, 16> lenght = LengthNoShuffle(A, B); constexpr float oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; __m512 one = _mm512_loadu_ps(oneArr); VectorF32<16, 1> fLenght(_mm512_div_ps(one, lenght.v)); VectorF32<16, 1> fLenghtA = fLenght.template Shuffle<{{0,0,1,1,4,4,5,5,8,8,9,9,12,12,13,13}}>(); VectorF32<16, 1> fLenghtB = fLenght.template Shuffle<{{2,2,3,3,6,6,7,7,10,10,11,11,14,14,15,15}}>(); return { _mm512_mul_ps(A.v, fLenghtA.v), _mm512_mul_ps(B.v, fLenghtB.v), }; #endif } } constexpr static VectorF32<1, Packing*4> LengthPack( VectorF32 A, VectorF32 B, VectorF32 C, VectorF32 D ) requires(Len == 4 && Packing*Len == VectorBase::AlignmentElement) { VectorF32<1, Packing*4> lenghtSq = LengthSqPack(A, B, C, D); if constexpr(std::is_same_v::VectorType, __m128>) { return VectorF32<1, Packing*4>(_mm_sqrt_ps(lenghtSq.v)); } else if constexpr(std::is_same_v::VectorType, __m256>) { return VectorF32<1, Packing*4>(_mm256_sqrt_ps(lenghtSq.v)); } else { return VectorF32<1, Packing*4>(_mm512_sqrt_ps(lenghtSq.v)); } } constexpr static VectorF32<1, 4> LengthPack( VectorF32 A, VectorF32 B, VectorF32 C, VectorF32 D ) requires(Len == 3 && Packing == 1) { VectorF32<1, 4> lenghtSq = LengthSqPack(A, B, C, D); return VectorF32<1, 4>(_mm_sqrt_ps(lenghtSq.v)); } constexpr static VectorF32<1, 8> LengthPack( VectorF32 A, VectorF32 B, VectorF32 C, VectorF32 D ) requires(Len == 3 && Packing == 2) { VectorF32<1, 8> lenghtSq = LengthSqPack(A, B, C, D); return VectorF32<1, Packing*4>(_mm256_sqrt_ps(lenghtSq.v)); } #ifdef __AVX512F__ constexpr static VectorF32<1, 15> LengthPack( VectorF32 A, VectorF32 B, VectorF32 C ) requires(Len == 3 && Packing == 5) { VectorF32<1, 15> lenghtSq = LengthSqPack(A, B, C); return VectorF32<1, 
15>(_mm512_sqrt_ps(lenghtSq.v)); } #endif constexpr static VectorF32<1, Packing*2> LengthPack( VectorF32 A, VectorF32 C ) requires(Len == 2 && Packing*Len == VectorBase::AlignmentElement) { VectorF32<1, Packing*2> lenghtSq = LengthSqPack(A, C); if constexpr(std::is_same_v::VectorType, __m128>) { return VectorF32<1, Packing*2>(_mm_sqrt_ps(lenghtSq.v)); } else if constexpr(std::is_same_v::VectorType, __m256>) { return VectorF32<1, Packing*2>(_mm256_sqrt_ps(lenghtSq.v)); #ifdef __AVX512F__ } else { return VectorF32<1, Packing*2>(_mm512_sqrt_ps(lenghtSq.v)); #endif } } constexpr static VectorF32<1, Packing*4> LengthSqPack( VectorF32 A, VectorF32 B, VectorF32 C, VectorF32 D ) requires(Len == 4 && Packing*Len == VectorBase::AlignmentElement) { return DotPack(A, A, B, B, C, C, D, D); } constexpr static VectorF32<1, 4> LengthSqPack( VectorF32 A, VectorF32 B, VectorF32 C, VectorF32 D ) requires(Len == 3 && Packing == 1) { return DotPack(A, A, B, B, C, C, D, D); } constexpr static VectorF32<1, 8> LengthSqPack( VectorF32 A, VectorF32 B, VectorF32 C, VectorF32 D ) requires(Len == 3 && Packing == 2) { return DotPack(A, A, B, B, C, C, D, D); } #ifdef __AVX512F__ constexpr static VectorF32<1, 15> LengthSqPack( VectorF32 A, VectorF32 B, VectorF32 C ) requires(Len == 3 && Packing == 5) { return DotPack(A, A, B, B, C, C); } #endif constexpr static VectorF32<1, Packing*2> LengthSqPack( VectorF32 A, VectorF32 C ) requires(Len == 2 && Packing*Len == VectorBase::AlignmentElement) { return DotPack(A, A, C, C); } constexpr static VectorF32<1, Packing*4> DotPack( VectorF32 A0, VectorF32 A1, VectorF32 B0, VectorF32 B1, VectorF32 C0, VectorF32 C1, VectorF32 D0, VectorF32 D1 ) requires(Len == 4 && Packing*Len == VectorBase::AlignmentElement) { if constexpr(std::is_same_v::VectorType, __m128>) { return DotNoShuffle(A0, A1, C0, C1, B0, B1, D0, D1); } else if constexpr(std::is_same_v::VectorType, __m256>) { VectorF32<8, 1> vec(DotNoShuffle(A0, A1, B0, B1, C0, C1, D0, D1).v); vec = vec.template 
Shuffle<{{ 0,4,2,6, 1,5,3,7, }}>(); return vec.v; #ifdef __AVX512F__ } else { VectorF32<16, 1> vec(DotNoShuffle(A0, A1, B0, B1, C0, C1, D0, D1).v); vec = vec.template Shuffle<{{ 0,4,8,12, 2,6,10,14, 1,5,9,13, 3,7,11,15 }}>(); return vec.v; #endif } } constexpr static VectorF32<1, 4> DotPack( VectorF32 A0, VectorF32 A1, VectorF32 B0, VectorF32 B1, VectorF32 C0, VectorF32 C1, VectorF32 D0, VectorF32 D1 ) requires(Len == 3 && Packing == 1) { // Each register: [X1 X2 X3 _] // 4 pairs (A,B,C,D) → 4 dot products → 1 x __m128 // // After element-wise multiply: // mulA = [a1 a2 a3 _] (where ai = A0[i]*A1[i]) // mulB = [b1 b2 b3 _] // mulC = [c1 c2 c3 _] // mulD = [d1 d2 d3 _] // // We need: result = [a1+a2+a3, b1+b2+b3, c1+c2+c3, d1+d2+d3] // // Transpose to get: // row1 = [a1 b1 c1 d1] // row2 = [a2 b2 c2 d2] // row3 = [a3 b3 c3 d3] // Then sum rows. __m128 mulA = _mm_mul_ps(A0.v, A1.v); __m128 mulB = _mm_mul_ps(B0.v, B1.v); __m128 mulC = _mm_mul_ps(C0.v, C1.v); __m128 mulD = _mm_mul_ps(D0.v, D1.v); // Standard 4x4 transpose (only first 3 rows matter, 4th is garbage) // unpacklo/hi interleave pairs of 32-bit elements __m128 tmp0 = _mm_unpacklo_ps(mulA, mulB); // a1 b1 a2 b2 __m128 tmp1 = _mm_unpackhi_ps(mulA, mulB); // a3 b3 _ _ __m128 tmp2 = _mm_unpacklo_ps(mulC, mulD); // c1 d1 c2 d2 __m128 tmp3 = _mm_unpackhi_ps(mulC, mulD); // c3 d3 _ _ __m128 row1 = _mm_movelh_ps(tmp0, tmp2); // a1 b1 c1 d1 __m128 row2 = _mm_movehl_ps(tmp2, tmp0); // a2 b2 c2 d2 __m128 row3 = _mm_movelh_ps(tmp1, tmp3); // a3 b3 c3 d3 row1 = _mm_add_ps(row1, row2); row1 = _mm_add_ps(row1, row3); return row1; } constexpr static VectorF32<1, 8> DotPack( VectorF32 A0, VectorF32 A1, VectorF32 B0, VectorF32 B1, VectorF32 C0, VectorF32 C1, VectorF32 D0, VectorF32 D1 ) requires(Len == 3 && Packing == 2) { // Each register: [X1 X2 X3 Y1 Y2 Y3 _ _] // 4 pairs × 2 vectors each = 8 dot products → 1 x __m256 // // After multiply: // mulA = [a1 a2 a3 b1 b2 b3 _ _] // mulB = [c1 c2 c3 d1 d2 d3 _ _] // mulC = [e1 e2 
e3 f1 f2 f3 _ _] // mulD = [g1 g2 g3 h1 h2 h3 _ _] // // We need result = [a·, b·, c·, d·, e·, f·, g·, h·] // where x· = x1+x2+x3 // // Strategy: use permute to gather element 1s, 2s, 3s across all 8 vectors, // then add. // // Gather indices (from the concatenated view of mulA|mulB|mulC|mulD): // vec: a a a b b b _ _ c c c d d d _ _ e e e f f f _ _ g g g h h h _ _ // idx: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 // // elem1 = [a1, b1, c1, d1, e1, f1, g1, h1] → indices [0, 3, 8, 11, 16, 19, 24, 27] // elem2 = [a2, b2, c2, d2, e2, f2, g2, h2] → indices [1, 4, 9, 12, 17, 20, 25, 28] // elem3 = [a3, b3, c3, d3, e3, f3, g3, h3] → indices [2, 5, 10, 13, 18, 21, 26, 29] // // Unfortunately AVX2 doesn't have cross-register permutes for 8x32 easily. // Use vpermd (_mm256_permutevar8x32) within pairs, then blend/combine. // // Within each 256-bit register [X1 X2 X3 Y1 Y2 Y3 _ _]: // elem1_local = [X1 Y1 ...] → gather from indices 0,3 // elem2_local = [X2 Y2 ...] → gather from indices 1,4 // elem3_local = [X3 Y3 ...] → gather from indices 2,5 // // After permutevar8x32 on each mul register: // From mulA: row1_part = [a1 b1 _ _ _ _ _ _] // From mulB: row1_part = [c1 d1 _ _ _ _ _ _] // From mulC: row1_part = [e1 f1 _ _ _ _ _ _] // From mulD: row1_part = [g1 h1 _ _ _ _ _ _] // // Then combine with unpack/shuffle to get full rows. __m256 mulA = _mm256_mul_ps(A0.v, A1.v); // a1 a2 a3 b1 b2 b3 _ _ __m256 mulB = _mm256_mul_ps(B0.v, B1.v); // c1 c2 c3 d1 d2 d3 _ _ __m256 mulC = _mm256_mul_ps(C0.v, C1.v); // e1 e2 e3 f1 f2 f3 _ _ __m256 mulD = _mm256_mul_ps(D0.v, D1.v); // g1 g2 g3 h1 h2 h3 _ _ // Permute each register to gather elements by position. 
// For each register [X1 X2 X3 Y1 Y2 Y3 U U]: // perm1: [X1 Y1 X2 Y2 X3 Y3 _ _] → indices {0,3,1,4,2,5,6,7} __m256i permIdx = _mm256_setr_epi32(0, 3, 1, 4, 2, 5, 6, 7); // After permute: [X1 Y1 X2 Y2 X3 Y3 _ _] __m256 pA = _mm256_permutevar8x32_ps(mulA, permIdx); // a1 b1 a2 b2 a3 b3 _ _ __m256 pB = _mm256_permutevar8x32_ps(mulB, permIdx); // c1 d1 c2 d2 c3 d3 _ _ __m256 pC = _mm256_permutevar8x32_ps(mulC, permIdx); // e1 f1 e2 f2 e3 f3 _ _ __m256 pD = _mm256_permutevar8x32_ps(mulD, permIdx); // g1 h1 g2 h2 g3 h3 _ _ // Now combine pairs. Each pair contributes 4 consecutive results. // pA has [a1 b1 a2 b2 a3 b3 _ _], pB has [c1 d1 c2 d2 c3 d3 _ _] // We want: // row1 = [a1 b1 c1 d1 | e1 f1 g1 h1] // row2 = [a2 b2 c2 d2 | e2 f2 g2 h2] // row3 = [a3 b3 c3 d3 | e3 f3 g3 h3] // // From pA: elements at [0,1] are elem1, [2,3] are elem2, [4,5] are elem3 // From pB: elements at [0,1] are elem1, [2,3] are elem2, [4,5] are elem3 // // Use unpacklo_epi64 to interleave 64-bit chunks: // unpacklo64(pA, pB) within 128-bit lanes: // lo lane: pA[0:1]=a1,b1 | pB[0:1]=c1,d1 → [a1 b1 c1 d1] // hi lane: pA[4:5]=a3,b3 | pB[4:5]=c3,d3 → [a3 b3 c3 d3] // → [a1 b1 c1 d1 | a3 b3 c3 d3] // // unpackhi64(pA, pB) within 128-bit lanes: // lo lane: pA[2:3]=a2,b2 | pB[2:3]=c2,d2 → [a2 b2 c2 d2] // hi lane: pA[6:7]=_,_ | pB[6:7]=_,_ → garbage // → [a2 b2 c2 d2 | _ _ _ _] __m256i AB_lo = _mm256_unpacklo_epi64( _mm256_castps_si256(pA), _mm256_castps_si256(pB)); // [a1 b1 c1 d1 | a3 b3 c3 d3] __m256i AB_hi = _mm256_unpackhi_epi64( _mm256_castps_si256(pA), _mm256_castps_si256(pB)); // [a2 b2 c2 d2 | _ _ _ _] __m256i CD_lo = _mm256_unpacklo_epi64( _mm256_castps_si256(pC), _mm256_castps_si256(pD)); // [e1 f1 g1 h1 | e3 f3 g3 h3] __m256i CD_hi = _mm256_unpackhi_epi64( _mm256_castps_si256(pC), _mm256_castps_si256(pD)); // [e2 f2 g2 h2 | _ _ _ _] // row1 = [a1 b1 c1 d1 | e1 f1 g1 h1] → lo 128 of AB_lo, lo 128 of CD_lo // row2 = [a2 b2 c2 d2 | e2 f2 g2 h2] → lo 128 of AB_hi, lo 128 of CD_hi // row3 = [a3 
b3 c3 d3 | e3 f3 g3 h3] → hi 128 of AB_lo, hi 128 of CD_lo __m256 row1 = _mm256_castsi256_ps(_mm256_permute2x128_si256(AB_lo, CD_lo, 0x20)); // lo,lo __m256 row2 = _mm256_castsi256_ps(_mm256_permute2x128_si256(AB_hi, CD_hi, 0x20)); // lo,lo __m256 row3 = _mm256_castsi256_ps(_mm256_permute2x128_si256(AB_lo, CD_lo, 0x31)); // hi,hi row1 = _mm256_add_ps(row1, row2); row1 = _mm256_add_ps(row1, row3); return row1; } #ifdef __AVX512F__ constexpr static VectorF32<1, 15> DotPack( VectorF32 A0, VectorF32 A1, VectorF32 B0, VectorF32 B1, VectorF32 C0, VectorF32 C1 ) requires(Len == 3 && Packing == 5) { // __m512: Each register: [A1 A2 A3 B1 B2 B3 C1 C2 C3 D1 D2 D3 E1 E2 E3 _] // 3 pairs × 5 vectors each = 15 dot products → fits in 1 x __m512 (slot 16 unused) // // After multiply of 3 pairs: // mul0 = [a1 a2 a3 b1 b2 b3 c1 c2 c3 d1 d2 d3 e1 e2 e3 _] // mul1 = [f1 f2 f3 g1 g2 g3 h1 h2 h3 i1 i2 i3 j1 j2 j3 _] // mul2 = [k1 k2 k3 l1 l2 l3 m1 m2 m3 n1 n2 n3 o1 o2 o3 _] // // Result = [a· b· c· d· e· f· g· h· i· j· k· l· m· n· o· _] // // Strategy: for each mul register, gather element 1s, 2s, 3s with vpermps, // then combine across registers. 
// // From mul0: 5 vectors at positions {0,1,2}, {3,4,5}, {6,7,8}, {9,10,11}, {12,13,14} // elem1 = indices {0, 3, 6, 9, 12} → positions 0..4 of result // elem2 = indices {1, 4, 7, 10, 13} // elem3 = indices {2, 5, 8, 11, 14} __m512 mul0 = _mm512_mul_ps(A0.v, A1.v); __m512 mul1 = _mm512_mul_ps(B0.v, B1.v); __m512 mul2 = _mm512_mul_ps(C0.v, C1.v); // Gather elem1, elem2, elem3 from each mul register // Each register has 5 vec3s: extract element 1,2,3 of each into consecutive positions __m512i idx1 = _mm512_setr_epi32(0, 3, 6, 9, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); __m512i idx2 = _mm512_setr_epi32(1, 4, 7, 10, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); __m512i idx3 = _mm512_setr_epi32(2, 5, 8, 11, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // From mul0 → results 0..4, from mul1 → results 5..9, from mul2 → results 10..14 // Gather from each, then combine. __m512 e1_0 = _mm512_permutexvar_ps(idx1, mul0); // [a1 b1 c1 d1 e1 ...] __m512 e2_0 = _mm512_permutexvar_ps(idx2, mul0); // [a2 b2 c2 d2 e2 ...] __m512 e3_0 = _mm512_permutexvar_ps(idx3, mul0); // [a3 b3 c3 d3 e3 ...] __m512 e1_1 = _mm512_permutexvar_ps(idx1, mul1); // [f1 g1 h1 i1 j1 ...] __m512 e2_1 = _mm512_permutexvar_ps(idx2, mul1); // [f2 g2 h2 i2 j2 ...] __m512 e3_1 = _mm512_permutexvar_ps(idx3, mul1); // [f3 g3 h3 i3 j3 ...] __m512 e1_2 = _mm512_permutexvar_ps(idx1, mul2); // [k1 l1 m1 n1 o1 ...] __m512 e2_2 = _mm512_permutexvar_ps(idx2, mul2); // [k2 l2 m2 n2 o2 ...] __m512 e3_2 = _mm512_permutexvar_ps(idx3, mul2); // [k3 l3 m3 n3 o3 ...] // Now combine: we need positions 0..4 from reg0, 5..9 from reg1, 10..14 from reg2 // Use masked moves to assemble the final row vectors. // mask for positions 0-4: 0b0000000000011111 = 0x001F // mask for positions 5-9: 0b0000001111100000 = 0x03E0 // mask for positions 10-14: 0b0111110000000000 = 0x7C00 // For reg1, its results are in positions 0..4 but need to go to 5..9. // For reg2, its results are in positions 0..4 but need to go to 10..14. 
// Use a different approach: permute reg1/reg2 results to their target positions. // Shift reg1 results from slots 0..4 to slots 5..9 __m512i shiftIdx1 = _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0); // Shift reg2 results from slots 0..4 to slots 10..14 __m512i shiftIdx2 = _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 0); __m512 e1_1_shifted = _mm512_permutexvar_ps(shiftIdx1, e1_1); __m512 e2_1_shifted = _mm512_permutexvar_ps(shiftIdx1, e2_1); __m512 e3_1_shifted = _mm512_permutexvar_ps(shiftIdx1, e3_1); __m512 e1_2_shifted = _mm512_permutexvar_ps(shiftIdx2, e1_2); __m512 e2_2_shifted = _mm512_permutexvar_ps(shiftIdx2, e2_2); __m512 e3_2_shifted = _mm512_permutexvar_ps(shiftIdx2, e3_2); // Blend: take positions 0..4 from reg0, 5..9 from reg1, 10..14 from reg2 __mmask16 mask_5_9 = 0x03E0u; // bits 5-9 __mmask16 mask_10_14 = 0x7C00u; // bits 10-14 __m512 row1 = _mm512_mask_mov_ps(e1_0, mask_5_9, e1_1_shifted); row1 = _mm512_mask_mov_ps(row1, mask_10_14, e1_2_shifted); __m512 row2 = _mm512_mask_mov_ps(e2_0, mask_5_9, e2_1_shifted); row2 = _mm512_mask_mov_ps(row2, mask_10_14, e2_2_shifted); __m512 row3 = _mm512_mask_mov_ps(e3_0, mask_5_9, e3_1_shifted); row3 = _mm512_mask_mov_ps(row3, mask_10_14, e3_2_shifted); row1 = _mm512_add_ps(row1, row2); row1 = _mm512_add_ps(row1, row3); return row1; } #endif constexpr static VectorF32<1, Packing*2> DotPack( VectorF32 A0, VectorF32 A1, VectorF32 C0, VectorF32 C1 ) requires(Len == 2 && Packing*Len == VectorBase::AlignmentElement) { if constexpr(std::is_same_v::VectorType, __m128>) { return DotNoShuffle(A0, A1, C0, C1); } else if constexpr(std::is_same_v::VectorType, __m256>) { VectorF32<8, 1> vec(DotNoShuffle(A0, A1, C0, C1).v); vec = vec.template Shuffle<{{ 0,1, 4,5, 2,3, 6,7, }}>(); return vec.v; #ifdef __AVX512F__ } else { VectorF32<16, 1> vec(DotNoShuffle(A0, A1, C0, C1).v); vec = vec.template Shuffle<{{ 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15 }}>(); return vec.v; #endif } } private: 
constexpr static VectorF32<1, Packing*4> LengthNoShuffle( VectorF32 A, VectorF32 B, VectorF32 C, VectorF32 D ) requires(Len == 4 && Packing*Len == VectorBase::AlignmentElement) { VectorF32<1, Packing*4> lenghtSq = LengthSqNoShuffle(A, B, C, D); if constexpr(std::is_same_v::VectorType, __m128>) { return VectorF32<1, Packing*4>(_mm_sqrt_ps(lenghtSq.v)); } else if constexpr(std::is_same_v::VectorType, __m256>) { return VectorF32<1, Packing*4>(_mm256_sqrt_ps(lenghtSq.v)); #ifdef __AVX512F__ } else { return VectorF32<1, Packing*4>(_mm512_sqrt_ps(lenghtSq.v)); #endif } } constexpr static VectorF32<1, Packing*2> LengthNoShuffle( VectorF32 A, VectorF32 C ) requires(Len == 2 && Packing*Len == VectorBase::AlignmentElement) { VectorF32<1, Packing*2> lenghtSq = LengthSqNoShuffle(A, C); if constexpr(std::is_same_v::VectorType, __m128>) { return VectorF32<1, Packing*2>(_mm_sqrt_ps(lenghtSq.v)); } else if constexpr(std::is_same_v::VectorType, __m256>) { return VectorF32<1, Packing*2>(_mm256_sqrt_ps(lenghtSq.v)); #ifdef __AVX512F__ } else { return VectorF32<1, Packing*2>(_mm512_sqrt_ps(lenghtSq.v)); #endif } } constexpr static VectorF32<1, Packing*4> LengthSqNoShuffle( VectorF32 A, VectorF32 B, VectorF32 C, VectorF32 D ) requires(Len == 4 && Packing*Len == VectorBase::AlignmentElement) { return DotNoShuffle(A, A, B, B, C, C, D, D); } constexpr static VectorF32<1, Packing*2> LengthSqNoShuffle( VectorF32 A, VectorF32 C ) requires(Len == 2 && Packing*Len == VectorBase::AlignmentElement) { return DotNoShuffle(A, A, C, C); } constexpr static VectorF32<1, Packing*4> DotNoShuffle( VectorF32 A0, VectorF32 A1, VectorF32 B0, VectorF32 B1, VectorF32 C0, VectorF32 C1, VectorF32 D0, VectorF32 D1 ) requires(Len == 4 && Packing*Len == VectorBase::AlignmentElement) { if constexpr(std::is_same_v::VectorType, __m128>) { __m128 mulA = _mm_mul_ps(A0.v, A1.v); __m128 mulB = _mm_mul_ps(B0.v, B1.v); __m128i row12Temp1 = _mm_unpacklo_epi32(_mm_castps_si128(mulA), _mm_castps_si128(mulB)); // A1 B1 A2 B2 
__m128i row34Temp1 = _mm_unpackhi_epi32(_mm_castps_si128(mulA), _mm_castps_si128(mulB)); // A3 B3 A4 B4 __m128 mulC = _mm_mul_ps(C0.v, C1.v); __m128 mulD = _mm_mul_ps(D0.v, D1.v); __m128i row12Temp2 = _mm_unpacklo_epi32(_mm_castps_si128(mulC), _mm_castps_si128(mulD)); // C1 D1 C2 D2 __m128i row34Temp2 = _mm_unpackhi_epi32(_mm_castps_si128(mulC), _mm_castps_si128(mulD)); // C3 D3 C4 D4 __m128 row1 = _mm_unpacklo_epi32(row12Temp1, row12Temp2); // A1 C1 B1 D1 __m128 row2 = _mm_unpackhi_epi32(row12Temp1, row12Temp2); // A2 C2 B2 D2 __m128 row3 = _mm_unpacklo_epi32(row34Temp1, row34Temp2); // A3 C3 B3 D3 __m128 row4 = _mm_unpackhi_epi32(row34Temp1, row34Temp2); // A4 C4 B4 D4 row1 = _mm_add_ps(row1, row2); row1 = _mm_add_ps(row1, row3); row1 = _mm_add_ps(row1, row4); return row1; } else if constexpr(std::is_same_v::VectorType, __m256>) { __m256 mulA = _mm256_mul_ps(A0.v, A1.v); __m256 mulB = _mm256_mul_ps(B0.v, B1.v); __m256i row12Temp1 = _mm256_unpacklo_epi32(_mm256_castps_si256(mulA), _mm256_castps_si256(mulB)); // A1 B1 A2 B2 __m256i row34Temp1 = _mm256_unpackhi_epi32(_mm256_castps_si256(mulA), _mm256_castps_si256(mulB)); // A3 B3 A4 B4 __m256 mulC = _mm256_mul_ps(C0.v, C1.v); __m256 mulD = _mm256_mul_ps(D0.v, D1.v); __m256i row12Temp2 = _mm256_unpacklo_epi32(_mm256_castps_si256(mulC), _mm256_castps_si256(mulD)); // C1 D1 C2 D2 __m256i row34Temp2 = _mm256_unpackhi_epi32(_mm256_castps_si256(mulC), _mm256_castps_si256(mulD)); // C3 D3 C4 D4 __m256 row1 = _mm256_unpacklo_epi32(row12Temp1, row12Temp2); // A1 C1 B1 D1 __m256 row2 = _mm256_unpackhi_epi32(row12Temp1, row12Temp2); //A2 C2 B2 D2 __m256 row3 = _mm256_unpacklo_epi32(row34Temp1, row34Temp2); // A3 C3 B3 D3 __m256 row4 = _mm256_unpackhi_epi32(row34Temp1, row34Temp2); // A4 C4 B4 D4 row1 = _mm256_add_ps(row1, row2); row1 = _mm256_add_ps(row1, row3); row1 = _mm256_add_ps(row1, row4); return row1; #ifdef __AVX512F__ } else { __m512 mulA = _mm512_mul_ps(A0.v, A1.v); __m512 mulB = _mm512_mul_ps(B0.v, B1.v); __m512i 
row12Temp1 = _mm512_unpacklo_epi32(_mm512_castps_si512(mulA), _mm512_castps_si512(mulB)); // A1 B1 A2 B2 __m512i row34Temp1 = _mm512_unpackhi_epi32(_mm512_castps_si512(mulA), _mm512_castps_si512(mulB)); // A3 B3 A4 B4 __m512 mulC = _mm512_mul_ps(C0.v, C1.v); __m512 mulD = _mm512_mul_ps(D0.v, D1.v); __m512i row12Temp2 = _mm512_unpacklo_epi32(_mm512_castps_si512(mulC), _mm512_castps_si512(mulD)); // C1 D1 C2 D2 __m512i row34Temp2 = _mm512_unpackhi_epi32(_mm512_castps_si512(mulC), _mm512_castps_si512(mulD)); // C3 D3 C4 D4 __m512 row1 = _mm512_unpacklo_epi32(row12Temp1, row12Temp2); // A1 C1 B1 D1 __m512 row2 = _mm512_unpackhi_epi32(row12Temp1, row12Temp2); //A2 C2 B2 D2 __m512 row3 = _mm512_unpacklo_epi32(row34Temp1, row34Temp2); // A3 C3 B3 D3 __m512 row4 = _mm512_unpackhi_epi32(row34Temp1, row34Temp2); // A4 C4 B4 D4 row1 = _mm512_add_ps(row1, row2); row1 = _mm512_add_ps(row1, row3); row1 = _mm512_add_ps(row1, row4); return row1; #endif } } constexpr static VectorF32<1, Packing*2> DotNoShuffle( VectorF32 A0, VectorF32 A1, VectorF32 C0, VectorF32 C1 ) requires(Len == 2 && Packing*Len == VectorBase::AlignmentElement) { if constexpr(std::is_same_v::VectorType, __m128>) { __m128 mulA = _mm_mul_ps(A0.v, A1.v); __m128 mulC = _mm_mul_ps(C0.v, C1.v); __m128i row12Temp1 = _mm_unpacklo_epi32(_mm_castps_si128(mulA), _mm_castps_si128(mulC)); // A1 C1 A2 C2 __m128i row56Temp1 = _mm_unpackhi_epi32(_mm_castps_si128(mulA), _mm_castps_si128(mulC)); // B1 D1 B2 D2 __m128i row1TempTemp1 = row12Temp1; __m128i row5TempTemp1 = row56Temp1; row12Temp1 = _mm_unpacklo_epi32(row12Temp1, row56Temp1); // A1 B1 C1 D1 row56Temp1 = _mm_unpackhi_epi32(row1TempTemp1, row56Temp1); // A2 B2 C2 D2 return _mm_add_ps(row12Temp1, row56Temp1); } else if constexpr(std::is_same_v::VectorType, __m256>) { __m256 mulA = _mm256_mul_ps(A0.v, A1.v); __m256 mulC = _mm256_mul_ps(C0.v, C1.v); __m256i row12Temp1 = _mm256_unpacklo_epi32(_mm256_castps_si256(mulA), _mm256_castps_si256(mulC)); // A1 C1 A2 C2 __m256i 
row56Temp1 = _mm256_unpackhi_epi32(_mm256_castps_si256(mulA), _mm256_castps_si256(mulC)); // B1 D1 B2 D2 __m256i row1TempTemp1 = row12Temp1; __m256i row5TempTemp1 = row56Temp1; row12Temp1 = _mm256_unpacklo_epi32(row12Temp1, row56Temp1); // A1 B1 C1 D1 row56Temp1 = _mm256_unpackhi_epi32(row1TempTemp1, row56Temp1); // A2 B2 C2 D2 return _mm256_add_ps(row12Temp1, row56Temp1); #ifdef __AVX512F__ } else { __m512 mulA = _mm512_mul_ps(A0.v, A1.v); __m512 mulC = _mm512_mul_ps(C0.v, C1.v); __m512i row12Temp1 = _mm512_unpacklo_epi32(_mm512_castps_si512(mulA), _mm512_castps_si512(mulC)); // A1 C1 A2 C2 __m512i row56Temp1 = _mm512_unpackhi_epi32(_mm512_castps_si512(mulA), _mm512_castps_si512(mulC)); // B1 D1 B2 D2 __m512i row1TempTemp1 = row12Temp1; __m512i row5TempTemp1 = row56Temp1; row12Temp1 = _mm512_unpacklo_epi32(row12Temp1, row56Temp1); // A1 B1 C1 D1 row56Temp1 = _mm512_unpackhi_epi32(row1TempTemp1, row56Temp1); // A2 B2 C2 D2 return _mm512_add_ps(row12Temp1, row56Temp1); #endif } } public: template ShuffleValues> constexpr static VectorF32 Blend(VectorF32 a, VectorF32 b) { constexpr auto mask = VectorBase::template GetBlendMaskEpi32(); if constexpr (std::is_same_v::VectorType, __m128>) { return _mm_castsi128_ps(_mm_blend_epi32(_mm_castps_si128(a.v), _mm_castps_si128(b.v), mask)); } else if constexpr (std::is_same_v::VectorType, __m256>) { return _mm256_castsi256_ps(_mm256_blend_epi32(_mm256_castps_si256(a.v), _mm256_castps_si256(b.v), mask)); #ifdef __AVX512F__ } else if constexpr (std::is_same_v::VectorType, __m512>) { return _mm512_castsi512_ps(_mm512_mask_blend_epi32(mask, _mm512_castps_si512(a.v), _mm512_castps_si512(b.v))); #endif } } constexpr static VectorF32 Rotate(VectorF32<3, Packing> v, VectorF32<4, Packing> q) requires(Len == 3) { VectorF32<3, Packing> qv(q); VectorF32 t = Cross(qv, v) * float(2); return v + t * q.template Shuffle<{{3,3,3,3}}>() + Cross(qv, t); } constexpr static VectorF32<4, 2> RotatePivot(VectorF32<3, Packing> v, VectorF32<4, Packing> q, 
VectorF32<3, Packing> pivot) requires(Len == 3) { VectorF32 translated = v - pivot; VectorF32<3, Packing> qv(q.v); VectorF32 t = Cross(qv, translated) * float(2); VectorF32 rotated = translated + t * q.template Shuffle<{{3,3,3,3}}>() + Cross(qv, t); return rotated + pivot; } constexpr static VectorF32<4, Packing> QuanternionFromEuler(VectorF32<3, Packing> EulerHalf) requires(Len == 4) { std::tuple, VectorF32<3, Packing>> sinCos = EulerHalf.SinCos(); VectorF32<4, Packing> sin = std::get<0>(sinCos); VectorF32<4, Packing> cos = std::get<1>(sinCos); VectorF32<4, Packing> row1 = cos.template Shuffle<{{0,0,0,0}}>(); row1 = Blend<{{0,1,1,1}}>(sin, row1); VectorF32<4, Packing> row2 = cos.template Shuffle<{{1,1,1,1}}>(); row2 = Blend<{{1,0,1,1}}>(sin, row2); row1 *= row2; VectorF32<4, Packing> row3 = cos.template Shuffle<{{2,2,2,2}}>(); row3 = Blend<{{1,1,0,1}}>(sin, row3); row1 *= row3; VectorF32<4, Packing> row4 = sin.template Shuffle<{{0,0,0,0}}>(); row4 = Blend<{{0,1,1,1}}>(cos, row4); VectorF32<4, Packing> row5 = sin.template Shuffle<{{1,1,1,1}}>(); row5 = Blend<{{1,0,1,1}}>(cos, row5); row4 *= row5; VectorF32<4, Packing> row6 = sin.template Shuffle<{{2,2,2,2}}>(); row6 = Blend<{{1,1,0,1}}>(cos, row6); row6 = row6.template Negate<{{true,false,true,false}}>(); row1 = MulitplyAdd(row4, row6, row1); return row1; } }; #elif defined(__wasm_simd128__) // WebAssembly SIMD128 implementation. VectorType is always v128_t and we // cap Len*Packing*sizeof(float) at 16 bytes (i.e. up to 4 floats per // vector) in Common.cppm so a single v128_t covers every instantiation. // Operations without a direct SIMD equivalent (Shuffle with runtime indices, // transcendentals, etc.) round-trip through a float[4] scratch buffer. 
export template struct VectorF32 : public VectorBase { template friend struct VectorF32; using Base = VectorBase; static constexpr std::uint8_t NElems = Base::AlignmentElement; static_assert(NElems == 4, "WASM SIMD VectorF32 assumes 4-lane vectors"); constexpr VectorF32() = default; constexpr VectorF32(v128_t vv) { this->v = vv; } constexpr VectorF32(const float* vB) { Load(vB); } constexpr VectorF32(float val) { this->v = wasm_f32x4_splat(val); } constexpr void Load(const float* vB) { this->v = wasm_v128_load(vB); } constexpr void Store(float* vB) const { wasm_v128_store(vB, this->v); } template constexpr std::array Store() const { std::array r{}; Store(r.data()); return r; } template constexpr operator VectorF32() const { alignas(16) float tmp[4]; wasm_v128_store(tmp, this->v); alignas(16) float out[4] = {0,0,0,0}; const std::uint8_t copyLen = (BLen < Len) ? BLen : Len; const std::uint8_t copyPack = (BPacking < Packing) ? BPacking : Packing; for (std::uint8_t p = 0; p < copyPack; ++p) for (std::uint8_t i = 0; i < copyLen; ++i) out[p * BLen + i] = tmp[p * Len + i]; return VectorF32(wasm_v128_load(out)); } constexpr VectorF32 operator+(VectorF32 b) const { return VectorF32(wasm_f32x4_add(this->v, b.v)); } constexpr VectorF32 operator-(VectorF32 b) const { return VectorF32(wasm_f32x4_sub(this->v, b.v)); } constexpr VectorF32 operator*(VectorF32 b) const { return VectorF32(wasm_f32x4_mul(this->v, b.v)); } constexpr VectorF32 operator/(VectorF32 b) const { return VectorF32(wasm_f32x4_div(this->v, b.v)); } constexpr void operator+=(VectorF32 b) { this->v = wasm_f32x4_add(this->v, b.v); } constexpr void operator-=(VectorF32 b) { this->v = wasm_f32x4_sub(this->v, b.v); } constexpr void operator*=(VectorF32 b) { this->v = wasm_f32x4_mul(this->v, b.v); } constexpr void operator/=(VectorF32 b) { this->v = wasm_f32x4_div(this->v, b.v); } constexpr VectorF32 operator+(float b) const { return *this + VectorF32(b); } constexpr VectorF32 operator-(float b) const { return *this - 
VectorF32(b); } constexpr VectorF32 operator*(float b) const { return *this * VectorF32(b); } constexpr VectorF32 operator/(float b) const { return *this / VectorF32(b); } constexpr void operator+=(float b) { *this += VectorF32(b); } constexpr void operator-=(float b) { *this -= VectorF32(b); } constexpr void operator*=(float b) { *this *= VectorF32(b); } constexpr void operator/=(float b) { *this /= VectorF32(b); } constexpr VectorF32 operator-() const { return VectorF32(wasm_f32x4_neg(this->v)); } constexpr bool operator==(VectorF32 b) const { return wasm_i32x4_bitmask(wasm_f32x4_eq(this->v, b.v)) == 0b1111; } constexpr bool operator!=(VectorF32 b) const { return !(*this == b); } template constexpr VectorF32 ExtractLo() const { alignas(16) float tmp[4]; wasm_v128_store(tmp, this->v); alignas(16) float out[4] = {0,0,0,0}; for (std::uint8_t p = 0; p < Packing; ++p) for (std::uint8_t i = 0; i < ExtractLen; ++i) out[p * ExtractLen + i] = tmp[p * Len + i]; return VectorF32(wasm_v128_load(out)); } constexpr VectorF32 Cos() const { alignas(16) float tmp[4]; wasm_v128_store(tmp, this->v); for (int i = 0; i < 4; ++i) tmp[i] = std::cos(tmp[i]); return VectorF32(wasm_v128_load(tmp)); } constexpr VectorF32 Sin() const { alignas(16) float tmp[4]; wasm_v128_store(tmp, this->v); for (int i = 0; i < 4; ++i) tmp[i] = std::sin(tmp[i]); return VectorF32(wasm_v128_load(tmp)); } constexpr std::tuple, VectorF32> SinCos() const { return { Sin(), Cos() }; } template values> constexpr VectorF32 Negate() const { constexpr auto mask = []() { std::array m{}; for (std::uint8_t p = 0; p < Packing; ++p) for (std::uint8_t i = 0; i < Len; ++i) m[p * Len + i] = values[i] ? 0x80000000u : 0u; return m; }(); v128_t maskVec = wasm_v128_load(mask.data()); return VectorF32(wasm_v128_xor(this->v, maskVec)); } static constexpr VectorF32 MulitplyAdd(VectorF32 a, VectorF32 b, VectorF32 add) { #ifdef __wasm_relaxed_simd__ // Single-rounded FMA (a*b + c). 
Host-defined when FMA hardware is // missing — accuracy may differ from the strict-SIMD wasm path. return VectorF32(wasm_f32x4_relaxed_madd(a.v, b.v, add.v)); #else return VectorF32(wasm_f32x4_add(wasm_f32x4_mul(a.v, b.v), add.v)); #endif } static constexpr VectorF32 MulitplySub(VectorF32 a, VectorF32 b, VectorF32 sub) { #ifdef __wasm_relaxed_simd__ // a*b - c is fused as madd(a, b, -c) — same op count as mul+sub // but one rounding instead of two. return VectorF32(wasm_f32x4_relaxed_madd(a.v, b.v, wasm_f32x4_neg(sub.v))); #else return VectorF32(wasm_f32x4_sub(wasm_f32x4_mul(a.v, b.v), sub.v)); #endif } constexpr static VectorF32 Cross(VectorF32 a, VectorF32 b) requires(Len == 3) { v128_t a_yzx = wasm_i32x4_shuffle(a.v, a.v, 1, 2, 0, 3); v128_t a_zxy = wasm_i32x4_shuffle(a.v, a.v, 2, 0, 1, 3); v128_t b_yzx = wasm_i32x4_shuffle(b.v, b.v, 1, 2, 0, 3); v128_t b_zxy = wasm_i32x4_shuffle(b.v, b.v, 2, 0, 1, 3); #ifdef __wasm_relaxed_simd__ // a_yzx*b_zxy - a_zxy*b_yzx fused as nmadd(a_zxy, b_yzx, a_yzx*b_zxy) // = -(a_zxy*b_yzx) + a_yzx*b_zxy. Replaces a mul+sub pair with a // single FMA. return VectorF32(wasm_f32x4_relaxed_nmadd(a_zxy, b_yzx, wasm_f32x4_mul(a_yzx, b_zxy))); #else return VectorF32(wasm_f32x4_sub(wasm_f32x4_mul(a_yzx, b_zxy), wasm_f32x4_mul(a_zxy, b_yzx))); #endif } template ShuffleValues> constexpr VectorF32 Shuffle() const { alignas(16) float tmp[4]; wasm_v128_store(tmp, this->v); alignas(16) float out[4] = {0,0,0,0}; for (std::uint8_t p = 0; p < Packing; ++p) for (std::uint8_t i = 0; i < Len; ++i) out[p * Len + i] = tmp[p * Len + ShuffleValues[i]]; return VectorF32(wasm_v128_load(out)); } template ShuffleValues> constexpr static VectorF32 Blend(VectorF32 a, VectorF32 b) { constexpr auto mask = []() { std::array m{}; for (std::uint8_t p = 0; p < Packing; ++p) for (std::uint8_t i = 0; i < Len; ++i) m[p * Len + i] = ShuffleValues[i] ? 
0xFFFFFFFFu : 0u; return m; }(); v128_t maskVec = wasm_v128_load(mask.data()); return VectorF32(wasm_v128_bitselect(b.v, a.v, maskVec)); } template requires((std::is_same_v> && ...) && (1 + sizeof...(Rest) == VectorBase::BatchSize)) constexpr static auto LengthSq(VectorF32 first, Rest... rest) { constexpr std::uint8_t N = VectorBase::BatchSize; VectorF32<1, static_cast(Packing * N)> r; std::array, N> args{ first, rest... }; alignas(16) float buf[4] = {0,0,0,0}; for (std::uint8_t i = 0; i < N; ++i) { alignas(16) float tmp[4]; wasm_v128_store(tmp, args[i].v); for (std::uint8_t p = 0; p < Packing; ++p) { float acc = 0.0f; for (std::uint8_t k = 0; k < Len; ++k) { float x = tmp[p * Len + k]; acc += x * x; } buf[i * Packing + p] = acc; } } r.v = wasm_v128_load(buf); return r; } template requires((std::is_same_v> && ...) && (1 + sizeof...(Rest) == VectorBase::BatchSize)) constexpr static auto Length(VectorF32 first, Rest... rest) { auto sq = LengthSq(first, rest...); sq.v = wasm_f32x4_sqrt(sq.v); return sq; } // Pairwise dot products packed into one v128. Only the first Len // lanes contribute, so the same routine handles 3- and 4-component // inputs — the 4th lane of Len==3 inputs may be garbage from Cross() // and must not be summed. Takes BatchSize pairs (== 4 here since // WASM AlignmentElement is always 4 and Packing must be 1). template requires((std::is_same_v> && ...) && (1 + sizeof...(Rest) == 2 * VectorBase::BatchSize) && (Len == 3 || Len == 4) && Packing == 1) constexpr static VectorF32<1, 4> Dot(VectorF32 first, Rest... rest) { constexpr std::uint8_t N = VectorBase::BatchSize; std::array, 2 * N> args{ first, rest... }; alignas(16) float out[4] = {0,0,0,0}; for (std::uint8_t i = 0; i < N; ++i) { alignas(16) float a[4], b[4]; wasm_v128_store(a, args[2 * i].v); wasm_v128_store(b, args[2 * i + 1].v); for (std::uint8_t k = 0; k < Len; ++k) out[i] += a[k] * b[k]; } return VectorF32<1, 4>(wasm_v128_load(out)); } template requires((std::is_same_v> && ...) 
&& (1 + sizeof...(Rest) == VectorBase::BatchSize)) constexpr static auto Normalize(VectorF32 first, Rest... rest) { auto normOne = [](VectorF32 u) { alignas(16) float tmp[4]; wasm_v128_store(tmp, u.v); alignas(16) float out[4] = {0,0,0,0}; for (std::uint8_t p = 0; p < Packing; ++p) { float acc = 0.0f; for (std::uint8_t k = 0; k < Len; ++k) { float x = tmp[p * Len + k]; acc += x * x; } float invLen = acc > 0.0f ? 1.0f / std::sqrt(acc) : 0.0f; for (std::uint8_t k = 0; k < Len; ++k) out[p * Len + k] = tmp[p * Len + k] * invLen; } return VectorF32(wasm_v128_load(out)); }; return std::array, VectorBase::BatchSize>{ normOne(first), normOne(rest)... }; } constexpr static VectorF32 Rotate(VectorF32<3, Packing> v, VectorF32<4, Packing> q) requires(Len == 3) { alignas(16) float qBuf[4]; wasm_v128_store(qBuf, q.v); alignas(16) float qvBuf[4] = {0,0,0,0}; alignas(16) float qwBuf[4] = {0,0,0,0}; for (std::uint8_t p = 0; p < Packing; ++p) { qvBuf[p * 3 + 0] = qBuf[p * 4 + 0]; qvBuf[p * 3 + 1] = qBuf[p * 4 + 1]; qvBuf[p * 3 + 2] = qBuf[p * 4 + 2]; for (std::uint8_t i = 0; i < 3; ++i) qwBuf[p * 3 + i] = qBuf[p * 4 + 3]; } VectorF32<3, Packing> qv(wasm_v128_load(qvBuf)); VectorF32<3, Packing> qwBroadcast(wasm_v128_load(qwBuf)); VectorF32<3, Packing> t = Cross(qv, v) * 2.0f; return v + t * qwBroadcast + Cross(qv, t); } constexpr static VectorF32<3, Packing> RotatePivot(VectorF32<3, Packing> v, VectorF32<4, Packing> q, VectorF32<3, Packing> pivot) requires(Len == 3) { VectorF32<3, Packing> translated = v - pivot; return Rotate(translated, q) + pivot; } constexpr static VectorF32<4, Packing> QuanternionFromEuler(VectorF32<3, Packing> eulerHalf) requires(Len == 4) { alignas(16) float eulerBuf[4]; wasm_v128_store(eulerBuf, eulerHalf.v); alignas(16) float outBuf[4] = {0,0,0,0}; for (std::uint8_t p = 0; p < Packing; ++p) { float roll = eulerBuf[p * 3 + 0]; float pitch = eulerBuf[p * 3 + 1]; float yaw = eulerBuf[p * 3 + 2]; float sr = std::sin(roll), cr = std::cos(roll); float sp = 
std::sin(pitch), cp = std::cos(pitch); float sy = std::sin(yaw), cy = std::cos(yaw); outBuf[p * 4 + 0] = sr * cp * cy - cr * sp * sy; outBuf[p * 4 + 1] = cr * sp * cy + sr * cp * sy; outBuf[p * 4 + 2] = cr * cp * sy - sr * sp * cy; outBuf[p * 4 + 3] = cr * cp * cy + sr * sp * sy; } return VectorF32<4, Packing>(wasm_v128_load(outBuf)); } }; #elif defined(__riscv_vector) // RISC-V V extension implementation. Storage is a GNU vector of 16/32/64 // bytes (picked in Common.cppm from the guaranteed VLEN); native operators // map to vfadd/vfsub/vfmul/vfdiv. Per-element loops compile to vrgather/ // vmerge/vfsqrt when the autovectoriser can see the pattern, and to // scalar fallback otherwise. Hand-tuned intrinsic paths // (e.g. vsetvl + vfwmacc for batched dot) can land incrementally. export template struct VectorF32 : public VectorBase { template friend struct VectorF32; using Base = VectorBase; static constexpr std::uint8_t NElems = Base::AlignmentElement; constexpr VectorF32() = default; constexpr VectorF32(typename Base::VectorType vv) { this->v = vv; } constexpr VectorF32(const float* vB) { Load(vB); } constexpr VectorF32(float val) { for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = val; } constexpr void Load(const float* vB) { for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = vB[i]; } constexpr void Store(float* vB) const { for (std::uint8_t i = 0; i < NElems; ++i) vB[i] = this->v[i]; } template constexpr std::array Store() const { std::array r{}; Store(r.data()); return r; } template constexpr operator VectorF32() const { VectorF32 r; const std::uint8_t copyLen = (BLen < Len) ? BLen : Len; const std::uint8_t copyPack = (BPacking < Packing) ? 
BPacking : Packing; for (std::uint8_t p = 0; p < copyPack; ++p) for (std::uint8_t i = 0; i < copyLen; ++i) r.v[p * BLen + i] = this->v[p * Len + i]; return r; } constexpr VectorF32 operator+(VectorF32 b) const { return VectorF32(this->v + b.v); } constexpr VectorF32 operator-(VectorF32 b) const { return VectorF32(this->v - b.v); } constexpr VectorF32 operator*(VectorF32 b) const { return VectorF32(this->v * b.v); } constexpr VectorF32 operator/(VectorF32 b) const { return VectorF32(this->v / b.v); } constexpr void operator+=(VectorF32 b) { this->v = this->v + b.v; } constexpr void operator-=(VectorF32 b) { this->v = this->v - b.v; } constexpr void operator*=(VectorF32 b) { this->v = this->v * b.v; } constexpr void operator/=(VectorF32 b) { this->v = this->v / b.v; } constexpr VectorF32 operator+(float b) const { return *this + VectorF32(b); } constexpr VectorF32 operator-(float b) const { return *this - VectorF32(b); } constexpr VectorF32 operator*(float b) const { return *this * VectorF32(b); } constexpr VectorF32 operator/(float b) const { return *this / VectorF32(b); } constexpr void operator+=(float b) { *this += VectorF32(b); } constexpr void operator-=(float b) { *this -= VectorF32(b); } constexpr void operator*=(float b) { *this *= VectorF32(b); } constexpr void operator/=(float b) { *this /= VectorF32(b); } constexpr VectorF32 operator-() const { return VectorF32(-this->v); } constexpr bool operator==(VectorF32 b) const { for (std::uint8_t p = 0; p < Packing; ++p) for (std::uint8_t i = 0; i < Len; ++i) if (this->v[p * Len + i] != b.v[p * Len + i]) return false; return true; } constexpr bool operator!=(VectorF32 b) const { return !(*this == b); } template constexpr VectorF32 ExtractLo() const { VectorF32 r; for (std::uint8_t p = 0; p < Packing; ++p) for (std::uint8_t i = 0; i < ExtractLen; ++i) r.v[p * ExtractLen + i] = this->v[p * Len + i]; return r; } constexpr VectorF32 Cos() const { VectorF32 r; for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = 
std::cos(this->v[i]); return r; } constexpr VectorF32 Sin() const { VectorF32 r; for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = std::sin(this->v[i]); return r; } constexpr std::tuple, VectorF32> SinCos() const { return { Sin(), Cos() }; } template values> constexpr VectorF32 Negate() const { VectorF32 r; for (std::uint8_t p = 0; p < Packing; ++p) for (std::uint8_t i = 0; i < Len; ++i) r.v[p * Len + i] = values[i] ? -this->v[p * Len + i] : this->v[p * Len + i]; return r; } // a*b + c — the compiler fuses to vfmacc.vv under the default // -ffp-contract=on. No explicit intrinsic needed. static constexpr VectorF32 MulitplyAdd(VectorF32 a, VectorF32 b, VectorF32 add) { return VectorF32(a.v * b.v + add.v); } static constexpr VectorF32 MulitplySub(VectorF32 a, VectorF32 b, VectorF32 sub) { return VectorF32(a.v * b.v - sub.v); } constexpr static VectorF32 Cross(VectorF32 a, VectorF32 b) requires(Len == 3) { VectorF32 r; for (std::uint8_t p = 0; p < Packing; ++p) { const std::uint8_t base = p * 3; r.v[base + 0] = a.v[base + 1] * b.v[base + 2] - a.v[base + 2] * b.v[base + 1]; r.v[base + 1] = a.v[base + 2] * b.v[base + 0] - a.v[base + 0] * b.v[base + 2]; r.v[base + 2] = a.v[base + 0] * b.v[base + 1] - a.v[base + 1] * b.v[base + 0]; } return r; } template ShuffleValues> constexpr VectorF32 Shuffle() const { VectorF32 r; for (std::uint8_t p = 0; p < Packing; ++p) for (std::uint8_t i = 0; i < Len; ++i) r.v[p * Len + i] = this->v[p * Len + ShuffleValues[i]]; return r; } template ShuffleValues> constexpr static VectorF32 Blend(VectorF32 a, VectorF32 b) { VectorF32 r; for (std::uint8_t p = 0; p < Packing; ++p) for (std::uint8_t i = 0; i < Len; ++i) r.v[p * Len + i] = ShuffleValues[i] ? b.v[p * Len + i] : a.v[p * Len + i]; return r; } template requires((std::is_same_v> && ...) && (1 + sizeof...(Rest) == VectorBase::BatchSize)) constexpr static auto LengthSq(VectorF32 first, Rest... 
rest) { constexpr std::uint8_t N = VectorBase::BatchSize; VectorF32<1, static_cast(Packing * N)> r; std::array, N> args{ first, rest... }; for (std::uint8_t i = 0; i < N; ++i) for (std::uint8_t p = 0; p < Packing; ++p) { float acc = 0.0f; for (std::uint8_t k = 0; k < Len; ++k) { float x = args[i].v[p * Len + k]; acc += x * x; } r.v[i * Packing + p] = acc; } return r; } template requires((std::is_same_v> && ...) && (1 + sizeof...(Rest) == VectorBase::BatchSize)) constexpr static auto Length(VectorF32 first, Rest... rest) { auto sq = LengthSq(first, rest...); for (std::uint8_t i = 0; i < decltype(sq)::NElems; ++i) sq.v[i] = std::sqrt(sq.v[i]); return sq; } // Pairwise dot products across BatchSize pairs. The 4th lane of Len==3 // inputs may carry garbage from Cross(), so only the first Len lanes // are summed per pair. template requires((std::is_same_v> && ...) && (1 + sizeof...(Rest) == 2 * VectorBase::BatchSize)) constexpr static auto Dot(VectorF32 first, Rest... rest) { constexpr std::uint8_t N = VectorBase::BatchSize; VectorF32<1, static_cast(Packing * N)> r; std::array, 2 * N> args{ first, rest... }; for (std::uint8_t i = 0; i < N; ++i) for (std::uint8_t p = 0; p < Packing; ++p) { float acc = 0.0f; for (std::uint8_t k = 0; k < Len; ++k) acc += args[2 * i].v[p * Len + k] * args[2 * i + 1].v[p * Len + k]; r.v[i * Packing + p] = acc; } return r; } template requires((std::is_same_v> && ...) && (1 + sizeof...(Rest) == VectorBase::BatchSize)) constexpr static auto Normalize(VectorF32 first, Rest... rest) { auto normOne = [](VectorF32 u) { VectorF32 out; for (std::uint8_t p = 0; p < Packing; ++p) { float acc = 0.0f; for (std::uint8_t k = 0; k < Len; ++k) { float x = u.v[p * Len + k]; acc += x * x; } float invLen = acc > 0.0f ? 1.0f / std::sqrt(acc) : 0.0f; for (std::uint8_t k = 0; k < Len; ++k) out.v[p * Len + k] = u.v[p * Len + k] * invLen; } return out; }; return std::array, VectorBase::BatchSize>{ normOne(first), normOne(rest)... 
}; } constexpr static VectorF32 Rotate(VectorF32<3, Packing> v, VectorF32<4, Packing> q) requires(Len == 3) { VectorF32<3, Packing> qv; VectorF32<3, Packing> qwBroadcast; for (std::uint8_t p = 0; p < Packing; ++p) { qv.v[p * 3 + 0] = q.v[p * 4 + 0]; qv.v[p * 3 + 1] = q.v[p * 4 + 1]; qv.v[p * 3 + 2] = q.v[p * 4 + 2]; for (std::uint8_t i = 0; i < 3; ++i) qwBroadcast.v[p * 3 + i] = q.v[p * 4 + 3]; } VectorF32<3, Packing> t = Cross(qv, v) * 2.0f; return v + t * qwBroadcast + Cross(qv, t); } constexpr static VectorF32<3, Packing> RotatePivot(VectorF32<3, Packing> v, VectorF32<4, Packing> q, VectorF32<3, Packing> pivot) requires(Len == 3) { VectorF32<3, Packing> translated = v - pivot; return Rotate(translated, q) + pivot; } constexpr static VectorF32<4, Packing> QuanternionFromEuler(VectorF32<3, Packing> eulerHalf) requires(Len == 4) { VectorF32<4, Packing> r; for (std::uint8_t p = 0; p < Packing; ++p) { float roll = eulerHalf.v[p * 3 + 0]; float pitch = eulerHalf.v[p * 3 + 1]; float yaw = eulerHalf.v[p * 3 + 2]; float sr = std::sin(roll), cr = std::cos(roll); float sp = std::sin(pitch), cp = std::cos(pitch); float sy = std::sin(yaw), cy = std::cos(yaw); r.v[p * 4 + 0] = sr * cp * cy - cr * sp * sy; r.v[p * 4 + 1] = cr * sp * cy + sr * cp * sy; r.v[p * 4 + 2] = cr * cp * sy - sr * sp * cy; r.v[p * 4 + 3] = cr * cp * cy + sr * sp * sy; } return r; } }; #else // Scalar software fallback for non-x86_64 targets. Future arches can swap // in their own intrinsic implementation by adding an arch-specific branch // above and gating this one out. 
// Portable VectorF32: every operation below is a plain loop over the elements
// of this->v; no intrinsics are used in this struct.
// NOTE(review): several template parameter/argument lists appear to be missing
// from this copy of the file (for example `template struct VectorF32`,
// `static_cast(vB[i])`, and the `i=0;iv[i]` loops in the compound-assignment
// operators) - restore them from the upstream source before building.
// NOTE(review): the struct before the #else also provides a pairwise Dot();
// this struct defines none - confirm whether that omission is intentional.
export template struct VectorF32 : public VectorBase {
    template friend struct VectorF32;
    using Base = VectorBase;
    // Element count used by every whole-vector loop in this struct.
    static constexpr std::uint8_t NElems = Base::AlignmentElement;

    constexpr VectorF32() = default;
    // Wraps an existing storage value.
    constexpr VectorF32(typename Base::VectorType vv) { this->v = vv; }
    // Reads NElems floats starting at vB (see Load).
    constexpr VectorF32(const float* vB) { Load(vB); }
#ifdef __FLT16_MAX__
    // Half-precision overload; compiled only when __FLT16_MAX__ is defined.
    constexpr VectorF32(const _Float16* vB) { Load(vB); }
#endif
    // Broadcasts val into all NElems elements.
    constexpr VectorF32(float val) {
        for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = val;
    }

    // Copies NElems floats from vB, so vB must point at least that many.
    constexpr void Load(const float* vB) {
        for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = vB[i];
    }
    // Writes NElems floats to vB, so vB must have room for that many.
    constexpr void Store(float* vB) const {
        for (std::uint8_t i = 0; i < NElems; ++i) vB[i] = this->v[i];
    }
#ifdef __FLT16_MAX__
    // Same as the float overloads, converting each element with static_cast.
    constexpr void Load(const _Float16* vB) {
        for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = static_cast(vB[i]);
    }
    constexpr void Store(_Float16* vB) const {
        for (std::uint8_t i = 0; i < NElems; ++i) vB[i] = static_cast<_Float16>(this->v[i]);
    }
#endif

    // Returns the elements by value: a value-initialized std::array is filled
    // through the pointer-taking Store overload.
    template constexpr std::array Store() const {
        std::array r{};
        Store(r.data());
        return r;
    }

    // Converts to a VectorF32 with BLen components and BPacking packed
    // vectors. Copies min(BLen, Len) components for each of
    // min(BPacking, Packing) packed vectors, re-striding from Len to BLen.
    // NOTE(review): r is default-constructed, not value-initialized, so any
    // element not written by the loop may be indeterminate - confirm against
    // VectorBase.
    template constexpr operator VectorF32() const {
        VectorF32 r;
        const std::uint8_t copyLen = (BLen < Len) ? BLen : Len;
        const std::uint8_t copyPack = (BPacking < Packing) ? BPacking : Packing;
        for (std::uint8_t p = 0; p < copyPack; ++p)
            for (std::uint8_t i = 0; i < copyLen; ++i)
                r.v[p * BLen + i] = this->v[p * Len + i];
        return r;
    }

    // Element-wise arithmetic over all NElems elements.
    constexpr VectorF32 operator+(VectorF32 b) const {
        VectorF32 r;
        for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] + b.v[i];
        return r;
    }
    constexpr VectorF32 operator-(VectorF32 b) const {
        VectorF32 r;
        for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] - b.v[i];
        return r;
    }
    constexpr VectorF32 operator*(VectorF32 b) const {
        VectorF32 r;
        for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] * b.v[i];
        return r;
    }
    constexpr VectorF32 operator/(VectorF32 b) const {
        VectorF32 r;
        for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] / b.v[i];
        return r;
    }

    // In-place element-wise arithmetic.
    // NOTE(review): the loop headers below are truncated in this copy
    // (`i=0;iv[i]`) - presumably they iterate i over NElems like the
    // operators above; verify against upstream.
    constexpr void operator+=(VectorF32 b) { for (std::uint8_t i=0;iv[i] += b.v[i]; }
    constexpr void operator-=(VectorF32 b) { for (std::uint8_t i=0;iv[i] -= b.v[i]; }
    constexpr void operator*=(VectorF32 b) { for (std::uint8_t i=0;iv[i] *= b.v[i]; }
    constexpr void operator/=(VectorF32 b) { for (std::uint8_t i=0;iv[i] /= b.v[i]; }

    // Scalar overloads: broadcast b with VectorF32(float), then reuse the
    // vector operators.
    constexpr VectorF32 operator+(float b) const { return *this + VectorF32(b); }
    constexpr VectorF32 operator-(float b) const { return *this - VectorF32(b); }
    constexpr VectorF32 operator*(float b) const { return *this * VectorF32(b); }
    constexpr VectorF32 operator/(float b) const { return *this / VectorF32(b); }
    constexpr void operator+=(float b) { *this += VectorF32(b); }
    constexpr void operator-=(float b) { *this -= VectorF32(b); }
    constexpr void operator*=(float b) { *this *= VectorF32(b); }
    constexpr void operator/=(float b) { *this /= VectorF32(b); }

    // Unary minus over all NElems elements.
    constexpr VectorF32 operator-() const {
        VectorF32 r;
        for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = -this->v[i];
        return r;
    }

    // Exact float comparison of the Len * Packing elements only; elements at
    // or beyond index Len * Packing (if NElems is larger) are ignored.
    constexpr bool operator==(VectorF32 b) const {
        for (std::uint8_t p = 0; p < Packing; ++p)
            for (std::uint8_t i = 0; i < Len; ++i)
                if (this->v[p * Len + i] != b.v[p * Len + i]) return false;
        return true;
    }
    constexpr bool operator!=(VectorF32 b) const { return !(*this == b); }

    // Keeps the first ExtractLen components of each packed vector, re-striding
    // from Len to ExtractLen.
    template constexpr VectorF32 ExtractLo() const {
        VectorF32 r;
        for (std::uint8_t p = 0; p < Packing; ++p)
            for (std::uint8_t i = 0; i < ExtractLen; ++i)
                r.v[p * ExtractLen + i] = this->v[p * Len + i];
        return r;
    }

    // Element-wise std::cos / std::sin over all NElems elements.
    constexpr VectorF32 Cos() const {
        VectorF32 r;
        for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = std::cos(this->v[i]);
        return r;
    }
    constexpr VectorF32 Sin() const {
        VectorF32 r;
        for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = std::sin(this->v[i]);
        return r;
    }
    // Returns the pair in (sin, cos) order.
    constexpr std::tuple, VectorF32> SinCos() const { return { Sin(), Cos() }; }

    // Flips the sign of component i of every packed vector where values[i] is
    // true; other components are copied unchanged.
    template values> constexpr VectorF32 Negate() const {
        VectorF32 r;
        for (std::uint8_t p = 0; p < Packing; ++p)
            for (std::uint8_t i = 0; i < Len; ++i)
                r.v[p * Len + i] = values[i] ? -this->v[p * Len + i] : this->v[p * Len + i];
        return r;
    }

    // Element-wise a * b + add, written as a separate multiply and add.
    static constexpr VectorF32 MulitplyAdd(VectorF32 a, VectorF32 b, VectorF32 add) {
        VectorF32 r;
        for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = a.v[i] * b.v[i] + add.v[i];
        return r;
    }
    // Element-wise a * b - sub.
    static constexpr VectorF32 MulitplySub(VectorF32 a, VectorF32 b, VectorF32 sub) {
        VectorF32 r;
        for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = a.v[i] * b.v[i] - sub.v[i];
        return r;
    }

    // 3D cross product a x b, computed independently for each packed vector.
    constexpr static VectorF32 Cross(VectorF32 a, VectorF32 b) requires(Len == 3) {
        VectorF32 r;
        for (std::uint8_t p = 0; p < Packing; ++p) {
            const std::uint8_t base = p * 3;
            r.v[base + 0] = a.v[base + 1] * b.v[base + 2] - a.v[base + 2] * b.v[base + 1];
            r.v[base + 1] = a.v[base + 2] * b.v[base + 0] - a.v[base + 0] * b.v[base + 2];
            r.v[base + 2] = a.v[base + 0] * b.v[base + 1] - a.v[base + 1] * b.v[base + 0];
        }
        return r;
    }

    // Permutes components inside each packed vector: output component i is
    // input component ShuffleValues[i] of the same packed vector.
    template ShuffleValues> constexpr VectorF32 Shuffle() const {
        VectorF32 r;
        for (std::uint8_t p = 0; p < Packing; ++p)
            for (std::uint8_t i = 0; i < Len; ++i)
                r.v[p * Len + i] = this->v[p * Len + ShuffleValues[i]];
        return r;
    }

    // Per-component select: takes b where ShuffleValues[i] is truthy, else a.
    template ShuffleValues> constexpr static VectorF32 Blend(VectorF32 a, VectorF32 b) {
        VectorF32 r;
        for (std::uint8_t p = 0; p < Packing; ++p)
            for (std::uint8_t i = 0; i < Len; ++i)
                r.v[p * Len + i] = ShuffleValues[i] ? b.v[p * Len + i] : a.v[p * Len + i];
        return r;
    }

    // Squared lengths for a batch of exactly BatchSize vectors (enforced by
    // the requires-clause). Result element i * Packing + p is the sum of
    // squares of the Len components of packed vector p of argument i.
    template requires((std::is_same_v> && ...) && (1 + sizeof...(Rest) == VectorBase::BatchSize))
    constexpr static auto LengthSq(VectorF32 first, Rest... rest) {
        constexpr std::uint8_t N = VectorBase::BatchSize;
        VectorF32<1, static_cast(Packing * N)> r;
        std::array, N> args{ first, rest... };
        for (std::uint8_t i = 0; i < N; ++i)
            for (std::uint8_t p = 0; p < Packing; ++p) {
                float acc = 0.0f;
                for (std::uint8_t k = 0; k < Len; ++k) {
                    float x = args[i].v[p * Len + k];
                    acc += x * x;
                }
                r.v[i * Packing + p] = acc;
            }
        return r;
    }

    // Lengths for the same batch: LengthSq followed by std::sqrt on every
    // element of the result type (its NElems, not just Packing * N).
    template requires((std::is_same_v> && ...) && (1 + sizeof...(Rest) == VectorBase::BatchSize))
    constexpr static auto Length(VectorF32 first, Rest... rest) {
        auto sq = LengthSq(first, rest...);
        for (std::uint8_t i = 0; i < decltype(sq)::NElems; ++i) sq.v[i] = std::sqrt(sq.v[i]);
        return sq;
    }

    // Normalizes each argument and returns them as a std::array in argument
    // order.
    template requires((std::is_same_v> && ...) && (1 + sizeof...(Rest) == VectorBase::BatchSize))
    constexpr static auto Normalize(VectorF32 first, Rest... rest) {
        auto normOne = [](VectorF32 u) {
            VectorF32 out;
            for (std::uint8_t p = 0; p < Packing; ++p) {
                float acc = 0.0f;
                for (std::uint8_t k = 0; k < Len; ++k) {
                    float x = u.v[p * Len + k];
                    acc += x * x;
                }
                // A zero-length vector is scaled by 0 instead of dividing by zero.
                float invLen = acc > 0.0f ? 1.0f / std::sqrt(acc) : 0.0f;
                for (std::uint8_t k = 0; k < Len; ++k) out.v[p * Len + k] = u.v[p * Len + k] * invLen;
            }
            return out;
        };
        return std::array, VectorBase::BatchSize>{ normOne(first), normOne(rest)... };
    }

    // Rotates v by quaternion q, read per packed vector as three vector
    // components at q[0..2] and a scalar at q[3]:
    //   t = 2 * cross(q.xyz, v);  result = v + q.w * t + cross(q.xyz, t)
    // NOTE(review): this formula presumably expects q to be normalized -
    // confirm with callers.
    constexpr static VectorF32 Rotate(VectorF32<3, Packing> v, VectorF32<4, Packing> q) requires(Len == 3) {
        VectorF32<3, Packing> qv;
        VectorF32<3, Packing> qwBroadcast;
        for (std::uint8_t p = 0; p < Packing; ++p) {
            qv.v[p * 3 + 0] = q.v[p * 4 + 0];
            qv.v[p * 3 + 1] = q.v[p * 4 + 1];
            qv.v[p * 3 + 2] = q.v[p * 4 + 2];
            // Replicate the scalar part so it can be applied with operator*.
            for (std::uint8_t i = 0; i < 3; ++i) qwBroadcast.v[p * 3 + i] = q.v[p * 4 + 3];
        }
        VectorF32<3, Packing> t = Cross(qv, v) * 2.0f;
        return v + t * qwBroadcast + Cross(qv, t);
    }

    // Rotates v about pivot: translate by -pivot, Rotate, translate back.
    constexpr static VectorF32<3, Packing> RotatePivot(VectorF32<3, Packing> v, VectorF32<4, Packing> q, VectorF32<3, Packing> pivot) requires(Len == 3) {
        VectorF32<3, Packing> translated = v - pivot;
        return Rotate(translated, q) + pivot;
    }

    // Builds one quaternion per packed vector from three angles read as
    // (roll, pitch, yaw). The angles go into sin/cos unchanged.
    // NOTE(review): judging by the parameter name, callers are expected to
    // pass half-angles - confirm. The scalar term is written to component 3,
    // which matches how Rotate reads q.
    constexpr static VectorF32<4, Packing> QuanternionFromEuler(VectorF32<3, Packing> eulerHalf) requires(Len == 4) {
        VectorF32<4, Packing> r;
        for (std::uint8_t p = 0; p < Packing; ++p) {
            float roll = eulerHalf.v[p * 3 + 0];
            float pitch = eulerHalf.v[p * 3 + 1];
            float yaw = eulerHalf.v[p * 3 + 2];
            float sr = std::sin(roll), cr = std::cos(roll);
            float sp = std::sin(pitch), cp = std::cos(pitch);
            float sy = std::sin(yaw), cy = std::cos(yaw);
            r.v[p * 4 + 0] = sr * cp * cy - cr * sp * sy;
            r.v[p * 4 + 1] = cr * sp * cy + sr * cp * sy;
            r.v[p * 4 + 2] = cr * cp * sy - sr * sp * cy;
            r.v[p * 4 + 3] = cr * cp * cy + sr * sp * sy;
        }
        return r;
    }
};
#endif
} // namespace Crafter

// std::format support for VectorF32. The output is one outer brace pair
// holding one inner brace group per packed vector, with components separated
// by "," and no separator between groups, e.g. "{{1,2,3}{4,5,6}}".
// The assembled string is then passed to the base formatter's format().
export template struct std::formatter> : std::formatter {
    constexpr auto format(const Crafter::VectorF32& obj, format_context& ctx) const {
        std::array::AlignmentElement> vec = obj.template Store();
        std::string out = "{";
        for(std::uint32_t i = 0; i < Packing; i++) {
            out += "{";
            for(std::uint32_t i2 = 0; i2 < Len; i2++) {
                out += std::format("{}", static_cast(vec[i * Len + i2]));
                // Comma between components only, never after the last one.
                if (i2 + 1 < Len) out += ",";
            }
            out += "}";
        }
        out += "}";
        return std::formatter::format(out, ctx);
    }
};