Crafter.Math/interfaces/Crafter.Math-VectorF32.cppm

1978 lines
No EOL
106 KiB
C++
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
Crafter®.Math
Copyright (C) 2026 Catcrafts®
catcrafts.net
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License version 3.0 as published by the Free Software Foundation;
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
module;
#ifdef __x86_64
#include <immintrin.h>
#endif
#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#endif
export module Crafter.Math:VectorF32;
import std;
import :Common;
namespace Crafter {
#ifdef __x86_64
export template <std::uint8_t Len, std::uint8_t Packing>
struct VectorF32 : public VectorBase<Len, Packing, float> {
template <std::uint8_t Len2, std::uint8_t Packing2>
friend struct VectorF32;
// Defaulted constructor: nothing is initialised explicitly here; whatever
// default construction VectorBase performs is what applies.
constexpr VectorF32() = default;
constexpr VectorF32(VectorBase<Len, Packing, float>::VectorType v) {
this->v = v;
}
constexpr VectorF32(const float* vB) {
Load(vB);
};
constexpr VectorF32(const _Float16* vB) {
Load(vB);
};
constexpr VectorF32(float val) {
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
this->v = _mm_set1_ps(val);
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
this->v = _mm256_set1_ps(val);
} else {
this->v = _mm512_set1_ps(val);
}
};
constexpr void Load(const float* vB) {
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
this->v = _mm_loadu_ps(vB);
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
this->v = _mm256_loadu_ps(vB);
} else {
this->v = _mm512_loadu_ps(vB);
}
}
constexpr void Store(float* vB) const {
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
_mm_storeu_ps(vB, this->v);
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
_mm256_storeu_ps(vB, this->v);
} else {
_mm512_storeu_ps(vB, this->v);
}
}
constexpr void Load(const _Float16* vB) {
#ifdef __F16C__
if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
this->v = _mm_cvtph_ps(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(vB)));
} else if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
this->v = _mm256_cvtph_ps(_mm_loadu_si128(reinterpret_cast<const __m128i*>(vB)));
} else {
this->v = _mm512_cvtph_ps(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(vB)));
}
#else
alignas(64) float tmp[Len];
for (int i = 0; i < Len; ++i)
tmp[i] = static_cast<float>(vB[i]);
if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
this->v = _mm_load_ps(tmp);
} else if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
this->v = _mm256_load_ps(tmp);
} else {
this->v = _mm512_load_ps(tmp);
}
#endif
}
// Narrows every lane to half precision and writes the results to vB
// (4, 8 or 16 values depending on the register type).
// With F16C the hardware conversion (round-to-nearest) is used; otherwise the
// register is spilled to a float staging buffer and converted one by one.
constexpr void Store(_Float16* vB) const {
    using Base = VectorBase<Len, Packing, float>;
    using Reg = typename Base::VectorType;
#ifdef __F16C__
    if constexpr (std::is_same_v<Reg, __m128>) {
        // 4 halves = 64 bits.
        _mm_storel_epi64(reinterpret_cast<__m128i*>(vB), _mm_cvtps_ph(this->v, _MM_FROUND_TO_NEAREST_INT));
    } else if constexpr (std::is_same_v<Reg, __m256>) {
        _mm_storeu_si128(reinterpret_cast<__m128i*>(vB), _mm256_cvtps_ph(this->v, _MM_FROUND_TO_NEAREST_INT));
    } else {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(vB), _mm512_cvtps_ph(this->v, _MM_FROUND_TO_NEAREST_INT));
    }
#else
    // The staging buffer must hold the full register: the aligned stores below
    // write every lane, so a Len-sized buffer would be overrun whenever Len is
    // smaller than the register width.
    constexpr std::size_t laneCount = Base::AlignmentElement;
    alignas(64) float tmp[laneCount];
    if constexpr (std::is_same_v<Reg, __m128>) {
        _mm_store_ps(tmp, this->v);
    } else if constexpr (std::is_same_v<Reg, __m256>) {
        _mm256_store_ps(tmp, this->v);
    } else {
        _mm512_store_ps(tmp, this->v);
    }
    // Write every lane, matching what the F16C path stores.
    for (std::size_t i = 0; i < laneCount; ++i)
        vB[i] = static_cast<_Float16>(tmp[i]);
#endif
}
// Returns the register contents as an array of AlignmentElement values of
// type T, using the pointer-taking Store overload that matches T*.
template<typename T>
constexpr std::array<T, VectorBase<Len, Packing, float>::AlignmentElement> Store() const {
    std::array<T, VectorBase<Len, Packing, float>::AlignmentElement> lanes;
    this->Store(lanes.data());
    return lanes;
}
// Converts this vector into a VectorF32 with a different Len and/or Packing.
//  * Same Len: the register is reinterpreted at the destination register
//    width. A widening cast leaves the added upper lanes undefined.
//  * Smaller BLen: delegates to ExtractLo<BLen>().
//  * Larger BLen: lanes are rearranged with the extract-lo masks provided by
//    VectorBase and placed in the destination register.
template <std::uint8_t BLen, std::uint8_t BPacking>
constexpr operator VectorF32<BLen, BPacking>() const {
    using SrcBase = VectorBase<Len, Packing, float>;
    using SrcReg = typename SrcBase::VectorType;
    using DstReg = typename VectorBase<BLen, BPacking, float>::VectorType;
    using Dst = VectorF32<BLen, BPacking>;
    // NOTE(review): several masks below are taken from VectorBase<BLen, Packing>
    // (destination length, source packing), exactly as in the original code.
    // Confirm against VectorBase that this is the intended mask provider.
    using MixedBase = VectorBase<BLen, Packing, float>;
    if constexpr (Len == BLen) {
        // Each test pairs the SOURCE register type with the DESTINATION one.
        if constexpr (std::is_same_v<SrcReg, __m256> && std::is_same_v<DstReg, __m128>) {
            return Dst(_mm256_castps256_ps128(this->v));
        } else if constexpr (std::is_same_v<SrcReg, __m512> && std::is_same_v<DstReg, __m128>) {
            return Dst(_mm512_castps512_ps128(this->v));
        } else if constexpr (std::is_same_v<SrcReg, __m512> && std::is_same_v<DstReg, __m256>) {
            return Dst(_mm512_castps512_ps256(this->v));
        } else if constexpr (std::is_same_v<SrcReg, __m128> && std::is_same_v<DstReg, __m256>) {
            return Dst(_mm256_castps128_ps256(this->v));
        } else if constexpr (std::is_same_v<SrcReg, __m128> && std::is_same_v<DstReg, __m512>) {
            return Dst(_mm512_castps128_ps512(this->v));
        } else if constexpr (std::is_same_v<SrcReg, __m256> && std::is_same_v<DstReg, __m512>) {
            return Dst(_mm512_castps256_ps512(this->v));
        } else {
            // Same register type on both sides: plain copy.
            return Dst(this->v);
        }
    } else if constexpr (BLen <= Len) {
        return this->template ExtractLo<BLen>();
    } else {
        if constexpr (std::is_same_v<DstReg, __m128>) {
            if constexpr (std::is_same_v<SrcReg, __m128>) {
                constexpr std::array<std::uint8_t, SrcBase::Alignment> shuffleMask = SrcBase::template GetExtractLoMaskEpi8<BLen>();
                const __m128i shuffleVec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(shuffleMask.data()));
                return Dst(_mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(this->v), shuffleVec)));
            } else if constexpr (std::is_same_v<SrcReg, __m256>) {
                constexpr std::array<std::uint32_t, SrcBase::AlignmentElement> permMask = SrcBase::template GetExtractLoMaskEpi32<BLen>();
                const __m256i permIdx = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(permMask.data()));
                // AVX2 cross-lane permute; note the (data, index) argument order.
                const __m256i result = _mm256_permutevar8x32_epi32(_mm256_castps_si256(this->v), permIdx);
                return Dst(_mm_castsi128_ps(_mm256_castsi256_si128(result)));
#ifdef __AVX512F__
            } else {
                constexpr std::array<std::uint32_t, SrcBase::AlignmentElement> permMask = SrcBase::template GetExtractLoMaskEpi32<BLen>();
                const __m512i permIdx = _mm512_loadu_epi32(permMask.data());
                const __m512i result = _mm512_permutexvar_epi32(permIdx, _mm512_castps_si512(this->v));
                return Dst(_mm_castsi128_ps(_mm512_castsi512_si128(result)));
#endif
            }
        } else if constexpr (std::is_same_v<DstReg, __m256>) {
            if constexpr (std::is_same_v<SrcReg, __m128>) {
                constexpr std::array<std::uint32_t, MixedBase::AlignmentElement> permMask = MixedBase::template GetExtractLoMaskEpi32<BLen>();
                const __m256i permIdx = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(permMask.data()));
                const __m256i result = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(_mm_castps_si128(this->v)), permIdx);
                return Dst(_mm256_castsi256_ps(result));
            } else if constexpr (std::is_same_v<SrcReg, __m256>) {
                constexpr std::array<std::uint32_t, MixedBase::AlignmentElement> permMask = MixedBase::template GetExtractLoMaskEpi32<BLen>();
                const __m256i permIdx = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(permMask.data()));
                const __m256i result = _mm256_permutevar8x32_epi32(_mm256_castps_si256(this->v), permIdx);
                return Dst(_mm256_castsi256_ps(result));
#ifdef __AVX512F__
            } else {
                // 512-bit source, 256-bit destination: only the low 256 bits of
                // the source can be addressed by an 8-lane index vector, so the
                // source is truncated first and permuted at 256-bit width.
                constexpr std::array<std::uint32_t, MixedBase::AlignmentElement> permMask = MixedBase::template GetExtractLoMaskEpi32<BLen>();
                const __m256i permIdx = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(permMask.data()));
                const __m256i result = _mm256_permutevar8x32_epi32(_mm512_castsi512_si256(_mm512_castps_si512(this->v)), permIdx);
                return Dst(_mm256_castsi256_ps(result));
#endif
            }
#ifdef __AVX512F__
        } else {
            // 512-bit destination: widen the source (if needed) and permute.
            if constexpr (std::is_same_v<SrcReg, __m128>) {
                constexpr std::array<std::uint32_t, MixedBase::AlignmentElement> permMask = MixedBase::template GetExtractLoMaskEpi32<BLen>();
                const __m512i permIdx = _mm512_loadu_epi32(permMask.data());
                const __m512i result = _mm512_permutexvar_epi32(permIdx, _mm512_castsi128_si512(_mm_castps_si128(this->v)));
                return Dst(_mm512_castsi512_ps(result));
            } else if constexpr (std::is_same_v<SrcReg, __m256>) {
                constexpr std::array<std::uint32_t, MixedBase::AlignmentElement> permMask = MixedBase::template GetExtractLoMaskEpi32<BLen>();
                const __m512i permIdx = _mm512_loadu_epi32(permMask.data());
                const __m512i result = _mm512_permutexvar_epi32(permIdx, _mm512_castsi256_si512(_mm256_castps_si256(this->v)));
                return Dst(_mm512_castsi512_ps(result));
            } else {
                constexpr std::array<std::uint32_t, MixedBase::AlignmentElement> permMask = MixedBase::template GetExtractLoMaskEpi32<BLen>();
                const __m512i permIdx = _mm512_loadu_epi32(permMask.data());
                const __m512i result = _mm512_permutexvar_epi32(permIdx, _mm512_castps_si512(this->v));
                return Dst(_mm512_castsi512_ps(result));
            }
#endif
        }
    }
}
// Lane-wise sum of this vector and b.
constexpr VectorF32<Len, Packing> operator+(VectorF32<Len, Packing> b) const {
    using Reg = typename VectorBase<Len, Packing, float>::VectorType;
    if constexpr (std::is_same_v<Reg, __m128>) {
        return VectorF32(_mm_add_ps(this->v, b.v));
    } else if constexpr (std::is_same_v<Reg, __m256>) {
        return VectorF32(_mm256_add_ps(this->v, b.v));
    } else {
        return VectorF32(_mm512_add_ps(this->v, b.v));
    }
}
// Lane-wise difference: this vector minus b.
constexpr VectorF32<Len, Packing> operator-(VectorF32<Len, Packing> b) const {
    using Reg = typename VectorBase<Len, Packing, float>::VectorType;
    if constexpr (std::is_same_v<Reg, __m128>) {
        return VectorF32(_mm_sub_ps(this->v, b.v));
    } else if constexpr (std::is_same_v<Reg, __m256>) {
        return VectorF32(_mm256_sub_ps(this->v, b.v));
    } else {
        return VectorF32(_mm512_sub_ps(this->v, b.v));
    }
}
// Lane-wise product of this vector and b.
constexpr VectorF32<Len, Packing> operator*(VectorF32<Len, Packing> b) const {
    using Reg = typename VectorBase<Len, Packing, float>::VectorType;
    if constexpr (std::is_same_v<Reg, __m128>) {
        return VectorF32(_mm_mul_ps(this->v, b.v));
    } else if constexpr (std::is_same_v<Reg, __m256>) {
        return VectorF32(_mm256_mul_ps(this->v, b.v));
    } else {
        return VectorF32(_mm512_mul_ps(this->v, b.v));
    }
}
// Lane-wise quotient: this vector divided by b.
constexpr VectorF32<Len, Packing> operator/(VectorF32<Len, Packing> b) const {
    using Reg = typename VectorBase<Len, Packing, float>::VectorType;
    if constexpr (std::is_same_v<Reg, __m128>) {
        return VectorF32(_mm_div_ps(this->v, b.v));
    } else if constexpr (std::is_same_v<Reg, __m256>) {
        return VectorF32(_mm256_div_ps(this->v, b.v));
    } else {
        return VectorF32(_mm512_div_ps(this->v, b.v));
    }
}
// In-place lane-wise addition. Returns nothing, so the result cannot be chained.
constexpr void operator+=(VectorF32<Len, Packing> b) {
    // The binary operator selects the intrinsic for the register width.
    this->v = (*this + b).v;
}
// In-place lane-wise subtraction. Returns nothing, so the result cannot be chained.
constexpr void operator-=(VectorF32<Len, Packing> b) {
    // The binary operator selects the intrinsic for the register width.
    this->v = (*this - b).v;
}
// In-place lane-wise multiplication. Returns nothing, so the result cannot be chained.
constexpr void operator*=(VectorF32<Len, Packing> b) {
    // The binary operator selects the intrinsic for the register width.
    this->v = (*this * b).v;
}
// In-place lane-wise division. Returns nothing, so the result cannot be chained.
constexpr void operator/=(VectorF32<Len, Packing> b) {
    // The binary operator selects the intrinsic for the register width.
    this->v = (*this / b).v;
}
// Adds the scalar b to every lane: b is broadcast, then added lane-wise.
// Const-qualified, like the vector overload, because this object is only read.
constexpr VectorF32<Len, Packing> operator+(float b) const {
    return *this + VectorF32<Len, Packing>(b);
}
// Subtracts the scalar b from every lane: b is broadcast, then subtracted lane-wise.
// Const-qualified, like the vector overload, because this object is only read.
constexpr VectorF32<Len, Packing> operator-(float b) const {
    return *this - VectorF32<Len, Packing>(b);
}
// Multiplies every lane by the scalar b: b is broadcast, then multiplied lane-wise.
// Const-qualified, like the vector overload, because this object is only read.
constexpr VectorF32<Len, Packing> operator*(float b) const {
    return *this * VectorF32<Len, Packing>(b);
}
// Divides every lane by the scalar b: b is broadcast, then divided lane-wise.
// Const-qualified, like the vector overload, because this object is only read.
constexpr VectorF32<Len, Packing> operator/(float b) const {
    return *this / VectorF32<Len, Packing>(b);
}
// In-place addition of the scalar b to every lane.
constexpr void operator+=(float b) {
    *this += VectorF32<Len, Packing>(b);
}
// In-place subtraction of the scalar b from every lane.
constexpr void operator-=(float b) {
    *this -= VectorF32<Len, Packing>(b);
}
// In-place multiplication of every lane by the scalar b.
constexpr void operator*=(float b) {
    *this *= VectorF32<Len, Packing>(b);
}
// In-place division of every lane by the scalar b.
constexpr void operator/=(float b) {
    *this /= VectorF32<Len, Packing>(b);
}
// Unary minus: forwards to Negate with the selection returned by
// VectorBase::GetAllTrue() (presumably "every component" — confirm in VectorBase).
// Const-qualified because Negate is const and this object is only read.
constexpr VectorF32<Len, Packing> operator-() const {
    return Negate<VectorBase<Len, Packing, float>::GetAllTrue()>();
}
// True only when every lane of the register compares equal to the matching
// lane of b. The comparison is ordered, so a NaN in any lane yields false.
// All register lanes take part (masks 0xF / 0xFF / 0xFFFF).
constexpr bool operator==(VectorF32<Len, Packing> b) const {
    using Reg = typename VectorBase<Len, Packing, float>::VectorType;
    if constexpr (std::is_same_v<Reg, __m128>) {
#ifdef __AVX512VL__
        const auto equalLanes = _mm_cmp_ps_mask(this->v, b.v, _CMP_EQ_OQ);
#else
        const auto equalLanes = _mm_movemask_ps(_mm_cmpeq_ps(this->v, b.v));
#endif
        return equalLanes == 0xF;
    } else if constexpr (std::is_same_v<Reg, __m256>) {
#ifdef __AVX512VL__
        const auto equalLanes = _mm256_cmp_ps_mask(this->v, b.v, _CMP_EQ_OQ);
#else
        const auto equalLanes = _mm256_movemask_ps(_mm256_cmp_ps(this->v, b.v, _CMP_EQ_OQ));
#endif
        return equalLanes == 0xFF;
    } else {
        const auto equalLanes = _mm512_cmp_ps_mask(this->v, b.v, _CMP_EQ_OQ);
        return equalLanes == 0xFFFF;
    }
}
// Logical negation of operator==: true as soon as one lane differs.
constexpr bool operator!=(VectorF32<Len, Packing> b) const {
    const bool same = (*this == b);
    return !same;
}
// Produces a vector with the shorter length ExtractLen and the same Packing.
//  * Packing > 1: lanes are rearranged with the extract-lo mask supplied by
//    VectorBase, then the register is truncated to the destination width.
//  * Packing == 1: the register is truncated (or copied) without rearranging.
// The 128- and 256-bit paths use SSE2/SSSE3/AVX2 intrinsics only, so they do
// not depend on AVX-512 being enabled.
template<std::uint32_t ExtractLen>
constexpr VectorF32<ExtractLen, Packing> ExtractLo() const {
    using Base = VectorBase<Len, Packing, float>;
    using SrcReg = typename Base::VectorType;
    using DstReg = typename VectorBase<ExtractLen, Packing, float>::VectorType;
    using Dst = VectorF32<ExtractLen, Packing>;
    if constexpr (Packing > 1) {
        if constexpr (std::is_same_v<SrcReg, __m128>) {
            constexpr std::array<std::uint8_t, Base::Alignment> shuffleMask = Base::template GetExtractLoMaskEpi8<ExtractLen>();
            // Plain SSE2 unaligned load; _mm_loadu_epi8 would need AVX-512BW+VL.
            const __m128i shuffleVec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(shuffleMask.data()));
            return Dst(_mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(this->v), shuffleVec)));
        } else if constexpr (std::is_same_v<SrcReg, __m256>) {
            constexpr std::array<std::uint32_t, Base::AlignmentElement> permMask = Base::template GetExtractLoMaskEpi32<ExtractLen>();
            const __m256i permIdx = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(permMask.data()));
            // AVX2 form of the cross-lane permute; argument order is (data, index).
            const __m256i result = _mm256_permutevar8x32_epi32(_mm256_castps_si256(this->v), permIdx);
            if constexpr (std::is_same_v<DstReg, __m128>) {
                return Dst(_mm256_castps256_ps128(_mm256_castsi256_ps(result)));
            } else {
                return Dst(_mm256_castsi256_ps(result));
            }
        } else {
            constexpr std::array<std::uint32_t, Base::AlignmentElement> permMask = Base::template GetExtractLoMaskEpi32<ExtractLen>();
            const __m512i permIdx = _mm512_loadu_epi32(permMask.data());
            const __m512i result = _mm512_permutexvar_epi32(permIdx, _mm512_castps_si512(this->v));
            if constexpr (std::is_same_v<DstReg, __m128>) {
                return Dst(_mm512_castps512_ps128(_mm512_castsi512_ps(result)));
            } else if constexpr (std::is_same_v<DstReg, __m256>) {
                return Dst(_mm512_castps512_ps256(_mm512_castsi512_ps(result)));
            } else {
                return Dst(_mm512_castsi512_ps(result));
            }
        }
    } else {
        if constexpr (std::is_same_v<SrcReg, __m256> && std::is_same_v<DstReg, __m128>) {
            return Dst(_mm256_castps256_ps128(this->v));
#ifdef __AVX512F__
        } else if constexpr (std::is_same_v<SrcReg, __m512> && std::is_same_v<DstReg, __m128>) {
            return Dst(_mm512_castps512_ps128(this->v));
        } else if constexpr (std::is_same_v<SrcReg, __m512> && std::is_same_v<DstReg, __m256>) {
            return Dst(_mm512_castps512_ps256(this->v));
#endif
        } else {
            // Source and destination share a register type.
            return Dst(this->v);
        }
    }
}
// Applies the cosine helper from VectorBase that matches the register width
// and returns the result as a new vector.
constexpr VectorF32<Len, Packing> Cos() {
    using Base = VectorBase<Len, Packing, float>;
    using Reg = typename Base::VectorType;
    if constexpr (std::is_same_v<Reg, __m128>) {
        return VectorF32(Base::cos_f32x4(this->v));
    } else if constexpr (std::is_same_v<Reg, __m256>) {
        return VectorF32(Base::cos_f32x8(this->v));
#ifdef __AVX512F__
    } else {
        return VectorF32(Base::cos_f32x16(this->v));
#endif
    }
}
// Applies the sine helper from VectorBase that matches the register width
// and returns the result as a new vector.
constexpr VectorF32<Len, Packing> Sin() {
    using Base = VectorBase<Len, Packing, float>;
    using Reg = typename Base::VectorType;
    if constexpr (std::is_same_v<Reg, __m128>) {
        return VectorF32(Base::sin_f32x4(this->v));
    } else if constexpr (std::is_same_v<Reg, __m256>) {
        return VectorF32(Base::sin_f32x8(this->v));
#ifdef __AVX512F__
    } else {
        return VectorF32(Base::sin_f32x16(this->v));
#endif
    }
}
// Calls the combined sine/cosine helper from VectorBase that matches the
// register width and returns the pair as (sine, cosine).
// The two locals are passed to the helper, which presumably fills them in
// (out-parameters) — confirm against VectorBase.
std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>> SinCos() {
    using Base = VectorBase<Len, Packing, float>;
    using Reg = typename Base::VectorType;
    if constexpr (std::is_same_v<Reg, __m128>) {
        __m128 sine, cosine;
        Base::sincos_f32x4(this->v, sine, cosine);
        return { VectorF32(sine), VectorF32(cosine) };
    } else if constexpr (std::is_same_v<Reg, __m256>) {
        __m256 sine, cosine;
        Base::sincos_f32x8(this->v, sine, cosine);
        return { VectorF32(sine), VectorF32(cosine) };
#ifdef __AVX512F__
    } else {
        __m512 sine, cosine;
        Base::sincos_f32x16(this->v, sine, cosine);
        return { VectorF32(sine), VectorF32(cosine) };
#endif
    }
}
// Returns a copy of this vector XOR-ed bitwise with the mask that
// VectorBase::GetNegateMask<values>() produces. Presumably the mask carries a
// set sign bit in the lanes selected by `values`, which flips their sign —
// confirm against VectorBase.
template <std::array<bool, Len> values>
constexpr VectorF32<Len, Packing> Negate() const {
    using Base = VectorBase<Len, Packing, float>;
    using Reg = typename Base::VectorType;
    std::array<float, Base::AlignmentElement> signBits = Base::template GetNegateMask<values>();
    if constexpr (std::is_same_v<Reg, __m128>) {
        const __m128i flip = _mm_loadu_si128(reinterpret_cast<__m128i*>(signBits.data()));
        const __m128i bits = _mm_castps_si128(this->v);
        return VectorF32(_mm_castsi128_ps(_mm_xor_si128(bits, flip)));
    } else if constexpr (std::is_same_v<Reg, __m256>) {
        const __m256i flip = _mm256_loadu_si256(reinterpret_cast<__m256i*>(signBits.data()));
        const __m256i bits = _mm256_castps_si256(this->v);
        return VectorF32(_mm256_castsi256_ps(_mm256_xor_si256(bits, flip)));
#ifdef __AVX512F__
    } else {
        const __m512i flip = _mm512_loadu_epi32(signBits.data());
        const __m512i bits = _mm512_castps_si512(this->v);
        return VectorF32(_mm512_castsi512_ps(_mm512_xor_si512(bits, flip)));
#endif
    }
}
// Fused multiply-add: returns a * b + add, lane by lane.
static constexpr VectorF32<Len, Packing> MulitplyAdd(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b, VectorF32<Len, Packing> add) {
    using Reg = typename VectorBase<Len, Packing, float>::VectorType;
    if constexpr (std::is_same_v<Reg, __m128>) {
        return VectorF32(_mm_fmadd_ps(a.v, b.v, add.v));
    } else if constexpr (std::is_same_v<Reg, __m256>) {
        return VectorF32(_mm256_fmadd_ps(a.v, b.v, add.v));
#ifdef __AVX512F__
    } else {
        return VectorF32(_mm512_fmadd_ps(a.v, b.v, add.v));
#endif
    }
}
// Fused multiply-subtract: returns a * b - sub, lane by lane.
static constexpr VectorF32<Len, Packing> MulitplySub(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b, VectorF32<Len, Packing> sub) {
    using Reg = typename VectorBase<Len, Packing, float>::VectorType;
    if constexpr (std::is_same_v<Reg, __m128>) {
        return VectorF32(_mm_fmsub_ps(a.v, b.v, sub.v));
    } else if constexpr (std::is_same_v<Reg, __m256>) {
        return VectorF32(_mm256_fmsub_ps(a.v, b.v, sub.v));
#ifdef __AVX512F__
    } else {
        return VectorF32(_mm512_fmsub_ps(a.v, b.v, sub.v));
#endif
    }
}
// Cross product of two 3-component vectors, computed as
//   a.yzx * b.zxy - a.zxy * b.yzx
// with one multiply followed by one fused multiply-subtract.
constexpr static VectorF32<Len, Packing> Cross(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b) requires(Len == 3) {
    VectorF32<Len, Packing> aYZX = a.template Shuffle<{{1,2,0}}>();
    VectorF32<Len, Packing> bYZX = b.template Shuffle<{{1,2,0}}>();
    VectorF32<Len, Packing> aZXY = a.template Shuffle<{{2,0,1}}>();
    VectorF32<Len, Packing> bZXY = b.template Shuffle<{{2,0,1}}>();
    // Right-hand term first, so the fused multiply-subtract can finish the job.
    VectorF32<Len, Packing> subtrahend = aZXY * bYZX;
    return VectorF32<Len, Packing>::MulitplySub(aYZX, bZXY, subtrahend);
}
// Returns a copy of this vector with its components rearranged according to
// the compile-time index list ShuffleValues. VectorBase decides which
// strategy applies:
//   1. CheckEpi32Shuffle  -> immediate 32-bit shuffle,
//   2. CheckEpi8Shuffle   -> byte shuffle with a mask vector,
//   3. otherwise          -> byte shuffle (128-bit) or cross-lane permute.
// The method only reads the register, so it is const-qualified. The 128-bit
// mask loads use the SSE2 intrinsic, so they do not require AVX-512.
template <const std::array<std::uint8_t, Len> ShuffleValues>
constexpr VectorF32<Len, Packing> Shuffle() const {
    using Base = VectorBase<Len, Packing, float>;
    using Reg = typename Base::VectorType;
    if constexpr (Base::template CheckEpi32Shuffle<ShuffleValues>()) {
        constexpr std::uint8_t imm = Base::template GetShuffleMaskEpi32<ShuffleValues>();
        if constexpr (std::is_same_v<Reg, __m128>) {
            return VectorF32(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(this->v), imm)));
        } else if constexpr (std::is_same_v<Reg, __m256>) {
            return VectorF32(_mm256_castsi256_ps(_mm256_shuffle_epi32(_mm256_castps_si256(this->v), imm)));
#ifdef __AVX512F__
        } else {
            return VectorF32(_mm512_castsi512_ps(_mm512_shuffle_epi32(_mm512_castps_si512(this->v), imm)));
#endif
        }
    } else if constexpr (Base::template CheckEpi8Shuffle<ShuffleValues>()) {
        constexpr std::array<std::uint8_t, Base::Alignment> shuffleMask = Base::template GetShuffleMaskEpi8<ShuffleValues>();
        if constexpr (std::is_same_v<Reg, __m128>) {
            const __m128i shuffleVec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(shuffleMask.data()));
            return VectorF32(_mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(this->v), shuffleVec)));
        } else if constexpr (std::is_same_v<Reg, __m256>) {
#ifdef __AVX512BW__
            // Byte shuffle carried out at 512-bit width on the widened register;
            // only the low 256 bits of the result are kept.
            const __m256i shuffleVec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(shuffleMask.data()));
            return VectorF32(_mm256_castsi256_ps(_mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castps_si256(this->v)), _mm512_castsi256_si512(shuffleVec)))));
#else
            constexpr std::array<std::uint32_t, Base::AlignmentElement> permMask = Base::template GetPermuteMaskEpi32<ShuffleValues>();
            const __m256i permIdx = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(permMask.data()));
            return VectorF32(_mm256_castsi256_ps(_mm256_permutevar8x32_epi32(_mm256_castps_si256(this->v), permIdx)));
#endif
#ifdef __AVX512F__
        } else {
            // _mm512_loadu_si512 takes an untyped pointer; no cast is needed.
            const __m512i shuffleVec = _mm512_loadu_si512(shuffleMask.data());
            return VectorF32(_mm512_castsi512_ps(_mm512_shuffle_epi8(_mm512_castps_si512(this->v), shuffleVec)));
#endif
        }
    } else {
        if constexpr (std::is_same_v<Reg, __m128>) {
            constexpr std::array<std::uint8_t, Base::Alignment> shuffleMask = Base::template GetShuffleMaskEpi8<ShuffleValues>();
            const __m128i shuffleVec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(shuffleMask.data()));
            return VectorF32(_mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(this->v), shuffleVec)));
        } else if constexpr (std::is_same_v<Reg, __m256>) {
            constexpr std::array<std::uint32_t, Base::AlignmentElement> permMask = Base::template GetPermuteMaskEpi32<ShuffleValues>();
            const __m256i permIdx = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(permMask.data()));
            return VectorF32(_mm256_castsi256_ps(_mm256_permutevar8x32_epi32(_mm256_castps_si256(this->v), permIdx)));
#ifdef __AVX512F__
        } else {
            constexpr std::array<std::uint32_t, Base::AlignmentElement> permMask = Base::template GetPermuteMaskEpi32<ShuffleValues>();
            const __m512i permIdx = _mm512_loadu_epi32(permMask.data());
            return VectorF32(_mm512_castsi512_ps(_mm512_permutexvar_epi32(permIdx, _mm512_castps_si512(this->v))));
#endif
        }
    }
}
// Public variadic surface — one name per op, arity locked to BatchSize.
// The Pack helpers below carry the SIMD bodies and the per-(Len,Packing)
// requires clauses; this wrapper just forwards once arity matches.
template <typename... Rest>
requires ((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
constexpr static auto Normalize(VectorF32<Len, Packing> first, Rest... rest) {
return NormalizePack(first, rest...);
}
template <typename... Rest>
requires ((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
constexpr static auto Length(VectorF32<Len, Packing> first, Rest... rest) {
return LengthPack(first, rest...);
}
template <typename... Rest>
requires ((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
constexpr static auto LengthSq(VectorF32<Len, Packing> first, Rest... rest) {
return LengthSqPack(first, rest...);
}
template <typename... Rest>
requires ((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
(1 + sizeof...(Rest) == 2 * VectorBase<Len, Packing, float>::BatchSize))
constexpr static auto Dot(VectorF32<Len, Packing> first, Rest... rest) {
return DotPack(first, rest...);
}
private:
// Normalizes four registers of 4-component vectors that fill their register
// completely (Packing * Len == AlignmentElement).
// Steps: obtain the lengths from LengthNoShuffle, take the reciprocal,
// broadcast each vector's reciprocal length across that vector's four lanes,
// and scale the inputs.
// NOTE(review): LengthNoShuffle is called with (A, C, B, D), exactly as in
// the original code; presumably its output lane order requires this — confirm.
constexpr static std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize> NormalizePack(
    VectorF32<Len, Packing> A,
    VectorF32<Len, Packing> B,
    VectorF32<Len, Packing> C,
    VectorF32<Len, Packing> D
) requires(Len == 4 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
    using Reg = typename VectorBase<Len, Packing, float>::VectorType;
    if constexpr (std::is_same_v<Reg, __m128>) {
        VectorF32<1, 4> length = LengthNoShuffle(A, C, B, D);
        VectorF32<4, 1> invLength(_mm_div_ps(_mm_set1_ps(1.0f), length.v));
        VectorF32<4, 1> invA = invLength.template Shuffle<{{0,0,0,0}}>();
        VectorF32<4, 1> invB = invLength.template Shuffle<{{1,1,1,1}}>();
        VectorF32<4, 1> invC = invLength.template Shuffle<{{2,2,2,2}}>();
        VectorF32<4, 1> invD = invLength.template Shuffle<{{3,3,3,3}}>();
        return {
            _mm_mul_ps(A.v, invA.v),
            _mm_mul_ps(B.v, invB.v),
            _mm_mul_ps(C.v, invC.v),
            _mm_mul_ps(D.v, invD.v)
        };
    } else if constexpr (std::is_same_v<Reg, __m256>) {
        VectorF32<1, 8> length = LengthNoShuffle(A, C, B, D);
        VectorF32<8, 1> invLength(_mm256_div_ps(_mm256_set1_ps(1.0f), length.v));
        VectorF32<8, 1> invA = invLength.template Shuffle<{{0,0,0,0,4,4,4,4}}>();
        VectorF32<8, 1> invB = invLength.template Shuffle<{{1,1,1,1,5,5,5,5}}>();
        VectorF32<8, 1> invC = invLength.template Shuffle<{{2,2,2,2,6,6,6,6}}>();
        VectorF32<8, 1> invD = invLength.template Shuffle<{{3,3,3,3,7,7,7,7}}>();
        return {
            _mm256_mul_ps(A.v, invA.v),
            _mm256_mul_ps(B.v, invB.v),
            _mm256_mul_ps(C.v, invC.v),
            _mm256_mul_ps(D.v, invD.v)
        };
#if defined(__AVX512F__)
    } else {
        VectorF32<1, 16> length = LengthNoShuffle(A, C, B, D);
        VectorF32<16, 1> invLength(_mm512_div_ps(_mm512_set1_ps(1.0f), length.v));
        VectorF32<16, 1> invA = invLength.template Shuffle<{{0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12}}>();
        VectorF32<16, 1> invB = invLength.template Shuffle<{{1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13}}>();
        VectorF32<16, 1> invC = invLength.template Shuffle<{{2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14}}>();
        VectorF32<16, 1> invD = invLength.template Shuffle<{{3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15}}>();
        return {
            VectorF32<Len, Packing>(_mm512_mul_ps(A.v, invA.v)),
            VectorF32<Len, Packing>(_mm512_mul_ps(B.v, invB.v)),
            VectorF32<Len, Packing>(_mm512_mul_ps(C.v, invC.v)),
            VectorF32<Len, Packing>(_mm512_mul_ps(D.v, invD.v)),
        };
#endif
    }
}
// Normalizes four single-packed 3-component vectors (one vector per 128-bit
// register). Length(A, B, C, D) supplies one length per input; its
// reciprocal is broadcast across each vector's register and multiplied in.
constexpr static std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize> NormalizePack(
    VectorF32<Len, Packing> A,
    VectorF32<Len, Packing> B,
    VectorF32<Len, Packing> C,
    VectorF32<Len, Packing> D
) requires(Len == 3 && Packing == 1) {
    VectorF32<1, 4> length = Length(A, B, C, D);
    VectorF32<4, 1> invLength(_mm_div_ps(_mm_set1_ps(1.0f), length.v));
    // Lane i of invLength holds the reciprocal length of the i-th input.
    VectorF32<4, 1> invA = invLength.template Shuffle<{{0,0,0,0}}>();
    VectorF32<4, 1> invB = invLength.template Shuffle<{{1,1,1,1}}>();
    VectorF32<4, 1> invC = invLength.template Shuffle<{{2,2,2,2}}>();
    VectorF32<4, 1> invD = invLength.template Shuffle<{{3,3,3,3}}>();
    return {
        _mm_mul_ps(A.v, invA.v),
        _mm_mul_ps(B.v, invB.v),
        _mm_mul_ps(C.v, invC.v),
        _mm_mul_ps(D.v, invD.v)
    };
}
// Normalizes four registers that each pack two 3-component vectors (256-bit).
// Length(A, B, C, D) supplies eight lengths; each reciprocal is repeated
// across the three lanes of its vector before the multiply. The shuffle lists
// name six indices; the remaining two entries are value-initialised to 0.
constexpr static std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize> NormalizePack(
    VectorF32<Len, Packing> A,
    VectorF32<Len, Packing> B,
    VectorF32<Len, Packing> C,
    VectorF32<Len, Packing> D
) requires(Len == 3 && Packing == 2) {
    VectorF32<1, 8> length = Length(A, B, C, D);
    VectorF32<8, 1> invLength(_mm256_div_ps(_mm256_set1_ps(1.0f), length.v));
    VectorF32<8, 1> invA = invLength.template Shuffle<{{0,0,0, 1,1,1}}>();
    VectorF32<8, 1> invB = invLength.template Shuffle<{{2,2,2, 3,3,3}}>();
    VectorF32<8, 1> invC = invLength.template Shuffle<{{4,4,4, 5,5,5}}>();
    VectorF32<8, 1> invD = invLength.template Shuffle<{{6,6,6, 7,7,7}}>();
    return {
        _mm256_mul_ps(A.v, invA.v),
        _mm256_mul_ps(B.v, invB.v),
        _mm256_mul_ps(C.v, invC.v),
        _mm256_mul_ps(D.v, invD.v)
    };
}
#ifdef __AVX512F__
// Normalizes fifteen 3-component vectors held five per 512-bit register
// (lane 15 is padding) and returns the registers in argument order.
// Each vector is multiplied by the reciprocal of its own length. A zero-length
// input divides by zero; no guard is applied here.
constexpr static std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize> NormalizePack(
	VectorF32<Len, Packing> A,
	VectorF32<Len, Packing> B,
	VectorF32<Len, Packing> C
) requires(Len == 3 && Packing == 5) {
	// LengthPack yields the fifteen lengths in input order: A's five vectors, then B's, then C's.
	VectorF32<1, 15> length = LengthPack(A, B, C);
	VectorF32<15, 1> invLength(_mm512_div_ps(_mm512_set1_ps(1.0f), length.v));
	// Repeat each reciprocal three times so it lines up with the x/y/z lanes of its vector.
	VectorF32<15, 1> invLengthA = invLength.template Shuffle<{{0,0,0, 1,1,1, 2,2,2, 3,3,3, 4,4,4}}>();
	VectorF32<15, 1> invLengthB = invLength.template Shuffle<{{5,5,5, 6,6,6, 7,7,7, 8,8,8, 9,9,9}}>();
	VectorF32<15, 1> invLengthC = invLength.template Shuffle<{{10,10,10, 11,11,11, 12,12,12, 13,13,13, 14,14,14}}>();
	return {
		_mm512_mul_ps(A.v, invLengthA.v),
		_mm512_mul_ps(B.v, invLengthB.v),
		_mm512_mul_ps(C.v, invLengthC.v),
	};
}
#endif
// Normalizes two registers full of packed 2-component vectors.
// The reciprocal lengths come from LengthNoShuffle, so they are still in the
// per-128-bit-lane interleaved order; the shuffles below pick, for every x/y
// pair of A and B, the reciprocal belonging to that vector.
constexpr static std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize> NormalizePack(
	VectorF32<Len, Packing> A,
	VectorF32<Len, Packing> B
) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
	using Register = typename VectorBase<Len, Packing, float>::VectorType;
	if constexpr(std::is_same_v<Register, __m128>) {
		VectorF32<1, 4> norms = LengthNoShuffle(A, B);
		VectorF32<4, 1> reciprocal(_mm_div_ps(_mm_set1_ps(1.0f), norms.v));
		VectorF32<4, 1> scaleA = reciprocal.template Shuffle<{{0,0,1,1}}>();
		VectorF32<4, 1> scaleB = reciprocal.template Shuffle<{{2,2,3,3}}>();
		return {
			_mm_mul_ps(A.v, scaleA.v),
			_mm_mul_ps(B.v, scaleB.v),
		};
	} else if constexpr(std::is_same_v<Register, __m256>) {
		VectorF32<1, 8> norms = LengthNoShuffle(A, B);
		VectorF32<8, 1> reciprocal(_mm256_div_ps(_mm256_set1_ps(1.0f), norms.v));
		VectorF32<8, 1> scaleA = reciprocal.template Shuffle<{{0,0,1,1,4,4,5,5}}>();
		VectorF32<8, 1> scaleB = reciprocal.template Shuffle<{{2,2,3,3,6,6,7,7}}>();
		return {
			_mm256_mul_ps(A.v, scaleA.v),
			_mm256_mul_ps(B.v, scaleB.v),
		};
#ifdef __AVX512F__
	} else {
		VectorF32<1, 16> norms = LengthNoShuffle(A, B);
		VectorF32<16, 1> reciprocal(_mm512_div_ps(_mm512_set1_ps(1.0f), norms.v));
		VectorF32<16, 1> scaleA = reciprocal.template Shuffle<{{0,0,1,1,4,4,5,5,8,8,9,9,12,12,13,13}}>();
		VectorF32<16, 1> scaleB = reciprocal.template Shuffle<{{2,2,3,3,6,6,7,7,10,10,11,11,14,14,15,15}}>();
		return {
			_mm512_mul_ps(A.v, scaleA.v),
			_mm512_mul_ps(B.v, scaleB.v),
		};
#endif
	}
}
// Length of every 4-component vector in A..D: the square root of LengthSqPack,
// taken with the sqrt intrinsic matching the register width.
constexpr static VectorF32<1, Packing*4> LengthPack(
	VectorF32<Len, Packing> A,
	VectorF32<Len, Packing> B,
	VectorF32<Len, Packing> C,
	VectorF32<Len, Packing> D
) requires(Len == 4 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
	using Register = typename VectorBase<Len, Packing, float>::VectorType;
	using Lengths = VectorF32<1, Packing*4>;
	Lengths squared = LengthSqPack(A, B, C, D);
	if constexpr(std::is_same_v<Register, __m128>) {
		return Lengths(_mm_sqrt_ps(squared.v));
	} else if constexpr(std::is_same_v<Register, __m256>) {
		return Lengths(_mm256_sqrt_ps(squared.v));
	} else {
		return Lengths(_mm512_sqrt_ps(squared.v));
	}
}
// Lengths of four 3-component vectors (one per register), returned as [|A| |B| |C| |D|].
constexpr static VectorF32<1, 4> LengthPack(
	VectorF32<Len, Packing> A,
	VectorF32<Len, Packing> B,
	VectorF32<Len, Packing> C,
	VectorF32<Len, Packing> D
) requires(Len == 3 && Packing == 1) {
	// Square root of the per-vector squared lengths.
	return VectorF32<1, 4>(_mm_sqrt_ps(LengthSqPack(A, B, C, D).v));
}
// Lengths of eight 3-component vectors held two per register, in input order.
constexpr static VectorF32<1, 8> LengthPack(
	VectorF32<Len, Packing> A,
	VectorF32<Len, Packing> B,
	VectorF32<Len, Packing> C,
	VectorF32<Len, Packing> D
) requires(Len == 3 && Packing == 2) {
	// Square root of the per-vector squared lengths (Packing*4 == 8 under this constraint).
	return VectorF32<1, Packing*4>(_mm256_sqrt_ps(LengthSqPack(A, B, C, D).v));
}
#ifdef __AVX512F__
// Lengths of fifteen 3-component vectors held five per 512-bit register, in input order.
constexpr static VectorF32<1, 15> LengthPack(
	VectorF32<Len, Packing> A,
	VectorF32<Len, Packing> B,
	VectorF32<Len, Packing> C
) requires(Len == 3 && Packing == 5) {
	// Square root of the per-vector squared lengths.
	return VectorF32<1, 15>(_mm512_sqrt_ps(LengthSqPack(A, B, C).v));
}
#endif
// Length of every 2-component vector in A and C: the square root of
// LengthSqPack, taken with the sqrt intrinsic matching the register width.
constexpr static VectorF32<1, Packing*2> LengthPack(
	VectorF32<Len, Packing> A,
	VectorF32<Len, Packing> C
) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
	using Register = typename VectorBase<Len, Packing, float>::VectorType;
	using Lengths = VectorF32<1, Packing*2>;
	Lengths squared = LengthSqPack(A, C);
	if constexpr(std::is_same_v<Register, __m128>) {
		return Lengths(_mm_sqrt_ps(squared.v));
	} else if constexpr(std::is_same_v<Register, __m256>) {
		return Lengths(_mm256_sqrt_ps(squared.v));
#ifdef __AVX512F__
	} else {
		return Lengths(_mm512_sqrt_ps(squared.v));
#endif
	}
}
// Squared length of every 4-component vector in A..D, computed as the dot
// product of each register with itself via DotPack.
constexpr static VectorF32<1, Packing*4> LengthSqPack(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B,
VectorF32<Len, Packing> C,
VectorF32<Len, Packing> D
) requires(Len == 4 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
return DotPack(A, A, B, B, C, C, D, D);
}
// Squared lengths of four 3-component vectors (one per register), computed as
// the dot product of each register with itself via DotPack.
constexpr static VectorF32<1, 4> LengthSqPack(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B,
VectorF32<Len, Packing> C,
VectorF32<Len, Packing> D
) requires(Len == 3 && Packing == 1) {
return DotPack(A, A, B, B, C, C, D, D);
}
// Squared lengths of eight 3-component vectors held two per register, computed
// as the dot product of each register with itself via DotPack.
constexpr static VectorF32<1, 8> LengthSqPack(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B,
VectorF32<Len, Packing> C,
VectorF32<Len, Packing> D
) requires(Len == 3 && Packing == 2) {
return DotPack(A, A, B, B, C, C, D, D);
}
#ifdef __AVX512F__
// Squared lengths of fifteen 3-component vectors held five per register,
// computed as the dot product of each register with itself via DotPack.
constexpr static VectorF32<1, 15> LengthSqPack(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B,
VectorF32<Len, Packing> C
) requires(Len == 3 && Packing == 5) {
return DotPack(A, A, B, B, C, C);
}
#endif
// Squared length of every 2-component vector in A and C, computed as the dot
// product of each register with itself via DotPack.
constexpr static VectorF32<1, Packing*2> LengthSqPack(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> C
) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
return DotPack(A, A, C, C);
}
// Dot products of four pairs of registers holding packed 4-component vectors.
// The results are ordered by input: all of A's vectors first, then B's, C's, D's.
constexpr static VectorF32<1, Packing*4> DotPack(
	VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
	VectorF32<Len, Packing> B0, VectorF32<Len, Packing> B1,
	VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1,
	VectorF32<Len, Packing> D0, VectorF32<Len, Packing> D1
) requires(Len == 4 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
	using Register = typename VectorBase<Len, Packing, float>::VectorType;
	if constexpr(std::is_same_v<Register, __m128>) {
		// DotNoShuffle emits [first third second fourth]; passing the B and C pairs
		// swapped therefore gives [A B C D] without any extra shuffle.
		return DotNoShuffle(A0, A1, C0, C1, B0, B1, D0, D1);
	} else if constexpr(std::is_same_v<Register, __m256>) {
		// Each 128-bit lane holds [A C B D] for one vector index; regroup by input.
		VectorF32<8, 1> perLane(DotNoShuffle(A0, A1, B0, B1, C0, C1, D0, D1).v);
		VectorF32<8, 1> byInput = perLane.template Shuffle<{{
			0,4,2,6,
			1,5,3,7,
		}}>();
		return byInput.v;
#ifdef __AVX512F__
	} else {
		// Same regrouping across the four 128-bit lanes of a 512-bit register.
		VectorF32<16, 1> perLane(DotNoShuffle(A0, A1, B0, B1, C0, C1, D0, D1).v);
		VectorF32<16, 1> byInput = perLane.template Shuffle<{{
			0,4,8,12,
			2,6,10,14,
			1,5,9,13,
			3,7,11,15
		}}>();
		return byInput.v;
#endif
	}
}
// Dot products of four pairs of 3-component vectors, one vector per register
// laid out as [x y z _]. Returns [A0·A1, B0·B1, C0·C1, D0·D1].
constexpr static VectorF32<1, 4> DotPack(
	VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
	VectorF32<Len, Packing> B0, VectorF32<Len, Packing> B1,
	VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1,
	VectorF32<Len, Packing> D0, VectorF32<Len, Packing> D1
) requires(Len == 3 && Packing == 1) {
	// Component-wise products; lane 3 of each is padding.
	__m128 prodA = _mm_mul_ps(A0.v, A1.v); // ax ay az _
	__m128 prodB = _mm_mul_ps(B0.v, B1.v); // bx by bz _
	__m128 prodC = _mm_mul_ps(C0.v, C1.v); // cx cy cz _
	__m128 prodD = _mm_mul_ps(D0.v, D1.v); // dx dy dz _

	// 4x4 transpose restricted to the three meaningful rows: after it, each
	// register holds one component of all four products.
	__m128 abXY = _mm_unpacklo_ps(prodA, prodB); // ax bx ay by
	__m128 abZ = _mm_unpackhi_ps(prodA, prodB);  // az bz _  _
	__m128 cdXY = _mm_unpacklo_ps(prodC, prodD); // cx dx cy dy
	__m128 cdZ = _mm_unpackhi_ps(prodC, prodD);  // cz dz _  _

	__m128 xs = _mm_movelh_ps(abXY, cdXY); // ax bx cx dx
	__m128 ys = _mm_movehl_ps(cdXY, abXY); // ay by cy dy
	__m128 zs = _mm_movelh_ps(abZ, cdZ);   // az bz cz dz

	// Horizontal sum of each product is now a vertical add: (x + y) + z.
	return _mm_add_ps(_mm_add_ps(xs, ys), zs);
}
// Dot products of four pairs of registers, each holding two 3-component
// vectors laid out as [x0 y0 z0 x1 y1 z1 _ _]. Returns the eight results in
// input order: [A.0 A.1 B.0 B.1 C.0 C.1 D.0 D.1].
constexpr static VectorF32<1, 8> DotPack(
	VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
	VectorF32<Len, Packing> B0, VectorF32<Len, Packing> B1,
	VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1,
	VectorF32<Len, Packing> D0, VectorF32<Len, Packing> D1
) requires(Len == 3 && Packing == 2) {
	// Component-wise products. Naming the eight vectors a..h:
	__m256 prodA = _mm256_mul_ps(A0.v, A1.v); // a1 a2 a3 b1 b2 b3 _ _
	__m256 prodB = _mm256_mul_ps(B0.v, B1.v); // c1 c2 c3 d1 d2 d3 _ _
	__m256 prodC = _mm256_mul_ps(C0.v, C1.v); // e1 e2 e3 f1 f2 f3 _ _
	__m256 prodD = _mm256_mul_ps(D0.v, D1.v); // g1 g2 g3 h1 h2 h3 _ _

	// Step 1: inside every register, pair up equal components of its two
	// vectors: [X1 Y1 | X2 Y2 | X3 Y3 | _ _] as four 64-bit chunks.
	__m256i pairUp = _mm256_setr_epi32(0, 3, 1, 4, 2, 5, 6, 7);
	__m256i pairedA = _mm256_castps_si256(_mm256_permutevar8x32_ps(prodA, pairUp)); // a1 b1 a2 b2 a3 b3 _ _
	__m256i pairedB = _mm256_castps_si256(_mm256_permutevar8x32_ps(prodB, pairUp)); // c1 d1 c2 d2 c3 d3 _ _
	__m256i pairedC = _mm256_castps_si256(_mm256_permutevar8x32_ps(prodC, pairUp)); // e1 f1 e2 f2 e3 f3 _ _
	__m256i pairedD = _mm256_castps_si256(_mm256_permutevar8x32_ps(prodD, pairUp)); // g1 h1 g2 h2 g3 h3 _ _

	// Step 2: interleave 64-bit chunks of neighbouring registers. The unpacks
	// work per 128-bit lane, so the low unpack yields components 1 and 3 and the
	// high unpack yields component 2 (its upper lane is padding).
	__m256i comp13AB = _mm256_unpacklo_epi64(pairedA, pairedB); // [a1 b1 c1 d1 | a3 b3 c3 d3]
	__m256i comp2AB = _mm256_unpackhi_epi64(pairedA, pairedB);  // [a2 b2 c2 d2 | _  _  _  _ ]
	__m256i comp13CD = _mm256_unpacklo_epi64(pairedC, pairedD); // [e1 f1 g1 h1 | e3 f3 g3 h3]
	__m256i comp2CD = _mm256_unpackhi_epi64(pairedC, pairedD);  // [e2 f2 g2 h2 | _  _  _  _ ]

	// Step 3: stitch 128-bit halves together so each register carries one
	// component of all eight products. 0x20 = (low, low), 0x31 = (high, high).
	__m256 first = _mm256_castsi256_ps(_mm256_permute2x128_si256(comp13AB, comp13CD, 0x20));  // a1 .. h1
	__m256 second = _mm256_castsi256_ps(_mm256_permute2x128_si256(comp2AB, comp2CD, 0x20));   // a2 .. h2
	__m256 third = _mm256_castsi256_ps(_mm256_permute2x128_si256(comp13AB, comp13CD, 0x31));  // a3 .. h3

	// Step 4: (component1 + component2) + component3.
	return _mm256_add_ps(_mm256_add_ps(first, second), third);
}
#ifdef __AVX512F__
// Dot products of three pairs of 512-bit registers, each holding five
// 3-component vectors laid out as [x0 y0 z0 x1 y1 z1 ... x4 y4 z4 _].
// Returns the fifteen results in input order: lanes 0..4 from the A pair,
// 5..9 from the B pair, 10..14 from the C pair. Lane 15 carries no result.
constexpr static VectorF32<1, 15> DotPack(
	VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
	VectorF32<Len, Packing> B0, VectorF32<Len, Packing> B1,
	VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1
) requires(Len == 3 && Packing == 5) {
	// Component-wise products of the three pairs.
	const __m512 prodA = _mm512_mul_ps(A0.v, A1.v);
	const __m512 prodB = _mm512_mul_ps(B0.v, B1.v);
	const __m512 prodC = _mm512_mul_ps(C0.v, C1.v);

	// Gather indices: destination lane i (0..14) reads one component of vector
	// (i % 5) of whichever product register is routed to that lane. The same
	// 5-entry pattern is repeated three times so one index vector serves all
	// three sources. Lane 15 is unused and reads source lane 0.
	const __m512i pickX = _mm512_setr_epi32(0, 3, 6, 9, 12,  0, 3, 6, 9, 12,  0, 3, 6, 9, 12,  0);
	const __m512i pickY = _mm512_setr_epi32(1, 4, 7, 10, 13, 1, 4, 7, 10, 13, 1, 4, 7, 10, 13, 0);
	const __m512i pickZ = _mm512_setr_epi32(2, 5, 8, 11, 14, 2, 5, 8, 11, 14, 2, 5, 8, 11, 14, 0);

	const __mmask16 lanes5to9 = 0x03E0u;   // bits 5-9: results of the B pair
	const __mmask16 lanes10to14 = 0x7C00u; // bits 10-14: results of the C pair

	// Start from the A pair in every lane, then overwrite lanes 5..9 and 10..14
	// with a masked permute of the B and C products. The masked permute gathers
	// and places in one instruction, replacing a separate permute, shift and blend.
	__m512 xs = _mm512_permutexvar_ps(pickX, prodA);
	xs = _mm512_mask_permutexvar_ps(xs, lanes5to9, pickX, prodB);
	xs = _mm512_mask_permutexvar_ps(xs, lanes10to14, pickX, prodC);

	__m512 ys = _mm512_permutexvar_ps(pickY, prodA);
	ys = _mm512_mask_permutexvar_ps(ys, lanes5to9, pickY, prodB);
	ys = _mm512_mask_permutexvar_ps(ys, lanes10to14, pickY, prodC);

	__m512 zs = _mm512_permutexvar_ps(pickZ, prodA);
	zs = _mm512_mask_permutexvar_ps(zs, lanes5to9, pickZ, prodB);
	zs = _mm512_mask_permutexvar_ps(zs, lanes10to14, pickZ, prodC);

	// Horizontal sum of each product is now a vertical add: (x + y) + z.
	return _mm512_add_ps(_mm512_add_ps(xs, ys), zs);
}
#endif
// Dot products of two pairs of registers holding packed 2-component vectors.
// The results are ordered by input: all vectors of the A pair, then all of the C pair.
constexpr static VectorF32<1, Packing*2> DotPack(
	VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
	VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1
) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
	using Register = typename VectorBase<Len, Packing, float>::VectorType;
	if constexpr(std::is_same_v<Register, __m128>) {
		// A single 128-bit lane already comes out in input order.
		return DotNoShuffle(A0, A1, C0, C1);
	} else if constexpr(std::is_same_v<Register, __m256>) {
		// Every 128-bit lane holds two A results followed by two C results;
		// collect the A halves first, then the C halves.
		VectorF32<8, 1> perLane(DotNoShuffle(A0, A1, C0, C1).v);
		VectorF32<8, 1> byInput = perLane.template Shuffle<{{
			0,1, 4,5,
			2,3, 6,7,
		}}>();
		return byInput.v;
#ifdef __AVX512F__
	} else {
		// Same regrouping across the four 128-bit lanes of a 512-bit register.
		VectorF32<16, 1> perLane(DotNoShuffle(A0, A1, C0, C1).v);
		VectorF32<16, 1> byInput = perLane.template Shuffle<{{
			0,1, 4,5,
			8,9, 12,13,
			2,3, 6,7,
			10,11, 14,15
		}}>();
		return byInput.v;
#endif
	}
}
private:
// Length of every 4-component vector in A..D, left in the lane order produced
// by DotNoShuffle (no reordering by input).
constexpr static VectorF32<1, Packing*4> LengthNoShuffle(
	VectorF32<Len, Packing> A,
	VectorF32<Len, Packing> B,
	VectorF32<Len, Packing> C,
	VectorF32<Len, Packing> D
) requires(Len == 4 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
	using Register = typename VectorBase<Len, Packing, float>::VectorType;
	using Lengths = VectorF32<1, Packing*4>;
	Lengths squared = LengthSqNoShuffle(A, B, C, D);
	if constexpr(std::is_same_v<Register, __m128>) {
		return Lengths(_mm_sqrt_ps(squared.v));
	} else if constexpr(std::is_same_v<Register, __m256>) {
		return Lengths(_mm256_sqrt_ps(squared.v));
#ifdef __AVX512F__
	} else {
		return Lengths(_mm512_sqrt_ps(squared.v));
#endif
	}
}
// Length of every 2-component vector in A and C, left in the lane order
// produced by DotNoShuffle (no reordering by input).
constexpr static VectorF32<1, Packing*2> LengthNoShuffle(
	VectorF32<Len, Packing> A,
	VectorF32<Len, Packing> C
) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
	using Register = typename VectorBase<Len, Packing, float>::VectorType;
	using Lengths = VectorF32<1, Packing*2>;
	Lengths squared = LengthSqNoShuffle(A, C);
	if constexpr(std::is_same_v<Register, __m128>) {
		return Lengths(_mm_sqrt_ps(squared.v));
	} else if constexpr(std::is_same_v<Register, __m256>) {
		return Lengths(_mm256_sqrt_ps(squared.v));
#ifdef __AVX512F__
	} else {
		return Lengths(_mm512_sqrt_ps(squared.v));
#endif
	}
}
// Squared length of every 4-component vector in A..D: each register dotted
// with itself via DotNoShuffle, so the result keeps DotNoShuffle's lane order.
constexpr static VectorF32<1, Packing*4> LengthSqNoShuffle(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B,
VectorF32<Len, Packing> C,
VectorF32<Len, Packing> D
) requires(Len == 4 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
return DotNoShuffle(A, A, B, B, C, C, D, D);
}
// Squared length of every 2-component vector in A and C: each register dotted
// with itself via DotNoShuffle, so the result keeps DotNoShuffle's lane order.
constexpr static VectorF32<1, Packing*2> LengthSqNoShuffle(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> C
) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
return DotNoShuffle(A, A, C, C);
}
// Dot products of four pairs of registers holding packed 4-component vectors,
// without the final reordering by input.
// Within every 128-bit lane the result is [first third second fourth], i.e.
// [A C B D] for the vector stored in that lane. Callers that need input order
// must either shuffle afterwards or pass the middle pairs swapped.
// The transpose uses the float-domain unpack intrinsics, which move lanes
// exactly like their epi32 counterparts but need no integer<->float register
// conversions.
constexpr static VectorF32<1, Packing*4> DotNoShuffle(
	VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
	VectorF32<Len, Packing> B0, VectorF32<Len, Packing> B1,
	VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1,
	VectorF32<Len, Packing> D0, VectorF32<Len, Packing> D1
) requires(Len == 4 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
	if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
		__m128 mulA = _mm_mul_ps(A0.v, A1.v);
		__m128 mulB = _mm_mul_ps(B0.v, B1.v);
		__m128 mulC = _mm_mul_ps(C0.v, C1.v);
		__m128 mulD = _mm_mul_ps(D0.v, D1.v);
		__m128 abLo = _mm_unpacklo_ps(mulA, mulB); // A1 B1 A2 B2
		__m128 abHi = _mm_unpackhi_ps(mulA, mulB); // A3 B3 A4 B4
		__m128 cdLo = _mm_unpacklo_ps(mulC, mulD); // C1 D1 C2 D2
		__m128 cdHi = _mm_unpackhi_ps(mulC, mulD); // C3 D3 C4 D4
		__m128 row1 = _mm_unpacklo_ps(abLo, cdLo); // A1 C1 B1 D1
		__m128 row2 = _mm_unpackhi_ps(abLo, cdLo); // A2 C2 B2 D2
		__m128 row3 = _mm_unpacklo_ps(abHi, cdHi); // A3 C3 B3 D3
		__m128 row4 = _mm_unpackhi_ps(abHi, cdHi); // A4 C4 B4 D4
		row1 = _mm_add_ps(row1, row2);
		row1 = _mm_add_ps(row1, row3);
		row1 = _mm_add_ps(row1, row4);
		return row1;
	} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
		__m256 mulA = _mm256_mul_ps(A0.v, A1.v);
		__m256 mulB = _mm256_mul_ps(B0.v, B1.v);
		__m256 mulC = _mm256_mul_ps(C0.v, C1.v);
		__m256 mulD = _mm256_mul_ps(D0.v, D1.v);
		// All unpacks operate independently on each 128-bit lane.
		__m256 abLo = _mm256_unpacklo_ps(mulA, mulB); // A1 B1 A2 B2
		__m256 abHi = _mm256_unpackhi_ps(mulA, mulB); // A3 B3 A4 B4
		__m256 cdLo = _mm256_unpacklo_ps(mulC, mulD); // C1 D1 C2 D2
		__m256 cdHi = _mm256_unpackhi_ps(mulC, mulD); // C3 D3 C4 D4
		__m256 row1 = _mm256_unpacklo_ps(abLo, cdLo); // A1 C1 B1 D1
		__m256 row2 = _mm256_unpackhi_ps(abLo, cdLo); // A2 C2 B2 D2
		__m256 row3 = _mm256_unpacklo_ps(abHi, cdHi); // A3 C3 B3 D3
		__m256 row4 = _mm256_unpackhi_ps(abHi, cdHi); // A4 C4 B4 D4
		row1 = _mm256_add_ps(row1, row2);
		row1 = _mm256_add_ps(row1, row3);
		row1 = _mm256_add_ps(row1, row4);
		return row1;
#ifdef __AVX512F__
	} else {
		__m512 mulA = _mm512_mul_ps(A0.v, A1.v);
		__m512 mulB = _mm512_mul_ps(B0.v, B1.v);
		__m512 mulC = _mm512_mul_ps(C0.v, C1.v);
		__m512 mulD = _mm512_mul_ps(D0.v, D1.v);
		// All unpacks operate independently on each 128-bit lane.
		__m512 abLo = _mm512_unpacklo_ps(mulA, mulB); // A1 B1 A2 B2
		__m512 abHi = _mm512_unpackhi_ps(mulA, mulB); // A3 B3 A4 B4
		__m512 cdLo = _mm512_unpacklo_ps(mulC, mulD); // C1 D1 C2 D2
		__m512 cdHi = _mm512_unpackhi_ps(mulC, mulD); // C3 D3 C4 D4
		__m512 row1 = _mm512_unpacklo_ps(abLo, cdLo); // A1 C1 B1 D1
		__m512 row2 = _mm512_unpackhi_ps(abLo, cdLo); // A2 C2 B2 D2
		__m512 row3 = _mm512_unpacklo_ps(abHi, cdHi); // A3 C3 B3 D3
		__m512 row4 = _mm512_unpackhi_ps(abHi, cdHi); // A4 C4 B4 D4
		row1 = _mm512_add_ps(row1, row2);
		row1 = _mm512_add_ps(row1, row3);
		row1 = _mm512_add_ps(row1, row4);
		return row1;
#endif
	}
}
// Dot products of two pairs of registers holding packed 2-component vectors,
// without the final reordering by input.
// Within every 128-bit lane the input registers hold two vectors each
// ([A B] in the first pair, [C D] in the second) and the result is
// [A·A' B·B' C·C' D·D'] for that lane.
// The transpose uses the float-domain unpack intrinsics, which move lanes
// exactly like their epi32 counterparts but need no integer<->float register
// conversions.
constexpr static VectorF32<1, Packing*2> DotNoShuffle(
	VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
	VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1
) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
	if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
		__m128 mulA = _mm_mul_ps(A0.v, A1.v); // A1 A2 B1 B2
		__m128 mulC = _mm_mul_ps(C0.v, C1.v); // C1 C2 D1 D2
		__m128 acPairs = _mm_unpacklo_ps(mulA, mulC); // A1 C1 A2 C2
		__m128 bdPairs = _mm_unpackhi_ps(mulA, mulC); // B1 D1 B2 D2
		__m128 firsts = _mm_unpacklo_ps(acPairs, bdPairs);  // A1 B1 C1 D1
		__m128 seconds = _mm_unpackhi_ps(acPairs, bdPairs); // A2 B2 C2 D2
		return _mm_add_ps(firsts, seconds);
	} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
		__m256 mulA = _mm256_mul_ps(A0.v, A1.v);
		__m256 mulC = _mm256_mul_ps(C0.v, C1.v);
		// All unpacks operate independently on each 128-bit lane.
		__m256 acPairs = _mm256_unpacklo_ps(mulA, mulC); // A1 C1 A2 C2
		__m256 bdPairs = _mm256_unpackhi_ps(mulA, mulC); // B1 D1 B2 D2
		__m256 firsts = _mm256_unpacklo_ps(acPairs, bdPairs);  // A1 B1 C1 D1
		__m256 seconds = _mm256_unpackhi_ps(acPairs, bdPairs); // A2 B2 C2 D2
		return _mm256_add_ps(firsts, seconds);
#ifdef __AVX512F__
	} else {
		__m512 mulA = _mm512_mul_ps(A0.v, A1.v);
		__m512 mulC = _mm512_mul_ps(C0.v, C1.v);
		// All unpacks operate independently on each 128-bit lane.
		__m512 acPairs = _mm512_unpacklo_ps(mulA, mulC); // A1 C1 A2 C2
		__m512 bdPairs = _mm512_unpackhi_ps(mulA, mulC); // B1 D1 B2 D2
		__m512 firsts = _mm512_unpacklo_ps(acPairs, bdPairs);  // A1 B1 C1 D1
		__m512 seconds = _mm512_unpackhi_ps(acPairs, bdPairs); // A2 B2 C2 D2
		return _mm512_add_ps(firsts, seconds);
#endif
	}
}
public:
// Per-lane select between a and b, driven by a compile-time mask derived from
// ShuffleValues through GetBlendMaskEpi32. A set mask bit takes the lane from
// b, a clear bit takes it from a (semantics of the blend intrinsics).
// The float-domain blends are used directly: they interpret the mask exactly
// like the epi32 variants, so no casts to and from integer registers are needed.
template <std::array<bool, Len> ShuffleValues>
constexpr static VectorF32<Len, Packing> Blend(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b) {
	constexpr auto mask = VectorBase<Len, Packing, float>::template GetBlendMaskEpi32<ShuffleValues>();
	if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
		return _mm_blend_ps(a.v, b.v, mask);
	} else if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
		return _mm256_blend_ps(a.v, b.v, mask);
#ifdef __AVX512F__
	} else if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m512>) {
		return _mm512_mask_blend_ps(mask, a.v, b.v);
#endif
	}
}
// Rotates v by the quaternion q using the two-cross-product form:
//   t = 2 * (q.xyz × v);  result = v + q.w * t + q.xyz × t
constexpr static VectorF32<Len, Packing> Rotate(VectorF32<3, Packing> v, VectorF32<4, Packing> q) requires(Len == 3) {
	// Vector part of the quaternion, obtained by converting q to a 3-component vector.
	VectorF32<3, Packing> axis(q);
	VectorF32<Len, Packing> twiceCross = Cross(axis, v) * 2.0f;
	// Scalar part of the quaternion broadcast across the lanes.
	auto scalar = q.template Shuffle<{{3,3,3,3}}>();
	return v + twiceCross * scalar + Cross(axis, twiceCross);
}
// Rotates v by the quaternion q around `pivot`: v is translated so the pivot
// sits at the origin, rotated with the same two-cross-product form as Rotate
// (t = 2 * (q.xyz × v'), v' + q.w * t + q.xyz × t), and translated back.
// NOTE(review): the return type is hard-coded to VectorF32<4, 2>, while the
// returned expression is built from VectorF32<Len, Packing> values and the
// sibling Rotate returns VectorF32<Len, Packing> — confirm this is intended.
// NOTE(review): qv is constructed from the raw register q.v here, whereas
// Rotate constructs it from q itself — confirm both give the same lanes,
// in particular for Packing > 1.
constexpr static VectorF32<4, 2> RotatePivot(VectorF32<3, Packing> v, VectorF32<4, Packing> q, VectorF32<3, Packing> pivot) requires(Len == 3) {
// Move the pivot to the origin.
VectorF32<Len, Packing> translated = v - pivot;
VectorF32<3, Packing> qv(q.v);
VectorF32<Len, Packing> t = Cross(qv, translated) * float(2);
// Shuffle<{{3,3,3,3}}> broadcasts lane 3 of q (the scalar part).
VectorF32<Len, Packing> rotated = translated + t * q.template Shuffle<{{3,3,3,3}}>() + Cross(qv, t);
// Move back to the original frame.
return rotated + pivot;
}
// Builds a quaternion from half Euler angles.
// The result is (product of three sin/cos blends) + (product of two cos/sin
// blends) * (sign-flipped third cos/sin blend), evaluated lane-wise.
constexpr static VectorF32<4, Packing> QuanternionFromEuler(VectorF32<3, Packing> EulerHalf) requires(Len == 4) {
	std::tuple<VectorF32<3, Packing>, VectorF32<3, Packing>> halfSinCos = EulerHalf.SinCos();
	VectorF32<4, Packing> s = std::get<0>(halfSinCos);
	VectorF32<4, Packing> c = std::get<1>(halfSinCos);

	// First term: for each angle, broadcast its cosine and blend it with the sine vector.
	VectorF32<4, Packing> firstTerm = Blend<{{0,1,1,1}}>(s, c.template Shuffle<{{0,0,0,0}}>());
	firstTerm *= Blend<{{1,0,1,1}}>(s, c.template Shuffle<{{1,1,1,1}}>());
	firstTerm *= Blend<{{1,1,0,1}}>(s, c.template Shuffle<{{2,2,2,2}}>());

	// Second term: the same construction with sine and cosine exchanged.
	VectorF32<4, Packing> secondTerm = Blend<{{0,1,1,1}}>(c, s.template Shuffle<{{0,0,0,0}}>());
	secondTerm *= Blend<{{1,0,1,1}}>(c, s.template Shuffle<{{1,1,1,1}}>());
	VectorF32<4, Packing> signedFactor = Blend<{{1,1,0,1}}>(c, s.template Shuffle<{{2,2,2,2}}>());
	// Flip the sign of lanes 0 and 2 of the last factor.
	signedFactor = signedFactor.template Negate<{{true,false,true,false}}>();

	// secondTerm * signedFactor + firstTerm
	return MulitplyAdd(secondTerm, signedFactor, firstTerm);
}
};
#elif defined(__wasm_simd128__)
// WebAssembly SIMD128 implementation. VectorType is always v128_t and we
// cap Len*Packing*sizeof(float) at 16 bytes (i.e. up to 4 floats per
// vector) in Common.cppm so a single v128_t covers every instantiation.
// Operations without a direct SIMD equivalent (Shuffle with runtime indices,
// transcendentals, etc.) round-trip through a float[4] scratch buffer.
export template <std::uint8_t Len, std::uint8_t Packing>
struct VectorF32 : public VectorBase<Len, Packing, float> {
template <std::uint8_t Len2, std::uint8_t Packing2>
friend struct VectorF32;
using Base = VectorBase<Len, Packing, float>;
static constexpr std::uint8_t NElems = Base::AlignmentElement;
static_assert(NElems == 4, "WASM SIMD VectorF32 assumes 4-lane vectors");
// Default construction leaves the register as the base class initialises it.
constexpr VectorF32() = default;
// Wraps an existing v128_t register.
constexpr VectorF32(v128_t vv) { this->v = vv; }
// Loads from memory; see Load for how many bytes are read.
constexpr VectorF32(const float* vB) { Load(vB); }
// Broadcasts val into all four lanes.
constexpr VectorF32(float val) { this->v = wasm_f32x4_splat(val); }
// Reads a full 128-bit register (four floats) starting at vB, regardless of Len*Packing.
constexpr void Load(const float* vB) { this->v = wasm_v128_load(vB); }
// Writes the full 128-bit register (four floats) to vB, regardless of Len*Packing.
constexpr void Store(float* vB) const { wasm_v128_store(vB, this->v); }
// Returns the register contents as a std::array of NElems values.
// The array's data pointer is passed to Store(float*), so this only
// compiles when T* converts to float* (in practice T = float).
template<typename T>
constexpr std::array<T, NElems> Store() const {
std::array<T, NElems> r{};
Store(r.data());
return r;
}
// Converts to a vector with a different Len and/or Packing.
// For every pack present in both layouts, the components present in both are
// copied to the matching position of the target layout; all other target
// lanes are zero. The copy goes through a float[4] scratch buffer.
template <std::uint8_t BLen, std::uint8_t BPacking>
constexpr operator VectorF32<BLen, BPacking>() const {
	alignas(16) float source[4];
	wasm_v128_store(source, this->v);
	alignas(16) float target[4] = {0,0,0,0};
	const std::uint8_t sharedLen = std::min<std::uint8_t>(BLen, Len);
	const std::uint8_t sharedPack = std::min<std::uint8_t>(BPacking, Packing);
	for (std::uint8_t pack = 0; pack < sharedPack; ++pack) {
		for (std::uint8_t elem = 0; elem < sharedLen; ++elem) {
			// Source stride is Len, target stride is BLen.
			target[pack * BLen + elem] = source[pack * Len + elem];
		}
	}
	return VectorF32<BLen, BPacking>(wasm_v128_load(target));
}
// Lane-wise arithmetic against another vector; all four lanes of the register are processed.
constexpr VectorF32<Len, Packing> operator+(VectorF32<Len, Packing> b) const { return VectorF32<Len, Packing>(wasm_f32x4_add(this->v, b.v)); }
constexpr VectorF32<Len, Packing> operator-(VectorF32<Len, Packing> b) const { return VectorF32<Len, Packing>(wasm_f32x4_sub(this->v, b.v)); }
constexpr VectorF32<Len, Packing> operator*(VectorF32<Len, Packing> b) const { return VectorF32<Len, Packing>(wasm_f32x4_mul(this->v, b.v)); }
constexpr VectorF32<Len, Packing> operator/(VectorF32<Len, Packing> b) const { return VectorF32<Len, Packing>(wasm_f32x4_div(this->v, b.v)); }
// In-place variants of the above; they return void, so they cannot be chained.
constexpr void operator+=(VectorF32<Len, Packing> b) { this->v = wasm_f32x4_add(this->v, b.v); }
constexpr void operator-=(VectorF32<Len, Packing> b) { this->v = wasm_f32x4_sub(this->v, b.v); }
constexpr void operator*=(VectorF32<Len, Packing> b) { this->v = wasm_f32x4_mul(this->v, b.v); }
constexpr void operator/=(VectorF32<Len, Packing> b) { this->v = wasm_f32x4_div(this->v, b.v); }
// Scalar overloads: the scalar is broadcast to a vector and the vector overload is used.
constexpr VectorF32<Len, Packing> operator+(float b) const { return *this + VectorF32<Len, Packing>(b); }
constexpr VectorF32<Len, Packing> operator-(float b) const { return *this - VectorF32<Len, Packing>(b); }
constexpr VectorF32<Len, Packing> operator*(float b) const { return *this * VectorF32<Len, Packing>(b); }
constexpr VectorF32<Len, Packing> operator/(float b) const { return *this / VectorF32<Len, Packing>(b); }
constexpr void operator+=(float b) { *this += VectorF32<Len, Packing>(b); }
constexpr void operator-=(float b) { *this -= VectorF32<Len, Packing>(b); }
constexpr void operator*=(float b) { *this *= VectorF32<Len, Packing>(b); }
constexpr void operator/=(float b) { *this /= VectorF32<Len, Packing>(b); }
// Unary minus: negates every lane.
constexpr VectorF32<Len, Packing> operator-() const { return VectorF32<Len, Packing>(wasm_f32x4_neg(this->v)); }
// Equality requires all four lanes to compare equal (bitmask 0b1111).
// NOTE(review): this includes lanes beyond Len*Packing when the vector does not
// fill the register — confirm those lanes are always kept in a comparable state.
constexpr bool operator==(VectorF32<Len, Packing> b) const {
return wasm_i32x4_bitmask(wasm_f32x4_eq(this->v, b.v)) == 0b1111;
}
constexpr bool operator!=(VectorF32<Len, Packing> b) const { return !(*this == b); }
// Copies the first ExtractLen components of each packed vector into a vector
// with ExtractLen components per group. Lanes that are not written are zero.
template<std::uint32_t ExtractLen>
constexpr VectorF32<ExtractLen, Packing> ExtractLo() const {
    alignas(16) float lanes[4];
    wasm_v128_store(lanes, this->v);
    alignas(16) float packed[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    for (std::uint8_t group = 0; group < Packing; ++group) {
        const int srcBase = group * Len;
        const int dstBase = group * ExtractLen;
        for (std::uint8_t c = 0; c < ExtractLen; ++c) {
            packed[dstBase + c] = lanes[srcBase + c];
        }
    }
    return VectorF32<ExtractLen, Packing>(wasm_v128_load(packed));
}
// Cosine of every lane, computed with std::cos on a spilled copy of the
// register (all 4 lanes are processed).
constexpr VectorF32<Len, Packing> Cos() const {
    alignas(16) float lanes[4];
    wasm_v128_store(lanes, this->v);
    for (float& lane : lanes) {
        lane = std::cos(lane);
    }
    return VectorF32<Len, Packing>(wasm_v128_load(lanes));
}
// Sine of every lane, computed with std::sin on a spilled copy of the
// register (all 4 lanes are processed).
constexpr VectorF32<Len, Packing> Sin() const {
    alignas(16) float lanes[4];
    wasm_v128_store(lanes, this->v);
    for (float& lane : lanes) {
        lane = std::sin(lane);
    }
    return VectorF32<Len, Packing>(wasm_v128_load(lanes));
}
// Returns (sine, cosine) of this vector as a tuple, in that order.
constexpr std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>> SinCos() const {
    const VectorF32<Len, Packing> sine = Sin();
    const VectorF32<Len, Packing> cosine = Cos();
    return { sine, cosine };
}
// Flips the sign of component i in every packed vector when values[i] is true.
// Implemented by XOR-ing the float sign bit; lanes outside Len * Packing are
// left unchanged.
template <std::array<bool, Len> values>
constexpr VectorF32<Len, Packing> Negate() const {
    constexpr std::array<std::uint32_t, 4> signBits = []() {
        std::array<std::uint32_t, 4> bits{};
        // Lane index walks the packed groups back to back; lane % Len is the
        // component index within its group.
        for (std::uint8_t lane = 0; lane < Len * Packing; ++lane) {
            if (values[lane % Len]) {
                bits[lane] = 0x80000000u;
            } else {
                bits[lane] = 0u;
            }
        }
        return bits;
    }();
    const v128_t signMask = wasm_v128_load(signBits.data());
    const v128_t flipped = wasm_v128_xor(this->v, signMask);
    return VectorF32<Len, Packing>(flipped);
}
// Computes a * b + add per lane.
static constexpr VectorF32<Len, Packing> MulitplyAdd(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b, VectorF32<Len, Packing> add) {
#ifdef __wasm_relaxed_simd__
    // Relaxed madd: a single-rounded FMA where the host has FMA hardware;
    // otherwise the result is host-defined and may differ in accuracy from
    // the strict mul+add path below.
    const v128_t fused = wasm_f32x4_relaxed_madd(a.v, b.v, add.v);
    return VectorF32<Len, Packing>(fused);
#else
    const v128_t product = wasm_f32x4_mul(a.v, b.v);
    return VectorF32<Len, Packing>(wasm_f32x4_add(product, add.v));
#endif
}
// Computes a * b - sub per lane.
static constexpr VectorF32<Len, Packing> MulitplySub(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b, VectorF32<Len, Packing> sub) {
#ifdef __wasm_relaxed_simd__
    // Expressed as madd(a, b, -sub) so the multiply and subtract can fuse
    // into one rounding step instead of two.
    const v128_t negatedSub = wasm_f32x4_neg(sub.v);
    return VectorF32<Len, Packing>(wasm_f32x4_relaxed_madd(a.v, b.v, negatedSub));
#else
    const v128_t product = wasm_f32x4_mul(a.v, b.v);
    return VectorF32<Len, Packing>(wasm_f32x4_sub(product, sub.v));
#endif
}
// 3D cross product a x b, computed on lanes 0..2 as
// a.yzx * b.zxy - a.zxy * b.yzx. Lane 3 is carried through the shuffles
// unchanged, so its result is not a meaningful component.
constexpr static VectorF32<Len, Packing> Cross(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b) requires(Len == 3) {
    const v128_t aYzx = wasm_i32x4_shuffle(a.v, a.v, 1, 2, 0, 3);
    const v128_t bZxy = wasm_i32x4_shuffle(b.v, b.v, 2, 0, 1, 3);
    const v128_t aZxy = wasm_i32x4_shuffle(a.v, a.v, 2, 0, 1, 3);
    const v128_t bYzx = wasm_i32x4_shuffle(b.v, b.v, 1, 2, 0, 3);
    const v128_t positiveTerm = wasm_f32x4_mul(aYzx, bZxy);
#ifdef __wasm_relaxed_simd__
    // nmadd(x, y, z) = -(x * y) + z, so the second product and the
    // subtraction collapse into one fused operation.
    return VectorF32<Len, Packing>(wasm_f32x4_relaxed_nmadd(aZxy, bYzx, positiveTerm));
#else
    const v128_t negativeTerm = wasm_f32x4_mul(aZxy, bYzx);
    return VectorF32<Len, Packing>(wasm_f32x4_sub(positiveTerm, negativeTerm));
#endif
}
// Permutes the components inside every packed vector: output component i is
// input component ShuffleValues[i] of the same group. Lanes that are not
// written are zero.
template <const std::array<std::uint8_t, Len> ShuffleValues>
constexpr VectorF32<Len, Packing> Shuffle() const {
    alignas(16) float lanes[4];
    wasm_v128_store(lanes, this->v);
    alignas(16) float permuted[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    for (std::uint8_t group = 0; group < Packing; ++group) {
        const int base = group * Len;
        for (std::uint8_t c = 0; c < Len; ++c) {
            permuted[base + c] = lanes[base + ShuffleValues[c]];
        }
    }
    return VectorF32<Len, Packing>(wasm_v128_load(permuted));
}
// Per-component select between a and b: component i of every packed vector
// comes from b when ShuffleValues[i] is true and from a otherwise. Lanes
// outside Len * Packing come from a.
template <std::array<bool, Len> ShuffleValues>
constexpr static VectorF32<Len, Packing> Blend(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b) {
    constexpr std::array<std::uint32_t, 4> selectBits = []() {
        std::array<std::uint32_t, 4> bits{};
        // Lane index walks the packed groups back to back; lane % Len is the
        // component index within its group.
        for (std::uint8_t lane = 0; lane < Len * Packing; ++lane) {
            if (ShuffleValues[lane % Len]) {
                bits[lane] = 0xFFFFFFFFu;
            } else {
                bits[lane] = 0u;
            }
        }
        return bits;
    }();
    const v128_t selectMask = wasm_v128_load(selectBits.data());
    // bitselect takes bits of its first operand where the mask is set.
    const v128_t blended = wasm_v128_bitselect(b.v, a.v, selectMask);
    return VectorF32<Len, Packing>(blended);
}
// Squared length of every packed vector in a batch of BatchSize inputs.
// Results are written in input order, then packing order, into a
// VectorF32<1, Packing * BatchSize>; unused output lanes are zero.
template<typename... Rest>
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
constexpr static auto LengthSq(VectorF32<Len, Packing> first, Rest... rest) {
    constexpr std::uint8_t Count = VectorBase<Len, Packing, float>::BatchSize;
    const std::array<VectorF32<Len, Packing>, Count> inputs{ first, rest... };
    alignas(16) float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    // Next output lane; advances once per (input, group) pair, which is the
    // same order as input * Packing + group.
    std::uint8_t slot = 0;
    for (const VectorF32<Len, Packing>& input : inputs) {
        alignas(16) float lanes[4];
        wasm_v128_store(lanes, input.v);
        for (std::uint8_t group = 0; group < Packing; ++group) {
            float sum = 0.0f;
            for (std::uint8_t c = 0; c < Len; ++c) {
                const float component = lanes[group * Len + c];
                sum += component * component;
            }
            sums[slot] = sum;
            ++slot;
        }
    }
    VectorF32<1, static_cast<std::uint8_t>(Packing * Count)> result;
    result.v = wasm_v128_load(sums);
    return result;
}
// Length of every packed vector in a batch: the lane-wise square root of
// LengthSq over the same arguments.
template<typename... Rest>
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
constexpr static auto Length(VectorF32<Len, Packing> first, Rest... rest) {
    auto lengths = LengthSq(first, rest...);
    const v128_t roots = wasm_f32x4_sqrt(lengths.v);
    lengths.v = roots;
    return lengths;
}
// Pairwise dot products packed into one v128. Only the first Len
// lanes contribute, so the same routine handles 3- and 4-component
// inputs — the 4th lane of Len==3 inputs may be garbage from Cross()
// and must not be summed. Takes BatchSize pairs (== 4 here since
// WASM AlignmentElement is always 4 and Packing must be 1).
template<typename... Rest>
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
(1 + sizeof...(Rest) == 2 * VectorBase<Len, Packing, float>::BatchSize) &&
(Len == 3 || Len == 4) && Packing == 1)
constexpr static VectorF32<1, 4> Dot(VectorF32<Len, Packing> first, Rest... rest) {
    constexpr std::uint8_t PairCount = VectorBase<Len, Packing, float>::BatchSize;
    // Arguments arrive interleaved: (a0, b0, a1, b1, ...).
    const std::array<VectorF32<Len, Packing>, 2 * PairCount> operands{ first, rest... };
    alignas(16) float dots[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    for (std::uint8_t pair = 0; pair < PairCount; ++pair) {
        alignas(16) float lhs[4];
        alignas(16) float rhs[4];
        wasm_v128_store(lhs, operands[2 * pair].v);
        wasm_v128_store(rhs, operands[2 * pair + 1].v);
        // Sum only the Len real components of this pair.
        float sum = 0.0f;
        for (std::uint8_t c = 0; c < Len; ++c) {
            sum += lhs[c] * rhs[c];
        }
        dots[pair] = sum;
    }
    return VectorF32<1, 4>(wasm_v128_load(dots));
}
// Normalizes every packed vector of each input to unit length and returns the
// results as a std::array in argument order. A vector whose squared length is
// not greater than zero is scaled by 0. Lanes outside Len * Packing are zero.
template<typename... Rest>
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
constexpr static auto Normalize(VectorF32<Len, Packing> first, Rest... rest) {
    auto normalizeSingle = [](VectorF32<Len, Packing> input) {
        alignas(16) float lanes[4];
        wasm_v128_store(lanes, input.v);
        alignas(16) float scaled[4] = {0.0f, 0.0f, 0.0f, 0.0f};
        for (std::uint8_t group = 0; group < Packing; ++group) {
            const int base = group * Len;
            float lengthSq = 0.0f;
            for (std::uint8_t c = 0; c < Len; ++c) {
                const float component = lanes[base + c];
                lengthSq += component * component;
            }
            float inverseLength = 0.0f;
            if (lengthSq > 0.0f) {
                inverseLength = 1.0f / std::sqrt(lengthSq);
            }
            for (std::uint8_t c = 0; c < Len; ++c) {
                scaled[base + c] = lanes[base + c] * inverseLength;
            }
        }
        return VectorF32<Len, Packing>(wasm_v128_load(scaled));
    };
    return std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize>{ normalizeSingle(first), normalizeSingle(rest)... };
}
// Rotates v by the quaternion q, whose components are read as (x, y, z, w).
// With qv = q.xyz and t = 2 * (qv x v), the result is v + w * t + qv x t.
constexpr static VectorF32<Len, Packing> Rotate(VectorF32<3, Packing> v, VectorF32<4, Packing> q) requires(Len == 3) {
    alignas(16) float quat[4];
    wasm_v128_store(quat, q.v);
    alignas(16) float axis[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    alignas(16) float scalarPart[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    for (std::uint8_t group = 0; group < Packing; ++group) {
        const float w = quat[group * 4 + 3];
        for (std::uint8_t c = 0; c < 3; ++c) {
            axis[group * 3 + c] = quat[group * 4 + c];
            // w is repeated across the three component lanes of the group.
            scalarPart[group * 3 + c] = w;
        }
    }
    const VectorF32<3, Packing> qv(wasm_v128_load(axis));
    const VectorF32<3, Packing> qw(wasm_v128_load(scalarPart));
    const VectorF32<3, Packing> t = Cross(qv, v) * 2.0f;
    return v + t * qw + Cross(qv, t);
}
// Rotates v by q around pivot: shifts v so that pivot is the origin, rotates,
// then shifts back.
constexpr static VectorF32<3, Packing> RotatePivot(VectorF32<3, Packing> v, VectorF32<4, Packing> q, VectorF32<3, Packing> pivot) requires(Len == 3) {
    const VectorF32<3, Packing> rotated = Rotate(v - pivot, q);
    return rotated + pivot;
}
// Builds a quaternion (x, y, z, w) per packed group from the three angles in
// eulerHalf, read as (roll, pitch, yaw). The sines and cosines are taken of
// the values as given; the parameter name indicates the caller passes half
// angles.
constexpr static VectorF32<4, Packing> QuanternionFromEuler(VectorF32<3, Packing> eulerHalf) requires(Len == 4) {
    alignas(16) float angles[4];
    wasm_v128_store(angles, eulerHalf.v);
    alignas(16) float quat[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    for (std::uint8_t group = 0; group < Packing; ++group) {
        const float* in = &angles[group * 3];
        float* out = &quat[group * 4];
        const float sr = std::sin(in[0]);
        const float cr = std::cos(in[0]);
        const float sp = std::sin(in[1]);
        const float cp = std::cos(in[1]);
        const float sy = std::sin(in[2]);
        const float cy = std::cos(in[2]);
        out[0] = sr * cp * cy - cr * sp * sy;
        out[1] = cr * sp * cy + sr * cp * sy;
        out[2] = cr * cp * sy - sr * sp * cy;
        out[3] = cr * cp * cy + sr * sp * sy;
    }
    return VectorF32<4, Packing>(wasm_v128_load(quat));
}
};
#else
// Scalar software fallback for targets that have no intrinsic implementation
// above. Future arches can swap in their own intrinsic implementation by
// adding an arch-specific branch above and gating this one out.
//
// Element layout used by every method below: Packing consecutive groups of
// Len floats in the first Len * Packing elements of `v`; `v` holds NElems
// elements in total. Methods that only write the Len * Packing component
// slots start from a fully defined value, so no element of a result is left
// indeterminate for the whole-array operators (which walk all NElems) to read.
export template <std::uint8_t Len, std::uint8_t Packing>
struct VectorF32 : public VectorBase<Len, Packing, float> {
    template <std::uint8_t Len2, std::uint8_t Packing2>
    friend struct VectorF32;
    using Base = VectorBase<Len, Packing, float>;
    // Number of float elements stored in `v`.
    static constexpr std::uint8_t NElems = Base::AlignmentElement;

    constexpr VectorF32() = default;
    // Wraps an existing storage value.
    constexpr VectorF32(typename Base::VectorType vv) {
        this->v = vv;
    }
    // Loads NElems floats from vB.
    constexpr VectorF32(const float* vB) { Load(vB); }
#ifdef __FLT16_MAX__
    // Loads NElems half-precision values from vB, widening each to float.
    constexpr VectorF32(const _Float16* vB) { Load(vB); }
#endif
    // Sets every element to val.
    constexpr VectorF32(float val) {
        for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = val;
    }

    // Reads NElems floats from vB into this vector.
    constexpr void Load(const float* vB) {
        for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = vB[i];
    }
    // Writes all NElems elements to vB.
    constexpr void Store(float* vB) const {
        for (std::uint8_t i = 0; i < NElems; ++i) vB[i] = this->v[i];
    }
#ifdef __FLT16_MAX__
    // Reads NElems half-precision values from vB, widening each to float.
    constexpr void Load(const _Float16* vB) {
        for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = static_cast<float>(vB[i]);
    }
    // Writes all NElems elements to vB, narrowing each to half precision.
    constexpr void Store(_Float16* vB) const {
        for (std::uint8_t i = 0; i < NElems; ++i) vB[i] = static_cast<_Float16>(this->v[i]);
    }
#endif
    // Returns all NElems elements as an array of T; T selects which
    // Store(T*) overload performs the copy.
    template<typename T>
    constexpr std::array<T, NElems> Store() const {
        std::array<T, NElems> r{};
        Store(r.data());
        return r;
    }

    // Converts to a vector with a different Len/Packing. Components present
    // in both shapes are copied group by group; every other element of the
    // result is zero.
    template <std::uint8_t BLen, std::uint8_t BPacking>
    constexpr operator VectorF32<BLen, BPacking>() const {
        VectorF32<BLen, BPacking> r(0.0f);
        const std::uint8_t copyLen = (BLen < Len) ? BLen : Len;
        const std::uint8_t copyPack = (BPacking < Packing) ? BPacking : Packing;
        for (std::uint8_t p = 0; p < copyPack; ++p)
            for (std::uint8_t i = 0; i < copyLen; ++i)
                r.v[p * BLen + i] = this->v[p * Len + i];
        return r;
    }

    // Element-wise arithmetic over all NElems elements.
    constexpr VectorF32<Len, Packing> operator+(VectorF32<Len, Packing> b) const {
        VectorF32<Len, Packing> r;
        for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] + b.v[i];
        return r;
    }
    constexpr VectorF32<Len, Packing> operator-(VectorF32<Len, Packing> b) const {
        VectorF32<Len, Packing> r;
        for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] - b.v[i];
        return r;
    }
    constexpr VectorF32<Len, Packing> operator*(VectorF32<Len, Packing> b) const {
        VectorF32<Len, Packing> r;
        for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] * b.v[i];
        return r;
    }
    constexpr VectorF32<Len, Packing> operator/(VectorF32<Len, Packing> b) const {
        VectorF32<Len, Packing> r;
        for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] / b.v[i];
        return r;
    }

    // In-place element-wise arithmetic over all NElems elements.
    constexpr void operator+=(VectorF32<Len, Packing> b) { for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] += b.v[i]; }
    constexpr void operator-=(VectorF32<Len, Packing> b) { for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] -= b.v[i]; }
    constexpr void operator*=(VectorF32<Len, Packing> b) { for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] *= b.v[i]; }
    constexpr void operator/=(VectorF32<Len, Packing> b) { for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] /= b.v[i]; }

    // Scalar overloads: broadcast b to every element, then defer to the
    // vector overloads.
    constexpr VectorF32<Len, Packing> operator+(float b) const { return *this + VectorF32<Len, Packing>(b); }
    constexpr VectorF32<Len, Packing> operator-(float b) const { return *this - VectorF32<Len, Packing>(b); }
    constexpr VectorF32<Len, Packing> operator*(float b) const { return *this * VectorF32<Len, Packing>(b); }
    constexpr VectorF32<Len, Packing> operator/(float b) const { return *this / VectorF32<Len, Packing>(b); }
    constexpr void operator+=(float b) { *this += VectorF32<Len, Packing>(b); }
    constexpr void operator-=(float b) { *this -= VectorF32<Len, Packing>(b); }
    constexpr void operator*=(float b) { *this *= VectorF32<Len, Packing>(b); }
    constexpr void operator/=(float b) { *this /= VectorF32<Len, Packing>(b); }

    // Unary minus over all NElems elements.
    constexpr VectorF32<Len, Packing> operator-() const {
        VectorF32<Len, Packing> r;
        for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = -this->v[i];
        return r;
    }

    // Equality over the Len * Packing component slots only; any further
    // elements of `v` are ignored.
    constexpr bool operator==(VectorF32<Len, Packing> b) const {
        for (std::uint8_t p = 0; p < Packing; ++p)
            for (std::uint8_t i = 0; i < Len; ++i)
                if (this->v[p * Len + i] != b.v[p * Len + i]) return false;
        return true;
    }
    constexpr bool operator!=(VectorF32<Len, Packing> b) const { return !(*this == b); }

    // Copies the first ExtractLen components of each packed vector into a
    // vector with ExtractLen components per group; other elements are zero.
    template<std::uint32_t ExtractLen>
    constexpr VectorF32<ExtractLen, Packing> ExtractLo() const {
        VectorF32<ExtractLen, Packing> r(0.0f);
        for (std::uint8_t p = 0; p < Packing; ++p)
            for (std::uint8_t i = 0; i < ExtractLen; ++i)
                r.v[p * ExtractLen + i] = this->v[p * Len + i];
        return r;
    }

    // Cosine / sine of every element (all NElems).
    constexpr VectorF32<Len, Packing> Cos() const {
        VectorF32<Len, Packing> r;
        for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = std::cos(this->v[i]);
        return r;
    }
    constexpr VectorF32<Len, Packing> Sin() const {
        VectorF32<Len, Packing> r;
        for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = std::sin(this->v[i]);
        return r;
    }
    // Returns (sine, cosine), in that order.
    constexpr std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>> SinCos() const {
        return { Sin(), Cos() };
    }

    // Negates component i of every packed vector when values[i] is true.
    // Starts from a copy of *this so every other element is carried over
    // unchanged.
    template <std::array<bool, Len> values>
    constexpr VectorF32<Len, Packing> Negate() const {
        VectorF32<Len, Packing> r = *this;
        for (std::uint8_t p = 0; p < Packing; ++p)
            for (std::uint8_t i = 0; i < Len; ++i)
                if (values[i]) r.v[p * Len + i] = -this->v[p * Len + i];
        return r;
    }

    // a * b + add, element-wise over all NElems elements (two roundings).
    static constexpr VectorF32<Len, Packing> MulitplyAdd(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b, VectorF32<Len, Packing> add) {
        VectorF32<Len, Packing> r;
        for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = a.v[i] * b.v[i] + add.v[i];
        return r;
    }
    // a * b - sub, element-wise over all NElems elements (two roundings).
    static constexpr VectorF32<Len, Packing> MulitplySub(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b, VectorF32<Len, Packing> sub) {
        VectorF32<Len, Packing> r;
        for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = a.v[i] * b.v[i] - sub.v[i];
        return r;
    }

    // 3D cross product a x b for every packed vector; other elements are zero.
    constexpr static VectorF32<Len, Packing> Cross(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b) requires(Len == 3) {
        VectorF32<Len, Packing> r(0.0f);
        for (std::uint8_t p = 0; p < Packing; ++p) {
            const std::uint8_t base = p * 3;
            r.v[base + 0] = a.v[base + 1] * b.v[base + 2] - a.v[base + 2] * b.v[base + 1];
            r.v[base + 1] = a.v[base + 2] * b.v[base + 0] - a.v[base + 0] * b.v[base + 2];
            r.v[base + 2] = a.v[base + 0] * b.v[base + 1] - a.v[base + 1] * b.v[base + 0];
        }
        return r;
    }

    // Permutes components inside every packed vector: output component i is
    // input component ShuffleValues[i] of the same group; other elements are
    // zero.
    template <const std::array<std::uint8_t, Len> ShuffleValues>
    constexpr VectorF32<Len, Packing> Shuffle() const {
        VectorF32<Len, Packing> r(0.0f);
        for (std::uint8_t p = 0; p < Packing; ++p)
            for (std::uint8_t i = 0; i < Len; ++i)
                r.v[p * Len + i] = this->v[p * Len + ShuffleValues[i]];
        return r;
    }

    // Per-component select: component i of every packed vector comes from b
    // when ShuffleValues[i] is true and from a otherwise. Starts from a copy
    // of a, so every element not taken from b equals a's.
    template <std::array<bool, Len> ShuffleValues>
    constexpr static VectorF32<Len, Packing> Blend(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b) {
        VectorF32<Len, Packing> r = a;
        for (std::uint8_t p = 0; p < Packing; ++p)
            for (std::uint8_t i = 0; i < Len; ++i)
                if (ShuffleValues[i]) r.v[p * Len + i] = b.v[p * Len + i];
        return r;
    }

    // Squared length of every packed vector in a batch of BatchSize inputs,
    // written in input order then packing order into a
    // VectorF32<1, Packing * BatchSize>; unwritten elements are zero.
    template<typename... Rest>
    requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
    (1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
    constexpr static auto LengthSq(VectorF32<Len, Packing> first, Rest... rest) {
        constexpr std::uint8_t N = VectorBase<Len, Packing, float>::BatchSize;
        VectorF32<1, static_cast<std::uint8_t>(Packing * N)> r(0.0f);
        std::array<VectorF32<Len, Packing>, N> args{ first, rest... };
        for (std::uint8_t i = 0; i < N; ++i)
            for (std::uint8_t p = 0; p < Packing; ++p) {
                float acc = 0.0f;
                for (std::uint8_t k = 0; k < Len; ++k) {
                    float x = args[i].v[p * Len + k];
                    acc += x * x;
                }
                r.v[i * Packing + p] = acc;
            }
        return r;
    }

    // Length of every packed vector in a batch: square root of each element
    // of the LengthSq result.
    template<typename... Rest>
    requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
    (1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
    constexpr static auto Length(VectorF32<Len, Packing> first, Rest... rest) {
        auto sq = LengthSq(first, rest...);
        for (std::uint8_t i = 0; i < decltype(sq)::NElems; ++i) sq.v[i] = std::sqrt(sq.v[i]);
        return sq;
    }

    // Normalizes every packed vector of each input and returns the results as
    // a std::array in argument order. A vector whose squared length is not
    // greater than zero is scaled by 0; elements outside the component slots
    // are zero.
    template<typename... Rest>
    requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
    (1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
    constexpr static auto Normalize(VectorF32<Len, Packing> first, Rest... rest) {
        auto normOne = [](VectorF32<Len, Packing> u) {
            VectorF32<Len, Packing> out(0.0f);
            for (std::uint8_t p = 0; p < Packing; ++p) {
                float acc = 0.0f;
                for (std::uint8_t k = 0; k < Len; ++k) {
                    float x = u.v[p * Len + k];
                    acc += x * x;
                }
                float invLen = acc > 0.0f ? 1.0f / std::sqrt(acc) : 0.0f;
                for (std::uint8_t k = 0; k < Len; ++k)
                    out.v[p * Len + k] = u.v[p * Len + k] * invLen;
            }
            return out;
        };
        return std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize>{ normOne(first), normOne(rest)... };
    }

    // Rotates v by the quaternion q, whose components are read as
    // (x, y, z, w). With qv = q.xyz and t = 2 * (qv x v), the result is
    // v + w * t + qv x t.
    constexpr static VectorF32<Len, Packing> Rotate(VectorF32<3, Packing> v, VectorF32<4, Packing> q) requires(Len == 3) {
        // Both helpers are zero-filled first: the operators below walk every
        // element, not just the component slots written here.
        VectorF32<3, Packing> qv(0.0f);
        VectorF32<3, Packing> qwBroadcast(0.0f);
        for (std::uint8_t p = 0; p < Packing; ++p) {
            qv.v[p * 3 + 0] = q.v[p * 4 + 0];
            qv.v[p * 3 + 1] = q.v[p * 4 + 1];
            qv.v[p * 3 + 2] = q.v[p * 4 + 2];
            for (std::uint8_t i = 0; i < 3; ++i) qwBroadcast.v[p * 3 + i] = q.v[p * 4 + 3];
        }
        VectorF32<3, Packing> t = Cross(qv, v) * 2.0f;
        return v + t * qwBroadcast + Cross(qv, t);
    }

    // Rotates v by q around pivot: translate so pivot is the origin, rotate,
    // translate back.
    constexpr static VectorF32<3, Packing> RotatePivot(VectorF32<3, Packing> v, VectorF32<4, Packing> q, VectorF32<3, Packing> pivot) requires(Len == 3) {
        VectorF32<3, Packing> translated = v - pivot;
        return Rotate(translated, q) + pivot;
    }

    // Builds a quaternion (x, y, z, w) per packed group from the three angles
    // in eulerHalf, read as (roll, pitch, yaw). Sines and cosines are taken
    // of the values as given; the parameter name indicates the caller passes
    // half angles. Elements outside the component slots are zero.
    constexpr static VectorF32<4, Packing> QuanternionFromEuler(VectorF32<3, Packing> eulerHalf) requires(Len == 4) {
        VectorF32<4, Packing> r(0.0f);
        for (std::uint8_t p = 0; p < Packing; ++p) {
            float roll = eulerHalf.v[p * 3 + 0];
            float pitch = eulerHalf.v[p * 3 + 1];
            float yaw = eulerHalf.v[p * 3 + 2];
            float sr = std::sin(roll), cr = std::cos(roll);
            float sp = std::sin(pitch), cp = std::cos(pitch);
            float sy = std::sin(yaw), cy = std::cos(yaw);
            r.v[p * 4 + 0] = sr * cp * cy - cr * sp * sy;
            r.v[p * 4 + 1] = cr * sp * cy + sr * cp * sy;
            r.v[p * 4 + 2] = cr * cp * sy - sr * sp * cy;
            r.v[p * 4 + 3] = cr * cp * cy + sr * sp * sy;
        }
        return r;
    }
};
#endif
}
// std::format support for Crafter::VectorF32.
//
// The template parameters are std::uint8_t to match VectorF32's own non-type
// parameters exactly: partial-specialization deduction requires the types to
// agree, so a wider parameter type would keep this specialization from ever
// being selected.
//
// Output shape: "{" + one "{c0,c1,...}" group per packed vector + "}", with
// components separated by "," and no separator between groups. Only the
// Len * Packing component slots are printed.
export template <std::uint8_t Len, std::uint8_t Packing>
struct std::formatter<Crafter::VectorF32<Len, Packing>> : std::formatter<std::string> {
    constexpr auto format(const Crafter::VectorF32<Len, Packing>& obj, format_context& ctx) const {
        // Spill every stored element once; indexing below picks the components.
        const std::array<float, Crafter::VectorF32<Len, Packing>::AlignmentElement> vec = obj.template Store<float>();
        std::string out = "{";
        for (std::uint32_t group = 0; group < Packing; group++) {
            out += "{";
            for (std::uint32_t component = 0; component < Len; component++) {
                out += std::format("{}", vec[group * Len + component]);
                if (component + 1 < Len) out += ",";
            }
            out += "}";
        }
        out += "}";
        // Delegate to the string formatter so width/fill/alignment specs apply.
        return std::formatter<std::string>::format(out, ctx);
    }
};