This commit is contained in:
Jorijn van der Graaf 2026-03-31 14:22:18 +02:00
commit a16f8ffbde
7 changed files with 251 additions and 133 deletions

View file

@ -20,7 +20,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
export module Crafter.Math:Basic; export module Crafter.Math:Basic;
import std; import std;
import :VectorF16; import :VectorF32;
namespace Crafter { namespace Crafter {
template<typename T> template<typename T>

View file

@ -6,15 +6,19 @@ export module Crafter.Math:Common;
import std; import std;
namespace Crafter { namespace Crafter {
#ifdef __AVX512FP16__
export template <std::uint8_t Len, std::uint8_t Packing> export template <std::uint8_t Len, std::uint8_t Packing>
struct VectorF16; struct VectorF16;
#endif
export template <std::uint8_t Len, std::uint8_t Packing> export template <std::uint8_t Len, std::uint8_t Packing>
struct VectorF32; struct VectorF32;
template <std::uint8_t Len, std::uint8_t Packing, typename T> template <std::uint8_t Len, std::uint8_t Packing, typename T>
struct VectorBase { struct VectorBase {
#ifdef __AVX512FP16__
template <std::uint8_t L, std::uint8_t P> template <std::uint8_t L, std::uint8_t P>
friend struct VectorF16; friend struct VectorF16;
#endif
template <std::uint8_t L, std::uint8_t P> template <std::uint8_t L, std::uint8_t P>
friend struct VectorF32; friend struct VectorF32;
protected: protected:

View file

@ -23,13 +23,36 @@ import :MatrixRowMajor;
import std; import std;
namespace Crafter { namespace Crafter {
export template<typename T> export
constexpr T IntersectionTestRayTriangle(Vector<T, 3, 0> vert0, Vector<T, 3, 0> vert1, Vector<T, 3, 0> vert2, Vector<T, 3, 0> rayOrigin, Vector<T, 3, 0> rayDir) { constexpr std::array<float, 15> IntersectionTestRayTriangle(
Vector<T, 3, 0> edge1 = vert1 - vert0; VectorF32<3, 5> vertA0,
Vector<T, 3, 0> edge2 = vert2 - vert0; VectorF32<3, 5> vertA1,
VectorF32<3, 5> vertA2,
VectorF32<3, 5> vertB0,
VectorF32<3, 5> vertB1,
VectorF32<3, 5> vertB2,
VectorF32<3, 5> vertC0,
VectorF32<3, 5> vertC1,
VectorF32<3, 5> vertC2,
VectorF32<3, 5> rayOrigin,
VectorF32<3, 5> rayDir
) {
VectorF32<3, Packing> edgeA1 = vertA1 - vertA0;
VectorF32<3, Packing> edgeA2 = vertA2 - vertA0;
VectorF32<3, Packing> crossA = VectorF32<3, Packing> ::Cross(rayDir, edgeA2);
VectorF32<3, Packing> edgeB1 = vertB1 - vertB0;
VectorF32<3, Packing> edgeB2 = vertB2 - vertB0;
VectorF32<3, Packing> crossB = VectorF32<3, Packing> ::Cross(rayDir, edgeB2);
VectorF32<3, Packing> edgeC1 = vertC1 - vertC0;
VectorF32<3, Packing> edgeC2 = vertC2 - vertC0;
VectorF32<3, Packing> crossC = VectorF32<3, Packing> ::Cross(rayDir, edgeC2);
Vector<T, 3, 0> h = Vector<T, 3, 0>::Cross(rayDir, edge2);
T determinant = Vector<T, 3, 0>::Dot(edge1, h);
if (determinant <= std::numeric_limits<T>::epsilon()) { if (determinant <= std::numeric_limits<T>::epsilon()) {
return std::numeric_limits<T>::max(); return std::numeric_limits<T>::max();

View file

@ -66,6 +66,7 @@ namespace Crafter {
} }
} }
template<typename T>
constexpr std::array<_Float16, VectorBase<Len, Packing, _Float16>::AlignmentElement> Store() const { constexpr std::array<_Float16, VectorBase<Len, Packing, _Float16>::AlignmentElement> Store() const {
std::array<_Float16, VectorBase<Len, Packing, _Float16>::AlignmentElement> returnArray; std::array<_Float16, VectorBase<Len, Packing, _Float16>::AlignmentElement> returnArray;
Store(returnArray.data()); Store(returnArray.data());
@ -1029,7 +1030,7 @@ namespace Crafter {
export template <std::uint32_t Len, std::uint32_t Packing> export template <std::uint32_t Len, std::uint32_t Packing>
struct std::formatter<Crafter::VectorF16<Len, Packing>> : std::formatter<std::string> { struct std::formatter<Crafter::VectorF16<Len, Packing>> : std::formatter<std::string> {
constexpr auto format(const Crafter::VectorF16<Len, Packing>& obj, format_context& ctx) const { constexpr auto format(const Crafter::VectorF16<Len, Packing>& obj, format_context& ctx) const {
std::array<_Float16, Crafter::VectorF16<Len, Packing>::AlignmentElement> vec = obj.Store(); std::array<_Float16, Crafter::VectorF16<Len, Packing>::AlignmentElement> vec = obj.template Store<float>();
std::string out = "{"; std::string out = "{";
for(std::uint32_t i = 0; i < Packing; i++) { for(std::uint32_t i = 0; i < Packing; i++) {
out += "{"; out += "{";

View file

@ -24,7 +24,6 @@ export module Crafter.Math:VectorF32;
import std; import std;
import :Common; import :Common;
#ifdef __AVX512FP16__
namespace Crafter { namespace Crafter {
export template <std::uint8_t Len, std::uint8_t Packing> export template <std::uint8_t Len, std::uint8_t Packing>
struct VectorF32 : public VectorBase<Len, Packing, float> { struct VectorF32 : public VectorBase<Len, Packing, float> {
@ -38,6 +37,9 @@ namespace Crafter {
constexpr VectorF32(const float* vB) { constexpr VectorF32(const float* vB) {
Load(vB); Load(vB);
}; };
constexpr VectorF32(const _Float16* vB) {
Load(vB);
};
constexpr VectorF32(float val) { constexpr VectorF32(float val) {
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) { if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
this->v = _mm_set1_ps(val); this->v = _mm_set1_ps(val);
@ -66,8 +68,55 @@ namespace Crafter {
} }
} }
constexpr std::array<float, VectorBase<Len, Packing, float>::AlignmentElement> Store() const { constexpr void Load(const _Float16* vB) {
std::array<float, VectorBase<Len, Packing, float>::AlignmentElement> returnArray; #ifdef __F16C__
if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
this->v = _mm_cvtph_ps(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(vB)));
} else if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
this->v = _mm256_cvtph_ps(_mm_loadu_si128(reinterpret_cast<const __m128i*>(vB)));
} else {
this->v = _mm512_cvtph_ps(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(vB)));
}
#else
alignas(64) float tmp[Len];
for (int i = 0; i < Len; ++i)
tmp[i] = static_cast<float>(vB[i]);
if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
this->v = _mm_load_ps(tmp);
} else if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
this->v = _mm256_load_ps(tmp);
} else {
this->v = _mm512_load_ps(tmp);
}
#endif
}
/// Converts the lanes of this vector to half precision and writes them to vB.
///
/// Both build configurations write one _Float16 per lane of the underlying
/// register (4 for __m128, 8 for __m256, 16 for __m512). That is the element
/// count the array-returning Store<T>() overload allocates
/// (VectorBase<Len, Packing, float>::AlignmentElement), so vB must have room
/// for that many elements.
///
/// @param vB Destination buffer for the converted half-precision values.
constexpr void Store(_Float16* vB) const {
#ifdef __F16C__
    // Hardware conversion: narrow the whole register, then store the packed halves.
    if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
        _mm_storel_epi64(reinterpret_cast<__m128i*>(vB), _mm_cvtps_ph(this->v, _MM_FROUND_TO_NEAREST_INT));
    } else if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
        _mm_storeu_si128(reinterpret_cast<__m128i*>(vB), _mm256_cvtps_ph(this->v, _MM_FROUND_TO_NEAREST_INT));
    } else {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(vB), _mm512_cvtps_ph(this->v, _MM_FROUND_TO_NEAREST_INT));
    }
#else
    // Scalar fallback. The staging buffer must hold the full register: the
    // store intrinsics below always write every lane, so sizing it by Len
    // (e.g. 3) would overflow the stack buffer and would also drop the lanes
    // past Len that the F16C path writes.
    constexpr std::size_t laneCount = VectorBase<Len, Packing, float>::AlignmentElement;
    alignas(64) float tmp[laneCount];
    if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
        _mm_store_ps(tmp, this->v);
    } else if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
        _mm256_store_ps(tmp, this->v);
    } else {
        _mm512_store_ps(tmp, this->v);
    }
    for (std::size_t i = 0; i < laneCount; ++i) {
        vB[i] = static_cast<_Float16>(tmp[i]);
    }
#endif
}
template<typename T>
constexpr std::array<T, VectorBase<Len, Packing, float>::AlignmentElement> Store() const {
std::array<T, VectorBase<Len, Packing, float>::AlignmentElement> returnArray;
Store(returnArray.data()); Store(returnArray.data());
return returnArray; return returnArray;
} }
@ -96,36 +145,41 @@ namespace Crafter {
if constexpr(std::is_same_v<typename VectorBase<BLen, BPacking, float>::VectorType, __m128>) { if constexpr(std::is_same_v<typename VectorBase<BLen, BPacking, float>::VectorType, __m128>) {
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) { if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
constexpr std::array<std::uint8_t, VectorBase<Len, Packing, float>::Alignment> shuffleMask = VectorBase<Len, Packing, float>::template GetExtractLoMaskEpi8<BLen>(); constexpr std::array<std::uint8_t, VectorBase<Len, Packing, float>::Alignment> shuffleMask = VectorBase<Len, Packing, float>::template GetExtractLoMaskEpi8<BLen>();
__m128i shuffleVec = _mm_loadu_epi8(shuffleMask.data()); __m128i shuffleVec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(shuffleMask.data()));
return VectorF32<BLen, BPacking>(_mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(this->v), shuffleVec))); return VectorF32<BLen, BPacking>(_mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(this->v), shuffleVec)));
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) { } else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
constexpr std::array<std::uint32_t, VectorBase<Len, Packing, float>::AlignmentElement> permMask =VectorBase<Len, Packing, float>::template GetExtractLoMaskepi32<BLen>(); constexpr std::array<std::uint32_t, VectorBase<Len, Packing, float>::AlignmentElement> permMask =VectorBase<Len, Packing, float>::template GetExtractLoMaskepi32<BLen>();
__m256i permIdx = _mm256_loadu_epi32(permMask.data()); __m256i permIdx = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(permMask.data()));
__m256i result = _mm256_permutexvar_epi32(permIdx, _mm_castps_si256(this->v)); __m256i result = _mm256_permutexvar_epi32(permIdx, _mm_castps_si256(this->v));
return VectorF32<BLen, BPacking>(_mm_castsi128_ps(_mm256_castsi256_si128(result))); return VectorF32<BLen, BPacking>(_mm_castsi128_ps(_mm256_castsi256_si128(result)));
#ifdef __AVX512F__
} else { } else {
constexpr std::array<std::uint32_t, VectorBase<Len, Packing, float>::AlignmentElement> permMask = VectorBase<Len, Packing, float>::template GetExtractLoMaskEpi32<BLen>(); constexpr std::array<std::uint32_t, VectorBase<Len, Packing, float>::AlignmentElement> permMask = VectorBase<Len, Packing, float>::template GetExtractLoMaskEpi32<BLen>();
__m512i permIdx = _mm512_loadu_epi32(permMask.data()); __m512i permIdx = _mm512_loadu_epi32(permMask.data());
__m512i result = _mm512_permutexvar_epi32(permIdx, _mm512_castps_si512(this->v)); __m512i result = _mm512_permutexvar_epi32(permIdx, _mm512_castps_si512(this->v));
return VectorF32<BLen, BPacking>(_mm_castsi128_ps(_mm512_castsi512_si128(result))); return VectorF32<BLen, BPacking>(_mm_castsi128_ps(_mm512_castsi512_si128(result)));
#endif
} }
} else if constexpr(std::is_same_v<typename VectorBase<BLen, BPacking, float>::VectorType, __m256>) { } else if constexpr(std::is_same_v<typename VectorBase<BLen, BPacking, float>::VectorType, __m256>) {
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) { if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
constexpr std::array<std::uint32_t, VectorBase<BLen, Packing, float>::AlignmentElement> permMask = VectorBase<BLen, Packing, float>::template GetExtractLoMaskEpi32<BLen>(); constexpr std::array<std::uint32_t, VectorBase<BLen, Packing, float>::AlignmentElement> permMask = VectorBase<BLen, Packing, float>::template GetExtractLoMaskEpi32<BLen>();
__m256i permIdx = _mm256_loadu_epi32(permMask.data()); __m256i permIdx = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(permMask.data()));
__m256i result = _mm256_permutexvar_epi32(permIdx, _mm256_castsi128_si256(_mm_castps_si128(this->v))); __m256i result = _mm256_permutexvar_epi32(permIdx, _mm256_castsi128_si256(_mm_castps_si128(this->v)));
return VectorF32<BLen, BPacking>(_mm256_castsi256_ps(result)); return VectorF32<BLen, BPacking>(_mm256_castsi256_ps(result));
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) { } else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
constexpr std::array<std::uint32_t, VectorBase<BLen, Packing, float>::AlignmentElement> permMask = VectorBase<BLen, Packing, float>::template GetExtractLoMaskEpi32<BLen>(); constexpr std::array<std::uint32_t, VectorBase<BLen, Packing, float>::AlignmentElement> permMask = VectorBase<BLen, Packing, float>::template GetExtractLoMaskEpi32<BLen>();
__m256i permIdx = _mm256_loadu_epi32(permMask.data()); __m256i permIdx = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(permMask.data()));
__m256i result = _mm256_permutexvar_epi32(permIdx, _mm256_castps_si256(this->v)); __m256i result = _mm256_permutexvar_epi32(permIdx, _mm256_castps_si256(this->v));
return VectorF32<BLen, BPacking>(_mm256_castsi256_ps(result)); return VectorF32<BLen, BPacking>(_mm256_castsi256_ps(result));
#ifdef __AVX512F__
} else { } else {
constexpr std::array<std::uint32_t, VectorBase<BLen, Packing, float>::AlignmentElement> permMask = VectorBase<BLen, Packing, float>::template GetExtractLoMaskEpi32<BLen>(); constexpr std::array<std::uint32_t, VectorBase<BLen, Packing, float>::AlignmentElement> permMask = VectorBase<BLen, Packing, float>::template GetExtractLoMaskEpi32<BLen>();
__m256i permIdx = _mm512_loadu_epi32(permMask.data()); __m256i permIdx = _mm512_loadu_epi32(permMask.data());
__m256i result = _mm512_permutexvar_epi32(permIdx, _mm512_castsi512_si256(_mm512_castps_si512(this->v))); __m256i result = _mm512_permutexvar_epi32(permIdx, _mm512_castsi512_si256(_mm512_castps_si512(this->v)));
return VectorF32<BLen, BPacking>(_mm256_castsi256_ps(result)); return VectorF32<BLen, BPacking>(_mm256_castsi256_ps(result));
#endif
} }
#ifdef __AVX512F__
} else { } else {
if constexpr(std::is_same_v<typename VectorBase<BLen, BPacking, float>::VectorType, __m128>) { if constexpr(std::is_same_v<typename VectorBase<BLen, BPacking, float>::VectorType, __m128>) {
constexpr std::array<std::uint32_t, VectorBase<BLen, Packing, float>::AlignmentElement> permMask = VectorBase<BLen, Packing, float>::template GetExtractLoMaskEpi32<BLen>(); constexpr std::array<std::uint32_t, VectorBase<BLen, Packing, float>::AlignmentElement> permMask = VectorBase<BLen, Packing, float>::template GetExtractLoMaskEpi32<BLen>();
@ -143,6 +197,7 @@ namespace Crafter {
__m512i result = _mm512_permutexvar_epi32(permIdx, _mm512_castps_si512(this->v)); __m512i result = _mm512_permutexvar_epi32(permIdx, _mm512_castps_si512(this->v));
return VectorF32<BLen, BPacking>(_mm512_castsi512_ps(result)); return VectorF32<BLen, BPacking>(_mm512_castsi512_ps(result));
} }
#endif
} }
} }
} }
@ -274,22 +329,24 @@ namespace Crafter {
constexpr bool operator==(VectorF32<Len, Packing> b) const { constexpr bool operator==(VectorF32<Len, Packing> b) const {
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) { if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
return _mm_cmp_ps_mask(this->v, b.v, _CMP_EQ_OQ) == 15; #ifdef __AVX512VL__
return _mm_cmp_ps_mask(this->v, b.v, _CMP_EQ_OQ) == 0xF;
#else
return _mm_movemask_ps(_mm_cmpeq_ps(this->v, b.v)) == 0xF;
#endif
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) { } else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
return _mm256_cmp_ps_mask(this->v, b.v, _CMP_EQ_OQ) == 255; #ifdef __AVX512VL__
return _mm256_cmp_ps_mask(this->v, b.v, _CMP_EQ_OQ) == 0xFF;
#else
return _mm256_movemask_ps(_mm256_cmp_ps(this->v, b.v, _CMP_EQ_OQ)) == 0xFF;
#endif
} else { } else {
return _mm512_cmp_ps_mask(this->v, b.v, _CMP_EQ_OQ) == 65535; return _mm512_cmp_ps_mask(this->v, b.v, _CMP_EQ_OQ) == 0xFFFF;
} }
} }
constexpr bool operator!=(VectorF32<Len, Packing> b) const { constexpr bool operator!=(VectorF32<Len, Packing> b) const {
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) { return !(*this == b);
return _mm_cmp_ps_mask(this->v, b.v, _CMP_EQ_OQ) != 15;
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
return _mm256_cmp_ps_mask(this->v, b.v, _CMP_EQ_OQ) != 255;
} else {
return _mm512_cmp_ps_mask(this->v, b.v, _CMP_EQ_OQ) != 65535;
}
} }
template<std::uint32_t ExtractLen> template<std::uint32_t ExtractLen>
@ -301,7 +358,7 @@ namespace Crafter {
return VectorF32<ExtractLen, Packing>(_mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(this->v), shuffleVec))); return VectorF32<ExtractLen, Packing>(_mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(this->v), shuffleVec)));
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) { } else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
constexpr std::array<std::uint32_t, VectorBase<Len, Packing, float>::AlignmentElement> permMask = VectorBase<Len, Packing, float>::template GetExtractLoMaskEpi32<ExtractLen>(); constexpr std::array<std::uint32_t, VectorBase<Len, Packing, float>::AlignmentElement> permMask = VectorBase<Len, Packing, float>::template GetExtractLoMaskEpi32<ExtractLen>();
__m256i permIdx = _mm256_loadu_epi32(permMask.data()); __m256i permIdx = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(permMask.data()));
__m256i result = _mm256_permutexvar_epi32(permIdx, _mm256_castps_si256(this->v)); __m256i result = _mm256_permutexvar_epi32(permIdx, _mm256_castps_si256(this->v));
if constexpr(std::is_same_v<typename VectorBase<ExtractLen, Packing, float>::VectorType, __m128>) { if constexpr(std::is_same_v<typename VectorBase<ExtractLen, Packing, float>::VectorType, __m128>) {
return VectorF32<ExtractLen, Packing>(_mm256_castps256_ps128(_mm256_castsi256_ps(result))); return VectorF32<ExtractLen, Packing>(_mm256_castps256_ps128(_mm256_castsi256_ps(result)));
@ -323,10 +380,12 @@ namespace Crafter {
} else { } else {
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256> && std::is_same_v<typename VectorBase<ExtractLen, Packing, float>::VectorType, __m128>) { if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256> && std::is_same_v<typename VectorBase<ExtractLen, Packing, float>::VectorType, __m128>) {
return VectorF32<ExtractLen, Packing>(_mm256_castps256_ps128(this->v)); return VectorF32<ExtractLen, Packing>(_mm256_castps256_ps128(this->v));
#ifdef __AVX512F__
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m512> && std::is_same_v<typename VectorBase<ExtractLen, Packing, float>::VectorType, __m128>) { } else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m512> && std::is_same_v<typename VectorBase<ExtractLen, Packing, float>::VectorType, __m128>) {
return VectorF32<ExtractLen, Packing>(_mm512_castps512_ps128(this->v)); return VectorF32<ExtractLen, Packing>(_mm512_castps512_ps128(this->v));
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m512> && std::is_same_v<typename VectorBase<ExtractLen, Packing, float>::VectorType, __m256>) { } else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m512> && std::is_same_v<typename VectorBase<ExtractLen, Packing, float>::VectorType, __m256>) {
return VectorF32<ExtractLen, Packing>(_mm512_castps512_ps256(this->v)); return VectorF32<ExtractLen, Packing>(_mm512_castps512_ps256(this->v));
#endif
} else { } else {
return VectorF32<ExtractLen, Packing>(this->v); return VectorF32<ExtractLen, Packing>(this->v);
} }
@ -338,8 +397,10 @@ namespace Crafter {
return VectorF32<Len, Packing>(VectorBase<Len, Packing, float>::cos_f32x4(this->v)); return VectorF32<Len, Packing>(VectorBase<Len, Packing, float>::cos_f32x4(this->v));
} else if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) { } else if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
return VectorF32<Len, Packing>(VectorBase<Len, Packing, float>::cos_f32x8(this->v)); return VectorF32<Len, Packing>(VectorBase<Len, Packing, float>::cos_f32x8(this->v));
#ifdef __AVX512F__
} else { } else {
return VectorF32<Len, Packing>(VectorBase<Len, Packing, float>::cos_f32x16(this->v)); return VectorF32<Len, Packing>(VectorBase<Len, Packing, float>::cos_f32x16(this->v));
#endif
} }
} }
@ -348,8 +409,10 @@ namespace Crafter {
return VectorF32<Len, Packing>(VectorBase<Len, Packing, float>::sin_f32x4(this->v)); return VectorF32<Len, Packing>(VectorBase<Len, Packing, float>::sin_f32x4(this->v));
} else if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) { } else if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
return VectorF32<Len, Packing>(VectorBase<Len, Packing, float>::sin_f32x8(this->v)); return VectorF32<Len, Packing>(VectorBase<Len, Packing, float>::sin_f32x8(this->v));
#ifdef __AVX512F__
} else { } else {
return VectorF32<Len, Packing>(VectorBase<Len, Packing, float>::sin_f32x16(this->v)); return VectorF32<Len, Packing>(VectorBase<Len, Packing, float>::sin_f32x16(this->v));
#endif
} }
} }
@ -369,7 +432,7 @@ namespace Crafter {
VectorF32<Len, Packing>(s), VectorF32<Len, Packing>(s),
VectorF32<Len, Packing>(c) VectorF32<Len, Packing>(c)
}; };
#ifdef __AVX512F__
} else { } else {
__m512 s, c; __m512 s, c;
VectorBase<Len, Packing, float>::sincos_f32x16(this->v, s, c); VectorBase<Len, Packing, float>::sincos_f32x16(this->v, s, c);
@ -377,6 +440,7 @@ namespace Crafter {
VectorF32<Len, Packing>(s), VectorF32<Len, Packing>(s),
VectorF32<Len, Packing>(c) VectorF32<Len, Packing>(c)
}; };
#endif
} }
} }
@ -384,11 +448,13 @@ namespace Crafter {
constexpr VectorF32<Len, Packing> Negate() { constexpr VectorF32<Len, Packing> Negate() {
std::array<float, VectorBase<Len, Packing, float>::AlignmentElement> mask = VectorBase<Len, Packing, float>::template GetNegateMask<values>(); std::array<float, VectorBase<Len, Packing, float>::AlignmentElement> mask = VectorBase<Len, Packing, float>::template GetNegateMask<values>();
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) { if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
return VectorF32<Len, Packing>(_mm_castsi128_ps(_mm_xor_si128(_mm_castps_si128(this->v), _mm_loadu_epi32(mask.data())))); return VectorF32<Len, Packing>(_mm_castsi128_ps(_mm_xor_si128(_mm_castps_si128(this->v), _mm_loadu_si128(reinterpret_cast<__m128i*>(mask.data())))));
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) { } else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
return VectorF32<Len, Packing>(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(this->v), _mm256_loadu_epi32(mask.data())))); return VectorF32<Len, Packing>(_mm256_castsi256_ps(_mm256_xor_si256(_mm256_castps_si256(this->v), _mm256_loadu_si256(reinterpret_cast<__m256i*>(mask.data())))));
#ifdef __AVX512F__
} else { } else {
return VectorF32<Len, Packing>(_mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(this->v), _mm512_loadu_epi32(mask.data())))); return VectorF32<Len, Packing>(_mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(this->v), _mm512_loadu_epi32(mask.data()))));
#endif
} }
} }
@ -397,8 +463,10 @@ namespace Crafter {
return VectorF32<Len, Packing>(_mm_fmadd_ps(a.v, b.v, add.v)); return VectorF32<Len, Packing>(_mm_fmadd_ps(a.v, b.v, add.v));
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) { } else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
return VectorF32<Len, Packing>(_mm256_fmadd_ps(a.v, b.v, add.v)); return VectorF32<Len, Packing>(_mm256_fmadd_ps(a.v, b.v, add.v));
#ifdef __AVX512F__
} else { } else {
return VectorF32<Len, Packing>(_mm512_fmadd_ps(a.v, b.v, add.v)); return VectorF32<Len, Packing>(_mm512_fmadd_ps(a.v, b.v, add.v));
#endif
} }
} }
@ -407,55 +475,22 @@ namespace Crafter {
return VectorF32<Len, Packing>(_mm_fmsub_ps(a.v, b.v, sub.v)); return VectorF32<Len, Packing>(_mm_fmsub_ps(a.v, b.v, sub.v));
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) { } else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
return VectorF32<Len, Packing>(_mm256_fmsub_ps(a.v, b.v, sub.v)); return VectorF32<Len, Packing>(_mm256_fmsub_ps(a.v, b.v, sub.v));
#ifdef __AVX512F__
} else { } else {
return VectorF32<Len, Packing>(_mm512_fmsub_ps(a.v, b.v, sub.v)); return VectorF32<Len, Packing>(_mm512_fmsub_ps(a.v, b.v, sub.v));
#endif
} }
} }
constexpr static VectorF32<Len, Packing> Cross(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b) requires(Len == 3) { constexpr static VectorF32<Len, Packing> Cross(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b) requires(Len == 3) {
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) { VectorF32<Len, Packing> row1 = a.template Shuffle<{{1,2,0}}>();
constexpr std::array<std::uint8_t, VectorBase<Len, Packing, float>::Alignment> shuffleMask1 = VectorBase<Len, Packing, float>::template GetShuffleMaskEpi8<{{1,2,0}}>(); VectorF32<Len, Packing> row4 = b.template Shuffle<{{1,2,0}}>();
__m128i shuffleVec1 = _mm_loadu_epi8(shuffleMask1.data());
__m128 row1 = _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a.v), shuffleVec1));
__m128 row4 = _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(b.v), shuffleVec1));
constexpr std::array<std::uint8_t, VectorBase<Len, Packing, float>::Alignment> shuffleMask3 = VectorBase<Len, Packing, float>::template GetShuffleMaskEpi8<{{2,0,1}}>(); VectorF32<Len, Packing> row3 = a.template Shuffle<{{2,0,1}}>();
__m128i shuffleVec3 = _mm_loadu_epi8(shuffleMask3.data()); VectorF32<Len, Packing> row2 = b.template Shuffle<{{2,0,1}}>();
__m128 row3 = _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a.v), shuffleVec3));
__m128 row2 = _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(b.v), shuffleVec3));
__m128 result = _mm_mul_ps(row3, row4); VectorF32<Len, Packing> result = row3 * row4;
return _mm_fmsub_ps(row1,row2,result); return VectorF32<Len, Packing>::MulitplySub(row1, row2, result);
} else if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
constexpr std::array<std::uint8_t, VectorBase<Len, Packing, float>::Alignment> shuffleMask1 = VectorBase<Len, Packing, float>::template GetShuffleMaskEpi8<{{1,2,0}}>();
__m512i shuffleVec1 = _mm512_castsi256_si512(_mm256_loadu_epi8(shuffleMask1.data()));
__m256 row1 = _mm256_castsi256_ps(_mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castps_si256(a.v)), shuffleVec1)));
__m256 row4 = _mm256_castsi256_ps(_mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castps_si256(b.v)), shuffleVec1)));
constexpr std::array<std::uint8_t, VectorBase<Len, Packing, float>::Alignment> shuffleMask3 = VectorBase<Len, Packing, float>::template GetShuffleMaskEpi8<{{2,0,1}}>();
__m512i shuffleVec3 = _mm512_castsi256_si512(_mm256_loadu_epi8(shuffleMask3.data()));
__m256 row3 = _mm256_castsi256_ps(_mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castps_si256(a.v)), shuffleVec3)));
__m256 row2 = _mm256_castsi256_ps(_mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castps_si256(b.v)), shuffleVec3)));
__m256 result = _mm256_mul_ps(row3, row4);
return _mm256_fmsub_ps(row1,row2,result);
} else {
constexpr std::array<std::uint8_t, VectorBase<Len, Packing, float>::Alignment> shuffleMask1 = VectorBase<Len, Packing, float>::template GetShuffleMaskEpi8<{{1,2,0}}>();
__m512i shuffleVec1 = _mm512_loadu_epi8(shuffleMask1.data());
__m512 row1 = _mm512_castsi512_ps(_mm512_shuffle_epi8(_mm512_castps_si512(a.v), shuffleVec1));
__m512 row4 = _mm512_castsi512_ps(_mm512_shuffle_epi8(_mm512_castps_si512(b.v), shuffleVec1));
constexpr std::array<std::uint8_t, VectorBase<Len, Packing, float>::Alignment> shuffleMask3 = VectorBase<Len, Packing, float>::template GetShuffleMaskEpi8<{{2,0,1}}>();
__m512i shuffleVec3 = _mm512_loadu_epi8(shuffleMask3.data());
__m512 row3 = _mm512_castsi512_ps(_mm512_shuffle_epi8(_mm512_castps_si512(a.v), shuffleVec3));
__m512 row2 = _mm512_castsi512_ps(_mm512_shuffle_epi8(_mm512_castps_si512(b.v), shuffleVec3));
__m512 result = _mm512_mul_ps(row3, row4);
return _mm512_fmsub_ps(row1,row2,result);
}
} }
template <const std::array<std::uint8_t, Len> ShuffleValues> template <const std::array<std::uint8_t, Len> ShuffleValues>
@ -466,8 +501,10 @@ namespace Crafter {
return VectorF32<Len, Packing>(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(this->v), imm))); return VectorF32<Len, Packing>(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(this->v), imm)));
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) { } else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
return VectorF32<Len, Packing>(_mm256_castsi256_ps(_mm256_shuffle_epi32(_mm256_castps_si256(this->v), imm))); return VectorF32<Len, Packing>(_mm256_castsi256_ps(_mm256_shuffle_epi32(_mm256_castps_si256(this->v), imm)));
#ifdef __AVX512F__
} else { } else {
return VectorF32<Len, Packing>(_mm512_castsi512_ps(_mm512_shuffle_epi32(_mm512_castps_si512(this->v), imm))); return VectorF32<Len, Packing>(_mm512_castsi512_ps(_mm512_shuffle_epi32(_mm512_castps_si512(this->v), imm)));
#endif
} }
} else if constexpr(VectorBase<Len, Packing, float>::template CheckEpi8Shuffle<ShuffleValues>()) { } else if constexpr(VectorBase<Len, Packing, float>::template CheckEpi8Shuffle<ShuffleValues>()) {
constexpr std::array<std::uint8_t, VectorBase<Len, Packing, float>::Alignment> shuffleMask = VectorBase<Len, Packing, float>::template GetShuffleMaskEpi8<ShuffleValues>(); constexpr std::array<std::uint8_t, VectorBase<Len, Packing, float>::Alignment> shuffleMask = VectorBase<Len, Packing, float>::template GetShuffleMaskEpi8<ShuffleValues>();
@ -475,11 +512,19 @@ namespace Crafter {
__m128i shuffleVec = _mm_loadu_epi8(shuffleMask.data()); __m128i shuffleVec = _mm_loadu_epi8(shuffleMask.data());
return VectorF32<Len, Packing>(_mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(this->v), shuffleVec))); return VectorF32<Len, Packing>(_mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(this->v), shuffleVec)));
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) { } else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
__m256i shuffleVec = _mm256_loadu_epi8(shuffleMask.data()); #ifdef __AVX512BW__
__m256i shuffleVec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(shuffleMask.data()));
return VectorF32<Len, Packing>(_mm256_castsi256_ps( _mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castps_si256(this->v)),_mm512_castsi256_si512(shuffleVec))))); return VectorF32<Len, Packing>(_mm256_castsi256_ps( _mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castps_si256(this->v)),_mm512_castsi256_si512(shuffleVec)))));
#else
constexpr std::array<std::uint32_t, VectorBase<Len, Packing, float>::AlignmentElement> permMask = VectorBase<Len, Packing, float>::template GetPermuteMaskEpi32<ShuffleValues>();
__m256i permIdx = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(permMask.data()));
return VectorF32<Len, Packing>(_mm256_castsi256_ps(_mm256_permutevar8x32_epi32(_mm256_castps_si256(this->v), permIdx)));
#endif
#ifdef __AVX512F__
} else { } else {
__m512i shuffleVec = _mm512_loadu_epi8(shuffleMask.data()); __m512i shuffleVec = _mm512_loadu_si512(reinterpret_cast<const __m256i*>(shuffleMask.data()));
return VectorF32<Len, Packing>(_mm512_castsi512_ps(_mm512_shuffle_epi8(_mm512_castps_si512(this->v), shuffleVec))); return VectorF32<Len, Packing>(_mm512_castsi512_ps(_mm512_shuffle_epi8(_mm512_castps_si512(this->v), shuffleVec)));
#endif
} }
} else { } else {
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) { if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
@ -488,12 +533,14 @@ namespace Crafter {
return VectorF32<Len, Packing>(_mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(this->v), shuffleVec))); return VectorF32<Len, Packing>(_mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(this->v), shuffleVec)));
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) { } else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
constexpr std::array<std::uint32_t, VectorBase<Len, Packing, float>::AlignmentElement> permMask = VectorBase<Len, Packing, float>::template GetPermuteMaskEpi32<ShuffleValues>(); constexpr std::array<std::uint32_t, VectorBase<Len, Packing, float>::AlignmentElement> permMask = VectorBase<Len, Packing, float>::template GetPermuteMaskEpi32<ShuffleValues>();
__m256i permIdx = _mm256_loadu_epi32(permMask.data()); __m256i permIdx = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(permMask.data()));
return VectorF32<Len, Packing>(_mm256_castsi256_ps(_mm256_permutexvar_epi32(permIdx, _mm256_castps_si256(this->v)))); return VectorF32<Len, Packing>(_mm256_castsi256_ps(_mm256_permutevar8x32_epi32(_mm256_castps_si256(this->v), permIdx)));
#ifdef __AVX512F__
} else { } else {
constexpr std::array<std::uint32_t, VectorBase<Len, Packing, float>::AlignmentElement> permMask = VectorBase<Len, Packing, float>::template GetPermuteMaskEpi32<ShuffleValues>(); constexpr std::array<std::uint32_t, VectorBase<Len, Packing, float>::AlignmentElement> permMask = VectorBase<Len, Packing, float>::template GetPermuteMaskEpi32<ShuffleValues>();
__m512i permIdx = _mm512_loadu_epi32(permMask.data()); __m512i permIdx = _mm512_loadu_epi32(permMask.data());
return VectorF32<Len, Packing>(_mm512_castsi512_ps(_mm512_permutexvar_epi32(permIdx, _mm512_castps_si512(this->v)))); return VectorF32<Len, Packing>(_mm512_castsi512_ps(_mm512_permutexvar_epi32(permIdx, _mm512_castps_si512(this->v))));
#endif
} }
} }
} }
@ -539,6 +586,7 @@ namespace Crafter {
_mm256_mul_ps(C.v, fLenghtC.v), _mm256_mul_ps(C.v, fLenghtC.v),
_mm256_mul_ps(D.v, fLenghtD.v) _mm256_mul_ps(D.v, fLenghtD.v)
}; };
#if defined(__AVX512F__)
} else { } else {
VectorF32<1, 16> lenght = LengthNoShuffle(A, C, B, D); VectorF32<1, 16> lenght = LengthNoShuffle(A, C, B, D);
constexpr float oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; constexpr float oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
@ -558,6 +606,7 @@ namespace Crafter {
VectorF32<Len, Packing>(_mm512_mul_ps(C.v, fLenghtC.v)), VectorF32<Len, Packing>(_mm512_mul_ps(C.v, fLenghtC.v)),
VectorF32<Len, Packing>(_mm512_mul_ps(D.v, fLenghtD.v)), VectorF32<Len, Packing>(_mm512_mul_ps(D.v, fLenghtD.v)),
}; };
#endif
} }
} }
@ -609,6 +658,7 @@ namespace Crafter {
}; };
} }
#ifdef __AVX512F__
constexpr static std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>, VectorF32<Len, Packing>> Normalize( constexpr static std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>, VectorF32<Len, Packing>> Normalize(
VectorF32<Len, Packing> A, VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B, VectorF32<Len, Packing> B,
@ -629,6 +679,7 @@ namespace Crafter {
_mm512_mul_ps(C.v, fLenghtC.v), _mm512_mul_ps(C.v, fLenghtC.v),
}; };
} }
#endif
constexpr static std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>> Normalize( constexpr static std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>> Normalize(
VectorF32<Len, Packing> A, VectorF32<Len, Packing> A,
@ -660,6 +711,7 @@ namespace Crafter {
_mm256_mul_ps(A.v, fLenghtA.v), _mm256_mul_ps(A.v, fLenghtA.v),
_mm256_mul_ps(B.v, fLenghtB.v), _mm256_mul_ps(B.v, fLenghtB.v),
}; };
#ifdef __AVX512F__
} else { } else {
VectorF32<1, 16> lenght = LengthNoShuffle(A, B); VectorF32<1, 16> lenght = LengthNoShuffle(A, B);
constexpr float oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; constexpr float oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
@ -673,6 +725,7 @@ namespace Crafter {
_mm512_mul_ps(A.v, fLenghtA.v), _mm512_mul_ps(A.v, fLenghtA.v),
_mm512_mul_ps(B.v, fLenghtB.v), _mm512_mul_ps(B.v, fLenghtB.v),
}; };
#endif
} }
} }
@ -712,6 +765,7 @@ namespace Crafter {
return VectorF32<1, Packing*4>(_mm256_sqrt_ps(lenghtSq.v)); return VectorF32<1, Packing*4>(_mm256_sqrt_ps(lenghtSq.v));
} }
#ifdef __AVX512F__
constexpr static VectorF32<1, 15> Length( constexpr static VectorF32<1, 15> Length(
VectorF32<Len, Packing> A, VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B, VectorF32<Len, Packing> B,
@ -720,6 +774,7 @@ namespace Crafter {
VectorF32<1, 15> lenghtSq = LengthSq(A, B, C); VectorF32<1, 15> lenghtSq = LengthSq(A, B, C);
return VectorF32<1, 15>(_mm512_sqrt_ps(lenghtSq.v)); return VectorF32<1, 15>(_mm512_sqrt_ps(lenghtSq.v));
} }
#endif
constexpr static VectorF32<1, Packing*2> Length( constexpr static VectorF32<1, Packing*2> Length(
VectorF32<Len, Packing> A, VectorF32<Len, Packing> A,
@ -730,8 +785,10 @@ namespace Crafter {
return VectorF32<1, Packing*2>(_mm_sqrt_ps(lenghtSq.v)); return VectorF32<1, Packing*2>(_mm_sqrt_ps(lenghtSq.v));
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) { } else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
return VectorF32<1, Packing*2>(_mm256_sqrt_ps(lenghtSq.v)); return VectorF32<1, Packing*2>(_mm256_sqrt_ps(lenghtSq.v));
#ifdef __AVX512F__
} else { } else {
return VectorF32<1, Packing*2>(_mm512_sqrt_ps(lenghtSq.v)); return VectorF32<1, Packing*2>(_mm512_sqrt_ps(lenghtSq.v));
#endif
} }
} }
@ -762,6 +819,7 @@ namespace Crafter {
return Dot(A, A, B, B, C, C, D, D); return Dot(A, A, B, B, C, C, D, D);
} }
#ifdef __AVX512F__
constexpr static VectorF32<1, 15> LengthSq( constexpr static VectorF32<1, 15> LengthSq(
VectorF32<Len, Packing> A, VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B, VectorF32<Len, Packing> B,
@ -769,6 +827,7 @@ namespace Crafter {
) requires(Len == 3 && Packing == 5) { ) requires(Len == 3 && Packing == 5) {
return Dot(A, A, B, B, C, C); return Dot(A, A, B, B, C, C);
} }
#endif
constexpr static VectorF32<1, Packing*2> LengthSq( constexpr static VectorF32<1, Packing*2> LengthSq(
VectorF32<Len, Packing> A, VectorF32<Len, Packing> A,
@ -792,6 +851,7 @@ namespace Crafter {
1,5,3,7, 1,5,3,7,
}}>(); }}>();
return vec.v; return vec.v;
#ifdef __AVX512F__
} else { } else {
VectorF32<16, 1> vec(DotNoShuffle(A0, A1, B0, B1, C0, C1, D0, D1).v); VectorF32<16, 1> vec(DotNoShuffle(A0, A1, B0, B1, C0, C1, D0, D1).v);
vec = vec.template Shuffle<{{ vec = vec.template Shuffle<{{
@ -801,6 +861,7 @@ namespace Crafter {
3,7,11,15 3,7,11,15
}}>(); }}>();
return vec.v; return vec.v;
#endif
} }
} }
@ -955,6 +1016,7 @@ namespace Crafter {
return row1; return row1;
} }
#ifdef __AVX512F__
constexpr static VectorF32<1, 15> Dot( constexpr static VectorF32<1, 15> Dot(
VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1, VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
VectorF32<Len, Packing> B0, VectorF32<Len, Packing> B1, VectorF32<Len, Packing> B0, VectorF32<Len, Packing> B1,
@ -1044,6 +1106,7 @@ namespace Crafter {
return row1; return row1;
} }
#endif
constexpr static VectorF32<1, Packing*2> Dot( constexpr static VectorF32<1, Packing*2> Dot(
VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1, VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
@ -1058,6 +1121,7 @@ namespace Crafter {
2,3, 6,7, 2,3, 6,7,
}}>(); }}>();
return vec.v; return vec.v;
#ifdef __AVX512F__
} else { } else {
VectorF32<16, 1> vec(DotNoShuffle(A0, A1, C0, C1).v); VectorF32<16, 1> vec(DotNoShuffle(A0, A1, C0, C1).v);
vec = vec.template Shuffle<{{ vec = vec.template Shuffle<{{
@ -1067,6 +1131,7 @@ namespace Crafter {
10,11, 14,15 10,11, 14,15
}}>(); }}>();
return vec.v; return vec.v;
#endif
} }
} }
@ -1083,8 +1148,10 @@ namespace Crafter {
return VectorF32<1, Packing*4>(_mm_sqrt_ps(lenghtSq.v)); return VectorF32<1, Packing*4>(_mm_sqrt_ps(lenghtSq.v));
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) { } else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
return VectorF32<1, Packing*4>(_mm256_sqrt_ps(lenghtSq.v)); return VectorF32<1, Packing*4>(_mm256_sqrt_ps(lenghtSq.v));
#ifdef __AVX512F__
} else { } else {
return VectorF32<1, Packing*4>(_mm512_sqrt_ps(lenghtSq.v)); return VectorF32<1, Packing*4>(_mm512_sqrt_ps(lenghtSq.v));
#endif
} }
} }
@ -1097,8 +1164,10 @@ namespace Crafter {
return VectorF32<1, Packing*2>(_mm_sqrt_ps(lenghtSq.v)); return VectorF32<1, Packing*2>(_mm_sqrt_ps(lenghtSq.v));
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) { } else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
return VectorF32<1, Packing*2>(_mm256_sqrt_ps(lenghtSq.v)); return VectorF32<1, Packing*2>(_mm256_sqrt_ps(lenghtSq.v));
#ifdef __AVX512F__
} else { } else {
return VectorF32<1, Packing*2>(_mm512_sqrt_ps(lenghtSq.v)); return VectorF32<1, Packing*2>(_mm512_sqrt_ps(lenghtSq.v));
#endif
} }
} }
@ -1172,6 +1241,7 @@ namespace Crafter {
row1 = _mm256_add_ps(row1, row4); row1 = _mm256_add_ps(row1, row4);
return row1; return row1;
#ifdef __AVX512F__
} else { } else {
__m512 mulA = _mm512_mul_ps(A0.v, A1.v); __m512 mulA = _mm512_mul_ps(A0.v, A1.v);
__m512 mulB = _mm512_mul_ps(B0.v, B1.v); __m512 mulB = _mm512_mul_ps(B0.v, B1.v);
@ -1195,6 +1265,7 @@ namespace Crafter {
row1 = _mm512_add_ps(row1, row4); row1 = _mm512_add_ps(row1, row4);
return row1; return row1;
#endif
} }
} }
@ -1226,6 +1297,7 @@ namespace Crafter {
row56Temp1 = _mm256_unpackhi_epi32(row1TempTemp1, row56Temp1); // A2 B2 C2 D2 row56Temp1 = _mm256_unpackhi_epi32(row1TempTemp1, row56Temp1); // A2 B2 C2 D2
return _mm256_add_ps(row12Temp1, row56Temp1); return _mm256_add_ps(row12Temp1, row56Temp1);
#ifdef __AVX512F__
} else { } else {
__m512 mulA = _mm512_mul_ps(A0.v, A1.v); __m512 mulA = _mm512_mul_ps(A0.v, A1.v);
__m512 mulC = _mm512_mul_ps(C0.v, C1.v); __m512 mulC = _mm512_mul_ps(C0.v, C1.v);
@ -1238,6 +1310,7 @@ namespace Crafter {
row56Temp1 = _mm512_unpackhi_epi32(row1TempTemp1, row56Temp1); // A2 B2 C2 D2 row56Temp1 = _mm512_unpackhi_epi32(row1TempTemp1, row56Temp1); // A2 B2 C2 D2
return _mm512_add_ps(row12Temp1, row56Temp1); return _mm512_add_ps(row12Temp1, row56Temp1);
#endif
} }
} }
public: public:
@ -1245,17 +1318,18 @@ namespace Crafter {
template <std::array<bool, Len> ShuffleValues> template <std::array<bool, Len> ShuffleValues>
constexpr static VectorF32<Len, Packing> Blend(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b) { constexpr static VectorF32<Len, Packing> Blend(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b) {
constexpr auto mask = VectorBase<Len, Packing, float>::template GetBlendMaskEpi32<ShuffleValues>(); constexpr auto mask = VectorBase<Len, Packing, float>::template GetBlendMaskEpi32<ShuffleValues>();
if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) { if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
return _mm_castsi128_ps(_mm_blend_epi32(_mm_castps_si128(a.v), _mm_castps_si128(b.v), mask)); return _mm_castsi128_ps(_mm_blend_epi32(_mm_castps_si128(a.v), _mm_castps_si128(b.v), mask));
} else if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) { } else if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
#ifndef __AVX512BW__ return _mm256_castsi256_ps(_mm256_blend_epi32(_mm256_castps_si256(a.v), _mm256_castps_si256(b.v), mask));
#ifndef __AVX512VL__
static_assert(false, "No __AVX512BW__ and __AVX512VL__ support"); #ifdef __AVX512F__
#endif } else if constexpr (std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m512>) {
#endif
return _mm256_castsi256_ps(_mm256_mask_blend_epi32(mask, _mm256_castps_si256(a.v), _mm256_castps_si256(b.v)));
} else {
return _mm512_castsi512_ps(_mm512_mask_blend_epi32(mask, _mm512_castps_si512(a.v), _mm512_castps_si512(b.v))); return _mm512_castsi512_ps(_mm512_mask_blend_epi32(mask, _mm512_castps_si512(a.v), _mm512_castps_si512(b.v)));
#endif
} }
} }
@ -1314,7 +1388,7 @@ namespace Crafter {
export template <std::uint32_t Len, std::uint32_t Packing> export template <std::uint32_t Len, std::uint32_t Packing>
struct std::formatter<Crafter::VectorF32<Len, Packing>> : std::formatter<std::string> { struct std::formatter<Crafter::VectorF32<Len, Packing>> : std::formatter<std::string> {
constexpr auto format(const Crafter::VectorF32<Len, Packing>& obj, format_context& ctx) const { constexpr auto format(const Crafter::VectorF32<Len, Packing>& obj, format_context& ctx) const {
std::array<float, Crafter::VectorF32<Len, Packing>::AlignmentElement> vec = obj.Store(); std::array<float, Crafter::VectorF32<Len, Packing>::AlignmentElement> vec = obj.template Store<float>();
std::string out = "{"; std::string out = "{";
for(std::uint32_t i = 0; i < Packing; i++) { for(std::uint32_t i = 0; i < Packing; i++) {
out += "{"; out += "{";
@ -1328,4 +1402,3 @@ struct std::formatter<Crafter::VectorF32<Len, Packing>> : std::formatter<std::st
return std::formatter<std::string>::format(out, ctx); return std::formatter<std::string>::format(out, ctx);
} }
}; };
#endif

View file

@ -36,6 +36,20 @@
"implementations": ["tests/Vector"], "implementations": ["tests/Vector"],
"march": "sapphirerapids", "march": "sapphirerapids",
"extends": ["lib-shared"] "extends": ["lib-shared"]
},
{
"name": "Vector-x86-64-v4",
"implementations": ["tests/Vector"],
"march": "x86-64-v4",
"mtune": "generic",
"extends": ["lib-shared"]
},
{
"name": "Vector-x86-64-v3",
"implementations": ["tests/Vector"],
"march": "x86-64-v3",
"mtune": "generic",
"extends": ["lib-shared"]
} }
] ]
} }

View file

@ -90,7 +90,7 @@ std::string* TestAllCombinations() {
if constexpr(total > 0 && (total & (total - 1)) == 0) { if constexpr(total > 0 && (total & (total - 1)) == 0) {
{ {
VectorType<Len, Packing> vec(floats); VectorType<Len, Packing> vec(floats);
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = vec.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = vec.template Store<T>();
for (std::uint32_t i = 0; i < Len * Packing; i++) { for (std::uint32_t i = 0; i < Len * Packing; i++) {
if (!FloatEquals(stored[i], floats[i])) { if (!FloatEquals(stored[i], floats[i])) {
return new std::string(std::format("Load/Store mismatch at Len={} Packing={}, Expected: {}, Got: {}", Len, Packing, (float)(floats[i]), (float)stored[i])); return new std::string(std::format("Load/Store mismatch at Len={} Packing={}, Expected: {}, Got: {}", Len, Packing, (float)(floats[i]), (float)stored[i]));
@ -101,7 +101,7 @@ std::string* TestAllCombinations() {
{ {
VectorType<Len, Packing> vec(floats); VectorType<Len, Packing> vec(floats);
vec = vec + vec; vec = vec + vec;
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = vec.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = vec.template Store<T>();
for (std::uint32_t i = 0; i < Len * Packing; i++) { for (std::uint32_t i = 0; i < Len * Packing; i++) {
if (!FloatEquals(stored[i], floats[i] + floats[i])) { if (!FloatEquals(stored[i], floats[i] + floats[i])) {
return new std::string(std::format("Add mismatch at Len={} Packing={}, Expected: {}, Got: {}", Len, Packing, (float)(floats[i] + floats[i]), (float)stored[i])); return new std::string(std::format("Add mismatch at Len={} Packing={}, Expected: {}, Got: {}", Len, Packing, (float)(floats[i] + floats[i]), (float)stored[i]));
@ -112,7 +112,7 @@ std::string* TestAllCombinations() {
{ {
VectorType<Len, Packing> vec(floats); VectorType<Len, Packing> vec(floats);
vec = vec - vec; vec = vec - vec;
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = vec.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = vec.template Store<T>();
for (std::uint32_t i = 0; i < Len * Packing; i++) { for (std::uint32_t i = 0; i < Len * Packing; i++) {
if (!FloatEquals(stored[i], T(0))) { if (!FloatEquals(stored[i], T(0))) {
return new std::string(std::format("Subtract mismatch at Len={} Packing={}, Expected: 0, Got: {}", Len, Packing, (float)stored[i])); return new std::string(std::format("Subtract mismatch at Len={} Packing={}, Expected: 0, Got: {}", Len, Packing, (float)stored[i]));
@ -123,7 +123,7 @@ std::string* TestAllCombinations() {
{ {
VectorType<Len, Packing> vec(floats); VectorType<Len, Packing> vec(floats);
vec = vec * vec; vec = vec * vec;
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = vec.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = vec.template Store<T>();
for (std::uint32_t i = 0; i < Len * Packing; i++) { for (std::uint32_t i = 0; i < Len * Packing; i++) {
if (!FloatEquals(stored[i], floats[i] * floats[i])) { if (!FloatEquals(stored[i], floats[i] * floats[i])) {
return new std::string(std::format("Multiply mismatch at Len={} Packing={}, Expected: {}, Got: {}", Len, Packing, (float)(floats[i] * floats[i]), (float)stored[i])); return new std::string(std::format("Multiply mismatch at Len={} Packing={}, Expected: {}, Got: {}", Len, Packing, (float)(floats[i] * floats[i]), (float)stored[i]));
@ -134,7 +134,7 @@ std::string* TestAllCombinations() {
{ {
VectorType<Len, Packing> vec(floats); VectorType<Len, Packing> vec(floats);
vec = vec / vec; vec = vec / vec;
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = vec.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = vec.template Store<T>();
for (std::uint32_t i = 0; i < Len * Packing; i++) { for (std::uint32_t i = 0; i < Len * Packing; i++) {
if (!FloatEquals(stored[i], T(1))) { if (!FloatEquals(stored[i], T(1))) {
return new std::string(std::format("Divide mismatch at Len={} Packing={}, Expected: 1, Got: {}", Len, Packing, (float)stored[i])); return new std::string(std::format("Divide mismatch at Len={} Packing={}, Expected: 1, Got: {}", Len, Packing, (float)stored[i]));
@ -145,7 +145,7 @@ std::string* TestAllCombinations() {
{ {
VectorType<Len, Packing> vec(floats); VectorType<Len, Packing> vec(floats);
vec = vec + T(2); vec = vec + T(2);
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = vec.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = vec.template Store<T>();
for (std::uint32_t i = 0; i < Len * Packing; i++) { for (std::uint32_t i = 0; i < Len * Packing; i++) {
if (!FloatEquals(stored[i], floats[i] + T(2))) { if (!FloatEquals(stored[i], floats[i] + T(2))) {
return new std::string(std::format("Scalar add mismatch at Len={} Packing={}, Expected: {}, Got: {}", Len, Packing, (float)(floats[i] + T(2)), (float)stored[i])); return new std::string(std::format("Scalar add mismatch at Len={} Packing={}, Expected: {}, Got: {}", Len, Packing, (float)(floats[i] + T(2)), (float)stored[i]));
@ -156,7 +156,7 @@ std::string* TestAllCombinations() {
{ {
VectorType<Len, Packing> vec(floats); VectorType<Len, Packing> vec(floats);
vec = vec - T(2); vec = vec - T(2);
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = vec.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = vec.template Store<T>();
for (std::uint32_t i = 0; i < Len * Packing; i++) { for (std::uint32_t i = 0; i < Len * Packing; i++) {
if (!FloatEquals(stored[i], floats[i] - T(2))) { if (!FloatEquals(stored[i], floats[i] - T(2))) {
return new std::string(std::format("Scalar add mismatch at Len={} Packing={}, Expected: {}, Got: {}", Len, Packing, (float)(floats[i] + T(2)), (float)stored[i])); return new std::string(std::format("Scalar add mismatch at Len={} Packing={}, Expected: {}, Got: {}", Len, Packing, (float)(floats[i] + T(2)), (float)stored[i]));
@ -167,7 +167,7 @@ std::string* TestAllCombinations() {
{ {
VectorType<Len, Packing> vec(floats); VectorType<Len, Packing> vec(floats);
vec = vec * T(2); vec = vec * T(2);
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = vec.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = vec.template Store<T>();
for (std::uint32_t i = 0; i < Len * Packing; i++) { for (std::uint32_t i = 0; i < Len * Packing; i++) {
if (!FloatEquals(stored[i], floats[i] * T(2))) { if (!FloatEquals(stored[i], floats[i] * T(2))) {
return new std::string(std::format("Scalar multiply mismatch at Len={} Packing={}, Expected: {}, Got: {}", Len, Packing, (float)(floats[i] * T(2)), (float)stored[i])); return new std::string(std::format("Scalar multiply mismatch at Len={} Packing={}, Expected: {}, Got: {}", Len, Packing, (float)(floats[i] * T(2)), (float)stored[i]));
@ -178,7 +178,7 @@ std::string* TestAllCombinations() {
{ {
VectorType<Len, Packing> vec(floats); VectorType<Len, Packing> vec(floats);
vec = vec / T(2); vec = vec / T(2);
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = vec.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = vec.template Store<T>();
for (std::uint32_t i = 0; i < Len * Packing; i++) { for (std::uint32_t i = 0; i < Len * Packing; i++) {
if (!FloatEquals(stored[i], floats[i] / T(2))) { if (!FloatEquals(stored[i], floats[i] / T(2))) {
return new std::string(std::format("Scalar divide mismatch at Len={} Packing={}, Expected: {}, Got: {}", Len, Packing, (float)(floats[i] * T(2)), (float)stored[i])); return new std::string(std::format("Scalar divide mismatch at Len={} Packing={}, Expected: {}, Got: {}", Len, Packing, (float)(floats[i] * T(2)), (float)stored[i]));
@ -225,7 +225,7 @@ std::string* TestAllCombinations() {
{ {
VectorType<Len, Packing> vec(floats); VectorType<Len, Packing> vec(floats);
vec = -vec; vec = -vec;
std::array<T, VectorType<Len, Packing>::AlignmentElement> result = vec.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> result = vec.template Store<T>();
for (std::uint32_t i = 0; i < Len * Packing; i++) { for (std::uint32_t i = 0; i < Len * Packing; i++) {
if (!FloatEquals(result[i], -floats[i])) { if (!FloatEquals(result[i], -floats[i])) {
return new std::string(std::format("Negate mismatch at Len={} Packing={}, Expected: {}, Got: {}", Len, Packing, (float)(-floats[i]), (float)result[i])); return new std::string(std::format("Negate mismatch at Len={} Packing={}, Expected: {}, Got: {}", Len, Packing, (float)(-floats[i]), (float)result[i]));
@ -237,7 +237,7 @@ std::string* TestAllCombinations() {
VectorType<Len, Packing> vecA(floats1); VectorType<Len, Packing> vecA(floats1);
VectorType<Len, Packing> vecB(floats2); VectorType<Len, Packing> vecB(floats2);
VectorType<Len, Packing> result = VectorType<Len, Packing>::template Blend<AlternateTrueFalse<Len>()>(vecA, vecB); VectorType<Len, Packing> result = VectorType<Len, Packing>::template Blend<AlternateTrueFalse<Len>()>(vecA, vecB);
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.template Store<T>();
for (std::uint32_t i = 0; i < Len; i++) { for (std::uint32_t i = 0; i < Len; i++) {
bool useB = (i % 2 == 0); bool useB = (i % 2 == 0);
T expected = useB ? floats2[i]: floats1[i]; T expected = useB ? floats2[i]: floats1[i];
@ -252,7 +252,7 @@ std::string* TestAllCombinations() {
VectorType<Len, Packing> vecB(floats); VectorType<Len, Packing> vecB(floats);
VectorType<Len, Packing> vecAdd(floats); VectorType<Len, Packing> vecAdd(floats);
VectorType<Len, Packing> result = VectorType<Len, Packing>::MulitplyAdd(vecA, vecB, vecAdd); VectorType<Len, Packing> result = VectorType<Len, Packing>::MulitplyAdd(vecA, vecB, vecAdd);
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.template Store<T>();
for (std::uint32_t i = 0; i < Len; i++) { for (std::uint32_t i = 0; i < Len; i++) {
T expected = floats[i] * floats[i] + floats[i]; T expected = floats[i] * floats[i] + floats[i];
if (!FloatEquals(stored[i], expected)) { if (!FloatEquals(stored[i], expected)) {
@ -266,7 +266,7 @@ std::string* TestAllCombinations() {
VectorType<Len, Packing> vecB(floats); VectorType<Len, Packing> vecB(floats);
VectorType<Len, Packing> vecSub(floats); VectorType<Len, Packing> vecSub(floats);
VectorType<Len, Packing> result = VectorType<Len, Packing>::MulitplySub(vecA, vecB, vecSub); VectorType<Len, Packing> result = VectorType<Len, Packing>::MulitplySub(vecA, vecB, vecSub);
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.template Store<T>();
for (std::uint32_t i = 0; i < Len; i++) { for (std::uint32_t i = 0; i < Len; i++) {
T expected = floats[i] * floats[i] - floats[i]; T expected = floats[i] * floats[i] - floats[i];
if (!FloatEquals(stored[i], expected)) { if (!FloatEquals(stored[i], expected)) {
@ -278,7 +278,7 @@ std::string* TestAllCombinations() {
if constexpr(Len > 2){ if constexpr(Len > 2){
VectorType<Len, Packing> vec(floats); VectorType<Len, Packing> vec(floats);
VectorType<Len-1, Packing> result = vec.template ExtractLo<Len-1>(); VectorType<Len-1, Packing> result = vec.template ExtractLo<Len-1>();
std::array<T, VectorType<Len-1, Packing>::AlignmentElement> stored = result.Store(); std::array<T, VectorType<Len-1, Packing>::AlignmentElement> stored = result.template Store<T>();
for(std::uint32_t i2 = 0; i2 < Packing; i2++){ for(std::uint32_t i2 = 0; i2 < Packing; i2++){
for (std::uint32_t i = 0; i < Len-1; i++) { for (std::uint32_t i = 0; i < Len-1; i++) {
T expected = floats[i2*(Len)+i]; T expected = floats[i2*(Len)+i];
@ -292,7 +292,7 @@ std::string* TestAllCombinations() {
{ {
VectorType<Len, Packing> vec(floats); VectorType<Len, Packing> vec(floats);
VectorType<Len, Packing> result = vec.Sin(); VectorType<Len, Packing> result = vec.Sin();
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.template Store<T>();
for(std::uint32_t i2 = 0; i2 < Packing; i2++){ for(std::uint32_t i2 = 0; i2 < Packing; i2++){
for (std::uint32_t i = 0; i < Len; i++) { for (std::uint32_t i = 0; i < Len; i++) {
T expected = (T)std::sin((float)floats[i2*Len+i]); T expected = (T)std::sin((float)floats[i2*Len+i]);
@ -306,7 +306,7 @@ std::string* TestAllCombinations() {
{ {
VectorType<Len, Packing> vec(floats); VectorType<Len, Packing> vec(floats);
VectorType<Len, Packing> result = vec.Cos(); VectorType<Len, Packing> result = vec.Cos();
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.template Store<T>();
for(std::uint32_t i2 = 0; i2 < Packing; i2++){ for(std::uint32_t i2 = 0; i2 < Packing; i2++){
for (std::uint32_t i = 0; i < Len; i++) { for (std::uint32_t i = 0; i < Len; i++) {
T expected = (T)std::cos((float)floats[i2*Len+i]); T expected = (T)std::cos((float)floats[i2*Len+i]);
@ -320,8 +320,8 @@ std::string* TestAllCombinations() {
{ {
VectorType<Len, Packing> vec(floats); VectorType<Len, Packing> vec(floats);
auto result = vec.SinCos(); auto result = vec.SinCos();
std::array<T, VectorType<Len, Packing>::AlignmentElement> storedSin = std::get<0>(result).Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> storedSin = std::get<0>(result).template Store<T>();
std::array<T, VectorType<Len, Packing>::AlignmentElement> storedCos = std::get<1>(result).Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> storedCos = std::get<1>(result).template Store<T>();
for(std::uint32_t i2 = 0; i2 < Packing; i2++){ for(std::uint32_t i2 = 0; i2 < Packing; i2++){
for (std::uint32_t i = 0; i < Len; i++) { for (std::uint32_t i = 0; i < Len; i++) {
T expected = (T)std::sin((float)floats[i2*Len+i]); T expected = (T)std::sin((float)floats[i2*Len+i]);
@ -340,7 +340,7 @@ std::string* TestAllCombinations() {
{ {
VectorType<Len, Packing> vec(floats); VectorType<Len, Packing> vec(floats);
VectorType<Len, Packing> result = vec.template Shuffle<GetCountReverse<Len>()>(); VectorType<Len, Packing> result = vec.template Shuffle<GetCountReverse<Len>()>();
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.template Store<T>();
for (std::uint32_t i = 0; i < Len; i++) { for (std::uint32_t i = 0; i < Len; i++) {
T expected = floats[Len - 1 - i]; T expected = floats[Len - 1 - i];
if (!FloatEquals(stored[i], expected)) { if (!FloatEquals(stored[i], expected)) {
@ -355,7 +355,7 @@ std::string* TestAllCombinations() {
VectorType<Len, Packing> vec1(floats1); VectorType<Len, Packing> vec1(floats1);
VectorType<Len, Packing> vec2(floats2); VectorType<Len, Packing> vec2(floats2);
VectorType<Len, Packing> result = VectorType<Len, Packing>::Cross(vec1, vec2); VectorType<Len, Packing> result = VectorType<Len, Packing>::Cross(vec1, vec2);
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.template Store<T>();
if (!FloatEquals(stored[0], T(-3)) || !FloatEquals(stored[1], T(6)) || !FloatEquals(stored[2], T(-3))) { if (!FloatEquals(stored[0], T(-3)) || !FloatEquals(stored[1], T(6)) || !FloatEquals(stored[2], T(-3))) {
return new std::string(std::format("Cross mismatch at Len={} Packing={}, Expected: -3,6,-3, Got: {},{},{}", Len, Packing, (float)stored[0], (float)stored[1], (float)stored[2])); return new std::string(std::format("Cross mismatch at Len={} Packing={}, Expected: -3,6,-3, Got: {},{},{}", Len, Packing, (float)stored[0], (float)stored[1], (float)stored[2]));
} }
@ -370,7 +370,7 @@ std::string* TestAllCombinations() {
VectorType<3, Packing> vecV(floats); VectorType<3, Packing> vecV(floats);
VectorType<4, Packing> vecQ(qData); VectorType<4, Packing> vecQ(qData);
VectorType<3, Packing> result = VectorType<3, Packing>::Rotate(vecV, vecQ); VectorType<3, Packing> result = VectorType<3, Packing>::Rotate(vecV, vecQ);
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.template Store<T>();
for (std::uint32_t i = 0; i < 3; i++) { for (std::uint32_t i = 0; i < 3; i++) {
if (!FloatEquals(stored[i], floats[i])) { if (!FloatEquals(stored[i], floats[i])) {
@ -389,7 +389,7 @@ std::string* TestAllCombinations() {
} }
VectorType<3, Packing> eulerVec(eulerData); VectorType<3, Packing> eulerVec(eulerData);
VectorType<4, Packing> result = VectorType<4, Packing>::QuanternionFromEuler(eulerVec); VectorType<4, Packing> result = VectorType<4, Packing>::QuanternionFromEuler(eulerVec);
std::array<T, VectorType<4, Packing>::AlignmentElement> stored = result.Store(); std::array<T, VectorType<4, Packing>::AlignmentElement> stored = result.template Store<T>();
if (!FloatEquals(stored[0], T(0.63720703)) || !FloatEquals(stored[1], T(0.30688477)) || if (!FloatEquals(stored[0], T(0.63720703)) || !FloatEquals(stored[1], T(0.30688477)) ||
!FloatEquals(stored[2], T(0.14074707)) || !FloatEquals(stored[3], T(0.6933594))) { !FloatEquals(stored[2], T(0.14074707)) || !FloatEquals(stored[3], T(0.6933594))) {
@ -397,14 +397,14 @@ std::string* TestAllCombinations() {
} }
} }
if constexpr(Len == 3 && Packing == 1) { if constexpr(Len == 3 && Packing == 1 && std::same_as<T, float>) {
{ {
VectorType<Len, Packing> vecA(floats); VectorType<Len, Packing> vecA(floats);
VectorType<Len, Packing> vecB = vecA * 2; VectorType<Len, Packing> vecB = vecA * 2;
VectorType<Len, Packing> vecC = vecA * 3; VectorType<Len, Packing> vecC = vecA * 3;
VectorType<Len, Packing> vecD = vecA * 4; VectorType<Len, Packing> vecD = vecA * 4;
VectorType<1, 4> result = VectorType<Len, Packing>::Length(vecA, vecB, vecC, vecD); VectorType<1, 4> result = VectorType<Len, Packing>::Length(vecA, vecB, vecC, vecD);
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.template Store<T>();
if (!FloatEquals(stored[0], expectedLength[0])) { if (!FloatEquals(stored[0], expectedLength[0])) {
return new std::string(std::format("Length 3 vecA test failed at Len={} Packing={} Expected: {}, Got: {}", Len, Packing, (float)expectedLength[0], (float)stored[0])); return new std::string(std::format("Length 3 vecA test failed at Len={} Packing={} Expected: {}, Got: {}", Len, Packing, (float)expectedLength[0], (float)stored[0]));
@ -430,7 +430,7 @@ std::string* TestAllCombinations() {
VectorType<Len, Packing> vecD = vecA * 4; VectorType<Len, Packing> vecD = vecA * 4;
auto result = VectorType<Len, Packing>::Normalize(vecA, vecB, vecC, vecD); auto result = VectorType<Len, Packing>::Normalize(vecA, vecB, vecC, vecD);
VectorType<1, 4> result2 = VectorType<Len, Packing>::Length(std::get<0>(result), std::get<1>(result), std::get<2>(result), std::get<3>(result)); VectorType<1, 4> result2 = VectorType<Len, Packing>::Length(std::get<0>(result), std::get<1>(result), std::get<2>(result), std::get<3>(result));
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result2.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result2.template Store<T>();
for(std::uint8_t i = 0; i < Len*Packing; i++) { for(std::uint8_t i = 0; i < Len*Packing; i++) {
if (!FloatEquals(stored[i], T(1))) { if (!FloatEquals(stored[i], T(1))) {
@ -440,14 +440,14 @@ std::string* TestAllCombinations() {
} }
} }
if constexpr(Len == 3 && Packing == 2) { if constexpr(Len == 3 && Packing == 2 && std::same_as<T, float>) {
{ {
VectorType<Len, Packing> vecA(floats); VectorType<Len, Packing> vecA(floats);
VectorType<Len, Packing> vecB = vecA * 2; VectorType<Len, Packing> vecB = vecA * 2;
VectorType<Len, Packing> vecC = vecA * 3; VectorType<Len, Packing> vecC = vecA * 3;
VectorType<Len, Packing> vecD = vecA * 4; VectorType<Len, Packing> vecD = vecA * 4;
VectorType<1, 8> result = VectorType<Len, Packing>::Length(vecA, vecB, vecC, vecD); VectorType<1, 8> result = VectorType<Len, Packing>::Length(vecA, vecB, vecC, vecD);
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.template Store<T>();
if (!FloatEquals(stored[0], expectedLength[0])) { if (!FloatEquals(stored[0], expectedLength[0])) {
return new std::string(std::format("Length 3 vecA test failed at Len={} Packing={} Expected: {}, Got: {}", Len, Packing, (float)expectedLength[0], (float)stored[0])); return new std::string(std::format("Length 3 vecA test failed at Len={} Packing={} Expected: {}, Got: {}", Len, Packing, (float)expectedLength[0], (float)stored[0]));
@ -473,7 +473,7 @@ std::string* TestAllCombinations() {
VectorType<Len, Packing> vecD = vecA * 4; VectorType<Len, Packing> vecD = vecA * 4;
auto result = VectorType<Len, Packing>::Normalize(vecA, vecB, vecC, vecD); auto result = VectorType<Len, Packing>::Normalize(vecA, vecB, vecC, vecD);
VectorType<1, 8> result2 = VectorType<Len, Packing>::Length(std::get<0>(result), std::get<1>(result), std::get<2>(result), std::get<3>(result)); VectorType<1, 8> result2 = VectorType<Len, Packing>::Length(std::get<0>(result), std::get<1>(result), std::get<2>(result), std::get<3>(result));
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result2.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result2.template Store<T>();
for(std::uint8_t i = 0; i < Len*Packing; i++) { for(std::uint8_t i = 0; i < Len*Packing; i++) {
if (!FloatEquals(stored[i], T(1))) { if (!FloatEquals(stored[i], T(1))) {
@ -483,13 +483,13 @@ std::string* TestAllCombinations() {
} }
} }
if constexpr(Len == 3 && Packing == 5) { if constexpr(Len == 3 && Packing == 5 && std::same_as<T, float>) {
{ {
VectorType<Len, Packing> vecA(floats); VectorType<Len, Packing> vecA(floats);
VectorType<Len, Packing> vecB = vecA * 2; VectorType<Len, Packing> vecB = vecA * 2;
VectorType<Len, Packing> vecC = vecA * 3; VectorType<Len, Packing> vecC = vecA * 3;
VectorType<1, 15> result = VectorType<Len, Packing>::Length(vecA, vecB, vecC); VectorType<1, 15> result = VectorType<Len, Packing>::Length(vecA, vecB, vecC);
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.template Store<T>();
if (!FloatEquals(stored[0], expectedLength[0])) { if (!FloatEquals(stored[0], expectedLength[0])) {
return new std::string(std::format("Length 3 vecA test failed at Len={} Packing={} Expected: {}, Got: {}", Len, Packing, (float)expectedLength[0], (float)stored[0])); return new std::string(std::format("Length 3 vecA test failed at Len={} Packing={} Expected: {}, Got: {}", Len, Packing, (float)expectedLength[0], (float)stored[0]));
@ -510,7 +510,7 @@ std::string* TestAllCombinations() {
VectorType<Len, Packing> vecC = vecA * 3; VectorType<Len, Packing> vecC = vecA * 3;
auto result = VectorType<Len, Packing>::Normalize(vecA, vecB, vecC); auto result = VectorType<Len, Packing>::Normalize(vecA, vecB, vecC);
VectorType<1, 15> result2 = VectorType<Len, Packing>::Length(std::get<0>(result), std::get<1>(result), std::get<2>(result)); VectorType<1, 15> result2 = VectorType<Len, Packing>::Length(std::get<0>(result), std::get<1>(result), std::get<2>(result));
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result2.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result2.template Store<T>();
for(std::uint8_t i = 0; i < Len*Packing; i++) { for(std::uint8_t i = 0; i < Len*Packing; i++) {
if (!FloatEquals(stored[i], T(1))) { if (!FloatEquals(stored[i], T(1))) {
@ -525,7 +525,7 @@ std::string* TestAllCombinations() {
VectorType<Len, Packing> vecA(floats); VectorType<Len, Packing> vecA(floats);
VectorType<Len, Packing> vecE = vecA *2; VectorType<Len, Packing> vecE = vecA *2;
VectorType<1, Packing*2> result = VectorType<Len, Packing>::Length(vecA, vecE); VectorType<1, Packing*2> result = VectorType<Len, Packing>::Length(vecA, vecE);
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.template Store<T>();
if (!FloatEquals(stored[0], expectedLength[0])) { if (!FloatEquals(stored[0], expectedLength[0])) {
return new std::string(std::format("Length 2 vecA test failed at Len={} Packing={} Expected: {}, Got: {}", Len, Packing, (float)expectedLength[0], (float)stored[0])); return new std::string(std::format("Length 2 vecA test failed at Len={} Packing={} Expected: {}, Got: {}", Len, Packing, (float)expectedLength[0], (float)stored[0]));
@ -541,7 +541,7 @@ std::string* TestAllCombinations() {
VectorType<Len, Packing> vecE = vecA * 2; VectorType<Len, Packing> vecE = vecA * 2;
auto result = VectorType<Len, Packing>::Normalize(vecA, vecE); auto result = VectorType<Len, Packing>::Normalize(vecA, vecE);
VectorType<1, Packing*2> result2 = VectorType<Len, Packing>::Length(std::get<0>(result), std::get<1>(result)); VectorType<1, Packing*2> result2 = VectorType<Len, Packing>::Length(std::get<0>(result), std::get<1>(result));
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result2.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result2.template Store<T>();
for(std::uint8_t i = 0; i < Len*Packing; i++) { for(std::uint8_t i = 0; i < Len*Packing; i++) {
if (!FloatEquals(stored[i], T(1))) { if (!FloatEquals(stored[i], T(1))) {
@ -558,7 +558,7 @@ std::string* TestAllCombinations() {
VectorType<Len, Packing> vecE = vecA * 3; VectorType<Len, Packing> vecE = vecA * 3;
VectorType<Len, Packing> vecG = vecA * 4; VectorType<Len, Packing> vecG = vecA * 4;
VectorType<1, Packing*4> result = VectorType<Len, Packing>::Length(vecA, vecC, vecE, vecG); VectorType<1, Packing*4> result = VectorType<Len, Packing>::Length(vecA, vecC, vecE, vecG);
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result.template Store<T>();
if (!FloatEquals(stored[0], expectedLength[0])) { if (!FloatEquals(stored[0], expectedLength[0])) {
return new std::string(std::format("Length 4 vecA test failed at Len={} Packing={} Expected: {}, Got: {}", Len, Packing, (float)expectedLength[0], (float)stored[0])); return new std::string(std::format("Length 4 vecA test failed at Len={} Packing={} Expected: {}, Got: {}", Len, Packing, (float)expectedLength[0], (float)stored[0]));
@ -584,7 +584,7 @@ std::string* TestAllCombinations() {
VectorType<Len, Packing> vecG = vecA * 4; VectorType<Len, Packing> vecG = vecA * 4;
auto result = VectorType<Len, Packing>::Normalize(vecA, vecC, vecE, vecG); auto result = VectorType<Len, Packing>::Normalize(vecA, vecC, vecE, vecG);
VectorType<1, Packing*4> result2 = VectorType<Len, Packing>::Length(std::get<0>(result), std::get<1>(result), std::get<2>(result), std::get<3>(result)); VectorType<1, Packing*4> result2 = VectorType<Len, Packing>::Length(std::get<0>(result), std::get<1>(result), std::get<2>(result), std::get<3>(result));
std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result2.Store(); std::array<T, VectorType<Len, Packing>::AlignmentElement> stored = result2.template Store<T>();
for(std::uint8_t i = 0; i < Len*Packing; i++) { for(std::uint8_t i = 0; i < Len*Packing; i++) {
if (!FloatEquals(stored[i], T(1))) { if (!FloatEquals(stored[i], T(1))) {
@ -600,8 +600,11 @@ std::string* TestAllCombinations() {
extern "C" { extern "C" {
std::string* RunTest() { std::string* RunTest() {
//std::string* err = TestAllCombinations<_Float16, VectorF16, VectorF16<1, 1>::MaxElement>(); std::string* err = TestAllCombinations<_Float16, VectorF16, VectorF16<1, 1>::MaxElement>();
std::string* err = TestAllCombinations<float, VectorF32, VectorF32<1, 1>::MaxElement>(); if (err) {
return err;
}
err = TestAllCombinations<float, VectorF32, VectorF32<1, 1>::MaxElement>();
if (err) { if (err) {
return err; return err;
} }