F16 fixes
This commit is contained in:
parent
b2b4ca9c4d
commit
6e6530290b
3 changed files with 189 additions and 506 deletions
|
|
@ -458,34 +458,24 @@ namespace Crafter {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// std::format support for Crafter::Vector<T, Len, Aligment> of any length.
//
// The vector is rendered as a brace-wrapped component list, e.g. "{1, 2, 3}".
// Components are separated by ", " (comma + space), matching the "{x, y}" layout
// that was previously produced for 2-, 3- and 4-component vectors.
//
// Format specifiers are parsed by the inherited std::formatter<std::string>, so they
// apply to the rendered string as a whole rather than to each component.
template <typename T, std::uint32_t Len, std::uint32_t Aligment>
struct std::formatter<Crafter::Vector<T, Len, Aligment>> : std::formatter<std::string> {
    // vec: vector whose components (vec.v[0] .. vec.v[Len - 1]) are printed.
    // ctx: format context the rendered string is written to.
    // Returns whatever the std::string formatter returns (the advanced output iterator).
    auto format(const Crafter::Vector<T, Len, Aligment>& vec, format_context& ctx) const {
        std::string out = "{";
        for (std::uint32_t i2 = 0; i2 < Len; i2++) {
            // Separator goes in front of every component except the first, so a
            // zero-length vector still renders as "{}".
            if (i2 != 0) {
                out += ", ";
            }
            if constexpr (std::same_as<T, _Float16>) {
                // Widen half-precision values to float so formatting does not depend
                // on the standard library providing a _Float16 formatter.
                out += std::format("{}", static_cast<float>(vec.v[i2]));
            } else {
                out += std::format("{}", vec.v[i2]);
            }
        }
        out += "}";
        return std::formatter<std::string>::format(out, ctx);
    }
};
|
|
||||||
|
|
||||||
|
|
||||||
template <typename T, std::uint32_t Len, std::uint32_t Aligment, typename BT>
|
template <typename T, std::uint32_t Len, std::uint32_t Aligment, typename BT>
|
||||||
constexpr Crafter::Vector<T, Len, Aligment> operator*(BT b, const Crafter::Vector<T, Len, Aligment>& v) requires std::is_arithmetic_v<BT> {
|
constexpr Crafter::Vector<T, Len, Aligment> operator*(BT b, const Crafter::Vector<T, Len, Aligment>& v) requires std::is_arithmetic_v<BT> {
|
||||||
return v * b;
|
return v * b;
|
||||||
|
|
|
||||||
|
|
@ -45,6 +45,8 @@ namespace Crafter {
|
||||||
|
|
||||||
VectorType v;
|
VectorType v;
|
||||||
public:
|
public:
|
||||||
|
template <std::uint32_t Len2, std::uint32_t Packing2>
|
||||||
|
friend class VectorF16;
|
||||||
static constexpr std::uint32_t MaxSize = 32;
|
static constexpr std::uint32_t MaxSize = 32;
|
||||||
static constexpr std::uint8_t Alignment = GetAlingment();
|
static constexpr std::uint8_t Alignment = GetAlingment();
|
||||||
static_assert(Len * Packing <= MaxSize, "Len * Packing exceeds MaxSize");
|
static_assert(Len * Packing <= MaxSize, "Len * Packing exceeds MaxSize");
|
||||||
|
|
@ -256,18 +258,18 @@ namespace Crafter {
|
||||||
if constexpr(std::is_same_v<VectorType, __m128h>) {
|
if constexpr(std::is_same_v<VectorType, __m128h>) {
|
||||||
_Float16 dot = LengthSq();
|
_Float16 dot = LengthSq();
|
||||||
__m128h vec = _mm_set1_ph(dot);
|
__m128h vec = _mm_set1_ph(dot);
|
||||||
__m128h sqrt = _mm_rsqrt_ph(vec);
|
__m128h sqrt = _mm_sqrt_ph(vec);
|
||||||
v = _mm_div_ps(v, sqrt);
|
v = _mm_div_ph(v, sqrt);
|
||||||
} else if constexpr(std::is_same_v<VectorType, __m256h>) {
|
} else if constexpr(std::is_same_v<VectorType, __m256h>) {
|
||||||
_Float16 dot = LengthSq();
|
_Float16 dot = LengthSq();
|
||||||
__m256h vec = _mm256_set1_ph(dot);
|
__m256h vec = _mm256_set1_ph(dot);
|
||||||
__m256h sqrt = _mm256_rsqrt_ph(vec);
|
__m256h sqrt = _mm256_sqrt_ph(vec);
|
||||||
v = _mm256_div_ps(v, sqrt);
|
v = _mm256_div_ph(v, sqrt);
|
||||||
} else {
|
} else {
|
||||||
_Float16 dot = LengthSq();
|
_Float16 dot = LengthSq();
|
||||||
__m512h vec = _mm512_set1_ph(dot);
|
__m512h vec = _mm512_set1_ph(dot);
|
||||||
__m512h sqrt = _mm512_rsqrt_ph(vec);
|
__m512h sqrt = _mm512_sqrt_ph(vec);
|
||||||
v = _mm512_div_ps(v, sqrt);
|
v = _mm512_div_ph(v, sqrt);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -323,42 +325,42 @@ namespace Crafter {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Evaluates sine and cosine for all half-precision lanes of this vector.
//
// The lanes are widened to float32, handed to the sincos_f32x8 / sincos_f32x16
// helpers, and the float results are narrowed back to half precision using
// _MM_FROUND_TO_NEAREST_INT. A 512-bit vector does not fit into one float register
// once widened, so it is processed as a low and a high 256-bit half.
//
// Returns a tuple built from the helper's `s` output first and its `c` output second
// (sine, cosine going by the helper's name; confirm against the helper's definition).
std::tuple<VectorF16<Len, Packing>, VectorF16<Len, Packing>> SinCos() {
    using Result = VectorF16<Len, Packing>;

    if constexpr (std::is_same_v<VectorType, __m128h>) {
        // 128 bits of halves widen into a single 256-bit float register.
        __m256 wide = _mm256_cvtph_ps(_mm_castph_si128(v));
        __m256 sinF32, cosF32;
        sincos_f32x8(wide, sinF32, cosF32);

        const auto narrow = [](__m256 f32) {
            return _mm_castsi128_ph(_mm256_cvtps_ph(f32, _MM_FROUND_TO_NEAREST_INT));
        };
        return { Result(narrow(sinF32)), Result(narrow(cosF32)) };
    } else if constexpr (std::is_same_v<VectorType, __m256h>) {
        // 256 bits of halves widen into a single 512-bit float register.
        __m512 wide = _mm512_cvtph_ps(_mm256_castph_si256(v));
        __m512 sinF32, cosF32;
        sincos_f32x16(wide, sinF32, cosF32);

        const auto narrow = [](__m512 f32) {
            return _mm256_castsi256_ph(_mm512_cvtps_ph(f32, _MM_FROUND_TO_NEAREST_INT));
        };
        return { Result(narrow(sinF32)), Result(narrow(cosF32)) };
    } else {
        // 512-bit case: split the raw bits into two 256-bit halves first.
        __m256i lowerBits = _mm512_castsi512_si256(_mm512_castph_si512(v));
        __m256i upperBits = _mm512_extracti64x4_epi64(_mm512_castph_si512(v), 1);

        __m512 sinLower, cosLower, sinUpper, cosUpper;
        sincos_f32x16(_mm512_cvtph_ps(lowerBits), sinLower, cosLower);
        sincos_f32x16(_mm512_cvtph_ps(upperBits), sinUpper, cosUpper);

        // Narrows both float halves and stitches them back into one 512-bit half vector.
        const auto narrowAndJoin = [](__m512 lowerF32, __m512 upperF32) {
            __m256i lowerPh = _mm512_cvtps_ph(lowerF32, _MM_FROUND_TO_NEAREST_INT);
            __m256i upperPh = _mm512_cvtps_ph(upperF32, _MM_FROUND_TO_NEAREST_INT);
            return _mm512_castsi512_ph(_mm512_inserti64x4(_mm512_castsi256_si512(lowerPh), upperPh, 1));
        };
        return {
            Result(narrowAndJoin(sinLower, sinUpper)),
            Result(narrowAndJoin(cosLower, cosUpper))
        };
    }
}
|
|
||||||
|
|
||||||
template <std::array<bool, Len> values>
|
template <std::array<bool, Len> values>
|
||||||
constexpr VectorF16<Len, Packing> Negate() {
|
constexpr VectorF16<Len, Packing> Negate() {
|
||||||
|
|
@ -421,12 +423,12 @@ namespace Crafter {
|
||||||
|
|
||||||
constexpr static VectorF16<Len, Packing> Cross(VectorF16<Len, Packing> a, VectorF16<Len, Packing> b) requires(Len == 3) {
|
constexpr static VectorF16<Len, Packing> Cross(VectorF16<Len, Packing> a, VectorF16<Len, Packing> b) requires(Len == 3) {
|
||||||
if constexpr(std::is_same_v<VectorType, __m128h>) {
|
if constexpr(std::is_same_v<VectorType, __m128h>) {
|
||||||
constexpr std::array<std::uint8_t, 16> shuffleMask1 = GetShuffleMaskEpi8<{1,2,0}>();
|
constexpr std::array<std::uint8_t, 16> shuffleMask1 = GetShuffleMaskEpi8<{{1,2,0}}>();
|
||||||
__m128i shuffleVec1 = _mm_loadu_epi8(shuffleMask1.data());
|
__m128i shuffleVec1 = _mm_loadu_epi8(shuffleMask1.data());
|
||||||
__m128h row1 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(a.v), shuffleVec1));
|
__m128h row1 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(a.v), shuffleVec1));
|
||||||
__m128h row4 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(b.v), shuffleVec1));
|
__m128h row4 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(b.v), shuffleVec1));
|
||||||
|
|
||||||
constexpr std::array<std::uint8_t, 16> shuffleMask3 = GetShuffleMaskEpi8<{2,0,1}>();
|
constexpr std::array<std::uint8_t, 16> shuffleMask3 = GetShuffleMaskEpi8<{{2,0,1}}>();
|
||||||
__m128i shuffleVec3 = _mm_loadu_epi8(shuffleMask3.data());
|
__m128i shuffleVec3 = _mm_loadu_epi8(shuffleMask3.data());
|
||||||
__m128h row3 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(a.v), shuffleVec3));
|
__m128h row3 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(a.v), shuffleVec3));
|
||||||
__m128h row2 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(b.v), shuffleVec3));
|
__m128h row2 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(b.v), shuffleVec3));
|
||||||
|
|
@ -434,12 +436,12 @@ namespace Crafter {
|
||||||
__m128h result = _mm_mul_ph(row3, row4);
|
__m128h result = _mm_mul_ph(row3, row4);
|
||||||
return _mm_fmsub_ph(row1,row2,result);
|
return _mm_fmsub_ph(row1,row2,result);
|
||||||
} else if constexpr (std::is_same_v<VectorType, __m256h>) {
|
} else if constexpr (std::is_same_v<VectorType, __m256h>) {
|
||||||
constexpr std::array<std::uint8_t, 32> shuffleMask1 = GetShuffleMaskEpi8<{1,2,0}>();
|
constexpr std::array<std::uint8_t, 32> shuffleMask1 = GetShuffleMaskEpi8<{{1,2,0}}>();
|
||||||
__m512i shuffleVec1 = _mm512_castsi256_si512(_mm256_loadu_epi8(shuffleMask1.data()));
|
__m512i shuffleVec1 = _mm512_castsi256_si512(_mm256_loadu_epi8(shuffleMask1.data()));
|
||||||
__m256h row1 = _mm256_castsi256_ph(_mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castph_si256(a.v)), shuffleVec1)));
|
__m256h row1 = _mm256_castsi256_ph(_mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castph_si256(a.v)), shuffleVec1)));
|
||||||
__m256h row4 = _mm256_castsi256_ph(_mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castph_si256(b.v)), shuffleVec1)));
|
__m256h row4 = _mm256_castsi256_ph(_mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castph_si256(b.v)), shuffleVec1)));
|
||||||
|
|
||||||
constexpr std::array<std::uint8_t, 32> shuffleMask3 = GetShuffleMaskEpi8<{2,0,1}>();
|
constexpr std::array<std::uint8_t, 32> shuffleMask3 = GetShuffleMaskEpi8<{{2,0,1}}>();
|
||||||
|
|
||||||
__m512i shuffleVec3 = _mm512_castsi256_si512(_mm256_loadu_epi8(shuffleMask3.data()));
|
__m512i shuffleVec3 = _mm512_castsi256_si512(_mm256_loadu_epi8(shuffleMask3.data()));
|
||||||
__m256h row3 = _mm256_castsi256_ph(_mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castph_si256(a.v)), shuffleVec3)));
|
__m256h row3 = _mm256_castsi256_ph(_mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castph_si256(a.v)), shuffleVec3)));
|
||||||
|
|
@ -448,13 +450,13 @@ namespace Crafter {
|
||||||
__m256h result = _mm256_mul_ph(row3, row4);
|
__m256h result = _mm256_mul_ph(row3, row4);
|
||||||
return _mm256_fmsub_ph(row1,row2,result);
|
return _mm256_fmsub_ph(row1,row2,result);
|
||||||
} else {
|
} else {
|
||||||
constexpr std::array<std::uint8_t, 64> shuffleMask1 = GetShuffleMaskEpi8<{1,2,0}>();
|
constexpr std::array<std::uint8_t, 64> shuffleMask1 = GetShuffleMaskEpi8<{{1,2,0}}>();
|
||||||
|
|
||||||
__m512i shuffleVec1 = _mm512_loadu_epi8(shuffleMask1.data());
|
__m512i shuffleVec1 = _mm512_loadu_epi8(shuffleMask1.data());
|
||||||
__m512h row1 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(a.v), shuffleVec1));
|
__m512h row1 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(a.v), shuffleVec1));
|
||||||
__m512h row4 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(b.v), shuffleVec1));
|
__m512h row4 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(b.v), shuffleVec1));
|
||||||
|
|
||||||
constexpr std::array<std::uint8_t, 64> shuffleMask3 = GetShuffleMaskEpi8<{2,0,1}>();
|
constexpr std::array<std::uint8_t, 64> shuffleMask3 = GetShuffleMaskEpi8<{{2,0,1}}>();
|
||||||
|
|
||||||
__m512i shuffleVec3 = _mm512_loadu_epi8(shuffleMask3.data());
|
__m512i shuffleVec3 = _mm512_loadu_epi8(shuffleMask3.data());
|
||||||
__m512h row3 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(a.v), shuffleVec3));
|
__m512h row3 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(a.v), shuffleVec3));
|
||||||
|
|
@ -488,14 +490,14 @@ namespace Crafter {
|
||||||
VectorF16<Len, Packing> G,
|
VectorF16<Len, Packing> G,
|
||||||
VectorF16<Len, Packing> H
|
VectorF16<Len, Packing> H
|
||||||
) requires(Len == 8) {
|
) requires(Len == 8) {
|
||||||
constexpr std::uint8_t shuffleMaskA[] = GetShuffleMaskEpi8<{0,0,0,0,0,0,0,0}>();
|
constexpr std::array<std::uint8_t, Alignment*2> shuffleMaskA = GetShuffleMaskEpi8<{{0,0,0,0,0,0,0,0}}>();
|
||||||
constexpr std::uint8_t shuffleMaskB[] = GetShuffleMaskEpi8<{1,1,1,1,1,1,1,1}>();
|
constexpr std::array<std::uint8_t, Alignment*2> shuffleMaskB = GetShuffleMaskEpi8<{{1,1,1,1,1,1,1,1}}>();
|
||||||
constexpr std::uint8_t shuffleMaskC[] = GetShuffleMaskEpi8<{2,2,2,2,2,2,2,2}>();
|
constexpr std::array<std::uint8_t, Alignment*2> shuffleMaskC = GetShuffleMaskEpi8<{{2,2,2,2,2,2,2,2}}>();
|
||||||
constexpr std::uint8_t shuffleMaskD[] = GetShuffleMaskEpi8<{3,3,3,3,3,3,3,3}>();
|
constexpr std::array<std::uint8_t, Alignment*2> shuffleMaskD = GetShuffleMaskEpi8<{{3,3,3,3,3,3,3,3}}>();
|
||||||
constexpr std::uint8_t shuffleMaskE[] = GetShuffleMaskEpi8<{4,4,4,4,4,4,4,4}>();
|
constexpr std::array<std::uint8_t, Alignment*2> shuffleMaskE = GetShuffleMaskEpi8<{{4,4,4,4,4,4,4,4}}>();
|
||||||
constexpr std::uint8_t shuffleMaskF[] = GetShuffleMaskEpi8<{5,5,5,5,5,5,5,5}>();
|
constexpr std::array<std::uint8_t, Alignment*2> shuffleMaskF = GetShuffleMaskEpi8<{{5,5,5,5,5,5,5,5}}>();
|
||||||
constexpr std::uint8_t shuffleMaskG[] = GetShuffleMaskEpi8<{6,6,6,6,6,6,6,6}>();
|
constexpr std::array<std::uint8_t, Alignment*2> shuffleMaskG = GetShuffleMaskEpi8<{{6,6,6,6,6,6,6,6}}>();
|
||||||
constexpr std::uint8_t shuffleMaskH[] = GetShuffleMaskEpi8<{7,7,7,7,7,7,7,7}>();
|
constexpr std::array<std::uint8_t, Alignment*2> shuffleMaskH = GetShuffleMaskEpi8<{{7,7,7,7,7,7,7,7}}>();
|
||||||
|
|
||||||
if constexpr(std::is_same_v<VectorType, __m128h>) {
|
if constexpr(std::is_same_v<VectorType, __m128h>) {
|
||||||
VectorF16<Len, Packing> lenght = Length(A, B, C, D, E, F, G, H);
|
VectorF16<Len, Packing> lenght = Length(A, B, C, D, E, F, G, H);
|
||||||
|
|
@ -503,29 +505,28 @@ namespace Crafter {
|
||||||
__m128h one = _mm_loadu_ph(oneArr);
|
__m128h one = _mm_loadu_ph(oneArr);
|
||||||
__m128h fLenght = _mm_div_ph(one, lenght.v);
|
__m128h fLenght = _mm_div_ph(one, lenght.v);
|
||||||
|
|
||||||
|
__m128i shuffleVecA = _mm_loadu_epi8(shuffleMaskA.data());
|
||||||
__m128i shuffleVecA = _mm_loadu_epi8(shuffleMaskA);
|
|
||||||
__m128h fLenghtA = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA));
|
__m128h fLenghtA = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA));
|
||||||
|
|
||||||
__m128i shuffleVecB = _mm_loadu_epi8(shuffleMaskB);
|
__m128i shuffleVecB = _mm_loadu_epi8(shuffleMaskB.data());
|
||||||
__m128h fLenghtB = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecB));
|
__m128h fLenghtB = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecB));
|
||||||
|
|
||||||
__m128i shuffleVecC = _mm_loadu_epi8(shuffleMaskC);
|
__m128i shuffleVecC = _mm_loadu_epi8(shuffleMaskC.data());
|
||||||
__m128h fLenghtC = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecC));
|
__m128h fLenghtC = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecC));
|
||||||
|
|
||||||
__m128i shuffleVecD = _mm_loadu_epi8(shuffleMaskD);
|
__m128i shuffleVecD = _mm_loadu_epi8(shuffleMaskD.data());
|
||||||
__m128h fLenghtD = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecD));
|
__m128h fLenghtD = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecD));
|
||||||
|
|
||||||
__m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE);
|
__m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE.data());
|
||||||
__m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE));
|
__m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE));
|
||||||
|
|
||||||
__m128i shuffleVecF = _mm_loadu_epi8(shuffleMaskF);
|
__m128i shuffleVecF = _mm_loadu_epi8(shuffleMaskF.data());
|
||||||
__m128h fLenghtF = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecF));
|
__m128h fLenghtF = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecF));
|
||||||
|
|
||||||
__m128i shuffleVecG = _mm_loadu_epi8(shuffleMaskG);
|
__m128i shuffleVecG = _mm_loadu_epi8(shuffleMaskG.data());
|
||||||
__m128h fLenghtG = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecG));
|
__m128h fLenghtG = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecG));
|
||||||
|
|
||||||
__m128i shuffleVecH = _mm_loadu_epi8(shuffleMaskH);
|
__m128i shuffleVecH = _mm_loadu_epi8(shuffleMaskH.data());
|
||||||
__m128h fLenghtH = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecH));
|
__m128h fLenghtH = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecH));
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|
@ -544,28 +545,28 @@ namespace Crafter {
|
||||||
__m256h one = _mm256_loadu_ph(oneArr);
|
__m256h one = _mm256_loadu_ph(oneArr);
|
||||||
__m256h fLenght = _mm256_div_ph(one, lenght.v);
|
__m256h fLenght = _mm256_div_ph(one, lenght.v);
|
||||||
|
|
||||||
__m256i shuffleVecA = _mm256_loadu_epi8(shuffleMaskA);
|
__m256i shuffleVecA = _mm256_loadu_epi8(shuffleMaskA.data());
|
||||||
__m256h fLenghtA = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecA));
|
__m256h fLenghtA = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecA));
|
||||||
|
|
||||||
__m256i shuffleVecB = _mm256_loadu_epi8(shuffleMaskB);
|
__m256i shuffleVecB = _mm256_loadu_epi8(shuffleMaskB.data());
|
||||||
__m256h fLenghtB = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecB));
|
__m256h fLenghtB = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecB));
|
||||||
|
|
||||||
__m256i shuffleVecC = _mm256_loadu_epi8(shuffleMaskC);
|
__m256i shuffleVecC = _mm256_loadu_epi8(shuffleMaskC.data());
|
||||||
__m256h fLenghtC = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecC));
|
__m256h fLenghtC = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecC));
|
||||||
|
|
||||||
__m256i shuffleVecD = _mm256_loadu_epi8(shuffleMaskD);
|
__m256i shuffleVecD = _mm256_loadu_epi8(shuffleMaskD.data());
|
||||||
__m256h fLenghtD = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecD));
|
__m256h fLenghtD = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecD));
|
||||||
|
|
||||||
__m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE);
|
__m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE.data());
|
||||||
__m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE));
|
__m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE));
|
||||||
|
|
||||||
__m256i shuffleVecF = _mm256_loadu_epi8(shuffleMaskF);
|
__m256i shuffleVecF = _mm256_loadu_epi8(shuffleMaskF.data());
|
||||||
__m256h fLenghtF = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecF));
|
__m256h fLenghtF = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecF));
|
||||||
|
|
||||||
__m256i shuffleVecG = _mm256_loadu_epi8(shuffleMaskG);
|
__m256i shuffleVecG = _mm256_loadu_epi8(shuffleMaskG.data());
|
||||||
__m256h fLenghtG = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecG));
|
__m256h fLenghtG = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecG));
|
||||||
|
|
||||||
__m256i shuffleVecH = _mm256_loadu_epi8(shuffleMaskH);
|
__m256i shuffleVecH = _mm256_loadu_epi8(shuffleMaskH.data());
|
||||||
__m256h fLenghtH = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecH));
|
__m256h fLenghtH = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecH));
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|
@ -584,28 +585,28 @@ namespace Crafter {
|
||||||
__m512h one = _mm512_loadu_ph(oneArr);
|
__m512h one = _mm512_loadu_ph(oneArr);
|
||||||
__m512h fLenght = _mm512_div_ph(one, lenght.v);
|
__m512h fLenght = _mm512_div_ph(one, lenght.v);
|
||||||
|
|
||||||
__m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA);
|
__m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA.data());
|
||||||
__m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA));
|
__m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA));
|
||||||
|
|
||||||
__m512i shuffleVecB = _mm512_loadu_epi8(shuffleMaskB);
|
__m512i shuffleVecB = _mm512_loadu_epi8(shuffleMaskB.data());
|
||||||
__m512h fLenghtB = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecB));
|
__m512h fLenghtB = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecB));
|
||||||
|
|
||||||
__m512i shuffleVecC = _mm512_loadu_epi8(shuffleMaskC);
|
__m512i shuffleVecC = _mm512_loadu_epi8(shuffleMaskC.data());
|
||||||
__m512h fLenghtC = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecC));
|
__m512h fLenghtC = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecC));
|
||||||
|
|
||||||
__m512i shuffleVecD = _mm512_loadu_epi8(shuffleMaskD);
|
__m512i shuffleVecD = _mm512_loadu_epi8(shuffleMaskD.data());
|
||||||
__m512h fLenghtD = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecD));
|
__m512h fLenghtD = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecD));
|
||||||
|
|
||||||
__m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE);
|
__m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE.data());
|
||||||
__m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE));
|
__m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE));
|
||||||
|
|
||||||
__m512i shuffleVecF = _mm512_loadu_epi8(shuffleMaskF);
|
__m512i shuffleVecF = _mm512_loadu_epi8(shuffleMaskF.data());
|
||||||
__m512h fLenghtF = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecF));
|
__m512h fLenghtF = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecF));
|
||||||
|
|
||||||
__m512i shuffleVecG = _mm512_loadu_epi8(shuffleMaskG);
|
__m512i shuffleVecG = _mm512_loadu_epi8(shuffleMaskG.data());
|
||||||
__m512h fLenghtG = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecG));
|
__m512h fLenghtG = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecG));
|
||||||
|
|
||||||
__m512i shuffleVecH = _mm512_loadu_epi8(shuffleMaskH);
|
__m512i shuffleVecH = _mm512_loadu_epi8(shuffleMaskH.data());
|
||||||
__m512h fLenghtH = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecH));
|
__m512h fLenghtH = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecH));
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|
@ -627,10 +628,10 @@ namespace Crafter {
|
||||||
VectorF16<Len, Packing> E,
|
VectorF16<Len, Packing> E,
|
||||||
VectorF16<Len, Packing> G
|
VectorF16<Len, Packing> G
|
||||||
) requires(Len == 4) {
|
) requires(Len == 4) {
|
||||||
constexpr std::uint8_t shuffleMaskA[] = GetShuffleMaskEpi8<{0,0,0,0}>();
|
constexpr std::array<std::uint8_t, Alignment*2> shuffleMaskA = GetShuffleMaskEpi8<{{0,0,0,0}}>();
|
||||||
constexpr std::uint8_t shuffleMaskC[] = GetShuffleMaskEpi8<{1,1,1,1}>();
|
constexpr std::array<std::uint8_t, Alignment*2> shuffleMaskC = GetShuffleMaskEpi8<{{1,1,1,1}}>();
|
||||||
constexpr std::uint8_t shuffleMaskE[] = GetShuffleMaskEpi8<{2,2,2,2}>();
|
constexpr std::array<std::uint8_t, Alignment*2> shuffleMaskE = GetShuffleMaskEpi8<{{2,2,2,2}}>();
|
||||||
constexpr std::uint8_t shuffleMaskG[] = GetShuffleMaskEpi8<{3,3,3,3}>();
|
constexpr std::array<std::uint8_t, Alignment*2> shuffleMaskG = GetShuffleMaskEpi8<{{3,3,3,3}}>();
|
||||||
|
|
||||||
if constexpr(std::is_same_v<VectorType, __m128h>) {
|
if constexpr(std::is_same_v<VectorType, __m128h>) {
|
||||||
VectorF16<Len, Packing> lenght = Length(A, C, E, G);
|
VectorF16<Len, Packing> lenght = Length(A, C, E, G);
|
||||||
|
|
@ -638,16 +639,16 @@ namespace Crafter {
|
||||||
__m128h one = _mm_loadu_ph(oneArr);
|
__m128h one = _mm_loadu_ph(oneArr);
|
||||||
__m128h fLenght = _mm_div_ph(one, lenght.v);
|
__m128h fLenght = _mm_div_ph(one, lenght.v);
|
||||||
|
|
||||||
__m128i shuffleVecA = _mm_loadu_epi8(shuffleMaskA);
|
__m128i shuffleVecA = _mm_loadu_epi8(shuffleMaskA.data());
|
||||||
__m128h fLenghtA = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA));
|
__m128h fLenghtA = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA));
|
||||||
|
|
||||||
__m128i shuffleVecC = _mm_loadu_epi8(shuffleMaskC);
|
__m128i shuffleVecC = _mm_loadu_epi8(shuffleMaskC.data());
|
||||||
__m128h fLenghtC = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecC));
|
__m128h fLenghtC = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecC));
|
||||||
|
|
||||||
__m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE);
|
__m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE.data());
|
||||||
__m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE));
|
__m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE));
|
||||||
|
|
||||||
__m128i shuffleVecG = _mm_loadu_epi8(shuffleMaskG);
|
__m128i shuffleVecG = _mm_loadu_epi8(shuffleMaskG.data());
|
||||||
__m128h fLenghtG = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecG));
|
__m128h fLenghtG = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecG));
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|
@ -662,16 +663,16 @@ namespace Crafter {
|
||||||
__m256h one = _mm256_loadu_ph(oneArr);
|
__m256h one = _mm256_loadu_ph(oneArr);
|
||||||
__m256h fLenght = _mm256_div_ph(one, lenght.v);
|
__m256h fLenght = _mm256_div_ph(one, lenght.v);
|
||||||
|
|
||||||
__m256i shuffleVecA = _mm256_loadu_epi8(shuffleMaskA);
|
__m256i shuffleVecA = _mm256_loadu_epi8(shuffleMaskA.data());
|
||||||
__m256h fLenghtA = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecA));
|
__m256h fLenghtA = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecA));
|
||||||
|
|
||||||
__m256i shuffleVecC = _mm256_loadu_epi8(shuffleMaskC);
|
__m256i shuffleVecC = _mm256_loadu_epi8(shuffleMaskC.data());
|
||||||
__m256h fLenghtC = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecC));
|
__m256h fLenghtC = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecC));
|
||||||
|
|
||||||
__m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE);
|
__m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE.data());
|
||||||
__m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE));
|
__m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE));
|
||||||
|
|
||||||
__m256i shuffleVecG = _mm256_loadu_epi8(shuffleMaskG);
|
__m256i shuffleVecG = _mm256_loadu_epi8(shuffleMaskG.data());
|
||||||
__m256h fLenghtG = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecG));
|
__m256h fLenghtG = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecG));
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|
@ -686,16 +687,16 @@ namespace Crafter {
|
||||||
__m512h one = _mm512_loadu_ph(oneArr);
|
__m512h one = _mm512_loadu_ph(oneArr);
|
||||||
__m512h fLenght = _mm512_div_ph(one, lenght.v);
|
__m512h fLenght = _mm512_div_ph(one, lenght.v);
|
||||||
|
|
||||||
__m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA);
|
__m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA.data());
|
||||||
__m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA));
|
__m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA));
|
||||||
|
|
||||||
__m512i shuffleVecC = _mm512_loadu_epi8(shuffleMaskC);
|
__m512i shuffleVecC = _mm512_loadu_epi8(shuffleMaskC.data());
|
||||||
__m512h fLenghtC = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecC));
|
__m512h fLenghtC = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecC));
|
||||||
|
|
||||||
__m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE);
|
__m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE.data());
|
||||||
__m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE));
|
__m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE));
|
||||||
|
|
||||||
__m512i shuffleVecG = _mm512_loadu_epi8(shuffleMaskG);
|
__m512i shuffleVecG = _mm512_loadu_epi8(shuffleMaskG.data());
|
||||||
__m512h fLenghtG = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecG));
|
__m512h fLenghtG = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecG));
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|
@ -711,8 +712,8 @@ namespace Crafter {
|
||||||
VectorF16<Len, Packing> A,
|
VectorF16<Len, Packing> A,
|
||||||
VectorF16<Len, Packing> E
|
VectorF16<Len, Packing> E
|
||||||
) requires(Len == 2) {
|
) requires(Len == 2) {
|
||||||
constexpr std::uint8_t shuffleMaskA[] = GetShuffleMaskEpi8<{0,0}>();
|
constexpr std::array<std::uint8_t, Alignment*2> shuffleMaskA = GetShuffleMaskEpi8<{{0,0}}>();
|
||||||
constexpr std::uint8_t shuffleMaskE[] = GetShuffleMaskEpi8<{1,1}>();
|
constexpr std::array<std::uint8_t, Alignment*2> shuffleMaskE = GetShuffleMaskEpi8<{{1,1}}>();
|
||||||
|
|
||||||
if constexpr(std::is_same_v<VectorType, __m128h>) {
|
if constexpr(std::is_same_v<VectorType, __m128h>) {
|
||||||
VectorF16<Len, Packing> lenght = Length(A, E);
|
VectorF16<Len, Packing> lenght = Length(A, E);
|
||||||
|
|
@ -720,10 +721,10 @@ namespace Crafter {
|
||||||
__m128h one = _mm_loadu_ph(oneArr);
|
__m128h one = _mm_loadu_ph(oneArr);
|
||||||
__m128h fLenght = _mm_div_ph(one, lenght.v);
|
__m128h fLenght = _mm_div_ph(one, lenght.v);
|
||||||
|
|
||||||
__m128i shuffleVecA = _mm_loadu_epi8(shuffleMaskA);
|
__m128i shuffleVecA = _mm_loadu_epi8(shuffleMaskA.data());
|
||||||
__m128h fLenghtA = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA));
|
__m128h fLenghtA = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA));
|
||||||
|
|
||||||
__m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE);
|
__m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE.data());
|
||||||
__m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE));
|
__m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE));
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|
@ -736,10 +737,10 @@ namespace Crafter {
|
||||||
__m256h one = _mm256_loadu_ph(oneArr);
|
__m256h one = _mm256_loadu_ph(oneArr);
|
||||||
__m256h fLenght = _mm256_div_ph(one, lenght.v);
|
__m256h fLenght = _mm256_div_ph(one, lenght.v);
|
||||||
|
|
||||||
__m256i shuffleVecA = _mm256_loadu_epi8(shuffleMaskA);
|
__m256i shuffleVecA = _mm256_loadu_epi8(shuffleMaskA.data());
|
||||||
__m256h fLenghtA = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecA));
|
__m256h fLenghtA = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecA));
|
||||||
|
|
||||||
__m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE);
|
__m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE.data());
|
||||||
__m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE));
|
__m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE));
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|
@ -752,10 +753,10 @@ namespace Crafter {
|
||||||
__m512h one = _mm512_loadu_ph(oneArr);
|
__m512h one = _mm512_loadu_ph(oneArr);
|
||||||
__m512h fLenght = _mm512_div_ph(one, lenght.v);
|
__m512h fLenght = _mm512_div_ph(one, lenght.v);
|
||||||
|
|
||||||
__m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA);
|
__m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA.data());
|
||||||
__m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA));
|
__m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA));
|
||||||
|
|
||||||
__m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE);
|
__m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE.data());
|
||||||
__m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE));
|
__m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE));
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|
@ -1158,19 +1159,19 @@ namespace Crafter {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <std::array<std::uint8_t, Len> ShuffleValues>
|
template <std::array<bool, Len> ShuffleValues>
|
||||||
constexpr static VectorF16<Len, Packing> Blend(VectorF16<Len, Packing> a, VectorF16<Len, Packing> b) {
|
constexpr static VectorF16<Len, Packing> Blend(VectorF16<Len, Packing> a, VectorF16<Len, Packing> b) {
|
||||||
if constexpr(std::is_same_v<VectorType, __m128h>) {
|
if constexpr(std::is_same_v<VectorType, __m128h>) {
|
||||||
return _mm_castsi128_ph(_mm_blend_epi16(GetBlendMaskEpi16<ShuffleValues>(), _mm_castph_si128(a.v), _mm_castph_si128(b)));
|
return _mm_castsi128_ph(_mm_blend_epi16(_mm_castph_si128(a.v), _mm_castph_si128(b.v), GetBlendMaskEpi16<ShuffleValues>()));
|
||||||
} else if constexpr(std::is_same_v<VectorType, __m256h>) {
|
} else if constexpr(std::is_same_v<VectorType, __m256h>) {
|
||||||
#ifndef __AVX512BW__
|
#ifndef __AVX512BW__
|
||||||
#ifndef __AVX512VL__
|
#ifndef __AVX512VL__
|
||||||
static_assert(false, "No __AVX512BW__ and __AVX512VL__ support");
|
static_assert(false, "No __AVX512BW__ and __AVX512VL__ support");
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
return _mm256_castsi256_ph(_mm256_mask_blend_epi16(_mm256_castph_si256(a.v), _mm256_castph_si256(b), GetBlendMaskEpi16<ShuffleValues>()));
|
return _mm256_castsi256_ph(_mm256_mask_blend_epi16(_mm256_castph_si256(a.v), _mm256_castph_si256(b.v), GetBlendMaskEpi16<ShuffleValues>()));
|
||||||
} else {
|
} else {
|
||||||
return _mm512_castsi512_ph(_mm512_blend_epi16(GetBlendMaskEpi16<ShuffleValues>(), _mm512_castph_si512(a.v), _mm512_castph_si512(b)));
|
return _mm512_castsi512_ph(_mm512_blend_epi16(GetBlendMaskEpi16<ShuffleValues>(), _mm512_castph_si512(a.v), _mm512_castph_si512(b.v)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1189,34 +1190,36 @@ namespace Crafter {
|
||||||
}
|
}
|
||||||
|
|
||||||
constexpr static VectorF16<4, Packing> QuanternionFromEuler(VectorF16<3, Packing> EulerHalf) requires(Len == 4) {
|
constexpr static VectorF16<4, Packing> QuanternionFromEuler(VectorF16<3, Packing> EulerHalf) requires(Len == 4) {
|
||||||
VectorF16<4, Packing> sin = EulerHalf.Sin();
|
std::tuple<VectorF16<3, Packing>, VectorF16<3, Packing>> sinCos = EulerHalf.SinCos();
|
||||||
VectorF16<4, Packing> cos = EulerHalf.Cos();
|
VectorF16<4, Packing> sin = std::get<0>(sinCos);
|
||||||
|
VectorF16<4, Packing> cos = std::get<1>(sinCos);
|
||||||
|
|
||||||
VectorF16<4, Packing> row1 = cos.template Shuffle<{{0,0,0,0}}>();
|
VectorF16<4, Packing> row1 = cos.template Shuffle<{{0,0,0,0}}>();
|
||||||
row1 = Blend<{{0,1,1,1}}>(sin, row1);
|
row1 = Blend<{{0,1,1,1}}>(sin, row1);
|
||||||
|
|
||||||
VectorF16<4, Packing> row2 = cos.template Shuffle<{{1,1,1,1}}>();
|
VectorF16<4, Packing> row2 = cos.template Shuffle<{{1,1,1,1}}>();
|
||||||
row2 = Blend<{{1,0,1,1}}>(sin, row2);
|
row2 = Blend<{{1,0,1,1}}>(sin, row2);
|
||||||
|
|
||||||
row1 = row2;
|
row1 *= row2;
|
||||||
|
|
||||||
VectorF16<4, Packing> row3 = cos.template Shuffle<{{2,2,2,2}}>();
|
VectorF16<4, Packing> row3 = cos.template Shuffle<{{2,2,2,2}}>();
|
||||||
row3 = Blend<{{1,1,0,1}}>(sin, row3);
|
row3 = Blend<{{1,1,0,1}}>(sin, row3);
|
||||||
|
|
||||||
row1 *= row3;
|
row1 *= row3;
|
||||||
|
|
||||||
VectorF16<4, Packing> row4 = sin.template Shuffle<{{0,0,0,0}}>();
|
VectorF16<4, Packing> row4 = sin.template Shuffle<{{0,0,0,0}}>();
|
||||||
row4 = Blend<{{1,0,0,0}}>(sin, row4);
|
row4 = Blend<{{0,1,1,1}}>(cos, row4);
|
||||||
row1 *= row4;
|
|
||||||
row1 = row1.template Negate<{{true,false,true}}>();
|
|
||||||
|
|
||||||
VectorF16<4, Packing> row5 = sin.template Shuffle<{{1,1,1,1}}>();
|
VectorF16<4, Packing> row5 = sin.template Shuffle<{{1,1,1,1}}>();
|
||||||
row5 = Blend<{{0,1,0,0}}>(sin, row5);
|
row5 = Blend<{{1,0,1,1}}>(cos, row5);
|
||||||
|
|
||||||
|
row4 *= row5;
|
||||||
|
|
||||||
VectorF16<4, Packing> row6 = sin.template Shuffle<{{2,2,2,2}}>();
|
VectorF16<4, Packing> row6 = sin.template Shuffle<{{2,2,2,2}}>();
|
||||||
row6 = Blend<{{0,0,1,0}}>(sin, row6);
|
row6 = Blend<{{1,1,0,1}}>(cos, row6);
|
||||||
|
row6 = row6.template Negate<{{true,false,true,false}}>();
|
||||||
|
|
||||||
row1 = MulitplyAdd(row5, row6, row1);
|
row1 = MulitplyAdd(row4, row6, row1);
|
||||||
|
|
||||||
return row1;
|
return row1;
|
||||||
}
|
}
|
||||||
|
|
@ -1323,7 +1326,7 @@ namespace Crafter {
|
||||||
return mask;
|
return mask;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <std::array<std::uint8_t, Len> ShuffleValues>
|
template <std::array<bool, Len> ShuffleValues>
|
||||||
static consteval std::uint16_t GetBlendMaskEpi16() requires (std::is_same_v<VectorType, __m256h>){
|
static consteval std::uint16_t GetBlendMaskEpi16() requires (std::is_same_v<VectorType, __m256h>){
|
||||||
std::uint16_t mask = 0;
|
std::uint16_t mask = 0;
|
||||||
for (std::uint8_t i2 = 0; i2 < Packing; i2++) {
|
for (std::uint8_t i2 = 0; i2 < Packing; i2++) {
|
||||||
|
|
@ -1336,7 +1339,7 @@ namespace Crafter {
|
||||||
return mask;
|
return mask;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <std::array<std::uint8_t, Len> ShuffleValues>
|
template <std::array<bool, Len> ShuffleValues>
|
||||||
static consteval std::uint32_t GetBlendMaskEpi16() requires (std::is_same_v<VectorType, __m512h>){
|
static consteval std::uint32_t GetBlendMaskEpi16() requires (std::is_same_v<VectorType, __m512h>){
|
||||||
std::uint32_t mask = 0;
|
std::uint32_t mask = 0;
|
||||||
for (std::uint8_t i2 = 0; i2 < Packing; i2++) {
|
for (std::uint8_t i2 = 0; i2 < Packing; i2++) {
|
||||||
|
|
|
||||||
|
|
@ -16,379 +16,69 @@ You should have received a copy of the GNU Lesser General Public
|
||||||
License along with this library; if not, write to the Free Software
|
License along with this library; if not, write to the Free Software
|
||||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
*/
|
*/
|
||||||
|
#include <cmath>
|
||||||
import Crafter.Math;
|
import Crafter.Math;
|
||||||
import std;
|
import std;
|
||||||
using namespace Crafter;
|
using namespace Crafter;
|
||||||
|
|
||||||
|
template <std::uint32_t MaxSize, std::uint32_t Len, std::uint32_t Packing>
|
||||||
|
std::string* TestLoadStore() {
|
||||||
|
_Float16 floats[Len * Packing];
|
||||||
|
for (std::uint32_t i = 0; i < Len * Packing; i++) {
|
||||||
|
floats[i] = static_cast<_Float16>(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
VectorF16<Len, Packing> vec(floats);
|
||||||
|
auto stored = vec.Store();
|
||||||
|
for (std::uint32_t i = 0; i < Len * Packing; i++) {
|
||||||
|
if (stored.v[i] != floats[i]) {
|
||||||
|
return new std::string(std::format("Load/Store mismatch at Len={} Packing={}, Expected: {}, Got: {}", Len, Packing, (float)(floats[i]), (float)stored.v[i]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <std::uint32_t MaxSize, std::uint32_t Len, std::uint32_t Packing>
|
||||||
|
std::string* TestAdd() {
|
||||||
|
_Float16 floats[Len * Packing];
|
||||||
|
for (std::uint32_t i = 0; i < Len * Packing; i++) {
|
||||||
|
floats[i] = static_cast<_Float16>(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
VectorF16<Len, Packing> vec(floats);
|
||||||
|
vec = vec + vec;
|
||||||
|
auto stored = vec.Store();
|
||||||
|
for (std::uint32_t i = 0; i < Len * Packing; i++) {
|
||||||
|
if (stored.v[i] != floats[i] + floats[i]) {
|
||||||
|
return new std::string(std::format("Add mismatch at Len={} Packing={}, Expected: {}, Got: {}", Len, Packing, (float)(floats[i] + floats[i]), (float)stored.v[i]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template <std::uint32_t MaxSize, std::uint32_t Len = 1, std::uint32_t Packing = 1>
|
||||||
|
std::string* TestAllCombinations() {
|
||||||
|
if constexpr (Len > MaxSize) {
|
||||||
|
return nullptr;
|
||||||
|
} else if constexpr (Len * Packing > MaxSize) {
|
||||||
|
return TestAllCombinations<MaxSize, Len + 1, 1>();
|
||||||
|
} else {
|
||||||
|
std::string* result = TestLoadStore<MaxSize, Len, Packing>();
|
||||||
|
if (result) return result;
|
||||||
|
result = TestAdd<MaxSize, Len, Packing>();
|
||||||
|
if (result) return result;
|
||||||
|
return TestAllCombinations<MaxSize, Len, Packing + 1>();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
std::string* RunTest() {
|
std::string* RunTest() {
|
||||||
// Test 1: Load/Store functionality
|
std::string* err = TestAllCombinations<VectorF16<1, 1>::MaxSize>();
|
||||||
{
|
if (err) {
|
||||||
_Float16 floats[] {0,1,2,3,4,5,6,7};
|
return err;
|
||||||
VectorF16<8, 1> vec1(floats);
|
|
||||||
|
|
||||||
Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = vec1.Store();
|
|
||||||
for(std::uint8_t i = 0; i < 8; i++) {
|
|
||||||
if(stored.v[i] != floats[i]) {
|
|
||||||
return new std::string("Load Store does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Test 2: Addition operator
|
|
||||||
{
|
|
||||||
_Float16 floats[] {0,1,2,3,4,5,6,7};
|
|
||||||
VectorF16<8, 1> vec1(floats);
|
|
||||||
VectorF16<8, 1> result = vec1 + vec1;
|
|
||||||
Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = result.Store();
|
|
||||||
for(std::uint8_t i = 0; i < 8; i++) {
|
|
||||||
if(stored.v[i] != floats[i] + floats[i]) {
|
|
||||||
return new std::string("Add does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 3: Subtraction operator
|
|
||||||
{
|
|
||||||
_Float16 floats[] {0,1,2,3,4,5,6,7};
|
|
||||||
VectorF16<8, 1> vec1(floats);
|
|
||||||
VectorF16<8, 1> result = vec1 - vec1;
|
|
||||||
Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = result.Store();
|
|
||||||
for(std::uint8_t i = 0; i < 8; i++) {
|
|
||||||
if(stored.v[i] != floats[i] - floats[i]) {
|
|
||||||
return new std::string("Subtract does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 4: Multiplication operator
|
|
||||||
{
|
|
||||||
_Float16 floats[] {1,2,3,4,5,6,7,8};
|
|
||||||
VectorF16<8, 1> vec1(floats);
|
|
||||||
VectorF16<8, 1> result = vec1 * vec1;
|
|
||||||
Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = result.Store();
|
|
||||||
for(std::uint8_t i = 0; i < 8; i++) {
|
|
||||||
if(stored.v[i] != floats[i] * floats[i]) {
|
|
||||||
return new std::string("Multiply does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 5: Division operator
|
|
||||||
{
|
|
||||||
_Float16 floats[] {2,4,6,8,10,12,14,16};
|
|
||||||
VectorF16<8, 1> vec1(floats);
|
|
||||||
VectorF16<8, 1> result = vec1 / vec1;
|
|
||||||
Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = result.Store();
|
|
||||||
for(std::uint8_t i = 0; i < 8; i++) {
|
|
||||||
if(stored.v[i] != floats[i] / floats[i]) {
|
|
||||||
return new std::string("Divide does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 6: Compound addition operator
|
|
||||||
{
|
|
||||||
_Float16 floats[] {1,2,3,4,5,6,7,8};
|
|
||||||
VectorF16<8, 1> vec1(floats);
|
|
||||||
VectorF16<8, 1> vec2(floats);
|
|
||||||
vec1 += vec2;
|
|
||||||
Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = vec1.Store();
|
|
||||||
for(std::uint8_t i = 0; i < 8; i++) {
|
|
||||||
if(stored.v[i] != floats[i] + floats[i]) {
|
|
||||||
return new std::string("Compound Add does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 7: Compound subtraction operator
|
|
||||||
{
|
|
||||||
_Float16 floats[] {1,2,3,4,5,6,7,8};
|
|
||||||
VectorF16<8, 1> vec1(floats);
|
|
||||||
VectorF16<8, 1> vec2(floats);
|
|
||||||
vec1 -= vec2;
|
|
||||||
Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = vec1.Store();
|
|
||||||
for(std::uint8_t i = 0; i < 8; i++) {
|
|
||||||
if(stored.v[i] != floats[i] - floats[i]) {
|
|
||||||
return new std::string("Compound Subtract does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 8: Compound multiplication operator
|
|
||||||
{
|
|
||||||
_Float16 floats[] {1,2,3,4,5,6,7,8};
|
|
||||||
VectorF16<8, 1> vec1(floats);
|
|
||||||
VectorF16<8, 1> vec2(floats);
|
|
||||||
vec1 *= vec2;
|
|
||||||
Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = vec1.Store();
|
|
||||||
for(std::uint8_t i = 0; i < 8; i++) {
|
|
||||||
if(stored.v[i] != floats[i] * floats[i]) {
|
|
||||||
return new std::string("Compound Multiply does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 9: Compound division operator
|
|
||||||
{
|
|
||||||
_Float16 floats[] {2,4,6,8,10,12,14,16};
|
|
||||||
VectorF16<8, 1> vec1(floats);
|
|
||||||
VectorF16<8, 1> vec2(floats);
|
|
||||||
vec1 /= vec2;
|
|
||||||
Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = vec1.Store();
|
|
||||||
for(std::uint8_t i = 0; i < 8; i++) {
|
|
||||||
if(stored.v[i] != floats[i] / floats[i]) {
|
|
||||||
return new std::string("Compound Divide does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 10: Scalar addition
|
|
||||||
{
|
|
||||||
_Float16 floats[] {1,2,3,4,5,6,7,8};
|
|
||||||
VectorF16<8, 1> vec1(floats);
|
|
||||||
VectorF16<8, 1> result = vec1 + _Float16(1.0);
|
|
||||||
Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = result.Store();
|
|
||||||
for(std::uint8_t i = 0; i < 8; i++) {
|
|
||||||
if(stored.v[i] != floats[i] + 1.0) {
|
|
||||||
return new std::string("Scalar Add does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 11: Scalar subtraction
|
|
||||||
{
|
|
||||||
_Float16 floats[] {1,2,3,4,5,6,7,8};
|
|
||||||
VectorF16<8, 1> vec1(floats);
|
|
||||||
VectorF16<8, 1> result = vec1 - _Float16(1.0);
|
|
||||||
Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = result.Store();
|
|
||||||
for(std::uint8_t i = 0; i < 8; i++) {
|
|
||||||
if(stored.v[i] != floats[i] - 1.0) {
|
|
||||||
return new std::string("Scalar Subtract does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 12: Scalar multiplication
|
|
||||||
{
|
|
||||||
_Float16 floats[] {1,2,3,4,5,6,7,8};
|
|
||||||
VectorF16<8, 1> vec1(floats);
|
|
||||||
VectorF16<8, 1> result = vec1 * _Float16(2.0);
|
|
||||||
Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = result.Store();
|
|
||||||
for(std::uint8_t i = 0; i < 8; i++) {
|
|
||||||
if(stored.v[i] != floats[i] * 2.0) {
|
|
||||||
return new std::string("Scalar Multiply does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 13: Scalar division
|
|
||||||
{
|
|
||||||
_Float16 floats[] {2,4,6,8,10,12,14,16};
|
|
||||||
VectorF16<8, 1> vec1(floats);
|
|
||||||
VectorF16<8, 1> result = vec1 / _Float16(2.0);
|
|
||||||
Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = result.Store();
|
|
||||||
for(std::uint8_t i = 0; i < 8; i++) {
|
|
||||||
if(stored.v[i] != floats[i] / 2.0) {
|
|
||||||
return new std::string("Scalar Divide does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 14: Compound scalar addition
|
|
||||||
{
|
|
||||||
_Float16 floats[] {1,2,3,4,5,6,7,8};
|
|
||||||
VectorF16<8, 1> vec1(floats);
|
|
||||||
vec1 += _Float16(1.0);
|
|
||||||
Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = vec1.Store();
|
|
||||||
for(std::uint8_t i = 0; i < 8; i++) {
|
|
||||||
if(stored.v[i] != floats[i] + 1.0) {
|
|
||||||
return new std::string("Compound Scalar Add does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 15: Compound scalar subtraction
|
|
||||||
{
|
|
||||||
_Float16 floats[] {1,2,3,4,5,6,7,8};
|
|
||||||
VectorF16<8, 1> vec1(floats);
|
|
||||||
vec1 -= _Float16(1.0);
|
|
||||||
Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = vec1.Store();
|
|
||||||
for(std::uint8_t i = 0; i < 8; i++) {
|
|
||||||
if(stored.v[i] != floats[i] - 1.0) {
|
|
||||||
return new std::string("Compound Scalar Subtract does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 16: Compound scalar multiplication
|
|
||||||
{
|
|
||||||
_Float16 floats[] {1,2,3,4,5,6,7,8};
|
|
||||||
VectorF16<8, 1> vec1(floats);
|
|
||||||
vec1 *= _Float16(2.0);
|
|
||||||
Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = vec1.Store();
|
|
||||||
for(std::uint8_t i = 0; i < 8; i++) {
|
|
||||||
if(stored.v[i] != floats[i] * 2.0) {
|
|
||||||
return new std::string("Compound Scalar Multiply does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 17: Compound scalar division
|
|
||||||
{
|
|
||||||
_Float16 floats[] {2,4,6,8,10,12,14,16};
|
|
||||||
VectorF16<8, 1> vec1(floats);
|
|
||||||
vec1 /= _Float16(2.0);
|
|
||||||
Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = vec1.Store();
|
|
||||||
for(std::uint8_t i = 0; i < 8; i++) {
|
|
||||||
if(stored.v[i] != floats[i] / 2.0) {
|
|
||||||
return new std::string("Compound Scalar Divide does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 18: Equality operator
|
|
||||||
{
|
|
||||||
_Float16 floats[] {1,2,3,4,5,6,7,8};
|
|
||||||
VectorF16<8, 1> vec1(floats);
|
|
||||||
VectorF16<8, 1> vec2(floats);
|
|
||||||
if (!(vec1 == vec2)) {
|
|
||||||
return new std::string("Equality operator does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 19: Inequality operator
|
|
||||||
{
|
|
||||||
_Float16 floats1[] {1,2,3,4,5,6,7,8};
|
|
||||||
_Float16 floats2[] {2,3,4,5,6,7,8,9};
|
|
||||||
VectorF16<8, 1> vec1(floats1);
|
|
||||||
VectorF16<8, 1> vec2(floats2);
|
|
||||||
if (!(vec1 != vec2)) {
|
|
||||||
return new std::string("Inequality operator does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 20: Negation operator
|
|
||||||
{
|
|
||||||
_Float16 floats[] {1,2,3,4,5,6,7,8};
|
|
||||||
VectorF16<8, 1> vec1(floats);
|
|
||||||
VectorF16<8, 1> result = -vec1;
|
|
||||||
Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = result.Store();
|
|
||||||
for(std::uint8_t i = 0; i < 8; i++) {
|
|
||||||
if(stored.v[i] != -floats[i]) {
|
|
||||||
return new std::string("Negation operator does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 21: Length calculation
|
|
||||||
{
|
|
||||||
_Float16 floats[] {3,4,0,0,0,0,0,0};
|
|
||||||
VectorF16<8, 1> vec1(floats);
|
|
||||||
_Float16 length = vec1.Length();
|
|
||||||
_Float16 expectedLength = 5.0; // sqrt(3^2 + 4^2)
|
|
||||||
if (std::abs((float)length - (float)expectedLength) > 0.001) {
|
|
||||||
return new std::string("Length calculation does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 22: Length squared calculation
|
|
||||||
{
|
|
||||||
_Float16 floats[] {3,4,0,0,0,0,0,0};
|
|
||||||
VectorF16<8, 1> vec1(floats);
|
|
||||||
_Float16 lengthSq = vec1.LengthSq();
|
|
||||||
_Float16 expectedLengthSq = 25.0; // 3^2 + 4^2
|
|
||||||
if (std::abs((float)lengthSq - (float)expectedLengthSq) > 0.001) {
|
|
||||||
return new std::string("Length squared calculation does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 25: Shuffle operation
|
|
||||||
{
|
|
||||||
_Float16 floats[] {1,2,3,4,5,6,7,8};
|
|
||||||
VectorF16<8, 1> vec1(floats);
|
|
||||||
// Shuffle indices 0,1,2,3 -> 3,2,1,0 (reverse first 4 elements)
|
|
||||||
VectorF16<8, 1> result = vec1.template Shuffle<{{3,2,1,0,7,6,5,4}}>();
|
|
||||||
Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = result.Store();
|
|
||||||
if (stored.v[0] != 4 || stored.v[1] != 3 || stored.v[2] != 2 || stored.v[3] != 1) {
|
|
||||||
return new std::string("Shuffle operation does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 26: Dot product
|
|
||||||
{
|
|
||||||
_Float16 floats1[] {1,2,3,4,0,0,0,0};
|
|
||||||
_Float16 floats2[] {2,3,4,5,0,0,0,0};
|
|
||||||
VectorF16<8, 1> vec1(floats1);
|
|
||||||
VectorF16<8, 1> vec2(floats2);
|
|
||||||
_Float16 dot = VectorF16<8, 1>::Dot(vec1, vec2);
|
|
||||||
_Float16 expectedDot = 1*2 + 2*3 + 3*4 + 4*5; // 2 + 6 + 12 + 20 = 40
|
|
||||||
if (std::abs((float)dot - (float)expectedDot) > 0.001) {
|
|
||||||
return new std::string("Dot product does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 27: Cross product (for 3D vectors)
|
|
||||||
{
|
|
||||||
_Float16 floats1[] {1,2,3,0,0,0,0,0};
|
|
||||||
_Float16 floats2[] {4,5,6,0,0,0,0,0};
|
|
||||||
VectorF16<3, 1> vec1(floats1);
|
|
||||||
VectorF16<3, 1> vec2(floats2);
|
|
||||||
VectorF16<3, 1> result = VectorF16<3, 1>::Cross(vec1, vec2);
|
|
||||||
Vector<_Float16, 3, VectorF16<3, 1>::Alignment> stored = result.Store();
|
|
||||||
// Cross product: (1,2,3) x (4,5,6) = (2*6-3*5, 3*4-1*6, 1*5-2*4) = (-3, 6, -3)
|
|
||||||
if (stored.v[0] != -3 || stored.v[1] != 6 || stored.v[2] != -3) {
|
|
||||||
return new std::string("Cross product does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 28: Multiply-Add operation
|
|
||||||
{
|
|
||||||
_Float16 floats1[] {1,2,3,4,0,0,0,0};
|
|
||||||
_Float16 floats2[] {2,3,4,5,0,0,0,0};
|
|
||||||
_Float16 floats3[] {1,1,1,1,0,0,0,0};
|
|
||||||
VectorF16<8, 1> vec1(floats1);
|
|
||||||
VectorF16<8, 1> vec2(floats2);
|
|
||||||
VectorF16<8, 1> vec3(floats3);
|
|
||||||
VectorF16<8, 1> result = VectorF16<8, 1>::MulitplyAdd(vec1, vec2, vec3);
|
|
||||||
Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = result.Store();
|
|
||||||
// Should compute (1*2 + 1, 2*3 + 1, 3*4 + 1, 4*5 + 1, ...) = (3, 7, 13, 21, ...)
|
|
||||||
for(std::uint8_t i = 0; i < 4; i++) {
|
|
||||||
if(stored.v[i] != floats1[i]*floats2[i] + floats3[i]) {
|
|
||||||
return new std::string("Multiply-Add operation does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 29: Multiply-Subtract operation
|
|
||||||
{
|
|
||||||
_Float16 floats1[] {1,2,3,4,0,0,0,0};
|
|
||||||
_Float16 floats2[] {2,3,4,5,0,0,0,0};
|
|
||||||
_Float16 floats3[] {1,1,1,1,0,0,0,0};
|
|
||||||
VectorF16<8, 1> vec1(floats1);
|
|
||||||
VectorF16<8, 1> vec2(floats2);
|
|
||||||
VectorF16<8, 1> vec3(floats3);
|
|
||||||
VectorF16<8, 1> result = VectorF16<8, 1>::MulitplySub(vec1, vec2, vec3);
|
|
||||||
Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = result.Store();
|
|
||||||
// Should compute (1*2 - 1, 2*3 - 1, 3*4 - 1, 4*5 - 1, ...) = (1, 5, 11, 19, ...)
|
|
||||||
for(std::uint8_t i = 0; i < 4; i++) {
|
|
||||||
if(stored.v[i] != floats1[i]*floats2[i] - floats3[i]) {
|
|
||||||
return new std::string("Multiply-Subtract operation does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test 30: Constructor with single value
|
|
||||||
{
|
|
||||||
VectorF16<8, 1> vec1(_Float16(5.0));
|
|
||||||
Vector<_Float16, 8, VectorF16<8, 1>::Alignment> stored = vec1.Store();
|
|
||||||
for(std::uint8_t i = 0; i < 8; i++) {
|
|
||||||
if(stored.v[i] != 5.0) {
|
|
||||||
return new std::string("Single value constructor does not match");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue