more tests
This commit is contained in:
parent
b582e168e3
commit
e0f992aada
2 changed files with 125 additions and 71 deletions
|
|
@ -92,7 +92,7 @@ namespace Crafter {
|
|||
|
||||
template <std::uint32_t BLen, std::uint32_t BPacking>
|
||||
constexpr operator VectorF16<BLen, BPacking>() const {
|
||||
if(constexpr Len == Blen) {
|
||||
if constexpr (Len == BLen) {
|
||||
if constexpr(std::is_same_v<VectorType, __m256h> && std::is_same_v<typename VectorF16<BLen, BPacking>::VectorType, __m128h>) {
|
||||
return VectorF16<BLen, BPacking>(_mm256_castph256_ph128(v));
|
||||
} else if constexpr(std::is_same_v<VectorType, __m512h> && std::is_same_v<typename VectorF16<BLen, BPacking>::VectorType, __m128h>) {
|
||||
|
|
@ -109,7 +109,7 @@ namespace Crafter {
|
|||
return VectorF16<BLen, BPacking>(v);
|
||||
}
|
||||
} else {
|
||||
return ExtractLo<BLen>();
|
||||
return this->template ExtractLo<BLen>();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -258,7 +258,71 @@ namespace Crafter {
|
|||
}
|
||||
}
|
||||
|
||||
constexpr void Normalize() {
|
||||
template <std::uint32_t ExtractLen>
|
||||
static consteval std::array<std::uint8_t, VectorF16<ExtractLen, Packing>::Alignment*2> GetExtractLoMaskEpi8() {
|
||||
std::array<std::uint8_t, VectorF16<ExtractLen, Packing>::Alignment*2> mask {{0}};
|
||||
for(std::uint8_t i2 = 0; i2 < Packing; i2++) {
|
||||
for(std::uint8_t i = 0; i < ExtractLen; i++) {
|
||||
mask[(i2*ExtractLen*2)+(i*2)] = i*2+(i2*Len*2);
|
||||
mask[(i2*ExtractLen*2)+(i*2+1)] = i*2+1+(i2*Len*2);
|
||||
}
|
||||
}
|
||||
return mask;
|
||||
}
|
||||
|
||||
template <std::uint32_t ExtractLen>
|
||||
static consteval std::array<std::uint16_t, VectorF16<ExtractLen, Packing>::Alignment> GetExtractLoMaskEpi16() {
|
||||
std::array<std::uint16_t, VectorF16<ExtractLen, Packing>::Alignment> mask{};
|
||||
for (std::uint16_t i2 = 0; i2 < Packing; i2++) {
|
||||
for (std::uint16_t i = 0; i < ExtractLen; i++) {
|
||||
mask[i2 * ExtractLen + i] = i + (i2 * Len);
|
||||
}
|
||||
}
|
||||
return mask;
|
||||
}
|
||||
|
||||
template<std::uint32_t ExtractLen>
|
||||
constexpr VectorF16<ExtractLen, Packing> ExtractLo() const {
|
||||
if constexpr(Packing > 1) {
|
||||
if constexpr(std::is_same_v<VectorType, __m128h>) {
|
||||
constexpr std::array<std::uint8_t, VectorF16<ExtractLen, Packing>::Alignment*2> shuffleMask = GetExtractLoMaskEpi8<ExtractLen>();
|
||||
__m128i shuffleVec = _mm_loadu_epi8(shuffleMask.data());
|
||||
return VectorF16<ExtractLen, Packing>(_mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(v), shuffleVec)));
|
||||
} else if constexpr(std::is_same_v<VectorType, __m256h>) {
|
||||
constexpr std::array<std::uint16_t, VectorF16<ExtractLen, Packing>::Alignment> permMask = GetExtractLoMaskEpi16<ExtractLen>();
|
||||
__m256i permIdx = _mm256_loadu_epi16(permMask.data());
|
||||
__m256i result = _mm256_permutexvar_epi16(permIdx, _mm256_castph_si256(v));
|
||||
if constexpr(std::is_same_v<typename VectorF16<ExtractLen, Packing>::VectorType, __m128h>) {
|
||||
return VectorF16<ExtractLen, Packing>(_mm256_castph256_ph128(_mm256_castsi256_ph(result)));
|
||||
} else {
|
||||
return VectorF16<ExtractLen, Packing>(_mm256_castsi256_ph(result));
|
||||
}
|
||||
} else {
|
||||
constexpr std::array<std::uint16_t, VectorF16<ExtractLen, Packing>::Alignment> permMask = GetExtractLoMaskEpi16<ExtractLen>();
|
||||
__m512i permIdx = _mm512_loadu_epi16(permMask.data());
|
||||
__m512i result = _mm512_permutexvar_epi16(permIdx, _mm512_castph_si512(v));
|
||||
if constexpr(std::is_same_v<typename VectorF16<ExtractLen, Packing>::VectorType, __m128h>) {
|
||||
return VectorF16<ExtractLen, Packing>(_mm512_castph512_ph128(_mm512_castsi512_ph(result)));
|
||||
} else if constexpr(std::is_same_v<typename VectorF16<ExtractLen, Packing>::VectorType, __m256h>) {
|
||||
return VectorF16<ExtractLen, Packing>(_mm512_castph512_ph256(_mm512_castsi512_ph(result)));
|
||||
} else {
|
||||
return VectorF16<ExtractLen, Packing>(_mm512_castsi512_ph(result));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if constexpr(std::is_same_v<VectorType, __m256h> && std::is_same_v<typename VectorF16<ExtractLen, Packing>::VectorType, __m128h>) {
|
||||
return VectorF16<ExtractLen, Packing>(_mm256_castph256_ph128(v));
|
||||
} else if constexpr(std::is_same_v<VectorType, __m512h> && std::is_same_v<typename VectorF16<ExtractLen, Packing>::VectorType, __m128h>) {
|
||||
return VectorF16<ExtractLen, Packing>(_mm512_castph512_ph128(v));
|
||||
} else if constexpr(std::is_same_v<VectorType, __m512h> && std::is_same_v<typename VectorF16<ExtractLen, Packing>::VectorType, __m256h>) {
|
||||
return VectorF16<ExtractLen, Packing>(_mm512_castph512_ph256(v));
|
||||
} else {
|
||||
return VectorF16<ExtractLen, Packing>(v);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
constexpr void Normalize() requires(Packing == 1) {
|
||||
if constexpr(std::is_same_v<VectorType, __m128h>) {
|
||||
_Float16 dot = LengthSq();
|
||||
__m128h vec = _mm_set1_ph(dot);
|
||||
|
|
@ -277,12 +341,12 @@ namespace Crafter {
|
|||
}
|
||||
}
|
||||
|
||||
constexpr _Float16 Length() const {
|
||||
/// Returns the square root of LengthSq(). Only available when Packing == 1.
/// The squared length is passed through std::sqrtf and the result is
/// converted back to _Float16 on return.
constexpr _Float16 Length() const requires(Packing == 1) {
    return std::sqrtf(LengthSq());
}
|
||||
|
||||
constexpr _Float16 LengthSq() const {
|
||||
/// Returns Dot of this vector with itself. Only available when Packing == 1.
constexpr _Float16 LengthSq() const requires(Packing == 1) {
    const auto& self = *this;
    return Dot(self, self);
}
|
||||
|
||||
|
|
@ -369,7 +433,6 @@ namespace Crafter {
|
|||
template <std::array<bool, Len> values>
|
||||
constexpr VectorF16<Len, Packing> Negate() {
|
||||
std::array<std::uint16_t, Alignment> mask = GetNegateMask<values>();
|
||||
std::println("{}", mask);
|
||||
if constexpr(std::is_same_v<VectorType, __m128h>) {
|
||||
return VectorF16<Len, Packing>(_mm_castsi128_ph(_mm_xor_si128(_mm_castph_si128(v), _mm_loadu_epi16(mask.data()))));
|
||||
} else if constexpr(std::is_same_v<VectorType, __m256h>) {
|
||||
|
|
@ -391,15 +454,15 @@ namespace Crafter {
|
|||
}
|
||||
} else {
|
||||
if constexpr(std::is_same_v<VectorType, __m128h>) {
|
||||
constexpr std::array<std::uint8_t, 16> shuffleMask = GetShuffleMaskEpi8<ShuffleValues>();
|
||||
constexpr std::array<std::uint8_t, VectorF16<Len, Packing>::Alignment*2> shuffleMask = GetShuffleMaskEpi8<ShuffleValues>();
|
||||
__m128i shuffleVec = _mm_loadu_epi8(shuffleMask.data());
|
||||
return VectorF16<Len, Packing>(_mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(v), shuffleVec)));
|
||||
} else if constexpr(std::is_same_v<VectorType, __m256h>) {
|
||||
constexpr std::array<std::uint8_t, 32> shuffleMask = GetShuffleMaskEpi8<ShuffleValues>();
|
||||
constexpr std::array<std::uint8_t, VectorF16<Len, Packing>::Alignment*2> shuffleMask = GetShuffleMaskEpi8<ShuffleValues>();
|
||||
__m256i shuffleVec = _mm256_loadu_epi8(shuffleMask.data());
|
||||
return VectorF16<Len, Packing>(_mm256_castsi256_ph(_mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castph_si256(v)), _mm512_castsi256_si512(shuffleVec)))));
|
||||
} else {
|
||||
constexpr std::array<std::uint8_t, 64> shuffleMask = GetShuffleMaskEpi8<ShuffleValues>();
|
||||
constexpr std::array<std::uint8_t, VectorF16<Len, Packing>::Alignment*2> shuffleMask = GetShuffleMaskEpi8<ShuffleValues>();
|
||||
__m512i shuffleVec = _mm512_loadu_epi8(shuffleMask.data());
|
||||
return VectorF16<Len, Packing>(_mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(v), shuffleVec)));
|
||||
}
|
||||
|
|
@ -428,12 +491,12 @@ namespace Crafter {
|
|||
|
||||
constexpr static VectorF16<Len, Packing> Cross(VectorF16<Len, Packing> a, VectorF16<Len, Packing> b) requires(Len == 3) {
|
||||
if constexpr(std::is_same_v<VectorType, __m128h>) {
|
||||
constexpr std::array<std::uint8_t, 16> shuffleMask1 = GetShuffleMaskEpi8<{{1,2,0}}>();
|
||||
constexpr std::array<std::uint8_t, VectorF16<Len, Packing>::Alignment*2> shuffleMask1 = GetShuffleMaskEpi8<{{1,2,0}}>();
|
||||
__m128i shuffleVec1 = _mm_loadu_epi8(shuffleMask1.data());
|
||||
__m128h row1 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(a.v), shuffleVec1));
|
||||
__m128h row4 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(b.v), shuffleVec1));
|
||||
|
||||
constexpr std::array<std::uint8_t, 16> shuffleMask3 = GetShuffleMaskEpi8<{{2,0,1}}>();
|
||||
constexpr std::array<std::uint8_t, VectorF16<Len, Packing>::Alignment*2> shuffleMask3 = GetShuffleMaskEpi8<{{2,0,1}}>();
|
||||
__m128i shuffleVec3 = _mm_loadu_epi8(shuffleMask3.data());
|
||||
__m128h row3 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(a.v), shuffleVec3));
|
||||
__m128h row2 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(b.v), shuffleVec3));
|
||||
|
|
@ -441,12 +504,12 @@ namespace Crafter {
|
|||
__m128h result = _mm_mul_ph(row3, row4);
|
||||
return _mm_fmsub_ph(row1,row2,result);
|
||||
} else if constexpr (std::is_same_v<VectorType, __m256h>) {
|
||||
constexpr std::array<std::uint8_t, 32> shuffleMask1 = GetShuffleMaskEpi8<{{1,2,0}}>();
|
||||
constexpr std::array<std::uint8_t, VectorF16<Len, Packing>::Alignment*2> shuffleMask1 = GetShuffleMaskEpi8<{{1,2,0}}>();
|
||||
__m512i shuffleVec1 = _mm512_castsi256_si512(_mm256_loadu_epi8(shuffleMask1.data()));
|
||||
__m256h row1 = _mm256_castsi256_ph(_mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castph_si256(a.v)), shuffleVec1)));
|
||||
__m256h row4 = _mm256_castsi256_ph(_mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castph_si256(b.v)), shuffleVec1)));
|
||||
|
||||
constexpr std::array<std::uint8_t, 32> shuffleMask3 = GetShuffleMaskEpi8<{{2,0,1}}>();
|
||||
constexpr std::array<std::uint8_t, VectorF16<Len, Packing>::Alignment*2> shuffleMask3 = GetShuffleMaskEpi8<{{2,0,1}}>();
|
||||
|
||||
__m512i shuffleVec3 = _mm512_castsi256_si512(_mm256_loadu_epi8(shuffleMask3.data()));
|
||||
__m256h row3 = _mm256_castsi256_ph(_mm512_castsi512_si256(_mm512_shuffle_epi8(_mm512_castsi256_si512(_mm256_castph_si256(a.v)), shuffleVec3)));
|
||||
|
|
@ -455,13 +518,13 @@ namespace Crafter {
|
|||
__m256h result = _mm256_mul_ph(row3, row4);
|
||||
return _mm256_fmsub_ph(row1,row2,result);
|
||||
} else {
|
||||
constexpr std::array<std::uint8_t, 64> shuffleMask1 = GetShuffleMaskEpi8<{{1,2,0}}>();
|
||||
constexpr std::array<std::uint8_t, VectorF16<Len, Packing>::Alignment*2> shuffleMask1 = GetShuffleMaskEpi8<{{1,2,0}}>();
|
||||
|
||||
__m512i shuffleVec1 = _mm512_loadu_epi8(shuffleMask1.data());
|
||||
__m512h row1 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(a.v), shuffleVec1));
|
||||
__m512h row4 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(b.v), shuffleVec1));
|
||||
|
||||
constexpr std::array<std::uint8_t, 64> shuffleMask3 = GetShuffleMaskEpi8<{{2,0,1}}>();
|
||||
constexpr std::array<std::uint8_t, VectorF16<Len, Packing>::Alignment*2> shuffleMask3 = GetShuffleMaskEpi8<{{2,0,1}}>();
|
||||
|
||||
__m512i shuffleVec3 = _mm512_loadu_epi8(shuffleMask3.data());
|
||||
__m512h row3 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(a.v), shuffleVec3));
|
||||
|
|
@ -1263,32 +1326,8 @@ namespace Crafter {
|
|||
}
|
||||
|
||||
template <std::array<std::uint8_t, Len> ShuffleValues>
|
||||
static consteval std::array<std::uint8_t, 16> GetShuffleMaskEpi8() requires (std::is_same_v<VectorType, __m128h>){
|
||||
std::array<std::uint8_t, 16> shuffleMask {{0}};
|
||||
for(std::uint8_t i2 = 0; i2 < Packing; i2++) {
|
||||
for(std::uint8_t i = 0; i < Len; i++) {
|
||||
shuffleMask[(i2*Len*2)+(i*2)] = ShuffleValues[i]*2+(i2*Len*2);
|
||||
shuffleMask[(i2*Len*2)+(i*2+1)] = ShuffleValues[i]*2+1+(i2*Len*2);
|
||||
}
|
||||
}
|
||||
return shuffleMask;
|
||||
}
|
||||
|
||||
template <std::array<std::uint8_t, Len> ShuffleValues>
|
||||
static consteval std::array<std::uint8_t, 32> GetShuffleMaskEpi8() requires (std::is_same_v<VectorType, __m256h>){
|
||||
std::array<std::uint8_t, 32> shuffleMask {{0}};
|
||||
for(std::uint8_t i2 = 0; i2 < Packing; i2++) {
|
||||
for(std::uint8_t i = 0; i < Len; i++) {
|
||||
shuffleMask[(i2*Len*2)+(i*2)] = ShuffleValues[i]*2+(i2*Len*2);
|
||||
shuffleMask[(i2*Len*2)+(i*2+1)] = ShuffleValues[i]*2+1+(i2*Len*2);
|
||||
}
|
||||
}
|
||||
return shuffleMask;
|
||||
}
|
||||
|
||||
template <std::array<std::uint8_t, Len> ShuffleValues>
|
||||
static consteval std::array<std::uint8_t, 64> GetShuffleMaskEpi8() requires (std::is_same_v<VectorType, __m512h>){
|
||||
std::array<std::uint8_t, 64> shuffleMask {{0}};
|
||||
static consteval std::array<std::uint8_t, VectorF16<Len, Packing>::Alignment*2> GetShuffleMaskEpi8() {
|
||||
std::array<std::uint8_t, VectorF16<Len, Packing>::Alignment*2> shuffleMask {{0}};
|
||||
for(std::uint8_t i2 = 0; i2 < Packing; i2++) {
|
||||
for(std::uint8_t i = 0; i < Len; i++) {
|
||||
shuffleMask[(i2*Len*2)+(i*2)] = ShuffleValues[i]*2+(i2*Len*2);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue