more F16 math
This commit is contained in:
parent
e3cd40fa90
commit
c54ff6228c
1 changed files with 399 additions and 23 deletions
|
|
@ -263,27 +263,72 @@ namespace Crafter {
|
|||
return Dot(*this, *this);
|
||||
}
|
||||
|
||||
// template <typename AT, std::uint32_t Alen, std::uint32_t AAlignment, typename BT, std::uint32_t Blen, std::uint32_t BAlignment>
|
||||
// constexpr static Vector<T, Len, Aligment> Cross(Vector<AT, Alen, AAlignment> a, Vector<BT, Blen, BAlignment> b) requires(Len == 3 && Alen >= 3 && Blen >= 3) {
|
||||
// return Vector<T, Len, Aligment>(
|
||||
// (a.v[1] * b.v[2]) - (a.v[2] * b.v[1]),
|
||||
// (a.v[2] * b.v[0]) - (a.v[0] * b.v[2]),
|
||||
// (a.v[0] * b.v[1]) - (a.v[1] * b.v[0])
|
||||
// );
|
||||
// }
|
||||
constexpr static VectorF16<Len, Packing, Repeats> Cross(VectorF16<Len, Packing, Repeats> a, VectorF16<Len, Packing, Repeats> b) requires(Len == 3 && Packing == 2) {
|
||||
if constexpr(Len == 3) {
|
||||
if constexpr(Repeats == 1) {
|
||||
constexpr std::uint8_t shuffleMask1[] {
|
||||
2,3,4,5,0,1,6,7,11,12,13,14,9,10,15,16
|
||||
};
|
||||
__m128i shuffleVec1 = _mm_loadu_epi8(shuffleMask1);
|
||||
__m128h row1 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(a.v), shuffleVec1));
|
||||
__m128h row4 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(b.v), shuffleVec1));
|
||||
|
||||
// template <typename AT, std::uint32_t Alen, std::uint32_t AAlignment>
|
||||
// constexpr static Vector<T, Len, Aligment> Normalize(Vector<AT, Alen, AAlignment> a) requires(Len == Alen) {
|
||||
// Vector<T, 3, 0> returned;
|
||||
// T fLength = a.Length();
|
||||
|
||||
// fLength = 1.0f / fLength;
|
||||
constexpr std::uint8_t shuffleMask3[] {
|
||||
4,5,0,1,2,3,6,7,13,14,8,9,11,12,15,16
|
||||
};
|
||||
__m128i shuffleVec3 = _mm_loadu_epi8(shuffleMask3);
|
||||
__m128h row3 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(a.v), shuffleVec3));
|
||||
__m128h row2 = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(b.v), shuffleVec3));
|
||||
|
||||
// for(std::uint32_t i = 0; i < Len; i++) {
|
||||
// returned.v[i] = a.v[i] * fLength;
|
||||
// }
|
||||
// return returned;
|
||||
// }
|
||||
__m128h result = _mm_mul_ph(row3, row4);
|
||||
return _mm_fmsub_ph(row1,row2,result);
|
||||
}
|
||||
if constexpr(Repeats == 2) {
|
||||
constexpr std::uint8_t shuffleMask1[] {
|
||||
2,3,4,5,0,1,6,7,11,12,13,14,9,10,15,16,
|
||||
2,3,4,5,0,1,6,7,11,12,13,14,9,10,15,16
|
||||
};
|
||||
__m256i shuffleVec1 = _mm256_loadu_epi8(shuffleMask1);
|
||||
__m256h row1 = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(a.v), shuffleVec1));
|
||||
__m256h row4 = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(b.v), shuffleVec1));
|
||||
|
||||
constexpr std::uint8_t shuffleMask3[] {
|
||||
4,5,0,1,2,3,6,7,13,14,8,9,11,12,15,16,
|
||||
4,5,0,1,2,3,6,7,13,14,8,9,11,12,15,16
|
||||
};
|
||||
__m256i shuffleVec3 = _mm256_loadu_epi8(shuffleMask3);
|
||||
__m256h row3 = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(a.v), shuffleVec3));
|
||||
__m256h row2 = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(b.v), shuffleVec3));
|
||||
|
||||
__m256h result = _mm256_mul_ph(row3, row4);
|
||||
return _mm256_fmsub_ph(row1,row2,result);
|
||||
}
|
||||
if constexpr(Repeats == 4) {
|
||||
constexpr std::uint8_t shuffleMask1[] {
|
||||
2,3,4,5,0,1,6,7,11,12,13,14,9,10,15,16,
|
||||
2,3,4,5,0,1,6,7,11,12,13,14,9,10,15,16,
|
||||
2,3,4,5,0,1,6,7,11,12,13,14,9,10,15,16,
|
||||
2,3,4,5,0,1,6,7,11,12,13,14,9,10,15,16
|
||||
};
|
||||
__m512i shuffleVec1 = _mm512_loadu_epi8(shuffleMask1);
|
||||
__m512h row1 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(a.v), shuffleVec1));
|
||||
__m512h row4 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(b.v), shuffleVec1));
|
||||
|
||||
constexpr std::uint8_t shuffleMask3[] {
|
||||
4,5,0,1,2,3,6,7,13,14,8,9,11,12,15,16,
|
||||
4,5,0,1,2,3,6,7,13,14,8,9,11,12,15,16,
|
||||
4,5,0,1,2,3,6,7,13,14,8,9,11,12,15,16,
|
||||
4,5,0,1,2,3,6,7,13,14,8,9,11,12,15,16
|
||||
};
|
||||
__m512i shuffleVec3 = _mm512_loadu_epi8(shuffleMask3);
|
||||
__m512h row3 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(a.v), shuffleVec3));
|
||||
__m512h row2 = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(b.v), shuffleVec3));
|
||||
|
||||
__m512h result = _mm512_mul_ph(row3, row4);
|
||||
return _mm512_fmsub_ph(row1,row2,result);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
constexpr static _Float16 Dot(VectorF16<Len, 1, 1> a, VectorF16<Len, 1, 1> b) {
|
||||
if constexpr(std::is_same_v<VectorType, __m128h>) {
|
||||
|
|
@ -535,6 +580,338 @@ namespace Crafter {
|
|||
}
|
||||
}
|
||||
|
||||
constexpr static std::tuple<VectorF16<Len, Packing, Repeats>, VectorF16<Len, Packing, Repeats>, VectorF16<Len, Packing, Repeats>, VectorF16<Len, Packing, Repeats>, VectorF16<Len, Packing, Repeats>, VectorF16<Len, Packing, Repeats>, VectorF16<Len, Packing, Repeats>, VectorF16<Len, Packing, Repeats>> Normalize(
|
||||
VectorF16<Len, Packing, Repeats> A,
|
||||
VectorF16<Len, Packing, Repeats> C,
|
||||
VectorF16<Len, Packing, Repeats> E,
|
||||
VectorF16<Len, Packing, Repeats> G
|
||||
) requires(Packing == 2) {
|
||||
if constexpr(std::is_same_v<VectorType, __m128h>) {
|
||||
VectorF16<Len, Packing, Repeats> lenght = Length(A, C, E, G);
|
||||
constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1};
|
||||
__m128h one = _mm_loadu_ph(oneArr);
|
||||
__m128h fLenght = _mm_div_ph(one, lenght.v);
|
||||
|
||||
constexpr std::uint8_t shuffleMaskA[] {
|
||||
0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3
|
||||
};
|
||||
__m128i shuffleVecA = _mm_loadu_epi8(shuffleMaskA);
|
||||
__m128h fLenghtA = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA));
|
||||
|
||||
constexpr std::uint8_t shuffleMaskC[] {
|
||||
4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7
|
||||
};
|
||||
__m128i shuffleVecC = _mm_loadu_epi8(shuffleMaskC);
|
||||
__m128h fLenghtC = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecC));
|
||||
|
||||
constexpr std::uint8_t shuffleMaskE[] {
|
||||
8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11
|
||||
};
|
||||
__m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE);
|
||||
__m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE));
|
||||
|
||||
constexpr std::uint8_t shuffleMaskG[] {
|
||||
12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15,
|
||||
};
|
||||
__m128i shuffleVecG = _mm_loadu_epi8(shuffleMaskG);
|
||||
__m128h fLenghtG = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecG));
|
||||
|
||||
return {
|
||||
_mm_mul_ph(A.v, fLenghtA),
|
||||
_mm_mul_ph(C.v, fLenghtC),
|
||||
_mm_mul_ph(E.v, fLenghtE),
|
||||
_mm_mul_ph(G.v, fLenghtG),
|
||||
};
|
||||
} else if constexpr(std::is_same_v<VectorType, __m256h>) {
|
||||
VectorF16<Len, Packing, Repeats> lenght = Length(A, C, E, G);
|
||||
constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
|
||||
__m256h one = _mm256_loadu_ph(oneArr);
|
||||
__m256h fLenght = _mm256_div_ph(one, lenght.v);
|
||||
|
||||
constexpr std::uint8_t shuffleMaskA[] {
|
||||
0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3,
|
||||
0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3
|
||||
};
|
||||
__m256i shuffleVecA = _mm256_loadu_epi8(shuffleMaskA);
|
||||
__m256h fLenghtA = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecA));
|
||||
|
||||
constexpr std::uint8_t shuffleMaskC[] {
|
||||
4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7,
|
||||
4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7
|
||||
};
|
||||
__m256i shuffleVecC = _mm256_loadu_epi8(shuffleMaskC);
|
||||
__m256h fLenghtC = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecC));
|
||||
|
||||
constexpr std::uint8_t shuffleMaskE[] {
|
||||
8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11,
|
||||
8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11
|
||||
};
|
||||
__m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE);
|
||||
__m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE));
|
||||
|
||||
constexpr std::uint8_t shuffleMaskG[] {
|
||||
12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15,
|
||||
12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15,
|
||||
};
|
||||
__m256i shuffleVecG = _mm256_loadu_epi8(shuffleMaskG);
|
||||
__m256h fLenghtG = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecG));
|
||||
|
||||
return {
|
||||
_mm256_mul_ph(A.v, fLenghtA),
|
||||
_mm256_mul_ph(C.v, fLenghtC),
|
||||
_mm256_mul_ph(E.v, fLenghtE),
|
||||
_mm256_mul_ph(G.v, fLenghtG),
|
||||
};
|
||||
} else {
|
||||
VectorF16<Len, Packing, Repeats> lenght = Length(A, C, E, G);
|
||||
constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
|
||||
__m512h one = _mm512_loadu_ph(oneArr);
|
||||
__m512h fLenght = _mm512_div_ph(one, lenght.v);
|
||||
|
||||
constexpr std::uint8_t shuffleMaskA[] {
|
||||
0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3,
|
||||
0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3,
|
||||
0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3,
|
||||
0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3
|
||||
};
|
||||
__m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA);
|
||||
__m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA));
|
||||
|
||||
constexpr std::uint8_t shuffleMaskC[] {
|
||||
4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7,
|
||||
4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7,
|
||||
4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7,
|
||||
4,5,4,5,4,5,4,5,4,6,7,6,7,6,7,6,7
|
||||
};
|
||||
__m512i shuffleVecC = _mm512_loadu_epi8(shuffleMaskC);
|
||||
__m512h fLenghtC = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecC));
|
||||
|
||||
constexpr std::uint8_t shuffleMaskE[] {
|
||||
8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11,
|
||||
8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11,
|
||||
8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11,
|
||||
8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11
|
||||
};
|
||||
__m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE);
|
||||
__m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE));
|
||||
|
||||
constexpr std::uint8_t shuffleMaskG[] {
|
||||
12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15,
|
||||
12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15,
|
||||
12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15,
|
||||
12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15,
|
||||
};
|
||||
__m512i shuffleVecG = _mm512_loadu_epi8(shuffleMaskG);
|
||||
__m512h fLenghtG = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecG));
|
||||
|
||||
return {
|
||||
_mm512_mul_ph(A.v, fLenghtA),
|
||||
_mm512_mul_ph(C.v, fLenghtC),
|
||||
_mm512_mul_ph(E.v, fLenghtE),
|
||||
_mm512_mul_ph(G.v, fLenghtG),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
constexpr static std::tuple<VectorF16<Len, Packing, Repeats>, VectorF16<Len, Packing, Repeats>, VectorF16<Len, Packing, Repeats>, VectorF16<Len, Packing, Repeats>, VectorF16<Len, Packing, Repeats>, VectorF16<Len, Packing, Repeats>, VectorF16<Len, Packing, Repeats>, VectorF16<Len, Packing, Repeats>> Normalize(
|
||||
VectorF16<Len, Packing, Repeats> A,
|
||||
VectorF16<Len, Packing, Repeats> E
|
||||
) requires(Packing == 4) {
|
||||
if constexpr(std::is_same_v<VectorType, __m128h>) {
|
||||
VectorF16<Len, Packing, Repeats> lenght = Length(A, E);
|
||||
constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1};
|
||||
__m128h one = _mm_loadu_ph(oneArr);
|
||||
__m128h fLenght = _mm_div_ph(one, lenght.v);
|
||||
|
||||
constexpr std::uint8_t shuffleMaskA[] {
|
||||
0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7
|
||||
};
|
||||
__m128i shuffleVecA = _mm_loadu_epi8(shuffleMaskA);
|
||||
__m128h fLenghtA = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecA));
|
||||
|
||||
constexpr std::uint8_t shuffleMaskE[] {
|
||||
8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15
|
||||
};
|
||||
__m128i shuffleVecE = _mm_loadu_epi8(shuffleMaskE);
|
||||
__m128h fLenghtE = _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(fLenght), shuffleVecE));
|
||||
|
||||
return {
|
||||
_mm_mul_ph(A.v, fLenghtA),
|
||||
_mm_mul_ph(E.v, fLenghtE),
|
||||
};
|
||||
} else if constexpr(std::is_same_v<VectorType, __m256h>) {
|
||||
VectorF16<Len, Packing, Repeats> lenght = Length(A, E);
|
||||
constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
|
||||
__m256h one = _mm256_loadu_ph(oneArr);
|
||||
__m256h fLenght = _mm256_div_ph(one, lenght.v);
|
||||
|
||||
constexpr std::uint8_t shuffleMaskA[] {
|
||||
0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7,
|
||||
0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7
|
||||
};
|
||||
__m256i shuffleVecA = _mm256_loadu_epi8(shuffleMaskA);
|
||||
__m256h fLenghtA = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecA));
|
||||
|
||||
constexpr std::uint8_t shuffleMaskE[] {
|
||||
8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15,
|
||||
8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15
|
||||
};
|
||||
__m256i shuffleVecE = _mm256_loadu_epi8(shuffleMaskE);
|
||||
__m256h fLenghtE = _mm256_castsi256_ph(_mm256_shuffle_epi8(_mm256_castph_si256(fLenght), shuffleVecE));
|
||||
|
||||
return {
|
||||
_mm256_mul_ph(A.v, fLenghtA),
|
||||
_mm256_mul_ph(E.v, fLenghtE),
|
||||
};
|
||||
} else {
|
||||
VectorF16<Len, Packing, Repeats> lenght = Length(A, E);
|
||||
constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
|
||||
__m512h one = _mm512_loadu_ph(oneArr);
|
||||
__m512h fLenght = _mm512_div_ph(one, lenght.v);
|
||||
|
||||
constexpr std::uint8_t shuffleMaskA[] {
|
||||
0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7,
|
||||
0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7,
|
||||
0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7,
|
||||
0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7
|
||||
};
|
||||
__m512i shuffleVecA = _mm512_loadu_epi8(shuffleMaskA);
|
||||
__m512h fLenghtA = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecA));
|
||||
|
||||
constexpr std::uint8_t shuffleMaskE[] {
|
||||
8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15,
|
||||
8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15,
|
||||
8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15,
|
||||
8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15
|
||||
};
|
||||
__m512i shuffleVecE = _mm512_loadu_epi8(shuffleMaskE);
|
||||
__m512h fLenghtE = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(fLenght), shuffleVecE));
|
||||
|
||||
return {
|
||||
_mm512_mul_ph(A.v, fLenghtA),
|
||||
_mm512_mul_ph(E.v, fLenghtE),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
constexpr static std::tuple<VectorF16<Len, Packing, 8>, VectorF16<Len, Packing, 8>> NormalizeRepeated(
|
||||
VectorF16<Len, Packing, Repeats> A,
|
||||
VectorF16<Len, Packing, Repeats> B,
|
||||
VectorF16<Len, Packing, Repeats> C,
|
||||
VectorF16<Len, Packing, Repeats> D,
|
||||
VectorF16<Len, Packing, Repeats> E,
|
||||
VectorF16<Len, Packing, Repeats> F,
|
||||
VectorF16<Len, Packing, Repeats> G,
|
||||
VectorF16<Len, Packing, Repeats> H
|
||||
) requires(Len == 8 && Packing == 1 && Repeats == 1) {
|
||||
constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
|
||||
VectorF16<Len, Packing, Repeats> lenght = Length(A, B, C, D, E, F, G, H);
|
||||
__m128h one = _mm_loadu_ph(oneArr);
|
||||
__m128h fLenght = _mm_div_ph(one, lenght.v);
|
||||
|
||||
constexpr std::uint8_t shuffleMaskABCD[] {
|
||||
0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,
|
||||
2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3,
|
||||
4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5,
|
||||
6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7
|
||||
};
|
||||
__m512i shuffleVecABCD = _mm512_loadu_epi8(shuffleMaskABCD); //10 0.5
|
||||
__m512h fLenghtABCD = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(_mm512_castph128_ph512(fLenght)), shuffleVecABCD)); //1 1
|
||||
|
||||
__m512h vecABCD;
|
||||
vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(A.v), 0)); //3 1
|
||||
vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(B.v), 1));
|
||||
vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(C.v), 2));
|
||||
vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(D.v), 3));
|
||||
vecABCD = _mm512_mul_ph(vecABCD, fLenghtABCD); //4 0.5
|
||||
|
||||
constexpr std::uint8_t shuffleMaskEFGH[] {
|
||||
8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9,
|
||||
10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
|
||||
12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,
|
||||
14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
|
||||
};
|
||||
__m512h shuffleVecEFGH = _mm512_loadu_epi8(shuffleMaskEFGH); //10 0.5
|
||||
__m512h fLenghtEFGH = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(_mm512_castph128_ph512(fLenght)), _mm512_castph_si512(shuffleVecEFGH))); //1 1
|
||||
|
||||
__m512h vecEFGH;
|
||||
vecEFGH = _mm512_castps_ph(_mm512_insertf32x4(vecEFGH, _mm_castph_ps(E.v), 0)); //3 1
|
||||
vecEFGH = _mm512_castps_ph(_mm512_insertf32x4(vecEFGH, _mm_castph_ps(F.v), 1));
|
||||
vecEFGH = _mm512_castps_ph(_mm512_insertf32x4(vecEFGH, _mm_castph_ps(G.v), 2));
|
||||
vecEFGH = _mm512_castps_ph(_mm512_insertf32x4(vecEFGH, _mm_castph_ps(H.v), 3));
|
||||
vecEFGH = _mm512_mul_ph(vecABCD, fLenghtEFGH); //4 0.5
|
||||
return {
|
||||
vecABCD,
|
||||
vecEFGH
|
||||
};
|
||||
}
|
||||
|
||||
constexpr static std::tuple<VectorF16<Len, Packing, 8>, VectorF16<Len, Packing, 8>> NormalizeRepeated(
|
||||
VectorF16<Len, Packing, Repeats> A,
|
||||
VectorF16<Len, Packing, Repeats> B,
|
||||
VectorF16<Len, Packing, Repeats> C,
|
||||
VectorF16<Len, Packing, Repeats> D,
|
||||
VectorF16<Len, Packing, Repeats> E,
|
||||
VectorF16<Len, Packing, Repeats> F,
|
||||
VectorF16<Len, Packing, Repeats> G,
|
||||
VectorF16<Len, Packing, Repeats> H
|
||||
) requires(Len == 4 && Packing == 2 && Repeats == 1) {
|
||||
VectorF16<Len, Packing, Repeats> lenght = Length(A, B, C, D, E, F, G, H);
|
||||
constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
|
||||
__m128h one = _mm_loadu_ph(oneArr);
|
||||
__m128h fLenght = _mm_div_ph(one, lenght.v);
|
||||
|
||||
constexpr std::uint8_t shuffleMaskABCD[] {
|
||||
0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3,
|
||||
4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7,
|
||||
8,9,8,9,8,9,8,9,10,11,10,11,10,11,
|
||||
12,13,12,13,12,13,14,15,14,15,14,15
|
||||
};
|
||||
__m512i shuffleVecABCD = _mm512_loadu_epi8(shuffleMaskABCD);
|
||||
__m512h fLenghtABCD = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(_mm512_castph128_ph512(fLenght)), shuffleVecABCD));
|
||||
|
||||
__m512h vecABCD;
|
||||
vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(A.v), 0));
|
||||
vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(B.v), 1));
|
||||
vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(C.v), 2));
|
||||
vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(D.v), 3));
|
||||
vecABCD = _mm512_mul_ph(vecABCD, fLenghtABCD);
|
||||
return vecABCD;
|
||||
}
|
||||
|
||||
constexpr static std::tuple<VectorF16<Len, Packing, 8>, VectorF16<Len, Packing, 8>> NormalizeRepeated(
|
||||
VectorF16<Len, Packing, Repeats> A,
|
||||
VectorF16<Len, Packing, Repeats> B,
|
||||
VectorF16<Len, Packing, Repeats> C,
|
||||
VectorF16<Len, Packing, Repeats> D,
|
||||
VectorF16<Len, Packing, Repeats> E,
|
||||
VectorF16<Len, Packing, Repeats> F,
|
||||
VectorF16<Len, Packing, Repeats> G,
|
||||
VectorF16<Len, Packing, Repeats> H
|
||||
) requires(Len == 2 && Packing == 4 && Repeats == 1) {
|
||||
VectorF16<Len, Packing, Repeats> lenght = Length(A, B, C, D, E, F, G, H);
|
||||
constexpr _Float16 oneArr[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
|
||||
__m128h one = _mm_loadu_ph(oneArr);
|
||||
__m128h fLenght = _mm_div_ph(one, lenght.v);
|
||||
|
||||
constexpr std::uint8_t shuffleMaskABCD[] {
|
||||
0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7,
|
||||
8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15,
|
||||
16,17,16,17,18,19,18,19,20,21,20,21,22,23,22,23,
|
||||
24,25,24,25,26,27,26,27,28,29,28,29,30,31,30,31
|
||||
};
|
||||
__m512i shuffleVecABCD = _mm512_loadu_epi8(shuffleMaskABCD);
|
||||
__m512h fLenghtABCD = _mm512_castsi512_ph(_mm512_shuffle_epi8(_mm512_castph_si512(_mm512_castph128_ph512(fLenght)), shuffleVecABCD));
|
||||
|
||||
__m512h vecABCD;
|
||||
vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(A.v), 0));
|
||||
vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(B.v), 1));
|
||||
vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(C.v), 2));
|
||||
vecABCD = _mm512_castps_ph(_mm512_insertf32x4(vecABCD, _mm_castph_ps(D.v), 3));
|
||||
vecABCD = _mm512_mul_ph(vecABCD, fLenghtABCD);
|
||||
return vecABCD;
|
||||
}
|
||||
|
||||
constexpr static VectorF16<Len, Packing, Repeats> Length(
|
||||
VectorF16<Len, Packing, Repeats> A,
|
||||
VectorF16<Len, Packing, Repeats> B,
|
||||
|
|
@ -882,8 +1259,7 @@ namespace Crafter {
|
|||
}
|
||||
}
|
||||
|
||||
// template <typename AT, std::uint32_t AAlignment, typename BT, std::uint32_t BAlignment>
|
||||
// constexpr static Vector<T, 3, Aligment> Rotate(Vector<AT, 3, AAlignment> v, Vector<BT, 4, BAlignment> q) requires(Len == 3) {
|
||||
// constexpr static VectorF16<Len, Packing, Repeats> Rotate(VectorF16<3, Packing, Repeats> v, VectorF16<4, Packing, Repeats> q) requires(Len == 3) {
|
||||
// Vector<T, 3, 0> qv(q.x, q.y, q.z);
|
||||
// Vector<T, 3, 0> t = Vector<T, 3, Aligment>::Cross(qv, v) * T(2);
|
||||
// return v + t * q.w + Vector<T, 3, Aligment>::Cross(qv, t);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue