WE ARE SO BACK
This commit is contained in:
parent
8999c8b9ec
commit
143b71eeb9
6 changed files with 1339 additions and 608 deletions
|
|
@ -8,11 +8,15 @@ import std;
|
|||
namespace Crafter {
|
||||
export template <std::uint8_t Len, std::uint8_t Packing>
|
||||
struct VectorF16;
|
||||
export template <std::uint8_t Len, std::uint8_t Packing>
|
||||
struct VectorF32;
|
||||
|
||||
template <std::uint8_t Len, std::uint8_t Packing, typename T>
|
||||
struct VectorBase {
|
||||
template <std::uint8_t L, std::uint8_t P>
|
||||
friend struct VectorF16;
|
||||
template <std::uint8_t L, std::uint8_t P>
|
||||
friend struct VectorF32;
|
||||
protected:
|
||||
static consteval std::uint8_t GetAlingment() {
|
||||
if(Len * Packing * sizeof(T) <= 16) {
|
||||
|
|
@ -23,9 +27,14 @@ namespace Crafter {
|
|||
return 64;
|
||||
}
|
||||
}
|
||||
using VectorType = std::conditional_t<
|
||||
(Len * Packing > 16), __m512h,
|
||||
std::conditional_t<(Len * Packing > 8), __m256h, __m128h>
|
||||
|
||||
using VectorType = std::conditional_t<std::is_same_v<T, _Float16>,
|
||||
|
||||
std::conditional_t<(Len * Packing > 16), __m512h,
|
||||
std::conditional_t<(Len * Packing > 8), __m256h, __m128h>>,
|
||||
|
||||
std::conditional_t<(Len * Packing > 8), __m512,
|
||||
std::conditional_t<(Len * Packing > 4), __m256, __m128>>
|
||||
>;
|
||||
|
||||
VectorType v;
|
||||
|
|
@ -87,10 +96,21 @@ namespace Crafter {
|
|||
template <std::array<std::uint8_t, Len> ShuffleValues>
|
||||
static consteval std::array<std::uint8_t, Alignment> GetShuffleMaskEpi8() {
|
||||
std::array<std::uint8_t, Alignment> shuffleMask {{0}};
|
||||
for(std::uint8_t i2 = 0; i2 < Packing; i2++) {
|
||||
for(std::uint8_t i = 0; i < Len; i++) {
|
||||
shuffleMask[(i2*Len*sizeof(T))+(i*sizeof(T))] = ShuffleValues[i]*sizeof(T)+(i2*Len*sizeof(T));
|
||||
shuffleMask[(i2*Len*sizeof(T))+(i*sizeof(T)+1)] = ShuffleValues[i]*sizeof(T)+1+(i2*Len*sizeof(T));
|
||||
if constexpr(std::same_as<T, _Float16>) {
|
||||
for(std::uint8_t i2 = 0; i2 < Packing; i2++) {
|
||||
for(std::uint8_t i = 0; i < Len; i++) {
|
||||
shuffleMask[(i2*Len*sizeof(T))+(i*sizeof(T))] = ShuffleValues[i]*sizeof(T)+(i2*Len*sizeof(T));
|
||||
shuffleMask[(i2*Len*sizeof(T))+(i*sizeof(T)+1)] = ShuffleValues[i]*sizeof(T)+1+(i2*Len*sizeof(T));
|
||||
}
|
||||
}
|
||||
} else if constexpr(std::same_as<T, float>) {
|
||||
for(std::uint8_t i2 = 0; i2 < Packing; i2++) {
|
||||
for(std::uint8_t i = 0; i < Len; i++) {
|
||||
shuffleMask[(i2*Len*sizeof(T))+(i*sizeof(T))] = ShuffleValues[i]*sizeof(T)+(i2*Len*sizeof(T));
|
||||
shuffleMask[(i2*Len*sizeof(T))+(i*sizeof(T)+1)] = ShuffleValues[i]*sizeof(T)+1+(i2*Len*sizeof(T));
|
||||
shuffleMask[(i2*Len*sizeof(T))+(i*sizeof(T)+2)] = ShuffleValues[i]*sizeof(T)+2+(i2*Len*sizeof(T));
|
||||
shuffleMask[(i2*Len*sizeof(T))+(i*sizeof(T)+3)] = ShuffleValues[i]*sizeof(T)+3+(i2*Len*sizeof(T));
|
||||
}
|
||||
}
|
||||
}
|
||||
return shuffleMask;
|
||||
|
|
@ -107,6 +127,10 @@ namespace Crafter {
|
|||
high_bit = std::bit_cast<T>(
|
||||
static_cast<std::uint16_t>(1u << (std::numeric_limits<std::uint16_t>::digits - 1))
|
||||
);
|
||||
} else if constexpr(sizeof(T) == 4) {
|
||||
high_bit = std::bit_cast<T>(
|
||||
static_cast<std::uint32_t>(1u << (std::numeric_limits<std::uint32_t>::digits - 1))
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -135,8 +159,19 @@ namespace Crafter {
|
|||
template <std::uint32_t ExtractLen>
|
||||
static consteval std::array<std::uint16_t, AlignmentElement> GetExtractLoMaskEpi16() {
|
||||
std::array<std::uint16_t, AlignmentElement> mask{};
|
||||
for (std::uint16_t i2 = 0; i2 < Packing; i2++) {
|
||||
for (std::uint16_t i = 0; i < ExtractLen; i++) {
|
||||
for (std::uint8_t i2 = 0; i2 < Packing; i2++) {
|
||||
for (std::uint8_t i = 0; i < ExtractLen; i++) {
|
||||
mask[i2 * ExtractLen + i] = i + (i2 * Len);
|
||||
}
|
||||
}
|
||||
return mask;
|
||||
}
|
||||
|
||||
template <std::uint32_t ExtractLen>
|
||||
static consteval std::array<std::uint32_t, AlignmentElement> GetExtractLoMaskEpi32() {
|
||||
std::array<std::uint32_t, AlignmentElement> mask{};
|
||||
for (std::uint8_t i2 = 0; i2 < Packing; i2++) {
|
||||
for (std::uint8_t i = 0; i < ExtractLen; i++) {
|
||||
mask[i2 * ExtractLen + i] = i + (i2 * Len);
|
||||
}
|
||||
}
|
||||
|
|
@ -146,8 +181,8 @@ namespace Crafter {
|
|||
template <std::array<std::uint8_t, Len> ShuffleValues>
|
||||
static consteval std::uint8_t GetShuffleMaskEpi32() {
|
||||
std::uint8_t mask = 0;
|
||||
for(std::uint8_t i = 0; i < std::min(Len, std::uint8_t(8)); i+=2) {
|
||||
mask = mask | (ShuffleValues[i] & 0b11) << i;
|
||||
for(std::uint8_t i = 0; i < std::min(Len, std::uint8_t(8)); i+=4/sizeof(T)) {
|
||||
mask = mask | (ShuffleValues[i] & 0b11) << (8 / sizeof(T) * i);
|
||||
}
|
||||
return mask;
|
||||
}
|
||||
|
|
@ -163,6 +198,17 @@ namespace Crafter {
|
|||
return shuffleMask;
|
||||
}
|
||||
|
||||
template <std::array<std::uint8_t, Len> ShuffleValues>
|
||||
static consteval std::array<std::uint32_t, AlignmentElement> GetPermuteMaskEpi32() {
|
||||
std::array<std::uint32_t, AlignmentElement> shuffleMask {{0}};
|
||||
for(std::uint8_t i2 = 0; i2 < Packing; i2++) {
|
||||
for(std::uint8_t i = 0; i < Len; i++) {
|
||||
shuffleMask[i2*Len+i] = ShuffleValues[i]+i2*Len;
|
||||
}
|
||||
}
|
||||
return shuffleMask;
|
||||
}
|
||||
|
||||
template <std::array<bool, Len> ShuffleValues>
|
||||
static consteval std::uint8_t GetBlendMaskEpi16() requires (std::is_same_v<VectorType, __m128h>){
|
||||
std::uint8_t mask = 0;
|
||||
|
|
@ -202,6 +248,45 @@ namespace Crafter {
|
|||
return mask;
|
||||
}
|
||||
|
||||
template <std::array<bool, Len> ShuffleValues>
|
||||
static consteval std::uint8_t GetBlendMaskEpi32() requires (std::is_same_v<VectorType, __m128>){
|
||||
std::uint8_t mask = 0;
|
||||
for (std::uint8_t i2 = 0; i2 < Packing; i2++) {
|
||||
for (std::uint8_t i = 0; i < Len; i++) {
|
||||
if (ShuffleValues[i]) {
|
||||
mask |= (1u << (i2 * Len + i));
|
||||
}
|
||||
}
|
||||
}
|
||||
return mask;
|
||||
}
|
||||
|
||||
template <std::array<bool, Len> ShuffleValues>
|
||||
static consteval std::uint16_t GetBlendMaskEpi32() requires (std::is_same_v<VectorType, __m256>){
|
||||
std::uint16_t mask = 0;
|
||||
for (std::uint8_t i2 = 0; i2 < Packing; i2++) {
|
||||
for (std::uint8_t i = 0; i < Len; i++) {
|
||||
if (ShuffleValues[i]) {
|
||||
mask |= (1u << (i2 * Len + i));
|
||||
}
|
||||
}
|
||||
}
|
||||
return mask;
|
||||
}
|
||||
|
||||
template <std::array<bool, Len> ShuffleValues>
|
||||
static consteval std::uint32_t GetBlendMaskEpi32() requires (std::is_same_v<VectorType, __m512>){
|
||||
std::uint32_t mask = 0;
|
||||
for (std::uint8_t i2 = 0; i2 < Packing; i2++) {
|
||||
for (std::uint8_t i = 0; i < Len; i++) {
|
||||
if (ShuffleValues[i]) {
|
||||
mask |= (1u << (i2 * Len + i));
|
||||
}
|
||||
}
|
||||
}
|
||||
return mask;
|
||||
}
|
||||
|
||||
static constexpr float two_over_pi = 0.6366197723675814f;
|
||||
static constexpr float pi_over_2_hi = 1.5707963267341256f;
|
||||
static constexpr float pi_over_2_lo = 6.077100506506192e-11f;
|
||||
|
|
@ -221,6 +306,102 @@ namespace Crafter {
|
|||
static constexpr float s7 = 0.0000027526372f;
|
||||
static constexpr float s9 = -0.0000000239013f;
|
||||
|
||||
// --- 128-bit (SSE) helpers ---
|
||||
static constexpr void range_reduce_f32x4(__m128 ax, __m128& r, __m128& r2, __m128i& q) {
|
||||
__m128 fq = _mm_round_ps(_mm_mul_ps(ax, _mm_set1_ps(two_over_pi)), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
|
||||
q = _mm_cvtps_epi32(fq);
|
||||
r = _mm_sub_ps(ax, _mm_mul_ps(fq, _mm_set1_ps(pi_over_2_hi)));
|
||||
r = _mm_sub_ps(r, _mm_mul_ps(fq, _mm_set1_ps(pi_over_2_lo)));
|
||||
r2 = _mm_mul_ps(r, r);
|
||||
}
|
||||
|
||||
static constexpr void sincos_poly_f32x4(__m128 r, __m128 r2, __m128& cos_r, __m128& sin_r) {
|
||||
cos_r = _mm_fmadd_ps(_mm_set1_ps(c10), r2, _mm_set1_ps(c8));
|
||||
cos_r = _mm_fmadd_ps(cos_r, r2, _mm_set1_ps(c6));
|
||||
cos_r = _mm_fmadd_ps(cos_r, r2, _mm_set1_ps(c4));
|
||||
cos_r = _mm_fmadd_ps(cos_r, r2, _mm_set1_ps(c2));
|
||||
cos_r = _mm_fmadd_ps(cos_r, r2, _mm_set1_ps(c0));
|
||||
|
||||
sin_r = _mm_fmadd_ps(_mm_set1_ps(s9), r2, _mm_set1_ps(s7));
|
||||
sin_r = _mm_fmadd_ps(sin_r, r2, _mm_set1_ps(s5));
|
||||
sin_r = _mm_fmadd_ps(sin_r, r2, _mm_set1_ps(s3));
|
||||
sin_r = _mm_fmadd_ps(sin_r, r2, _mm_set1_ps(s1));
|
||||
sin_r = _mm_fmadd_ps(sin_r, r2, _mm_set1_ps(1.0f));
|
||||
sin_r = _mm_mul_ps(sin_r, r);
|
||||
}
|
||||
|
||||
// cos(x): use cos_poly when q even, sin_poly when q odd; negate if (q+1)&2
|
||||
static constexpr __m128 cos_f32x4(__m128 x) {
|
||||
const __m128 sign_mask = _mm_set1_ps(-0.0f);
|
||||
__m128 ax = _mm_andnot_ps(sign_mask, x);
|
||||
|
||||
__m128 r, r2; __m128i q;
|
||||
range_reduce_f32x4(ax, r, r2, q);
|
||||
|
||||
__m128 cos_r, sin_r;
|
||||
sincos_poly_f32x4(r, r2, cos_r, sin_r);
|
||||
|
||||
__m128i odd = _mm_and_si128(q, _mm_set1_epi32(1));
|
||||
__m128 use_sin = _mm_castsi128_ps(_mm_cmpeq_epi32(odd, _mm_set1_epi32(1)));
|
||||
__m128 result = _mm_blendv_ps(cos_r, sin_r, use_sin);
|
||||
|
||||
__m128i need_neg = _mm_and_si128(
|
||||
_mm_add_epi32(q, _mm_set1_epi32(1)), _mm_set1_epi32(2));
|
||||
__m128 neg_mask = _mm_castsi128_ps(_mm_slli_epi32(need_neg, 30));
|
||||
return _mm_xor_ps(result, neg_mask);
|
||||
}
|
||||
|
||||
// sin(x): use sin_poly when q even, cos_poly when q odd; negate if q&2; respect input sign
|
||||
static constexpr __m128 sin_f32x4(__m128 x) {
|
||||
const __m128 sign_mask = _mm_set1_ps(-0.0f);
|
||||
__m128 x_sign = _mm_and_ps(x, sign_mask);
|
||||
__m128 ax = _mm_andnot_ps(sign_mask, x);
|
||||
|
||||
__m128 r, r2; __m128i q;
|
||||
range_reduce_f32x4(ax, r, r2, q);
|
||||
|
||||
__m128 cos_r, sin_r;
|
||||
sincos_poly_f32x4(r, r2, cos_r, sin_r);
|
||||
|
||||
__m128i odd = _mm_and_si128(q, _mm_set1_epi32(1));
|
||||
__m128 use_cos = _mm_castsi128_ps(_mm_cmpeq_epi32(odd, _mm_set1_epi32(1)));
|
||||
__m128 result = _mm_blendv_ps(sin_r, cos_r, use_cos);
|
||||
|
||||
__m128i need_neg = _mm_and_si128(q, _mm_set1_epi32(2));
|
||||
__m128 neg_mask = _mm_castsi128_ps(_mm_slli_epi32(need_neg, 30));
|
||||
result = _mm_xor_ps(result, neg_mask);
|
||||
|
||||
// Apply original sign of x
|
||||
return _mm_xor_ps(result, x_sign);
|
||||
}
|
||||
|
||||
// --- 128-bit sincos ---
|
||||
static constexpr void sincos_f32x4(__m128 x, __m128& out_sin, __m128& out_cos) {
|
||||
const __m128 sign_mask = _mm_set1_ps(-0.0f);
|
||||
__m128 x_sign = _mm_and_ps(x, sign_mask);
|
||||
__m128 ax = _mm_andnot_ps(sign_mask, x);
|
||||
|
||||
__m128 r, r2; __m128i q;
|
||||
range_reduce_f32x4(ax, r, r2, q);
|
||||
|
||||
__m128 cos_r, sin_r;
|
||||
sincos_poly_f32x4(r, r2, cos_r, sin_r);
|
||||
|
||||
__m128i odd = _mm_and_si128(q, _mm_set1_epi32(1));
|
||||
__m128 is_odd = _mm_castsi128_ps(_mm_cmpeq_epi32(odd, _mm_set1_epi32(1)));
|
||||
|
||||
// cos: swap on odd, negate if (q+1)&2
|
||||
out_cos = _mm_blendv_ps(cos_r, sin_r, is_odd);
|
||||
__m128i cos_neg = _mm_and_si128(_mm_add_epi32(q, _mm_set1_epi32(1)), _mm_set1_epi32(2));
|
||||
out_cos = _mm_xor_ps(out_cos, _mm_castsi128_ps(_mm_slli_epi32(cos_neg, 30)));
|
||||
|
||||
// sin: swap on odd, negate if q&2, apply input sign
|
||||
out_sin = _mm_blendv_ps(sin_r, cos_r, is_odd);
|
||||
__m128i sin_neg = _mm_and_si128(q, _mm_set1_epi32(2));
|
||||
out_sin = _mm_xor_ps(out_sin, _mm_castsi128_ps(_mm_slli_epi32(sin_neg, 30)));
|
||||
out_sin = _mm_xor_ps(out_sin, x_sign);
|
||||
}
|
||||
|
||||
// Reduce |x| into [-pi/4, pi/4], return reduced value and quadrant
|
||||
static constexpr void range_reduce_f32x8(__m256 ax, __m256& r, __m256& r2, __m256i& q) {
|
||||
__m256 fq = _mm256_round_ps(_mm256_mul_ps(ax, _mm256_set1_ps(two_over_pi)), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue