packed intersection and matrix

This commit is contained in:
Jorijn van der Graaf 2026-05-18 19:57:40 +02:00
commit f0becd1582
7 changed files with 948 additions and 557 deletions

View file

@ -449,8 +449,8 @@ namespace Crafter {
}
template <std::array<bool, Len> values>
constexpr VectorF32<Len, Packing> Negate() {
std::array<float, VectorBase<Len, Packing, float>::AlignmentElement> mask = VectorBase<Len, Packing, float>::template GetNegateMask<values>();
constexpr VectorF32<Len, Packing> Negate() const {
std::array<float, VectorBase<Len, Packing, float>::AlignmentElement> mask = VectorBase<Len, Packing, float>::template GetNegateMask<values>();
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
return VectorF32<Len, Packing>(_mm_castsi128_ps(_mm_xor_si128(_mm_castps_si128(this->v), _mm_loadu_si128(reinterpret_cast<__m128i*>(mask.data())))));
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
@ -549,9 +549,41 @@ namespace Crafter {
}
}
constexpr static std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>, VectorF32<Len, Packing>, VectorF32<Len, Packing>> Normalize(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B,
// Public variadic surface — one name per op, arity locked to BatchSize.
// The Pack helpers below carry the SIMD bodies and the per-(Len,Packing)
// requires clauses; this wrapper just forwards once arity matches.
// Batched Normalize entry point. Only participates in overload resolution
// when the call supplies exactly VectorBase<Len, Packing, float>::BatchSize
// vectors, every one of type VectorF32<Len, Packing>; the arguments are
// then handed to NormalizePack unchanged and its result is returned.
template <typename... Rest>
requires ((sizeof...(Rest) + 1 == VectorBase<Len, Packing, float>::BatchSize) &&
          std::conjunction_v<std::is_same<Rest, VectorF32<Len, Packing>>...>)
constexpr static auto Normalize(VectorF32<Len, Packing> first, Rest... rest) {
    return NormalizePack(first, rest...);
}
// Batched Length entry point. Only participates in overload resolution
// when the call supplies exactly VectorBase<Len, Packing, float>::BatchSize
// vectors, every one of type VectorF32<Len, Packing>; the arguments are
// then handed to LengthPack unchanged and its result is returned.
template <typename... Rest>
requires ((sizeof...(Rest) + 1 == VectorBase<Len, Packing, float>::BatchSize) &&
          std::conjunction_v<std::is_same<Rest, VectorF32<Len, Packing>>...>)
constexpr static auto Length(VectorF32<Len, Packing> first, Rest... rest) {
    return LengthPack(first, rest...);
}
// Batched LengthSq entry point. Only participates in overload resolution
// when the call supplies exactly VectorBase<Len, Packing, float>::BatchSize
// vectors, every one of type VectorF32<Len, Packing>; the arguments are
// then handed to LengthSqPack unchanged and its result is returned.
template <typename... Rest>
requires ((sizeof...(Rest) + 1 == VectorBase<Len, Packing, float>::BatchSize) &&
          std::conjunction_v<std::is_same<Rest, VectorF32<Len, Packing>>...>)
constexpr static auto LengthSq(VectorF32<Len, Packing> first, Rest... rest) {
    return LengthSqPack(first, rest...);
}
// Batched Dot entry point. Unlike the single-operand wrappers this one
// expects twice BatchSize arguments (2 * VectorBase<Len, Packing, float>::BatchSize),
// all of type VectorF32<Len, Packing>; they are handed to DotPack in the
// order given and its result is returned.
template <typename... Rest>
requires ((sizeof...(Rest) + 1 == 2 * VectorBase<Len, Packing, float>::BatchSize) &&
          std::conjunction_v<std::is_same<Rest, VectorF32<Len, Packing>>...>)
constexpr static auto Dot(VectorF32<Len, Packing> first, Rest... rest) {
    return DotPack(first, rest...);
}
private:
constexpr static std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize> NormalizePack(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B,
VectorF32<Len, Packing> C,
VectorF32<Len, Packing> D
) requires(Len == 4 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
@ -614,9 +646,9 @@ namespace Crafter {
}
}
constexpr static std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>, VectorF32<Len, Packing>, VectorF32<Len, Packing>> Normalize(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B,
constexpr static std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize> NormalizePack(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B,
VectorF32<Len, Packing> C,
VectorF32<Len, Packing> D
) requires(Len == 3 && Packing == 1) {
@ -638,9 +670,9 @@ namespace Crafter {
};
}
constexpr static std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>, VectorF32<Len, Packing>, VectorF32<Len, Packing>> Normalize(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B,
constexpr static std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize> NormalizePack(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B,
VectorF32<Len, Packing> C,
VectorF32<Len, Packing> D
) requires(Len == 3 && Packing == 2) {
@ -663,9 +695,9 @@ namespace Crafter {
}
#ifdef __AVX512F__
constexpr static std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>, VectorF32<Len, Packing>> Normalize(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B,
constexpr static std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize> NormalizePack(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B,
VectorF32<Len, Packing> C
) requires(Len == 3 && Packing == 5) {
VectorF32<1, 15> lenght = Length(A, B, C);
@ -685,8 +717,8 @@ namespace Crafter {
}
#endif
constexpr static std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>> Normalize(
VectorF32<Len, Packing> A,
constexpr static std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize> NormalizePack(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B
) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
@ -733,13 +765,13 @@ namespace Crafter {
}
}
constexpr static VectorF32<1, Packing*4> Length(
VectorF32<Len, Packing> A,
constexpr static VectorF32<1, Packing*4> LengthPack(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B,
VectorF32<Len, Packing> C,
VectorF32<Len, Packing> D
) requires(Len == 4 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
VectorF32<1, Packing*4> lenghtSq = LengthSq(A, B, C, D);
VectorF32<1, Packing*4> lenghtSq = LengthSqPack(A, B, C, D);
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
return VectorF32<1, Packing*4>(_mm_sqrt_ps(lenghtSq.v));
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
@ -749,42 +781,42 @@ namespace Crafter {
}
}
// Length overload for four 3-component, Packing == 1 vectors.
// Obtains the packed squared-length result for (A, B, C, D) and applies
// _mm_sqrt_ps to every lane of it, returning a VectorF32<1, 4>.
// NOTE(review): presumably lane i holds the length of the i-th argument —
// the Dot helper that defines the lane layout is not visible here; confirm.
constexpr static VectorF32<1, 4> Length(
constexpr static VectorF32<1, 4> LengthPack(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B,
VectorF32<Len, Packing> C,
VectorF32<Len, Packing> D
) requires(Len == 3 && Packing == 1) {
VectorF32<1, 4> lenghtSq = LengthSq(A, B, C, D);
VectorF32<1, 4> lenghtSq = LengthSqPack(A, B, C, D);
// Lane-wise square root of the squared lengths.
return VectorF32<1, 4>(_mm_sqrt_ps(lenghtSq.v));
}
// Length overload for four 3-component, Packing == 2 vectors.
// Obtains the packed squared-length result for (A, B, C, D) and applies
// _mm256_sqrt_ps to every lane of it, returning a VectorF32<1, 8>.
// NOTE(review): presumably the eight lanes are the lengths of the eight
// packed input vectors — the lane layout comes from the Dot helper, which
// is not visible here; confirm.
constexpr static VectorF32<1, 8> Length(
constexpr static VectorF32<1, 8> LengthPack(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B,
VectorF32<Len, Packing> C,
VectorF32<Len, Packing> D
) requires(Len == 3 && Packing == 2) {
VectorF32<1, 8> lenghtSq = LengthSq(A, B, C, D);
VectorF32<1, 8> lenghtSq = LengthSqPack(A, B, C, D);
// Packing*4 equals 8 here because the requires clause pins Packing to 2.
return VectorF32<1, Packing*4>(_mm256_sqrt_ps(lenghtSq.v));
}
#ifdef __AVX512F__
// Length overload for three 3-component, Packing == 5 vectors (only
// compiled when __AVX512F__ is defined). Obtains the packed squared-length
// result for (A, B, C) and applies _mm512_sqrt_ps to it, returning a
// VectorF32<1, 15>.
// NOTE(review): presumably the 15 results map to the 3 * 5 packed input
// vectors — the lane layout comes from the Dot helper, not visible here.
constexpr static VectorF32<1, 15> Length(
constexpr static VectorF32<1, 15> LengthPack(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B,
VectorF32<Len, Packing> C
) requires(Len == 3 && Packing == 5) {
VectorF32<1, 15> lenghtSq = LengthSq(A, B, C);
VectorF32<1, 15> lenghtSq = LengthSqPack(A, B, C);
// Lane-wise square root of the squared lengths.
return VectorF32<1, 15>(_mm512_sqrt_ps(lenghtSq.v));
}
#endif
constexpr static VectorF32<1, Packing*2> Length(
VectorF32<Len, Packing> A,
constexpr static VectorF32<1, Packing*2> LengthPack(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> C
) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
VectorF32<1, Packing*2> lenghtSq = LengthSq(A, C);
VectorF32<1, Packing*2> lenghtSq = LengthSqPack(A, C);
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
return VectorF32<1, Packing*2>(_mm_sqrt_ps(lenghtSq.v));
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
@ -796,51 +828,51 @@ namespace Crafter {
}
}
// Squared-length overload for four 4-component vectors whose packed
// element count (Packing*Len) fills the native alignment width.
// Implemented by passing each input as both operands of a pair to the
// Dot helper — (A, A), (B, B), (C, C), (D, D) — and returning that packed
// VectorF32<1, Packing*4> result unchanged.
constexpr static VectorF32<1, Packing*4> LengthSq(
VectorF32<Len, Packing> A,
constexpr static VectorF32<1, Packing*4> LengthSqPack(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B,
VectorF32<Len, Packing> C,
VectorF32<Len, Packing> D
) requires(Len == 4 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
return Dot(A, A, B, B, C, C, D, D);
return DotPack(A, A, B, B, C, C, D, D);
}
// Squared-length overload for four 3-component, Packing == 1 vectors.
// Implemented by passing each input as both operands of a pair to the
// Dot helper — (A, A), (B, B), (C, C), (D, D) — and returning that
// VectorF32<1, 4> result unchanged.
constexpr static VectorF32<1, 4> LengthSq(
constexpr static VectorF32<1, 4> LengthSqPack(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B,
VectorF32<Len, Packing> C,
VectorF32<Len, Packing> D
) requires(Len == 3 && Packing == 1) {
return Dot(A, A, B, B, C, C, D, D);
return DotPack(A, A, B, B, C, C, D, D);
}
// Squared-length overload for four 3-component, Packing == 2 vectors.
// Implemented by passing each input as both operands of a pair to the
// Dot helper — (A, A), (B, B), (C, C), (D, D) — and returning that
// VectorF32<1, 8> result unchanged.
constexpr static VectorF32<1, 8> LengthSq(
constexpr static VectorF32<1, 8> LengthSqPack(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B,
VectorF32<Len, Packing> C,
VectorF32<Len, Packing> D
) requires(Len == 3 && Packing == 2) {
return Dot(A, A, B, B, C, C, D, D);
return DotPack(A, A, B, B, C, C, D, D);
}
#ifdef __AVX512F__
// Squared-length overload for three 3-component, Packing == 5 vectors
// (only compiled when __AVX512F__ is defined). Implemented by passing each
// input as both operands of a pair to the Dot helper — (A, A), (B, B),
// (C, C) — and returning that VectorF32<1, 15> result unchanged.
constexpr static VectorF32<1, 15> LengthSq(
constexpr static VectorF32<1, 15> LengthSqPack(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> B,
VectorF32<Len, Packing> C
) requires(Len == 3 && Packing == 5) {
return Dot(A, A, B, B, C, C);
return DotPack(A, A, B, B, C, C);
}
#endif
// Squared-length overload for two 2-component vectors whose packed
// element count (Packing*Len) fills the native alignment width.
// Implemented by passing each input as both operands of a pair to the
// Dot helper — (A, A), (C, C) — and returning that packed
// VectorF32<1, Packing*2> result unchanged.
// Note: the second parameter is named C (not B) in this overload.
constexpr static VectorF32<1, Packing*2> LengthSq(
VectorF32<Len, Packing> A,
constexpr static VectorF32<1, Packing*2> LengthSqPack(
VectorF32<Len, Packing> A,
VectorF32<Len, Packing> C
) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
return Dot(A, A, C, C);
return DotPack(A, A, C, C);
}
constexpr static VectorF32<1, Packing*4> Dot(
constexpr static VectorF32<1, Packing*4> DotPack(
VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
VectorF32<Len, Packing> B0, VectorF32<Len, Packing> B1,
VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1,
@ -869,7 +901,7 @@ namespace Crafter {
}
}
constexpr static VectorF32<1, 4> Dot(
constexpr static VectorF32<1, 4> DotPack(
VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
VectorF32<Len, Packing> B0, VectorF32<Len, Packing> B1,
VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1,
@ -914,7 +946,7 @@ namespace Crafter {
return row1;
}
constexpr static VectorF32<1, 8> Dot(
constexpr static VectorF32<1, 8> DotPack(
VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
VectorF32<Len, Packing> B0, VectorF32<Len, Packing> B1,
VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1,
@ -1021,7 +1053,7 @@ namespace Crafter {
}
#ifdef __AVX512F__
constexpr static VectorF32<1, 15> Dot(
constexpr static VectorF32<1, 15> DotPack(
VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
VectorF32<Len, Packing> B0, VectorF32<Len, Packing> B1,
VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1
@ -1112,8 +1144,8 @@ namespace Crafter {
}
#endif
constexpr static VectorF32<1, Packing*2> Dot(
VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
constexpr static VectorF32<1, Packing*2> DotPack(
VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1
) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
@ -1548,9 +1580,10 @@ namespace Crafter {
}
template<typename... Rest>
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
constexpr static auto LengthSq(VectorF32<Len, Packing> first, Rest... rest) {
constexpr std::uint8_t N = 1 + sizeof...(Rest);
constexpr std::uint8_t N = VectorBase<Len, Packing, float>::BatchSize;
VectorF32<1, static_cast<std::uint8_t>(Packing * N)> r;
std::array<VectorF32<Len, Packing>, N> args{ first, rest... };
alignas(16) float buf[4] = {0,0,0,0};
@ -1571,41 +1604,39 @@ namespace Crafter {
}
// Batched Length for the WASM SIMD path. Accepts a pack of
// VectorF32<Len, Packing> arguments (the longer requires clause below also
// fixes the count to VectorBase<Len, Packing, float>::BatchSize), computes
// their packed squared lengths via LengthSq, and replaces that result's
// vector with its lane-wise square root.
template<typename... Rest>
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
constexpr static auto Length(VectorF32<Len, Packing> first, Rest... rest) {
auto sq = LengthSq(first, rest...);
// Square-root in place so the return type is whatever LengthSq produced.
sq.v = wasm_f32x4_sqrt(sq.v);
return sq;
}
// Four pairwise dot products packed into one v128. Only the first Len
// Pairwise dot products packed into one v128. Only the first Len
// lanes contribute, so the same routine handles 3- and 4-component
// inputs — the 4th lane of Len==3 inputs may be garbage from Cross()
// and must not be summed.
constexpr static VectorF32<1, 4> Dot(
VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
VectorF32<Len, Packing> B0, VectorF32<Len, Packing> B1,
VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1,
VectorF32<Len, Packing> D0, VectorF32<Len, Packing> D1
) requires((Len == 3 || Len == 4) && Packing == 1) {
alignas(16) float a0[4], a1[4], b0[4], b1[4], c0[4], c1[4], d0[4], d1[4];
wasm_v128_store(a0, A0.v); wasm_v128_store(a1, A1.v);
wasm_v128_store(b0, B0.v); wasm_v128_store(b1, B1.v);
wasm_v128_store(c0, C0.v); wasm_v128_store(c1, C1.v);
wasm_v128_store(d0, D0.v); wasm_v128_store(d1, D1.v);
// and must not be summed. Takes BatchSize pairs (== 4 here since
// WASM AlignmentElement is always 4 and Packing must be 1).
template<typename... Rest>
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
(1 + sizeof...(Rest) == 2 * VectorBase<Len, Packing, float>::BatchSize) &&
(Len == 3 || Len == 4) && Packing == 1)
constexpr static VectorF32<1, 4> Dot(VectorF32<Len, Packing> first, Rest... rest) {
constexpr std::uint8_t N = VectorBase<Len, Packing, float>::BatchSize;
// Operands arrive interleaved: args[2*i] and args[2*i + 1] form pair i.
std::array<VectorF32<Len, Packing>, 2 * N> args{ first, rest... };
// One accumulator per pair; zero-initialised so += below starts from 0.
alignas(16) float out[4] = {0,0,0,0};
for (std::uint8_t k = 0; k < Len; ++k) {
out[0] += a0[k] * a1[k];
out[1] += b0[k] * b1[k];
out[2] += c0[k] * c1[k];
out[3] += d0[k] * d1[k];
for (std::uint8_t i = 0; i < N; ++i) {
// Spill both operands of pair i to aligned scratch so individual lanes
// can be read as scalars.
alignas(16) float a[4], b[4];
wasm_v128_store(a, args[2 * i].v);
wasm_v128_store(b, args[2 * i + 1].v);
// Sum products over lanes [0, Len) only; higher lanes are ignored.
for (std::uint8_t k = 0; k < Len; ++k) out[i] += a[k] * b[k];
}
// Pack the per-pair sums back into a single v128.
return VectorF32<1, 4>(wasm_v128_load(out));
}
template<typename... Rest>
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
constexpr static auto Normalize(VectorF32<Len, Packing> first, Rest... rest) {
auto normOne = [](VectorF32<Len, Packing> u) {
alignas(16) float tmp[4]; wasm_v128_store(tmp, u.v);
@ -1622,7 +1653,7 @@ namespace Crafter {
}
return VectorF32<Len, Packing>(wasm_v128_load(out));
};
return std::make_tuple(normOne(first), normOne(rest)...);
return std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize>{ normOne(first), normOne(rest)... };
}
constexpr static VectorF32<Len, Packing> Rotate(VectorF32<3, Packing> v, VectorF32<4, Packing> q) requires(Len == 3) {
@ -1842,9 +1873,10 @@ namespace Crafter {
}
template<typename... Rest>
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
constexpr static auto LengthSq(VectorF32<Len, Packing> first, Rest... rest) {
constexpr std::uint8_t N = 1 + sizeof...(Rest);
constexpr std::uint8_t N = VectorBase<Len, Packing, float>::BatchSize;
VectorF32<1, static_cast<std::uint8_t>(Packing * N)> r;
std::array<VectorF32<Len, Packing>, N> args{ first, rest... };
for (std::uint8_t i = 0; i < N; ++i)
@ -1860,7 +1892,8 @@ namespace Crafter {
}
template<typename... Rest>
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
constexpr static auto Length(VectorF32<Len, Packing> first, Rest... rest) {
auto sq = LengthSq(first, rest...);
for (std::uint8_t i = 0; i < decltype(sq)::NElems; ++i) sq.v[i] = std::sqrt(sq.v[i]);
@ -1868,7 +1901,8 @@ namespace Crafter {
}
template<typename... Rest>
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
constexpr static auto Normalize(VectorF32<Len, Packing> first, Rest... rest) {
auto normOne = [](VectorF32<Len, Packing> u) {
VectorF32<Len, Packing> out;
@ -1884,7 +1918,7 @@ namespace Crafter {
}
return out;
};
return std::make_tuple(normOne(first), normOne(rest)...);
return std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize>{ normOne(first), normOne(rest)... };
}
constexpr static VectorF32<Len, Packing> Rotate(VectorF32<3, Packing> v, VectorF32<4, Packing> q) requires(Len == 3) {