packed intersection and matrix

This commit is contained in:
Jorijn van der Graaf 2026-05-18 19:57:40 +02:00
commit f0becd1582
7 changed files with 948 additions and 557 deletions

View file

@ -554,9 +554,41 @@ namespace Crafter {
}
}
constexpr static std::tuple<VectorF16<Len, Packing>, VectorF16<Len, Packing>, VectorF16<Len, Packing>, VectorF16<Len, Packing>> Normalize(
VectorF16<Len, Packing> A,
VectorF16<Len, Packing> C,
// Public variadic surface — one name per op, arity locked to BatchSize
// (or 2*BatchSize for Dot). Forwards to the *Pack helpers below which
// carry the SIMD bodies and per-(Len,Packing) requires clauses.
template <typename... Rest>
requires ((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...) &&
(1 + sizeof...(Rest) == VectorBase<Len, Packing, _Float16>::BatchSize))
constexpr static auto Normalize(VectorF16<Len, Packing> first, Rest... rest) {
return NormalizePack(first, rest...);
}
template <typename... Rest>
requires ((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...) &&
(1 + sizeof...(Rest) == VectorBase<Len, Packing, _Float16>::BatchSize))
constexpr static auto Length(VectorF16<Len, Packing> first, Rest... rest) {
return LengthPack(first, rest...);
}
template <typename... Rest>
requires ((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...) &&
(1 + sizeof...(Rest) == VectorBase<Len, Packing, _Float16>::BatchSize))
constexpr static auto LengthSq(VectorF16<Len, Packing> first, Rest... rest) {
return LengthSqPack(first, rest...);
}
template <typename... Rest>
requires ((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...) &&
(1 + sizeof...(Rest) == 2 * VectorBase<Len, Packing, _Float16>::BatchSize))
constexpr static auto Dot(VectorF16<Len, Packing> first, Rest... rest) {
return DotPack(first, rest...);
}
private:
constexpr static std::array<VectorF16<Len, Packing>, VectorBase<Len, Packing, _Float16>::BatchSize> NormalizePack(
VectorF16<Len, Packing> A,
VectorF16<Len, Packing> C,
VectorF16<Len, Packing> E,
VectorF16<Len, Packing> G
) requires(Len == 4 && Packing*Len == VectorBase<Len, Packing, _Float16>::AlignmentElement) {
@ -616,8 +648,8 @@ namespace Crafter {
}
}
constexpr static std::tuple<VectorF16<Len, Packing>, VectorF16<Len, Packing>> Normalize(
VectorF16<Len, Packing> A,
constexpr static std::array<VectorF16<Len, Packing>, VectorBase<Len, Packing, _Float16>::BatchSize> NormalizePack(
VectorF16<Len, Packing> A,
VectorF16<Len, Packing> E
) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, _Float16>::AlignmentElement) {
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, _Float16>::VectorType, __m128h>) {
@ -662,13 +694,13 @@ namespace Crafter {
}
}
constexpr static VectorF16<1, Packing*4> Length(
VectorF16<Len, Packing> A,
constexpr static VectorF16<1, Packing*4> LengthPack(
VectorF16<Len, Packing> A,
VectorF16<Len, Packing> C,
VectorF16<Len, Packing> E,
VectorF16<Len, Packing> G
) requires(Len == 4 && Packing*Len == VectorBase<Len, Packing, _Float16>::AlignmentElement) {
VectorF16<1, Packing*4> lenghtSq = LengthSq(A, C, E, G);
VectorF16<1, Packing*4> lenghtSq = LengthSqPack(A, C, E, G);
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, _Float16>::VectorType, __m128h>) {
return VectorF16<1, Packing*4>(_mm_sqrt_ph(lenghtSq.v));
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, _Float16>::VectorType, __m256h>) {
@ -678,11 +710,11 @@ namespace Crafter {
}
}
constexpr static VectorF16<1, Packing*2> Length(
VectorF16<Len, Packing> A,
constexpr static VectorF16<1, Packing*2> LengthPack(
VectorF16<Len, Packing> A,
VectorF16<Len, Packing> E
) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, _Float16>::AlignmentElement) {
VectorF16<1, Packing*2> lenghtSq = LengthSq(A, E);
VectorF16<1, Packing*2> lenghtSq = LengthSqPack(A, E);
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, _Float16>::VectorType, __m128h>) {
return VectorF16<1, Packing*2>(_mm_sqrt_ph(lenghtSq.v));
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, _Float16>::VectorType, __m256h>) {
@ -692,23 +724,23 @@ namespace Crafter {
}
}
constexpr static VectorF16<1, Packing*4> LengthSq(
VectorF16<Len, Packing> A,
// Squared-length helper for a batch of four vectors (Len == 4 overload).
// Passes every input twice to the dot-product helper, as the pairs
// (A, A), (C, C), (E, E), (G, G), and returns that result as a
// VectorF16<1, Packing*4>.
// Only participates when Packing*Len equals
// VectorBase<Len, Packing, _Float16>::AlignmentElement.
constexpr static VectorF16<1, Packing*4> LengthSqPack(
VectorF16<Len, Packing> A,
VectorF16<Len, Packing> C,
VectorF16<Len, Packing> E,
VectorF16<Len, Packing> G
) requires(Len == 4 && Packing*Len == VectorBase<Len, Packing, _Float16>::AlignmentElement) {
// NOTE(review): two consecutive return statements are shown here; they look
// like the pre- and post-rename lines of a diff view, so the real file
// presumably keeps only one of them — confirm against the source tree.
return Dot(A, A, C, C, E, E, G, G);
return DotPack(A, A, C, C, E, E, G, G);
}
constexpr static VectorF16<1, Packing*2> LengthSq(
VectorF16<Len, Packing> A,
// Squared-length helper for a batch of two vectors (Len == 2 overload).
// Passes every input twice to the dot-product helper, as the pairs
// (A, A) and (E, E), and returns that result as a VectorF16<1, Packing*2>.
// Only participates when Packing*Len equals
// VectorBase<Len, Packing, _Float16>::AlignmentElement.
constexpr static VectorF16<1, Packing*2> LengthSqPack(
VectorF16<Len, Packing> A,
VectorF16<Len, Packing> E
) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, _Float16>::AlignmentElement) {
// NOTE(review): two consecutive return statements are shown here; they look
// like the pre- and post-rename lines of a diff view, so the real file
// presumably keeps only one of them — confirm against the source tree.
return Dot(A, A, E, E);
return DotPack(A, A, E, E);
}
constexpr static VectorF16<1, Packing*4> Dot(
constexpr static VectorF16<1, Packing*4> DotPack(
VectorF16<Len, Packing> A0, VectorF16<Len, Packing> A1,
VectorF16<Len, Packing> C0, VectorF16<Len, Packing> C1,
VectorF16<Len, Packing> E0, VectorF16<Len, Packing> E1,
@ -744,7 +776,7 @@ namespace Crafter {
}
}
constexpr static VectorF16<1, Packing*2> Dot(
constexpr static VectorF16<1, Packing*2> DotPack(
VectorF16<Len, Packing> A0, VectorF16<Len, Packing> A1,
VectorF16<Len, Packing> E0, VectorF16<Len, Packing> E1
) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, _Float16>::AlignmentElement) {
@ -1200,9 +1232,10 @@ namespace Crafter {
}
template<typename... Rest>
requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...))
requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...) &&
(1 + sizeof...(Rest) == VectorBase<Len, Packing, _Float16>::BatchSize))
constexpr static auto LengthSq(VectorF16<Len, Packing> first, Rest... rest) {
constexpr std::uint8_t N = 1 + sizeof...(Rest);
constexpr std::uint8_t N = VectorBase<Len, Packing, _Float16>::BatchSize;
VectorF16<1, static_cast<std::uint8_t>(Packing * N)> r;
std::array<VectorF16<Len, Packing>, N> args{ first, rest... };
for (std::uint8_t i = 0; i < N; ++i)
@ -1218,7 +1251,8 @@ namespace Crafter {
}
template<typename... Rest>
requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...))
requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...) &&
(1 + sizeof...(Rest) == VectorBase<Len, Packing, _Float16>::BatchSize))
constexpr static auto Length(VectorF16<Len, Packing> first, Rest... rest) {
auto sq = LengthSq(first, rest...);
for (std::uint8_t i = 0; i < decltype(sq)::NElems; ++i)
@ -1227,7 +1261,8 @@ namespace Crafter {
}
template<typename... Rest>
requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...))
requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...) &&
(1 + sizeof...(Rest) == VectorBase<Len, Packing, _Float16>::BatchSize))
constexpr static auto Normalize(VectorF16<Len, Packing> first, Rest... rest) {
auto normOne = [](VectorF16<Len, Packing> u) {
VectorF16<Len, Packing> out;
@ -1243,7 +1278,7 @@ namespace Crafter {
}
return out;
};
return std::make_tuple(normOne(first), normOne(rest)...);
return std::array<VectorF16<Len, Packing>, VectorBase<Len, Packing, _Float16>::BatchSize>{ normOne(first), normOne(rest)... };
}
constexpr static VectorF16<Len, Packing> Rotate(VectorF16<3, Packing> v, VectorF16<4, Packing> q) requires(Len == 3) {