packed intersection and matrix
This commit is contained in:
parent
027947cae6
commit
f0becd1582
7 changed files with 948 additions and 557 deletions
|
|
@ -87,6 +87,17 @@ namespace Crafter {
|
|||
|
||||
// GetAlingment() divided by sizeof(T); presumably the number of T lanes in one
// register — GetAlingment is defined outside this view, TODO confirm its unit.
static constexpr std::uint8_t AlignmentElement = GetAlingment()/sizeof(T);
|
||||
// Raw GetAlingment() value, stored without the per-element division.
static constexpr std::uint8_t Alignment = GetAlingment();
|
||||
// Number of input vectors per batched Normalize/Dot/Length call that
|
||||
// fills the output register on the current (Len, Packing, ISA).
|
||||
// Each input contributes `Packing` scalar results; an output register
|
||||
// holds `AlignmentElement` lanes, so optimal arity = lanes / packing.
|
||||
// NOTE(review): the division below is integral, so when Packing does not
// divide AlignmentElement the quotient rounds down and the remaining lanes of
// the output register stay unused — the fill is then approximate, not exact.
static constexpr std::uint8_t BatchSize = AlignmentElement / Packing;
|
||||
// Largest Packing that still fits a single SIMD register for this
|
||||
// (Len, T) on the current ISA. Independent of the current Packing
|
||||
// dimension — meant for higher-level batching code that wants to
|
||||
// process Packing sub-primitives at once (e.g. intersection tests).
|
||||
// Falls back to 1 in the pathological case Len > MaxElement.
|
||||
// (MaxElement / Len truncates to 0 in that case, hence the explicit fallback.)
static constexpr std::uint8_t OptimalPacking = (MaxElement / Len) > 0 ? (MaxElement / Len) : 1;
|
||||
// Compile-time guard: the Len * Packing elements of this instantiation must
// not exceed MaxElement.
static_assert(Len * Packing <= MaxElement, "Len * Packing exceeds MaxElement");
|
||||
|
||||
protected:
|
||||
|
|
@ -97,6 +108,22 @@ namespace Crafter {
|
|||
return arr;
|
||||
}
|
||||
|
||||
// True iff every per-Packing-slot shuffle (output, source) pair stays
|
||||
// within the same PerLane chunk. shuffle_epi32 / shuffle_epi8 are
|
||||
// applied per 128-bit lane, so any cross-lane move has to fall through
|
||||
// to a cross-lane permute path instead.
|
||||
template <std::array<std::uint8_t, Len> ShuffleValues>
|
||||
static consteval bool LaneSafeShuffle() {
|
||||
for (std::uint8_t p = 0; p < Packing; ++p) {
|
||||
for (std::uint8_t i = 0; i < Len; ++i) {
|
||||
std::uint8_t outIdx = static_cast<std::uint8_t>(p * Len + i);
|
||||
std::uint8_t srcIdx = static_cast<std::uint8_t>(p * Len + ShuffleValues[i]);
|
||||
if (outIdx / PerLane != srcIdx / PerLane) return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <std::array<std::uint8_t, Len> ShuffleValues>
|
||||
static consteval bool CheckEpi32Shuffle() {
|
||||
if constexpr (PerLane == 8) {
|
||||
|
|
@ -113,7 +140,7 @@ namespace Crafter {
|
|||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
return LaneSafeShuffle<ShuffleValues>();
|
||||
}
|
||||
|
||||
template <std::array<std::uint8_t, Len> ShuffleValues>
|
||||
|
|
@ -124,7 +151,7 @@ namespace Crafter {
|
|||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
return LaneSafeShuffle<ShuffleValues>();
|
||||
}
|
||||
|
||||
template <std::array<std::uint8_t, Len> ShuffleValues>
|
||||
|
|
|
|||
|
|
@ -23,64 +23,141 @@ import :MatrixRowMajor;
|
|||
import std;
|
||||
|
||||
namespace Crafter {
|
||||
// All intersection tests are batched over four primitives at a time so they
|
||||
// feed the VectorF32<3,1>::Dot / Cross / Length / Normalize four-pair
|
||||
// overloads directly. The single-primitive case is just "pass the same
|
||||
// primitive four times and read lane 0" - there is no single-vector
|
||||
// fast-path because the SIMD pipelines want full lanes.
|
||||
namespace detail {
|
||||
// Splat a single Len-vector into all Packing slots of the wider type
|
||||
// via a temporary float buffer. Performed once per intersection call;
|
||||
// the inner SIMD loop dominates so the round-trip is in the noise.
|
||||
template <std::uint8_t Packing, std::uint8_t Len>
|
||||
inline VectorF32<Len, Packing> SplatToPacking(VectorF32<Len, 1> v) {
|
||||
alignas(64) float buf[VectorF32<Len, Packing>::AlignmentElement] = {};
|
||||
std::array<float, VectorF32<Len, 1>::AlignmentElement> flat = v.template Store<float>();
|
||||
for (std::uint8_t p = 0; p < Packing; ++p) {
|
||||
for (std::uint8_t k = 0; k < Len; ++k) buf[p * Len + k] = flat[k];
|
||||
}
|
||||
return VectorF32<Len, Packing>(buf);
|
||||
}
|
||||
|
||||
// Möller-Trumbore against four triangles sharing one ray. Returns ray
|
||||
// parameter t per triangle, or float max where the ray misses.
|
||||
export inline VectorF32<1, 4> IntersectionTestRayTriangle(
|
||||
// Interleave two arrays of size N=BatchSize into the 2*N positional
|
||||
// argument list expected by the variadic Dot. Returns the packed
|
||||
// VectorF32<1, Packing*BatchSize> with one dot product per slot.
|
||||
template <std::uint8_t Len, std::uint8_t Packing, std::size_t N>
|
||||
inline auto DotArrays(
|
||||
std::array<VectorF32<Len, Packing>, N> const& a,
|
||||
std::array<VectorF32<Len, Packing>, N> const& b
|
||||
) {
|
||||
return [&]<std::size_t... Is>(std::index_sequence<Is...>) {
|
||||
std::array<VectorF32<Len, Packing>, 2 * N> flat;
|
||||
((flat[2 * Is] = a[Is], flat[2 * Is + 1] = b[Is]), ...);
|
||||
return std::apply([](auto... args) {
|
||||
return VectorF32<Len, Packing>::Dot(args...);
|
||||
}, flat);
|
||||
}(std::make_index_sequence<N>{});
|
||||
}
|
||||
|
||||
// Gather the `Component`-th lane of every sub-vector across an array
|
||||
// of N packed VectorF32<3, Packing> into a flat VectorF32<1, Packing*N>
|
||||
// with one scalar per pair. Used to materialize halfSize.x / .y / .z
|
||||
// alongside per-pair scalar projections in a single SIMD register.
|
||||
template <std::uint8_t Component, std::uint8_t Packing>
|
||||
inline auto ExtractComponent(
|
||||
std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& arr
|
||||
) {
|
||||
constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
|
||||
constexpr std::uint8_t Total = Packing * N;
|
||||
using OutVec = VectorF32<1, Total>;
|
||||
alignas(64) float buf[OutVec::AlignmentElement] = {};
|
||||
for (std::uint8_t b = 0; b < N; ++b) {
|
||||
auto v = arr[b].template Store<float>();
|
||||
for (std::uint8_t p = 0; p < Packing; ++p) {
|
||||
buf[b * Packing + p] = v[p * 3 + Component];
|
||||
}
|
||||
}
|
||||
return OutVec(buf);
|
||||
}
|
||||
|
||||
// Lane-wise absolute value. Done via a flat float buffer because the
|
||||
// F32 module does not expose a SIMD Abs primitive. Only called O(15)
|
||||
// times per OBB-OBB call, so the round-trip is negligible compared to
|
||||
// the dot-product work.
|
||||
template <std::uint8_t Total>
|
||||
inline VectorF32<1, Total> AbsVec(VectorF32<1, Total> v) {
|
||||
alignas(64) float buf[VectorF32<1, Total>::AlignmentElement];
|
||||
v.Store(buf);
|
||||
for (std::uint8_t i = 0; i < Total; ++i) buf[i] = std::abs(buf[i]);
|
||||
return VectorF32<1, Total>(buf);
|
||||
}
|
||||
}
|
||||
|
||||
// Packed batch of Packing * BatchSize OBBs, each described by world-space
|
||||
// origin, three orthonormal rotation axes (rows of the rotation matrix),
|
||||
// and per-axis half-extents. Each std::array element packs `Packing`
|
||||
// sub-OBBs; there are BatchSize such elements, so the struct holds
|
||||
// Packing * BatchSize OBBs total.
|
||||
//
|
||||
// Callers that have OBBs as MatrixRowMajor + halfSize need to extract the
|
||||
// three axes and the origin themselves — keeping the routines in terms of
|
||||
// packed VectorF32<3, Packing> lets every SIMD op stay in registers.
|
||||
export template <std::uint8_t Packing = VectorF32<3, 1>::OptimalPacking>
|
||||
struct PackedOBBs {
|
||||
static constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
|
||||
static constexpr std::uint8_t Total = Packing * N;
|
||||
|
||||
std::array<VectorF32<3, Packing>, N> halfSize;
|
||||
std::array<VectorF32<3, Packing>, N> xAxis;
|
||||
std::array<VectorF32<3, Packing>, N> yAxis;
|
||||
std::array<VectorF32<3, Packing>, N> zAxis;
|
||||
std::array<VectorF32<3, Packing>, N> origin;
|
||||
};
|
||||
|
||||
// All intersection tests are batched over Packing*BatchSize primitives at
|
||||
// a time, where `Packing = VectorF32<3,1>::OptimalPacking` for the current
|
||||
// ISA (5 on AVX-512, 2 on AVX2, 1 on SSE/WASM/scalar) and BatchSize is the
|
||||
// arity that fills one output register. Callers form the packed input by
|
||||
// laying out `Packing` sub-primitives consecutively per vertex slot, then
|
||||
// assemble `BatchSize` such packed slots into the std::array argument.
|
||||
// Result lane `i` corresponds to triangle/sphere/box index `i`.
|
||||
|
||||
// Möller-Trumbore against Packing*BatchSize triangles sharing one ray.
|
||||
// Returns ray parameter t per triangle, or float max where the ray misses.
|
||||
export template <std::uint8_t Packing = VectorF32<3, 1>::OptimalPacking>
|
||||
inline VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)>
|
||||
IntersectionTestRayTriangle(
|
||||
VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir,
|
||||
VectorF32<3, 1> aV0, VectorF32<3, 1> aV1, VectorF32<3, 1> aV2,
|
||||
VectorF32<3, 1> bV0, VectorF32<3, 1> bV1, VectorF32<3, 1> bV2,
|
||||
VectorF32<3, 1> cV0, VectorF32<3, 1> cV1, VectorF32<3, 1> cV2,
|
||||
VectorF32<3, 1> dV0, VectorF32<3, 1> dV1, VectorF32<3, 1> dV2
|
||||
std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& v0,
|
||||
std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& v1,
|
||||
std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& v2
|
||||
) {
|
||||
VectorF32<3, 1> aE1 = aV1 - aV0;
|
||||
VectorF32<3, 1> aE2 = aV2 - aV0;
|
||||
VectorF32<3, 1> bE1 = bV1 - bV0;
|
||||
VectorF32<3, 1> bE2 = bV2 - bV0;
|
||||
VectorF32<3, 1> cE1 = cV1 - cV0;
|
||||
VectorF32<3, 1> cE2 = cV2 - cV0;
|
||||
VectorF32<3, 1> dE1 = dV1 - dV0;
|
||||
VectorF32<3, 1> dE2 = dV2 - dV0;
|
||||
constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
|
||||
constexpr std::uint8_t Total = Packing * N;
|
||||
using PVec = VectorF32<3, Packing>;
|
||||
|
||||
VectorF32<3, 1> aH = VectorF32<3, 1>::Cross(rayDir, aE2);
|
||||
VectorF32<3, 1> bH = VectorF32<3, 1>::Cross(rayDir, bE2);
|
||||
VectorF32<3, 1> cH = VectorF32<3, 1>::Cross(rayDir, cE2);
|
||||
VectorF32<3, 1> dH = VectorF32<3, 1>::Cross(rayDir, dE2);
|
||||
PVec rayOriginP = detail::SplatToPacking<Packing>(rayOrigin);
|
||||
PVec rayDirP = detail::SplatToPacking<Packing>(rayDir);
|
||||
|
||||
VectorF32<3, 1> aS = rayOrigin - aV0;
|
||||
VectorF32<3, 1> bS = rayOrigin - bV0;
|
||||
VectorF32<3, 1> cS = rayOrigin - cV0;
|
||||
VectorF32<3, 1> dS = rayOrigin - dV0;
|
||||
std::array<PVec, N> E1, E2, H, S, Q, rayDirArr;
|
||||
for (std::uint8_t i = 0; i < N; ++i) {
|
||||
E1[i] = v1[i] - v0[i];
|
||||
E2[i] = v2[i] - v0[i];
|
||||
H[i] = PVec::Cross(rayDirP, E2[i]);
|
||||
S[i] = rayOriginP - v0[i];
|
||||
Q[i] = PVec::Cross(S[i], E1[i]);
|
||||
rayDirArr[i] = rayDirP;
|
||||
}
|
||||
|
||||
VectorF32<3, 1> aQ = VectorF32<3, 1>::Cross(aS, aE1);
|
||||
VectorF32<3, 1> bQ = VectorF32<3, 1>::Cross(bS, bE1);
|
||||
VectorF32<3, 1> cQ = VectorF32<3, 1>::Cross(cS, cE1);
|
||||
VectorF32<3, 1> dQ = VectorF32<3, 1>::Cross(dS, dE1);
|
||||
auto det = detail::DotArrays(E1, H);
|
||||
auto uNum = detail::DotArrays(S, H);
|
||||
auto vNum = detail::DotArrays(rayDirArr, Q);
|
||||
auto tNum = detail::DotArrays(E2, Q);
|
||||
|
||||
// Four 3-component dots packed into one __m128 per call.
|
||||
VectorF32<1, 4> det = VectorF32<3, 1>::Dot(
|
||||
aE1, aH, bE1, bH, cE1, cH, dE1, dH);
|
||||
VectorF32<1, 4> uNum = VectorF32<3, 1>::Dot(
|
||||
aS, aH, bS, bH, cS, cH, dS, dH);
|
||||
VectorF32<1, 4> vNum = VectorF32<3, 1>::Dot(
|
||||
rayDir, aQ, rayDir, bQ, rayDir, cQ, rayDir, dQ);
|
||||
VectorF32<1, 4> tNum = VectorF32<3, 1>::Dot(
|
||||
aE2, aQ, bE2, bQ, cE2, cQ, dE2, dQ);
|
||||
|
||||
std::array<float, 4> detArr = det.template Store<float>();
|
||||
std::array<float, 4> uArr = uNum.template Store<float>();
|
||||
std::array<float, 4> vArr = vNum.template Store<float>();
|
||||
std::array<float, 4> tArr = tNum.template Store<float>();
|
||||
auto detArr = det.template Store<float>();
|
||||
auto uArr = uNum.template Store<float>();
|
||||
auto vArr = vNum.template Store<float>();
|
||||
auto tArr = tNum.template Store<float>();
|
||||
|
||||
constexpr float eps = std::numeric_limits<float>::epsilon();
|
||||
constexpr float maxF = std::numeric_limits<float>::max();
|
||||
alignas(16) std::array<float, 4> out{};
|
||||
for (std::uint8_t i = 0; i < 4; ++i) {
|
||||
alignas(64) std::array<float, VectorF32<1, Total>::AlignmentElement> out{};
|
||||
for (std::uint8_t i = 0; i < Total; ++i) {
|
||||
float d = detArr[i];
|
||||
if (d <= eps) { out[i] = maxF; continue; }
|
||||
float invD = 1.0f / d;
|
||||
|
|
@ -90,115 +167,120 @@ namespace Crafter {
|
|||
if (v < 0.0f || u + v > 1.0f) { out[i] = maxF; continue; }
|
||||
out[i] = tArr[i] * invD;
|
||||
}
|
||||
return VectorF32<1, 4>(out.data());
|
||||
return VectorF32<1, Total>(out.data());
|
||||
}
|
||||
|
||||
// One ray against four spheres. radii must hold {rA, rB, rC, rD} in lanes
|
||||
// 0..3.
|
||||
export inline VectorF32<1, 4> IntersectionTestRaySphere(
|
||||
// One ray against Packing*BatchSize spheres. `radii` holds one radius per
|
||||
// sphere in lane order matching the result.
|
||||
export template <std::uint8_t Packing = VectorF32<3, 1>::OptimalPacking>
|
||||
inline VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)>
|
||||
IntersectionTestRaySphere(
|
||||
VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir,
|
||||
VectorF32<3, 1> posA, VectorF32<3, 1> posB,
|
||||
VectorF32<3, 1> posC, VectorF32<3, 1> posD,
|
||||
VectorF32<1, 4> radii
|
||||
std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& pos,
|
||||
VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)> radii
|
||||
) {
|
||||
VectorF32<3, 1> sA = rayOrigin - posA;
|
||||
VectorF32<3, 1> sB = rayOrigin - posB;
|
||||
VectorF32<3, 1> sC = rayOrigin - posC;
|
||||
VectorF32<3, 1> sD = rayOrigin - posD;
|
||||
constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
|
||||
constexpr std::uint8_t Total = Packing * N;
|
||||
using PVec = VectorF32<3, Packing>;
|
||||
using OutVec = VectorF32<1, Total>;
|
||||
|
||||
PVec rayOriginP = detail::SplatToPacking<Packing>(rayOrigin);
|
||||
PVec rayDirP = detail::SplatToPacking<Packing>(rayDir);
|
||||
|
||||
std::array<PVec, N> s;
|
||||
std::array<PVec, N> rayDirArr;
|
||||
for (std::uint8_t i = 0; i < N; ++i) {
|
||||
s[i] = rayOriginP - pos[i];
|
||||
rayDirArr[i] = rayDirP;
|
||||
}
|
||||
|
||||
// dirDotS_i = rayDir · (rayOrigin - pos_i)
|
||||
VectorF32<1, 4> dirDotS = VectorF32<3, 1>::Dot(
|
||||
rayDir, sA, rayDir, sB, rayDir, sC, rayDir, sD);
|
||||
// sqDist_i = |rayOrigin - pos_i|² (a.k.a. LengthSq of the s vectors)
|
||||
VectorF32<1, 4> sqDist = VectorF32<3, 1>::LengthSq(sA, sB, sC, sD);
|
||||
// aScalar = rayDir · rayDir, broadcast across four lanes.
|
||||
VectorF32<1, 4> aScalar = VectorF32<3, 1>::LengthSq(
|
||||
rayDir, rayDir, rayDir, rayDir);
|
||||
auto dirDotS = detail::DotArrays(rayDirArr, s);
|
||||
// sqDist_i = |rayOrigin - pos_i|² across all packed slots.
|
||||
auto sqDist = std::apply([](auto... args) { return PVec::LengthSq(args...); }, s);
|
||||
// aScalar = rayDir · rayDir, broadcast across every lane.
|
||||
auto aScalar = std::apply([](auto... args) { return PVec::LengthSq(args...); }, rayDirArr);
|
||||
|
||||
VectorF32<1, 4> two(2.0f);
|
||||
VectorF32<1, 4> four(4.0f);
|
||||
VectorF32<1, 4> b = two * dirDotS;
|
||||
VectorF32<1, 4> c = sqDist - radii * radii;
|
||||
OutVec two(2.0f);
|
||||
OutVec four(4.0f);
|
||||
OutVec b = two * dirDotS;
|
||||
OutVec c = sqDist - radii * radii;
|
||||
// discriminant = b² - 4·a·c
|
||||
VectorF32<1, 4> disc = b * b - four * aScalar * c;
|
||||
OutVec disc = b * b - four * aScalar * c;
|
||||
|
||||
std::array<float, 4> discArr = disc.template Store<float>();
|
||||
std::array<float, 4> bArr = b.template Store<float>();
|
||||
std::array<float, 4> aArr = aScalar.template Store<float>();
|
||||
auto discArr = disc.template Store<float>();
|
||||
auto bArr = b.template Store<float>();
|
||||
auto aArr = aScalar.template Store<float>();
|
||||
|
||||
constexpr float maxF = std::numeric_limits<float>::max();
|
||||
alignas(16) std::array<float, 4> out{};
|
||||
for (std::uint8_t i = 0; i < 4; ++i) {
|
||||
alignas(64) std::array<float, OutVec::AlignmentElement> out{};
|
||||
for (std::uint8_t i = 0; i < Total; ++i) {
|
||||
float d = discArr[i];
|
||||
if (d < 0.0f) { out[i] = maxF; continue; }
|
||||
float sqrtD = std::sqrt(d);
|
||||
float t = -0.5f * (bArr[i] + sqrtD) / aArr[i];
|
||||
out[i] = (t > 0.0f) ? t : maxF;
|
||||
}
|
||||
return VectorF32<1, 4>(out.data());
|
||||
return OutVec(out.data());
|
||||
}
|
||||
|
||||
// One ray against four OBBs. Each box is described by world-space position,
|
||||
// half-extent vector (per-axis sizes), and a unit quaternion rotation.
|
||||
export inline VectorF32<1, 4> IntersectionTestRayOrientedBox(
|
||||
// Packing that fits both Len=3 (positions, sizes) and Len=4 (quaternions)
|
||||
// in one SIMD register. Len=4's OptimalPacking is always ≤ Len=3's, so we
|
||||
// use the smaller of the two so a single Packing covers every type the
|
||||
// routine needs.
|
||||
inline constexpr std::uint8_t RayOBBPacking = std::min(
|
||||
VectorF32<3, 1>::OptimalPacking, VectorF32<4, 1>::OptimalPacking);
|
||||
|
||||
// One ray against Packing*BatchSize OBBs. Each box is described by
|
||||
// world-space position, full-extent size, and a unit quaternion rotation.
|
||||
export template <std::uint8_t Packing = RayOBBPacking>
|
||||
inline VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)>
|
||||
IntersectionTestRayOrientedBox(
|
||||
VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir,
|
||||
VectorF32<3, 1> posA, VectorF32<3, 1> sizeA, VectorF32<4, 1> rotA,
|
||||
VectorF32<3, 1> posB, VectorF32<3, 1> sizeB, VectorF32<4, 1> rotB,
|
||||
VectorF32<3, 1> posC, VectorF32<3, 1> sizeC, VectorF32<4, 1> rotC,
|
||||
VectorF32<3, 1> posD, VectorF32<3, 1> sizeD, VectorF32<4, 1> rotD
|
||||
std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& pos,
|
||||
std::array<VectorF32<3, Packing>, VectorF32<3, Packing>::BatchSize> const& size,
|
||||
std::array<VectorF32<4, Packing>, VectorF32<3, Packing>::BatchSize> const& rot
|
||||
) {
|
||||
// Conjugate quaternion: negate xyz, keep w. Negate<{true,true,true,false}>
|
||||
// is constant-folded into a single XOR with a mask vector.
|
||||
VectorF32<4, 1> invRotA = rotA.template Negate<{{true, true, true, false}}>();
|
||||
VectorF32<4, 1> invRotB = rotB.template Negate<{{true, true, true, false}}>();
|
||||
VectorF32<4, 1> invRotC = rotC.template Negate<{{true, true, true, false}}>();
|
||||
VectorF32<4, 1> invRotD = rotD.template Negate<{{true, true, true, false}}>();
|
||||
constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
|
||||
constexpr std::uint8_t Total = Packing * N;
|
||||
using PVec3 = VectorF32<3, Packing>;
|
||||
using PVec4 = VectorF32<4, Packing>;
|
||||
using OutVec = VectorF32<1, Total>;
|
||||
|
||||
VectorF32<3, 1> localOriginA = VectorF32<3, 1>::Rotate(rayOrigin - posA, invRotA);
|
||||
VectorF32<3, 1> localOriginB = VectorF32<3, 1>::Rotate(rayOrigin - posB, invRotB);
|
||||
VectorF32<3, 1> localOriginC = VectorF32<3, 1>::Rotate(rayOrigin - posC, invRotC);
|
||||
VectorF32<3, 1> localOriginD = VectorF32<3, 1>::Rotate(rayOrigin - posD, invRotD);
|
||||
PVec3 rayOriginP = detail::SplatToPacking<Packing>(rayOrigin);
|
||||
PVec3 rayDirP = detail::SplatToPacking<Packing>(rayDir);
|
||||
|
||||
VectorF32<3, 1> localDirA = VectorF32<3, 1>::Rotate(rayDir, invRotA);
|
||||
VectorF32<3, 1> localDirB = VectorF32<3, 1>::Rotate(rayDir, invRotB);
|
||||
VectorF32<3, 1> localDirC = VectorF32<3, 1>::Rotate(rayDir, invRotC);
|
||||
VectorF32<3, 1> localDirD = VectorF32<3, 1>::Rotate(rayDir, invRotD);
|
||||
// Conjugate quaternion: negate xyz, keep w. Constant-folded into one
|
||||
// XOR with a mask vector inside Negate.
|
||||
std::array<PVec3, N> localOrigin, localDir, half;
|
||||
for (std::uint8_t i = 0; i < N; ++i) {
|
||||
PVec4 invRot = rot[i].template Negate<{{true, true, true, false}}>();
|
||||
localOrigin[i] = PVec3::Rotate(rayOriginP - pos[i], invRot);
|
||||
localDir[i] = PVec3::Rotate(rayDirP, invRot);
|
||||
half[i] = size[i] * 0.5f;
|
||||
}
|
||||
|
||||
VectorF32<3, 1> halfA = sizeA * 0.5f;
|
||||
VectorF32<3, 1> halfB = sizeB * 0.5f;
|
||||
VectorF32<3, 1> halfC = sizeC * 0.5f;
|
||||
VectorF32<3, 1> halfD = sizeD * 0.5f;
|
||||
|
||||
std::array<std::array<float, 4>, 4> origLanes{
|
||||
localOriginA.template Store<float>(),
|
||||
localOriginB.template Store<float>(),
|
||||
localOriginC.template Store<float>(),
|
||||
localOriginD.template Store<float>(),
|
||||
};
|
||||
std::array<std::array<float, 4>, 4> dirLanes{
|
||||
localDirA.template Store<float>(),
|
||||
localDirB.template Store<float>(),
|
||||
localDirC.template Store<float>(),
|
||||
localDirD.template Store<float>(),
|
||||
};
|
||||
std::array<std::array<float, 4>, 4> halfLanes{
|
||||
halfA.template Store<float>(),
|
||||
halfB.template Store<float>(),
|
||||
halfC.template Store<float>(),
|
||||
halfD.template Store<float>(),
|
||||
};
|
||||
std::array<std::array<float, PVec3::AlignmentElement>, N> origLanes, dirLanes, halfLanes;
|
||||
for (std::uint8_t i = 0; i < N; ++i) {
|
||||
origLanes[i] = localOrigin[i].template Store<float>();
|
||||
dirLanes[i] = localDir[i].template Store<float>();
|
||||
halfLanes[i] = half[i].template Store<float>();
|
||||
}
|
||||
|
||||
constexpr float eps = std::numeric_limits<float>::epsilon();
|
||||
constexpr float maxF = std::numeric_limits<float>::max();
|
||||
alignas(16) std::array<float, 4> out{};
|
||||
for (std::uint8_t b = 0; b < 4; ++b) {
|
||||
alignas(64) std::array<float, OutVec::AlignmentElement> out{};
|
||||
for (std::uint8_t b = 0; b < Total; ++b) {
|
||||
std::uint8_t batchIdx = b / Packing;
|
||||
std::uint8_t subIdx = b % Packing;
|
||||
float tMin = 0.0f;
|
||||
float tMax = maxF;
|
||||
bool miss = false;
|
||||
for (std::uint8_t i = 0; i < 3; ++i) {
|
||||
float d = dirLanes[b][i];
|
||||
float o = origLanes[b][i];
|
||||
float h = halfLanes[b][i];
|
||||
std::uint8_t lane = static_cast<std::uint8_t>(subIdx * 3 + i);
|
||||
float d = dirLanes[batchIdx][lane];
|
||||
float o = origLanes[batchIdx][lane];
|
||||
float h = halfLanes[batchIdx][lane];
|
||||
if (std::abs(d) < eps) {
|
||||
if (o < -h || o > h) { miss = true; break; }
|
||||
} else {
|
||||
|
|
@ -213,87 +295,65 @@ namespace Crafter {
|
|||
}
|
||||
out[b] = miss ? maxF : (tMin >= 0.0f ? tMin : tMax);
|
||||
}
|
||||
return VectorF32<1, 4>(out.data());
|
||||
return OutVec(out.data());
|
||||
}
|
||||
|
||||
// One sphere against four OBBs. boxMatrix encodes rotation in m[r][0..2]
|
||||
// and translation in m[r][3].
|
||||
export inline VectorF32<1, 4> IntersectionTestSphereOrientedBox(
|
||||
VectorF32<3, 1> sphereCenter, VectorF32<1, 4> radii,
|
||||
VectorF32<3, 1> sizeA, MatrixRowMajor<float, 4, 3, 1> boxA,
|
||||
VectorF32<3, 1> sizeB, MatrixRowMajor<float, 4, 3, 1> boxB,
|
||||
VectorF32<3, 1> sizeC, MatrixRowMajor<float, 4, 3, 1> boxC,
|
||||
VectorF32<3, 1> sizeD, MatrixRowMajor<float, 4, 3, 1> boxD
|
||||
// One sphere against Packing*BatchSize OBBs described by a PackedOBBs.
|
||||
// Returns 0.0 per pair where the sphere intersects the box, max-float
|
||||
// otherwise. `radii` carries one sphere radius per pair in the same lane
|
||||
// order as the resulting test output.
|
||||
export template <std::uint8_t Packing = VectorF32<3, 1>::OptimalPacking>
|
||||
inline VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)>
|
||||
IntersectionTestSphereOrientedBox(
|
||||
VectorF32<3, 1> sphereCenter,
|
||||
VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)> radii,
|
||||
PackedOBBs<Packing> const& boxes
|
||||
) {
|
||||
auto perBox = [&](MatrixRowMajor<float, 4, 3, 1> const& m,
|
||||
VectorF32<3, 1> const& size,
|
||||
VectorF32<3, 1>& xAxis,
|
||||
VectorF32<3, 1>& yAxis,
|
||||
VectorF32<3, 1>& zAxis,
|
||||
VectorF32<3, 1>& delta) {
|
||||
// Existing semantics: the OBB axes are read from the rows of the
|
||||
// upper 3x3 block, and the translation column is gathered from the
|
||||
// w lane of each row.
|
||||
std::array<float, 4> r0 = m.rows[0].template Store<float>();
|
||||
std::array<float, 4> r1 = m.rows[1].template Store<float>();
|
||||
std::array<float, 4> r2 = m.rows[2].template Store<float>();
|
||||
alignas(16) float xBuf[4] = { r0[0], r0[1], r0[2], 0.0f };
|
||||
alignas(16) float yBuf[4] = { r1[0], r1[1], r1[2], 0.0f };
|
||||
alignas(16) float zBuf[4] = { r2[0], r2[1], r2[2], 0.0f };
|
||||
alignas(16) float oBuf[4] = { r0[3], r1[3], r2[3], 0.0f };
|
||||
xAxis = VectorF32<3, 1>(xBuf);
|
||||
yAxis = VectorF32<3, 1>(yBuf);
|
||||
zAxis = VectorF32<3, 1>(zBuf);
|
||||
VectorF32<3, 1> origin(oBuf);
|
||||
delta = sphereCenter - origin;
|
||||
(void)size;
|
||||
};
|
||||
constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize;
|
||||
constexpr std::uint8_t Total = Packing * N;
|
||||
using PVec3 = VectorF32<3, Packing>;
|
||||
using OutVec = VectorF32<1, Total>;
|
||||
|
||||
VectorF32<3, 1> xA, yA, zA, dA;
|
||||
VectorF32<3, 1> xB, yB, zB, dB;
|
||||
VectorF32<3, 1> xC, yC, zC, dC;
|
||||
VectorF32<3, 1> xD, yD, zD, dD;
|
||||
perBox(boxA, sizeA, xA, yA, zA, dA);
|
||||
perBox(boxB, sizeB, xB, yB, zB, dB);
|
||||
perBox(boxC, sizeC, xC, yC, zC, dC);
|
||||
perBox(boxD, sizeD, xD, yD, zD, dD);
|
||||
PVec3 sphereCenterP = detail::SplatToPacking<Packing>(sphereCenter);
|
||||
std::array<PVec3, N> delta;
|
||||
for (std::uint8_t i = 0; i < N; ++i) {
|
||||
delta[i] = sphereCenterP - boxes.origin[i];
|
||||
}
|
||||
|
||||
// Local sphere center per box: project delta onto each box axis. We
|
||||
// produce {lx, ly, lz, lx, ly, lz, lx, ly, lz, lx, ly, lz} as three
|
||||
// packed 4-wide Dot results (one Dot per axis).
|
||||
VectorF32<1, 4> locX = VectorF32<3, 1>::Dot(
|
||||
dA, xA, dB, xB, dC, xC, dD, xD);
|
||||
VectorF32<1, 4> locY = VectorF32<3, 1>::Dot(
|
||||
dA, yA, dB, yB, dC, yC, dD, yD);
|
||||
VectorF32<1, 4> locZ = VectorF32<3, 1>::Dot(
|
||||
dA, zA, dB, zB, dC, zC, dD, zD);
|
||||
// Project the world-space delta onto each box axis.
|
||||
auto locX = detail::DotArrays(delta, boxes.xAxis);
|
||||
auto locY = detail::DotArrays(delta, boxes.yAxis);
|
||||
auto locZ = detail::DotArrays(delta, boxes.zAxis);
|
||||
|
||||
std::array<float, 4> lxArr = locX.template Store<float>();
|
||||
std::array<float, 4> lyArr = locY.template Store<float>();
|
||||
std::array<float, 4> lzArr = locZ.template Store<float>();
|
||||
std::array<float, 4> rArr = radii.template Store<float>();
|
||||
std::array<std::array<float, 4>, 4> sizeLanes{
|
||||
sizeA.template Store<float>(),
|
||||
sizeB.template Store<float>(),
|
||||
sizeC.template Store<float>(),
|
||||
sizeD.template Store<float>(),
|
||||
};
|
||||
auto lxArr = locX.template Store<float>();
|
||||
auto lyArr = locY.template Store<float>();
|
||||
auto lzArr = locZ.template Store<float>();
|
||||
auto rArr = radii.template Store<float>();
|
||||
std::array<std::array<float, PVec3::AlignmentElement>, N> sizeLanes;
|
||||
for (std::uint8_t i = 0; i < N; ++i) {
|
||||
sizeLanes[i] = boxes.halfSize[i].template Store<float>();
|
||||
}
|
||||
|
||||
alignas(16) std::array<float, 4> out{};
|
||||
for (std::uint8_t i = 0; i < 4; ++i) {
|
||||
constexpr float maxF = std::numeric_limits<float>::max();
|
||||
alignas(64) std::array<float, OutVec::AlignmentElement> out{};
|
||||
for (std::uint8_t i = 0; i < Total; ++i) {
|
||||
std::uint8_t batchIdx = i / Packing;
|
||||
std::uint8_t subIdx = i % Packing;
|
||||
float lx = lxArr[i], ly = lyArr[i], lz = lzArr[i];
|
||||
float sx = sizeLanes[i][0], sy = sizeLanes[i][1], sz = sizeLanes[i][2];
|
||||
float sx = sizeLanes[batchIdx][subIdx * 3 + 0];
|
||||
float sy = sizeLanes[batchIdx][subIdx * 3 + 1];
|
||||
float sz = sizeLanes[batchIdx][subIdx * 3 + 2];
|
||||
float cx = std::clamp(lx, -sx, sx);
|
||||
float cy = std::clamp(ly, -sy, sy);
|
||||
float cz = std::clamp(lz, -sz, sz);
|
||||
float dx = lx - cx, dy = ly - cy, dz = lz - cz;
|
||||
float distSq = dx * dx + dy * dy + dz * dz;
|
||||
float r = rArr[i];
|
||||
// Returns 0.0 on hit, max on miss - keeps a consistent
|
||||
// "t-like" output signature with the other intersection tests.
|
||||
out[i] = (distSq <= r * r) ? 0.0f : std::numeric_limits<float>::max();
|
||||
// Returns 0.0 on hit, max on miss — same "t-like" output signature
|
||||
// as the ray-vs-X tests.
|
||||
out[i] = (distSq <= r * r) ? 0.0f : maxF;
|
||||
}
|
||||
return VectorF32<1, 4>(out.data());
|
||||
return OutVec(out.data());
|
||||
}
|
||||
|
||||
// Eight local corners of a unit OBB transformed by `matrix`. Uses one
|
||||
|
|
@ -350,100 +410,104 @@ namespace Crafter {
|
|||
return result;
|
||||
}
|
||||
|
||||
// SAT against fifteen separating axes (3 box-A, 3 box-B, 9 cross products).
|
||||
// We compute every corner projection with batched 4-pair Dots: each axis
|
||||
// projects four corners per call, two calls per axis covers the 8 corners.
|
||||
export inline bool IntersectionTestOrientedBoxOrientedBox(
|
||||
VectorF32<3, 1> sizeA, MatrixRowMajor<float, 4, 3, 1> boxA,
|
||||
VectorF32<3, 1> sizeB, MatrixRowMajor<float, 4, 3, 1> boxB
|
||||
// SAT against the 15 separating axis candidates (3 from box A, 3 from
|
||||
// box B, 9 cross products). Returns 0.0 per pair when the boxes overlap
|
||||
// and max-float when a separating axis was found, matching the
|
||||
// "smaller-is-closer" convention of the ray-vs-X tests.
|
||||
//
|
||||
// The corner-free formulation: for an OBB (origin O, unit axes X/Y/Z,
|
||||
// half-extents h) and a separating-axis candidate a, the projection
|
||||
// interval is centered at O·a with radius hx|X·a| + hy|Y·a| + hz|Z·a|.
|
||||
// Each axis therefore only needs four dot products per box (and a couple
|
||||
// of fused-multiply-adds) instead of eight corner projections — every
|
||||
// sub-pair runs in parallel inside the SIMD lanes.
|
||||
export template <std::uint8_t Packing = VectorF32<3, 1>::OptimalPacking>
|
||||
inline VectorF32<1, static_cast<std::uint8_t>(Packing * VectorF32<3, Packing>::BatchSize)>
|
||||
IntersectionTestOrientedBoxOrientedBox(
|
||||
PackedOBBs<Packing> const& a, PackedOBBs<Packing> const& b
|
||||
) {
|
||||
std::array<VectorF32<3, 1>, 8> cornersA = GetOBBCorners(sizeA, boxA);
|
||||
std::array<VectorF32<3, 1>, 8> cornersB = GetOBBCorners(sizeB, boxB);
|
||||
using PVec = VectorF32<3, Packing>;
|
||||
constexpr std::uint8_t N = PVec::BatchSize;
|
||||
constexpr std::uint8_t Total = Packing * N;
|
||||
using OutVec = VectorF32<1, Total>;
|
||||
|
||||
// Axes are the upper-3 lanes of each matrix row (same convention as
|
||||
// SphereOrientedBox). ExtractLo<3> just retypes the SIMD register; the
|
||||
// 4th lane is ignored by the Len=3 ops below.
|
||||
std::array<VectorF32<3, 1>, 3> axesA = {
|
||||
boxA.rows[0].template ExtractLo<3>(),
|
||||
boxA.rows[1].template ExtractLo<3>(),
|
||||
boxA.rows[2].template ExtractLo<3>(),
|
||||
// Per-pair half-extents pulled out of each PackedOBBs into flat
|
||||
// VectorF32<1, Total> registers so they can multiply the projection
|
||||
// dots directly.
|
||||
OutVec halfA_x = detail::ExtractComponent<0, Packing>(a.halfSize);
|
||||
OutVec halfA_y = detail::ExtractComponent<1, Packing>(a.halfSize);
|
||||
OutVec halfA_z = detail::ExtractComponent<2, Packing>(a.halfSize);
|
||||
OutVec halfB_x = detail::ExtractComponent<0, Packing>(b.halfSize);
|
||||
OutVec halfB_y = detail::ExtractComponent<1, Packing>(b.halfSize);
|
||||
OutVec halfB_z = detail::ExtractComponent<2, Packing>(b.halfSize);
|
||||
|
||||
constexpr float maxF = std::numeric_limits<float>::max();
|
||||
alignas(64) std::array<float, OutVec::AlignmentElement> out{};
|
||||
for (std::uint8_t i = 0; i < Total; ++i) out[i] = 0.0f; // start: overlap
|
||||
|
||||
auto axesOfA = [&](std::uint8_t i) -> std::array<PVec, N> const& {
|
||||
return (i == 0) ? a.xAxis : (i == 1) ? a.yAxis : a.zAxis;
|
||||
};
|
||||
std::array<VectorF32<3, 1>, 3> axesB = {
|
||||
boxB.rows[0].template ExtractLo<3>(),
|
||||
boxB.rows[1].template ExtractLo<3>(),
|
||||
boxB.rows[2].template ExtractLo<3>(),
|
||||
auto axesOfB = [&](std::uint8_t i) -> std::array<PVec, N> const& {
|
||||
return (i == 0) ? b.xAxis : (i == 1) ? b.yAxis : b.zAxis;
|
||||
};
|
||||
|
||||
std::array<VectorF32<3, 1>, 15> axes{};
|
||||
axes[0] = axesA[0]; axes[1] = axesA[1]; axes[2] = axesA[2];
|
||||
axes[3] = axesB[0]; axes[4] = axesB[1]; axes[5] = axesB[2];
|
||||
// Normalize the nine cross axes with batched Normalize calls. Each
|
||||
// call takes four inputs (Packing=3 is not in the API), so nine axes
|
||||
// need three calls: two full batches of four, plus a final batch
|
||||
// that repeats the ninth axis in every slot to pad it out.
|
||||
std::array<VectorF32<3, 1>, 9> crossAxes{};
|
||||
std::uint8_t k = 0;
|
||||
// For each separating-axis candidate, compute per-pair min/max for
|
||||
// both boxes and OR the "separating" condition into `out`.
|
||||
auto checkAxis = [&](std::array<PVec, N> const& axis) {
|
||||
OutVec cA = detail::DotArrays(a.origin, axis);
|
||||
OutVec dA_x = detail::DotArrays(a.xAxis, axis);
|
||||
OutVec dA_y = detail::DotArrays(a.yAxis, axis);
|
||||
OutVec dA_z = detail::DotArrays(a.zAxis, axis);
|
||||
OutVec rA = halfA_x * detail::AbsVec(dA_x)
|
||||
+ halfA_y * detail::AbsVec(dA_y)
|
||||
+ halfA_z * detail::AbsVec(dA_z);
|
||||
|
||||
OutVec cB = detail::DotArrays(b.origin, axis);
|
||||
OutVec dB_x = detail::DotArrays(b.xAxis, axis);
|
||||
OutVec dB_y = detail::DotArrays(b.yAxis, axis);
|
||||
OutVec dB_z = detail::DotArrays(b.zAxis, axis);
|
||||
OutVec rB = halfB_x * detail::AbsVec(dB_x)
|
||||
+ halfB_y * detail::AbsVec(dB_y)
|
||||
+ halfB_z * detail::AbsVec(dB_z);
|
||||
|
||||
OutVec minA = cA - rA;
|
||||
OutVec maxA = cA + rA;
|
||||
OutVec minB = cB - rB;
|
||||
OutVec maxB = cB + rB;
|
||||
|
||||
auto minAArr = minA.template Store<float>();
|
||||
auto maxAArr = maxA.template Store<float>();
|
||||
auto minBArr = minB.template Store<float>();
|
||||
auto maxBArr = maxB.template Store<float>();
|
||||
for (std::uint8_t i = 0; i < Total; ++i) {
|
||||
// NaN comparisons (from degenerate cross axes) return false and
|
||||
// correctly leave `out[i]` untouched on this axis.
|
||||
if (maxAArr[i] < minBArr[i] || maxBArr[i] < minAArr[i]) {
|
||||
out[i] = maxF;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
checkAxis(a.xAxis); checkAxis(a.yAxis); checkAxis(a.zAxis);
|
||||
checkAxis(b.xAxis); checkAxis(b.yAxis); checkAxis(b.zAxis);
|
||||
|
||||
// The 9 cross-product axes. Each batch slot's cross axes are computed
|
||||
// per-slot, then normalized together (one PVec::Normalize per cross
|
||||
// index processes N packed slots in parallel).
|
||||
for (std::uint8_t i = 0; i < 3; ++i) {
|
||||
auto const& aAx = axesOfA(i);
|
||||
for (std::uint8_t j = 0; j < 3; ++j) {
|
||||
crossAxes[k++] = VectorF32<3, 1>::Cross(axesA[i], axesB[j]);
|
||||
auto const& bAx = axesOfB(j);
|
||||
std::array<PVec, N> crossAx;
|
||||
for (std::uint8_t k = 0; k < N; ++k) crossAx[k] = PVec::Cross(aAx[k], bAx[k]);
|
||||
auto normalized = std::apply([](auto... args) {
|
||||
return PVec::Normalize(args...);
|
||||
}, crossAx);
|
||||
checkAxis(normalized);
|
||||
}
|
||||
}
|
||||
auto norm0 = VectorF32<3, 1>::Normalize(crossAxes[0], crossAxes[1], crossAxes[2], crossAxes[3]);
|
||||
auto norm1 = VectorF32<3, 1>::Normalize(crossAxes[4], crossAxes[5], crossAxes[6], crossAxes[7]);
|
||||
auto norm2 = VectorF32<3, 1>::Normalize(crossAxes[8], crossAxes[8], crossAxes[8], crossAxes[8]);
|
||||
axes[6] = std::get<0>(norm0);
|
||||
axes[7] = std::get<1>(norm0);
|
||||
axes[8] = std::get<2>(norm0);
|
||||
axes[9] = std::get<3>(norm0);
|
||||
axes[10] = std::get<0>(norm1);
|
||||
axes[11] = std::get<1>(norm1);
|
||||
axes[12] = std::get<2>(norm1);
|
||||
axes[13] = std::get<3>(norm1);
|
||||
axes[14] = std::get<0>(norm2);
|
||||
|
||||
for (std::uint8_t axisIdx = 0; axisIdx < 15; ++axisIdx) {
|
||||
VectorF32<3, 1> axis = axes[axisIdx];
|
||||
// Project all 8 corners of each box onto `axis` using two batched
|
||||
// 4-pair Dot calls (lo and hi corners).
|
||||
VectorF32<1, 4> projA_lo = VectorF32<3, 1>::Dot(
|
||||
cornersA[0], axis, cornersA[1], axis,
|
||||
cornersA[2], axis, cornersA[3], axis);
|
||||
VectorF32<1, 4> projA_hi = VectorF32<3, 1>::Dot(
|
||||
cornersA[4], axis, cornersA[5], axis,
|
||||
cornersA[6], axis, cornersA[7], axis);
|
||||
VectorF32<1, 4> projB_lo = VectorF32<3, 1>::Dot(
|
||||
cornersB[0], axis, cornersB[1], axis,
|
||||
cornersB[2], axis, cornersB[3], axis);
|
||||
VectorF32<1, 4> projB_hi = VectorF32<3, 1>::Dot(
|
||||
cornersB[4], axis, cornersB[5], axis,
|
||||
cornersB[6], axis, cornersB[7], axis);
|
||||
|
||||
std::array<float, 4> aLo = projA_lo.template Store<float>();
|
||||
std::array<float, 4> aHi = projA_hi.template Store<float>();
|
||||
std::array<float, 4> bLo = projB_lo.template Store<float>();
|
||||
std::array<float, 4> bHi = projB_hi.template Store<float>();
|
||||
|
||||
float minA = aLo[0], maxA = aLo[0];
|
||||
for (std::uint8_t i = 1; i < 4; ++i) {
|
||||
minA = std::min(minA, aLo[i]);
|
||||
maxA = std::max(maxA, aLo[i]);
|
||||
}
|
||||
for (std::uint8_t i = 0; i < 4; ++i) {
|
||||
minA = std::min(minA, aHi[i]);
|
||||
maxA = std::max(maxA, aHi[i]);
|
||||
}
|
||||
float minB = bLo[0], maxB = bLo[0];
|
||||
for (std::uint8_t i = 1; i < 4; ++i) {
|
||||
minB = std::min(minB, bLo[i]);
|
||||
maxB = std::max(maxB, bLo[i]);
|
||||
}
|
||||
for (std::uint8_t i = 0; i < 4; ++i) {
|
||||
minB = std::min(minB, bHi[i]);
|
||||
maxB = std::max(maxB, bHi[i]);
|
||||
}
|
||||
|
||||
if (maxA < minB || maxB < minA) return false;
|
||||
}
|
||||
return true;
|
||||
return OutVec(out.data());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -270,12 +270,12 @@ namespace Crafter {
|
|||
// R0 = up × R2 is linear in R2, so its normalized direction does
|
||||
// not depend on whether we hand R2 in before or after its own
|
||||
// normalize. Computing R0_raw from the un-normalized R2 lets us
|
||||
// satisfy the 4-input Normalize requirement with one batched call
|
||||
// (duplicating R2 and R0 in the padding slots).
|
||||
// satisfy the VectorF32<3,1>::BatchSize Normalize requirement with
|
||||
// one batched call (duplicating R2 and R0 in the padding slots).
|
||||
VectorF32<3, 1> R0Raw = VectorF32<3, 1>::Cross(upDirection, eyeDirection);
|
||||
auto normalized = VectorF32<3, 1>::Normalize(eyeDirection, R0Raw, eyeDirection, R0Raw);
|
||||
VectorF32<3, 1> R2 = std::get<0>(normalized);
|
||||
VectorF32<3, 1> R0 = std::get<1>(normalized);
|
||||
VectorF32<3, 1> R2 = normalized[0];
|
||||
VectorF32<3, 1> R0 = normalized[1];
|
||||
VectorF32<3, 1> R1 = VectorF32<3, 1>::Cross(R2, R0);
|
||||
VectorF32<3, 1> negEye = -eyePosition;
|
||||
|
||||
|
|
|
|||
|
|
@ -554,9 +554,41 @@ namespace Crafter {
|
|||
}
|
||||
}
|
||||
|
||||
constexpr static std::tuple<VectorF16<Len, Packing>, VectorF16<Len, Packing>, VectorF16<Len, Packing>, VectorF16<Len, Packing>> Normalize(
|
||||
VectorF16<Len, Packing> A,
|
||||
VectorF16<Len, Packing> C,
|
||||
// Public variadic surface — one name per op, arity locked to BatchSize
|
||||
// (or 2*BatchSize for Dot). Forwards to the *Pack helpers below which
|
||||
// carry the SIMD bodies and per-(Len,Packing) requires clauses.
|
||||
// Batched Normalize entry point for VectorF16<Len, Packing>.
// Constraints (from the requires clause): every argument in `rest` must be
// exactly VectorF16<Len, Packing>, and the total argument count
// (first + rest) must equal VectorBase<Len, Packing, _Float16>::BatchSize.
// Forwards all arguments, in order, to NormalizePack and returns its result.
template <typename... Rest>
|
||||
requires ((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...) &&
|
||||
(1 + sizeof...(Rest) == VectorBase<Len, Packing, _Float16>::BatchSize))
|
||||
constexpr static auto Normalize(VectorF16<Len, Packing> first, Rest... rest) {
|
||||
return NormalizePack(first, rest...);
|
||||
}
|
||||
|
||||
// Batched Length entry point for VectorF16<Len, Packing>.
// Constraints (from the requires clause): every argument in `rest` must be
// exactly VectorF16<Len, Packing>, and the total argument count
// (first + rest) must equal VectorBase<Len, Packing, _Float16>::BatchSize.
// Forwards all arguments, in order, to LengthPack and returns its result.
template <typename... Rest>
|
||||
requires ((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...) &&
|
||||
(1 + sizeof...(Rest) == VectorBase<Len, Packing, _Float16>::BatchSize))
|
||||
constexpr static auto Length(VectorF16<Len, Packing> first, Rest... rest) {
|
||||
return LengthPack(first, rest...);
|
||||
}
|
||||
|
||||
// Batched squared-length entry point for VectorF16<Len, Packing>.
// Constraints (from the requires clause): every argument in `rest` must be
// exactly VectorF16<Len, Packing>, and the total argument count
// (first + rest) must equal VectorBase<Len, Packing, _Float16>::BatchSize.
// Forwards all arguments, in order, to LengthSqPack and returns its result.
template <typename... Rest>
|
||||
requires ((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...) &&
|
||||
(1 + sizeof...(Rest) == VectorBase<Len, Packing, _Float16>::BatchSize))
|
||||
constexpr static auto LengthSq(VectorF16<Len, Packing> first, Rest... rest) {
|
||||
return LengthSqPack(first, rest...);
|
||||
}
|
||||
|
||||
// Batched Dot entry point for VectorF16<Len, Packing>.
// Constraints (from the requires clause): every argument in `rest` must be
// exactly VectorF16<Len, Packing>, and the total argument count
// (first + rest) must equal 2 * VectorBase<Len, Packing, _Float16>::BatchSize,
// i.e. two operands per batch slot.
// NOTE(review): operands are presumably consumed as consecutive (lhs, rhs)
// pairs, matching the A0, A1, C0, C1, ... parameter naming of DotPack — confirm.
// Forwards all arguments, in order, to DotPack and returns its result.
template <typename... Rest>
|
||||
requires ((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...) &&
|
||||
(1 + sizeof...(Rest) == 2 * VectorBase<Len, Packing, _Float16>::BatchSize))
|
||||
constexpr static auto Dot(VectorF16<Len, Packing> first, Rest... rest) {
|
||||
return DotPack(first, rest...);
|
||||
}
|
||||
|
||||
private:
|
||||
constexpr static std::array<VectorF16<Len, Packing>, VectorBase<Len, Packing, _Float16>::BatchSize> NormalizePack(
|
||||
VectorF16<Len, Packing> A,
|
||||
VectorF16<Len, Packing> C,
|
||||
VectorF16<Len, Packing> E,
|
||||
VectorF16<Len, Packing> G
|
||||
) requires(Len == 4 && Packing*Len == VectorBase<Len, Packing, _Float16>::AlignmentElement) {
|
||||
|
|
@ -616,8 +648,8 @@ namespace Crafter {
|
|||
}
|
||||
}
|
||||
|
||||
constexpr static std::tuple<VectorF16<Len, Packing>, VectorF16<Len, Packing>> Normalize(
|
||||
VectorF16<Len, Packing> A,
|
||||
constexpr static std::array<VectorF16<Len, Packing>, VectorBase<Len, Packing, _Float16>::BatchSize> NormalizePack(
|
||||
VectorF16<Len, Packing> A,
|
||||
VectorF16<Len, Packing> E
|
||||
) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, _Float16>::AlignmentElement) {
|
||||
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, _Float16>::VectorType, __m128h>) {
|
||||
|
|
@ -662,13 +694,13 @@ namespace Crafter {
|
|||
}
|
||||
}
|
||||
|
||||
constexpr static VectorF16<1, Packing*4> Length(
|
||||
VectorF16<Len, Packing> A,
|
||||
constexpr static VectorF16<1, Packing*4> LengthPack(
|
||||
VectorF16<Len, Packing> A,
|
||||
VectorF16<Len, Packing> C,
|
||||
VectorF16<Len, Packing> E,
|
||||
VectorF16<Len, Packing> G
|
||||
) requires(Len == 4 && Packing*Len == VectorBase<Len, Packing, _Float16>::AlignmentElement) {
|
||||
VectorF16<1, Packing*4> lenghtSq = LengthSq(A, C, E, G);
|
||||
VectorF16<1, Packing*4> lenghtSq = LengthSqPack(A, C, E, G);
|
||||
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, _Float16>::VectorType, __m128h>) {
|
||||
return VectorF16<1, Packing*4>(_mm_sqrt_ph(lenghtSq.v));
|
||||
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, _Float16>::VectorType, __m256h>) {
|
||||
|
|
@ -678,11 +710,11 @@ namespace Crafter {
|
|||
}
|
||||
}
|
||||
|
||||
constexpr static VectorF16<1, Packing*2> Length(
|
||||
VectorF16<Len, Packing> A,
|
||||
constexpr static VectorF16<1, Packing*2> LengthPack(
|
||||
VectorF16<Len, Packing> A,
|
||||
VectorF16<Len, Packing> E
|
||||
) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, _Float16>::AlignmentElement) {
|
||||
VectorF16<1, Packing*2> lenghtSq = LengthSq(A, E);
|
||||
VectorF16<1, Packing*2> lenghtSq = LengthSqPack(A, E);
|
||||
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, _Float16>::VectorType, __m128h>) {
|
||||
return VectorF16<1, Packing*2>(_mm_sqrt_ph(lenghtSq.v));
|
||||
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, _Float16>::VectorType, __m256h>) {
|
||||
|
|
@ -692,23 +724,23 @@ namespace Crafter {
|
|||
}
|
||||
}
|
||||
|
||||
constexpr static VectorF16<1, Packing*4> LengthSq(
|
||||
VectorF16<Len, Packing> A,
|
||||
constexpr static VectorF16<1, Packing*4> LengthSqPack(
|
||||
VectorF16<Len, Packing> A,
|
||||
VectorF16<Len, Packing> C,
|
||||
VectorF16<Len, Packing> E,
|
||||
VectorF16<Len, Packing> G
|
||||
) requires(Len == 4 && Packing*Len == VectorBase<Len, Packing, _Float16>::AlignmentElement) {
|
||||
return Dot(A, A, C, C, E, E, G, G);
|
||||
return DotPack(A, A, C, C, E, E, G, G);
|
||||
}
|
||||
|
||||
constexpr static VectorF16<1, Packing*2> LengthSq(
|
||||
VectorF16<Len, Packing> A,
|
||||
constexpr static VectorF16<1, Packing*2> LengthSqPack(
|
||||
VectorF16<Len, Packing> A,
|
||||
VectorF16<Len, Packing> E
|
||||
) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, _Float16>::AlignmentElement) {
|
||||
return Dot(A, A, E, E);
|
||||
return DotPack(A, A, E, E);
|
||||
}
|
||||
|
||||
constexpr static VectorF16<1, Packing*4> Dot(
|
||||
constexpr static VectorF16<1, Packing*4> DotPack(
|
||||
VectorF16<Len, Packing> A0, VectorF16<Len, Packing> A1,
|
||||
VectorF16<Len, Packing> C0, VectorF16<Len, Packing> C1,
|
||||
VectorF16<Len, Packing> E0, VectorF16<Len, Packing> E1,
|
||||
|
|
@ -744,7 +776,7 @@ namespace Crafter {
|
|||
}
|
||||
}
|
||||
|
||||
constexpr static VectorF16<1, Packing*2> Dot(
|
||||
constexpr static VectorF16<1, Packing*2> DotPack(
|
||||
VectorF16<Len, Packing> A0, VectorF16<Len, Packing> A1,
|
||||
VectorF16<Len, Packing> E0, VectorF16<Len, Packing> E1
|
||||
) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, _Float16>::AlignmentElement) {
|
||||
|
|
@ -1200,9 +1232,10 @@ namespace Crafter {
|
|||
}
|
||||
|
||||
template<typename... Rest>
|
||||
requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...))
|
||||
requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...) &&
|
||||
(1 + sizeof...(Rest) == VectorBase<Len, Packing, _Float16>::BatchSize))
|
||||
constexpr static auto LengthSq(VectorF16<Len, Packing> first, Rest... rest) {
|
||||
constexpr std::uint8_t N = 1 + sizeof...(Rest);
|
||||
constexpr std::uint8_t N = VectorBase<Len, Packing, _Float16>::BatchSize;
|
||||
VectorF16<1, static_cast<std::uint8_t>(Packing * N)> r;
|
||||
std::array<VectorF16<Len, Packing>, N> args{ first, rest... };
|
||||
for (std::uint8_t i = 0; i < N; ++i)
|
||||
|
|
@ -1218,7 +1251,8 @@ namespace Crafter {
|
|||
}
|
||||
|
||||
template<typename... Rest>
|
||||
requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...))
|
||||
requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...) &&
|
||||
(1 + sizeof...(Rest) == VectorBase<Len, Packing, _Float16>::BatchSize))
|
||||
constexpr static auto Length(VectorF16<Len, Packing> first, Rest... rest) {
|
||||
auto sq = LengthSq(first, rest...);
|
||||
for (std::uint8_t i = 0; i < decltype(sq)::NElems; ++i)
|
||||
|
|
@ -1227,7 +1261,8 @@ namespace Crafter {
|
|||
}
|
||||
|
||||
template<typename... Rest>
|
||||
requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...))
|
||||
requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...) &&
|
||||
(1 + sizeof...(Rest) == VectorBase<Len, Packing, _Float16>::BatchSize))
|
||||
constexpr static auto Normalize(VectorF16<Len, Packing> first, Rest... rest) {
|
||||
auto normOne = [](VectorF16<Len, Packing> u) {
|
||||
VectorF16<Len, Packing> out;
|
||||
|
|
@ -1243,7 +1278,7 @@ namespace Crafter {
|
|||
}
|
||||
return out;
|
||||
};
|
||||
return std::make_tuple(normOne(first), normOne(rest)...);
|
||||
return std::array<VectorF16<Len, Packing>, VectorBase<Len, Packing, _Float16>::BatchSize>{ normOne(first), normOne(rest)... };
|
||||
}
|
||||
|
||||
constexpr static VectorF16<Len, Packing> Rotate(VectorF16<3, Packing> v, VectorF16<4, Packing> q) requires(Len == 3) {
|
||||
|
|
|
|||
|
|
@ -449,8 +449,8 @@ namespace Crafter {
|
|||
}
|
||||
|
||||
template <std::array<bool, Len> values>
|
||||
constexpr VectorF32<Len, Packing> Negate() {
|
||||
std::array<float, VectorBase<Len, Packing, float>::AlignmentElement> mask = VectorBase<Len, Packing, float>::template GetNegateMask<values>();
|
||||
constexpr VectorF32<Len, Packing> Negate() const {
|
||||
std::array<float, VectorBase<Len, Packing, float>::AlignmentElement> mask = VectorBase<Len, Packing, float>::template GetNegateMask<values>();
|
||||
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
|
||||
return VectorF32<Len, Packing>(_mm_castsi128_ps(_mm_xor_si128(_mm_castps_si128(this->v), _mm_loadu_si128(reinterpret_cast<__m128i*>(mask.data())))));
|
||||
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
|
||||
|
|
@ -549,9 +549,41 @@ namespace Crafter {
|
|||
}
|
||||
}
|
||||
|
||||
constexpr static std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>, VectorF32<Len, Packing>, VectorF32<Len, Packing>> Normalize(
|
||||
VectorF32<Len, Packing> A,
|
||||
VectorF32<Len, Packing> B,
|
||||
// Public variadic surface — one name per op, arity locked to BatchSize.
|
||||
// The Pack helpers below carry the SIMD bodies and the per-(Len,Packing)
|
||||
// requires clauses; this wrapper just forwards once arity matches.
|
||||
// Batched Normalize entry point for VectorF32<Len, Packing>.
// Constraints (from the requires clause): every argument in `rest` must be
// exactly VectorF32<Len, Packing>, and the total argument count
// (first + rest) must equal VectorBase<Len, Packing, float>::BatchSize.
// Forwards all arguments, in order, to NormalizePack and returns its result.
template <typename... Rest>
|
||||
requires ((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
|
||||
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
|
||||
constexpr static auto Normalize(VectorF32<Len, Packing> first, Rest... rest) {
|
||||
return NormalizePack(first, rest...);
|
||||
}
|
||||
|
||||
// Batched Length entry point for VectorF32<Len, Packing>.
// Constraints (from the requires clause): every argument in `rest` must be
// exactly VectorF32<Len, Packing>, and the total argument count
// (first + rest) must equal VectorBase<Len, Packing, float>::BatchSize.
// Forwards all arguments, in order, to LengthPack and returns its result.
template <typename... Rest>
|
||||
requires ((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
|
||||
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
|
||||
constexpr static auto Length(VectorF32<Len, Packing> first, Rest... rest) {
|
||||
return LengthPack(first, rest...);
|
||||
}
|
||||
|
||||
// Batched squared-length entry point for VectorF32<Len, Packing>.
// Constraints (from the requires clause): every argument in `rest` must be
// exactly VectorF32<Len, Packing>, and the total argument count
// (first + rest) must equal VectorBase<Len, Packing, float>::BatchSize.
// Forwards all arguments, in order, to LengthSqPack and returns its result.
template <typename... Rest>
|
||||
requires ((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
|
||||
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
|
||||
constexpr static auto LengthSq(VectorF32<Len, Packing> first, Rest... rest) {
|
||||
return LengthSqPack(first, rest...);
|
||||
}
|
||||
|
||||
// Batched Dot entry point for VectorF32<Len, Packing>.
// Constraints (from the requires clause): every argument in `rest` must be
// exactly VectorF32<Len, Packing>, and the total argument count
// (first + rest) must equal 2 * VectorBase<Len, Packing, float>::BatchSize,
// i.e. two operands per batch slot.
// NOTE(review): operands are presumably consumed as consecutive (lhs, rhs)
// pairs, matching the A0, A1, B0, B1, ... parameter naming of DotPack — confirm.
// Forwards all arguments, in order, to DotPack and returns its result.
template <typename... Rest>
|
||||
requires ((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
|
||||
(1 + sizeof...(Rest) == 2 * VectorBase<Len, Packing, float>::BatchSize))
|
||||
constexpr static auto Dot(VectorF32<Len, Packing> first, Rest... rest) {
|
||||
return DotPack(first, rest...);
|
||||
}
|
||||
|
||||
private:
|
||||
constexpr static std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize> NormalizePack(
|
||||
VectorF32<Len, Packing> A,
|
||||
VectorF32<Len, Packing> B,
|
||||
VectorF32<Len, Packing> C,
|
||||
VectorF32<Len, Packing> D
|
||||
) requires(Len == 4 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
|
||||
|
|
@ -614,9 +646,9 @@ namespace Crafter {
|
|||
}
|
||||
}
|
||||
|
||||
constexpr static std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>, VectorF32<Len, Packing>, VectorF32<Len, Packing>> Normalize(
|
||||
VectorF32<Len, Packing> A,
|
||||
VectorF32<Len, Packing> B,
|
||||
constexpr static std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize> NormalizePack(
|
||||
VectorF32<Len, Packing> A,
|
||||
VectorF32<Len, Packing> B,
|
||||
VectorF32<Len, Packing> C,
|
||||
VectorF32<Len, Packing> D
|
||||
) requires(Len == 3 && Packing == 1) {
|
||||
|
|
@ -638,9 +670,9 @@ namespace Crafter {
|
|||
};
|
||||
}
|
||||
|
||||
constexpr static std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>, VectorF32<Len, Packing>, VectorF32<Len, Packing>> Normalize(
|
||||
VectorF32<Len, Packing> A,
|
||||
VectorF32<Len, Packing> B,
|
||||
constexpr static std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize> NormalizePack(
|
||||
VectorF32<Len, Packing> A,
|
||||
VectorF32<Len, Packing> B,
|
||||
VectorF32<Len, Packing> C,
|
||||
VectorF32<Len, Packing> D
|
||||
) requires(Len == 3 && Packing == 2) {
|
||||
|
|
@ -663,9 +695,9 @@ namespace Crafter {
|
|||
}
|
||||
|
||||
#ifdef __AVX512F__
|
||||
constexpr static std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>, VectorF32<Len, Packing>> Normalize(
|
||||
VectorF32<Len, Packing> A,
|
||||
VectorF32<Len, Packing> B,
|
||||
constexpr static std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize> NormalizePack(
|
||||
VectorF32<Len, Packing> A,
|
||||
VectorF32<Len, Packing> B,
|
||||
VectorF32<Len, Packing> C
|
||||
) requires(Len == 3 && Packing == 5) {
|
||||
VectorF32<1, 15> lenght = Length(A, B, C);
|
||||
|
|
@ -685,8 +717,8 @@ namespace Crafter {
|
|||
}
|
||||
#endif
|
||||
|
||||
constexpr static std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>> Normalize(
|
||||
VectorF32<Len, Packing> A,
|
||||
constexpr static std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize> NormalizePack(
|
||||
VectorF32<Len, Packing> A,
|
||||
VectorF32<Len, Packing> B
|
||||
) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
|
||||
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
|
||||
|
|
@ -733,13 +765,13 @@ namespace Crafter {
|
|||
}
|
||||
}
|
||||
|
||||
constexpr static VectorF32<1, Packing*4> Length(
|
||||
VectorF32<Len, Packing> A,
|
||||
constexpr static VectorF32<1, Packing*4> LengthPack(
|
||||
VectorF32<Len, Packing> A,
|
||||
VectorF32<Len, Packing> B,
|
||||
VectorF32<Len, Packing> C,
|
||||
VectorF32<Len, Packing> D
|
||||
) requires(Len == 4 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
|
||||
VectorF32<1, Packing*4> lenghtSq = LengthSq(A, B, C, D);
|
||||
VectorF32<1, Packing*4> lenghtSq = LengthSqPack(A, B, C, D);
|
||||
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
|
||||
return VectorF32<1, Packing*4>(_mm_sqrt_ps(lenghtSq.v));
|
||||
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
|
||||
|
|
@ -749,42 +781,42 @@ namespace Crafter {
|
|||
}
|
||||
}
|
||||
|
||||
constexpr static VectorF32<1, 4> Length(
|
||||
constexpr static VectorF32<1, 4> LengthPack(
|
||||
VectorF32<Len, Packing> A,
|
||||
VectorF32<Len, Packing> B,
|
||||
VectorF32<Len, Packing> C,
|
||||
VectorF32<Len, Packing> D
|
||||
) requires(Len == 3 && Packing == 1) {
|
||||
VectorF32<1, 4> lenghtSq = LengthSq(A, B, C, D);
|
||||
VectorF32<1, 4> lenghtSq = LengthSqPack(A, B, C, D);
|
||||
return VectorF32<1, 4>(_mm_sqrt_ps(lenghtSq.v));
|
||||
}
|
||||
|
||||
constexpr static VectorF32<1, 8> Length(
|
||||
constexpr static VectorF32<1, 8> LengthPack(
|
||||
VectorF32<Len, Packing> A,
|
||||
VectorF32<Len, Packing> B,
|
||||
VectorF32<Len, Packing> C,
|
||||
VectorF32<Len, Packing> D
|
||||
) requires(Len == 3 && Packing == 2) {
|
||||
VectorF32<1, 8> lenghtSq = LengthSq(A, B, C, D);
|
||||
VectorF32<1, 8> lenghtSq = LengthSqPack(A, B, C, D);
|
||||
return VectorF32<1, Packing*4>(_mm256_sqrt_ps(lenghtSq.v));
|
||||
}
|
||||
|
||||
#ifdef __AVX512F__
|
||||
constexpr static VectorF32<1, 15> Length(
|
||||
constexpr static VectorF32<1, 15> LengthPack(
|
||||
VectorF32<Len, Packing> A,
|
||||
VectorF32<Len, Packing> B,
|
||||
VectorF32<Len, Packing> C
|
||||
) requires(Len == 3 && Packing == 5) {
|
||||
VectorF32<1, 15> lenghtSq = LengthSq(A, B, C);
|
||||
VectorF32<1, 15> lenghtSq = LengthSqPack(A, B, C);
|
||||
return VectorF32<1, 15>(_mm512_sqrt_ps(lenghtSq.v));
|
||||
}
|
||||
#endif
|
||||
|
||||
constexpr static VectorF32<1, Packing*2> Length(
|
||||
VectorF32<Len, Packing> A,
|
||||
constexpr static VectorF32<1, Packing*2> LengthPack(
|
||||
VectorF32<Len, Packing> A,
|
||||
VectorF32<Len, Packing> C
|
||||
) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
|
||||
VectorF32<1, Packing*2> lenghtSq = LengthSq(A, C);
|
||||
VectorF32<1, Packing*2> lenghtSq = LengthSqPack(A, C);
|
||||
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
|
||||
return VectorF32<1, Packing*2>(_mm_sqrt_ps(lenghtSq.v));
|
||||
} else if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m256>) {
|
||||
|
|
@ -796,51 +828,51 @@ namespace Crafter {
|
|||
}
|
||||
}
|
||||
|
||||
constexpr static VectorF32<1, Packing*4> LengthSq(
|
||||
VectorF32<Len, Packing> A,
|
||||
constexpr static VectorF32<1, Packing*4> LengthSqPack(
|
||||
VectorF32<Len, Packing> A,
|
||||
VectorF32<Len, Packing> B,
|
||||
VectorF32<Len, Packing> C,
|
||||
VectorF32<Len, Packing> D
|
||||
) requires(Len == 4 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
|
||||
return Dot(A, A, B, B, C, C, D, D);
|
||||
return DotPack(A, A, B, B, C, C, D, D);
|
||||
}
|
||||
|
||||
constexpr static VectorF32<1, 4> LengthSq(
|
||||
constexpr static VectorF32<1, 4> LengthSqPack(
|
||||
VectorF32<Len, Packing> A,
|
||||
VectorF32<Len, Packing> B,
|
||||
VectorF32<Len, Packing> C,
|
||||
VectorF32<Len, Packing> D
|
||||
) requires(Len == 3 && Packing == 1) {
|
||||
return Dot(A, A, B, B, C, C, D, D);
|
||||
return DotPack(A, A, B, B, C, C, D, D);
|
||||
}
|
||||
|
||||
constexpr static VectorF32<1, 8> LengthSq(
|
||||
constexpr static VectorF32<1, 8> LengthSqPack(
|
||||
VectorF32<Len, Packing> A,
|
||||
VectorF32<Len, Packing> B,
|
||||
VectorF32<Len, Packing> C,
|
||||
VectorF32<Len, Packing> D
|
||||
) requires(Len == 3 && Packing == 2) {
|
||||
return Dot(A, A, B, B, C, C, D, D);
|
||||
return DotPack(A, A, B, B, C, C, D, D);
|
||||
}
|
||||
|
||||
#ifdef __AVX512F__
|
||||
constexpr static VectorF32<1, 15> LengthSq(
|
||||
constexpr static VectorF32<1, 15> LengthSqPack(
|
||||
VectorF32<Len, Packing> A,
|
||||
VectorF32<Len, Packing> B,
|
||||
VectorF32<Len, Packing> C
|
||||
) requires(Len == 3 && Packing == 5) {
|
||||
return Dot(A, A, B, B, C, C);
|
||||
return DotPack(A, A, B, B, C, C);
|
||||
}
|
||||
#endif
|
||||
|
||||
constexpr static VectorF32<1, Packing*2> LengthSq(
|
||||
VectorF32<Len, Packing> A,
|
||||
constexpr static VectorF32<1, Packing*2> LengthSqPack(
|
||||
VectorF32<Len, Packing> A,
|
||||
VectorF32<Len, Packing> C
|
||||
) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
|
||||
return Dot(A, A, C, C);
|
||||
return DotPack(A, A, C, C);
|
||||
}
|
||||
|
||||
constexpr static VectorF32<1, Packing*4> Dot(
|
||||
constexpr static VectorF32<1, Packing*4> DotPack(
|
||||
VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
|
||||
VectorF32<Len, Packing> B0, VectorF32<Len, Packing> B1,
|
||||
VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1,
|
||||
|
|
@ -869,7 +901,7 @@ namespace Crafter {
|
|||
}
|
||||
}
|
||||
|
||||
constexpr static VectorF32<1, 4> Dot(
|
||||
constexpr static VectorF32<1, 4> DotPack(
|
||||
VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
|
||||
VectorF32<Len, Packing> B0, VectorF32<Len, Packing> B1,
|
||||
VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1,
|
||||
|
|
@ -914,7 +946,7 @@ namespace Crafter {
|
|||
return row1;
|
||||
}
|
||||
|
||||
constexpr static VectorF32<1, 8> Dot(
|
||||
constexpr static VectorF32<1, 8> DotPack(
|
||||
VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
|
||||
VectorF32<Len, Packing> B0, VectorF32<Len, Packing> B1,
|
||||
VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1,
|
||||
|
|
@ -1021,7 +1053,7 @@ namespace Crafter {
|
|||
}
|
||||
|
||||
#ifdef __AVX512F__
|
||||
constexpr static VectorF32<1, 15> Dot(
|
||||
constexpr static VectorF32<1, 15> DotPack(
|
||||
VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
|
||||
VectorF32<Len, Packing> B0, VectorF32<Len, Packing> B1,
|
||||
VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1
|
||||
|
|
@ -1112,8 +1144,8 @@ namespace Crafter {
|
|||
}
|
||||
#endif
|
||||
|
||||
constexpr static VectorF32<1, Packing*2> Dot(
|
||||
VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
|
||||
constexpr static VectorF32<1, Packing*2> DotPack(
|
||||
VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
|
||||
VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1
|
||||
) requires(Len == 2 && Packing*Len == VectorBase<Len, Packing, float>::AlignmentElement) {
|
||||
if constexpr(std::is_same_v<typename VectorBase<Len, Packing, float>::VectorType, __m128>) {
|
||||
|
|
@ -1548,9 +1580,10 @@ namespace Crafter {
|
|||
}
|
||||
|
||||
template<typename... Rest>
|
||||
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
|
||||
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
|
||||
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
|
||||
constexpr static auto LengthSq(VectorF32<Len, Packing> first, Rest... rest) {
|
||||
constexpr std::uint8_t N = 1 + sizeof...(Rest);
|
||||
constexpr std::uint8_t N = VectorBase<Len, Packing, float>::BatchSize;
|
||||
VectorF32<1, static_cast<std::uint8_t>(Packing * N)> r;
|
||||
std::array<VectorF32<Len, Packing>, N> args{ first, rest... };
|
||||
alignas(16) float buf[4] = {0,0,0,0};
|
||||
|
|
@ -1571,41 +1604,39 @@ namespace Crafter {
|
|||
}
|
||||
|
||||
template<typename... Rest>
|
||||
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
|
||||
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
|
||||
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
|
||||
constexpr static auto Length(VectorF32<Len, Packing> first, Rest... rest) {
|
||||
auto sq = LengthSq(first, rest...);
|
||||
sq.v = wasm_f32x4_sqrt(sq.v);
|
||||
return sq;
|
||||
}
|
||||
|
||||
// Four pairwise dot products packed into one v128. Only the first Len
|
||||
// Pairwise dot products packed into one v128. Only the first Len
|
||||
// lanes contribute, so the same routine handles 3- and 4-component
|
||||
// inputs — the 4th lane of Len==3 inputs may be garbage from Cross()
|
||||
// and must not be summed.
|
||||
constexpr static VectorF32<1, 4> Dot(
|
||||
VectorF32<Len, Packing> A0, VectorF32<Len, Packing> A1,
|
||||
VectorF32<Len, Packing> B0, VectorF32<Len, Packing> B1,
|
||||
VectorF32<Len, Packing> C0, VectorF32<Len, Packing> C1,
|
||||
VectorF32<Len, Packing> D0, VectorF32<Len, Packing> D1
|
||||
) requires((Len == 3 || Len == 4) && Packing == 1) {
|
||||
alignas(16) float a0[4], a1[4], b0[4], b1[4], c0[4], c1[4], d0[4], d1[4];
|
||||
wasm_v128_store(a0, A0.v); wasm_v128_store(a1, A1.v);
|
||||
wasm_v128_store(b0, B0.v); wasm_v128_store(b1, B1.v);
|
||||
wasm_v128_store(c0, C0.v); wasm_v128_store(c1, C1.v);
|
||||
wasm_v128_store(d0, D0.v); wasm_v128_store(d1, D1.v);
|
||||
|
||||
// and must not be summed. Takes BatchSize pairs (== 4 here since
|
||||
// WASM AlignmentElement is always 4 and Packing must be 1).
|
||||
template<typename... Rest>
|
||||
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
|
||||
(1 + sizeof...(Rest) == 2 * VectorBase<Len, Packing, float>::BatchSize) &&
|
||||
(Len == 3 || Len == 4) && Packing == 1)
|
||||
constexpr static VectorF32<1, 4> Dot(VectorF32<Len, Packing> first, Rest... rest) {
|
||||
constexpr std::uint8_t N = VectorBase<Len, Packing, float>::BatchSize;
|
||||
std::array<VectorF32<Len, Packing>, 2 * N> args{ first, rest... };
|
||||
alignas(16) float out[4] = {0,0,0,0};
|
||||
for (std::uint8_t k = 0; k < Len; ++k) {
|
||||
out[0] += a0[k] * a1[k];
|
||||
out[1] += b0[k] * b1[k];
|
||||
out[2] += c0[k] * c1[k];
|
||||
out[3] += d0[k] * d1[k];
|
||||
for (std::uint8_t i = 0; i < N; ++i) {
|
||||
alignas(16) float a[4], b[4];
|
||||
wasm_v128_store(a, args[2 * i].v);
|
||||
wasm_v128_store(b, args[2 * i + 1].v);
|
||||
for (std::uint8_t k = 0; k < Len; ++k) out[i] += a[k] * b[k];
|
||||
}
|
||||
return VectorF32<1, 4>(wasm_v128_load(out));
|
||||
}
|
||||
|
||||
template<typename... Rest>
|
||||
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
|
||||
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
|
||||
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
|
||||
constexpr static auto Normalize(VectorF32<Len, Packing> first, Rest... rest) {
|
||||
auto normOne = [](VectorF32<Len, Packing> u) {
|
||||
alignas(16) float tmp[4]; wasm_v128_store(tmp, u.v);
|
||||
|
|
@ -1622,7 +1653,7 @@ namespace Crafter {
|
|||
}
|
||||
return VectorF32<Len, Packing>(wasm_v128_load(out));
|
||||
};
|
||||
return std::make_tuple(normOne(first), normOne(rest)...);
|
||||
return std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize>{ normOne(first), normOne(rest)... };
|
||||
}
|
||||
|
||||
constexpr static VectorF32<Len, Packing> Rotate(VectorF32<3, Packing> v, VectorF32<4, Packing> q) requires(Len == 3) {
|
||||
|
|
@ -1842,9 +1873,10 @@ namespace Crafter {
|
|||
}
|
||||
|
||||
template<typename... Rest>
|
||||
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
|
||||
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
|
||||
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
|
||||
constexpr static auto LengthSq(VectorF32<Len, Packing> first, Rest... rest) {
|
||||
constexpr std::uint8_t N = 1 + sizeof...(Rest);
|
||||
constexpr std::uint8_t N = VectorBase<Len, Packing, float>::BatchSize;
|
||||
VectorF32<1, static_cast<std::uint8_t>(Packing * N)> r;
|
||||
std::array<VectorF32<Len, Packing>, N> args{ first, rest... };
|
||||
for (std::uint8_t i = 0; i < N; ++i)
|
||||
|
|
@ -1860,7 +1892,8 @@ namespace Crafter {
|
|||
}
|
||||
|
||||
template<typename... Rest>
|
||||
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
|
||||
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
|
||||
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
|
||||
constexpr static auto Length(VectorF32<Len, Packing> first, Rest... rest) {
|
||||
auto sq = LengthSq(first, rest...);
|
||||
for (std::uint8_t i = 0; i < decltype(sq)::NElems; ++i) sq.v[i] = std::sqrt(sq.v[i]);
|
||||
|
|
@ -1868,7 +1901,8 @@ namespace Crafter {
|
|||
}
|
||||
|
||||
template<typename... Rest>
|
||||
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
|
||||
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
|
||||
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
|
||||
constexpr static auto Normalize(VectorF32<Len, Packing> first, Rest... rest) {
|
||||
auto normOne = [](VectorF32<Len, Packing> u) {
|
||||
VectorF32<Len, Packing> out;
|
||||
|
|
@ -1884,7 +1918,7 @@ namespace Crafter {
|
|||
}
|
||||
return out;
|
||||
};
|
||||
return std::make_tuple(normOne(first), normOne(rest)...);
|
||||
return std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize>{ normOne(first), normOne(rest)... };
|
||||
}
|
||||
|
||||
constexpr static VectorF32<Len, Packing> Rotate(VectorF32<3, Packing> v, VectorF32<4, Packing> q) requires(Len == 3) {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue