diff --git a/interfaces/Crafter.Math-Common.cppm b/interfaces/Crafter.Math-Common.cppm index f690cee..a730d53 100644 --- a/interfaces/Crafter.Math-Common.cppm +++ b/interfaces/Crafter.Math-Common.cppm @@ -87,6 +87,17 @@ namespace Crafter { static constexpr std::uint8_t AlignmentElement = GetAlingment()/sizeof(T); static constexpr std::uint8_t Alignment = GetAlingment(); + // Number of input vectors per batched Normalize/Dot/Length call that + // exactly fills the output register on the current (Len, Packing, ISA). + // Each input contributes `Packing` scalar results; an output register + // holds `AlignmentElement` lanes, so optimal arity = lanes / packing. + static constexpr std::uint8_t BatchSize = AlignmentElement / Packing; + // Largest Packing that still fits a single SIMD register for this + // (Len, T) on the current ISA. Independent of the current Packing + // dimension — meant for higher-level batching code that wants to + // process Packing sub-primitives at once (e.g. intersection tests). + // Falls back to 1 in the pathological case Len > MaxElement. + static constexpr std::uint8_t OptimalPacking = (MaxElement / Len) > 0 ? (MaxElement / Len) : 1; static_assert(Len * Packing <= MaxElement, "Len * Packing exceeds MaxElement"); protected: @@ -97,6 +108,22 @@ namespace Crafter { return arr; } + // True iff every per-Packing-slot shuffle (output, source) pair stays + // within the same PerLane chunk. shuffle_epi32 / shuffle_epi8 are + // applied per 128-bit lane, so any cross-lane move has to fall through + // to a cross-lane permute path instead. + template ShuffleValues> + static consteval bool LaneSafeShuffle() { + for (std::uint8_t p = 0; p < Packing; ++p) { + for (std::uint8_t i = 0; i < Len; ++i) { + std::uint8_t outIdx = static_cast(p * Len + i); + std::uint8_t srcIdx = static_cast(p * Len + ShuffleValues[i]); + if (outIdx / PerLane != srcIdx / PerLane) return false; + } + } + return true; + } + template ShuffleValues> static consteval bool CheckEpi32Shuffle() { if constexpr (PerLane == 8) { @@ -113,7 +140,7 @@ namespace Crafter { } } } - return true; + return LaneSafeShuffle(); } template ShuffleValues> @@ -124,7 +151,7 @@ namespace Crafter { return false; } } - return true; + return LaneSafeShuffle(); } template ShuffleValues> diff --git a/interfaces/Crafter.Math-Intersection.cppm b/interfaces/Crafter.Math-Intersection.cppm index 3c167ed..1d7411c 100755 --- a/interfaces/Crafter.Math-Intersection.cppm +++ b/interfaces/Crafter.Math-Intersection.cppm @@ -23,64 +23,141 @@ import :MatrixRowMajor; import std; namespace Crafter { - // All intersection tests are batched over four primitives at a time so they - // feed the VectorF32<3,1>::Dot / Cross / Length / Normalize four-pair - // overloads directly. The single-primitive case is just "pass the same - // primitive four times and read lane 0" - there is no single-vector - // fast-path because the SIMD pipelines want full lanes. + namespace detail { + // Splat a single Len-vector into all Packing slots of the wider type + // via a temporary float buffer. Performed once per intersection call; + // the inner SIMD loop dominates so the round-trip is in the noise. + template + inline VectorF32 SplatToPacking(VectorF32 v) { + alignas(64) float buf[VectorF32::AlignmentElement] = {}; + std::array::AlignmentElement> flat = v.template Store(); + for (std::uint8_t p = 0; p < Packing; ++p) { + for (std::uint8_t k = 0; k < Len; ++k) buf[p * Len + k] = flat[k]; + } + return VectorF32(buf); + } - // Möller-Trumbore against four triangles sharing one ray. Returns ray - // parameter t per triangle, or float max where the ray misses. - export inline VectorF32<1, 4> IntersectionTestRayTriangle( + // Interleave two arrays of size N=BatchSize into the 2*N positional + // argument list expected by the variadic Dot. Returns the packed + // VectorF32<1, Packing*BatchSize> with one dot product per slot. + template + inline auto DotArrays( + std::array, N> const& a, + std::array, N> const& b + ) { + return [&](std::index_sequence) { + std::array, 2 * N> flat; + ((flat[2 * Is] = a[Is], flat[2 * Is + 1] = b[Is]), ...); + return std::apply([](auto... args) { + return VectorF32::Dot(args...); + }, flat); + }(std::make_index_sequence{}); + } + + // Gather the `Component`-th lane of every sub-vector across an array + // of N packed VectorF32<3, Packing> into a flat VectorF32<1, Packing*N> + // with one scalar per pair. Used to materialize halfSize.x / .y / .z + // alongside per-pair scalar projections in a single SIMD register. + template + inline auto ExtractComponent( + std::array, VectorF32<3, Packing>::BatchSize> const& arr + ) { + constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize; + constexpr std::uint8_t Total = Packing * N; + using OutVec = VectorF32<1, Total>; + alignas(64) float buf[OutVec::AlignmentElement] = {}; + for (std::uint8_t b = 0; b < N; ++b) { + auto v = arr[b].template Store(); + for (std::uint8_t p = 0; p < Packing; ++p) { + buf[b * Packing + p] = v[p * 3 + Component]; + } + } + return OutVec(buf); + } + + // Lane-wise absolute value. Done via a flat float buffer because the + // F32 module does not expose a SIMD Abs primitive. Only called O(15) + // times per OBB-OBB call, so the round-trip is negligible compared to + // the dot-product work. + template + inline VectorF32<1, Total> AbsVec(VectorF32<1, Total> v) { + alignas(64) float buf[VectorF32<1, Total>::AlignmentElement]; + v.Store(buf); + for (std::uint8_t i = 0; i < Total; ++i) buf[i] = std::abs(buf[i]); + return VectorF32<1, Total>(buf); + } + } + + // Packed batch of Packing * BatchSize OBBs, each described by world-space + // origin, three orthonormal rotation axes (rows of the rotation matrix), + // and per-axis half-extents. Each std::array element packs `Packing` + // sub-OBBs; there are BatchSize such elements, so the struct holds + // Packing * BatchSize OBBs total. + // + // Callers that have OBBs as MatrixRowMajor + halfSize need to extract the + // three axes and the origin themselves — keeping the routines in terms of + // packed VectorF32<3, Packing> lets every SIMD op stay in registers. + export template ::OptimalPacking> + struct PackedOBBs { + static constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize; + static constexpr std::uint8_t Total = Packing * N; + + std::array, N> halfSize; + std::array, N> xAxis; + std::array, N> yAxis; + std::array, N> zAxis; + std::array, N> origin; + }; + + // All intersection tests are batched over Packing*BatchSize primitives at + // a time, where `Packing = VectorF32<3,1>::OptimalPacking` for the current + // ISA (5 on AVX-512, 2 on AVX2, 1 on SSE/WASM/scalar) and BatchSize is the + // arity that fills one output register. Callers form the packed input by + // laying out `Packing` sub-primitives consecutively per vertex slot, then + // assemble `BatchSize` such packed slots into the std::array argument. + // Result lane `i` corresponds to triangle/sphere/box index `i`. + + // Möller-Trumbore against Packing*BatchSize triangles sharing one ray. + // Returns ray parameter t per triangle, or float max where the ray misses. + export template ::OptimalPacking> + inline VectorF32<1, static_cast(Packing * VectorF32<3, Packing>::BatchSize)> + IntersectionTestRayTriangle( VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir, - VectorF32<3, 1> aV0, VectorF32<3, 1> aV1, VectorF32<3, 1> aV2, - VectorF32<3, 1> bV0, VectorF32<3, 1> bV1, VectorF32<3, 1> bV2, - VectorF32<3, 1> cV0, VectorF32<3, 1> cV1, VectorF32<3, 1> cV2, - VectorF32<3, 1> dV0, VectorF32<3, 1> dV1, VectorF32<3, 1> dV2 + std::array, VectorF32<3, Packing>::BatchSize> const& v0, + std::array, VectorF32<3, Packing>::BatchSize> const& v1, + std::array, VectorF32<3, Packing>::BatchSize> const& v2 ) { - VectorF32<3, 1> aE1 = aV1 - aV0; - VectorF32<3, 1> aE2 = aV2 - aV0; - VectorF32<3, 1> bE1 = bV1 - bV0; - VectorF32<3, 1> bE2 = bV2 - bV0; - VectorF32<3, 1> cE1 = cV1 - cV0; - VectorF32<3, 1> cE2 = cV2 - cV0; - VectorF32<3, 1> dE1 = dV1 - dV0; - VectorF32<3, 1> dE2 = dV2 - dV0; + constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize; + constexpr std::uint8_t Total = Packing * N; + using PVec = VectorF32<3, Packing>; - VectorF32<3, 1> aH = VectorF32<3, 1>::Cross(rayDir, aE2); - VectorF32<3, 1> bH = VectorF32<3, 1>::Cross(rayDir, bE2); - VectorF32<3, 1> cH = VectorF32<3, 1>::Cross(rayDir, cE2); - VectorF32<3, 1> dH = VectorF32<3, 1>::Cross(rayDir, dE2); + PVec rayOriginP = detail::SplatToPacking(rayOrigin); + PVec rayDirP = detail::SplatToPacking(rayDir); - VectorF32<3, 1> aS = rayOrigin - aV0; - VectorF32<3, 1> bS = rayOrigin - bV0; - VectorF32<3, 1> cS = rayOrigin - cV0; - VectorF32<3, 1> dS = rayOrigin - dV0; + std::array E1, E2, H, S, Q, rayDirArr; + for (std::uint8_t i = 0; i < N; ++i) { + E1[i] = v1[i] - v0[i]; + E2[i] = v2[i] - v0[i]; + H[i] = PVec::Cross(rayDirP, E2[i]); + S[i] = rayOriginP - v0[i]; + Q[i] = PVec::Cross(S[i], E1[i]); + rayDirArr[i] = rayDirP; + } - VectorF32<3, 1> aQ = VectorF32<3, 1>::Cross(aS, aE1); - VectorF32<3, 1> bQ = VectorF32<3, 1>::Cross(bS, bE1); - VectorF32<3, 1> cQ = VectorF32<3, 1>::Cross(cS, cE1); - VectorF32<3, 1> dQ = VectorF32<3, 1>::Cross(dS, dE1); + auto det = detail::DotArrays(E1, H); + auto uNum = detail::DotArrays(S, H); + auto vNum = detail::DotArrays(rayDirArr, Q); + auto tNum = detail::DotArrays(E2, Q); - // Four 3-component dots packed into one __m128 per call. - VectorF32<1, 4> det = VectorF32<3, 1>::Dot( - aE1, aH, bE1, bH, cE1, cH, dE1, dH); - VectorF32<1, 4> uNum = VectorF32<3, 1>::Dot( - aS, aH, bS, bH, cS, cH, dS, dH); - VectorF32<1, 4> vNum = VectorF32<3, 1>::Dot( - rayDir, aQ, rayDir, bQ, rayDir, cQ, rayDir, dQ); - VectorF32<1, 4> tNum = VectorF32<3, 1>::Dot( - aE2, aQ, bE2, bQ, cE2, cQ, dE2, dQ); - - std::array detArr = det.template Store(); - std::array uArr = uNum.template Store(); - std::array vArr = vNum.template Store(); - std::array tArr = tNum.template Store(); + auto detArr = det.template Store(); + auto uArr = uNum.template Store(); + auto vArr = vNum.template Store(); + auto tArr = tNum.template Store(); constexpr float eps = std::numeric_limits::epsilon(); constexpr float maxF = std::numeric_limits::max(); - alignas(16) std::array out{}; - for (std::uint8_t i = 0; i < 4; ++i) { + alignas(64) std::array::AlignmentElement> out{}; + for (std::uint8_t i = 0; i < Total; ++i) { float d = detArr[i]; if (d <= eps) { out[i] = maxF; continue; } float invD = 1.0f / d; @@ -90,115 +167,120 @@ namespace Crafter { if (v < 0.0f || u + v > 1.0f) { out[i] = maxF; continue; } out[i] = tArr[i] * invD; } - return VectorF32<1, 4>(out.data()); + return VectorF32<1, Total>(out.data()); } - // One ray against four spheres. radii must hold {rA, rB, rC, rD} in lanes - // 0..3. - export inline VectorF32<1, 4> IntersectionTestRaySphere( + // One ray against Packing*BatchSize spheres. `radii` holds one radius per + // sphere in lane order matching the result. + export template ::OptimalPacking> + inline VectorF32<1, static_cast(Packing * VectorF32<3, Packing>::BatchSize)> + IntersectionTestRaySphere( VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir, - VectorF32<3, 1> posA, VectorF32<3, 1> posB, - VectorF32<3, 1> posC, VectorF32<3, 1> posD, - VectorF32<1, 4> radii + std::array, VectorF32<3, Packing>::BatchSize> const& pos, + VectorF32<1, static_cast(Packing * VectorF32<3, Packing>::BatchSize)> radii ) { - VectorF32<3, 1> sA = rayOrigin - posA; - VectorF32<3, 1> sB = rayOrigin - posB; - VectorF32<3, 1> sC = rayOrigin - posC; - VectorF32<3, 1> sD = rayOrigin - posD; + constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize; + constexpr std::uint8_t Total = Packing * N; + using PVec = VectorF32<3, Packing>; + using OutVec = VectorF32<1, Total>; + + PVec rayOriginP = detail::SplatToPacking(rayOrigin); + PVec rayDirP = detail::SplatToPacking(rayDir); + + std::array s; + std::array rayDirArr; + for (std::uint8_t i = 0; i < N; ++i) { + s[i] = rayOriginP - pos[i]; + rayDirArr[i] = rayDirP; + } // dirDotS_i = rayDir · (rayOrigin - pos_i) - VectorF32<1, 4> dirDotS = VectorF32<3, 1>::Dot( - rayDir, sA, rayDir, sB, rayDir, sC, rayDir, sD); - // sqDist_i = |rayOrigin - pos_i|² (a.k.a. LengthSq of the s vectors) - VectorF32<1, 4> sqDist = VectorF32<3, 1>::LengthSq(sA, sB, sC, sD); - // aScalar = rayDir · rayDir, broadcast across four lanes. - VectorF32<1, 4> aScalar = VectorF32<3, 1>::LengthSq( - rayDir, rayDir, rayDir, rayDir); + auto dirDotS = detail::DotArrays(rayDirArr, s); + // sqDist_i = |rayOrigin - pos_i|² across all packed slots. + auto sqDist = std::apply([](auto... args) { return PVec::LengthSq(args...); }, s); + // aScalar = rayDir · rayDir, broadcast across every lane. + auto aScalar = std::apply([](auto... args) { return PVec::LengthSq(args...); }, rayDirArr); - VectorF32<1, 4> two(2.0f); - VectorF32<1, 4> four(4.0f); - VectorF32<1, 4> b = two * dirDotS; - VectorF32<1, 4> c = sqDist - radii * radii; + OutVec two(2.0f); + OutVec four(4.0f); + OutVec b = two * dirDotS; + OutVec c = sqDist - radii * radii; // discriminant = b² - 4·a·c - VectorF32<1, 4> disc = b * b - four * aScalar * c; + OutVec disc = b * b - four * aScalar * c; - std::array discArr = disc.template Store(); - std::array bArr = b.template Store(); - std::array aArr = aScalar.template Store(); + auto discArr = disc.template Store(); + auto bArr = b.template Store(); + auto aArr = aScalar.template Store(); constexpr float maxF = std::numeric_limits::max(); - alignas(16) std::array out{}; - for (std::uint8_t i = 0; i < 4; ++i) { + alignas(64) std::array out{}; + for (std::uint8_t i = 0; i < Total; ++i) { float d = discArr[i]; if (d < 0.0f) { out[i] = maxF; continue; } float sqrtD = std::sqrt(d); float t = -0.5f * (bArr[i] + sqrtD) / aArr[i]; out[i] = (t > 0.0f) ? t : maxF; } - return VectorF32<1, 4>(out.data()); + return OutVec(out.data()); } - // One ray against four OBBs. Each box is described by world-space position, - // half-extent vector (per-axis sizes), and a unit quaternion rotation. - export inline VectorF32<1, 4> IntersectionTestRayOrientedBox( + // Packing that fits both Len=3 (positions, sizes) and Len=4 (quaternions) + // in one SIMD register. Len=4's OptimalPacking is always ≤ Len=3's, so we + // use the smaller of the two so a single Packing covers every type the + // routine needs. + inline constexpr std::uint8_t RayOBBPacking = std::min( + VectorF32<3, 1>::OptimalPacking, VectorF32<4, 1>::OptimalPacking); + + // One ray against Packing*BatchSize OBBs. Each box is described by + // world-space position, full-extent size, and a unit quaternion rotation. + export template + inline VectorF32<1, static_cast(Packing * VectorF32<3, Packing>::BatchSize)> + IntersectionTestRayOrientedBox( VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir, - VectorF32<3, 1> posA, VectorF32<3, 1> sizeA, VectorF32<4, 1> rotA, - VectorF32<3, 1> posB, VectorF32<3, 1> sizeB, VectorF32<4, 1> rotB, - VectorF32<3, 1> posC, VectorF32<3, 1> sizeC, VectorF32<4, 1> rotC, - VectorF32<3, 1> posD, VectorF32<3, 1> sizeD, VectorF32<4, 1> rotD + std::array, VectorF32<3, Packing>::BatchSize> const& pos, + std::array, VectorF32<3, Packing>::BatchSize> const& size, + std::array, VectorF32<3, Packing>::BatchSize> const& rot ) { - // Conjugate quaternion: negate xyz, keep w. Negate<{true,true,true,false}> - // is constant-folded into a single XOR with a mask vector. - VectorF32<4, 1> invRotA = rotA.template Negate<{{true, true, true, false}}>(); - VectorF32<4, 1> invRotB = rotB.template Negate<{{true, true, true, false}}>(); - VectorF32<4, 1> invRotC = rotC.template Negate<{{true, true, true, false}}>(); - VectorF32<4, 1> invRotD = rotD.template Negate<{{true, true, true, false}}>(); + constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize; + constexpr std::uint8_t Total = Packing * N; + using PVec3 = VectorF32<3, Packing>; + using PVec4 = VectorF32<4, Packing>; + using OutVec = VectorF32<1, Total>; - VectorF32<3, 1> localOriginA = VectorF32<3, 1>::Rotate(rayOrigin - posA, invRotA); - VectorF32<3, 1> localOriginB = VectorF32<3, 1>::Rotate(rayOrigin - posB, invRotB); - VectorF32<3, 1> localOriginC = VectorF32<3, 1>::Rotate(rayOrigin - posC, invRotC); - VectorF32<3, 1> localOriginD = VectorF32<3, 1>::Rotate(rayOrigin - posD, invRotD); + PVec3 rayOriginP = detail::SplatToPacking(rayOrigin); + PVec3 rayDirP = detail::SplatToPacking(rayDir); - VectorF32<3, 1> localDirA = VectorF32<3, 1>::Rotate(rayDir, invRotA); - VectorF32<3, 1> localDirB = VectorF32<3, 1>::Rotate(rayDir, invRotB); - VectorF32<3, 1> localDirC = VectorF32<3, 1>::Rotate(rayDir, invRotC); - VectorF32<3, 1> localDirD = VectorF32<3, 1>::Rotate(rayDir, invRotD); + // Conjugate quaternion: negate xyz, keep w. Constant-folded into one + // XOR with a mask vector inside Negate. + std::array localOrigin, localDir, half; + for (std::uint8_t i = 0; i < N; ++i) { + PVec4 invRot = rot[i].template Negate<{{true, true, true, false}}>(); + localOrigin[i] = PVec3::Rotate(rayOriginP - pos[i], invRot); + localDir[i] = PVec3::Rotate(rayDirP, invRot); + half[i] = size[i] * 0.5f; + } - VectorF32<3, 1> halfA = sizeA * 0.5f; - VectorF32<3, 1> halfB = sizeB * 0.5f; - VectorF32<3, 1> halfC = sizeC * 0.5f; - VectorF32<3, 1> halfD = sizeD * 0.5f; - - std::array, 4> origLanes{ - localOriginA.template Store(), - localOriginB.template Store(), - localOriginC.template Store(), - localOriginD.template Store(), - }; - std::array, 4> dirLanes{ - localDirA.template Store(), - localDirB.template Store(), - localDirC.template Store(), - localDirD.template Store(), - }; - std::array, 4> halfLanes{ - halfA.template Store(), - halfB.template Store(), - halfC.template Store(), - halfD.template Store(), - }; + std::array, N> origLanes, dirLanes, halfLanes; + for (std::uint8_t i = 0; i < N; ++i) { + origLanes[i] = localOrigin[i].template Store(); + dirLanes[i] = localDir[i].template Store(); + halfLanes[i] = half[i].template Store(); + } constexpr float eps = std::numeric_limits::epsilon(); constexpr float maxF = std::numeric_limits::max(); - alignas(16) std::array out{}; - for (std::uint8_t b = 0; b < 4; ++b) { + alignas(64) std::array out{}; + for (std::uint8_t b = 0; b < Total; ++b) { + std::uint8_t batchIdx = b / Packing; + std::uint8_t subIdx = b % Packing; float tMin = 0.0f; float tMax = maxF; bool miss = false; for (std::uint8_t i = 0; i < 3; ++i) { - float d = dirLanes[b][i]; - float o = origLanes[b][i]; - float h = halfLanes[b][i]; + std::uint8_t lane = static_cast(subIdx * 3 + i); + float d = dirLanes[batchIdx][lane]; + float o = origLanes[batchIdx][lane]; + float h = halfLanes[batchIdx][lane]; if (std::abs(d) < eps) { if (o < -h || o > h) { miss = true; break; } } else { @@ -213,87 +295,65 @@ namespace Crafter { } out[b] = miss ? maxF : (tMin >= 0.0f ? tMin : tMax); } - return VectorF32<1, 4>(out.data()); + return OutVec(out.data()); } - // One sphere against four OBBs. boxMatrix encodes rotation in m[r][0..2] - // and translation in m[r][3]. - export inline VectorF32<1, 4> IntersectionTestSphereOrientedBox( - VectorF32<3, 1> sphereCenter, VectorF32<1, 4> radii, - VectorF32<3, 1> sizeA, MatrixRowMajor boxA, - VectorF32<3, 1> sizeB, MatrixRowMajor boxB, - VectorF32<3, 1> sizeC, MatrixRowMajor boxC, - VectorF32<3, 1> sizeD, MatrixRowMajor boxD + // One sphere against Packing*BatchSize OBBs described by a PackedOBBs. + // Returns 0.0 per pair where the sphere intersects the box, max-float + // otherwise. `radii` carries one sphere radius per pair in the same lane + // order as the resulting test output. + export template ::OptimalPacking> + inline VectorF32<1, static_cast(Packing * VectorF32<3, Packing>::BatchSize)> + IntersectionTestSphereOrientedBox( + VectorF32<3, 1> sphereCenter, + VectorF32<1, static_cast(Packing * VectorF32<3, Packing>::BatchSize)> radii, + PackedOBBs const& boxes ) { - auto perBox = [&](MatrixRowMajor const& m, - VectorF32<3, 1> const& size, - VectorF32<3, 1>& xAxis, - VectorF32<3, 1>& yAxis, - VectorF32<3, 1>& zAxis, - VectorF32<3, 1>& delta) { - // Existing semantics: the OBB axes are read from the rows of the - // upper 3x3 block, and the translation column is gathered from the - // w lane of each row. - std::array r0 = m.rows[0].template Store(); - std::array r1 = m.rows[1].template Store(); - std::array r2 = m.rows[2].template Store(); - alignas(16) float xBuf[4] = { r0[0], r0[1], r0[2], 0.0f }; - alignas(16) float yBuf[4] = { r1[0], r1[1], r1[2], 0.0f }; - alignas(16) float zBuf[4] = { r2[0], r2[1], r2[2], 0.0f }; - alignas(16) float oBuf[4] = { r0[3], r1[3], r2[3], 0.0f }; - xAxis = VectorF32<3, 1>(xBuf); - yAxis = VectorF32<3, 1>(yBuf); - zAxis = VectorF32<3, 1>(zBuf); - VectorF32<3, 1> origin(oBuf); - delta = sphereCenter - origin; - (void)size; - }; + constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize; + constexpr std::uint8_t Total = Packing * N; + using PVec3 = VectorF32<3, Packing>; + using OutVec = VectorF32<1, Total>; - VectorF32<3, 1> xA, yA, zA, dA; - VectorF32<3, 1> xB, yB, zB, dB; - VectorF32<3, 1> xC, yC, zC, dC; - VectorF32<3, 1> xD, yD, zD, dD; - perBox(boxA, sizeA, xA, yA, zA, dA); - perBox(boxB, sizeB, xB, yB, zB, dB); - perBox(boxC, sizeC, xC, yC, zC, dC); - perBox(boxD, sizeD, xD, yD, zD, dD); + PVec3 sphereCenterP = detail::SplatToPacking(sphereCenter); + std::array delta; + for (std::uint8_t i = 0; i < N; ++i) { + delta[i] = sphereCenterP - boxes.origin[i]; + } - // Local sphere center per box: project delta onto each box axis. We - // produce {lx, ly, lz, lx, ly, lz, lx, ly, lz, lx, ly, lz} as three - // packed 4-wide Dot results (one Dot per axis). - VectorF32<1, 4> locX = VectorF32<3, 1>::Dot( - dA, xA, dB, xB, dC, xC, dD, xD); - VectorF32<1, 4> locY = VectorF32<3, 1>::Dot( - dA, yA, dB, yB, dC, yC, dD, yD); - VectorF32<1, 4> locZ = VectorF32<3, 1>::Dot( - dA, zA, dB, zB, dC, zC, dD, zD); + // Project the world-space delta onto each box axis. + auto locX = detail::DotArrays(delta, boxes.xAxis); + auto locY = detail::DotArrays(delta, boxes.yAxis); + auto locZ = detail::DotArrays(delta, boxes.zAxis); - std::array lxArr = locX.template Store(); - std::array lyArr = locY.template Store(); - std::array lzArr = locZ.template Store(); - std::array rArr = radii.template Store(); - std::array, 4> sizeLanes{ - sizeA.template Store(), - sizeB.template Store(), - sizeC.template Store(), - sizeD.template Store(), - }; + auto lxArr = locX.template Store(); + auto lyArr = locY.template Store(); + auto lzArr = locZ.template Store(); + auto rArr = radii.template Store(); + std::array, N> sizeLanes; + for (std::uint8_t i = 0; i < N; ++i) { + sizeLanes[i] = boxes.halfSize[i].template Store(); + } - alignas(16) std::array out{}; - for (std::uint8_t i = 0; i < 4; ++i) { + constexpr float maxF = std::numeric_limits::max(); + alignas(64) std::array out{}; + for (std::uint8_t i = 0; i < Total; ++i) { + std::uint8_t batchIdx = i / Packing; + std::uint8_t subIdx = i % Packing; float lx = lxArr[i], ly = lyArr[i], lz = lzArr[i]; - float sx = sizeLanes[i][0], sy = sizeLanes[i][1], sz = sizeLanes[i][2]; + float sx = sizeLanes[batchIdx][subIdx * 3 + 0]; + float sy = sizeLanes[batchIdx][subIdx * 3 + 1]; + float sz = sizeLanes[batchIdx][subIdx * 3 + 2]; float cx = std::clamp(lx, -sx, sx); float cy = std::clamp(ly, -sy, sy); float cz = std::clamp(lz, -sz, sz); float dx = lx - cx, dy = ly - cy, dz = lz - cz; float distSq = dx * dx + dy * dy + dz * dz; float r = rArr[i]; - // Returns 0.0 on hit, max on miss - keeps a consistent - // "t-like" output signature with the other intersection tests. - out[i] = (distSq <= r * r) ? 0.0f : std::numeric_limits::max(); + // Returns 0.0 on hit, max on miss — same "t-like" output signature + // as the ray-vs-X tests. + out[i] = (distSq <= r * r) ? 0.0f : maxF; } - return VectorF32<1, 4>(out.data()); + return OutVec(out.data()); } // Eight local corners of a unit OBB transformed by `matrix`. Uses one @@ -350,100 +410,104 @@ namespace Crafter { return result; } - // SAT against fifteen separating axes (3 box-A, 3 box-B, 9 cross products). - // We compute every corner projection with batched 4-pair Dots: each axis - // projects four corners per call, two calls per axis covers the 8 corners. - export inline bool IntersectionTestOrientedBoxOrientedBox( - VectorF32<3, 1> sizeA, MatrixRowMajor boxA, - VectorF32<3, 1> sizeB, MatrixRowMajor boxB + // SAT against the 15 separating axis candidates (3 from box A, 3 from + // box B, 9 cross products). Returns 0.0 per pair when the boxes overlap + // and max-float when a separating axis was found, matching the + // "smaller-is-closer" convention of the ray-vs-X tests. + // + // The corner-free formulation: for an OBB (origin O, unit axes X/Y/Z, + // half-extents h) and a separating-axis candidate a, the projection + // interval is centered at O·a with radius hx|X·a| + hy|Y·a| + hz|Z·a|. + // Each axis therefore only needs four dot products per box (and a couple + // of fused-multiply-adds) instead of eight corner projections — every + // sub-pair runs in parallel inside the SIMD lanes. + export template ::OptimalPacking> + inline VectorF32<1, static_cast(Packing * VectorF32<3, Packing>::BatchSize)> + IntersectionTestOrientedBoxOrientedBox( + PackedOBBs const& a, PackedOBBs const& b ) { - std::array, 8> cornersA = GetOBBCorners(sizeA, boxA); - std::array, 8> cornersB = GetOBBCorners(sizeB, boxB); + using PVec = VectorF32<3, Packing>; + constexpr std::uint8_t N = PVec::BatchSize; + constexpr std::uint8_t Total = Packing * N; + using OutVec = VectorF32<1, Total>; - // Axes are the upper-3 lanes of each matrix row (same convention as - // SphereOrientedBox). ExtractLo<3> just retypes the SIMD register; the - // 4th lane is ignored by the Len=3 ops below. - std::array, 3> axesA = { - boxA.rows[0].template ExtractLo<3>(), - boxA.rows[1].template ExtractLo<3>(), - boxA.rows[2].template ExtractLo<3>(), + // Per-pair half-extents pulled out of each PackedOBBs into flat + // VectorF32<1, Total> registers so they can multiply the projection + // dots directly. + OutVec halfA_x = detail::ExtractComponent<0, Packing>(a.halfSize); + OutVec halfA_y = detail::ExtractComponent<1, Packing>(a.halfSize); + OutVec halfA_z = detail::ExtractComponent<2, Packing>(a.halfSize); + OutVec halfB_x = detail::ExtractComponent<0, Packing>(b.halfSize); + OutVec halfB_y = detail::ExtractComponent<1, Packing>(b.halfSize); + OutVec halfB_z = detail::ExtractComponent<2, Packing>(b.halfSize); + + constexpr float maxF = std::numeric_limits::max(); + alignas(64) std::array out{}; + for (std::uint8_t i = 0; i < Total; ++i) out[i] = 0.0f; // start: overlap + + auto axesOfA = [&](std::uint8_t i) -> std::array const& { + return (i == 0) ? a.xAxis : (i == 1) ? a.yAxis : a.zAxis; }; - std::array, 3> axesB = { - boxB.rows[0].template ExtractLo<3>(), - boxB.rows[1].template ExtractLo<3>(), - boxB.rows[2].template ExtractLo<3>(), + auto axesOfB = [&](std::uint8_t i) -> std::array const& { + return (i == 0) ? b.xAxis : (i == 1) ? b.yAxis : b.zAxis; }; - std::array, 15> axes{}; - axes[0] = axesA[0]; axes[1] = axesA[1]; axes[2] = axesA[2]; - axes[3] = axesB[0]; axes[4] = axesB[1]; axes[5] = axesB[2]; - // Normalize all nine cross axes together with a single batched - // Normalize call (Packing=3 not in the API, so two calls of four + - // one of one would be needed; for now just normalize in two batches - // of four and the trailing one inline). - std::array, 9> crossAxes{}; - std::uint8_t k = 0; + // For each separating-axis candidate, compute per-pair min/max for + // both boxes and OR the "separating" condition into `out`. + auto checkAxis = [&](std::array const& axis) { + OutVec cA = detail::DotArrays(a.origin, axis); + OutVec dA_x = detail::DotArrays(a.xAxis, axis); + OutVec dA_y = detail::DotArrays(a.yAxis, axis); + OutVec dA_z = detail::DotArrays(a.zAxis, axis); + OutVec rA = halfA_x * detail::AbsVec(dA_x) + + halfA_y * detail::AbsVec(dA_y) + + halfA_z * detail::AbsVec(dA_z); + + OutVec cB = detail::DotArrays(b.origin, axis); + OutVec dB_x = detail::DotArrays(b.xAxis, axis); + OutVec dB_y = detail::DotArrays(b.yAxis, axis); + OutVec dB_z = detail::DotArrays(b.zAxis, axis); + OutVec rB = halfB_x * detail::AbsVec(dB_x) + + halfB_y * detail::AbsVec(dB_y) + + halfB_z * detail::AbsVec(dB_z); + + OutVec minA = cA - rA; + OutVec maxA = cA + rA; + OutVec minB = cB - rB; + OutVec maxB = cB + rB; + + auto minAArr = minA.template Store(); + auto maxAArr = maxA.template Store(); + auto minBArr = minB.template Store(); + auto maxBArr = maxB.template Store(); + for (std::uint8_t i = 0; i < Total; ++i) { + // NaN comparisons (from degenerate cross axes) return false and + // correctly leave `out[i]` untouched on this axis. + if (maxAArr[i] < minBArr[i] || maxBArr[i] < minAArr[i]) { + out[i] = maxF; + } + } + }; + + checkAxis(a.xAxis); checkAxis(a.yAxis); checkAxis(a.zAxis); + checkAxis(b.xAxis); checkAxis(b.yAxis); checkAxis(b.zAxis); + + // The 9 cross-product axes. Each batch slot's cross axes are computed + // per-slot, then normalized together (one PVec::Normalize per cross + // index processes N packed slots in parallel). for (std::uint8_t i = 0; i < 3; ++i) { + auto const& aAx = axesOfA(i); for (std::uint8_t j = 0; j < 3; ++j) { - crossAxes[k++] = VectorF32<3, 1>::Cross(axesA[i], axesB[j]); + auto const& bAx = axesOfB(j); + std::array crossAx; + for (std::uint8_t k = 0; k < N; ++k) crossAx[k] = PVec::Cross(aAx[k], bAx[k]); + auto normalized = std::apply([](auto... args) { + return PVec::Normalize(args...); + }, crossAx); + checkAxis(normalized); } } - auto norm0 = VectorF32<3, 1>::Normalize(crossAxes[0], crossAxes[1], crossAxes[2], crossAxes[3]); - auto norm1 = VectorF32<3, 1>::Normalize(crossAxes[4], crossAxes[5], crossAxes[6], crossAxes[7]); - auto norm2 = VectorF32<3, 1>::Normalize(crossAxes[8], crossAxes[8], crossAxes[8], crossAxes[8]); - axes[6] = std::get<0>(norm0); - axes[7] = std::get<1>(norm0); - axes[8] = std::get<2>(norm0); - axes[9] = std::get<3>(norm0); - axes[10] = std::get<0>(norm1); - axes[11] = std::get<1>(norm1); - axes[12] = std::get<2>(norm1); - axes[13] = std::get<3>(norm1); - axes[14] = std::get<0>(norm2); - for (std::uint8_t axisIdx = 0; axisIdx < 15; ++axisIdx) { - VectorF32<3, 1> axis = axes[axisIdx]; - // Project all 8 corners of each box onto `axis` using two batched - // 4-pair Dot calls (lo and hi corners). - VectorF32<1, 4> projA_lo = VectorF32<3, 1>::Dot( - cornersA[0], axis, cornersA[1], axis, - cornersA[2], axis, cornersA[3], axis); - VectorF32<1, 4> projA_hi = VectorF32<3, 1>::Dot( - cornersA[4], axis, cornersA[5], axis, - cornersA[6], axis, cornersA[7], axis); - VectorF32<1, 4> projB_lo = VectorF32<3, 1>::Dot( - cornersB[0], axis, cornersB[1], axis, - cornersB[2], axis, cornersB[3], axis); - VectorF32<1, 4> projB_hi = VectorF32<3, 1>::Dot( - cornersB[4], axis, cornersB[5], axis, - cornersB[6], axis, cornersB[7], axis); - - std::array aLo = projA_lo.template Store(); - std::array aHi = projA_hi.template Store(); - std::array bLo = projB_lo.template Store(); - std::array bHi = projB_hi.template Store(); - - float minA = aLo[0], maxA = aLo[0]; - for (std::uint8_t i = 1; i < 4; ++i) { - minA = std::min(minA, aLo[i]); - maxA = std::max(maxA, aLo[i]); - } - for (std::uint8_t i = 0; i < 4; ++i) { - minA = std::min(minA, aHi[i]); - maxA = std::max(maxA, aHi[i]); - } - float minB = bLo[0], maxB = bLo[0]; - for (std::uint8_t i = 1; i < 4; ++i) { - minB = std::min(minB, bLo[i]); - maxB = std::max(maxB, bLo[i]); - } - for (std::uint8_t i = 0; i < 4; ++i) { - minB = std::min(minB, bHi[i]); - maxB = std::max(maxB, bHi[i]); - } - - if (maxA < minB || maxB < minA) return false; - } - return true; + return OutVec(out.data()); } } diff --git a/interfaces/Crafter.Math-MatrixRowMajor.cppm b/interfaces/Crafter.Math-MatrixRowMajor.cppm index 3e56369..f8eb13d 100755 --- a/interfaces/Crafter.Math-MatrixRowMajor.cppm +++ b/interfaces/Crafter.Math-MatrixRowMajor.cppm @@ -270,12 +270,12 @@ namespace Crafter { // R0 = up × R2 is linear in R2, so its normalized direction does // not depend on whether we hand R2 in before or after its own // normalize. Computing R0_raw from the un-normalized R2 lets us - // satisfy the 4-input Normalize requirement with one batched call - // (duplicating R2 and R0 in the padding slots). + // satisfy the VectorF32<3,1>::BatchSize Normalize requirement with + // one batched call (duplicating R2 and R0 in the padding slots). VectorF32<3, 1> R0Raw = VectorF32<3, 1>::Cross(upDirection, eyeDirection); auto normalized = VectorF32<3, 1>::Normalize(eyeDirection, R0Raw, eyeDirection, R0Raw); - VectorF32<3, 1> R2 = std::get<0>(normalized); - VectorF32<3, 1> R0 = std::get<1>(normalized); + VectorF32<3, 1> R2 = normalized[0]; + VectorF32<3, 1> R0 = normalized[1]; VectorF32<3, 1> R1 = VectorF32<3, 1>::Cross(R2, R0); VectorF32<3, 1> negEye = -eyePosition; diff --git a/interfaces/Crafter.Math-VectorF16.cppm b/interfaces/Crafter.Math-VectorF16.cppm index d893f62..032c788 100755 --- a/interfaces/Crafter.Math-VectorF16.cppm +++ b/interfaces/Crafter.Math-VectorF16.cppm @@ -554,9 +554,41 @@ namespace Crafter { } } - constexpr static std::tuple, VectorF16, VectorF16, VectorF16> Normalize( - VectorF16 A, - VectorF16 C, + // Public variadic surface — one name per op, arity locked to BatchSize + // (or 2*BatchSize for Dot). Forwards to the *Pack helpers below which + // carry the SIMD bodies and per-(Len,Packing) requires clauses. + template + requires ((std::is_same_v> && ...) && + (1 + sizeof...(Rest) == VectorBase::BatchSize)) + constexpr static auto Normalize(VectorF16 first, Rest... rest) { + return NormalizePack(first, rest...); + } + + template + requires ((std::is_same_v> && ...) && + (1 + sizeof...(Rest) == VectorBase::BatchSize)) + constexpr static auto Length(VectorF16 first, Rest... rest) { + return LengthPack(first, rest...); + } + + template + requires ((std::is_same_v> && ...) && + (1 + sizeof...(Rest) == VectorBase::BatchSize)) + constexpr static auto LengthSq(VectorF16 first, Rest... rest) { + return LengthSqPack(first, rest...); + } + + template + requires ((std::is_same_v> && ...) && + (1 + sizeof...(Rest) == 2 * VectorBase::BatchSize)) + constexpr static auto Dot(VectorF16 first, Rest... rest) { + return DotPack(first, rest...); + } + + private: + constexpr static std::array, VectorBase::BatchSize> NormalizePack( + VectorF16 A, + VectorF16 C, VectorF16 E, VectorF16 G ) requires(Len == 4 && Packing*Len == VectorBase::AlignmentElement) { @@ -616,8 +648,8 @@ namespace Crafter { } } - constexpr static std::tuple, VectorF16> Normalize( - VectorF16 A, + constexpr static std::array, VectorBase::BatchSize> NormalizePack( + VectorF16 A, VectorF16 E ) requires(Len == 2 && Packing*Len == VectorBase::AlignmentElement) { if constexpr(std::is_same_v::VectorType, __m128h>) { @@ -662,13 +694,13 @@ namespace Crafter { } } - constexpr static VectorF16<1, Packing*4> Length( - VectorF16 A, + constexpr static VectorF16<1, Packing*4> LengthPack( + VectorF16 A, VectorF16 C, VectorF16 E, VectorF16 G ) requires(Len == 4 && Packing*Len == VectorBase::AlignmentElement) { - VectorF16<1, Packing*4> lenghtSq = LengthSq(A, C, E, G); + VectorF16<1, Packing*4> lenghtSq = LengthSqPack(A, C, E, G); if constexpr(std::is_same_v::VectorType, __m128h>) { return VectorF16<1, Packing*4>(_mm_sqrt_ph(lenghtSq.v)); } else if constexpr(std::is_same_v::VectorType, __m256h>) { @@ -678,11 +710,11 @@ namespace Crafter { } } - constexpr static VectorF16<1, Packing*2> Length( - VectorF16 A, + constexpr static VectorF16<1, Packing*2> LengthPack( + VectorF16 A, VectorF16 E ) requires(Len == 2 && Packing*Len == VectorBase::AlignmentElement) { - VectorF16<1, Packing*2> lenghtSq = LengthSq(A, E); + VectorF16<1, Packing*2> lenghtSq = LengthSqPack(A, E); if constexpr(std::is_same_v::VectorType, __m128h>) { return VectorF16<1, Packing*2>(_mm_sqrt_ph(lenghtSq.v)); } else if constexpr(std::is_same_v::VectorType, __m256h>) { @@ -692,23 +724,23 @@ namespace Crafter { } } - constexpr static VectorF16<1, Packing*4> LengthSq( - VectorF16 A, + constexpr static VectorF16<1, Packing*4> LengthSqPack( + VectorF16 A, VectorF16 C, VectorF16 E, VectorF16 G ) requires(Len == 4 && Packing*Len == VectorBase::AlignmentElement) { - return Dot(A, A, C, C, E, E, G, G); + return DotPack(A, A, C, C, E, E, G, G); } - constexpr static VectorF16<1, Packing*2> LengthSq( - VectorF16 A, + constexpr static VectorF16<1, Packing*2> LengthSqPack( + VectorF16 A, VectorF16 E ) requires(Len == 2 && Packing*Len == VectorBase::AlignmentElement) { - return Dot(A, A, E, E); + return DotPack(A, A, E, E); } - constexpr static VectorF16<1, Packing*4> Dot( + constexpr static VectorF16<1, Packing*4> DotPack( VectorF16 A0, VectorF16 A1, VectorF16 C0, VectorF16 C1, VectorF16 E0, VectorF16 E1, @@ -744,7 +776,7 @@ namespace Crafter { } } - constexpr static VectorF16<1, Packing*2> Dot( + constexpr static VectorF16<1, Packing*2> DotPack( VectorF16 A0, VectorF16 A1, VectorF16 E0, VectorF16 E1 ) requires(Len == 2 && Packing*Len == VectorBase::AlignmentElement) { @@ -1200,9 +1232,10 @@ namespace Crafter { } template - requires((std::is_same_v> && ...)) + requires((std::is_same_v> && ...) && + (1 + sizeof...(Rest) == VectorBase::BatchSize)) constexpr static auto LengthSq(VectorF16 first, Rest... rest) { - constexpr std::uint8_t N = 1 + sizeof...(Rest); + constexpr std::uint8_t N = VectorBase::BatchSize; VectorF16<1, static_cast(Packing * N)> r; std::array, N> args{ first, rest... }; for (std::uint8_t i = 0; i < N; ++i) @@ -1218,7 +1251,8 @@ namespace Crafter { } template - requires((std::is_same_v> && ...)) + requires((std::is_same_v> && ...) && + (1 + sizeof...(Rest) == VectorBase::BatchSize)) constexpr static auto Length(VectorF16 first, Rest... rest) { auto sq = LengthSq(first, rest...); for (std::uint8_t i = 0; i < decltype(sq)::NElems; ++i) @@ -1227,7 +1261,8 @@ namespace Crafter { } template - requires((std::is_same_v> && ...)) + requires((std::is_same_v> && ...) && + (1 + sizeof...(Rest) == VectorBase::BatchSize)) constexpr static auto Normalize(VectorF16 first, Rest... rest) { auto normOne = [](VectorF16 u) { VectorF16 out; @@ -1243,7 +1278,7 @@ namespace Crafter { } return out; }; - return std::make_tuple(normOne(first), normOne(rest)...); + return std::array, VectorBase::BatchSize>{ normOne(first), normOne(rest)... }; } constexpr static VectorF16 Rotate(VectorF16<3, Packing> v, VectorF16<4, Packing> q) requires(Len == 3) { diff --git a/interfaces/Crafter.Math-VectorF32.cppm b/interfaces/Crafter.Math-VectorF32.cppm index 6582d94..1b40c59 100755 --- a/interfaces/Crafter.Math-VectorF32.cppm +++ b/interfaces/Crafter.Math-VectorF32.cppm @@ -449,8 +449,8 @@ namespace Crafter { } template values> - constexpr VectorF32 Negate() { - std::array::AlignmentElement> mask = VectorBase::template GetNegateMask(); + constexpr VectorF32 Negate() const { + std::array::AlignmentElement> mask = VectorBase::template GetNegateMask(); if constexpr(std::is_same_v::VectorType, __m128>) { return VectorF32(_mm_castsi128_ps(_mm_xor_si128(_mm_castps_si128(this->v), _mm_loadu_si128(reinterpret_cast<__m128i*>(mask.data()))))); } else if constexpr(std::is_same_v::VectorType, __m256>) { @@ -549,9 +549,41 @@ namespace Crafter { } } - constexpr static std::tuple, VectorF32, VectorF32, VectorF32> Normalize( - VectorF32 A, - VectorF32 B, + // Public variadic surface — one name per op, arity locked to BatchSize. + // The Pack helpers below carry the SIMD bodies and the per-(Len,Packing) + // requires clauses; this wrapper just forwards once arity matches. + template + requires ((std::is_same_v> && ...) && + (1 + sizeof...(Rest) == VectorBase::BatchSize)) + constexpr static auto Normalize(VectorF32 first, Rest... rest) { + return NormalizePack(first, rest...); + } + + template + requires ((std::is_same_v> && ...) && + (1 + sizeof...(Rest) == VectorBase::BatchSize)) + constexpr static auto Length(VectorF32 first, Rest... rest) { + return LengthPack(first, rest...); + } + + template + requires ((std::is_same_v> && ...) && + (1 + sizeof...(Rest) == VectorBase::BatchSize)) + constexpr static auto LengthSq(VectorF32 first, Rest... rest) { + return LengthSqPack(first, rest...); + } + + template + requires ((std::is_same_v> && ...) && + (1 + sizeof...(Rest) == 2 * VectorBase::BatchSize)) + constexpr static auto Dot(VectorF32 first, Rest... rest) { + return DotPack(first, rest...); + } + + private: + constexpr static std::array, VectorBase::BatchSize> NormalizePack( + VectorF32 A, + VectorF32 B, VectorF32 C, VectorF32 D ) requires(Len == 4 && Packing*Len == VectorBase::AlignmentElement) { @@ -614,9 +646,9 @@ namespace Crafter { } } - constexpr static std::tuple, VectorF32, VectorF32, VectorF32> Normalize( - VectorF32 A, - VectorF32 B, + constexpr static std::array, VectorBase::BatchSize> NormalizePack( + VectorF32 A, + VectorF32 B, VectorF32 C, VectorF32 D ) requires(Len == 3 && Packing == 1) { @@ -638,9 +670,9 @@ namespace Crafter { }; } - constexpr static std::tuple, VectorF32, VectorF32, VectorF32> Normalize( - VectorF32 A, - VectorF32 B, + constexpr static std::array, VectorBase::BatchSize> NormalizePack( + VectorF32 A, + VectorF32 B, VectorF32 C, VectorF32 D ) requires(Len == 3 && Packing == 2) { @@ -663,9 +695,9 @@ namespace Crafter { } #ifdef __AVX512F__ - constexpr static std::tuple, VectorF32, VectorF32> Normalize( - VectorF32 A, - VectorF32 B, + constexpr static std::array, VectorBase::BatchSize> NormalizePack( + VectorF32 A, + VectorF32 B, VectorF32 C ) requires(Len == 3 && Packing == 5) { VectorF32<1, 15> lenght = Length(A, B, C); @@ -685,8 +717,8 @@ namespace Crafter { } #endif - constexpr static std::tuple, VectorF32> Normalize( - VectorF32 A, + constexpr static std::array, VectorBase::BatchSize> NormalizePack( + VectorF32 A, VectorF32 B ) requires(Len == 2 && Packing*Len == VectorBase::AlignmentElement) { if constexpr(std::is_same_v::VectorType, __m128>) { @@ -733,13 +765,13 @@ namespace Crafter { } } - constexpr static VectorF32<1, Packing*4> Length( - VectorF32 A, + constexpr static VectorF32<1, Packing*4> LengthPack( + VectorF32 A, VectorF32 B, VectorF32 C, VectorF32 D ) requires(Len == 4 && Packing*Len == VectorBase::AlignmentElement) { - VectorF32<1, Packing*4> lenghtSq = LengthSq(A, B, C, D); + VectorF32<1, Packing*4> lenghtSq = LengthSqPack(A, B, C, D); if constexpr(std::is_same_v::VectorType, __m128>) { return VectorF32<1, Packing*4>(_mm_sqrt_ps(lenghtSq.v)); } else if constexpr(std::is_same_v::VectorType, __m256>) { @@ -749,42 +781,42 @@ namespace Crafter { } } - constexpr static VectorF32<1, 4> Length( + constexpr static VectorF32<1, 4> LengthPack( VectorF32 A, VectorF32 B, VectorF32 C, VectorF32 D ) requires(Len == 3 && Packing == 1) { - VectorF32<1, 4> lenghtSq = LengthSq(A, B, C, D); + VectorF32<1, 4> lenghtSq = LengthSqPack(A, B, C, D); return VectorF32<1, 4>(_mm_sqrt_ps(lenghtSq.v)); } - constexpr static VectorF32<1, 8> Length( + constexpr static VectorF32<1, 8> LengthPack( VectorF32 A, VectorF32 B, VectorF32 C, VectorF32 D ) requires(Len == 3 && Packing == 2) { - VectorF32<1, 8> lenghtSq = LengthSq(A, B, C, D); + VectorF32<1, 8> lenghtSq = LengthSqPack(A, B, C, D); return VectorF32<1, Packing*4>(_mm256_sqrt_ps(lenghtSq.v)); } #ifdef __AVX512F__ - constexpr static VectorF32<1, 15> Length( + constexpr static VectorF32<1, 15> LengthPack( VectorF32 A, VectorF32 B, VectorF32 C ) requires(Len == 3 && Packing == 5) { - VectorF32<1, 15> lenghtSq = LengthSq(A, B, C); + VectorF32<1, 15> lenghtSq = LengthSqPack(A, B, C); return VectorF32<1, 15>(_mm512_sqrt_ps(lenghtSq.v)); } #endif - constexpr static VectorF32<1, Packing*2> Length( - VectorF32 A, + constexpr static VectorF32<1, Packing*2> LengthPack( + VectorF32 A, VectorF32 C ) requires(Len == 2 && Packing*Len == VectorBase::AlignmentElement) { - VectorF32<1, Packing*2> lenghtSq = LengthSq(A, C); + VectorF32<1, Packing*2> lenghtSq = LengthSqPack(A, C); if constexpr(std::is_same_v::VectorType, __m128>) { return VectorF32<1, Packing*2>(_mm_sqrt_ps(lenghtSq.v)); } else if constexpr(std::is_same_v::VectorType, __m256>) { @@ -796,51 +828,51 @@ namespace Crafter { } } - constexpr static VectorF32<1, Packing*4> LengthSq( - VectorF32 A, + constexpr static VectorF32<1, Packing*4> LengthSqPack( + VectorF32 A, VectorF32 B, VectorF32 C, VectorF32 D ) requires(Len == 4 && Packing*Len == VectorBase::AlignmentElement) { - return Dot(A, A, B, B, C, C, D, D); + return DotPack(A, A, B, B, C, C, D, D); } - constexpr static VectorF32<1, 4> LengthSq( + constexpr static VectorF32<1, 4> LengthSqPack( VectorF32 A, VectorF32 B, VectorF32 C, VectorF32 D ) requires(Len == 3 && Packing == 1) { - return Dot(A, A, B, B, C, C, D, D); + return DotPack(A, A, B, B, C, C, D, D); } - constexpr static VectorF32<1, 8> LengthSq( + constexpr static VectorF32<1, 8> LengthSqPack( VectorF32 A, VectorF32 B, VectorF32 C, VectorF32 D ) requires(Len == 3 && Packing == 2) { - return Dot(A, A, B, B, C, C, D, D); + return DotPack(A, A, B, B, C, C, D, D); } #ifdef __AVX512F__ - constexpr static VectorF32<1, 15> LengthSq( + constexpr static VectorF32<1, 15> LengthSqPack( VectorF32 A, VectorF32 B, VectorF32 C ) requires(Len == 3 && Packing == 5) { - return Dot(A, A, B, B, C, C); + return DotPack(A, A, B, B, C, C); } #endif - constexpr static VectorF32<1, Packing*2> LengthSq( - VectorF32 A, + constexpr static VectorF32<1, Packing*2> LengthSqPack( + VectorF32 A, VectorF32 C ) requires(Len == 2 && Packing*Len == VectorBase::AlignmentElement) { - return Dot(A, A, C, C); + return DotPack(A, A, C, C); } - constexpr static VectorF32<1, Packing*4> Dot( + constexpr static VectorF32<1, Packing*4> DotPack( VectorF32 A0, VectorF32 A1, VectorF32 B0, VectorF32 B1, VectorF32 C0, VectorF32 C1, @@ -869,7 +901,7 @@ namespace Crafter { } } - constexpr static VectorF32<1, 4> Dot( + constexpr static VectorF32<1, 4> DotPack( VectorF32 A0, VectorF32 A1, VectorF32 B0, VectorF32 B1, VectorF32 C0, VectorF32 C1, @@ -914,7 +946,7 @@ namespace Crafter { return row1; } - constexpr static VectorF32<1, 8> Dot( + constexpr static VectorF32<1, 8> DotPack( VectorF32 A0, VectorF32 A1, VectorF32 B0, VectorF32 B1, VectorF32 C0, VectorF32 C1, @@ -1021,7 +1053,7 @@ namespace Crafter { } #ifdef __AVX512F__ - constexpr static VectorF32<1, 15> Dot( + constexpr static VectorF32<1, 15> DotPack( VectorF32 A0, VectorF32 A1, VectorF32 B0, VectorF32 B1, VectorF32 C0, VectorF32 C1 @@ -1112,8 +1144,8 @@ namespace Crafter { } #endif - constexpr static VectorF32<1, Packing*2> Dot( - VectorF32 A0, VectorF32 A1, + constexpr static VectorF32<1, Packing*2> DotPack( + VectorF32 A0, VectorF32 A1, VectorF32 C0, VectorF32 C1 ) requires(Len == 2 && Packing*Len == VectorBase::AlignmentElement) { if constexpr(std::is_same_v::VectorType, __m128>) { @@ -1548,9 +1580,10 @@ namespace Crafter { } template - requires((std::is_same_v> && ...)) + requires((std::is_same_v> && ...) && + (1 + sizeof...(Rest) == VectorBase::BatchSize)) constexpr static auto LengthSq(VectorF32 first, Rest... rest) { - constexpr std::uint8_t N = 1 + sizeof...(Rest); + constexpr std::uint8_t N = VectorBase::BatchSize; VectorF32<1, static_cast(Packing * N)> r; std::array, N> args{ first, rest... }; alignas(16) float buf[4] = {0,0,0,0}; @@ -1571,41 +1604,39 @@ namespace Crafter { } template - requires((std::is_same_v> && ...)) + requires((std::is_same_v> && ...) && + (1 + sizeof...(Rest) == VectorBase::BatchSize)) constexpr static auto Length(VectorF32 first, Rest... rest) { auto sq = LengthSq(first, rest...); sq.v = wasm_f32x4_sqrt(sq.v); return sq; } - // Four pairwise dot products packed into one v128. Only the first Len + // Pairwise dot products packed into one v128. Only the first Len // lanes contribute, so the same routine handles 3- and 4-component // inputs — the 4th lane of Len==3 inputs may be garbage from Cross() - // and must not be summed. - constexpr static VectorF32<1, 4> Dot( - VectorF32 A0, VectorF32 A1, - VectorF32 B0, VectorF32 B1, - VectorF32 C0, VectorF32 C1, - VectorF32 D0, VectorF32 D1 - ) requires((Len == 3 || Len == 4) && Packing == 1) { - alignas(16) float a0[4], a1[4], b0[4], b1[4], c0[4], c1[4], d0[4], d1[4]; - wasm_v128_store(a0, A0.v); wasm_v128_store(a1, A1.v); - wasm_v128_store(b0, B0.v); wasm_v128_store(b1, B1.v); - wasm_v128_store(c0, C0.v); wasm_v128_store(c1, C1.v); - wasm_v128_store(d0, D0.v); wasm_v128_store(d1, D1.v); - + // and must not be summed. Takes BatchSize pairs (== 4 here since + // WASM AlignmentElement is always 4 and Packing must be 1). + template + requires((std::is_same_v> && ...) && + (1 + sizeof...(Rest) == 2 * VectorBase::BatchSize) && + (Len == 3 || Len == 4) && Packing == 1) + constexpr static VectorF32<1, 4> Dot(VectorF32 first, Rest... rest) { + constexpr std::uint8_t N = VectorBase::BatchSize; + std::array, 2 * N> args{ first, rest... }; alignas(16) float out[4] = {0,0,0,0}; - for (std::uint8_t k = 0; k < Len; ++k) { - out[0] += a0[k] * a1[k]; - out[1] += b0[k] * b1[k]; - out[2] += c0[k] * c1[k]; - out[3] += d0[k] * d1[k]; + for (std::uint8_t i = 0; i < N; ++i) { + alignas(16) float a[4], b[4]; + wasm_v128_store(a, args[2 * i].v); + wasm_v128_store(b, args[2 * i + 1].v); + for (std::uint8_t k = 0; k < Len; ++k) out[i] += a[k] * b[k]; } return VectorF32<1, 4>(wasm_v128_load(out)); } template - requires((std::is_same_v> && ...)) + requires((std::is_same_v> && ...) && + (1 + sizeof...(Rest) == VectorBase::BatchSize)) constexpr static auto Normalize(VectorF32 first, Rest... rest) { auto normOne = [](VectorF32 u) { alignas(16) float tmp[4]; wasm_v128_store(tmp, u.v); @@ -1622,7 +1653,7 @@ namespace Crafter { } return VectorF32(wasm_v128_load(out)); }; - return std::make_tuple(normOne(first), normOne(rest)...); + return std::array, VectorBase::BatchSize>{ normOne(first), normOne(rest)... }; } constexpr static VectorF32 Rotate(VectorF32<3, Packing> v, VectorF32<4, Packing> q) requires(Len == 3) { @@ -1842,9 +1873,10 @@ namespace Crafter { } template - requires((std::is_same_v> && ...)) + requires((std::is_same_v> && ...) && + (1 + sizeof...(Rest) == VectorBase::BatchSize)) constexpr static auto LengthSq(VectorF32 first, Rest... rest) { - constexpr std::uint8_t N = 1 + sizeof...(Rest); + constexpr std::uint8_t N = VectorBase::BatchSize; VectorF32<1, static_cast(Packing * N)> r; std::array, N> args{ first, rest... }; for (std::uint8_t i = 0; i < N; ++i) @@ -1860,7 +1892,8 @@ namespace Crafter { } template - requires((std::is_same_v> && ...)) + requires((std::is_same_v> && ...) && + (1 + sizeof...(Rest) == VectorBase::BatchSize)) constexpr static auto Length(VectorF32 first, Rest... rest) { auto sq = LengthSq(first, rest...); for (std::uint8_t i = 0; i < decltype(sq)::NElems; ++i) sq.v[i] = std::sqrt(sq.v[i]); @@ -1868,7 +1901,8 @@ namespace Crafter { } template - requires((std::is_same_v> && ...)) + requires((std::is_same_v> && ...) && + (1 + sizeof...(Rest) == VectorBase::BatchSize)) constexpr static auto Normalize(VectorF32 first, Rest... rest) { auto normOne = [](VectorF32 u) { VectorF32 out; @@ -1884,7 +1918,7 @@ namespace Crafter { } return out; }; - return std::make_tuple(normOne(first), normOne(rest)...); + return std::array, VectorBase::BatchSize>{ normOne(first), normOne(rest)... }; } constexpr static VectorF32 Rotate(VectorF32<3, Packing> v, VectorF32<4, Packing> q) requires(Len == 3) { diff --git a/tests/Intersection.cpp b/tests/Intersection.cpp index d6a9c4b..e844890 100644 --- a/tests/Intersection.cpp +++ b/tests/Intersection.cpp @@ -40,134 +40,390 @@ VectorF32<4, 1> Vec4(float x, float y, float z, float w) { return VectorF32<4, 1>(buf); } -VectorF32<1, 4> Vec1x4(float a, float b, float c, float d) { - alignas(16) float buf[4] = { a, b, c, d }; - return VectorF32<1, 4>(buf); +// Pack Total = Packing * N vec3 records into N packed VectorF32<3, Packing>s. +// `data[i]` is the i-th sub-primitive's three components in [x, y, z] order; +// the helper places `data[batch*Packing + sub]` into the `sub`-th slot of +// `result[batch]`. Records beyond `data.size()` are left as zeros. +template +std::array, VectorF32<3, Packing>::BatchSize> +PackVec3(std::span> data) { + constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize; + std::array, N> result; + for (std::uint8_t b = 0; b < N; ++b) { + alignas(64) float buf[VectorF32<3, Packing>::AlignmentElement] = {}; + for (std::uint8_t s = 0; s < Packing; ++s) { + std::size_t idx = static_cast(b) * Packing + s; + if (idx < data.size()) { + buf[s * 3 + 0] = data[idx][0]; + buf[s * 3 + 1] = data[idx][1]; + buf[s * 3 + 2] = data[idx][2]; + } + } + result[b] = VectorF32<3, Packing>(buf); + } + return result; } -// Möller-Trumbore in this codebase rejects det <= eps, so triangles must be -// wound so their geometric normal opposes the ray direction. For rays going +Z -// that means clockwise from a +Z viewer. -std::string* TestRayTriangle() { +// Same idea for vec4 records (quaternions). +template +std::array, VectorF32<3, Packing>::BatchSize> +PackVec4MatchingVec3Batch(std::span> data) { + constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize; + std::array, N> result; + for (std::uint8_t b = 0; b < N; ++b) { + alignas(64) float buf[VectorF32<4, Packing>::AlignmentElement] = {}; + for (std::uint8_t s = 0; s < Packing; ++s) { + std::size_t idx = static_cast(b) * Packing + s; + if (idx < data.size()) { + buf[s * 4 + 0] = data[idx][0]; + buf[s * 4 + 1] = data[idx][1]; + buf[s * 4 + 2] = data[idx][2]; + buf[s * 4 + 3] = data[idx][3]; + } + } + result[b] = VectorF32<4, Packing>(buf); + } + return result; +} + +// Pack `Total` scalars into a VectorF32<1, Total>. +template +VectorF32<1, Total> PackScalars(std::span data) { + alignas(64) float buf[VectorF32<1, Total>::AlignmentElement] = {}; + for (std::size_t i = 0; i < data.size() && i < Total; ++i) buf[i] = data[i]; + return VectorF32<1, Total>(buf); +} + +template +std::string* TestRayTriangleN() { + constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize; + constexpr std::uint8_t Total = Packing * N; + VectorF32<3, 1> rayOrigin = Vec3(0, 0, -5); VectorF32<3, 1> rayDir = Vec3(0, 0, 1); - // A: hits at z=0, t=5 (front-facing). - VectorF32<3, 1> a0 = Vec3(-1, -1, 0), a1 = Vec3(0, 1, 0), a2 = Vec3(1, -1, 0); - // B: hits at z=10, t=15. - VectorF32<3, 1> b0 = Vec3(-1, -1, 10), b1 = Vec3(0, 1, 10), b2 = Vec3(1, -1, 10); - // C: front-facing triangle far off to the side - u or v out of [0,1]. - VectorF32<3, 1> c0 = Vec3(99, -1, 0), c1 = Vec3(100, 1, 0), c2 = Vec3(101, -1, 0); - // D: triangle parallel to the ray (all vertices share y=2; ray lives in y=0). - VectorF32<3, 1> d0 = Vec3(-1, 2, -1), d1 = Vec3(1, 2, 1), d2 = Vec3(0, 2, 2); + // Cycle of four triangle patterns, repeated to fill Total slots: + // 0: hits at z=0 (t=5) + // 1: hits at z=10 (t=15) + // 2: front-facing but off to the side - u/v rejected (miss) + // 3: parallel to the ray (miss) + constexpr std::array, 4> v0_pat = {{ + {-1, -1, 0}, {-1, -1, 10}, { 99, -1, 0}, {-1, 2, -1} + }}; + constexpr std::array, 4> v1_pat = {{ + { 0, 1, 0}, { 0, 1, 10}, {100, 1, 0}, { 1, 2, 1} + }}; + constexpr std::array, 4> v2_pat = {{ + { 1, -1, 0}, { 1, -1, 10}, {101, -1, 0}, { 0, 2, 2} + }}; + constexpr std::array expected_pat = { 5.0f, 15.0f, kMaxF, kMaxF }; - VectorF32<1, 4> t = IntersectionTestRayTriangle(rayOrigin, rayDir, - a0, a1, a2, - b0, b1, b2, - c0, c1, c2, - d0, d1, d2); - std::array s = t.template Store(); + std::array, Total> v0Data, v1Data, v2Data; + for (std::uint8_t i = 0; i < Total; ++i) { + v0Data[i] = v0_pat[i % 4]; + v1Data[i] = v1_pat[i % 4]; + v2Data[i] = v2_pat[i % 4]; + } + auto v0 = PackVec3(v0Data); + auto v1 = PackVec3(v1Data); + auto v2 = PackVec3(v2Data); - if (!FloatEquals(s[0], 5.0f)) - return new std::string(std::format("RayTriangle A: expected 5, got {}", s[0])); - if (!FloatEquals(s[1], 15.0f)) - return new std::string(std::format("RayTriangle B: expected 15, got {}", s[1])); - if (s[2] != kMaxF) - return new std::string(std::format("RayTriangle C: expected max (miss), got {}", s[2])); - if (s[3] != kMaxF) - return new std::string(std::format("RayTriangle D: expected max (parallel miss), got {}", s[3])); - return nullptr; -} + auto t = IntersectionTestRayTriangle(rayOrigin, rayDir, v0, v1, v2); + auto stored = t.template Store(); -std::string* TestRayTriangleBackFacing() { - // Same A vertices but CCW from +Z viewer -> back-facing for +Z ray -> miss. - VectorF32<3, 1> rayOrigin = Vec3(0, 0, -5); - VectorF32<3, 1> rayDir = Vec3(0, 0, 1); - VectorF32<3, 1> v0 = Vec3(-1, -1, 0), v1 = Vec3(1, -1, 0), v2 = Vec3(0, 1, 0); - - VectorF32<1, 4> t = IntersectionTestRayTriangle(rayOrigin, rayDir, - v0, v1, v2, - v0, v1, v2, - v0, v1, v2, - v0, v1, v2); - std::array s = t.template Store(); - for (std::uint8_t i = 0; i < 4; ++i) { - if (s[i] != kMaxF) - return new std::string(std::format("RayTriangle back-facing lane {}: expected max, got {}", i, s[i])); + for (std::uint8_t i = 0; i < Total; ++i) { + float expected = expected_pat[i % 4]; + float got = stored[i]; + if (expected == kMaxF) { + if (got != kMaxF) + return new std::string(std::format( + "RayTriangle<{}> tri {}: expected miss, got {}", Packing, i, got)); + } else if (!FloatEquals(got, expected)) { + return new std::string(std::format( + "RayTriangle<{}> tri {}: expected {}, got {}", Packing, i, expected, got)); + } } return nullptr; } -std::string* TestRaySphere() { - VectorF32<3, 1> rayOrigin = Vec3(0, 0, -10); +template +std::string* TestRayTriangleBackFacingN() { + constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize; + constexpr std::uint8_t Total = Packing * N; + + // Same vertices as the front-facing case but wound CCW from +Z (back-facing + // for a +Z ray) - all sub-primitives should miss. + std::array, Total> v0Data, v1Data, v2Data; + for (std::uint8_t i = 0; i < Total; ++i) { + v0Data[i] = {-1, -1, 0}; + v1Data[i] = { 1, -1, 0}; + v2Data[i] = { 0, 1, 0}; + } + auto v0 = PackVec3(v0Data); + auto v1 = PackVec3(v1Data); + auto v2 = PackVec3(v2Data); + + VectorF32<3, 1> rayOrigin = Vec3(0, 0, -5); VectorF32<3, 1> rayDir = Vec3(0, 0, 1); - - // A: sphere at origin radius 2 - first hit at z=-2, t=8. - VectorF32<3, 1> posA = Vec3(0, 0, 0); - // B: sphere at (0,0,20) radius 1 - first hit at z=19, t=29. - VectorF32<3, 1> posB = Vec3(0, 0, 20); - // C: sphere off to the side, ray misses. - VectorF32<3, 1> posC = Vec3(10, 10, 0); - // D: sphere behind the ray origin. - VectorF32<3, 1> posD = Vec3(0, 0, -50); - - VectorF32<1, 4> radii = Vec1x4(2.0f, 1.0f, 0.5f, 1.0f); - VectorF32<1, 4> t = IntersectionTestRaySphere(rayOrigin, rayDir, - posA, posB, posC, posD, radii); - std::array s = t.template Store(); - - if (!FloatEquals(s[0], 8.0f)) - return new std::string(std::format("RaySphere A: expected 8, got {}", s[0])); - if (!FloatEquals(s[1], 29.0f)) - return new std::string(std::format("RaySphere B: expected 29, got {}", s[1])); - if (s[2] != kMaxF) - return new std::string(std::format("RaySphere C: expected max (miss), got {}", s[2])); - if (s[3] != kMaxF) - return new std::string(std::format("RaySphere D: expected max (behind), got {}", s[3])); + auto t = IntersectionTestRayTriangle(rayOrigin, rayDir, v0, v1, v2); + auto stored = t.template Store(); + for (std::uint8_t i = 0; i < Total; ++i) { + if (stored[i] != kMaxF) + return new std::string(std::format( + "RayTriangle back-facing<{}> tri {}: expected max, got {}", + Packing, i, stored[i])); + } return nullptr; } -std::string* TestRayOrientedBox() { +template +std::string* TestRaySphereN() { + constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize; + constexpr std::uint8_t Total = Packing * N; + + VectorF32<3, 1> rayOrigin = Vec3(0, 0, -10); + VectorF32<3, 1> rayDir = Vec3(0, 0, 1); + + // Cycle of four sphere patterns: + // 0: at origin, r=2, first hit z=-2, t=8 + // 1: at (0,0,20), r=1, first hit z=19, t=29 + // 2: off-axis at (10,10,0), r=0.5, miss + // 3: behind ray at (0,0,-50), r=1, miss + constexpr std::array, 4> pos_pat = {{ + { 0, 0, 0}, { 0, 0, 20}, {10, 10, 0}, { 0, 0, -50} + }}; + constexpr std::array radii_pat = { 2.0f, 1.0f, 0.5f, 1.0f }; + constexpr std::array expected_pat = { 8.0f, 29.0f, kMaxF, kMaxF }; + + std::array, Total> posData; + std::array radiiData; + for (std::uint8_t i = 0; i < Total; ++i) { + posData[i] = pos_pat[i % 4]; + radiiData[i] = radii_pat[i % 4]; + } + auto pos = PackVec3(posData); + auto radii = PackScalars(radiiData); + + auto t = IntersectionTestRaySphere(rayOrigin, rayDir, pos, radii); + auto stored = t.template Store(); + + for (std::uint8_t i = 0; i < Total; ++i) { + float expected = expected_pat[i % 4]; + float got = stored[i]; + if (expected == kMaxF) { + if (got != kMaxF) + return new std::string(std::format( + "RaySphere<{}> sph {}: expected miss, got {}", Packing, i, got)); + } else if (!FloatEquals(got, expected)) { + return new std::string(std::format( + "RaySphere<{}> sph {}: expected {}, got {}", Packing, i, expected, got)); + } + } + return nullptr; +} + +template +std::string* TestRayOrientedBoxN() { + constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize; + constexpr std::uint8_t Total = Packing * N; + VectorF32<3, 1> rayOrigin = Vec3(0, 0, -5); VectorF32<3, 1> rayDir = Vec3(0, 0, 1); - // Identity quaternion (axis-aligned). - VectorF32<4, 1> idQ = Vec4(0, 0, 0, 1); - // Note: RayOrientedBox treats `size` as the *full* extent (it computes - // halfExtents = size * 0.5 internally). So size=2 means the box spans - // [-1, 1] in each axis. (SphereOrientedBox uses the opposite convention.) - // - // A: box at origin size 2 (half 1) -> ray enters at z=-1, t=4. - VectorF32<3, 1> posA = Vec3(0, 0, 0), sizeA = Vec3(2, 2, 2); - // B: box at (0,0,10) size 2 (half 1) -> ray enters at z=9, t=14. - VectorF32<3, 1> posB = Vec3(0, 0, 10), sizeB = Vec3(2, 2, 2); - // C: box off to the side - miss. - VectorF32<3, 1> posC = Vec3(50, 0, 0), sizeC = Vec3(2, 2, 2); - // D: box behind ray - miss. - VectorF32<3, 1> posD = Vec3(0, 0, -50), sizeD = Vec3(2, 2, 2); + // Cycle of four AABB-as-OBB patterns (identity rotation): + // 0: at origin, full size 2, enters z=-1 -> t=4 + // 1: at (0,0,10), full size 2, enters z=9 -> t=14 + // 2: off-axis at (50,0,0) -> miss + // 3: behind ray at (0,0,-50) -> miss + constexpr std::array, 4> pos_pat = {{ + { 0, 0, 0}, { 0, 0, 10}, {50, 0, 0}, { 0, 0, -50} + }}; + constexpr std::array size_one = { 2, 2, 2 }; + constexpr std::array idQ = { 0, 0, 0, 1 }; + constexpr std::array expected_pat = { 4.0f, 14.0f, kMaxF, kMaxF }; - VectorF32<1, 4> t = IntersectionTestRayOrientedBox(rayOrigin, rayDir, - posA, sizeA, idQ, - posB, sizeB, idQ, - posC, sizeC, idQ, - posD, sizeD, idQ); - std::array s = t.template Store(); + std::array, Total> posData; + std::array, Total> sizeData; + std::array, Total> rotData; + for (std::uint8_t i = 0; i < Total; ++i) { + posData[i] = pos_pat[i % 4]; + sizeData[i] = size_one; + rotData[i] = idQ; + } + auto pos = PackVec3(posData); + auto size = PackVec3(sizeData); + auto rot = PackVec4MatchingVec3Batch(rotData); - if (!FloatEquals(s[0], 4.0f)) - return new std::string(std::format("RayOrientedBox A: expected 4, got {}", s[0])); - if (!FloatEquals(s[1], 14.0f)) - return new std::string(std::format("RayOrientedBox B: expected 14, got {}", s[1])); - if (s[2] != kMaxF) - return new std::string(std::format("RayOrientedBox C: expected max (miss), got {}", s[2])); - if (s[3] != kMaxF) - return new std::string(std::format("RayOrientedBox D: expected max (behind), got {}", s[3])); + auto t = IntersectionTestRayOrientedBox(rayOrigin, rayDir, pos, size, rot); + auto stored = t.template Store(); + + for (std::uint8_t i = 0; i < Total; ++i) { + float expected = expected_pat[i % 4]; + float got = stored[i]; + if (expected == kMaxF) { + if (got != kMaxF) + return new std::string(std::format( + "RayOrientedBox<{}> box {}: expected miss, got {}", Packing, i, got)); + } else if (!FloatEquals(got, expected)) { + return new std::string(std::format( + "RayOrientedBox<{}> box {}: expected {}, got {}", Packing, i, expected, got)); + } + } + return nullptr; +} + +// Helper: pack a homogeneous array of OBB descriptors into a PackedOBBs. +template +PackedOBBs PackOBBs( + std::span> halfSizes, + std::span> xAxes, + std::span> yAxes, + std::span> zAxes, + std::span> origins +) { + PackedOBBs out; + out.halfSize = PackVec3(halfSizes); + out.xAxis = PackVec3(xAxes); + out.yAxis = PackVec3(yAxes); + out.zAxis = PackVec3(zAxes); + out.origin = PackVec3(origins); + return out; +} + +// SphereOrientedBox takes a PackedOBBs (half-extents, three rotation axes, +// origin per sub-box). For axis-aligned boxes the axes are world x/y/z. +template +std::string* TestSphereOrientedBoxN() { + constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize; + constexpr std::uint8_t Total = Packing * N; + + VectorF32<3, 1> sphereCenter = Vec3(0, 0, 0); + + // Cycle of four box patterns (half-extent semantics, world-axis aligned): + // 0: at origin half=2, r=1 -> sphere inside -> hit + // 1: at (5,0,0) half=1, r=0.5 -> miss + // 2: at (3,0,0) half=1, r=0.5 -> miss + // 3: at origin half=0.5, r=1 -> sphere encloses box center -> hit + constexpr std::array, 4> size_pat = {{ + { 2, 2, 2}, { 1, 1, 1}, { 1, 1, 1}, {0.5f, 0.5f, 0.5f} + }}; + constexpr std::array, 4> origin_pat = {{ + { 0, 0, 0}, { 5, 0, 0}, { 3, 0, 0}, { 0, 0, 0} + }}; + constexpr std::array radii_pat = { 1.0f, 0.5f, 0.5f, 1.0f }; + constexpr std::array expected_pat = { 0.0f, kMaxF, kMaxF, 0.0f }; + + constexpr std::array ax_x = { 1, 0, 0 }; + constexpr std::array ax_y = { 0, 1, 0 }; + constexpr std::array ax_z = { 0, 0, 1 }; + + std::array, Total> sizeData, originData; + std::array, Total> xAxesData, yAxesData, zAxesData; + std::array radiiData; + for (std::uint8_t i = 0; i < Total; ++i) { + sizeData[i] = size_pat[i % 4]; + originData[i] = origin_pat[i % 4]; + xAxesData[i] = ax_x; + yAxesData[i] = ax_y; + zAxesData[i] = ax_z; + radiiData[i] = radii_pat[i % 4]; + } + auto boxes = PackOBBs(sizeData, xAxesData, yAxesData, zAxesData, originData); + auto radii = PackScalars(radiiData); + + auto t = IntersectionTestSphereOrientedBox(sphereCenter, radii, boxes); + auto stored = t.template Store(); + + for (std::uint8_t i = 0; i < Total; ++i) { + float expected = expected_pat[i % 4]; + float got = stored[i]; + if (expected == kMaxF) { + if (got != kMaxF) + return new std::string(std::format( + "SphereOrientedBox<{}> box {}: expected miss, got {}", Packing, i, got)); + } else if (!FloatEquals(got, expected)) { + return new std::string(std::format( + "SphereOrientedBox<{}> box {}: expected {}, got {}", Packing, i, expected, got)); + } + } + return nullptr; +} + +// OBB-vs-OBB test against the new templated SAT routine. Cycles through: +// 0: identical unit boxes at (0,0,0) and (1,0,0) -> overlap on x +// 1: identical unit boxes at (0,0,0) and (10,0,0) -> far apart, miss +// 2: rotated-45° box at origin vs identity at (1,0,0). Both have half=1. +// The rotated box's projection along world-x is half=sqrt(2)≈1.414, so +// the boxes still overlap on the world-x axis. +// 3: identical unit boxes at (0,0,0) and (3,0,0) -> miss +template +std::string* TestOBBOBBN() { + constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize; + constexpr std::uint8_t Total = Packing * N; + + constexpr float kRot45 = 0.70710678f; // cos(45°) = sin(45°) + constexpr std::array, 4> halfA_pat = {{ + { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 } + }}; + constexpr std::array, 4> halfB_pat = halfA_pat; + constexpr std::array, 4> originA_pat = {{ + { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 } + }}; + constexpr std::array, 4> originB_pat = {{ + { 1, 0, 0 }, { 10, 0, 0 }, { 1, 0, 0 }, { 3, 0, 0 } + }}; + // Box A axes: identity for patterns 0/1/3, rotated 45° around z for pattern 2. + constexpr std::array, 4> xAxisA_pat = {{ + { 1, 0, 0 }, { 1, 0, 0 }, { kRot45, kRot45, 0 }, { 1, 0, 0 } + }}; + constexpr std::array, 4> yAxisA_pat = {{ + { 0, 1, 0 }, { 0, 1, 0 }, { -kRot45, kRot45, 0 }, { 0, 1, 0 } + }}; + constexpr std::array, 4> zAxisA_pat = {{ + { 0, 0, 1 }, { 0, 0, 1 }, { 0, 0, 1 }, { 0, 0, 1 } + }}; + constexpr std::array, 4> xAxisB_pat = {{ + { 1, 0, 0 }, { 1, 0, 0 }, { 1, 0, 0 }, { 1, 0, 0 } + }}; + constexpr std::array, 4> yAxisB_pat = {{ + { 0, 1, 0 }, { 0, 1, 0 }, { 0, 1, 0 }, { 0, 1, 0 } + }}; + constexpr std::array, 4> zAxisB_pat = zAxisA_pat; + constexpr std::array expected_pat = { 0.0f, kMaxF, 0.0f, kMaxF }; + + std::array, Total> + halfA, halfB, originA, originB, + xA, yA, zA, xB, yB, zB; + for (std::uint8_t i = 0; i < Total; ++i) { + halfA[i] = halfA_pat[i % 4]; + halfB[i] = halfB_pat[i % 4]; + originA[i] = originA_pat[i % 4]; + originB[i] = originB_pat[i % 4]; + xA[i] = xAxisA_pat[i % 4]; yA[i] = yAxisA_pat[i % 4]; zA[i] = zAxisA_pat[i % 4]; + xB[i] = xAxisB_pat[i % 4]; yB[i] = yAxisB_pat[i % 4]; zB[i] = zAxisB_pat[i % 4]; + } + + auto a = PackOBBs(halfA, xA, yA, zA, originA); + auto b = PackOBBs(halfB, xB, yB, zB, originB); + auto r = IntersectionTestOrientedBoxOrientedBox(a, b); + auto stored = r.template Store(); + + for (std::uint8_t i = 0; i < Total; ++i) { + float expected = expected_pat[i % 4]; + float got = stored[i]; + if (expected == kMaxF) { + if (got != kMaxF) + return new std::string(std::format( + "OBBOBB<{}> pair {}: expected miss, got {}", Packing, i, got)); + } else if (got != expected) { + return new std::string(std::format( + "OBBOBB<{}> pair {}: expected {}, got {}", Packing, i, expected, got)); + } + } return nullptr; } MatrixRowMajor MakeBoxMatrix(float tx, float ty, float tz) { - // Box matrix the OBB intersection code expects: rows[i][0..2] is the i-th - // axis (the existing semantics treat matrix rows as the OBB axes), and - // rows[i][3] is the translation component along that axis. return MatrixRowMajor( 1, 0, 0, tx, 0, 1, 0, ty, @@ -175,42 +431,6 @@ MatrixRowMajor MakeBoxMatrix(float tx, float ty, float tz) { ); } -std::string* TestSphereOrientedBox() { - // `size` is half-extents (the intersection code clamps to ±size). - VectorF32<3, 1> sphereCenter = Vec3(0, 0, 0); - VectorF32<1, 4> radii = Vec1x4(1.0f, 0.5f, 0.5f, 1.0f); - - // A: box at origin half-extent 2 -> sphere center inside -> hit. - VectorF32<3, 1> sizeA = Vec3(2, 2, 2); - auto boxA = MakeBoxMatrix(0, 0, 0); - // B: box at (5,0,0) half-extent 1 -> box spans x in [4,6], sphere in [-0.5,0.5] -> miss. - VectorF32<3, 1> sizeB = Vec3(1, 1, 1); - auto boxB = MakeBoxMatrix(5, 0, 0); - // C: box at (3,0,0) half-extent 1 -> box spans [2,4], sphere [-0.5,0.5] -> miss. - VectorF32<3, 1> sizeC = Vec3(1, 1, 1); - auto boxC = MakeBoxMatrix(3, 0, 0); - // D: box at origin half-extent 0.5 -> sphere center inside the box -> hit. - VectorF32<3, 1> sizeD = Vec3(0.5f, 0.5f, 0.5f); - auto boxD = MakeBoxMatrix(0, 0, 0); - - VectorF32<1, 4> r = IntersectionTestSphereOrientedBox(sphereCenter, radii, - sizeA, boxA, - sizeB, boxB, - sizeC, boxC, - sizeD, boxD); - std::array s = r.template Store(); - - if (s[0] != 0.0f) - return new std::string(std::format("SphereOrientedBox A: expected hit (0), got {}", s[0])); - if (s[1] != kMaxF) - return new std::string(std::format("SphereOrientedBox B: expected max (miss), got {}", s[1])); - if (s[2] != kMaxF) - return new std::string(std::format("SphereOrientedBox C: expected max (miss), got {}", s[2])); - if (s[3] != 0.0f) - return new std::string(std::format("SphereOrientedBox D: expected hit (0), got {}", s[3])); - return nullptr; -} - std::string* TestGetOBBCorners() { // Identity matrix - the 8 corners are exactly ±size on each axis. VectorF32<3, 1> size = Vec3(2, 3, 4); @@ -231,7 +451,6 @@ std::string* TestGetOBBCorners() { } } - // Translated matrix - corners shift by the translation column. auto m2 = MakeBoxMatrix(10, 20, 30); std::array, 8> corners2 = GetOBBCorners(size, m2); for (std::uint8_t i = 0; i < 8; ++i) { @@ -251,31 +470,43 @@ std::string* TestGetOBBCorners() { return nullptr; } -std::string* TestOBBOBBOverlapping() { - VectorF32<3, 1> size = Vec3(1, 1, 1); - auto boxA = MakeBoxMatrix(0, 0, 0); - auto boxB = MakeBoxMatrix(1, 0, 0); // overlap on x in [-1, 1] (B) and [-1, 1] (A) -> overlap - if (!IntersectionTestOrientedBoxOrientedBox(size, boxA, size, boxB)) - return new std::string("OBB-OBB overlapping: expected true"); - - auto boxFar = MakeBoxMatrix(10, 0, 0); - if (IntersectionTestOrientedBoxOrientedBox(size, boxA, size, boxFar)) - return new std::string("OBB-OBB far apart: expected false"); - return nullptr; +// Top-level wrappers: exercise each refactored function at Packing=1 (always +// supported) and at its default Packing (OptimalPacking for the build target). +std::string* TestRayTriangle() { return TestRayTriangleN<1>(); } +std::string* TestRayTriangleOpt() { return TestRayTriangleN::OptimalPacking>(); } +std::string* TestRayTriangleBackFacing(){ return TestRayTriangleBackFacingN<1>(); } +std::string* TestRayTriangleBackFacingOpt() { return TestRayTriangleBackFacingN::OptimalPacking>(); } +std::string* TestRaySphere() { return TestRaySphereN<1>(); } +std::string* TestRaySphereOpt() { return TestRaySphereN::OptimalPacking>(); } +std::string* TestRayOrientedBox() { return TestRayOrientedBoxN<1>(); } +std::string* TestRayOrientedBoxOpt() { + constexpr std::uint8_t P = std::min( + VectorF32<3, 1>::OptimalPacking, VectorF32<4, 1>::OptimalPacking); + return TestRayOrientedBoxN

(); } +std::string* TestSphereOrientedBox() { return TestSphereOrientedBoxN<1>(); } +std::string* TestSphereOrientedBoxOpt() { return TestSphereOrientedBoxN::OptimalPacking>(); } +std::string* TestOBBOBB() { return TestOBBOBBN<1>(); } +std::string* TestOBBOBBOpt() { return TestOBBOBBN::OptimalPacking>(); } } // namespace int main() { using Fn = std::string* (*)(); - constexpr std::array, 7> tests = {{ - { "RayTriangle", TestRayTriangle }, - { "RayTriangleBackFacing", TestRayTriangleBackFacing }, - { "RaySphere", TestRaySphere }, - { "RayOrientedBox", TestRayOrientedBox }, - { "SphereOrientedBox", TestSphereOrientedBox }, - { "GetOBBCorners", TestGetOBBCorners }, - { "OBBOBB", TestOBBOBBOverlapping }, + constexpr std::array, 13> tests = {{ + { "RayTriangle<1>", TestRayTriangle }, + { "RayTriangle", TestRayTriangleOpt }, + { "RayTriangleBackFacing<1>", TestRayTriangleBackFacing }, + { "RayTriangleBackFacing", TestRayTriangleBackFacingOpt }, + { "RaySphere<1>", TestRaySphere }, + { "RaySphere", TestRaySphereOpt }, + { "RayOrientedBox<1>", TestRayOrientedBox }, + { "RayOrientedBox", TestRayOrientedBoxOpt }, + { "SphereOrientedBox<1>", TestSphereOrientedBox }, + { "SphereOrientedBox", TestSphereOrientedBoxOpt }, + { "GetOBBCorners", TestGetOBBCorners }, + { "OBBOBB<1>", TestOBBOBB }, + { "OBBOBB", TestOBBOBBOpt }, }}; for (auto const& [name, fn] : tests) { diff --git a/tests/Vector.cpp b/tests/Vector.cpp index 5a80d7c..28be7d6 100644 --- a/tests/Vector.cpp +++ b/tests/Vector.cpp @@ -429,7 +429,7 @@ std::string* TestAllCombinations() { VectorType vecC = vecA * 3; VectorType vecD = vecA * 4; auto result = VectorType::Normalize(vecA, vecB, vecC, vecD); - VectorType<1, 4> result2 = VectorType::Length(std::get<0>(result), std::get<1>(result), std::get<2>(result), std::get<3>(result)); + VectorType<1, 4> result2 = VectorType::Length(result[0], result[1], result[2], result[3]); std::array::AlignmentElement> stored = result2.template Store(); for(std::uint8_t i = 0; i < Len*Packing; i++) { @@ -472,7 +472,7 @@ std::string* TestAllCombinations() { VectorType vecC = vecA * 3; VectorType vecD = vecA * 4; auto result = VectorType::Normalize(vecA, vecB, vecC, vecD); - VectorType<1, 8> result2 = VectorType::Length(std::get<0>(result), std::get<1>(result), std::get<2>(result), std::get<3>(result)); + VectorType<1, 8> result2 = VectorType::Length(result[0], result[1], result[2], result[3]); std::array::AlignmentElement> stored = result2.template Store(); for(std::uint8_t i = 0; i < Len*Packing; i++) { @@ -509,7 +509,7 @@ std::string* TestAllCombinations() { VectorType vecB = vecA * 2; VectorType vecC = vecA * 3; auto result = VectorType::Normalize(vecA, vecB, vecC); - VectorType<1, 15> result2 = VectorType::Length(std::get<0>(result), std::get<1>(result), std::get<2>(result)); + VectorType<1, 15> result2 = VectorType::Length(result[0], result[1], result[2]); std::array::AlignmentElement> stored = result2.template Store(); for(std::uint8_t i = 0; i < Len*Packing; i++) { @@ -540,7 +540,7 @@ std::string* TestAllCombinations() { VectorType vecA(floats); VectorType vecE = vecA * 2; auto result = VectorType::Normalize(vecA, vecE); - VectorType<1, Packing*2> result2 = VectorType::Length(std::get<0>(result), std::get<1>(result)); + VectorType<1, Packing*2> result2 = VectorType::Length(result[0], result[1]); std::array::AlignmentElement> stored = result2.template Store(); for(std::uint8_t i = 0; i < Len*Packing; i++) { @@ -583,7 +583,7 @@ std::string* TestAllCombinations() { VectorType vecE = vecA * 3; VectorType vecG = vecA * 4; auto result = VectorType::Normalize(vecA, vecC, vecE, vecG); - VectorType<1, Packing*4> result2 = VectorType::Length(std::get<0>(result), std::get<1>(result), std::get<2>(result), std::get<3>(result)); + VectorType<1, Packing*4> result2 = VectorType::Length(result[0], result[1], result[2], result[3]); std::array::AlignmentElement> stored = result2.template Store(); for(std::uint8_t i = 0; i < Len*Packing; i++) {