/* Crafter®.Math Copyright (C) 2026 Catcrafts® catcrafts.net This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 3.0 as published by the Free Software Foundation; This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ export module Crafter.Math:Intersection; import :VectorF32; import :MatrixRowMajor; import std; namespace Crafter { namespace detail { // Splat a single Len-vector into all Packing slots of the wider type // via a temporary float buffer. Performed once per intersection call; // the inner SIMD loop dominates so the round-trip is in the noise. template inline VectorF32 SplatToPacking(VectorF32 v) { alignas(64) float buf[VectorF32::AlignmentElement] = {}; std::array::AlignmentElement> flat = v.template Store(); for (std::uint8_t p = 0; p < Packing; ++p) { for (std::uint8_t k = 0; k < Len; ++k) buf[p * Len + k] = flat[k]; } return VectorF32(buf); } // Interleave two arrays of size N=BatchSize into the 2*N positional // argument list expected by the variadic Dot. Returns the packed // VectorF32<1, Packing*BatchSize> with one dot product per slot. template inline auto DotArrays( std::array, N> const& a, std::array, N> const& b ) { return [&](std::index_sequence) { std::array, 2 * N> flat; ((flat[2 * Is] = a[Is], flat[2 * Is + 1] = b[Is]), ...); return std::apply([](auto... args) { return VectorF32::Dot(args...); }, flat); }(std::make_index_sequence{}); } // Gather the `Component`-th lane of every sub-vector across an array // of N packed VectorF32<3, Packing> into a flat VectorF32<1, Packing*N> // with one scalar per pair. Used to materialize halfSize.x / .y / .z // alongside per-pair scalar projections in a single SIMD register. template inline auto ExtractComponent( std::array, VectorF32<3, Packing>::BatchSize> const& arr ) { constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize; constexpr std::uint8_t Total = Packing * N; using OutVec = VectorF32<1, Total>; alignas(64) float buf[OutVec::AlignmentElement] = {}; for (std::uint8_t b = 0; b < N; ++b) { auto v = arr[b].template Store(); for (std::uint8_t p = 0; p < Packing; ++p) { buf[b * Packing + p] = v[p * 3 + Component]; } } return OutVec(buf); } // Lane-wise absolute value. Done via a flat float buffer because the // F32 module does not expose a SIMD Abs primitive. Only called O(15) // times per OBB-OBB call, so the round-trip is negligible compared to // the dot-product work. template inline VectorF32<1, Total> AbsVec(VectorF32<1, Total> v) { alignas(64) float buf[VectorF32<1, Total>::AlignmentElement]; v.Store(buf); for (std::uint8_t i = 0; i < Total; ++i) buf[i] = std::abs(buf[i]); return VectorF32<1, Total>(buf); } } // Packed batch of Packing * BatchSize OBBs, each described by world-space // origin, three orthonormal rotation axes (rows of the rotation matrix), // and per-axis half-extents. Each std::array element packs `Packing` // sub-OBBs; there are BatchSize such elements, so the struct holds // Packing * BatchSize OBBs total. // // Callers that have OBBs as MatrixRowMajor + halfSize need to extract the // three axes and the origin themselves — keeping the routines in terms of // packed VectorF32<3, Packing> lets every SIMD op stay in registers. export template ::OptimalPacking> struct PackedOBBs { static constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize; static constexpr std::uint8_t Total = Packing * N; std::array, N> halfSize; std::array, N> xAxis; std::array, N> yAxis; std::array, N> zAxis; std::array, N> origin; }; // All intersection tests are batched over Packing*BatchSize primitives at // a time, where `Packing = VectorF32<3,1>::OptimalPacking` for the current // ISA (5 on AVX-512, 2 on AVX2, 1 on SSE/WASM/scalar) and BatchSize is the // arity that fills one output register. Callers form the packed input by // laying out `Packing` sub-primitives consecutively per vertex slot, then // assemble `BatchSize` such packed slots into the std::array argument. // Result lane `i` corresponds to triangle/sphere/box index `i`. // Möller-Trumbore against Packing*BatchSize triangles sharing one ray. // Returns ray parameter t per triangle, or float max where the ray misses. export template ::OptimalPacking> inline VectorF32<1, static_cast(Packing * VectorF32<3, Packing>::BatchSize)> IntersectionTestRayTriangle( VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir, std::array, VectorF32<3, Packing>::BatchSize> const& v0, std::array, VectorF32<3, Packing>::BatchSize> const& v1, std::array, VectorF32<3, Packing>::BatchSize> const& v2 ) { constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize; constexpr std::uint8_t Total = Packing * N; using PVec = VectorF32<3, Packing>; PVec rayOriginP = detail::SplatToPacking(rayOrigin); PVec rayDirP = detail::SplatToPacking(rayDir); std::array E1, E2, H, S, Q, rayDirArr; for (std::uint8_t i = 0; i < N; ++i) { E1[i] = v1[i] - v0[i]; E2[i] = v2[i] - v0[i]; H[i] = PVec::Cross(rayDirP, E2[i]); S[i] = rayOriginP - v0[i]; Q[i] = PVec::Cross(S[i], E1[i]); rayDirArr[i] = rayDirP; } auto det = detail::DotArrays(E1, H); auto uNum = detail::DotArrays(S, H); auto vNum = detail::DotArrays(rayDirArr, Q); auto tNum = detail::DotArrays(E2, Q); auto detArr = det.template Store(); auto uArr = uNum.template Store(); auto vArr = vNum.template Store(); auto tArr = tNum.template Store(); constexpr float eps = std::numeric_limits::epsilon(); constexpr float maxF = std::numeric_limits::max(); alignas(64) std::array::AlignmentElement> out{}; for (std::uint8_t i = 0; i < Total; ++i) { float d = detArr[i]; if (d <= eps) { out[i] = maxF; continue; } float invD = 1.0f / d; float u = uArr[i] * invD; if (u < 0.0f || u > 1.0f) { out[i] = maxF; continue; } float v = vArr[i] * invD; if (v < 0.0f || u + v > 1.0f) { out[i] = maxF; continue; } out[i] = tArr[i] * invD; } return VectorF32<1, Total>(out.data()); } // One ray against Packing*BatchSize spheres. `radii` holds one radius per // sphere in lane order matching the result. export template ::OptimalPacking> inline VectorF32<1, static_cast(Packing * VectorF32<3, Packing>::BatchSize)> IntersectionTestRaySphere( VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir, std::array, VectorF32<3, Packing>::BatchSize> const& pos, VectorF32<1, static_cast(Packing * VectorF32<3, Packing>::BatchSize)> radii ) { constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize; constexpr std::uint8_t Total = Packing * N; using PVec = VectorF32<3, Packing>; using OutVec = VectorF32<1, Total>; PVec rayOriginP = detail::SplatToPacking(rayOrigin); PVec rayDirP = detail::SplatToPacking(rayDir); std::array s; std::array rayDirArr; for (std::uint8_t i = 0; i < N; ++i) { s[i] = rayOriginP - pos[i]; rayDirArr[i] = rayDirP; } // dirDotS_i = rayDir · (rayOrigin - pos_i) auto dirDotS = detail::DotArrays(rayDirArr, s); // sqDist_i = |rayOrigin - pos_i|² across all packed slots. auto sqDist = std::apply([](auto... args) { return PVec::LengthSq(args...); }, s); // aScalar = rayDir · rayDir, broadcast across every lane. auto aScalar = std::apply([](auto... args) { return PVec::LengthSq(args...); }, rayDirArr); OutVec two(2.0f); OutVec four(4.0f); OutVec b = two * dirDotS; OutVec c = sqDist - radii * radii; // discriminant = b² - 4·a·c OutVec disc = b * b - four * aScalar * c; auto discArr = disc.template Store(); auto bArr = b.template Store(); auto aArr = aScalar.template Store(); constexpr float maxF = std::numeric_limits::max(); alignas(64) std::array out{}; for (std::uint8_t i = 0; i < Total; ++i) { float d = discArr[i]; if (d < 0.0f) { out[i] = maxF; continue; } float sqrtD = std::sqrt(d); float t = -0.5f * (bArr[i] + sqrtD) / aArr[i]; out[i] = (t > 0.0f) ? t : maxF; } return OutVec(out.data()); } // Packing that fits both Len=3 (positions, sizes) and Len=4 (quaternions) // in one SIMD register. Len=4's OptimalPacking is always ≤ Len=3's, so we // use the smaller of the two so a single Packing covers every type the // routine needs. inline constexpr std::uint8_t RayOBBPacking = std::min( VectorF32<3, 1>::OptimalPacking, VectorF32<4, 1>::OptimalPacking); // One ray against Packing*BatchSize OBBs. Each box is described by // world-space position, full-extent size, and a unit quaternion rotation. export template inline VectorF32<1, static_cast(Packing * VectorF32<3, Packing>::BatchSize)> IntersectionTestRayOrientedBox( VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir, std::array, VectorF32<3, Packing>::BatchSize> const& pos, std::array, VectorF32<3, Packing>::BatchSize> const& size, std::array, VectorF32<3, Packing>::BatchSize> const& rot ) { constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize; constexpr std::uint8_t Total = Packing * N; using PVec3 = VectorF32<3, Packing>; using PVec4 = VectorF32<4, Packing>; using OutVec = VectorF32<1, Total>; PVec3 rayOriginP = detail::SplatToPacking(rayOrigin); PVec3 rayDirP = detail::SplatToPacking(rayDir); // Conjugate quaternion: negate xyz, keep w. Constant-folded into one // XOR with a mask vector inside Negate. std::array localOrigin, localDir, half; for (std::uint8_t i = 0; i < N; ++i) { PVec4 invRot = rot[i].template Negate<{{true, true, true, false}}>(); localOrigin[i] = PVec3::Rotate(rayOriginP - pos[i], invRot); localDir[i] = PVec3::Rotate(rayDirP, invRot); half[i] = size[i] * 0.5f; } std::array, N> origLanes, dirLanes, halfLanes; for (std::uint8_t i = 0; i < N; ++i) { origLanes[i] = localOrigin[i].template Store(); dirLanes[i] = localDir[i].template Store(); halfLanes[i] = half[i].template Store(); } constexpr float eps = std::numeric_limits::epsilon(); constexpr float maxF = std::numeric_limits::max(); alignas(64) std::array out{}; for (std::uint8_t b = 0; b < Total; ++b) { std::uint8_t batchIdx = b / Packing; std::uint8_t subIdx = b % Packing; float tMin = 0.0f; float tMax = maxF; bool miss = false; for (std::uint8_t i = 0; i < 3; ++i) { std::uint8_t lane = static_cast(subIdx * 3 + i); float d = dirLanes[batchIdx][lane]; float o = origLanes[batchIdx][lane]; float h = halfLanes[batchIdx][lane]; if (std::abs(d) < eps) { if (o < -h || o > h) { miss = true; break; } } else { float invD = 1.0f / d; float t1 = (-h - o) * invD; float t2 = ( h - o) * invD; if (t1 > t2) std::swap(t1, t2); tMin = std::max(tMin, t1); tMax = std::min(tMax, t2); if (tMin > tMax) { miss = true; break; } } } out[b] = miss ? maxF : (tMin >= 0.0f ? tMin : tMax); } return OutVec(out.data()); } // One sphere against Packing*BatchSize OBBs described by a PackedOBBs. // Returns 0.0 per pair where the sphere intersects the box, max-float // otherwise. `radii` carries one sphere radius per pair in the same lane // order as the resulting test output. export template ::OptimalPacking> inline VectorF32<1, static_cast(Packing * VectorF32<3, Packing>::BatchSize)> IntersectionTestSphereOrientedBox( VectorF32<3, 1> sphereCenter, VectorF32<1, static_cast(Packing * VectorF32<3, Packing>::BatchSize)> radii, PackedOBBs const& boxes ) { constexpr std::uint8_t N = VectorF32<3, Packing>::BatchSize; constexpr std::uint8_t Total = Packing * N; using PVec3 = VectorF32<3, Packing>; using OutVec = VectorF32<1, Total>; PVec3 sphereCenterP = detail::SplatToPacking(sphereCenter); std::array delta; for (std::uint8_t i = 0; i < N; ++i) { delta[i] = sphereCenterP - boxes.origin[i]; } // Project the world-space delta onto each box axis. auto locX = detail::DotArrays(delta, boxes.xAxis); auto locY = detail::DotArrays(delta, boxes.yAxis); auto locZ = detail::DotArrays(delta, boxes.zAxis); auto lxArr = locX.template Store(); auto lyArr = locY.template Store(); auto lzArr = locZ.template Store(); auto rArr = radii.template Store(); std::array, N> sizeLanes; for (std::uint8_t i = 0; i < N; ++i) { sizeLanes[i] = boxes.halfSize[i].template Store(); } constexpr float maxF = std::numeric_limits::max(); alignas(64) std::array out{}; for (std::uint8_t i = 0; i < Total; ++i) { std::uint8_t batchIdx = i / Packing; std::uint8_t subIdx = i % Packing; float lx = lxArr[i], ly = lyArr[i], lz = lzArr[i]; float sx = sizeLanes[batchIdx][subIdx * 3 + 0]; float sy = sizeLanes[batchIdx][subIdx * 3 + 1]; float sz = sizeLanes[batchIdx][subIdx * 3 + 2]; float cx = std::clamp(lx, -sx, sx); float cy = std::clamp(ly, -sy, sy); float cz = std::clamp(lz, -sz, sz); float dx = lx - cx, dy = ly - cy, dz = lz - cz; float distSq = dx * dx + dy * dy + dz * dz; float r = rArr[i]; // Returns 0.0 on hit, max on miss — same "t-like" output signature // as the ray-vs-X tests. out[i] = (distSq <= r * r) ? 0.0f : maxF; } return OutVec(out.data()); } // Eight local corners of a unit OBB transformed by `matrix`. Uses one // batched 4-pair Dot per result row (x, y, z), reproducing the corners in // two groups of four. export inline std::array, 8> GetOBBCorners( VectorF32<3, 1> size, MatrixRowMajor matrix ) { std::array sz = size.template Store(); const float sx = sz[0], sy = sz[1], sz_ = sz[2]; VectorF32<4, 1> mx = matrix.rows[0]; VectorF32<4, 1> my = matrix.rows[1]; VectorF32<4, 1> mz = matrix.rows[2]; // Eight homogeneous corner vectors with w=1 so the translation column // of `matrix` participates in the dot product. alignas(16) float c0[4] = { -sx, -sy, -sz_, 1.0f }; alignas(16) float c1[4] = { sx, -sy, -sz_, 1.0f }; alignas(16) float c2[4] = { -sx, sy, -sz_, 1.0f }; alignas(16) float c3[4] = { sx, sy, -sz_, 1.0f }; alignas(16) float c4[4] = { -sx, -sy, sz_, 1.0f }; alignas(16) float c5[4] = { sx, -sy, sz_, 1.0f }; alignas(16) float c6[4] = { -sx, sy, sz_, 1.0f }; alignas(16) float c7[4] = { sx, sy, sz_, 1.0f }; VectorF32<4, 1> v0(c0), v1(c1), v2(c2), v3(c3); VectorF32<4, 1> v4(c4), v5(c5), v6(c6), v7(c7); // First four corners (0..3): batch x, y, z dots. VectorF32<1, 4> xLo = VectorF32<4, 1>::Dot(mx, v0, mx, v1, mx, v2, mx, v3); VectorF32<1, 4> yLo = VectorF32<4, 1>::Dot(my, v0, my, v1, my, v2, my, v3); VectorF32<1, 4> zLo = VectorF32<4, 1>::Dot(mz, v0, mz, v1, mz, v2, mz, v3); // Second four corners (4..7). VectorF32<1, 4> xHi = VectorF32<4, 1>::Dot(mx, v4, mx, v5, mx, v6, mx, v7); VectorF32<1, 4> yHi = VectorF32<4, 1>::Dot(my, v4, my, v5, my, v6, my, v7); VectorF32<1, 4> zHi = VectorF32<4, 1>::Dot(mz, v4, mz, v5, mz, v6, mz, v7); std::array xLoA = xLo.template Store(); std::array yLoA = yLo.template Store(); std::array zLoA = zLo.template Store(); std::array xHiA = xHi.template Store(); std::array yHiA = yHi.template Store(); std::array zHiA = zHi.template Store(); std::array, 8> result; for (std::uint8_t i = 0; i < 4; ++i) { alignas(16) float buf[4] = { xLoA[i], yLoA[i], zLoA[i], 0.0f }; result[i] = VectorF32<3, 1>(buf); } for (std::uint8_t i = 0; i < 4; ++i) { alignas(16) float buf[4] = { xHiA[i], yHiA[i], zHiA[i], 0.0f }; result[4 + i] = VectorF32<3, 1>(buf); } return result; } // SAT against the 15 separating axis candidates (3 from box A, 3 from // box B, 9 cross products). Returns 0.0 per pair when the boxes overlap // and max-float when a separating axis was found, matching the // "smaller-is-closer" convention of the ray-vs-X tests. // // The corner-free formulation: for an OBB (origin O, unit axes X/Y/Z, // half-extents h) and a separating-axis candidate a, the projection // interval is centered at O·a with radius hx|X·a| + hy|Y·a| + hz|Z·a|. // Each axis therefore only needs four dot products per box (and a couple // of fused-multiply-adds) instead of eight corner projections — every // sub-pair runs in parallel inside the SIMD lanes. export template ::OptimalPacking> inline VectorF32<1, static_cast(Packing * VectorF32<3, Packing>::BatchSize)> IntersectionTestOrientedBoxOrientedBox( PackedOBBs const& a, PackedOBBs const& b ) { using PVec = VectorF32<3, Packing>; constexpr std::uint8_t N = PVec::BatchSize; constexpr std::uint8_t Total = Packing * N; using OutVec = VectorF32<1, Total>; // Per-pair half-extents pulled out of each PackedOBBs into flat // VectorF32<1, Total> registers so they can multiply the projection // dots directly. OutVec halfA_x = detail::ExtractComponent<0, Packing>(a.halfSize); OutVec halfA_y = detail::ExtractComponent<1, Packing>(a.halfSize); OutVec halfA_z = detail::ExtractComponent<2, Packing>(a.halfSize); OutVec halfB_x = detail::ExtractComponent<0, Packing>(b.halfSize); OutVec halfB_y = detail::ExtractComponent<1, Packing>(b.halfSize); OutVec halfB_z = detail::ExtractComponent<2, Packing>(b.halfSize); constexpr float maxF = std::numeric_limits::max(); alignas(64) std::array out{}; for (std::uint8_t i = 0; i < Total; ++i) out[i] = 0.0f; // start: overlap auto axesOfA = [&](std::uint8_t i) -> std::array const& { return (i == 0) ? a.xAxis : (i == 1) ? a.yAxis : a.zAxis; }; auto axesOfB = [&](std::uint8_t i) -> std::array const& { return (i == 0) ? b.xAxis : (i == 1) ? b.yAxis : b.zAxis; }; // For each separating-axis candidate, compute per-pair min/max for // both boxes and OR the "separating" condition into `out`. auto checkAxis = [&](std::array const& axis) { OutVec cA = detail::DotArrays(a.origin, axis); OutVec dA_x = detail::DotArrays(a.xAxis, axis); OutVec dA_y = detail::DotArrays(a.yAxis, axis); OutVec dA_z = detail::DotArrays(a.zAxis, axis); OutVec rA = halfA_x * detail::AbsVec(dA_x) + halfA_y * detail::AbsVec(dA_y) + halfA_z * detail::AbsVec(dA_z); OutVec cB = detail::DotArrays(b.origin, axis); OutVec dB_x = detail::DotArrays(b.xAxis, axis); OutVec dB_y = detail::DotArrays(b.yAxis, axis); OutVec dB_z = detail::DotArrays(b.zAxis, axis); OutVec rB = halfB_x * detail::AbsVec(dB_x) + halfB_y * detail::AbsVec(dB_y) + halfB_z * detail::AbsVec(dB_z); OutVec minA = cA - rA; OutVec maxA = cA + rA; OutVec minB = cB - rB; OutVec maxB = cB + rB; auto minAArr = minA.template Store(); auto maxAArr = maxA.template Store(); auto minBArr = minB.template Store(); auto maxBArr = maxB.template Store(); for (std::uint8_t i = 0; i < Total; ++i) { // NaN comparisons (from degenerate cross axes) return false and // correctly leave `out[i]` untouched on this axis. if (maxAArr[i] < minBArr[i] || maxBArr[i] < minAArr[i]) { out[i] = maxF; } } }; checkAxis(a.xAxis); checkAxis(a.yAxis); checkAxis(a.zAxis); checkAxis(b.xAxis); checkAxis(b.yAxis); checkAxis(b.zAxis); // The 9 cross-product axes. Each batch slot's cross axes are computed // per-slot, then normalized together (one PVec::Normalize per cross // index processes N packed slots in parallel). for (std::uint8_t i = 0; i < 3; ++i) { auto const& aAx = axesOfA(i); for (std::uint8_t j = 0; j < 3; ++j) { auto const& bAx = axesOfB(j); std::array crossAx; for (std::uint8_t k = 0; k < N; ++k) crossAx[k] = PVec::Cross(aAx[k], bAx[k]); auto normalized = std::apply([](auto... args) { return PVec::Normalize(args...); }, crossAx); checkAxis(normalized); } } return OutVec(out.data()); } }