/* Crafter®.Math Copyright (C) 2026 Catcrafts® catcrafts.net This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 3.0 as published by the Free Software Foundation; This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ export module Crafter.Math:Intersection; import :VectorF32; import :MatrixRowMajor; import std; namespace Crafter { // All intersection tests are batched over four primitives at a time so they // feed the VectorF32<3,1>::Dot / Cross / Length / Normalize four-pair // overloads directly. The single-primitive case is just "pass the same // primitive four times and read lane 0" - there is no single-vector // fast-path because the SIMD pipelines want full lanes. // Möller-Trumbore against four triangles sharing one ray. Returns ray // parameter t per triangle, or float max where the ray misses. export inline VectorF32<1, 4> IntersectionTestRayTriangle( VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir, VectorF32<3, 1> aV0, VectorF32<3, 1> aV1, VectorF32<3, 1> aV2, VectorF32<3, 1> bV0, VectorF32<3, 1> bV1, VectorF32<3, 1> bV2, VectorF32<3, 1> cV0, VectorF32<3, 1> cV1, VectorF32<3, 1> cV2, VectorF32<3, 1> dV0, VectorF32<3, 1> dV1, VectorF32<3, 1> dV2 ) { VectorF32<3, 1> aE1 = aV1 - aV0; VectorF32<3, 1> aE2 = aV2 - aV0; VectorF32<3, 1> bE1 = bV1 - bV0; VectorF32<3, 1> bE2 = bV2 - bV0; VectorF32<3, 1> cE1 = cV1 - cV0; VectorF32<3, 1> cE2 = cV2 - cV0; VectorF32<3, 1> dE1 = dV1 - dV0; VectorF32<3, 1> dE2 = dV2 - dV0; VectorF32<3, 1> aH = VectorF32<3, 1>::Cross(rayDir, aE2); VectorF32<3, 1> bH = VectorF32<3, 1>::Cross(rayDir, bE2); VectorF32<3, 1> cH = VectorF32<3, 1>::Cross(rayDir, cE2); VectorF32<3, 1> dH = VectorF32<3, 1>::Cross(rayDir, dE2); VectorF32<3, 1> aS = rayOrigin - aV0; VectorF32<3, 1> bS = rayOrigin - bV0; VectorF32<3, 1> cS = rayOrigin - cV0; VectorF32<3, 1> dS = rayOrigin - dV0; VectorF32<3, 1> aQ = VectorF32<3, 1>::Cross(aS, aE1); VectorF32<3, 1> bQ = VectorF32<3, 1>::Cross(bS, bE1); VectorF32<3, 1> cQ = VectorF32<3, 1>::Cross(cS, cE1); VectorF32<3, 1> dQ = VectorF32<3, 1>::Cross(dS, dE1); // Four 3-component dots packed into one __m128 per call. VectorF32<1, 4> det = VectorF32<3, 1>::Dot( aE1, aH, bE1, bH, cE1, cH, dE1, dH); VectorF32<1, 4> uNum = VectorF32<3, 1>::Dot( aS, aH, bS, bH, cS, cH, dS, dH); VectorF32<1, 4> vNum = VectorF32<3, 1>::Dot( rayDir, aQ, rayDir, bQ, rayDir, cQ, rayDir, dQ); VectorF32<1, 4> tNum = VectorF32<3, 1>::Dot( aE2, aQ, bE2, bQ, cE2, cQ, dE2, dQ); std::array detArr = det.template Store(); std::array uArr = uNum.template Store(); std::array vArr = vNum.template Store(); std::array tArr = tNum.template Store(); constexpr float eps = std::numeric_limits::epsilon(); constexpr float maxF = std::numeric_limits::max(); alignas(16) std::array out{}; for (std::uint8_t i = 0; i < 4; ++i) { float d = detArr[i]; if (d <= eps) { out[i] = maxF; continue; } float invD = 1.0f / d; float u = uArr[i] * invD; if (u < 0.0f || u > 1.0f) { out[i] = maxF; continue; } float v = vArr[i] * invD; if (v < 0.0f || u + v > 1.0f) { out[i] = maxF; continue; } out[i] = tArr[i] * invD; } return VectorF32<1, 4>(out.data()); } // One ray against four spheres. radii must hold {rA, rB, rC, rD} in lanes // 0..3. export inline VectorF32<1, 4> IntersectionTestRaySphere( VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir, VectorF32<3, 1> posA, VectorF32<3, 1> posB, VectorF32<3, 1> posC, VectorF32<3, 1> posD, VectorF32<1, 4> radii ) { VectorF32<3, 1> sA = rayOrigin - posA; VectorF32<3, 1> sB = rayOrigin - posB; VectorF32<3, 1> sC = rayOrigin - posC; VectorF32<3, 1> sD = rayOrigin - posD; // dirDotS_i = rayDir · (rayOrigin - pos_i) VectorF32<1, 4> dirDotS = VectorF32<3, 1>::Dot( rayDir, sA, rayDir, sB, rayDir, sC, rayDir, sD); // sqDist_i = |rayOrigin - pos_i|² (a.k.a. LengthSq of the s vectors) VectorF32<1, 4> sqDist = VectorF32<3, 1>::LengthSq(sA, sB, sC, sD); // aScalar = rayDir · rayDir, broadcast across four lanes. VectorF32<1, 4> aScalar = VectorF32<3, 1>::LengthSq( rayDir, rayDir, rayDir, rayDir); VectorF32<1, 4> two(2.0f); VectorF32<1, 4> four(4.0f); VectorF32<1, 4> b = two * dirDotS; VectorF32<1, 4> c = sqDist - radii * radii; // discriminant = b² - 4·a·c VectorF32<1, 4> disc = b * b - four * aScalar * c; std::array discArr = disc.template Store(); std::array bArr = b.template Store(); std::array aArr = aScalar.template Store(); constexpr float maxF = std::numeric_limits::max(); alignas(16) std::array out{}; for (std::uint8_t i = 0; i < 4; ++i) { float d = discArr[i]; if (d < 0.0f) { out[i] = maxF; continue; } float sqrtD = std::sqrt(d); float t = -0.5f * (bArr[i] + sqrtD) / aArr[i]; out[i] = (t > 0.0f) ? t : maxF; } return VectorF32<1, 4>(out.data()); } // One ray against four OBBs. Each box is described by world-space position, // half-extent vector (per-axis sizes), and a unit quaternion rotation. export inline VectorF32<1, 4> IntersectionTestRayOrientedBox( VectorF32<3, 1> rayOrigin, VectorF32<3, 1> rayDir, VectorF32<3, 1> posA, VectorF32<3, 1> sizeA, VectorF32<4, 1> rotA, VectorF32<3, 1> posB, VectorF32<3, 1> sizeB, VectorF32<4, 1> rotB, VectorF32<3, 1> posC, VectorF32<3, 1> sizeC, VectorF32<4, 1> rotC, VectorF32<3, 1> posD, VectorF32<3, 1> sizeD, VectorF32<4, 1> rotD ) { // Conjugate quaternion: negate xyz, keep w. Negate<{true,true,true,false}> // is constant-folded into a single XOR with a mask vector. VectorF32<4, 1> invRotA = rotA.template Negate<{{true, true, true, false}}>(); VectorF32<4, 1> invRotB = rotB.template Negate<{{true, true, true, false}}>(); VectorF32<4, 1> invRotC = rotC.template Negate<{{true, true, true, false}}>(); VectorF32<4, 1> invRotD = rotD.template Negate<{{true, true, true, false}}>(); VectorF32<3, 1> localOriginA = VectorF32<3, 1>::Rotate(rayOrigin - posA, invRotA); VectorF32<3, 1> localOriginB = VectorF32<3, 1>::Rotate(rayOrigin - posB, invRotB); VectorF32<3, 1> localOriginC = VectorF32<3, 1>::Rotate(rayOrigin - posC, invRotC); VectorF32<3, 1> localOriginD = VectorF32<3, 1>::Rotate(rayOrigin - posD, invRotD); VectorF32<3, 1> localDirA = VectorF32<3, 1>::Rotate(rayDir, invRotA); VectorF32<3, 1> localDirB = VectorF32<3, 1>::Rotate(rayDir, invRotB); VectorF32<3, 1> localDirC = VectorF32<3, 1>::Rotate(rayDir, invRotC); VectorF32<3, 1> localDirD = VectorF32<3, 1>::Rotate(rayDir, invRotD); VectorF32<3, 1> halfA = sizeA * 0.5f; VectorF32<3, 1> halfB = sizeB * 0.5f; VectorF32<3, 1> halfC = sizeC * 0.5f; VectorF32<3, 1> halfD = sizeD * 0.5f; std::array, 4> origLanes{ localOriginA.template Store(), localOriginB.template Store(), localOriginC.template Store(), localOriginD.template Store(), }; std::array, 4> dirLanes{ localDirA.template Store(), localDirB.template Store(), localDirC.template Store(), localDirD.template Store(), }; std::array, 4> halfLanes{ halfA.template Store(), halfB.template Store(), halfC.template Store(), halfD.template Store(), }; constexpr float eps = std::numeric_limits::epsilon(); constexpr float maxF = std::numeric_limits::max(); alignas(16) std::array out{}; for (std::uint8_t b = 0; b < 4; ++b) { float tMin = 0.0f; float tMax = maxF; bool miss = false; for (std::uint8_t i = 0; i < 3; ++i) { float d = dirLanes[b][i]; float o = origLanes[b][i]; float h = halfLanes[b][i]; if (std::abs(d) < eps) { if (o < -h || o > h) { miss = true; break; } } else { float invD = 1.0f / d; float t1 = (-h - o) * invD; float t2 = ( h - o) * invD; if (t1 > t2) std::swap(t1, t2); tMin = std::max(tMin, t1); tMax = std::min(tMax, t2); if (tMin > tMax) { miss = true; break; } } } out[b] = miss ? maxF : (tMin >= 0.0f ? tMin : tMax); } return VectorF32<1, 4>(out.data()); } // One sphere against four OBBs. boxMatrix encodes rotation in m[r][0..2] // and translation in m[r][3]. export inline VectorF32<1, 4> IntersectionTestSphereOrientedBox( VectorF32<3, 1> sphereCenter, VectorF32<1, 4> radii, VectorF32<3, 1> sizeA, MatrixRowMajor boxA, VectorF32<3, 1> sizeB, MatrixRowMajor boxB, VectorF32<3, 1> sizeC, MatrixRowMajor boxC, VectorF32<3, 1> sizeD, MatrixRowMajor boxD ) { auto perBox = [&](MatrixRowMajor const& m, VectorF32<3, 1> const& size, VectorF32<3, 1>& xAxis, VectorF32<3, 1>& yAxis, VectorF32<3, 1>& zAxis, VectorF32<3, 1>& delta) { // Existing semantics: the OBB axes are read from the rows of the // upper 3x3 block, and the translation column is gathered from the // w lane of each row. std::array r0 = m.rows[0].template Store(); std::array r1 = m.rows[1].template Store(); std::array r2 = m.rows[2].template Store(); alignas(16) float xBuf[4] = { r0[0], r0[1], r0[2], 0.0f }; alignas(16) float yBuf[4] = { r1[0], r1[1], r1[2], 0.0f }; alignas(16) float zBuf[4] = { r2[0], r2[1], r2[2], 0.0f }; alignas(16) float oBuf[4] = { r0[3], r1[3], r2[3], 0.0f }; xAxis = VectorF32<3, 1>(xBuf); yAxis = VectorF32<3, 1>(yBuf); zAxis = VectorF32<3, 1>(zBuf); VectorF32<3, 1> origin(oBuf); delta = sphereCenter - origin; (void)size; }; VectorF32<3, 1> xA, yA, zA, dA; VectorF32<3, 1> xB, yB, zB, dB; VectorF32<3, 1> xC, yC, zC, dC; VectorF32<3, 1> xD, yD, zD, dD; perBox(boxA, sizeA, xA, yA, zA, dA); perBox(boxB, sizeB, xB, yB, zB, dB); perBox(boxC, sizeC, xC, yC, zC, dC); perBox(boxD, sizeD, xD, yD, zD, dD); // Local sphere center per box: project delta onto each box axis. We // produce {lx, ly, lz, lx, ly, lz, lx, ly, lz, lx, ly, lz} as three // packed 4-wide Dot results (one Dot per axis). VectorF32<1, 4> locX = VectorF32<3, 1>::Dot( dA, xA, dB, xB, dC, xC, dD, xD); VectorF32<1, 4> locY = VectorF32<3, 1>::Dot( dA, yA, dB, yB, dC, yC, dD, yD); VectorF32<1, 4> locZ = VectorF32<3, 1>::Dot( dA, zA, dB, zB, dC, zC, dD, zD); std::array lxArr = locX.template Store(); std::array lyArr = locY.template Store(); std::array lzArr = locZ.template Store(); std::array rArr = radii.template Store(); std::array, 4> sizeLanes{ sizeA.template Store(), sizeB.template Store(), sizeC.template Store(), sizeD.template Store(), }; alignas(16) std::array out{}; for (std::uint8_t i = 0; i < 4; ++i) { float lx = lxArr[i], ly = lyArr[i], lz = lzArr[i]; float sx = sizeLanes[i][0], sy = sizeLanes[i][1], sz = sizeLanes[i][2]; float cx = std::clamp(lx, -sx, sx); float cy = std::clamp(ly, -sy, sy); float cz = std::clamp(lz, -sz, sz); float dx = lx - cx, dy = ly - cy, dz = lz - cz; float distSq = dx * dx + dy * dy + dz * dz; float r = rArr[i]; // Returns 0.0 on hit, max on miss - keeps a consistent // "t-like" output signature with the other intersection tests. out[i] = (distSq <= r * r) ? 0.0f : std::numeric_limits::max(); } return VectorF32<1, 4>(out.data()); } // Eight local corners of a unit OBB transformed by `matrix`. Uses one // batched 4-pair Dot per result row (x, y, z), reproducing the corners in // two groups of four. export inline std::array, 8> GetOBBCorners( VectorF32<3, 1> size, MatrixRowMajor matrix ) { std::array sz = size.template Store(); const float sx = sz[0], sy = sz[1], sz_ = sz[2]; VectorF32<4, 1> mx = matrix.rows[0]; VectorF32<4, 1> my = matrix.rows[1]; VectorF32<4, 1> mz = matrix.rows[2]; // Eight homogeneous corner vectors with w=1 so the translation column // of `matrix` participates in the dot product. alignas(16) float c0[4] = { -sx, -sy, -sz_, 1.0f }; alignas(16) float c1[4] = { sx, -sy, -sz_, 1.0f }; alignas(16) float c2[4] = { -sx, sy, -sz_, 1.0f }; alignas(16) float c3[4] = { sx, sy, -sz_, 1.0f }; alignas(16) float c4[4] = { -sx, -sy, sz_, 1.0f }; alignas(16) float c5[4] = { sx, -sy, sz_, 1.0f }; alignas(16) float c6[4] = { -sx, sy, sz_, 1.0f }; alignas(16) float c7[4] = { sx, sy, sz_, 1.0f }; VectorF32<4, 1> v0(c0), v1(c1), v2(c2), v3(c3); VectorF32<4, 1> v4(c4), v5(c5), v6(c6), v7(c7); // First four corners (0..3): batch x, y, z dots. VectorF32<1, 4> xLo = VectorF32<4, 1>::Dot(mx, v0, mx, v1, mx, v2, mx, v3); VectorF32<1, 4> yLo = VectorF32<4, 1>::Dot(my, v0, my, v1, my, v2, my, v3); VectorF32<1, 4> zLo = VectorF32<4, 1>::Dot(mz, v0, mz, v1, mz, v2, mz, v3); // Second four corners (4..7). VectorF32<1, 4> xHi = VectorF32<4, 1>::Dot(mx, v4, mx, v5, mx, v6, mx, v7); VectorF32<1, 4> yHi = VectorF32<4, 1>::Dot(my, v4, my, v5, my, v6, my, v7); VectorF32<1, 4> zHi = VectorF32<4, 1>::Dot(mz, v4, mz, v5, mz, v6, mz, v7); std::array xLoA = xLo.template Store(); std::array yLoA = yLo.template Store(); std::array zLoA = zLo.template Store(); std::array xHiA = xHi.template Store(); std::array yHiA = yHi.template Store(); std::array zHiA = zHi.template Store(); std::array, 8> result; for (std::uint8_t i = 0; i < 4; ++i) { alignas(16) float buf[4] = { xLoA[i], yLoA[i], zLoA[i], 0.0f }; result[i] = VectorF32<3, 1>(buf); } for (std::uint8_t i = 0; i < 4; ++i) { alignas(16) float buf[4] = { xHiA[i], yHiA[i], zHiA[i], 0.0f }; result[4 + i] = VectorF32<3, 1>(buf); } return result; } // SAT against fifteen separating axes (3 box-A, 3 box-B, 9 cross products). // We compute every corner projection with batched 4-pair Dots: each axis // projects four corners per call, two calls per axis covers the 8 corners. export inline bool IntersectionTestOrientedBoxOrientedBox( VectorF32<3, 1> sizeA, MatrixRowMajor boxA, VectorF32<3, 1> sizeB, MatrixRowMajor boxB ) { std::array, 8> cornersA = GetOBBCorners(sizeA, boxA); std::array, 8> cornersB = GetOBBCorners(sizeB, boxB); // Axes are the upper-3 lanes of each matrix row (same convention as // SphereOrientedBox). ExtractLo<3> just retypes the SIMD register; the // 4th lane is ignored by the Len=3 ops below. std::array, 3> axesA = { boxA.rows[0].template ExtractLo<3>(), boxA.rows[1].template ExtractLo<3>(), boxA.rows[2].template ExtractLo<3>(), }; std::array, 3> axesB = { boxB.rows[0].template ExtractLo<3>(), boxB.rows[1].template ExtractLo<3>(), boxB.rows[2].template ExtractLo<3>(), }; std::array, 15> axes{}; axes[0] = axesA[0]; axes[1] = axesA[1]; axes[2] = axesA[2]; axes[3] = axesB[0]; axes[4] = axesB[1]; axes[5] = axesB[2]; // Normalize all nine cross axes together with a single batched // Normalize call (Packing=3 not in the API, so two calls of four + // one of one would be needed; for now just normalize in two batches // of four and the trailing one inline). std::array, 9> crossAxes{}; std::uint8_t k = 0; for (std::uint8_t i = 0; i < 3; ++i) { for (std::uint8_t j = 0; j < 3; ++j) { crossAxes[k++] = VectorF32<3, 1>::Cross(axesA[i], axesB[j]); } } auto norm0 = VectorF32<3, 1>::Normalize(crossAxes[0], crossAxes[1], crossAxes[2], crossAxes[3]); auto norm1 = VectorF32<3, 1>::Normalize(crossAxes[4], crossAxes[5], crossAxes[6], crossAxes[7]); auto norm2 = VectorF32<3, 1>::Normalize(crossAxes[8], crossAxes[8], crossAxes[8], crossAxes[8]); axes[6] = std::get<0>(norm0); axes[7] = std::get<1>(norm0); axes[8] = std::get<2>(norm0); axes[9] = std::get<3>(norm0); axes[10] = std::get<0>(norm1); axes[11] = std::get<1>(norm1); axes[12] = std::get<2>(norm1); axes[13] = std::get<3>(norm1); axes[14] = std::get<0>(norm2); for (std::uint8_t axisIdx = 0; axisIdx < 15; ++axisIdx) { VectorF32<3, 1> axis = axes[axisIdx]; // Project all 8 corners of each box onto `axis` using two batched // 4-pair Dot calls (lo and hi corners). VectorF32<1, 4> projA_lo = VectorF32<3, 1>::Dot( cornersA[0], axis, cornersA[1], axis, cornersA[2], axis, cornersA[3], axis); VectorF32<1, 4> projA_hi = VectorF32<3, 1>::Dot( cornersA[4], axis, cornersA[5], axis, cornersA[6], axis, cornersA[7], axis); VectorF32<1, 4> projB_lo = VectorF32<3, 1>::Dot( cornersB[0], axis, cornersB[1], axis, cornersB[2], axis, cornersB[3], axis); VectorF32<1, 4> projB_hi = VectorF32<3, 1>::Dot( cornersB[4], axis, cornersB[5], axis, cornersB[6], axis, cornersB[7], axis); std::array aLo = projA_lo.template Store(); std::array aHi = projA_hi.template Store(); std::array bLo = projB_lo.template Store(); std::array bHi = projB_hi.template Store(); float minA = aLo[0], maxA = aLo[0]; for (std::uint8_t i = 1; i < 4; ++i) { minA = std::min(minA, aLo[i]); maxA = std::max(maxA, aLo[i]); } for (std::uint8_t i = 0; i < 4; ++i) { minA = std::min(minA, aHi[i]); maxA = std::max(maxA, aHi[i]); } float minB = bLo[0], maxB = bLo[0]; for (std::uint8_t i = 1; i < 4; ++i) { minB = std::min(minB, bLo[i]); maxB = std::max(maxB, bLo[i]); } for (std::uint8_t i = 0; i < 4; ++i) { minB = std::min(minB, bHi[i]); maxB = std::max(maxB, bHi[i]); } if (maxA < minB || maxB < minA) return false; } return true; } }