diff --git a/interfaces/Crafter.Math-Basic.cppm b/interfaces/Crafter.Math-Basic.cppm index 83095c1..0c8a877 100755 --- a/interfaces/Crafter.Math-Basic.cppm +++ b/interfaces/Crafter.Math-Basic.cppm @@ -28,10 +28,8 @@ namespace Crafter { return degrees * (std::numbers::pi / 180); } - #ifdef __x86_64 - #ifndef __AVX512FP16__ + #if (defined(__x86_64) && !defined(__AVX512FP16__)) || !defined(__FLT16_MAX__) export template using VectorF16 = VectorF32; #endif - #endif } \ No newline at end of file diff --git a/interfaces/Crafter.Math-Common.cppm b/interfaces/Crafter.Math-Common.cppm index 4cef9f6..f690cee 100644 --- a/interfaces/Crafter.Math-Common.cppm +++ b/interfaces/Crafter.Math-Common.cppm @@ -2,20 +2,31 @@ module; #ifdef __x86_64 #include #endif +#ifdef __wasm_simd128__ +#include +#endif export module Crafter.Math:Common; import std; +// VectorF16 exists as a real struct when _Float16 is available AND we are not +// on x86_64 without AVX512FP16 (that path aliases VectorF16 to VectorF32 in +// Crafter.Math:Basic for performance). Each translation unit that needs this +// distinction redefines the same condition since macros do not cross module +// boundaries. +#if defined(__FLT16_MAX__) && (!defined(__x86_64) || defined(__AVX512FP16__)) namespace Crafter { - #ifdef __AVX512FP16__ export template struct VectorF16; - #endif +} +#endif + +namespace Crafter { export template struct VectorF32; template struct VectorBase { - #ifdef __AVX512FP16__ + #if defined(__FLT16_MAX__) && (!defined(__x86_64) || defined(__AVX512FP16__)) template friend struct VectorF16; #endif @@ -33,8 +44,13 @@ namespace Crafter { } #ifdef __x86_64 - using VectorType = std::conditional_t, - + using VectorType = std::conditional_t< + #ifdef __FLT16_MAX__ + std::is_same_v + #else + false + #endif + , #ifdef __AVX512FP16__ std::conditional_t<(Len * Packing > 16), __m512h, std::conditional_t<(Len * Packing > 8), __m256h, __m128h>>, @@ -45,9 +61,13 @@ namespace Crafter { std::conditional_t<(Len * Packing > 8), __m512, std::conditional_t<(Len * Packing > 4), __m256, __m128>> >; + #elif defined(__wasm_simd128__) + using VectorType = v128_t; + #else + using VectorType = std::array; + #endif VectorType v; - #endif public: @@ -56,6 +76,10 @@ namespace Crafter { #ifdef __AVX512F__ static constexpr std::uint8_t Max = 64; + #elif defined(__wasm_simd128__) + // WASM SIMD only has 128-bit vectors; cap at 16 bytes so the entire + // VectorType always fits in a single v128_t. + static constexpr std::uint8_t Max = 16; #else static constexpr std::uint8_t Max = 32; #endif diff --git a/interfaces/Crafter.Math-VectorF16.cppm b/interfaces/Crafter.Math-VectorF16.cppm index 371e5a0..d893f62 100755 --- a/interfaces/Crafter.Math-VectorF16.cppm +++ b/interfaces/Crafter.Math-VectorF16.cppm @@ -24,6 +24,11 @@ export module Crafter.Math:VectorF16; import std; import :Common; +// A real VectorF16 struct is provided only when _Float16 is available AND we +// are either on an x86_64 build with AVX512FP16 (intrinsic impl below) or on a +// non-x86_64 target (scalar fallback further below). On x86_64 without +// AVX512FP16 or anywhere without _Float16, Crafter.Math:Basic aliases +// VectorF16 to VectorF32 instead. #ifdef __AVX512FP16__ namespace Crafter { export template @@ -1027,6 +1032,259 @@ namespace Crafter { } +#elif !defined(__x86_64) && defined(__FLT16_MAX__) +// Scalar software fallback for non-x86_64 targets that still have _Float16. +namespace Crafter { + export template + struct VectorF16 : public VectorBase { + template + friend struct VectorF16; + using Base = VectorBase; + static constexpr std::uint8_t NElems = Base::AlignmentElement; + + constexpr VectorF16() = default; + constexpr VectorF16(typename Base::VectorType vv) { + this->v = vv; + } + constexpr VectorF16(const _Float16* vB) { Load(vB); } + constexpr VectorF16(_Float16 val) { + for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = val; + } + + constexpr void Load(const _Float16* vB) { + for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = vB[i]; + } + constexpr void Store(_Float16* vB) const { + for (std::uint8_t i = 0; i < NElems; ++i) vB[i] = this->v[i]; + } + + template + constexpr std::array<_Float16, NElems> Store() const { + std::array<_Float16, NElems> r{}; + Store(r.data()); + return r; + } + + template + constexpr operator VectorF16() const { + VectorF16 r; + const std::uint8_t copyLen = (BLen < Len) ? BLen : Len; + const std::uint8_t copyPack = (BPacking < Packing) ? BPacking : Packing; + for (std::uint8_t p = 0; p < copyPack; ++p) + for (std::uint8_t i = 0; i < copyLen; ++i) + r.v[p * BLen + i] = this->v[p * Len + i]; + return r; + } + + constexpr VectorF16 operator+(VectorF16 b) const { + VectorF16 r; + for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] + b.v[i]; + return r; + } + constexpr VectorF16 operator-(VectorF16 b) const { + VectorF16 r; + for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] - b.v[i]; + return r; + } + constexpr VectorF16 operator*(VectorF16 b) const { + VectorF16 r; + for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] * b.v[i]; + return r; + } + constexpr VectorF16 operator/(VectorF16 b) const { + VectorF16 r; + for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] / b.v[i]; + return r; + } + + constexpr void operator+=(VectorF16 b) { for (std::uint8_t i=0;iv[i] += b.v[i]; } + constexpr void operator-=(VectorF16 b) { for (std::uint8_t i=0;iv[i] -= b.v[i]; } + constexpr void operator*=(VectorF16 b) { for (std::uint8_t i=0;iv[i] *= b.v[i]; } + constexpr void operator/=(VectorF16 b) { for (std::uint8_t i=0;iv[i] /= b.v[i]; } + + constexpr VectorF16 operator+(_Float16 b) const { return *this + VectorF16(b); } + constexpr VectorF16 operator-(_Float16 b) const { return *this - VectorF16(b); } + constexpr VectorF16 operator*(_Float16 b) const { return *this * VectorF16(b); } + constexpr VectorF16 operator/(_Float16 b) const { return *this / VectorF16(b); } + constexpr void operator+=(_Float16 b) { *this += VectorF16(b); } + constexpr void operator-=(_Float16 b) { *this -= VectorF16(b); } + constexpr void operator*=(_Float16 b) { *this *= VectorF16(b); } + constexpr void operator/=(_Float16 b) { *this /= VectorF16(b); } + + constexpr VectorF16 operator-() const { + VectorF16 r; + for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = -this->v[i]; + return r; + } + + constexpr bool operator==(VectorF16 b) const { + for (std::uint8_t p = 0; p < Packing; ++p) + for (std::uint8_t i = 0; i < Len; ++i) + if (this->v[p * Len + i] != b.v[p * Len + i]) return false; + return true; + } + constexpr bool operator!=(VectorF16 b) const { return !(*this == b); } + + template + constexpr VectorF16 ExtractLo() const { + VectorF16 r; + for (std::uint8_t p = 0; p < Packing; ++p) + for (std::uint8_t i = 0; i < ExtractLen; ++i) + r.v[p * ExtractLen + i] = this->v[p * Len + i]; + return r; + } + + // Transcendentals are computed via float since libstdc++ doesn't always + // provide overloads for _Float16; the result is rounded back to half. + constexpr VectorF16 Cos() const { + VectorF16 r; + for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = static_cast<_Float16>(std::cos(static_cast(this->v[i]))); + return r; + } + constexpr VectorF16 Sin() const { + VectorF16 r; + for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = static_cast<_Float16>(std::sin(static_cast(this->v[i]))); + return r; + } + constexpr std::tuple, VectorF16> SinCos() const { + return { Sin(), Cos() }; + } + + template values> + constexpr VectorF16 Negate() const { + VectorF16 r; + for (std::uint8_t p = 0; p < Packing; ++p) + for (std::uint8_t i = 0; i < Len; ++i) + r.v[p * Len + i] = values[i] ? -this->v[p * Len + i] : this->v[p * Len + i]; + return r; + } + + static constexpr VectorF16 MulitplyAdd(VectorF16 a, VectorF16 b, VectorF16 add) { + VectorF16 r; + for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = a.v[i] * b.v[i] + add.v[i]; + return r; + } + static constexpr VectorF16 MulitplySub(VectorF16 a, VectorF16 b, VectorF16 sub) { + VectorF16 r; + for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = a.v[i] * b.v[i] - sub.v[i]; + return r; + } + + constexpr static VectorF16 Cross(VectorF16 a, VectorF16 b) requires(Len == 3) { + VectorF16 r; + for (std::uint8_t p = 0; p < Packing; ++p) { + const std::uint8_t base = p * 3; + r.v[base + 0] = a.v[base + 1] * b.v[base + 2] - a.v[base + 2] * b.v[base + 1]; + r.v[base + 1] = a.v[base + 2] * b.v[base + 0] - a.v[base + 0] * b.v[base + 2]; + r.v[base + 2] = a.v[base + 0] * b.v[base + 1] - a.v[base + 1] * b.v[base + 0]; + } + return r; + } + + template ShuffleValues> + constexpr VectorF16 Shuffle() const { + VectorF16 r; + for (std::uint8_t p = 0; p < Packing; ++p) + for (std::uint8_t i = 0; i < Len; ++i) + r.v[p * Len + i] = this->v[p * Len + ShuffleValues[i]]; + return r; + } + + template ShuffleValues> + constexpr static VectorF16 Blend(VectorF16 a, VectorF16 b) { + VectorF16 r; + for (std::uint8_t p = 0; p < Packing; ++p) + for (std::uint8_t i = 0; i < Len; ++i) + r.v[p * Len + i] = ShuffleValues[i] ? b.v[p * Len + i] : a.v[p * Len + i]; + return r; + } + + template + requires((std::is_same_v> && ...)) + constexpr static auto LengthSq(VectorF16 first, Rest... rest) { + constexpr std::uint8_t N = 1 + sizeof...(Rest); + VectorF16<1, static_cast(Packing * N)> r; + std::array, N> args{ first, rest... }; + for (std::uint8_t i = 0; i < N; ++i) + for (std::uint8_t p = 0; p < Packing; ++p) { + _Float16 acc = _Float16(0); + for (std::uint8_t k = 0; k < Len; ++k) { + _Float16 x = args[i].v[p * Len + k]; + acc += x * x; + } + r.v[i * Packing + p] = acc; + } + return r; + } + + template + requires((std::is_same_v> && ...)) + constexpr static auto Length(VectorF16 first, Rest... rest) { + auto sq = LengthSq(first, rest...); + for (std::uint8_t i = 0; i < decltype(sq)::NElems; ++i) + sq.v[i] = static_cast<_Float16>(std::sqrt(static_cast(sq.v[i]))); + return sq; + } + + template + requires((std::is_same_v> && ...)) + constexpr static auto Normalize(VectorF16 first, Rest... rest) { + auto normOne = [](VectorF16 u) { + VectorF16 out; + for (std::uint8_t p = 0; p < Packing; ++p) { + float acc = 0.0f; + for (std::uint8_t k = 0; k < Len; ++k) { + float x = static_cast(u.v[p * Len + k]); + acc += x * x; + } + _Float16 invLen = acc > 0.0f ? static_cast<_Float16>(1.0f / std::sqrt(acc)) : _Float16(0); + for (std::uint8_t k = 0; k < Len; ++k) + out.v[p * Len + k] = u.v[p * Len + k] * invLen; + } + return out; + }; + return std::make_tuple(normOne(first), normOne(rest)...); + } + + constexpr static VectorF16 Rotate(VectorF16<3, Packing> v, VectorF16<4, Packing> q) requires(Len == 3) { + VectorF16<3, Packing> qv; + VectorF16<3, Packing> qwBroadcast; + for (std::uint8_t p = 0; p < Packing; ++p) { + qv.v[p * 3 + 0] = q.v[p * 4 + 0]; + qv.v[p * 3 + 1] = q.v[p * 4 + 1]; + qv.v[p * 3 + 2] = q.v[p * 4 + 2]; + for (std::uint8_t i = 0; i < 3; ++i) qwBroadcast.v[p * 3 + i] = q.v[p * 4 + 3]; + } + VectorF16<3, Packing> t = Cross(qv, v) * _Float16(2); + return v + t * qwBroadcast + Cross(qv, t); + } + + constexpr static VectorF16<3, Packing> RotatePivot(VectorF16<3, Packing> v, VectorF16<4, Packing> q, VectorF16<3, Packing> pivot) requires(Len == 3) { + VectorF16<3, Packing> translated = v - pivot; + return Rotate(translated, q) + pivot; + } + + constexpr static VectorF16<4, Packing> QuanternionFromEuler(VectorF16<3, Packing> eulerHalf) requires(Len == 4) { + VectorF16<4, Packing> r; + for (std::uint8_t p = 0; p < Packing; ++p) { + float roll = static_cast(eulerHalf.v[p * 3 + 0]); + float pitch = static_cast(eulerHalf.v[p * 3 + 1]); + float yaw = static_cast(eulerHalf.v[p * 3 + 2]); + float sr = std::sin(roll), cr = std::cos(roll); + float sp = std::sin(pitch), cp = std::cos(pitch); + float sy = std::sin(yaw), cy = std::cos(yaw); + r.v[p * 4 + 0] = static_cast<_Float16>(sr * cp * cy - cr * sp * sy); + r.v[p * 4 + 1] = static_cast<_Float16>(cr * sp * cy + sr * cp * sy); + r.v[p * 4 + 2] = static_cast<_Float16>(cr * cp * sy - sr * sp * cy); + r.v[p * 4 + 3] = static_cast<_Float16>(cr * cp * cy + sr * sp * sy); + } + return r; + } + }; +} +#endif + +#if defined(__FLT16_MAX__) && (!defined(__x86_64) || defined(__AVX512FP16__)) export template struct std::formatter> : std::formatter { constexpr auto format(const Crafter::VectorF16& obj, format_context& ctx) const { diff --git a/interfaces/Crafter.Math-VectorF32.cppm b/interfaces/Crafter.Math-VectorF32.cppm index aad1d37..cc86764 100755 --- a/interfaces/Crafter.Math-VectorF32.cppm +++ b/interfaces/Crafter.Math-VectorF32.cppm @@ -20,6 +20,9 @@ module; #ifdef __x86_64 #include #endif +#ifdef __wasm_simd128__ +#include +#endif export module Crafter.Math:VectorF32; import std; import :Common; @@ -1383,10 +1386,519 @@ namespace Crafter { return row1; } }; +#elif defined(__wasm_simd128__) + // WebAssembly SIMD128 implementation. VectorType is always v128_t and we + // cap Len*Packing*sizeof(float) at 16 bytes (i.e. up to 4 floats per + // vector) in Common.cppm so a single v128_t covers every instantiation. + // Operations without a direct SIMD equivalent (Shuffle with runtime indices, + // transcendentals, etc.) round-trip through a float[4] scratch buffer. + export template + struct VectorF32 : public VectorBase { + template + friend struct VectorF32; + using Base = VectorBase; + static constexpr std::uint8_t NElems = Base::AlignmentElement; + static_assert(NElems == 4, "WASM SIMD VectorF32 assumes 4-lane vectors"); + + constexpr VectorF32() = default; + constexpr VectorF32(v128_t vv) { this->v = vv; } + constexpr VectorF32(const float* vB) { Load(vB); } + constexpr VectorF32(float val) { this->v = wasm_f32x4_splat(val); } + + constexpr void Load(const float* vB) { this->v = wasm_v128_load(vB); } + constexpr void Store(float* vB) const { wasm_v128_store(vB, this->v); } + + template + constexpr std::array Store() const { + std::array r{}; + Store(r.data()); + return r; + } + + template + constexpr operator VectorF32() const { + alignas(16) float tmp[4]; + wasm_v128_store(tmp, this->v); + alignas(16) float out[4] = {0,0,0,0}; + const std::uint8_t copyLen = (BLen < Len) ? BLen : Len; + const std::uint8_t copyPack = (BPacking < Packing) ? BPacking : Packing; + for (std::uint8_t p = 0; p < copyPack; ++p) + for (std::uint8_t i = 0; i < copyLen; ++i) + out[p * BLen + i] = tmp[p * Len + i]; + return VectorF32(wasm_v128_load(out)); + } + + constexpr VectorF32 operator+(VectorF32 b) const { return VectorF32(wasm_f32x4_add(this->v, b.v)); } + constexpr VectorF32 operator-(VectorF32 b) const { return VectorF32(wasm_f32x4_sub(this->v, b.v)); } + constexpr VectorF32 operator*(VectorF32 b) const { return VectorF32(wasm_f32x4_mul(this->v, b.v)); } + constexpr VectorF32 operator/(VectorF32 b) const { return VectorF32(wasm_f32x4_div(this->v, b.v)); } + constexpr void operator+=(VectorF32 b) { this->v = wasm_f32x4_add(this->v, b.v); } + constexpr void operator-=(VectorF32 b) { this->v = wasm_f32x4_sub(this->v, b.v); } + constexpr void operator*=(VectorF32 b) { this->v = wasm_f32x4_mul(this->v, b.v); } + constexpr void operator/=(VectorF32 b) { this->v = wasm_f32x4_div(this->v, b.v); } + + constexpr VectorF32 operator+(float b) const { return *this + VectorF32(b); } + constexpr VectorF32 operator-(float b) const { return *this - VectorF32(b); } + constexpr VectorF32 operator*(float b) const { return *this * VectorF32(b); } + constexpr VectorF32 operator/(float b) const { return *this / VectorF32(b); } + constexpr void operator+=(float b) { *this += VectorF32(b); } + constexpr void operator-=(float b) { *this -= VectorF32(b); } + constexpr void operator*=(float b) { *this *= VectorF32(b); } + constexpr void operator/=(float b) { *this /= VectorF32(b); } + + constexpr VectorF32 operator-() const { return VectorF32(wasm_f32x4_neg(this->v)); } + + constexpr bool operator==(VectorF32 b) const { + return wasm_i32x4_bitmask(wasm_f32x4_eq(this->v, b.v)) == 0b1111; + } + constexpr bool operator!=(VectorF32 b) const { return !(*this == b); } + + template + constexpr VectorF32 ExtractLo() const { + alignas(16) float tmp[4]; wasm_v128_store(tmp, this->v); + alignas(16) float out[4] = {0,0,0,0}; + for (std::uint8_t p = 0; p < Packing; ++p) + for (std::uint8_t i = 0; i < ExtractLen; ++i) + out[p * ExtractLen + i] = tmp[p * Len + i]; + return VectorF32(wasm_v128_load(out)); + } + + constexpr VectorF32 Cos() const { + alignas(16) float tmp[4]; wasm_v128_store(tmp, this->v); + for (int i = 0; i < 4; ++i) tmp[i] = std::cos(tmp[i]); + return VectorF32(wasm_v128_load(tmp)); + } + constexpr VectorF32 Sin() const { + alignas(16) float tmp[4]; wasm_v128_store(tmp, this->v); + for (int i = 0; i < 4; ++i) tmp[i] = std::sin(tmp[i]); + return VectorF32(wasm_v128_load(tmp)); + } + constexpr std::tuple, VectorF32> SinCos() const { + return { Sin(), Cos() }; + } + + template values> + constexpr VectorF32 Negate() const { + constexpr auto mask = []() { + std::array m{}; + for (std::uint8_t p = 0; p < Packing; ++p) + for (std::uint8_t i = 0; i < Len; ++i) + m[p * Len + i] = values[i] ? 0x80000000u : 0u; + return m; + }(); + v128_t maskVec = wasm_v128_load(mask.data()); + return VectorF32(wasm_v128_xor(this->v, maskVec)); + } + + static constexpr VectorF32 MulitplyAdd(VectorF32 a, VectorF32 b, VectorF32 add) { + #ifdef __wasm_relaxed_simd__ + // Single-rounded FMA (a*b + c). Host-defined when FMA hardware is + // missing — accuracy may differ from the strict-SIMD wasm path. + return VectorF32(wasm_f32x4_relaxed_madd(a.v, b.v, add.v)); + #else + return VectorF32(wasm_f32x4_add(wasm_f32x4_mul(a.v, b.v), add.v)); + #endif + } + static constexpr VectorF32 MulitplySub(VectorF32 a, VectorF32 b, VectorF32 sub) { + #ifdef __wasm_relaxed_simd__ + // a*b - c is fused as madd(a, b, -c) — same op count as mul+sub + // but one rounding instead of two. + return VectorF32(wasm_f32x4_relaxed_madd(a.v, b.v, wasm_f32x4_neg(sub.v))); + #else + return VectorF32(wasm_f32x4_sub(wasm_f32x4_mul(a.v, b.v), sub.v)); + #endif + } + + constexpr static VectorF32 Cross(VectorF32 a, VectorF32 b) requires(Len == 3) { + v128_t a_yzx = wasm_i32x4_shuffle(a.v, a.v, 1, 2, 0, 3); + v128_t a_zxy = wasm_i32x4_shuffle(a.v, a.v, 2, 0, 1, 3); + v128_t b_yzx = wasm_i32x4_shuffle(b.v, b.v, 1, 2, 0, 3); + v128_t b_zxy = wasm_i32x4_shuffle(b.v, b.v, 2, 0, 1, 3); + #ifdef __wasm_relaxed_simd__ + // a_yzx*b_zxy - a_zxy*b_yzx fused as nmadd(a_zxy, b_yzx, a_yzx*b_zxy) + // = -(a_zxy*b_yzx) + a_yzx*b_zxy. Replaces a mul+sub pair with a + // single FMA. + return VectorF32(wasm_f32x4_relaxed_nmadd(a_zxy, b_yzx, wasm_f32x4_mul(a_yzx, b_zxy))); + #else + return VectorF32(wasm_f32x4_sub(wasm_f32x4_mul(a_yzx, b_zxy), wasm_f32x4_mul(a_zxy, b_yzx))); + #endif + } + + template ShuffleValues> + constexpr VectorF32 Shuffle() const { + alignas(16) float tmp[4]; wasm_v128_store(tmp, this->v); + alignas(16) float out[4] = {0,0,0,0}; + for (std::uint8_t p = 0; p < Packing; ++p) + for (std::uint8_t i = 0; i < Len; ++i) + out[p * Len + i] = tmp[p * Len + ShuffleValues[i]]; + return VectorF32(wasm_v128_load(out)); + } + + template ShuffleValues> + constexpr static VectorF32 Blend(VectorF32 a, VectorF32 b) { + constexpr auto mask = []() { + std::array m{}; + for (std::uint8_t p = 0; p < Packing; ++p) + for (std::uint8_t i = 0; i < Len; ++i) + m[p * Len + i] = ShuffleValues[i] ? 0xFFFFFFFFu : 0u; + return m; + }(); + v128_t maskVec = wasm_v128_load(mask.data()); + return VectorF32(wasm_v128_bitselect(b.v, a.v, maskVec)); + } + + template + requires((std::is_same_v> && ...)) + constexpr static auto LengthSq(VectorF32 first, Rest... rest) { + constexpr std::uint8_t N = 1 + sizeof...(Rest); + VectorF32<1, static_cast(Packing * N)> r; + std::array, N> args{ first, rest... }; + alignas(16) float buf[4] = {0,0,0,0}; + for (std::uint8_t i = 0; i < N; ++i) { + alignas(16) float tmp[4]; + wasm_v128_store(tmp, args[i].v); + for (std::uint8_t p = 0; p < Packing; ++p) { + float acc = 0.0f; + for (std::uint8_t k = 0; k < Len; ++k) { + float x = tmp[p * Len + k]; + acc += x * x; + } + buf[i * Packing + p] = acc; + } + } + r.v = wasm_v128_load(buf); + return r; + } + + template + requires((std::is_same_v> && ...)) + constexpr static auto Length(VectorF32 first, Rest... rest) { + auto sq = LengthSq(first, rest...); + sq.v = wasm_f32x4_sqrt(sq.v); + return sq; + } + + template + requires((std::is_same_v> && ...)) + constexpr static auto Normalize(VectorF32 first, Rest... rest) { + auto normOne = [](VectorF32 u) { + alignas(16) float tmp[4]; wasm_v128_store(tmp, u.v); + alignas(16) float out[4] = {0,0,0,0}; + for (std::uint8_t p = 0; p < Packing; ++p) { + float acc = 0.0f; + for (std::uint8_t k = 0; k < Len; ++k) { + float x = tmp[p * Len + k]; + acc += x * x; + } + float invLen = acc > 0.0f ? 1.0f / std::sqrt(acc) : 0.0f; + for (std::uint8_t k = 0; k < Len; ++k) + out[p * Len + k] = tmp[p * Len + k] * invLen; + } + return VectorF32(wasm_v128_load(out)); + }; + return std::make_tuple(normOne(first), normOne(rest)...); + } + + constexpr static VectorF32 Rotate(VectorF32<3, Packing> v, VectorF32<4, Packing> q) requires(Len == 3) { + alignas(16) float qBuf[4]; wasm_v128_store(qBuf, q.v); + alignas(16) float qvBuf[4] = {0,0,0,0}; + alignas(16) float qwBuf[4] = {0,0,0,0}; + for (std::uint8_t p = 0; p < Packing; ++p) { + qvBuf[p * 3 + 0] = qBuf[p * 4 + 0]; + qvBuf[p * 3 + 1] = qBuf[p * 4 + 1]; + qvBuf[p * 3 + 2] = qBuf[p * 4 + 2]; + for (std::uint8_t i = 0; i < 3; ++i) qwBuf[p * 3 + i] = qBuf[p * 4 + 3]; + } + VectorF32<3, Packing> qv(wasm_v128_load(qvBuf)); + VectorF32<3, Packing> qwBroadcast(wasm_v128_load(qwBuf)); + VectorF32<3, Packing> t = Cross(qv, v) * 2.0f; + return v + t * qwBroadcast + Cross(qv, t); + } + + constexpr static VectorF32<3, Packing> RotatePivot(VectorF32<3, Packing> v, VectorF32<4, Packing> q, VectorF32<3, Packing> pivot) requires(Len == 3) { + VectorF32<3, Packing> translated = v - pivot; + return Rotate(translated, q) + pivot; + } + + constexpr static VectorF32<4, Packing> QuanternionFromEuler(VectorF32<3, Packing> eulerHalf) requires(Len == 4) { + alignas(16) float eulerBuf[4]; wasm_v128_store(eulerBuf, eulerHalf.v); + alignas(16) float outBuf[4] = {0,0,0,0}; + for (std::uint8_t p = 0; p < Packing; ++p) { + float roll = eulerBuf[p * 3 + 0]; + float pitch = eulerBuf[p * 3 + 1]; + float yaw = eulerBuf[p * 3 + 2]; + float sr = std::sin(roll), cr = std::cos(roll); + float sp = std::sin(pitch), cp = std::cos(pitch); + float sy = std::sin(yaw), cy = std::cos(yaw); + outBuf[p * 4 + 0] = sr * cp * cy - cr * sp * sy; + outBuf[p * 4 + 1] = cr * sp * cy + sr * cp * sy; + outBuf[p * 4 + 2] = cr * cp * sy - sr * sp * cy; + outBuf[p * 4 + 3] = cr * cp * cy + sr * sp * sy; + } + return VectorF32<4, Packing>(wasm_v128_load(outBuf)); + } + }; +#else + // Scalar software fallback for non-x86_64 targets. Future arches can swap + // in their own intrinsic implementation by adding an arch-specific branch + // above and gating this one out. + export template + struct VectorF32 : public VectorBase { + template + friend struct VectorF32; + using Base = VectorBase; + static constexpr std::uint8_t NElems = Base::AlignmentElement; + + constexpr VectorF32() = default; + constexpr VectorF32(typename Base::VectorType vv) { + this->v = vv; + } + constexpr VectorF32(const float* vB) { Load(vB); } +#ifdef __FLT16_MAX__ + constexpr VectorF32(const _Float16* vB) { Load(vB); } +#endif + constexpr VectorF32(float val) { + for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = val; + } + + constexpr void Load(const float* vB) { + for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = vB[i]; + } + constexpr void Store(float* vB) const { + for (std::uint8_t i = 0; i < NElems; ++i) vB[i] = this->v[i]; + } +#ifdef __FLT16_MAX__ + constexpr void Load(const _Float16* vB) { + for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = static_cast(vB[i]); + } + constexpr void Store(_Float16* vB) const { + for (std::uint8_t i = 0; i < NElems; ++i) vB[i] = static_cast<_Float16>(this->v[i]); + } +#endif + + template + constexpr std::array Store() const { + std::array r{}; + Store(r.data()); + return r; + } + + template + constexpr operator VectorF32() const { + VectorF32 r; + const std::uint8_t copyLen = (BLen < Len) ? BLen : Len; + const std::uint8_t copyPack = (BPacking < Packing) ? BPacking : Packing; + for (std::uint8_t p = 0; p < copyPack; ++p) + for (std::uint8_t i = 0; i < copyLen; ++i) + r.v[p * BLen + i] = this->v[p * Len + i]; + return r; + } + + constexpr VectorF32 operator+(VectorF32 b) const { + VectorF32 r; + for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] + b.v[i]; + return r; + } + constexpr VectorF32 operator-(VectorF32 b) const { + VectorF32 r; + for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] - b.v[i]; + return r; + } + constexpr VectorF32 operator*(VectorF32 b) const { + VectorF32 r; + for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] * b.v[i]; + return r; + } + constexpr VectorF32 operator/(VectorF32 b) const { + VectorF32 r; + for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] / b.v[i]; + return r; + } + + constexpr void operator+=(VectorF32 b) { for (std::uint8_t i=0;iv[i] += b.v[i]; } + constexpr void operator-=(VectorF32 b) { for (std::uint8_t i=0;iv[i] -= b.v[i]; } + constexpr void operator*=(VectorF32 b) { for (std::uint8_t i=0;iv[i] *= b.v[i]; } + constexpr void operator/=(VectorF32 b) { for (std::uint8_t i=0;iv[i] /= b.v[i]; } + + constexpr VectorF32 operator+(float b) const { return *this + VectorF32(b); } + constexpr VectorF32 operator-(float b) const { return *this - VectorF32(b); } + constexpr VectorF32 operator*(float b) const { return *this * VectorF32(b); } + constexpr VectorF32 operator/(float b) const { return *this / VectorF32(b); } + constexpr void operator+=(float b) { *this += VectorF32(b); } + constexpr void operator-=(float b) { *this -= VectorF32(b); } + constexpr void operator*=(float b) { *this *= VectorF32(b); } + constexpr void operator/=(float b) { *this /= VectorF32(b); } + + constexpr VectorF32 operator-() const { + VectorF32 r; + for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = -this->v[i]; + return r; + } + + constexpr bool operator==(VectorF32 b) const { + for (std::uint8_t p = 0; p < Packing; ++p) + for (std::uint8_t i = 0; i < Len; ++i) + if (this->v[p * Len + i] != b.v[p * Len + i]) return false; + return true; + } + constexpr bool operator!=(VectorF32 b) const { return !(*this == b); } + + template + constexpr VectorF32 ExtractLo() const { + VectorF32 r; + for (std::uint8_t p = 0; p < Packing; ++p) + for (std::uint8_t i = 0; i < ExtractLen; ++i) + r.v[p * ExtractLen + i] = this->v[p * Len + i]; + return r; + } + + constexpr VectorF32 Cos() const { + VectorF32 r; + for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = std::cos(this->v[i]); + return r; + } + constexpr VectorF32 Sin() const { + VectorF32 r; + for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = std::sin(this->v[i]); + return r; + } + constexpr std::tuple, VectorF32> SinCos() const { + return { Sin(), Cos() }; + } + + template values> + constexpr VectorF32 Negate() const { + VectorF32 r; + for (std::uint8_t p = 0; p < Packing; ++p) + for (std::uint8_t i = 0; i < Len; ++i) + r.v[p * Len + i] = values[i] ? -this->v[p * Len + i] : this->v[p * Len + i]; + return r; + } + + static constexpr VectorF32 MulitplyAdd(VectorF32 a, VectorF32 b, VectorF32 add) { + VectorF32 r; + for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = a.v[i] * b.v[i] + add.v[i]; + return r; + } + static constexpr VectorF32 MulitplySub(VectorF32 a, VectorF32 b, VectorF32 sub) { + VectorF32 r; + for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = a.v[i] * b.v[i] - sub.v[i]; + return r; + } + + constexpr static VectorF32 Cross(VectorF32 a, VectorF32 b) requires(Len == 3) { + VectorF32 r; + for (std::uint8_t p = 0; p < Packing; ++p) { + const std::uint8_t base = p * 3; + r.v[base + 0] = a.v[base + 1] * b.v[base + 2] - a.v[base + 2] * b.v[base + 1]; + r.v[base + 1] = a.v[base + 2] * b.v[base + 0] - a.v[base + 0] * b.v[base + 2]; + r.v[base + 2] = a.v[base + 0] * b.v[base + 1] - a.v[base + 1] * b.v[base + 0]; + } + return r; + } + + template ShuffleValues> + constexpr VectorF32 Shuffle() const { + VectorF32 r; + for (std::uint8_t p = 0; p < Packing; ++p) + for (std::uint8_t i = 0; i < Len; ++i) + r.v[p * Len + i] = this->v[p * Len + ShuffleValues[i]]; + return r; + } + + template ShuffleValues> + constexpr static VectorF32 Blend(VectorF32 a, VectorF32 b) { + VectorF32 r; + for (std::uint8_t p = 0; p < Packing; ++p) + for (std::uint8_t i = 0; i < Len; ++i) + r.v[p * Len + i] = ShuffleValues[i] ? b.v[p * Len + i] : a.v[p * Len + i]; + return r; + } + + template + requires((std::is_same_v> && ...)) + constexpr static auto LengthSq(VectorF32 first, Rest... rest) { + constexpr std::uint8_t N = 1 + sizeof...(Rest); + VectorF32<1, static_cast(Packing * N)> r; + std::array, N> args{ first, rest... }; + for (std::uint8_t i = 0; i < N; ++i) + for (std::uint8_t p = 0; p < Packing; ++p) { + float acc = 0.0f; + for (std::uint8_t k = 0; k < Len; ++k) { + float x = args[i].v[p * Len + k]; + acc += x * x; + } + r.v[i * Packing + p] = acc; + } + return r; + } + + template + requires((std::is_same_v> && ...)) + constexpr static auto Length(VectorF32 first, Rest... rest) { + auto sq = LengthSq(first, rest...); + for (std::uint8_t i = 0; i < decltype(sq)::NElems; ++i) sq.v[i] = std::sqrt(sq.v[i]); + return sq; + } + + template + requires((std::is_same_v> && ...)) + constexpr static auto Normalize(VectorF32 first, Rest... rest) { + auto normOne = [](VectorF32 u) { + VectorF32 out; + for (std::uint8_t p = 0; p < Packing; ++p) { + float acc = 0.0f; + for (std::uint8_t k = 0; k < Len; ++k) { + float x = u.v[p * Len + k]; + acc += x * x; + } + float invLen = acc > 0.0f ? 1.0f / std::sqrt(acc) : 0.0f; + for (std::uint8_t k = 0; k < Len; ++k) + out.v[p * Len + k] = u.v[p * Len + k] * invLen; + } + return out; + }; + return std::make_tuple(normOne(first), normOne(rest)...); + } + + constexpr static VectorF32 Rotate(VectorF32<3, Packing> v, VectorF32<4, Packing> q) requires(Len == 3) { + VectorF32<3, Packing> qv; + VectorF32<3, Packing> qwBroadcast; + for (std::uint8_t p = 0; p < Packing; ++p) { + qv.v[p * 3 + 0] = q.v[p * 4 + 0]; + qv.v[p * 3 + 1] = q.v[p * 4 + 1]; + qv.v[p * 3 + 2] = q.v[p * 4 + 2]; + for (std::uint8_t i = 0; i < 3; ++i) qwBroadcast.v[p * 3 + i] = q.v[p * 4 + 3]; + } + VectorF32<3, Packing> t = Cross(qv, v) * 2.0f; + return v + t * qwBroadcast + Cross(qv, t); + } + + constexpr static VectorF32<3, Packing> RotatePivot(VectorF32<3, Packing> v, VectorF32<4, Packing> q, VectorF32<3, Packing> pivot) requires(Len == 3) { + VectorF32<3, Packing> translated = v - pivot; + return Rotate(translated, q) + pivot; + } + + constexpr static VectorF32<4, Packing> QuanternionFromEuler(VectorF32<3, Packing> eulerHalf) requires(Len == 4) { + VectorF32<4, Packing> r; + for (std::uint8_t p = 0; p < Packing; ++p) { + float roll = eulerHalf.v[p * 3 + 0]; + float pitch = eulerHalf.v[p * 3 + 1]; + float yaw = eulerHalf.v[p * 3 + 2]; + float sr = std::sin(roll), cr = std::cos(roll); + float sp = std::sin(pitch), cp = std::cos(pitch); + float sy = std::sin(yaw), cy = std::cos(yaw); + r.v[p * 4 + 0] = sr * cp * cy - cr * sp * sy; + r.v[p * 4 + 1] = cr * sp * cy + sr * cp * sy; + r.v[p * 4 + 2] = cr * cp * sy - sr * sp * cy; + r.v[p * 4 + 3] = cr * cp * cy + sr * sp * sy; + } + return r; + } + }; #endif } -#ifdef __x86_64 export template struct std::formatter> : std::formatter { constexpr auto format(const Crafter::VectorF32& obj, format_context& ctx) const { @@ -1403,5 +1915,4 @@ struct std::formatter> : std::formatter::format(out, ctx); } -}; -#endif \ No newline at end of file +}; \ No newline at end of file