wasm SIMD
This commit is contained in:
parent
99972d8c81
commit
48e3b8e26c
4 changed files with 803 additions and 12 deletions
|
|
@ -28,10 +28,8 @@ namespace Crafter {
|
||||||
return degrees * (std::numbers::pi / 180);
|
return degrees * (std::numbers::pi / 180);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __x86_64
|
#if (defined(__x86_64) && !defined(__AVX512FP16__)) || !defined(__FLT16_MAX__)
|
||||||
#ifndef __AVX512FP16__
|
|
||||||
export template <std::uint32_t Len, std::uint32_t Packing>
|
export template <std::uint32_t Len, std::uint32_t Packing>
|
||||||
using VectorF16 = VectorF32<Len, Packing>;
|
using VectorF16 = VectorF32<Len, Packing>;
|
||||||
#endif
|
#endif
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
@ -2,20 +2,31 @@ module;
|
||||||
#ifdef __x86_64
|
#ifdef __x86_64
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef __wasm_simd128__
|
||||||
|
#include <wasm_simd128.h>
|
||||||
|
#endif
|
||||||
export module Crafter.Math:Common;
|
export module Crafter.Math:Common;
|
||||||
import std;
|
import std;
|
||||||
|
|
||||||
|
// VectorF16 exists as a real struct when _Float16 is available AND we are not
|
||||||
|
// on x86_64 without AVX512FP16 (that path aliases VectorF16 to VectorF32 in
|
||||||
|
// Crafter.Math:Basic for performance). Each translation unit that needs this
|
||||||
|
// distinction redefines the same condition since macros do not cross module
|
||||||
|
// boundaries.
|
||||||
|
#if defined(__FLT16_MAX__) && (!defined(__x86_64) || defined(__AVX512FP16__))
|
||||||
namespace Crafter {
|
namespace Crafter {
|
||||||
#ifdef __AVX512FP16__
|
|
||||||
export template <std::uint8_t Len, std::uint8_t Packing>
|
export template <std::uint8_t Len, std::uint8_t Packing>
|
||||||
struct VectorF16;
|
struct VectorF16;
|
||||||
#endif
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace Crafter {
|
||||||
export template <std::uint8_t Len, std::uint8_t Packing>
|
export template <std::uint8_t Len, std::uint8_t Packing>
|
||||||
struct VectorF32;
|
struct VectorF32;
|
||||||
|
|
||||||
template <std::uint8_t Len, std::uint8_t Packing, typename T>
|
template <std::uint8_t Len, std::uint8_t Packing, typename T>
|
||||||
struct VectorBase {
|
struct VectorBase {
|
||||||
#ifdef __AVX512FP16__
|
#if defined(__FLT16_MAX__) && (!defined(__x86_64) || defined(__AVX512FP16__))
|
||||||
template <std::uint8_t L, std::uint8_t P>
|
template <std::uint8_t L, std::uint8_t P>
|
||||||
friend struct VectorF16;
|
friend struct VectorF16;
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -33,8 +44,13 @@ namespace Crafter {
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __x86_64
|
#ifdef __x86_64
|
||||||
using VectorType = std::conditional_t<std::is_same_v<T, _Float16>,
|
using VectorType = std::conditional_t<
|
||||||
|
#ifdef __FLT16_MAX__
|
||||||
|
std::is_same_v<T, _Float16>
|
||||||
|
#else
|
||||||
|
false
|
||||||
|
#endif
|
||||||
|
,
|
||||||
#ifdef __AVX512FP16__
|
#ifdef __AVX512FP16__
|
||||||
std::conditional_t<(Len * Packing > 16), __m512h,
|
std::conditional_t<(Len * Packing > 16), __m512h,
|
||||||
std::conditional_t<(Len * Packing > 8), __m256h, __m128h>>,
|
std::conditional_t<(Len * Packing > 8), __m256h, __m128h>>,
|
||||||
|
|
@ -45,9 +61,13 @@ namespace Crafter {
|
||||||
std::conditional_t<(Len * Packing > 8), __m512,
|
std::conditional_t<(Len * Packing > 8), __m512,
|
||||||
std::conditional_t<(Len * Packing > 4), __m256, __m128>>
|
std::conditional_t<(Len * Packing > 4), __m256, __m128>>
|
||||||
>;
|
>;
|
||||||
|
#elif defined(__wasm_simd128__)
|
||||||
|
using VectorType = v128_t;
|
||||||
|
#else
|
||||||
|
using VectorType = std::array<T, GetAlingment()/sizeof(T)>;
|
||||||
|
#endif
|
||||||
|
|
||||||
VectorType v;
|
VectorType v;
|
||||||
#endif
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
|
@ -56,6 +76,10 @@ namespace Crafter {
|
||||||
|
|
||||||
#ifdef __AVX512F__
|
#ifdef __AVX512F__
|
||||||
static constexpr std::uint8_t Max = 64;
|
static constexpr std::uint8_t Max = 64;
|
||||||
|
#elif defined(__wasm_simd128__)
|
||||||
|
// WASM SIMD only has 128-bit vectors; cap at 16 bytes so the entire
|
||||||
|
// VectorType always fits in a single v128_t.
|
||||||
|
static constexpr std::uint8_t Max = 16;
|
||||||
#else
|
#else
|
||||||
static constexpr std::uint8_t Max = 32;
|
static constexpr std::uint8_t Max = 32;
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -24,6 +24,11 @@ export module Crafter.Math:VectorF16;
|
||||||
import std;
|
import std;
|
||||||
import :Common;
|
import :Common;
|
||||||
|
|
||||||
|
// A real VectorF16 struct is provided only when _Float16 is available AND we
|
||||||
|
// are either on an x86_64 build with AVX512FP16 (intrinsic impl below) or on a
|
||||||
|
// non-x86_64 target (scalar fallback further below). On x86_64 without
|
||||||
|
// AVX512FP16 or anywhere without _Float16, Crafter.Math:Basic aliases
|
||||||
|
// VectorF16 to VectorF32 instead.
|
||||||
#ifdef __AVX512FP16__
|
#ifdef __AVX512FP16__
|
||||||
namespace Crafter {
|
namespace Crafter {
|
||||||
export template <std::uint8_t Len, std::uint8_t Packing>
|
export template <std::uint8_t Len, std::uint8_t Packing>
|
||||||
|
|
@ -1027,6 +1032,259 @@ namespace Crafter {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#elif !defined(__x86_64) && defined(__FLT16_MAX__)
|
||||||
|
// Scalar software fallback for non-x86_64 targets that still have _Float16.
|
||||||
|
namespace Crafter {
|
||||||
|
export template <std::uint8_t Len, std::uint8_t Packing>
|
||||||
|
struct VectorF16 : public VectorBase<Len, Packing, _Float16> {
|
||||||
|
template <std::uint8_t Len2, std::uint8_t Packing2>
|
||||||
|
friend struct VectorF16;
|
||||||
|
using Base = VectorBase<Len, Packing, _Float16>;
|
||||||
|
static constexpr std::uint8_t NElems = Base::AlignmentElement;
|
||||||
|
|
||||||
|
constexpr VectorF16() = default;
|
||||||
|
constexpr VectorF16(typename Base::VectorType vv) {
|
||||||
|
this->v = vv;
|
||||||
|
}
|
||||||
|
constexpr VectorF16(const _Float16* vB) { Load(vB); }
|
||||||
|
constexpr VectorF16(_Float16 val) {
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = val;
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr void Load(const _Float16* vB) {
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = vB[i];
|
||||||
|
}
|
||||||
|
constexpr void Store(_Float16* vB) const {
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) vB[i] = this->v[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
constexpr std::array<_Float16, NElems> Store() const {
|
||||||
|
std::array<_Float16, NElems> r{};
|
||||||
|
Store(r.data());
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <std::uint8_t BLen, std::uint8_t BPacking>
|
||||||
|
constexpr operator VectorF16<BLen, BPacking>() const {
|
||||||
|
VectorF16<BLen, BPacking> r;
|
||||||
|
const std::uint8_t copyLen = (BLen < Len) ? BLen : Len;
|
||||||
|
const std::uint8_t copyPack = (BPacking < Packing) ? BPacking : Packing;
|
||||||
|
for (std::uint8_t p = 0; p < copyPack; ++p)
|
||||||
|
for (std::uint8_t i = 0; i < copyLen; ++i)
|
||||||
|
r.v[p * BLen + i] = this->v[p * Len + i];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr VectorF16<Len, Packing> operator+(VectorF16<Len, Packing> b) const {
|
||||||
|
VectorF16<Len, Packing> r;
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] + b.v[i];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
constexpr VectorF16<Len, Packing> operator-(VectorF16<Len, Packing> b) const {
|
||||||
|
VectorF16<Len, Packing> r;
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] - b.v[i];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
constexpr VectorF16<Len, Packing> operator*(VectorF16<Len, Packing> b) const {
|
||||||
|
VectorF16<Len, Packing> r;
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] * b.v[i];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
constexpr VectorF16<Len, Packing> operator/(VectorF16<Len, Packing> b) const {
|
||||||
|
VectorF16<Len, Packing> r;
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] / b.v[i];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr void operator+=(VectorF16<Len, Packing> b) { for (std::uint8_t i=0;i<NElems;++i) this->v[i] += b.v[i]; }
|
||||||
|
constexpr void operator-=(VectorF16<Len, Packing> b) { for (std::uint8_t i=0;i<NElems;++i) this->v[i] -= b.v[i]; }
|
||||||
|
constexpr void operator*=(VectorF16<Len, Packing> b) { for (std::uint8_t i=0;i<NElems;++i) this->v[i] *= b.v[i]; }
|
||||||
|
constexpr void operator/=(VectorF16<Len, Packing> b) { for (std::uint8_t i=0;i<NElems;++i) this->v[i] /= b.v[i]; }
|
||||||
|
|
||||||
|
constexpr VectorF16<Len, Packing> operator+(_Float16 b) const { return *this + VectorF16<Len, Packing>(b); }
|
||||||
|
constexpr VectorF16<Len, Packing> operator-(_Float16 b) const { return *this - VectorF16<Len, Packing>(b); }
|
||||||
|
constexpr VectorF16<Len, Packing> operator*(_Float16 b) const { return *this * VectorF16<Len, Packing>(b); }
|
||||||
|
constexpr VectorF16<Len, Packing> operator/(_Float16 b) const { return *this / VectorF16<Len, Packing>(b); }
|
||||||
|
constexpr void operator+=(_Float16 b) { *this += VectorF16<Len, Packing>(b); }
|
||||||
|
constexpr void operator-=(_Float16 b) { *this -= VectorF16<Len, Packing>(b); }
|
||||||
|
constexpr void operator*=(_Float16 b) { *this *= VectorF16<Len, Packing>(b); }
|
||||||
|
constexpr void operator/=(_Float16 b) { *this /= VectorF16<Len, Packing>(b); }
|
||||||
|
|
||||||
|
constexpr VectorF16<Len, Packing> operator-() const {
|
||||||
|
VectorF16<Len, Packing> r;
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = -this->v[i];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr bool operator==(VectorF16<Len, Packing> b) const {
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p)
|
||||||
|
for (std::uint8_t i = 0; i < Len; ++i)
|
||||||
|
if (this->v[p * Len + i] != b.v[p * Len + i]) return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
constexpr bool operator!=(VectorF16<Len, Packing> b) const { return !(*this == b); }
|
||||||
|
|
||||||
|
template<std::uint32_t ExtractLen>
|
||||||
|
constexpr VectorF16<ExtractLen, Packing> ExtractLo() const {
|
||||||
|
VectorF16<ExtractLen, Packing> r;
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p)
|
||||||
|
for (std::uint8_t i = 0; i < ExtractLen; ++i)
|
||||||
|
r.v[p * ExtractLen + i] = this->v[p * Len + i];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Transcendentals are computed via float since libstdc++ doesn't always
|
||||||
|
// provide overloads for _Float16; the result is rounded back to half.
|
||||||
|
constexpr VectorF16<Len, Packing> Cos() const {
|
||||||
|
VectorF16<Len, Packing> r;
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = static_cast<_Float16>(std::cos(static_cast<float>(this->v[i])));
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
constexpr VectorF16<Len, Packing> Sin() const {
|
||||||
|
VectorF16<Len, Packing> r;
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = static_cast<_Float16>(std::sin(static_cast<float>(this->v[i])));
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
constexpr std::tuple<VectorF16<Len, Packing>, VectorF16<Len, Packing>> SinCos() const {
|
||||||
|
return { Sin(), Cos() };
|
||||||
|
}
|
||||||
|
|
||||||
|
template <std::array<bool, Len> values>
|
||||||
|
constexpr VectorF16<Len, Packing> Negate() const {
|
||||||
|
VectorF16<Len, Packing> r;
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p)
|
||||||
|
for (std::uint8_t i = 0; i < Len; ++i)
|
||||||
|
r.v[p * Len + i] = values[i] ? -this->v[p * Len + i] : this->v[p * Len + i];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
static constexpr VectorF16<Len, Packing> MulitplyAdd(VectorF16<Len, Packing> a, VectorF16<Len, Packing> b, VectorF16<Len, Packing> add) {
|
||||||
|
VectorF16<Len, Packing> r;
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = a.v[i] * b.v[i] + add.v[i];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
static constexpr VectorF16<Len, Packing> MulitplySub(VectorF16<Len, Packing> a, VectorF16<Len, Packing> b, VectorF16<Len, Packing> sub) {
|
||||||
|
VectorF16<Len, Packing> r;
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = a.v[i] * b.v[i] - sub.v[i];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr static VectorF16<Len, Packing> Cross(VectorF16<Len, Packing> a, VectorF16<Len, Packing> b) requires(Len == 3) {
|
||||||
|
VectorF16<Len, Packing> r;
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p) {
|
||||||
|
const std::uint8_t base = p * 3;
|
||||||
|
r.v[base + 0] = a.v[base + 1] * b.v[base + 2] - a.v[base + 2] * b.v[base + 1];
|
||||||
|
r.v[base + 1] = a.v[base + 2] * b.v[base + 0] - a.v[base + 0] * b.v[base + 2];
|
||||||
|
r.v[base + 2] = a.v[base + 0] * b.v[base + 1] - a.v[base + 1] * b.v[base + 0];
|
||||||
|
}
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <const std::array<std::uint8_t, Len> ShuffleValues>
|
||||||
|
constexpr VectorF16<Len, Packing> Shuffle() const {
|
||||||
|
VectorF16<Len, Packing> r;
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p)
|
||||||
|
for (std::uint8_t i = 0; i < Len; ++i)
|
||||||
|
r.v[p * Len + i] = this->v[p * Len + ShuffleValues[i]];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <std::array<bool, Len> ShuffleValues>
|
||||||
|
constexpr static VectorF16<Len, Packing> Blend(VectorF16<Len, Packing> a, VectorF16<Len, Packing> b) {
|
||||||
|
VectorF16<Len, Packing> r;
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p)
|
||||||
|
for (std::uint8_t i = 0; i < Len; ++i)
|
||||||
|
r.v[p * Len + i] = ShuffleValues[i] ? b.v[p * Len + i] : a.v[p * Len + i];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename... Rest>
|
||||||
|
requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...))
|
||||||
|
constexpr static auto LengthSq(VectorF16<Len, Packing> first, Rest... rest) {
|
||||||
|
constexpr std::uint8_t N = 1 + sizeof...(Rest);
|
||||||
|
VectorF16<1, static_cast<std::uint8_t>(Packing * N)> r;
|
||||||
|
std::array<VectorF16<Len, Packing>, N> args{ first, rest... };
|
||||||
|
for (std::uint8_t i = 0; i < N; ++i)
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p) {
|
||||||
|
_Float16 acc = _Float16(0);
|
||||||
|
for (std::uint8_t k = 0; k < Len; ++k) {
|
||||||
|
_Float16 x = args[i].v[p * Len + k];
|
||||||
|
acc += x * x;
|
||||||
|
}
|
||||||
|
r.v[i * Packing + p] = acc;
|
||||||
|
}
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename... Rest>
|
||||||
|
requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...))
|
||||||
|
constexpr static auto Length(VectorF16<Len, Packing> first, Rest... rest) {
|
||||||
|
auto sq = LengthSq(first, rest...);
|
||||||
|
for (std::uint8_t i = 0; i < decltype(sq)::NElems; ++i)
|
||||||
|
sq.v[i] = static_cast<_Float16>(std::sqrt(static_cast<float>(sq.v[i])));
|
||||||
|
return sq;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename... Rest>
|
||||||
|
requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...))
|
||||||
|
constexpr static auto Normalize(VectorF16<Len, Packing> first, Rest... rest) {
|
||||||
|
auto normOne = [](VectorF16<Len, Packing> u) {
|
||||||
|
VectorF16<Len, Packing> out;
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p) {
|
||||||
|
float acc = 0.0f;
|
||||||
|
for (std::uint8_t k = 0; k < Len; ++k) {
|
||||||
|
float x = static_cast<float>(u.v[p * Len + k]);
|
||||||
|
acc += x * x;
|
||||||
|
}
|
||||||
|
_Float16 invLen = acc > 0.0f ? static_cast<_Float16>(1.0f / std::sqrt(acc)) : _Float16(0);
|
||||||
|
for (std::uint8_t k = 0; k < Len; ++k)
|
||||||
|
out.v[p * Len + k] = u.v[p * Len + k] * invLen;
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
};
|
||||||
|
return std::make_tuple(normOne(first), normOne(rest)...);
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr static VectorF16<Len, Packing> Rotate(VectorF16<3, Packing> v, VectorF16<4, Packing> q) requires(Len == 3) {
|
||||||
|
VectorF16<3, Packing> qv;
|
||||||
|
VectorF16<3, Packing> qwBroadcast;
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p) {
|
||||||
|
qv.v[p * 3 + 0] = q.v[p * 4 + 0];
|
||||||
|
qv.v[p * 3 + 1] = q.v[p * 4 + 1];
|
||||||
|
qv.v[p * 3 + 2] = q.v[p * 4 + 2];
|
||||||
|
for (std::uint8_t i = 0; i < 3; ++i) qwBroadcast.v[p * 3 + i] = q.v[p * 4 + 3];
|
||||||
|
}
|
||||||
|
VectorF16<3, Packing> t = Cross(qv, v) * _Float16(2);
|
||||||
|
return v + t * qwBroadcast + Cross(qv, t);
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr static VectorF16<3, Packing> RotatePivot(VectorF16<3, Packing> v, VectorF16<4, Packing> q, VectorF16<3, Packing> pivot) requires(Len == 3) {
|
||||||
|
VectorF16<3, Packing> translated = v - pivot;
|
||||||
|
return Rotate(translated, q) + pivot;
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr static VectorF16<4, Packing> QuanternionFromEuler(VectorF16<3, Packing> eulerHalf) requires(Len == 4) {
|
||||||
|
VectorF16<4, Packing> r;
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p) {
|
||||||
|
float roll = static_cast<float>(eulerHalf.v[p * 3 + 0]);
|
||||||
|
float pitch = static_cast<float>(eulerHalf.v[p * 3 + 1]);
|
||||||
|
float yaw = static_cast<float>(eulerHalf.v[p * 3 + 2]);
|
||||||
|
float sr = std::sin(roll), cr = std::cos(roll);
|
||||||
|
float sp = std::sin(pitch), cp = std::cos(pitch);
|
||||||
|
float sy = std::sin(yaw), cy = std::cos(yaw);
|
||||||
|
r.v[p * 4 + 0] = static_cast<_Float16>(sr * cp * cy - cr * sp * sy);
|
||||||
|
r.v[p * 4 + 1] = static_cast<_Float16>(cr * sp * cy + sr * cp * sy);
|
||||||
|
r.v[p * 4 + 2] = static_cast<_Float16>(cr * cp * sy - sr * sp * cy);
|
||||||
|
r.v[p * 4 + 3] = static_cast<_Float16>(cr * cp * cy + sr * sp * sy);
|
||||||
|
}
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__FLT16_MAX__) && (!defined(__x86_64) || defined(__AVX512FP16__))
|
||||||
export template <std::uint32_t Len, std::uint32_t Packing>
|
export template <std::uint32_t Len, std::uint32_t Packing>
|
||||||
struct std::formatter<Crafter::VectorF16<Len, Packing>> : std::formatter<std::string> {
|
struct std::formatter<Crafter::VectorF16<Len, Packing>> : std::formatter<std::string> {
|
||||||
constexpr auto format(const Crafter::VectorF16<Len, Packing>& obj, format_context& ctx) const {
|
constexpr auto format(const Crafter::VectorF16<Len, Packing>& obj, format_context& ctx) const {
|
||||||
|
|
|
||||||
|
|
@ -20,6 +20,9 @@ module;
|
||||||
#ifdef __x86_64
|
#ifdef __x86_64
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef __wasm_simd128__
|
||||||
|
#include <wasm_simd128.h>
|
||||||
|
#endif
|
||||||
export module Crafter.Math:VectorF32;
|
export module Crafter.Math:VectorF32;
|
||||||
import std;
|
import std;
|
||||||
import :Common;
|
import :Common;
|
||||||
|
|
@ -1383,10 +1386,519 @@ namespace Crafter {
|
||||||
return row1;
|
return row1;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
#elif defined(__wasm_simd128__)
|
||||||
|
// WebAssembly SIMD128 implementation. VectorType is always v128_t and we
|
||||||
|
// cap Len*Packing*sizeof(float) at 16 bytes (i.e. up to 4 floats per
|
||||||
|
// vector) in Common.cppm so a single v128_t covers every instantiation.
|
||||||
|
// Operations without a direct SIMD equivalent (Shuffle with runtime indices,
|
||||||
|
// transcendentals, etc.) round-trip through a float[4] scratch buffer.
|
||||||
|
export template <std::uint8_t Len, std::uint8_t Packing>
|
||||||
|
struct VectorF32 : public VectorBase<Len, Packing, float> {
|
||||||
|
template <std::uint8_t Len2, std::uint8_t Packing2>
|
||||||
|
friend struct VectorF32;
|
||||||
|
using Base = VectorBase<Len, Packing, float>;
|
||||||
|
static constexpr std::uint8_t NElems = Base::AlignmentElement;
|
||||||
|
static_assert(NElems == 4, "WASM SIMD VectorF32 assumes 4-lane vectors");
|
||||||
|
|
||||||
|
constexpr VectorF32() = default;
|
||||||
|
constexpr VectorF32(v128_t vv) { this->v = vv; }
|
||||||
|
constexpr VectorF32(const float* vB) { Load(vB); }
|
||||||
|
constexpr VectorF32(float val) { this->v = wasm_f32x4_splat(val); }
|
||||||
|
|
||||||
|
constexpr void Load(const float* vB) { this->v = wasm_v128_load(vB); }
|
||||||
|
constexpr void Store(float* vB) const { wasm_v128_store(vB, this->v); }
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
constexpr std::array<T, NElems> Store() const {
|
||||||
|
std::array<T, NElems> r{};
|
||||||
|
Store(r.data());
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <std::uint8_t BLen, std::uint8_t BPacking>
|
||||||
|
constexpr operator VectorF32<BLen, BPacking>() const {
|
||||||
|
alignas(16) float tmp[4];
|
||||||
|
wasm_v128_store(tmp, this->v);
|
||||||
|
alignas(16) float out[4] = {0,0,0,0};
|
||||||
|
const std::uint8_t copyLen = (BLen < Len) ? BLen : Len;
|
||||||
|
const std::uint8_t copyPack = (BPacking < Packing) ? BPacking : Packing;
|
||||||
|
for (std::uint8_t p = 0; p < copyPack; ++p)
|
||||||
|
for (std::uint8_t i = 0; i < copyLen; ++i)
|
||||||
|
out[p * BLen + i] = tmp[p * Len + i];
|
||||||
|
return VectorF32<BLen, BPacking>(wasm_v128_load(out));
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr VectorF32<Len, Packing> operator+(VectorF32<Len, Packing> b) const { return VectorF32<Len, Packing>(wasm_f32x4_add(this->v, b.v)); }
|
||||||
|
constexpr VectorF32<Len, Packing> operator-(VectorF32<Len, Packing> b) const { return VectorF32<Len, Packing>(wasm_f32x4_sub(this->v, b.v)); }
|
||||||
|
constexpr VectorF32<Len, Packing> operator*(VectorF32<Len, Packing> b) const { return VectorF32<Len, Packing>(wasm_f32x4_mul(this->v, b.v)); }
|
||||||
|
constexpr VectorF32<Len, Packing> operator/(VectorF32<Len, Packing> b) const { return VectorF32<Len, Packing>(wasm_f32x4_div(this->v, b.v)); }
|
||||||
|
constexpr void operator+=(VectorF32<Len, Packing> b) { this->v = wasm_f32x4_add(this->v, b.v); }
|
||||||
|
constexpr void operator-=(VectorF32<Len, Packing> b) { this->v = wasm_f32x4_sub(this->v, b.v); }
|
||||||
|
constexpr void operator*=(VectorF32<Len, Packing> b) { this->v = wasm_f32x4_mul(this->v, b.v); }
|
||||||
|
constexpr void operator/=(VectorF32<Len, Packing> b) { this->v = wasm_f32x4_div(this->v, b.v); }
|
||||||
|
|
||||||
|
constexpr VectorF32<Len, Packing> operator+(float b) const { return *this + VectorF32<Len, Packing>(b); }
|
||||||
|
constexpr VectorF32<Len, Packing> operator-(float b) const { return *this - VectorF32<Len, Packing>(b); }
|
||||||
|
constexpr VectorF32<Len, Packing> operator*(float b) const { return *this * VectorF32<Len, Packing>(b); }
|
||||||
|
constexpr VectorF32<Len, Packing> operator/(float b) const { return *this / VectorF32<Len, Packing>(b); }
|
||||||
|
constexpr void operator+=(float b) { *this += VectorF32<Len, Packing>(b); }
|
||||||
|
constexpr void operator-=(float b) { *this -= VectorF32<Len, Packing>(b); }
|
||||||
|
constexpr void operator*=(float b) { *this *= VectorF32<Len, Packing>(b); }
|
||||||
|
constexpr void operator/=(float b) { *this /= VectorF32<Len, Packing>(b); }
|
||||||
|
|
||||||
|
constexpr VectorF32<Len, Packing> operator-() const { return VectorF32<Len, Packing>(wasm_f32x4_neg(this->v)); }
|
||||||
|
|
||||||
|
constexpr bool operator==(VectorF32<Len, Packing> b) const {
|
||||||
|
return wasm_i32x4_bitmask(wasm_f32x4_eq(this->v, b.v)) == 0b1111;
|
||||||
|
}
|
||||||
|
constexpr bool operator!=(VectorF32<Len, Packing> b) const { return !(*this == b); }
|
||||||
|
|
||||||
|
template<std::uint32_t ExtractLen>
|
||||||
|
constexpr VectorF32<ExtractLen, Packing> ExtractLo() const {
|
||||||
|
alignas(16) float tmp[4]; wasm_v128_store(tmp, this->v);
|
||||||
|
alignas(16) float out[4] = {0,0,0,0};
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p)
|
||||||
|
for (std::uint8_t i = 0; i < ExtractLen; ++i)
|
||||||
|
out[p * ExtractLen + i] = tmp[p * Len + i];
|
||||||
|
return VectorF32<ExtractLen, Packing>(wasm_v128_load(out));
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr VectorF32<Len, Packing> Cos() const {
|
||||||
|
alignas(16) float tmp[4]; wasm_v128_store(tmp, this->v);
|
||||||
|
for (int i = 0; i < 4; ++i) tmp[i] = std::cos(tmp[i]);
|
||||||
|
return VectorF32<Len, Packing>(wasm_v128_load(tmp));
|
||||||
|
}
|
||||||
|
constexpr VectorF32<Len, Packing> Sin() const {
|
||||||
|
alignas(16) float tmp[4]; wasm_v128_store(tmp, this->v);
|
||||||
|
for (int i = 0; i < 4; ++i) tmp[i] = std::sin(tmp[i]);
|
||||||
|
return VectorF32<Len, Packing>(wasm_v128_load(tmp));
|
||||||
|
}
|
||||||
|
constexpr std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>> SinCos() const {
|
||||||
|
return { Sin(), Cos() };
|
||||||
|
}
|
||||||
|
|
||||||
|
template <std::array<bool, Len> values>
|
||||||
|
constexpr VectorF32<Len, Packing> Negate() const {
|
||||||
|
constexpr auto mask = []() {
|
||||||
|
std::array<std::uint32_t, 4> m{};
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p)
|
||||||
|
for (std::uint8_t i = 0; i < Len; ++i)
|
||||||
|
m[p * Len + i] = values[i] ? 0x80000000u : 0u;
|
||||||
|
return m;
|
||||||
|
}();
|
||||||
|
v128_t maskVec = wasm_v128_load(mask.data());
|
||||||
|
return VectorF32<Len, Packing>(wasm_v128_xor(this->v, maskVec));
|
||||||
|
}
|
||||||
|
|
||||||
|
static constexpr VectorF32<Len, Packing> MulitplyAdd(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b, VectorF32<Len, Packing> add) {
|
||||||
|
#ifdef __wasm_relaxed_simd__
|
||||||
|
// Single-rounded FMA (a*b + c). Host-defined when FMA hardware is
|
||||||
|
// missing — accuracy may differ from the strict-SIMD wasm path.
|
||||||
|
return VectorF32<Len, Packing>(wasm_f32x4_relaxed_madd(a.v, b.v, add.v));
|
||||||
|
#else
|
||||||
|
return VectorF32<Len, Packing>(wasm_f32x4_add(wasm_f32x4_mul(a.v, b.v), add.v));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
static constexpr VectorF32<Len, Packing> MulitplySub(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b, VectorF32<Len, Packing> sub) {
|
||||||
|
#ifdef __wasm_relaxed_simd__
|
||||||
|
// a*b - c is fused as madd(a, b, -c) — same op count as mul+sub
|
||||||
|
// but one rounding instead of two.
|
||||||
|
return VectorF32<Len, Packing>(wasm_f32x4_relaxed_madd(a.v, b.v, wasm_f32x4_neg(sub.v)));
|
||||||
|
#else
|
||||||
|
return VectorF32<Len, Packing>(wasm_f32x4_sub(wasm_f32x4_mul(a.v, b.v), sub.v));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr static VectorF32<Len, Packing> Cross(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b) requires(Len == 3) {
|
||||||
|
v128_t a_yzx = wasm_i32x4_shuffle(a.v, a.v, 1, 2, 0, 3);
|
||||||
|
v128_t a_zxy = wasm_i32x4_shuffle(a.v, a.v, 2, 0, 1, 3);
|
||||||
|
v128_t b_yzx = wasm_i32x4_shuffle(b.v, b.v, 1, 2, 0, 3);
|
||||||
|
v128_t b_zxy = wasm_i32x4_shuffle(b.v, b.v, 2, 0, 1, 3);
|
||||||
|
#ifdef __wasm_relaxed_simd__
|
||||||
|
// a_yzx*b_zxy - a_zxy*b_yzx fused as nmadd(a_zxy, b_yzx, a_yzx*b_zxy)
|
||||||
|
// = -(a_zxy*b_yzx) + a_yzx*b_zxy. Replaces a mul+sub pair with a
|
||||||
|
// single FMA.
|
||||||
|
return VectorF32<Len, Packing>(wasm_f32x4_relaxed_nmadd(a_zxy, b_yzx, wasm_f32x4_mul(a_yzx, b_zxy)));
|
||||||
|
#else
|
||||||
|
return VectorF32<Len, Packing>(wasm_f32x4_sub(wasm_f32x4_mul(a_yzx, b_zxy), wasm_f32x4_mul(a_zxy, b_yzx)));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
template <const std::array<std::uint8_t, Len> ShuffleValues>
|
||||||
|
constexpr VectorF32<Len, Packing> Shuffle() const {
|
||||||
|
alignas(16) float tmp[4]; wasm_v128_store(tmp, this->v);
|
||||||
|
alignas(16) float out[4] = {0,0,0,0};
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p)
|
||||||
|
for (std::uint8_t i = 0; i < Len; ++i)
|
||||||
|
out[p * Len + i] = tmp[p * Len + ShuffleValues[i]];
|
||||||
|
return VectorF32<Len, Packing>(wasm_v128_load(out));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <std::array<bool, Len> ShuffleValues>
|
||||||
|
constexpr static VectorF32<Len, Packing> Blend(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b) {
|
||||||
|
constexpr auto mask = []() {
|
||||||
|
std::array<std::uint32_t, 4> m{};
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p)
|
||||||
|
for (std::uint8_t i = 0; i < Len; ++i)
|
||||||
|
m[p * Len + i] = ShuffleValues[i] ? 0xFFFFFFFFu : 0u;
|
||||||
|
return m;
|
||||||
|
}();
|
||||||
|
v128_t maskVec = wasm_v128_load(mask.data());
|
||||||
|
return VectorF32<Len, Packing>(wasm_v128_bitselect(b.v, a.v, maskVec));
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename... Rest>
|
||||||
|
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
|
||||||
|
constexpr static auto LengthSq(VectorF32<Len, Packing> first, Rest... rest) {
|
||||||
|
constexpr std::uint8_t N = 1 + sizeof...(Rest);
|
||||||
|
VectorF32<1, static_cast<std::uint8_t>(Packing * N)> r;
|
||||||
|
std::array<VectorF32<Len, Packing>, N> args{ first, rest... };
|
||||||
|
alignas(16) float buf[4] = {0,0,0,0};
|
||||||
|
for (std::uint8_t i = 0; i < N; ++i) {
|
||||||
|
alignas(16) float tmp[4];
|
||||||
|
wasm_v128_store(tmp, args[i].v);
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p) {
|
||||||
|
float acc = 0.0f;
|
||||||
|
for (std::uint8_t k = 0; k < Len; ++k) {
|
||||||
|
float x = tmp[p * Len + k];
|
||||||
|
acc += x * x;
|
||||||
|
}
|
||||||
|
buf[i * Packing + p] = acc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
r.v = wasm_v128_load(buf);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename... Rest>
|
||||||
|
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
|
||||||
|
constexpr static auto Length(VectorF32<Len, Packing> first, Rest... rest) {
|
||||||
|
auto sq = LengthSq(first, rest...);
|
||||||
|
sq.v = wasm_f32x4_sqrt(sq.v);
|
||||||
|
return sq;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename... Rest>
|
||||||
|
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
|
||||||
|
constexpr static auto Normalize(VectorF32<Len, Packing> first, Rest... rest) {
|
||||||
|
auto normOne = [](VectorF32<Len, Packing> u) {
|
||||||
|
alignas(16) float tmp[4]; wasm_v128_store(tmp, u.v);
|
||||||
|
alignas(16) float out[4] = {0,0,0,0};
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p) {
|
||||||
|
float acc = 0.0f;
|
||||||
|
for (std::uint8_t k = 0; k < Len; ++k) {
|
||||||
|
float x = tmp[p * Len + k];
|
||||||
|
acc += x * x;
|
||||||
|
}
|
||||||
|
float invLen = acc > 0.0f ? 1.0f / std::sqrt(acc) : 0.0f;
|
||||||
|
for (std::uint8_t k = 0; k < Len; ++k)
|
||||||
|
out[p * Len + k] = tmp[p * Len + k] * invLen;
|
||||||
|
}
|
||||||
|
return VectorF32<Len, Packing>(wasm_v128_load(out));
|
||||||
|
};
|
||||||
|
return std::make_tuple(normOne(first), normOne(rest)...);
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr static VectorF32<Len, Packing> Rotate(VectorF32<3, Packing> v, VectorF32<4, Packing> q) requires(Len == 3) {
|
||||||
|
alignas(16) float qBuf[4]; wasm_v128_store(qBuf, q.v);
|
||||||
|
alignas(16) float qvBuf[4] = {0,0,0,0};
|
||||||
|
alignas(16) float qwBuf[4] = {0,0,0,0};
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p) {
|
||||||
|
qvBuf[p * 3 + 0] = qBuf[p * 4 + 0];
|
||||||
|
qvBuf[p * 3 + 1] = qBuf[p * 4 + 1];
|
||||||
|
qvBuf[p * 3 + 2] = qBuf[p * 4 + 2];
|
||||||
|
for (std::uint8_t i = 0; i < 3; ++i) qwBuf[p * 3 + i] = qBuf[p * 4 + 3];
|
||||||
|
}
|
||||||
|
VectorF32<3, Packing> qv(wasm_v128_load(qvBuf));
|
||||||
|
VectorF32<3, Packing> qwBroadcast(wasm_v128_load(qwBuf));
|
||||||
|
VectorF32<3, Packing> t = Cross(qv, v) * 2.0f;
|
||||||
|
return v + t * qwBroadcast + Cross(qv, t);
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr static VectorF32<3, Packing> RotatePivot(VectorF32<3, Packing> v, VectorF32<4, Packing> q, VectorF32<3, Packing> pivot) requires(Len == 3) {
|
||||||
|
VectorF32<3, Packing> translated = v - pivot;
|
||||||
|
return Rotate(translated, q) + pivot;
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr static VectorF32<4, Packing> QuanternionFromEuler(VectorF32<3, Packing> eulerHalf) requires(Len == 4) {
|
||||||
|
alignas(16) float eulerBuf[4]; wasm_v128_store(eulerBuf, eulerHalf.v);
|
||||||
|
alignas(16) float outBuf[4] = {0,0,0,0};
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p) {
|
||||||
|
float roll = eulerBuf[p * 3 + 0];
|
||||||
|
float pitch = eulerBuf[p * 3 + 1];
|
||||||
|
float yaw = eulerBuf[p * 3 + 2];
|
||||||
|
float sr = std::sin(roll), cr = std::cos(roll);
|
||||||
|
float sp = std::sin(pitch), cp = std::cos(pitch);
|
||||||
|
float sy = std::sin(yaw), cy = std::cos(yaw);
|
||||||
|
outBuf[p * 4 + 0] = sr * cp * cy - cr * sp * sy;
|
||||||
|
outBuf[p * 4 + 1] = cr * sp * cy + sr * cp * sy;
|
||||||
|
outBuf[p * 4 + 2] = cr * cp * sy - sr * sp * cy;
|
||||||
|
outBuf[p * 4 + 3] = cr * cp * cy + sr * sp * sy;
|
||||||
|
}
|
||||||
|
return VectorF32<4, Packing>(wasm_v128_load(outBuf));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
#else
|
||||||
|
// Scalar software fallback for non-x86_64 targets. Future arches can swap
|
||||||
|
// in their own intrinsic implementation by adding an arch-specific branch
|
||||||
|
// above and gating this one out.
|
||||||
|
export template <std::uint8_t Len, std::uint8_t Packing>
|
||||||
|
struct VectorF32 : public VectorBase<Len, Packing, float> {
|
||||||
|
template <std::uint8_t Len2, std::uint8_t Packing2>
|
||||||
|
friend struct VectorF32;
|
||||||
|
using Base = VectorBase<Len, Packing, float>;
|
||||||
|
static constexpr std::uint8_t NElems = Base::AlignmentElement;
|
||||||
|
|
||||||
|
constexpr VectorF32() = default;
|
||||||
|
constexpr VectorF32(typename Base::VectorType vv) {
|
||||||
|
this->v = vv;
|
||||||
|
}
|
||||||
|
constexpr VectorF32(const float* vB) { Load(vB); }
|
||||||
|
#ifdef __FLT16_MAX__
|
||||||
|
constexpr VectorF32(const _Float16* vB) { Load(vB); }
|
||||||
|
#endif
|
||||||
|
constexpr VectorF32(float val) {
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = val;
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr void Load(const float* vB) {
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = vB[i];
|
||||||
|
}
|
||||||
|
constexpr void Store(float* vB) const {
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) vB[i] = this->v[i];
|
||||||
|
}
|
||||||
|
#ifdef __FLT16_MAX__
|
||||||
|
constexpr void Load(const _Float16* vB) {
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = static_cast<float>(vB[i]);
|
||||||
|
}
|
||||||
|
constexpr void Store(_Float16* vB) const {
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) vB[i] = static_cast<_Float16>(this->v[i]);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
constexpr std::array<T, NElems> Store() const {
|
||||||
|
std::array<T, NElems> r{};
|
||||||
|
Store(r.data());
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <std::uint8_t BLen, std::uint8_t BPacking>
|
||||||
|
constexpr operator VectorF32<BLen, BPacking>() const {
|
||||||
|
VectorF32<BLen, BPacking> r;
|
||||||
|
const std::uint8_t copyLen = (BLen < Len) ? BLen : Len;
|
||||||
|
const std::uint8_t copyPack = (BPacking < Packing) ? BPacking : Packing;
|
||||||
|
for (std::uint8_t p = 0; p < copyPack; ++p)
|
||||||
|
for (std::uint8_t i = 0; i < copyLen; ++i)
|
||||||
|
r.v[p * BLen + i] = this->v[p * Len + i];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr VectorF32<Len, Packing> operator+(VectorF32<Len, Packing> b) const {
|
||||||
|
VectorF32<Len, Packing> r;
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] + b.v[i];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
constexpr VectorF32<Len, Packing> operator-(VectorF32<Len, Packing> b) const {
|
||||||
|
VectorF32<Len, Packing> r;
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] - b.v[i];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
constexpr VectorF32<Len, Packing> operator*(VectorF32<Len, Packing> b) const {
|
||||||
|
VectorF32<Len, Packing> r;
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] * b.v[i];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
constexpr VectorF32<Len, Packing> operator/(VectorF32<Len, Packing> b) const {
|
||||||
|
VectorF32<Len, Packing> r;
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] / b.v[i];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr void operator+=(VectorF32<Len, Packing> b) { for (std::uint8_t i=0;i<NElems;++i) this->v[i] += b.v[i]; }
|
||||||
|
constexpr void operator-=(VectorF32<Len, Packing> b) { for (std::uint8_t i=0;i<NElems;++i) this->v[i] -= b.v[i]; }
|
||||||
|
constexpr void operator*=(VectorF32<Len, Packing> b) { for (std::uint8_t i=0;i<NElems;++i) this->v[i] *= b.v[i]; }
|
||||||
|
constexpr void operator/=(VectorF32<Len, Packing> b) { for (std::uint8_t i=0;i<NElems;++i) this->v[i] /= b.v[i]; }
|
||||||
|
|
||||||
|
constexpr VectorF32<Len, Packing> operator+(float b) const { return *this + VectorF32<Len, Packing>(b); }
|
||||||
|
constexpr VectorF32<Len, Packing> operator-(float b) const { return *this - VectorF32<Len, Packing>(b); }
|
||||||
|
constexpr VectorF32<Len, Packing> operator*(float b) const { return *this * VectorF32<Len, Packing>(b); }
|
||||||
|
constexpr VectorF32<Len, Packing> operator/(float b) const { return *this / VectorF32<Len, Packing>(b); }
|
||||||
|
constexpr void operator+=(float b) { *this += VectorF32<Len, Packing>(b); }
|
||||||
|
constexpr void operator-=(float b) { *this -= VectorF32<Len, Packing>(b); }
|
||||||
|
constexpr void operator*=(float b) { *this *= VectorF32<Len, Packing>(b); }
|
||||||
|
constexpr void operator/=(float b) { *this /= VectorF32<Len, Packing>(b); }
|
||||||
|
|
||||||
|
constexpr VectorF32<Len, Packing> operator-() const {
|
||||||
|
VectorF32<Len, Packing> r;
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = -this->v[i];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr bool operator==(VectorF32<Len, Packing> b) const {
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p)
|
||||||
|
for (std::uint8_t i = 0; i < Len; ++i)
|
||||||
|
if (this->v[p * Len + i] != b.v[p * Len + i]) return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
constexpr bool operator!=(VectorF32<Len, Packing> b) const { return !(*this == b); }
|
||||||
|
|
||||||
|
template<std::uint32_t ExtractLen>
|
||||||
|
constexpr VectorF32<ExtractLen, Packing> ExtractLo() const {
|
||||||
|
VectorF32<ExtractLen, Packing> r;
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p)
|
||||||
|
for (std::uint8_t i = 0; i < ExtractLen; ++i)
|
||||||
|
r.v[p * ExtractLen + i] = this->v[p * Len + i];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr VectorF32<Len, Packing> Cos() const {
|
||||||
|
VectorF32<Len, Packing> r;
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = std::cos(this->v[i]);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
constexpr VectorF32<Len, Packing> Sin() const {
|
||||||
|
VectorF32<Len, Packing> r;
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = std::sin(this->v[i]);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
constexpr std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>> SinCos() const {
|
||||||
|
return { Sin(), Cos() };
|
||||||
|
}
|
||||||
|
|
||||||
|
template <std::array<bool, Len> values>
|
||||||
|
constexpr VectorF32<Len, Packing> Negate() const {
|
||||||
|
VectorF32<Len, Packing> r;
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p)
|
||||||
|
for (std::uint8_t i = 0; i < Len; ++i)
|
||||||
|
r.v[p * Len + i] = values[i] ? -this->v[p * Len + i] : this->v[p * Len + i];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
static constexpr VectorF32<Len, Packing> MulitplyAdd(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b, VectorF32<Len, Packing> add) {
|
||||||
|
VectorF32<Len, Packing> r;
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = a.v[i] * b.v[i] + add.v[i];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
static constexpr VectorF32<Len, Packing> MulitplySub(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b, VectorF32<Len, Packing> sub) {
|
||||||
|
VectorF32<Len, Packing> r;
|
||||||
|
for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = a.v[i] * b.v[i] - sub.v[i];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr static VectorF32<Len, Packing> Cross(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b) requires(Len == 3) {
|
||||||
|
VectorF32<Len, Packing> r;
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p) {
|
||||||
|
const std::uint8_t base = p * 3;
|
||||||
|
r.v[base + 0] = a.v[base + 1] * b.v[base + 2] - a.v[base + 2] * b.v[base + 1];
|
||||||
|
r.v[base + 1] = a.v[base + 2] * b.v[base + 0] - a.v[base + 0] * b.v[base + 2];
|
||||||
|
r.v[base + 2] = a.v[base + 0] * b.v[base + 1] - a.v[base + 1] * b.v[base + 0];
|
||||||
|
}
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <const std::array<std::uint8_t, Len> ShuffleValues>
|
||||||
|
constexpr VectorF32<Len, Packing> Shuffle() const {
|
||||||
|
VectorF32<Len, Packing> r;
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p)
|
||||||
|
for (std::uint8_t i = 0; i < Len; ++i)
|
||||||
|
r.v[p * Len + i] = this->v[p * Len + ShuffleValues[i]];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <std::array<bool, Len> ShuffleValues>
|
||||||
|
constexpr static VectorF32<Len, Packing> Blend(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b) {
|
||||||
|
VectorF32<Len, Packing> r;
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p)
|
||||||
|
for (std::uint8_t i = 0; i < Len; ++i)
|
||||||
|
r.v[p * Len + i] = ShuffleValues[i] ? b.v[p * Len + i] : a.v[p * Len + i];
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename... Rest>
|
||||||
|
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
|
||||||
|
constexpr static auto LengthSq(VectorF32<Len, Packing> first, Rest... rest) {
|
||||||
|
constexpr std::uint8_t N = 1 + sizeof...(Rest);
|
||||||
|
VectorF32<1, static_cast<std::uint8_t>(Packing * N)> r;
|
||||||
|
std::array<VectorF32<Len, Packing>, N> args{ first, rest... };
|
||||||
|
for (std::uint8_t i = 0; i < N; ++i)
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p) {
|
||||||
|
float acc = 0.0f;
|
||||||
|
for (std::uint8_t k = 0; k < Len; ++k) {
|
||||||
|
float x = args[i].v[p * Len + k];
|
||||||
|
acc += x * x;
|
||||||
|
}
|
||||||
|
r.v[i * Packing + p] = acc;
|
||||||
|
}
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename... Rest>
|
||||||
|
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
|
||||||
|
constexpr static auto Length(VectorF32<Len, Packing> first, Rest... rest) {
|
||||||
|
auto sq = LengthSq(first, rest...);
|
||||||
|
for (std::uint8_t i = 0; i < decltype(sq)::NElems; ++i) sq.v[i] = std::sqrt(sq.v[i]);
|
||||||
|
return sq;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename... Rest>
|
||||||
|
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...))
|
||||||
|
constexpr static auto Normalize(VectorF32<Len, Packing> first, Rest... rest) {
|
||||||
|
auto normOne = [](VectorF32<Len, Packing> u) {
|
||||||
|
VectorF32<Len, Packing> out;
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p) {
|
||||||
|
float acc = 0.0f;
|
||||||
|
for (std::uint8_t k = 0; k < Len; ++k) {
|
||||||
|
float x = u.v[p * Len + k];
|
||||||
|
acc += x * x;
|
||||||
|
}
|
||||||
|
float invLen = acc > 0.0f ? 1.0f / std::sqrt(acc) : 0.0f;
|
||||||
|
for (std::uint8_t k = 0; k < Len; ++k)
|
||||||
|
out.v[p * Len + k] = u.v[p * Len + k] * invLen;
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
};
|
||||||
|
return std::make_tuple(normOne(first), normOne(rest)...);
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr static VectorF32<Len, Packing> Rotate(VectorF32<3, Packing> v, VectorF32<4, Packing> q) requires(Len == 3) {
|
||||||
|
VectorF32<3, Packing> qv;
|
||||||
|
VectorF32<3, Packing> qwBroadcast;
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p) {
|
||||||
|
qv.v[p * 3 + 0] = q.v[p * 4 + 0];
|
||||||
|
qv.v[p * 3 + 1] = q.v[p * 4 + 1];
|
||||||
|
qv.v[p * 3 + 2] = q.v[p * 4 + 2];
|
||||||
|
for (std::uint8_t i = 0; i < 3; ++i) qwBroadcast.v[p * 3 + i] = q.v[p * 4 + 3];
|
||||||
|
}
|
||||||
|
VectorF32<3, Packing> t = Cross(qv, v) * 2.0f;
|
||||||
|
return v + t * qwBroadcast + Cross(qv, t);
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr static VectorF32<3, Packing> RotatePivot(VectorF32<3, Packing> v, VectorF32<4, Packing> q, VectorF32<3, Packing> pivot) requires(Len == 3) {
|
||||||
|
VectorF32<3, Packing> translated = v - pivot;
|
||||||
|
return Rotate(translated, q) + pivot;
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr static VectorF32<4, Packing> QuanternionFromEuler(VectorF32<3, Packing> eulerHalf) requires(Len == 4) {
|
||||||
|
VectorF32<4, Packing> r;
|
||||||
|
for (std::uint8_t p = 0; p < Packing; ++p) {
|
||||||
|
float roll = eulerHalf.v[p * 3 + 0];
|
||||||
|
float pitch = eulerHalf.v[p * 3 + 1];
|
||||||
|
float yaw = eulerHalf.v[p * 3 + 2];
|
||||||
|
float sr = std::sin(roll), cr = std::cos(roll);
|
||||||
|
float sp = std::sin(pitch), cp = std::cos(pitch);
|
||||||
|
float sy = std::sin(yaw), cy = std::cos(yaw);
|
||||||
|
r.v[p * 4 + 0] = sr * cp * cy - cr * sp * sy;
|
||||||
|
r.v[p * 4 + 1] = cr * sp * cy + sr * cp * sy;
|
||||||
|
r.v[p * 4 + 2] = cr * cp * sy - sr * sp * cy;
|
||||||
|
r.v[p * 4 + 3] = cr * cp * cy + sr * sp * sy;
|
||||||
|
}
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
};
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __x86_64
|
|
||||||
export template <std::uint32_t Len, std::uint32_t Packing>
|
export template <std::uint32_t Len, std::uint32_t Packing>
|
||||||
struct std::formatter<Crafter::VectorF32<Len, Packing>> : std::formatter<std::string> {
|
struct std::formatter<Crafter::VectorF32<Len, Packing>> : std::formatter<std::string> {
|
||||||
constexpr auto format(const Crafter::VectorF32<Len, Packing>& obj, format_context& ctx) const {
|
constexpr auto format(const Crafter::VectorF32<Len, Packing>& obj, format_context& ctx) const {
|
||||||
|
|
@ -1403,5 +1915,4 @@ struct std::formatter<Crafter::VectorF32<Len, Packing>> : std::formatter<std::st
|
||||||
out += "}";
|
out += "}";
|
||||||
return std::formatter<std::string>::format(out, ctx);
|
return std::formatter<std::string>::format(out, ctx);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
#endif
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue