wasm SIMD

This commit is contained in:
Jorijn van der Graaf 2026-05-18 05:23:49 +02:00
commit 48e3b8e26c
4 changed files with 803 additions and 12 deletions

View file

@ -24,6 +24,11 @@ export module Crafter.Math:VectorF16;
import std;
import :Common;
// A real VectorF16 struct is provided only when _Float16 is available AND we
// are either on an x86_64 build with AVX512FP16 (intrinsic impl below) or on a
// non-x86_64 target (scalar fallback further below). On x86_64 without
// AVX512FP16 or anywhere without _Float16, Crafter.Math:Basic aliases
// VectorF16 to VectorF32 instead.
#ifdef __AVX512FP16__
namespace Crafter {
export template <std::uint8_t Len, std::uint8_t Packing>
@ -1027,6 +1032,259 @@ namespace Crafter {
}
#elif !defined(__x86_64) && defined(__FLT16_MAX__)
// Scalar software fallback for non-x86_64 targets that still have _Float16.
namespace Crafter {
    /// Scalar, loop-based implementation of VectorF16 built on _Float16.
    ///
    /// Storage is `this->v`, inherited from VectorBase, and holds NElems lanes.
    /// A vector is `Packing` groups of `Len` components; component `i` of group
    /// `p` lives at `v[p * Len + i]`. NElems comes from Base::AlignmentElement
    /// (defined in :Common, not visible here) and may exceed Len * Packing.
    ///
    /// Lane hygiene: element-wise operations below walk all NElems lanes, so
    /// every function that only writes the Len * Packing logical lanes starts
    /// from a zero-filled result. Otherwise later arithmetic (or constant
    /// evaluation) would read indeterminate values from the trailing lanes.
    export template <std::uint8_t Len, std::uint8_t Packing>
    struct VectorF16 : public VectorBase<Len, Packing, _Float16> {
        // Other specializations need access to each other's storage for
        // conversions, ExtractLo, LengthSq and Rotate.
        template <std::uint8_t Len2, std::uint8_t Packing2>
        friend struct VectorF16;

        using Base = VectorBase<Len, Packing, _Float16>;

        /// Number of lanes physically stored in `v`.
        static constexpr std::uint8_t NElems = Base::AlignmentElement;

        /// Leaves the lanes default-initialized.
        constexpr VectorF16() = default;

        /// Wraps an already-built storage value.
        constexpr VectorF16(typename Base::VectorType vv) {
            this->v = vv;
        }

        /// Loads NElems values from `vB` (see Load).
        constexpr VectorF16(const _Float16* vB) { Load(vB); }

        /// Broadcasts `val` to every one of the NElems lanes.
        constexpr VectorF16(_Float16 val) {
            for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = val;
        }

        /// Reads NElems consecutive values starting at `vB`.
        constexpr void Load(const _Float16* vB) {
            for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = vB[i];
        }

        /// Writes NElems consecutive values starting at `vB`.
        constexpr void Store(_Float16* vB) const {
            for (std::uint8_t i = 0; i < NElems; ++i) vB[i] = this->v[i];
        }

        /// Returns all NElems lanes as an array.
        /// NOTE(review): `T` is not used by this fallback and cannot be
        /// deduced, so callers must spell it explicitly; the result is always
        /// an array of _Float16 regardless of `T`. Confirm this matches the
        /// other VectorF16 implementation.
        template<typename T>
        constexpr std::array<_Float16, NElems> Store() const {
            std::array<_Float16, NElems> r{};
            Store(r.data());
            return r;
        }

        /// Converts to a vector with a different Len and/or Packing.
        /// The first min(Len, BLen) components of the first
        /// min(Packing, BPacking) groups are copied; all other lanes are zero.
        template <std::uint8_t BLen, std::uint8_t BPacking>
        constexpr operator VectorF16<BLen, BPacking>() const {
            // Zero-fill so lanes that receive no source component are defined.
            VectorF16<BLen, BPacking> r(_Float16(0));
            const std::uint8_t copyLen = (BLen < Len) ? BLen : Len;
            const std::uint8_t copyPack = (BPacking < Packing) ? BPacking : Packing;
            for (std::uint8_t p = 0; p < copyPack; ++p)
                for (std::uint8_t i = 0; i < copyLen; ++i)
                    r.v[p * BLen + i] = this->v[p * Len + i];
            return r;
        }

        // ---- Element-wise arithmetic over all NElems lanes ----------------

        constexpr VectorF16<Len, Packing> operator+(VectorF16<Len, Packing> b) const {
            VectorF16<Len, Packing> r;
            for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] + b.v[i];
            return r;
        }
        constexpr VectorF16<Len, Packing> operator-(VectorF16<Len, Packing> b) const {
            VectorF16<Len, Packing> r;
            for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] - b.v[i];
            return r;
        }
        constexpr VectorF16<Len, Packing> operator*(VectorF16<Len, Packing> b) const {
            VectorF16<Len, Packing> r;
            for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] * b.v[i];
            return r;
        }
        constexpr VectorF16<Len, Packing> operator/(VectorF16<Len, Packing> b) const {
            VectorF16<Len, Packing> r;
            for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = this->v[i] / b.v[i];
            return r;
        }

        constexpr void operator+=(VectorF16<Len, Packing> b) { for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] += b.v[i]; }
        constexpr void operator-=(VectorF16<Len, Packing> b) { for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] -= b.v[i]; }
        constexpr void operator*=(VectorF16<Len, Packing> b) { for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] *= b.v[i]; }
        constexpr void operator/=(VectorF16<Len, Packing> b) { for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] /= b.v[i]; }

        // ---- Scalar overloads: broadcast `b`, then reuse the vector forms --

        constexpr VectorF16<Len, Packing> operator+(_Float16 b) const { return *this + VectorF16<Len, Packing>(b); }
        constexpr VectorF16<Len, Packing> operator-(_Float16 b) const { return *this - VectorF16<Len, Packing>(b); }
        constexpr VectorF16<Len, Packing> operator*(_Float16 b) const { return *this * VectorF16<Len, Packing>(b); }
        constexpr VectorF16<Len, Packing> operator/(_Float16 b) const { return *this / VectorF16<Len, Packing>(b); }

        constexpr void operator+=(_Float16 b) { *this += VectorF16<Len, Packing>(b); }
        constexpr void operator-=(_Float16 b) { *this -= VectorF16<Len, Packing>(b); }
        constexpr void operator*=(_Float16 b) { *this *= VectorF16<Len, Packing>(b); }
        constexpr void operator/=(_Float16 b) { *this /= VectorF16<Len, Packing>(b); }

        /// Negates every lane.
        constexpr VectorF16<Len, Packing> operator-() const {
            VectorF16<Len, Packing> r;
            for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = -this->v[i];
            return r;
        }

        /// Compares only the Len * Packing logical lanes; trailing lanes are ignored.
        constexpr bool operator==(VectorF16<Len, Packing> b) const {
            for (std::uint8_t p = 0; p < Packing; ++p)
                for (std::uint8_t i = 0; i < Len; ++i)
                    if (this->v[p * Len + i] != b.v[p * Len + i]) return false;
            return true;
        }
        constexpr bool operator!=(VectorF16<Len, Packing> b) const { return !(*this == b); }

        /// Keeps the first ExtractLen components of every group; other lanes are zero.
        template<std::uint32_t ExtractLen>
        constexpr VectorF16<ExtractLen, Packing> ExtractLo() const {
            VectorF16<ExtractLen, Packing> r(_Float16(0));
            for (std::uint8_t p = 0; p < Packing; ++p)
                for (std::uint8_t i = 0; i < ExtractLen; ++i)
                    r.v[p * ExtractLen + i] = this->v[p * Len + i];
            return r;
        }

        // Transcendentals are computed via float since libstdc++ doesn't always
        // provide overloads for _Float16; the result is rounded back to half.

        /// Cosine of every lane.
        constexpr VectorF16<Len, Packing> Cos() const {
            VectorF16<Len, Packing> r;
            for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = static_cast<_Float16>(std::cos(static_cast<float>(this->v[i])));
            return r;
        }

        /// Sine of every lane.
        constexpr VectorF16<Len, Packing> Sin() const {
            VectorF16<Len, Packing> r;
            for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = static_cast<_Float16>(std::sin(static_cast<float>(this->v[i])));
            return r;
        }

        /// Returns the pair (Sin(), Cos()), in that order.
        constexpr std::tuple<VectorF16<Len, Packing>, VectorF16<Len, Packing>> SinCos() const {
            return { Sin(), Cos() };
        }

        /// Flips the sign of component `i` of every group where `values[i]` is true.
        template <std::array<bool, Len> values>
        constexpr VectorF16<Len, Packing> Negate() const {
            VectorF16<Len, Packing> r(_Float16(0));
            for (std::uint8_t p = 0; p < Packing; ++p)
                for (std::uint8_t i = 0; i < Len; ++i)
                    r.v[p * Len + i] = values[i] ? -this->v[p * Len + i] : this->v[p * Len + i];
            return r;
        }

        /// Lane-wise `a * b + add`, evaluated as a separate multiply and add.
        static constexpr VectorF16<Len, Packing> MulitplyAdd(VectorF16<Len, Packing> a, VectorF16<Len, Packing> b, VectorF16<Len, Packing> add) {
            VectorF16<Len, Packing> r;
            for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = a.v[i] * b.v[i] + add.v[i];
            return r;
        }

        /// Lane-wise `a * b - sub`, evaluated as a separate multiply and subtract.
        static constexpr VectorF16<Len, Packing> MulitplySub(VectorF16<Len, Packing> a, VectorF16<Len, Packing> b, VectorF16<Len, Packing> sub) {
            VectorF16<Len, Packing> r;
            for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = a.v[i] * b.v[i] - sub.v[i];
            return r;
        }

        /// Per-group 3D cross product `a x b`; lanes outside the groups are zero.
        constexpr static VectorF16<Len, Packing> Cross(VectorF16<Len, Packing> a, VectorF16<Len, Packing> b) requires(Len == 3) {
            VectorF16<Len, Packing> r(_Float16(0));
            for (std::uint8_t p = 0; p < Packing; ++p) {
                const std::uint8_t base = p * 3;
                r.v[base + 0] = a.v[base + 1] * b.v[base + 2] - a.v[base + 2] * b.v[base + 1];
                r.v[base + 1] = a.v[base + 2] * b.v[base + 0] - a.v[base + 0] * b.v[base + 2];
                r.v[base + 2] = a.v[base + 0] * b.v[base + 1] - a.v[base + 1] * b.v[base + 0];
            }
            return r;
        }

        /// Permutes the components inside every group: output component `i`
        /// is input component `ShuffleValues[i]` of the same group.
        template <const std::array<std::uint8_t, Len> ShuffleValues>
        constexpr VectorF16<Len, Packing> Shuffle() const {
            VectorF16<Len, Packing> r(_Float16(0));
            for (std::uint8_t p = 0; p < Packing; ++p)
                for (std::uint8_t i = 0; i < Len; ++i)
                    r.v[p * Len + i] = this->v[p * Len + ShuffleValues[i]];
            return r;
        }

        /// Per-component select: takes `b` where `ShuffleValues[i]` is true, else `a`.
        template <std::array<bool, Len> ShuffleValues>
        constexpr static VectorF16<Len, Packing> Blend(VectorF16<Len, Packing> a, VectorF16<Len, Packing> b) {
            VectorF16<Len, Packing> r(_Float16(0));
            for (std::uint8_t p = 0; p < Packing; ++p)
                for (std::uint8_t i = 0; i < Len; ++i)
                    r.v[p * Len + i] = ShuffleValues[i] ? b.v[p * Len + i] : a.v[p * Len + i];
            return r;
        }

        /// Squared length of every group of every argument.
        /// The result has Len 1 and Packing * N groups, where N is the number
        /// of arguments; the value for argument `i`, group `p` is stored at
        /// index `i * Packing + p`. Accumulation is done in _Float16.
        template<typename... Rest>
        requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...))
        constexpr static auto LengthSq(VectorF16<Len, Packing> first, Rest... rest) {
            constexpr std::uint8_t N = 1 + sizeof...(Rest);
            // Zero-fill: Length() below walks every lane of this result.
            VectorF16<1, static_cast<std::uint8_t>(Packing * N)> r(_Float16(0));
            std::array<VectorF16<Len, Packing>, N> args{ first, rest... };
            for (std::uint8_t i = 0; i < N; ++i)
                for (std::uint8_t p = 0; p < Packing; ++p) {
                    _Float16 acc = _Float16(0);
                    for (std::uint8_t k = 0; k < Len; ++k) {
                        _Float16 x = args[i].v[p * Len + k];
                        acc += x * x;
                    }
                    r.v[i * Packing + p] = acc;
                }
            return r;
        }

        /// Length of every group of every argument; same layout as LengthSq.
        /// The square root is taken in float and rounded back to half.
        template<typename... Rest>
        requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...))
        constexpr static auto Length(VectorF16<Len, Packing> first, Rest... rest) {
            auto sq = LengthSq(first, rest...);
            for (std::uint8_t i = 0; i < decltype(sq)::NElems; ++i)
                sq.v[i] = static_cast<_Float16>(std::sqrt(static_cast<float>(sq.v[i])));
            return sq;
        }

        /// Normalizes every group of every argument and returns the results as
        /// a tuple in argument order. The squared length is accumulated in
        /// float; when it is not greater than zero the scale factor is 0.
        template<typename... Rest>
        requires((std::is_same_v<Rest, VectorF16<Len, Packing>> && ...))
        constexpr static auto Normalize(VectorF16<Len, Packing> first, Rest... rest) {
            auto normOne = [](VectorF16<Len, Packing> u) {
                VectorF16<Len, Packing> out(_Float16(0));
                for (std::uint8_t p = 0; p < Packing; ++p) {
                    float acc = 0.0f;
                    for (std::uint8_t k = 0; k < Len; ++k) {
                        float x = static_cast<float>(u.v[p * Len + k]);
                        acc += x * x;
                    }
                    _Float16 invLen = acc > 0.0f ? static_cast<_Float16>(1.0f / std::sqrt(acc)) : _Float16(0);
                    for (std::uint8_t k = 0; k < Len; ++k)
                        out.v[p * Len + k] = u.v[p * Len + k] * invLen;
                }
                return out;
            };
            return std::make_tuple(normOne(first), normOne(rest)...);
        }

        /// Rotates each 3-component group of `v` by the matching quaternion in
        /// `q`, whose components are read as (x, y, z, w), using
        /// `t = 2 * (q.xyz x v); result = v + q.w * t + (q.xyz x t)`.
        constexpr static VectorF16<Len, Packing> Rotate(VectorF16<3, Packing> v, VectorF16<4, Packing> q) requires(Len == 3) {
            // Both temporaries feed full-width operators, so start them at zero.
            VectorF16<3, Packing> qv(_Float16(0));
            VectorF16<3, Packing> qwBroadcast(_Float16(0));
            for (std::uint8_t p = 0; p < Packing; ++p) {
                qv.v[p * 3 + 0] = q.v[p * 4 + 0];
                qv.v[p * 3 + 1] = q.v[p * 4 + 1];
                qv.v[p * 3 + 2] = q.v[p * 4 + 2];
                for (std::uint8_t i = 0; i < 3; ++i) qwBroadcast.v[p * 3 + i] = q.v[p * 4 + 3];
            }
            VectorF16<3, Packing> t = Cross(qv, v) * _Float16(2);
            return v + t * qwBroadcast + Cross(qv, t);
        }

        /// Rotates `v` by `q` around `pivot`: translate to the pivot, rotate, translate back.
        constexpr static VectorF16<3, Packing> RotatePivot(VectorF16<3, Packing> v, VectorF16<4, Packing> q, VectorF16<3, Packing> pivot) requires(Len == 3) {
            VectorF16<3, Packing> translated = v - pivot;
            return Rotate(translated, q) + pivot;
        }

        /// Builds one (x, y, z, w) quaternion per group from the angles in
        /// `eulerHalf`, read as (roll, pitch, yaw). The angles are used as
        /// given (no halving is done here); the trigonometry runs in float.
        constexpr static VectorF16<4, Packing> QuanternionFromEuler(VectorF16<3, Packing> eulerHalf) requires(Len == 4) {
            VectorF16<4, Packing> r(_Float16(0));
            for (std::uint8_t p = 0; p < Packing; ++p) {
                float roll = static_cast<float>(eulerHalf.v[p * 3 + 0]);
                float pitch = static_cast<float>(eulerHalf.v[p * 3 + 1]);
                float yaw = static_cast<float>(eulerHalf.v[p * 3 + 2]);
                float sr = std::sin(roll), cr = std::cos(roll);
                float sp = std::sin(pitch), cp = std::cos(pitch);
                float sy = std::sin(yaw), cy = std::cos(yaw);
                r.v[p * 4 + 0] = static_cast<_Float16>(sr * cp * cy - cr * sp * sy);
                r.v[p * 4 + 1] = static_cast<_Float16>(cr * sp * cy + sr * cp * sy);
                r.v[p * 4 + 2] = static_cast<_Float16>(cr * cp * sy - sr * sp * cy);
                r.v[p * 4 + 3] = static_cast<_Float16>(cr * cp * cy + sr * sp * sy);
            }
            return r;
        }
    };
}
#endif
#if defined(__FLT16_MAX__) && (!defined(__x86_64) || defined(__AVX512FP16__))
export template <std::uint32_t Len, std::uint32_t Packing>
struct std::formatter<Crafter::VectorF16<Len, Packing>> : std::formatter<std::string> {
constexpr auto format(const Crafter::VectorF16<Len, Packing>& obj, format_context& ctx) const {