RVV
This commit is contained in:
parent
f0becd1582
commit
35091b2c53
5 changed files with 322 additions and 12 deletions
|
|
@ -28,7 +28,7 @@ namespace Crafter {
|
|||
return degrees * (std::numbers::pi / 180);
|
||||
}
|
||||
|
||||
#if (defined(__x86_64) && !defined(__AVX512FP16__)) || !defined(__FLT16_MAX__)
|
||||
#if (defined(__x86_64) && !defined(__AVX512FP16__)) || defined(__riscv_vector) || !defined(__FLT16_MAX__)
|
||||
export template <std::uint32_t Len, std::uint32_t Packing>
|
||||
using VectorF16 = VectorF32<Len, Packing>;
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -5,15 +5,38 @@ module;
|
|||
#ifdef __wasm_simd128__
|
||||
#include <wasm_simd128.h>
|
||||
#endif
|
||||
#ifdef __riscv_vector
|
||||
#include <riscv_vector.h>
|
||||
// Compile-time VLEN selection. RVV is VLA at the ISA level, but storage in
|
||||
// this library is fixed-size, so we pin to the widest VLEN the toolchain has
|
||||
// guaranteed at compile time:
|
||||
// __riscv_v_fixed_vlen — Clang's -mrvv-vector-bits=N mode.
|
||||
// __riscv_v_min_vlen — minimum guaranteed VLEN from the march (e.g.
|
||||
// rv64gcv_zvl256b → 256). Set by both GCC and Clang.
|
||||
// Falls back to the RVA23 baseline of ZVL128B otherwise.
|
||||
#if defined(__riscv_v_fixed_vlen)
|
||||
#define CRAFTER_RVV_VLEN __riscv_v_fixed_vlen
|
||||
#elif defined(__riscv_v_min_vlen)
|
||||
#define CRAFTER_RVV_VLEN __riscv_v_min_vlen
|
||||
#else
|
||||
#define CRAFTER_RVV_VLEN 128
|
||||
#endif
|
||||
// 16/32/64-byte storage types, mirroring x86's __m128/__m256/__m512 tier.
|
||||
// The compiler emits RVV vle/vse/vfadd/... on these GNU vectors when the
|
||||
// target's V extension is enabled.
|
||||
typedef float __crafter_rvv_v128_f32 __attribute__((vector_size(16), aligned(16)));
|
||||
typedef float __crafter_rvv_v256_f32 __attribute__((vector_size(32), aligned(32)));
|
||||
typedef float __crafter_rvv_v512_f32 __attribute__((vector_size(64), aligned(64)));
|
||||
#endif
|
||||
export module Crafter.Math:Common;
|
||||
import std;
|
||||
|
||||
// VectorF16 exists as a real struct when _Float16 is available AND we are not
|
||||
// on x86_64 without AVX512FP16 (that path aliases VectorF16 to VectorF32 in
|
||||
// Crafter.Math:Basic for performance). Each translation unit that needs this
|
||||
// distinction redefines the same condition since macros do not cross module
|
||||
// boundaries.
|
||||
#if defined(__FLT16_MAX__) && (!defined(__x86_64) || defined(__AVX512FP16__))
|
||||
// Crafter.Math:Basic for performance). The same alias kicks in on RISC-V until
|
||||
// a Zvfh path lands. Each translation unit that needs this distinction
|
||||
// redefines the same condition since macros do not cross module boundaries.
|
||||
#if defined(__FLT16_MAX__) && (!defined(__x86_64) || defined(__AVX512FP16__)) && !defined(__riscv_vector)
|
||||
namespace Crafter {
|
||||
export template <std::uint8_t Len, std::uint8_t Packing>
|
||||
struct VectorF16;
|
||||
|
|
@ -26,7 +49,7 @@ namespace Crafter {
|
|||
|
||||
template <std::uint8_t Len, std::uint8_t Packing, typename T>
|
||||
struct VectorBase {
|
||||
#if defined(__FLT16_MAX__) && (!defined(__x86_64) || defined(__AVX512FP16__))
|
||||
#if defined(__FLT16_MAX__) && (!defined(__x86_64) || defined(__AVX512FP16__)) && !defined(__riscv_vector)
|
||||
template <std::uint8_t L, std::uint8_t P>
|
||||
friend struct VectorF16;
|
||||
#endif
|
||||
|
|
@ -63,6 +86,18 @@ namespace Crafter {
|
|||
>;
|
||||
#elif defined(__wasm_simd128__)
|
||||
using VectorType = v128_t;
|
||||
#elif defined(__riscv_vector)
|
||||
// RVV tier mirrors the x86 selector: pick the widest register the
|
||||
// toolchain guarantees, then size each instantiation down to the
|
||||
// smallest tier that fits Len*Packing. _Float16 never materialises
|
||||
// here because VectorF16 aliases VectorF32 on RISC-V until a Zvfh
|
||||
// path lands.
|
||||
using VectorType = std::conditional_t<
|
||||
std::is_same_v<T, float>,
|
||||
std::conditional_t<(Len * Packing > 8), __crafter_rvv_v512_f32,
|
||||
std::conditional_t<(Len * Packing > 4), __crafter_rvv_v256_f32, __crafter_rvv_v128_f32>>,
|
||||
std::array<T, GetAlingment()/sizeof(T)>
|
||||
>;
|
||||
#else
|
||||
using VectorType = std::array<T, GetAlingment()/sizeof(T)>;
|
||||
#endif
|
||||
|
|
@ -80,6 +115,13 @@ namespace Crafter {
|
|||
// WASM SIMD only has 128-bit vectors; cap at 16 bytes so the entire
|
||||
// VectorType always fits in a single v128_t.
|
||||
static constexpr std::uint8_t Max = 16;
|
||||
#elif defined(__riscv_vector)
|
||||
// RVV tier selected at compile time from the guaranteed VLEN. ZVL128B
|
||||
// is the RVA23 baseline; ZVL256B / ZVL512B unlock wider registers
|
||||
// when present. LMUL>1 groupings are a separate axis and could land
|
||||
// later as a batched-op path on top of this.
|
||||
static constexpr std::uint8_t Max = (CRAFTER_RVV_VLEN >= 512) ? 64 :
|
||||
(CRAFTER_RVV_VLEN >= 256) ? 32 : 16;
|
||||
#else
|
||||
static constexpr std::uint8_t Max = 32;
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -23,6 +23,9 @@ module;
|
|||
#ifdef __wasm_simd128__
|
||||
#include <wasm_simd128.h>
|
||||
#endif
|
||||
#ifdef __riscv_vector
|
||||
#include <riscv_vector.h>
|
||||
#endif
|
||||
export module Crafter.Math:VectorF32;
|
||||
import std;
|
||||
import :Common;
|
||||
|
|
@ -1695,6 +1698,254 @@ namespace Crafter {
|
|||
return VectorF32<4, Packing>(wasm_v128_load(outBuf));
|
||||
}
|
||||
};
|
||||
#elif defined(__riscv_vector)
|
||||
// RISC-V V extension implementation. Storage is a GNU vector of 16/32/64
|
||||
// bytes (picked in Common.cppm from the guaranteed VLEN); native operators
|
||||
// map to vfadd/vfsub/vfmul/vfdiv. Per-element loops compile to vrgather/
|
||||
// vmerge/vfsqrt when the autovectoriser can see the pattern, and to
|
||||
// scalar fallback otherwise. Hand-tuned <riscv_vector.h> intrinsic paths
|
||||
// (e.g. vsetvl + vfwmacc for batched dot) can land incrementally.
|
||||
export template <std::uint8_t Len, std::uint8_t Packing>
|
||||
struct VectorF32 : public VectorBase<Len, Packing, float> {
|
||||
template <std::uint8_t Len2, std::uint8_t Packing2>
|
||||
friend struct VectorF32;
|
||||
using Base = VectorBase<Len, Packing, float>;
|
||||
static constexpr std::uint8_t NElems = Base::AlignmentElement;
|
||||
|
||||
constexpr VectorF32() = default;
|
||||
constexpr VectorF32(typename Base::VectorType vv) { this->v = vv; }
|
||||
constexpr VectorF32(const float* vB) { Load(vB); }
|
||||
constexpr VectorF32(float val) {
|
||||
for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = val;
|
||||
}
|
||||
|
||||
constexpr void Load(const float* vB) {
|
||||
for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = vB[i];
|
||||
}
|
||||
constexpr void Store(float* vB) const {
|
||||
for (std::uint8_t i = 0; i < NElems; ++i) vB[i] = this->v[i];
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
constexpr std::array<T, NElems> Store() const {
|
||||
std::array<T, NElems> r{};
|
||||
Store(r.data());
|
||||
return r;
|
||||
}
|
||||
|
||||
template <std::uint8_t BLen, std::uint8_t BPacking>
|
||||
constexpr operator VectorF32<BLen, BPacking>() const {
|
||||
VectorF32<BLen, BPacking> r;
|
||||
const std::uint8_t copyLen = (BLen < Len) ? BLen : Len;
|
||||
const std::uint8_t copyPack = (BPacking < Packing) ? BPacking : Packing;
|
||||
for (std::uint8_t p = 0; p < copyPack; ++p)
|
||||
for (std::uint8_t i = 0; i < copyLen; ++i)
|
||||
r.v[p * BLen + i] = this->v[p * Len + i];
|
||||
return r;
|
||||
}
|
||||
|
||||
constexpr VectorF32<Len, Packing> operator+(VectorF32<Len, Packing> b) const { return VectorF32<Len, Packing>(this->v + b.v); }
|
||||
constexpr VectorF32<Len, Packing> operator-(VectorF32<Len, Packing> b) const { return VectorF32<Len, Packing>(this->v - b.v); }
|
||||
constexpr VectorF32<Len, Packing> operator*(VectorF32<Len, Packing> b) const { return VectorF32<Len, Packing>(this->v * b.v); }
|
||||
constexpr VectorF32<Len, Packing> operator/(VectorF32<Len, Packing> b) const { return VectorF32<Len, Packing>(this->v / b.v); }
|
||||
constexpr void operator+=(VectorF32<Len, Packing> b) { this->v = this->v + b.v; }
|
||||
constexpr void operator-=(VectorF32<Len, Packing> b) { this->v = this->v - b.v; }
|
||||
constexpr void operator*=(VectorF32<Len, Packing> b) { this->v = this->v * b.v; }
|
||||
constexpr void operator/=(VectorF32<Len, Packing> b) { this->v = this->v / b.v; }
|
||||
|
||||
constexpr VectorF32<Len, Packing> operator+(float b) const { return *this + VectorF32<Len, Packing>(b); }
|
||||
constexpr VectorF32<Len, Packing> operator-(float b) const { return *this - VectorF32<Len, Packing>(b); }
|
||||
constexpr VectorF32<Len, Packing> operator*(float b) const { return *this * VectorF32<Len, Packing>(b); }
|
||||
constexpr VectorF32<Len, Packing> operator/(float b) const { return *this / VectorF32<Len, Packing>(b); }
|
||||
constexpr void operator+=(float b) { *this += VectorF32<Len, Packing>(b); }
|
||||
constexpr void operator-=(float b) { *this -= VectorF32<Len, Packing>(b); }
|
||||
constexpr void operator*=(float b) { *this *= VectorF32<Len, Packing>(b); }
|
||||
constexpr void operator/=(float b) { *this /= VectorF32<Len, Packing>(b); }
|
||||
|
||||
constexpr VectorF32<Len, Packing> operator-() const { return VectorF32<Len, Packing>(-this->v); }
|
||||
|
||||
constexpr bool operator==(VectorF32<Len, Packing> b) const {
|
||||
for (std::uint8_t p = 0; p < Packing; ++p)
|
||||
for (std::uint8_t i = 0; i < Len; ++i)
|
||||
if (this->v[p * Len + i] != b.v[p * Len + i]) return false;
|
||||
return true;
|
||||
}
|
||||
constexpr bool operator!=(VectorF32<Len, Packing> b) const { return !(*this == b); }
|
||||
|
||||
template<std::uint32_t ExtractLen>
|
||||
constexpr VectorF32<ExtractLen, Packing> ExtractLo() const {
|
||||
VectorF32<ExtractLen, Packing> r;
|
||||
for (std::uint8_t p = 0; p < Packing; ++p)
|
||||
for (std::uint8_t i = 0; i < ExtractLen; ++i)
|
||||
r.v[p * ExtractLen + i] = this->v[p * Len + i];
|
||||
return r;
|
||||
}
|
||||
|
||||
constexpr VectorF32<Len, Packing> Cos() const {
|
||||
VectorF32<Len, Packing> r;
|
||||
for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = std::cos(this->v[i]);
|
||||
return r;
|
||||
}
|
||||
constexpr VectorF32<Len, Packing> Sin() const {
|
||||
VectorF32<Len, Packing> r;
|
||||
for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = std::sin(this->v[i]);
|
||||
return r;
|
||||
}
|
||||
constexpr std::tuple<VectorF32<Len, Packing>, VectorF32<Len, Packing>> SinCos() const {
|
||||
return { Sin(), Cos() };
|
||||
}
|
||||
|
||||
template <std::array<bool, Len> values>
|
||||
constexpr VectorF32<Len, Packing> Negate() const {
|
||||
VectorF32<Len, Packing> r;
|
||||
for (std::uint8_t p = 0; p < Packing; ++p)
|
||||
for (std::uint8_t i = 0; i < Len; ++i)
|
||||
r.v[p * Len + i] = values[i] ? -this->v[p * Len + i] : this->v[p * Len + i];
|
||||
return r;
|
||||
}
|
||||
|
||||
// a*b + c — the compiler fuses to vfmacc.vv under the default
|
||||
// -ffp-contract=on. No explicit intrinsic needed.
|
||||
static constexpr VectorF32<Len, Packing> MulitplyAdd(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b, VectorF32<Len, Packing> add) {
|
||||
return VectorF32<Len, Packing>(a.v * b.v + add.v);
|
||||
}
|
||||
static constexpr VectorF32<Len, Packing> MulitplySub(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b, VectorF32<Len, Packing> sub) {
|
||||
return VectorF32<Len, Packing>(a.v * b.v - sub.v);
|
||||
}
|
||||
|
||||
constexpr static VectorF32<Len, Packing> Cross(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b) requires(Len == 3) {
|
||||
VectorF32<Len, Packing> r;
|
||||
for (std::uint8_t p = 0; p < Packing; ++p) {
|
||||
const std::uint8_t base = p * 3;
|
||||
r.v[base + 0] = a.v[base + 1] * b.v[base + 2] - a.v[base + 2] * b.v[base + 1];
|
||||
r.v[base + 1] = a.v[base + 2] * b.v[base + 0] - a.v[base + 0] * b.v[base + 2];
|
||||
r.v[base + 2] = a.v[base + 0] * b.v[base + 1] - a.v[base + 1] * b.v[base + 0];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
template <const std::array<std::uint8_t, Len> ShuffleValues>
|
||||
constexpr VectorF32<Len, Packing> Shuffle() const {
|
||||
VectorF32<Len, Packing> r;
|
||||
for (std::uint8_t p = 0; p < Packing; ++p)
|
||||
for (std::uint8_t i = 0; i < Len; ++i)
|
||||
r.v[p * Len + i] = this->v[p * Len + ShuffleValues[i]];
|
||||
return r;
|
||||
}
|
||||
|
||||
template <std::array<bool, Len> ShuffleValues>
|
||||
constexpr static VectorF32<Len, Packing> Blend(VectorF32<Len, Packing> a, VectorF32<Len, Packing> b) {
|
||||
VectorF32<Len, Packing> r;
|
||||
for (std::uint8_t p = 0; p < Packing; ++p)
|
||||
for (std::uint8_t i = 0; i < Len; ++i)
|
||||
r.v[p * Len + i] = ShuffleValues[i] ? b.v[p * Len + i] : a.v[p * Len + i];
|
||||
return r;
|
||||
}
|
||||
|
||||
template<typename... Rest>
|
||||
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
|
||||
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
|
||||
constexpr static auto LengthSq(VectorF32<Len, Packing> first, Rest... rest) {
|
||||
constexpr std::uint8_t N = VectorBase<Len, Packing, float>::BatchSize;
|
||||
VectorF32<1, static_cast<std::uint8_t>(Packing * N)> r;
|
||||
std::array<VectorF32<Len, Packing>, N> args{ first, rest... };
|
||||
for (std::uint8_t i = 0; i < N; ++i)
|
||||
for (std::uint8_t p = 0; p < Packing; ++p) {
|
||||
float acc = 0.0f;
|
||||
for (std::uint8_t k = 0; k < Len; ++k) {
|
||||
float x = args[i].v[p * Len + k];
|
||||
acc += x * x;
|
||||
}
|
||||
r.v[i * Packing + p] = acc;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
template<typename... Rest>
|
||||
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
|
||||
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
|
||||
constexpr static auto Length(VectorF32<Len, Packing> first, Rest... rest) {
|
||||
auto sq = LengthSq(first, rest...);
|
||||
for (std::uint8_t i = 0; i < decltype(sq)::NElems; ++i) sq.v[i] = std::sqrt(sq.v[i]);
|
||||
return sq;
|
||||
}
|
||||
|
||||
// Pairwise dot products across BatchSize pairs. The 4th lane of Len==3
|
||||
// inputs may carry garbage from Cross(), so only the first Len lanes
|
||||
// are summed per pair.
|
||||
template<typename... Rest>
|
||||
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
|
||||
(1 + sizeof...(Rest) == 2 * VectorBase<Len, Packing, float>::BatchSize))
|
||||
constexpr static auto Dot(VectorF32<Len, Packing> first, Rest... rest) {
|
||||
constexpr std::uint8_t N = VectorBase<Len, Packing, float>::BatchSize;
|
||||
VectorF32<1, static_cast<std::uint8_t>(Packing * N)> r;
|
||||
std::array<VectorF32<Len, Packing>, 2 * N> args{ first, rest... };
|
||||
for (std::uint8_t i = 0; i < N; ++i)
|
||||
for (std::uint8_t p = 0; p < Packing; ++p) {
|
||||
float acc = 0.0f;
|
||||
for (std::uint8_t k = 0; k < Len; ++k)
|
||||
acc += args[2 * i].v[p * Len + k] * args[2 * i + 1].v[p * Len + k];
|
||||
r.v[i * Packing + p] = acc;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
template<typename... Rest>
|
||||
requires((std::is_same_v<Rest, VectorF32<Len, Packing>> && ...) &&
|
||||
(1 + sizeof...(Rest) == VectorBase<Len, Packing, float>::BatchSize))
|
||||
constexpr static auto Normalize(VectorF32<Len, Packing> first, Rest... rest) {
|
||||
auto normOne = [](VectorF32<Len, Packing> u) {
|
||||
VectorF32<Len, Packing> out;
|
||||
for (std::uint8_t p = 0; p < Packing; ++p) {
|
||||
float acc = 0.0f;
|
||||
for (std::uint8_t k = 0; k < Len; ++k) {
|
||||
float x = u.v[p * Len + k];
|
||||
acc += x * x;
|
||||
}
|
||||
float invLen = acc > 0.0f ? 1.0f / std::sqrt(acc) : 0.0f;
|
||||
for (std::uint8_t k = 0; k < Len; ++k)
|
||||
out.v[p * Len + k] = u.v[p * Len + k] * invLen;
|
||||
}
|
||||
return out;
|
||||
};
|
||||
return std::array<VectorF32<Len, Packing>, VectorBase<Len, Packing, float>::BatchSize>{ normOne(first), normOne(rest)... };
|
||||
}
|
||||
|
||||
constexpr static VectorF32<Len, Packing> Rotate(VectorF32<3, Packing> v, VectorF32<4, Packing> q) requires(Len == 3) {
|
||||
VectorF32<3, Packing> qv;
|
||||
VectorF32<3, Packing> qwBroadcast;
|
||||
for (std::uint8_t p = 0; p < Packing; ++p) {
|
||||
qv.v[p * 3 + 0] = q.v[p * 4 + 0];
|
||||
qv.v[p * 3 + 1] = q.v[p * 4 + 1];
|
||||
qv.v[p * 3 + 2] = q.v[p * 4 + 2];
|
||||
for (std::uint8_t i = 0; i < 3; ++i) qwBroadcast.v[p * 3 + i] = q.v[p * 4 + 3];
|
||||
}
|
||||
VectorF32<3, Packing> t = Cross(qv, v) * 2.0f;
|
||||
return v + t * qwBroadcast + Cross(qv, t);
|
||||
}
|
||||
|
||||
constexpr static VectorF32<3, Packing> RotatePivot(VectorF32<3, Packing> v, VectorF32<4, Packing> q, VectorF32<3, Packing> pivot) requires(Len == 3) {
|
||||
VectorF32<3, Packing> translated = v - pivot;
|
||||
return Rotate(translated, q) + pivot;
|
||||
}
|
||||
|
||||
constexpr static VectorF32<4, Packing> QuanternionFromEuler(VectorF32<3, Packing> eulerHalf) requires(Len == 4) {
|
||||
VectorF32<4, Packing> r;
|
||||
for (std::uint8_t p = 0; p < Packing; ++p) {
|
||||
float roll = eulerHalf.v[p * 3 + 0];
|
||||
float pitch = eulerHalf.v[p * 3 + 1];
|
||||
float yaw = eulerHalf.v[p * 3 + 2];
|
||||
float sr = std::sin(roll), cr = std::cos(roll);
|
||||
float sp = std::sin(pitch), cp = std::cos(pitch);
|
||||
float sy = std::sin(yaw), cy = std::cos(yaw);
|
||||
r.v[p * 4 + 0] = sr * cp * cy - cr * sp * sy;
|
||||
r.v[p * 4 + 1] = cr * sp * cy + sr * cp * sy;
|
||||
r.v[p * 4 + 2] = cr * cp * sy - sr * sp * cy;
|
||||
r.v[p * 4 + 3] = cr * cp * cy + sr * sp * sy;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
};
|
||||
#else
|
||||
// Scalar software fallback for non-x86_64 targets. Future arches can swap
|
||||
// in their own intrinsic implementation by adding an arch-specific branch
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue