From 35091b2c53dde95f77799f6a73a01c2f679544ba Mon Sep 17 00:00:00 2001 From: Jorijn van der Graaf Date: Mon, 18 May 2026 20:33:47 +0200 Subject: [PATCH] RVV --- README.md | 6 +- interfaces/Crafter.Math-Basic.cppm | 2 +- interfaces/Crafter.Math-Common.cppm | 52 ++++- interfaces/Crafter.Math-VectorF32.cppm | 251 +++++++++++++++++++++++++ project.cpp | 23 ++- 5 files changed, 322 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index b8dbf5a..8f2ac88 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ A C++23 math library for the Crafter engine, distributed as a set of C++ modules. Provides generic vector and matrix types alongside SIMD-specialized fixed-size vectors for `float` and `_Float16`, plus a small set of ray intersection routines. -The library is hardware-aware: it picks the widest SIMD path the target supports (SSE / AVX / AVX-512 on x86_64, SIMD128 on WebAssembly) and falls back to scalar code elsewhere. +The library is hardware-aware: it picks the widest SIMD path the target supports (SSE / AVX / AVX-512 on x86_64, RVV on RISC-V, SIMD128 on WebAssembly) and falls back to scalar code elsewhere. ## Modules @@ -49,12 +49,16 @@ This project uses [Crafter.Build](https://forgejo.catcrafts.net/Catcrafts/Crafte - `Vector-sapphirerapids` — AVX-512 with FP16 (`-march=sapphirerapids`) - `Vector-x86-64-v4` — AVX-512 baseline - `Vector-x86-64-v3` — AVX2 baseline +- `Vector-rv64gcv_zvl512b` — RVV with VLEN≥512 (uses the 64-byte tier) +- `Vector-rv64gcv_zvl256b` — RVV with VLEN≥256 (32-byte tier) +- `Vector-rv64gcv` — RVA23 baseline ZVL128B (16-byte tier) Build and run tests via your usual Crafter.Build entry point; `build/` and `bin/` are git-ignored. ## Target support - **x86_64**: SSE / AVX / AVX-512F selected per target; AVX512-FP16 is required for native `VectorF16` (otherwise it aliases `VectorF32`). F16C is used for fp16 ↔ fp32 conversion when available. +- **RISC-V**: RVV 1.0 (the `V` extension, ratified Nov 2021, mandatory in the RVA23 profile). Storage is a 16/32/64-byte GNU vector picked at compile time from the guaranteed VLEN (`__riscv_v_fixed_vlen` from Clang's `-mrvv-vector-bits=`, else `__riscv_v_min_vlen` from the march's `Zvl*` suffix, else the ZVL128B baseline). `VectorF16` aliases `VectorF32` until a `Zvfh` path lands. - **WebAssembly**: `wasm_simd128.h` 128-bit path; SIMD width is capped at 16 bytes. - **Other targets**: scalar fallback via `std::array`. diff --git a/interfaces/Crafter.Math-Basic.cppm b/interfaces/Crafter.Math-Basic.cppm index 0c8a877..d6a6ff1 100755 --- a/interfaces/Crafter.Math-Basic.cppm +++ b/interfaces/Crafter.Math-Basic.cppm @@ -28,7 +28,7 @@ namespace Crafter { return degrees * (std::numbers::pi / 180); } - #if (defined(__x86_64) && !defined(__AVX512FP16__)) || !defined(__FLT16_MAX__) + #if (defined(__x86_64) && !defined(__AVX512FP16__)) || defined(__riscv_vector) || !defined(__FLT16_MAX__) export template using VectorF16 = VectorF32; #endif diff --git a/interfaces/Crafter.Math-Common.cppm b/interfaces/Crafter.Math-Common.cppm index a730d53..e04f3cd 100644 --- a/interfaces/Crafter.Math-Common.cppm +++ b/interfaces/Crafter.Math-Common.cppm @@ -5,15 +5,38 @@ module; #ifdef __wasm_simd128__ #include #endif +#ifdef __riscv_vector +#include +// Compile-time VLEN selection. RVV is VLA at the ISA level, but storage in +// this library is fixed-size, so we pin to the widest VLEN the toolchain has +// guaranteed at compile time: +// __riscv_v_fixed_vlen — Clang's -mrvv-vector-bits=N mode. +// __riscv_v_min_vlen — minimum guaranteed VLEN from the march (e.g. +// rv64gcv_zvl256b → 256). Set by both GCC and Clang. +// Falls back to the RVA23 baseline of ZVL128B otherwise. +#if defined(__riscv_v_fixed_vlen) +#define CRAFTER_RVV_VLEN __riscv_v_fixed_vlen +#elif defined(__riscv_v_min_vlen) +#define CRAFTER_RVV_VLEN __riscv_v_min_vlen +#else +#define CRAFTER_RVV_VLEN 128 +#endif +// 16/32/64-byte storage types, mirroring x86's __m128/__m256/__m512 tier. +// The compiler emits RVV vle/vse/vfadd/... on these GNU vectors when the +// target's V extension is enabled. +typedef float __crafter_rvv_v128_f32 __attribute__((vector_size(16), aligned(16))); +typedef float __crafter_rvv_v256_f32 __attribute__((vector_size(32), aligned(32))); +typedef float __crafter_rvv_v512_f32 __attribute__((vector_size(64), aligned(64))); +#endif export module Crafter.Math:Common; import std; // VectorF16 exists as a real struct when _Float16 is available AND we are not // on x86_64 without AVX512FP16 (that path aliases VectorF16 to VectorF32 in -// Crafter.Math:Basic for performance). Each translation unit that needs this -// distinction redefines the same condition since macros do not cross module -// boundaries. -#if defined(__FLT16_MAX__) && (!defined(__x86_64) || defined(__AVX512FP16__)) +// Crafter.Math:Basic for performance). The same alias kicks in on RISC-V until +// a Zvfh path lands. Each translation unit that needs this distinction +// redefines the same condition since macros do not cross module boundaries. +#if defined(__FLT16_MAX__) && (!defined(__x86_64) || defined(__AVX512FP16__)) && !defined(__riscv_vector) namespace Crafter { export template struct VectorF16; @@ -26,7 +49,7 @@ namespace Crafter { template struct VectorBase { - #if defined(__FLT16_MAX__) && (!defined(__x86_64) || defined(__AVX512FP16__)) + #if defined(__FLT16_MAX__) && (!defined(__x86_64) || defined(__AVX512FP16__)) && !defined(__riscv_vector) template friend struct VectorF16; #endif @@ -63,6 +86,18 @@ namespace Crafter { >; #elif defined(__wasm_simd128__) using VectorType = v128_t; + #elif defined(__riscv_vector) + // RVV tier mirrors the x86 selector: pick the widest register the + // toolchain guarantees, then size each instantiation down to the + // smallest tier that fits Len*Packing. _Float16 never materialises + // here because VectorF16 aliases VectorF32 on RISC-V until a Zvfh + // path lands. + using VectorType = std::conditional_t< + std::is_same_v, + std::conditional_t<(Len * Packing > 8), __crafter_rvv_v512_f32, + std::conditional_t<(Len * Packing > 4), __crafter_rvv_v256_f32, __crafter_rvv_v128_f32>>, + std::array + >; #else using VectorType = std::array; #endif @@ -80,6 +115,13 @@ namespace Crafter { // WASM SIMD only has 128-bit vectors; cap at 16 bytes so the entire // VectorType always fits in a single v128_t. static constexpr std::uint8_t Max = 16; + #elif defined(__riscv_vector) + // RVV tier selected at compile time from the guaranteed VLEN. ZVL128B + // is the RVA23 baseline; ZVL256B / ZVL512B unlock wider registers + // when present. LMUL>1 groupings are a separate axis and could land + // later as a batched-op path on top of this. + static constexpr std::uint8_t Max = (CRAFTER_RVV_VLEN >= 512) ? 64 : + (CRAFTER_RVV_VLEN >= 256) ? 32 : 16; #else static constexpr std::uint8_t Max = 32; #endif diff --git a/interfaces/Crafter.Math-VectorF32.cppm b/interfaces/Crafter.Math-VectorF32.cppm index 1b40c59..91d473d 100755 --- a/interfaces/Crafter.Math-VectorF32.cppm +++ b/interfaces/Crafter.Math-VectorF32.cppm @@ -23,6 +23,9 @@ module; #ifdef __wasm_simd128__ #include #endif +#ifdef __riscv_vector +#include +#endif export module Crafter.Math:VectorF32; import std; import :Common; @@ -1695,6 +1698,254 @@ namespace Crafter { return VectorF32<4, Packing>(wasm_v128_load(outBuf)); } }; +#elif defined(__riscv_vector) + // RISC-V V extension implementation. Storage is a GNU vector of 16/32/64 + // bytes (picked in Common.cppm from the guaranteed VLEN); native operators + // map to vfadd/vfsub/vfmul/vfdiv. Per-element loops compile to vrgather/ + // vmerge/vfsqrt when the autovectoriser can see the pattern, and to + // scalar fallback otherwise. Hand-tuned intrinsic paths + // (e.g. vsetvl + vfwmacc for batched dot) can land incrementally. + export template + struct VectorF32 : public VectorBase { + template + friend struct VectorF32; + using Base = VectorBase; + static constexpr std::uint8_t NElems = Base::AlignmentElement; + + constexpr VectorF32() = default; + constexpr VectorF32(typename Base::VectorType vv) { this->v = vv; } + constexpr VectorF32(const float* vB) { Load(vB); } + constexpr VectorF32(float val) { + for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = val; + } + + constexpr void Load(const float* vB) { + for (std::uint8_t i = 0; i < NElems; ++i) this->v[i] = vB[i]; + } + constexpr void Store(float* vB) const { + for (std::uint8_t i = 0; i < NElems; ++i) vB[i] = this->v[i]; + } + + template + constexpr std::array Store() const { + std::array r{}; + Store(r.data()); + return r; + } + + template + constexpr operator VectorF32() const { + VectorF32 r; + const std::uint8_t copyLen = (BLen < Len) ? BLen : Len; + const std::uint8_t copyPack = (BPacking < Packing) ? BPacking : Packing; + for (std::uint8_t p = 0; p < copyPack; ++p) + for (std::uint8_t i = 0; i < copyLen; ++i) + r.v[p * BLen + i] = this->v[p * Len + i]; + return r; + } + + constexpr VectorF32 operator+(VectorF32 b) const { return VectorF32(this->v + b.v); } + constexpr VectorF32 operator-(VectorF32 b) const { return VectorF32(this->v - b.v); } + constexpr VectorF32 operator*(VectorF32 b) const { return VectorF32(this->v * b.v); } + constexpr VectorF32 operator/(VectorF32 b) const { return VectorF32(this->v / b.v); } + constexpr void operator+=(VectorF32 b) { this->v = this->v + b.v; } + constexpr void operator-=(VectorF32 b) { this->v = this->v - b.v; } + constexpr void operator*=(VectorF32 b) { this->v = this->v * b.v; } + constexpr void operator/=(VectorF32 b) { this->v = this->v / b.v; } + + constexpr VectorF32 operator+(float b) const { return *this + VectorF32(b); } + constexpr VectorF32 operator-(float b) const { return *this - VectorF32(b); } + constexpr VectorF32 operator*(float b) const { return *this * VectorF32(b); } + constexpr VectorF32 operator/(float b) const { return *this / VectorF32(b); } + constexpr void operator+=(float b) { *this += VectorF32(b); } + constexpr void operator-=(float b) { *this -= VectorF32(b); } + constexpr void operator*=(float b) { *this *= VectorF32(b); } + constexpr void operator/=(float b) { *this /= VectorF32(b); } + + constexpr VectorF32 operator-() const { return VectorF32(-this->v); } + + constexpr bool operator==(VectorF32 b) const { + for (std::uint8_t p = 0; p < Packing; ++p) + for (std::uint8_t i = 0; i < Len; ++i) + if (this->v[p * Len + i] != b.v[p * Len + i]) return false; + return true; + } + constexpr bool operator!=(VectorF32 b) const { return !(*this == b); } + + template + constexpr VectorF32 ExtractLo() const { + VectorF32 r; + for (std::uint8_t p = 0; p < Packing; ++p) + for (std::uint8_t i = 0; i < ExtractLen; ++i) + r.v[p * ExtractLen + i] = this->v[p * Len + i]; + return r; + } + + constexpr VectorF32 Cos() const { + VectorF32 r; + for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = std::cos(this->v[i]); + return r; + } + constexpr VectorF32 Sin() const { + VectorF32 r; + for (std::uint8_t i = 0; i < NElems; ++i) r.v[i] = std::sin(this->v[i]); + return r; + } + constexpr std::tuple, VectorF32> SinCos() const { + return { Sin(), Cos() }; + } + + template values> + constexpr VectorF32 Negate() const { + VectorF32 r; + for (std::uint8_t p = 0; p < Packing; ++p) + for (std::uint8_t i = 0; i < Len; ++i) + r.v[p * Len + i] = values[i] ? -this->v[p * Len + i] : this->v[p * Len + i]; + return r; + } + + // a*b + c — the compiler fuses to vfmacc.vv under the default + // -ffp-contract=on. No explicit intrinsic needed. + static constexpr VectorF32 MulitplyAdd(VectorF32 a, VectorF32 b, VectorF32 add) { + return VectorF32(a.v * b.v + add.v); + } + static constexpr VectorF32 MulitplySub(VectorF32 a, VectorF32 b, VectorF32 sub) { + return VectorF32(a.v * b.v - sub.v); + } + + constexpr static VectorF32 Cross(VectorF32 a, VectorF32 b) requires(Len == 3) { + VectorF32 r; + for (std::uint8_t p = 0; p < Packing; ++p) { + const std::uint8_t base = p * 3; + r.v[base + 0] = a.v[base + 1] * b.v[base + 2] - a.v[base + 2] * b.v[base + 1]; + r.v[base + 1] = a.v[base + 2] * b.v[base + 0] - a.v[base + 0] * b.v[base + 2]; + r.v[base + 2] = a.v[base + 0] * b.v[base + 1] - a.v[base + 1] * b.v[base + 0]; + } + return r; + } + + template ShuffleValues> + constexpr VectorF32 Shuffle() const { + VectorF32 r; + for (std::uint8_t p = 0; p < Packing; ++p) + for (std::uint8_t i = 0; i < Len; ++i) + r.v[p * Len + i] = this->v[p * Len + ShuffleValues[i]]; + return r; + } + + template ShuffleValues> + constexpr static VectorF32 Blend(VectorF32 a, VectorF32 b) { + VectorF32 r; + for (std::uint8_t p = 0; p < Packing; ++p) + for (std::uint8_t i = 0; i < Len; ++i) + r.v[p * Len + i] = ShuffleValues[i] ? b.v[p * Len + i] : a.v[p * Len + i]; + return r; + } + + template + requires((std::is_same_v> && ...) && + (1 + sizeof...(Rest) == VectorBase::BatchSize)) + constexpr static auto LengthSq(VectorF32 first, Rest... rest) { + constexpr std::uint8_t N = VectorBase::BatchSize; + VectorF32<1, static_cast(Packing * N)> r; + std::array, N> args{ first, rest... }; + for (std::uint8_t i = 0; i < N; ++i) + for (std::uint8_t p = 0; p < Packing; ++p) { + float acc = 0.0f; + for (std::uint8_t k = 0; k < Len; ++k) { + float x = args[i].v[p * Len + k]; + acc += x * x; + } + r.v[i * Packing + p] = acc; + } + return r; + } + + template + requires((std::is_same_v> && ...) && + (1 + sizeof...(Rest) == VectorBase::BatchSize)) + constexpr static auto Length(VectorF32 first, Rest... rest) { + auto sq = LengthSq(first, rest...); + for (std::uint8_t i = 0; i < decltype(sq)::NElems; ++i) sq.v[i] = std::sqrt(sq.v[i]); + return sq; + } + + // Pairwise dot products across BatchSize pairs. The 4th lane of Len==3 + // inputs may carry garbage from Cross(), so only the first Len lanes + // are summed per pair. + template + requires((std::is_same_v> && ...) && + (1 + sizeof...(Rest) == 2 * VectorBase::BatchSize)) + constexpr static auto Dot(VectorF32 first, Rest... rest) { + constexpr std::uint8_t N = VectorBase::BatchSize; + VectorF32<1, static_cast(Packing * N)> r; + std::array, 2 * N> args{ first, rest... }; + for (std::uint8_t i = 0; i < N; ++i) + for (std::uint8_t p = 0; p < Packing; ++p) { + float acc = 0.0f; + for (std::uint8_t k = 0; k < Len; ++k) + acc += args[2 * i].v[p * Len + k] * args[2 * i + 1].v[p * Len + k]; + r.v[i * Packing + p] = acc; + } + return r; + } + + template + requires((std::is_same_v> && ...) && + (1 + sizeof...(Rest) == VectorBase::BatchSize)) + constexpr static auto Normalize(VectorF32 first, Rest... rest) { + auto normOne = [](VectorF32 u) { + VectorF32 out; + for (std::uint8_t p = 0; p < Packing; ++p) { + float acc = 0.0f; + for (std::uint8_t k = 0; k < Len; ++k) { + float x = u.v[p * Len + k]; + acc += x * x; + } + float invLen = acc > 0.0f ? 1.0f / std::sqrt(acc) : 0.0f; + for (std::uint8_t k = 0; k < Len; ++k) + out.v[p * Len + k] = u.v[p * Len + k] * invLen; + } + return out; + }; + return std::array, VectorBase::BatchSize>{ normOne(first), normOne(rest)... }; + } + + constexpr static VectorF32 Rotate(VectorF32<3, Packing> v, VectorF32<4, Packing> q) requires(Len == 3) { + VectorF32<3, Packing> qv; + VectorF32<3, Packing> qwBroadcast; + for (std::uint8_t p = 0; p < Packing; ++p) { + qv.v[p * 3 + 0] = q.v[p * 4 + 0]; + qv.v[p * 3 + 1] = q.v[p * 4 + 1]; + qv.v[p * 3 + 2] = q.v[p * 4 + 2]; + for (std::uint8_t i = 0; i < 3; ++i) qwBroadcast.v[p * 3 + i] = q.v[p * 4 + 3]; + } + VectorF32<3, Packing> t = Cross(qv, v) * 2.0f; + return v + t * qwBroadcast + Cross(qv, t); + } + + constexpr static VectorF32<3, Packing> RotatePivot(VectorF32<3, Packing> v, VectorF32<4, Packing> q, VectorF32<3, Packing> pivot) requires(Len == 3) { + VectorF32<3, Packing> translated = v - pivot; + return Rotate(translated, q) + pivot; + } + + constexpr static VectorF32<4, Packing> QuanternionFromEuler(VectorF32<3, Packing> eulerHalf) requires(Len == 4) { + VectorF32<4, Packing> r; + for (std::uint8_t p = 0; p < Packing; ++p) { + float roll = eulerHalf.v[p * 3 + 0]; + float pitch = eulerHalf.v[p * 3 + 1]; + float yaw = eulerHalf.v[p * 3 + 2]; + float sr = std::sin(roll), cr = std::cos(roll); + float sp = std::sin(pitch), cp = std::cos(pitch); + float sy = std::sin(yaw), cy = std::cos(yaw); + r.v[p * 4 + 0] = sr * cp * cy - cr * sp * sy; + r.v[p * 4 + 1] = cr * sp * cy + sr * cp * sy; + r.v[p * 4 + 2] = cr * cp * sy - sr * sp * cy; + r.v[p * 4 + 3] = cr * cp * cy + sr * sp * sy; + } + return r; + } + }; #else // Scalar software fallback for non-x86_64 targets. Future arches can swap // in their own intrinsic implementation by adding an arch-specific branch diff --git a/project.cpp b/project.cpp index 512f672..2456012 100644 --- a/project.cpp +++ b/project.cpp @@ -28,12 +28,12 @@ extern "C" Configuration CrafterBuildProject(std::span a cfg.GetInterfacesAndImplementations(ifaces, impls); } - auto addTest = [&](std::string_view testName, std::string march, std::string mtune) { + auto addTest = [&](std::string_view testName, std::string march, std::string mtune, std::string target = {}) { Test t; t.config.path = "./"; t.config.name = std::format("{}-{}", testName, march); t.config.outputName = t.config.name; - t.config.target = cfg.target; + t.config.target = target.empty() ? cfg.target : target; t.config.type = ConfigurationType::Executable; t.config.march = march; t.config.mtune = mtune; @@ -44,10 +44,23 @@ extern "C" Configuration CrafterBuildProject(std::span a t.config.GetInterfacesAndImplementations(ifaces, impls); cfg.tests.push_back(std::move(t)); }; + const std::string_view target = cfg.target; + const bool isX86 = target.starts_with("x86_64") || target.starts_with("i686"); + const bool isRiscv = target.starts_with("riscv"); for (std::string_view name : { "Vector", "Intersection", "Matrix" }) { - addTest(name, "sapphirerapids", "native"); - addTest(name, "x86-64-v4", "generic"); - addTest(name, "x86-64-v3", "generic"); + if (isX86) { + addTest(name, "sapphirerapids", "native"); + addTest(name, "x86-64-v4", "generic"); + addTest(name, "x86-64-v3", "generic"); + } + if (isRiscv) { + // RISC-V tiers, mirroring the x86 selector: register width is + // picked from the guaranteed VLEN encoded in the march's Zvl* + // suffix. rv64gcv → ZVL128B (matches the RVA23 baseline). + addTest(name, "rv64gcv_zvl512b", "generic"); + addTest(name, "rv64gcv_zvl256b", "generic"); + addTest(name, "rv64gcv", "generic"); + } } return cfg;