RVV

2026-05-18 20:33:47 +02:00 · 2026-05-18 20:33:47 +02:00 · 35091b2c53
commit 35091b2c53
parent f0becd1582
5 changed files with 322 additions and 12 deletions
--- a/interfaces/Crafter.Math-Common.cppm
+++ b/interfaces/Crafter.Math-Common.cppm
@@ -5,15 +5,38 @@ module;
 #ifdef __wasm_simd128__
 #include <wasm_simd128.h>
 #endif
+#ifdef __riscv_vector
+#include <riscv_vector.h>
+// Compile-time VLEN selection. RVV is vector-length agnostic at the ISA level,
+// but storage in this library is fixed-size, so CRAFTER_RVV_VLEN (in bits) is
+// pinned to the first of these macros that the toolchain defines:
+//   __riscv_v_fixed_vlen  — Clang's -mrvv-vector-bits=N mode.
+//   __riscv_v_min_vlen    — minimum guaranteed VLEN from the march (e.g.
+//                           rv64gcv_zvl256b → 256). Set by both GCC and Clang.
+// If neither is defined, fall back to 128 (the RVA23 baseline, ZVL128B).
+#if defined(__riscv_v_fixed_vlen)
+#define CRAFTER_RVV_VLEN __riscv_v_fixed_vlen
+#elif defined(__riscv_v_min_vlen)
+#define CRAFTER_RVV_VLEN __riscv_v_min_vlen
+#else
+#define CRAFTER_RVV_VLEN 128
+#endif
+// 16/32/64-byte GNU vector types of float (4/8/16 lanes), mirroring x86's
+// __m128/__m256/__m512 tier; lowered to RVV vle/vse/vfadd/... when V is on.
+// NOTE(review): `__`-prefixed names are reserved identifiers in C++ — rename?
+typedef float __crafter_rvv_v128_f32 __attribute__((vector_size(16), aligned(16)));
+typedef float __crafter_rvv_v256_f32 __attribute__((vector_size(32), aligned(32)));
+typedef float __crafter_rvv_v512_f32 __attribute__((vector_size(64), aligned(64)));
+#endif
 export module Crafter.Math:Common;
 import std;

 // VectorF16 exists as a real struct when _Float16 is available AND we are not
 // on x86_64 without AVX512FP16 (that path aliases VectorF16 to VectorF32 in
-// Crafter.Math:Basic for performance). Each translation unit that needs this
-// distinction redefines the same condition since macros do not cross module
-// boundaries.
-#if defined(__FLT16_MAX__) && (!defined(__x86_64) || defined(__AVX512FP16__))
+// Crafter.Math:Basic for performance). The same alias kicks in on RISC-V until
+// a Zvfh path lands. Each translation unit that needs this distinction
+// redefines the same condition since macros do not cross module boundaries.
+#if defined(__FLT16_MAX__) && (!defined(__x86_64) || defined(__AVX512FP16__)) && !defined(__riscv_vector)
 namespace Crafter {
    export template <std::uint8_t Len, std::uint8_t Packing>
    struct VectorF16;
@@ -26,7 +49,7 @@ namespace Crafter {

    template <std::uint8_t Len, std::uint8_t Packing, typename T>
    struct VectorBase {
-        #if defined(__FLT16_MAX__) && (!defined(__x86_64) || defined(__AVX512FP16__))
+        #if defined(__FLT16_MAX__) && (!defined(__x86_64) || defined(__AVX512FP16__)) && !defined(__riscv_vector)
        template <std::uint8_t L, std::uint8_t P>
        friend struct VectorF16;
        #endif
@@ -63,6 +86,18 @@ namespace Crafter {
        >;
        #elif defined(__wasm_simd128__)
        using VectorType = v128_t;
+        #elif defined(__riscv_vector)
+        // RVV tier mirrors the x86 selector. For float, storage is sized from
+        // the lane count Len*Packing alone: >8 lanes -> 64 bytes, >4 lanes ->
+        // 32 bytes, else 16 bytes; any other T falls back to std::array.
+        // _Float16 never materialises here because VectorF16 aliases
+        // VectorF32 on RISC-V until a Zvfh path lands.
+        using VectorType = std::conditional_t<
+            std::is_same_v<T, float>,
+            std::conditional_t<(Len * Packing > 8), __crafter_rvv_v512_f32, // NOTE(review): not gated on CRAFTER_RVV_VLEN — confirm Max bounds this
+                std::conditional_t<(Len * Packing > 4), __crafter_rvv_v256_f32, __crafter_rvv_v128_f32>>,
+            std::array<T, GetAlingment()/sizeof(T)>
+        >;
        #else
        using VectorType = std::array<T, GetAlingment()/sizeof(T)>;
        #endif
@@ -80,6 +115,13 @@ namespace Crafter {
        // WASM SIMD only has 128-bit vectors; cap at 16 bytes so the entire
        // VectorType always fits in a single v128_t.
        static constexpr std::uint8_t Max = 16;
+        #elif defined(__riscv_vector)
+        // Byte cap chosen at compile time from CRAFTER_RVV_VLEN (in bits):
+        // >=512 -> 64, >=256 -> 32, else 16. ZVL128B is the RVA23 baseline;
+        // ZVL256B / ZVL512B unlock wider registers when present. LMUL>1 groupings
+        // are a separate axis and could land later as a batched-op path on top.
+        static constexpr std::uint8_t Max = (CRAFTER_RVV_VLEN >= 512) ? 64 :
+                                            (CRAFTER_RVV_VLEN >= 256) ? 32 : 16;
        #else
        static constexpr std::uint8_t Max = 32;
        #endif