stash

2025-05-05 02:01:44 +02:00 · 2025-05-05 02:01:44 +02:00 · d0a8b12c1a
commit d0a8b12c1a
22 changed files with 1329 additions and 0 deletions
--- a/Crafter.Math-BasicTypes.cppm
+++ b/Crafter.Math-BasicTypes.cppm
@ -0,0 +1,43 @@
+module;
+
+#include <cstdint>
+#include <stdfloat>
+#include <format>
+
+export module Crafter.Math:BasicTypes;
+
+// Plain float aggregates with no SIMD members; every type declared in this namespace is exported.
+export namespace Crafter {
+	// Two floats, x then y.
+	struct Float2 {
+		float x;
+		float y;
+	};
+
+	// Three floats, x, y, z.
+	struct Float3 {
+		float x;
+		float y;
+		float z;
+	};
+
+	// Four floats, x, y, z, w.
+	struct Float4 {
+		float x;
+		float y;
+		float z;
+		float w;
+	};
+
+	// Sixteen floats held as four arrays of four.
+	// The std::formatter specialization in this file prints c1[i], c2[i], c3[i], c4[i]
+	// on output line i, i.e. it presents each array as one column of the matrix.
+	struct Float4x4 {
+		float c1[4];
+		float c2[4];
+		float c3[4];
+		float c4[4];
+	};
+}
+
+// std::format support for Crafter::Float4x4.
+// Produces four lines wrapped in one pair of braces; line i lists c1[i], c2[i], c3[i], c4[i].
+template <>
+struct std::formatter<Crafter::Float4x4> : std::formatter<std::string> {
+    auto format(const Crafter::Float4x4& obj, format_context& ctx) const {
+        // Render the whole matrix to text first, then let the inherited string formatter
+        // write it so that any string format-spec handling is reused.
+        const std::string text = std::format(
+            "{{{}, {}, {}, {}\n{}, {}, {}, {}\n{}, {}, {}, {}\n{}, {}, {}, {}}}",
+            obj.c1[0], obj.c2[0], obj.c3[0], obj.c4[0],
+            obj.c1[1], obj.c2[1], obj.c3[1], obj.c4[1],
+            obj.c1[2], obj.c2[2], obj.c3[2], obj.c4[2],
+            obj.c1[3], obj.c2[3], obj.c3[3], obj.c4[3]);
+        return std::formatter<std::string>::format(text, ctx);
+    }
+};
--- a/Crafter.Math-Matrix.cppm
+++ b/Crafter.Math-Matrix.cppm
@ -0,0 +1,278 @@
+module;
+
+#include <type_traits>
+#include <concepts>
+#include <immintrin.h>
+#include <string>
+#include <sstream>
+#include <iostream>
+
+export module Crafter.Math:Matrix;
+
+import :BasicTypes;
+import :Vector;
+import :Misc;
+
+namespace Crafter {
+	export template <typename T, uint32_t collumSize, uint32_t rowSize, uint32_t repeats>
+	class Matrix {
+	public:
+		// collum_type: the SIMD register type used to hold one column of the matrix.
+		// The nested std::conditional chain picks the first matching entry:
+		//   - int64_t/int32_t/int16_t/int8_t, by sizeof(T)*collumSize*repeats in bytes:
+		//       > 32 -> __m512i, > 16 -> __m256i, otherwise __m128i
+		//   - __fp16, by collumSize*repeats elements: > 16 -> __m512h, > 8 -> __m256h, otherwise __m128h
+		//   - float,  by collumSize*repeats elements: > 8 -> __m512, > 4 -> __m256, otherwise __m128
+		//   - double, by collumSize*repeats elements: > 4 -> __m512d, > 2 -> __m256d
+		//   - anything else falls through to __m128d. That includes small double columns, but also
+		//     every T not listed above (the unsigned integer types are not listed here).
+		typedef
+			typename std::conditional<(sizeof(T)* collumSize*repeats > 32 && (std::same_as<T, int64_t> || std::same_as<T, int32_t> || std::same_as<T, int16_t> || std::same_as<T, int8_t>)), __m512i,
+			typename std::conditional<(sizeof(T)* collumSize*repeats > 16 && (std::same_as<T, int64_t> || std::same_as<T, int32_t> || std::same_as<T, int16_t> || std::same_as<T, int8_t>)), __m256i,
+			typename std::conditional<(sizeof(T)* collumSize*repeats <= 16 && (std::same_as<T, int64_t> || std::same_as<T, int32_t> || std::same_as<T, int16_t> || std::same_as<T, int8_t>)), __m128i,
+			typename std::conditional<(collumSize*repeats > 16 && std::same_as<T, __fp16>), __m512h,
+			typename std::conditional<(collumSize*repeats > 8 && std::same_as<T, __fp16>), __m256h,
+			typename std::conditional<(collumSize*repeats <= 8 && std::same_as<T, __fp16>), __m128h,
+			typename std::conditional<(collumSize*repeats > 8 && std::same_as<T, float>), __m512,
+			typename std::conditional<(collumSize*repeats > 4 && std::same_as<T, float>), __m256,
+			typename std::conditional<(collumSize*repeats <= 4 && std::same_as<T, float>), __m128,
+			typename std::conditional<(collumSize*repeats > 4 && std::same_as<T, double>), __m512d,
+			typename std::conditional<(collumSize*repeats > 2 && std::same_as<T, double>), __m256d, __m128d
+			>::type>::type>::type>::type>::type>::type>::type>::type>::type>::type>::type collum_type;
+
+		// Matrix storage: rowSize registers, each of type collum_type.
+		collum_type c[rowSize];
+		
+		// Default constructor: performs no initialization, so c[] holds indeterminate values.
+		Matrix() {}
+
+		// Builds a 4x4 float matrix from four ready-made registers: c0 is stored in c[0], c1 in c[1], and so on.
+		Matrix(__m128 c0, __m128 c1, __m128 c2, __m128 c3) requires(collumSize == 4 && rowSize == 4 && repeats == 1 && std::same_as<T, float>)  {
+			const __m128 columns[4] = { c0, c1, c2, c3 };
+			for (uint32_t i = 0; i < 4; ++i) {
+				c[i] = columns[i];
+			}
+		}
+
+		// Builds a 4x4 float matrix from sixteen scalars given as four groups (x, y, z, w).
+		// The values are regrouped on the way in: c[0] receives the x of every group in lanes 0..3,
+		// c[1] every y, c[2] every z and c[3] every w.
+		Matrix(
+			float x0, float y0, float z0, float w0,
+			float x1, float y1, float z1, float w1,
+			float x2, float y2, float z2, float w2,
+			float x3, float y3, float z3, float w3
+		) requires(collumSize == 4 && rowSize == 4 && repeats == 1 && std::same_as<T, float>) {
+			// _mm_setr_ps takes its arguments in lane order (first argument -> lane 0).
+			c[0] = _mm_setr_ps(x0, x1, x2, x3);
+			c[1] = _mm_setr_ps(y0, y1, y2, y3);
+			c[2] = _mm_setr_ps(z0, z1, z2, z3);
+			c[3] = _mm_setr_ps(w0, w1, w2, w3);
+		}
+
+		// Matrix * column-vector product for the 4x4 float case.
+		// The matrix is held as four column registers, so the product is the sum of every column
+		// scaled by the matching lane of b:
+		//   result = c[0]*b[0] + c[1]*b[1] + c[2]*b[2] + c[3]*b[3]
+		// Each lane of b therefore has to be broadcast across a whole register before it is
+		// multiplied with its column; multiplying a column with b lane-by-lane would only give the
+		// right answer for diagonal matrices.
+		Vector<T, rowSize> operator*(Vector<T, 4> b) const requires(collumSize == 4 && rowSize == 4 && repeats == 1 && std::same_as<T, float>) {
+			const __m128 vec = reinterpret_cast<__m128>(b.v);
+
+			// Splat lane i of b into all four lanes.
+			const __m128 b0 = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(0, 0, 0, 0));
+			const __m128 b1 = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1, 1, 1, 1));
+			const __m128 b2 = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(2, 2, 2, 2));
+			const __m128 b3 = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 3, 3, 3));
+
+			// Accumulate column * scalar with fused multiply-adds.
+			__m128 result = _mm_mul_ps(reinterpret_cast<__m128>(c[0]), b0);
+			result = _mm_fmadd_ps(reinterpret_cast<__m128>(c[1]), b1, result);
+			result = _mm_fmadd_ps(reinterpret_cast<__m128>(c[2]), b2, result);
+			result = _mm_fmadd_ps(reinterpret_cast<__m128>(c[3]), b3, result);
+			return Vector<T, 4>(result);
+		}
+		
+
+		// static Matrix<T, collums, rowSize, vectorSize> Scaling(float x, float y, float z) requires(collums == 4 && (rowSize == 3 || rowSize == 4) && vectorSize == 1 && std::same_as<T, float>) {
+		// 	return Matrix<T, collums, rowSize, vectorSize>(
+		// 		_mm_set_ps(0, 0, 0, x),
+		// 		_mm_set_ps(0, 0, y, 0),
+		// 		_mm_set_ps(0, z, 0, 0),
+		// 		_mm_set_ps(1, 0, 0, 0)
+		// 	);
+		// }
+
+		// static Matrix<T, collums, rowSize, vectorSize> Translation(float x, float y, float z) requires(collums == 4 && (rowSize == 3 || rowSize == 4) && vectorSize == 1 && std::same_as<T, float>) {
+		// 	return Matrix<T, collums, rowSize, vectorSize>(
+		// 		_mm_set_ps(0, 0, 0, 1),
+		// 		_mm_set_ps(0, 0, 1, 0),
+		// 		_mm_set_ps(0, 1, 0, 0),
+		// 		_mm_set_ps(1, z, y, x)
+		// 	);
+		// }
+
+		// // static Matrix<T, collums, rowSize, vectorSize> Rotation(float x, float y, float z) requires(collums == 4 && (rowSize == 3 || rowSize == 4) && vectorSize == 1 && std::same_as<T, float>) {
+		// // 	return Matrix<T, collums, rowSize, vectorSize>(
+		// // 		_mm_set_ps(0, 0, 0, 1),
+		// // 		_mm_set_ps(0, 0, 1, 0),
+		// // 		_mm_set_ps(0, 1, 0, 0),
+		// // 		_mm_set_ps(1, z, y, x)
+		// // 	);
+		// // }
+
+		// static Matrix<T, collums, rowSize, vectorSize> Idenity() requires(collums == 4 && (rowSize == 3 || rowSize == 4) && vectorSize == 1 && std::same_as<T, float>) {
+		// 	return Matrix<T, collums, rowSize, vectorSize>(
+		// 		_mm_set_ps(0, 0, 0, 1),
+		// 		_mm_set_ps(0, 0, 1, 0),
+		// 		_mm_set_ps(0, 1, 0, 0),
+		// 		_mm_set_ps(1, 0, 0, 0)
+		// 	);
+		// }
+
+		// static Matrix<T, collums, rowSize, vectorSize> Projection(float FovAngleY, float AspectRatio, float NearZ, float FarZ) requires(collums == 4 && (rowSize == 3 || rowSize == 4) && vectorSize == 1 && std::same_as<T, float>) {
+		// 	float    SinFov;
+		// 	float    CosFov;
+		// 	XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY);
+		// 	float fRange = FarZ / (NearZ - FarZ);
+		// 	// Note: This is recorded on the stack
+		// 	float Height = CosFov / SinFov;
+		// 	__m128 rMem = {
+		// 		Height / AspectRatio,
+		// 		Height,
+		// 		fRange,
+		// 		fRange * NearZ
+		// 	};
+		// 	// Copy from memory to SSE register
+		// 	__m128 vValues = rMem;
+		// 	__m128 vTemp = _mm_setzero_ps();
+		// 	// Copy x only
+		// 	vTemp = _mm_move_ss(vTemp, vValues);
+		// 	// Height / AspectRatio,0,0,0
+		// 	Matrix<T, collums, rowSize, vectorSize> M;
+		// 	M.r[0] = vTemp;
+		// 	// 0,Height,0,0
+		// 	vTemp = vValues;
+		// 	vTemp = _mm_and_ps(vTemp, g_XMMaskY.v);
+		// 	M.r[1] = vTemp;
+		// 	// x=fRange,y=-fRange * NearZ,0,-1.0f
+		// 	vTemp = _mm_setzero_ps();
+		// 	vValues = _mm_shuffle_ps(vValues, g_XMNegIdentityR3.v, _MM_SHUFFLE(3, 2, 3, 2));
+		// 	// 0,0,fRange,-1.0f
+		// 	vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(3, 0, 0, 0));
+		// 	M.r[2] = vTemp;
+		// 	// 0,0,fRange * NearZ,0.0f
+		// 	vTemp = _mm_shuffle_ps(vTemp, vValues, _MM_SHUFFLE(2, 1, 0, 0));
+		// 	M.r[3] = vTemp;
+		// 	return M;
+		// }
+
+		// template <uint32_t vectorRowSize>
+		// Vector<T, rowSize> operator*(Vector<T, vectorRowSize> b) const requires(collums == 4 && vectorRowSize == 4 && rowSize >= 4 && vectorSize == 1 && std::same_as<T, float>) {
+
+
+		// 	//std::cout << Vector<T, rowSize>(allX).ToString() << std::endl;
+			
+		// 	// __m128 result = _mm_permute_ps(b.v, 0b00000000);
+		// 	// result = _mm_fmadd_ps(result, r[0], r[3]);
+
+		// 	// __m128 allY = _mm_permute_ps(b.v, 0b10101010);
+		// 	// result = _mm_fmadd_ps(allY, r[1], result);
+
+		// 	// __m128 allZ = _mm_permute_ps(b.v, 0b01010101);
+		// 	// return Vector<T, rowSize>(_mm_fmadd_ps(allZ, r[2], result));
+		// 	return Vector<T, vectorRowSize>(1, 2, 3, 4);
+		// }
+
+
+		// Matrix<T, collums, rowSize, vectorSize> operator*(Matrix<T, collums, rowSize, vectorSize> b) const requires(collums == 4 && rowSize == 4 && vectorSize == 1 && std::same_as<T, float>) {
+		// 	Matrix<T, collums, rowSize, vectorSize> result;
+		// 	result.r[0] = _mm_permute_ps(b.r[0], 0b00000000);
+		// 	result.r[1] = _mm_fmadd_ps(_mm_permute_ps(b.r[1], 0b00000000), reinterpret_cast<__m128>(r[1]), reinterpret_cast<__m128>(result.r[0]));
+		// 	result.r[1] = _mm_permute_ps(b.r[1], 0b00000000);
+		// 	result.r[2] = _mm_permute_ps(b.r[2], 0b00000000);
+		// 	result.r[3] = _mm_permute_ps(b.r[3], 0b00000000);
+
+		// 	// result.r[0] = _mm_fmadd_ps(allY, reinterpret_cast<__m128>(r[1]), reinterpret_cast<__m128>(result.r[0]));
+		// 	// result.r[0] = _mm_fmadd_ps(allZ, reinterpret_cast<__m128>(r[2]), reinterpret_cast<__m128>(result.r[0]));
+		// 	// result.r[0] = _mm_fmadd_ps(allW, reinterpret_cast<__m128>(r[3]), reinterpret_cast<__m128>(result.r[0]));
+
+		// 	Float4x4 store;
+		// 	result.Store(&store);
+
+		// 	std::cout << std::format("{}", store) << std::endl;
+
+		// 	return result;
+		// }
+
+		// void Store(Float4x4* store) const requires(collums == 4 && rowSize == 4 && vectorSize == 1 && std::same_as<T, float>) {
+		// 	_mm_storeu_ps(store->r1, reinterpret_cast<__m128>(r[0]));
+		// 	_mm_storeu_ps(store->r2, reinterpret_cast<__m128>(r[1]));
+		// 	_mm_storeu_ps(store->r3, reinterpret_cast<__m128>(r[2]));
+		// 	_mm_storeu_ps(store->r4, reinterpret_cast<__m128>(r[3]));
+		// }
+
+
+		// // VectorVector<T, 4, 4> operator*(VectorVector<T, 4, 4> b) requires(collums == 4 && rowSize == 4 && vectorSize == 4 && std::same_as<T, float>) {
+		// // 	__m512 result = _mm512_permute_ps(b.v, 0b11111111);
+		// // 	result = _mm512_fmadd_ps(result, reinterpret_cast<__m512>(r[0]), reinterpret_cast<__m512>(r[3]));
+
+		// // 	__m512 allY = _mm512_permute_ps(b.v, 0b10101010);
+		// // 	result = _mm512_fmadd_ps(allY, reinterpret_cast<__m512>(r[1]), result);
+
+		// // 	__m512 allZ = _mm512_permute_ps(b.v, 0b01010101);
+		// // 	return VectorVector<T, 4, 4>(_mm512_fmadd_ps(allZ, reinterpret_cast<__m512>(r[2]), result));
+		// // }
+
+		// // m4x4float Transpose() const {
+		// // 	// x.x,x.y,y.x,y.y
+		// // 	__m128 vTemp1 = _mm_shuffle_ps(r[0], r[1], _MM_SHUFFLE(1, 0, 1, 0));
+		// // 	// x.z,x.w,y.z,y.w
+		// // 	__m128 vTemp3 = _mm_shuffle_ps(r[0], r[1], _MM_SHUFFLE(3, 2, 3, 2));
+		// // 	// z.x,z.y,w.x,w.y
+		// // 	__m128 vTemp2 = _mm_shuffle_ps(r[2], r[3], _MM_SHUFFLE(1, 0, 1, 0));
+		// // 	// z.z,z.w,w.z,w.w
+		// // 	__m128 vTemp4 = _mm_shuffle_ps(r[2], r[3], _MM_SHUFFLE(3, 2, 3, 2));
+
+		// // 	return m4x4float(
+		// // 		_mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(2, 0, 2, 0)),
+		// // 		_mm_shuffle_ps(vTemp1, vTemp2, _MM_SHUFFLE(3, 1, 3, 1)),
+		// // 		_mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(2, 0, 2, 0)),
+		// // 		_mm_shuffle_ps(vTemp3, vTemp4, _MM_SHUFFLE(3, 1, 3, 1))
+		// // 	);
+		// // }
+		// // m4x4float operator*(m4x4float b) const {
+		// // 	__m256 t0 = _mm256_castps128_ps256(r[0]);
+		// // 	t0 = _mm256_insertf128_ps(t0, r[1], 1);
+		// // 	__m256 t1 = _mm256_castps128_ps256(r[2]);
+		// // 	t1 = _mm256_insertf128_ps(t1, r[3], 1);
+
+		// // 	__m256 u0 = _mm256_castps128_ps256(b.r[0]);
+		// // 	u0 = _mm256_insertf128_ps(u0, b.r[1], 1);
+		// // 	__m256 u1 = _mm256_castps128_ps256(b.r[2]);
+		// // 	u1 = _mm256_insertf128_ps(u1, b.r[3], 1);
+
+		// // 	__m256 a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 0, 0, 0));
+		// // 	__m256 a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0));
+		// // 	__m256 b0 = _mm256_permute2f128_ps(u0, u0, 0x00);
+		// // 	__m256 c0 = _mm256_mul_ps(a0, b0);
+		// // 	__m256 c1 = _mm256_mul_ps(a1, b0);
+
+		// // 	a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(1, 1, 1, 1));
+		// // 	a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1));
+		// // 	b0 = _mm256_permute2f128_ps(u0, u0, 0x11);
+		// // 	__m256 c2 = _mm256_fmadd_ps(a0, b0, c0);
+		// // 	__m256 c3 = _mm256_fmadd_ps(a1, b0, c1);
+
+		// // 	a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(2, 2, 2, 2));
+		// // 	a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(2, 2, 2, 2));
+		// // 	__m256 b1 = _mm256_permute2f128_ps(u1, u1, 0x00);
+		// // 	__m256 c4 = _mm256_mul_ps(a0, b1);
+		// // 	__m256 c5 = _mm256_mul_ps(a1, b1);
+
+		// // 	a0 = _mm256_shuffle_ps(t0, t0, _MM_SHUFFLE(3, 3, 3, 3));
+		// // 	a1 = _mm256_shuffle_ps(t1, t1, _MM_SHUFFLE(3, 3, 3, 3));
+		// // 	b1 = _mm256_permute2f128_ps(u1, u1, 0x11);
+		// // 	__m256 c6 = _mm256_fmadd_ps(a0, b1, c4);
+		// // 	__m256 c7 = _mm256_fmadd_ps(a1, b1, c5);
+
+		// // 	t0 = _mm256_add_ps(c2, c6);
+		// // 	t1 = _mm256_add_ps(c3, c7);
+
+		// // 	return m4x4float(
+		// // 		_mm256_castps256_ps128(t0),
+		// // 		_mm256_extractf128_ps(t0, 1),
+		// // 		_mm256_castps256_ps128(t1),
+		// // 		_mm256_extractf128_ps(t1, 1)
+		// // 	);
+		// // }
+
+
+		// Writes the four column registers to store.c1 .. store.c4 using unaligned stores.
+		// Constrained like the other 4x4 float members: the body assumes exactly four __m128 columns,
+		// so it must not be selectable for any other instantiation.
+		void Store(Crafter::Float4x4& store) const requires(collumSize == 4 && rowSize == 4 && repeats == 1 && std::same_as<T, float>) {
+			_mm_storeu_ps(store.c1, c[0]);
+			_mm_storeu_ps(store.c2, c[1]);
+			_mm_storeu_ps(store.c3, c[2]);
+			_mm_storeu_ps(store.c4, c[3]);
+		}
+	};
+}
+
+// std::format support for the 4x4 float matrix: the registers are copied out into a
+// Crafter::Float4x4 and the text is produced by formatting that value with "{}".
+template <>
+struct std::formatter<Crafter::Matrix<float, 4, 4, 1>> : std::formatter<std::string> {
+    auto format(const Crafter::Matrix<float, 4, 4, 1>& obj, format_context& ctx) const {
+        Crafter::Float4x4 unpacked;
+        obj.Store(unpacked);
+        const std::string text = std::format("{}", unpacked);
+        return std::formatter<std::string>::format(text, ctx);
+    }
+};
--- a/Crafter.Math-Misc.cppm
+++ b/Crafter.Math-Misc.cppm
@ -0,0 +1,66 @@
+
+module;
+
+#include <cstdint>
+#include <stdfloat>
+
+export module Crafter.Math:Misc;
+
+export namespace Crafter {
+    //-------------------------------------------------------------------------------------
+    // DirectXMathMisc.inl -- SIMD C++ Math library
+    //
+    // Copyright (c) Microsoft Corporation.
+    // Licensed under the MIT License.
+    //
+    // http://go.microsoft.com/fwlink/?LinkID=615560
+    //-------------------------------------------------------------------------------------
+    // Single-precision constants derived from pi.
+    constexpr float XM_PI = 3.141592654f;       // pi
+    constexpr float XM_2PI = 6.283185307f;      // 2 * pi
+    constexpr float XM_1DIVPI = 0.318309886f;   // 1 / pi
+    constexpr float XM_1DIV2PI = 0.159154943f;  // 1 / (2 * pi)
+    constexpr float XM_PIDIV2 = 1.570796327f;   // pi / 2
+    constexpr float XM_PIDIV4 = 0.785398163f;   // pi / 4
+
+
+    // Computes polynomial approximations of sin(Value) and cos(Value) in one call.
+    //   pSin  - receives the sine approximation
+    //   pCos  - receives the cosine approximation
+    //   Value - the angle; it is reduced modulo XM_2PI, i.e. it is treated as radians
+    // Neither pointer is checked for null before it is written through.
+    inline void XMScalarSinCos(float* pSin, float* pCos, float  Value) noexcept
+    {
+        // Split Value into a whole number of turns plus a remainder in [-pi, pi].
+        // Adding a bias of +/-0.5 (sign chosen from Value) before truncating to int
+        // rounds the turn count to the nearest integer.
+        float turns = XM_1DIV2PI * Value;
+        const float roundingBias = (Value >= 0.0f) ? 0.5f : -0.5f;
+        turns = static_cast<float>(static_cast<int>(turns + roundingBias));
+        float angle = Value - XM_2PI * turns;
+
+        // Reflect the remainder into [-pi/2, pi/2]. The reflection leaves the sine
+        // unchanged but flips the sign of the cosine, which is tracked in cosSign.
+        float cosSign = +1.0f;
+        if (angle > XM_PIDIV2)
+        {
+            angle = XM_PI - angle;
+            cosSign = -1.0f;
+        }
+        else if (angle < -XM_PIDIV2)
+        {
+            angle = -XM_PI - angle;
+            cosSign = -1.0f;
+        }
+
+        const float angleSq = angle * angle;
+
+        // Sine: 11-degree minimax polynomial (odd powers only, hence the final multiply by angle).
+        *pSin = (((((-2.3889859e-08f * angleSq + 2.7525562e-06f) * angleSq - 0.00019840874f) * angleSq + 0.0083333310f) * angleSq - 0.16666667f) * angleSq + 1.0f) * angle;
+
+        // Cosine: 10-degree minimax polynomial (even powers only), then the reflection sign is applied.
+        const float cosPoly = ((((-2.6051615e-07f * angleSq + 2.4760495e-05f) * angleSq - 0.0013888378f) * angleSq + 0.041666638f) * angleSq - 0.5f) * angleSq + 1.0f;
+        *pCos = cosSign * cosPoly;
+    }
+}
--- a/Crafter.Math-Vector.cppm
+++ b/Crafter.Math-Vector.cppm
@ -0,0 +1,881 @@
+module;
+
+#include <type_traits>
+#include <concepts>
+#include <immintrin.h>
+#include <string>
+#include <sstream>
+#include <iostream>
+
+export module Crafter.Math:Vector;
+
+import :BasicTypes;
+
+namespace Crafter {
+	export template <typename T, uint32_t len>
+		class Vector {
+		// vector_type: the SIMD register type that backs this vector. Declared before the
+		// `public:` label, so it is private to the class.
+		// The nested std::conditional chain picks the first matching entry:
+		//   - signed and unsigned 8/16/32/64-bit integers, by sizeof(T)*len in bytes:
+		//       > 32 -> __m512i, > 16 -> __m256i, otherwise __m128i
+		//   - __fp16, by len: > 16 -> __m512h, > 8 -> __m256h, otherwise __m128h
+		//   - float,  by len: > 8 -> __m512, > 4 -> __m256, otherwise __m128
+		//   - double, by len: > 4 -> __m512d, > 2 -> __m256d
+		//   - anything else (small double vectors, and any T not listed) falls through to __m128d.
+		typedef
+			typename std::conditional<(sizeof(T)* len > 32 && (std::same_as<T, int64_t> || std::same_as<T, int32_t> || std::same_as<T, int16_t> || std::same_as<T, int8_t> || std::same_as<T, uint64_t> || std::same_as<T, uint32_t> || std::same_as<T, uint16_t> || std::same_as<T, uint8_t>)), __m512i,
+			typename std::conditional<(sizeof(T)* len > 16 && (std::same_as<T, int64_t> || std::same_as<T, int32_t> || std::same_as<T, int16_t> || std::same_as<T, int8_t> || std::same_as<T, uint64_t> || std::same_as<T, uint32_t> || std::same_as<T, uint16_t> || std::same_as<T, uint8_t>)), __m256i,
+			typename std::conditional<(sizeof(T)* len <= 16 && (std::same_as<T, int64_t> || std::same_as<T, int32_t> || std::same_as<T, int16_t> || std::same_as<T, int8_t> || std::same_as<T, uint64_t> || std::same_as<T, uint32_t> || std::same_as<T, uint16_t> || std::same_as<T, uint8_t>)), __m128i,
+			typename std::conditional<(len > 16 && std::same_as<T, __fp16>), __m512h,
+			typename std::conditional<(len > 8 && std::same_as<T, __fp16>), __m256h,
+			typename std::conditional<(len <= 8 && std::same_as<T, __fp16>), __m128h,
+			typename std::conditional<(len > 8 && std::same_as<T, float>), __m512,
+			typename std::conditional<(len > 4 && std::same_as<T, float>), __m256,
+			typename std::conditional<(len <= 4 && std::same_as<T, float>), __m128,
+			typename std::conditional<(len > 4 && std::same_as<T, double>), __m512d,
+			typename std::conditional<(len > 2 && std::same_as<T, double>), __m256d, __m128d
+			>::type>::type>::type>::type>::type>::type>::type>::type>::type>::type>::type vector_type;
+
+
+		public:
+		// Returns how many Datatype elements fit in one Vectortype register (e.g. 4 for float in
+		// __m128, 64 for int8_t in __m512i). ToString uses it to size its scratch buffer, and the
+		// arithmetic operators compare it between two vectors to check that they share a register width.
+		//
+		// Both template parameters default to this vector's own element and register type, so the
+		// argument-less form GetVectorAlignedSize() used in the operators' requires-clauses is
+		// well-formed. The count is derived from the type sizes rather than from a per-type table,
+		// which yields the same values for every signed/floating combination and additionally
+		// covers the unsigned integer element types.
+		template <typename Datatype = T, typename Vectortype = vector_type>
+		static consteval uint8_t GetVectorAlignedSize() {
+			static_assert(sizeof(Vectortype) % sizeof(Datatype) == 0, "register size must be a whole multiple of the element size");
+			static_assert(sizeof(Vectortype) / sizeof(Datatype) <= 255, "lane count must fit in uint8_t");
+			return static_cast<uint8_t>(sizeof(Vectortype) / sizeof(Datatype));
+		}
+			vector_type v;
+
+			// Default constructor: performs no initialization of v.
+			// NOTE(review): the value constructors further down give every parameter a default,
+			// so a zero-argument construction may be ambiguous with this overload - confirm.
+			Vector() {}
+
+			// Wrap an existing register. Only the overload whose parameter type equals vector_type is enabled.
+			// 128-bit registers
+			Vector(__m128h reg) requires(std::same_as<vector_type, __m128h>) : v(reg) { }
+			Vector(__m128 reg) requires(std::same_as<vector_type, __m128>) : v(reg) { }
+			Vector(__m128d reg) requires(std::same_as<vector_type, __m128d>) : v(reg) { }
+			Vector(__m128i reg) requires(std::same_as<vector_type, __m128i>) : v(reg) { }
+
+			// 256-bit registers
+			Vector(__m256h reg) requires(std::same_as<vector_type, __m256h>) : v(reg) { }
+			Vector(__m256 reg) requires(std::same_as<vector_type, __m256>) : v(reg) { }
+			Vector(__m256d reg) requires(std::same_as<vector_type, __m256d>) : v(reg) { }
+			Vector(__m256i reg) requires(std::same_as<vector_type, __m256i>) : v(reg) { }
+
+			// 512-bit registers
+			Vector(__m512h reg) requires(std::same_as<vector_type, __m512h>) : v(reg) { }
+			Vector(__m512 reg) requires(std::same_as<vector_type, __m512>) : v(reg) { }
+			Vector(__m512d reg) requires(std::same_as<vector_type, __m512d>) : v(reg) { }
+			Vector(__m512i reg) requires(std::same_as<vector_type, __m512i>) : v(reg) { }
+
+			// Compound assignment operators. Each one evaluates the matching binary operator and
+			// copies the resulting register back into this vector. They return void, so they
+			// cannot be chained. b may have a different length as long as both vectors report the
+			// same GetVectorAlignedSize().
+			template <uint32_t blen>
+			void operator+=(Vector<T, blen> b) requires(Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const Vector<T, len> sum = *this + b;
+				v = sum.v;
+			}
+			template <uint32_t blen>
+			void operator-=(Vector<T, blen> b) requires(Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const Vector<T, len> difference = *this - b;
+				v = difference.v;
+			}
+			template <uint32_t blen>
+			void operator*=(Vector<T, blen> b) requires(Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const Vector<T, len> product = *this * b;
+				v = product.v;
+			}
+			template <uint32_t blen>
+			void operator/=(Vector<T, blen> b) requires(Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const Vector<T, len> quotient = *this / b;
+				v = quotient.v;
+			}
+
+			// Returns the first len elements as text in the form "{ a, b, c }".
+			// The whole register is stored to a scratch array first (Store always writes a full
+			// register), then only the first len entries are formatted with std::format("{}").
+			std::string ToString() const {
+				constexpr uint32_t lanes = GetVectorAlignedSize<T, vector_type>();
+				// Reading more elements than the register holds would run past the scratch array.
+				static_assert(len <= lanes, "len exceeds the number of lanes in the backing register");
+
+				T store[lanes];
+				Store(store);
+
+				std::string text = "{ ";
+				// The counter has the same type as len so the comparison can never wrap.
+				for (uint32_t i = 0; i < len; i++) {
+					text += std::format("{}", store[i]);
+					if (i + 1 < len) {
+						text += ", ";
+					}
+				}
+				text += " }";
+				return text;
+			}
+#pragma region 128
+			// Builds a 128-bit half-precision vector from up to eight scalars; omitted values are 0.
+			// Arguments are given in lane order (x0 -> lane 0 ... w1 -> lane 7).
+			Vector(
+				const __fp16& x0 = 0, const __fp16& y0 = 0, const __fp16& z0 = 0, const __fp16& w0 = 0,
+				const __fp16& x1 = 0, const __fp16& y1 = 0, const __fp16& z1 = 0, const __fp16& w1 = 0
+			) requires(std::same_as<T, __fp16> && std::same_as<vector_type, __m128h>) {
+				__fp16 temp[]{ x0, y0, z0, w0, x1, y1, z1, w1 };
+				// temp is an ordinary local array with only the alignment of __fp16, so it must be
+				// read with the unaligned load; the aligned form requires a 16-byte aligned address.
+				v = _mm_loadu_ph(temp);
+			}
+
+			// Builds a 128-bit float vector; arguments are in lane order (x0 -> lane 0), omitted values are 0.
+			Vector(float x0 = 0, float y0 = 0, float z0 = 0, float w0 = 0) requires(std::same_as<T, float>&& std::same_as<vector_type, __m128>) {
+				// _mm_setr_ps takes its arguments lowest lane first.
+				v = _mm_setr_ps(x0, y0, z0, w0);
+			}
+
+			// Builds a 128-bit double vector; x0 goes to lane 0 and y0 to lane 1, omitted values are 0.
+			Vector(double x0 = 0, double y0 = 0) requires(std::same_as<T, double>&& std::same_as<vector_type, __m128d>) {
+				v = _mm_setr_pd(x0, y0);
+			}
+
+			// Signed integer constructors for 128-bit vectors. In every overload the arguments are
+			// given in lane order (the first argument lands in lane 0) and omitted values are 0.
+
+			// Sixteen 8-bit lanes.
+			Vector(
+				int8_t x0 = 0, int8_t y0 = 0, int8_t z0 = 0, int8_t w0 = 0,
+				int8_t x1 = 0, int8_t y1 = 0, int8_t z1 = 0, int8_t w1 = 0,
+				int8_t x2 = 0, int8_t y2 = 0, int8_t z2 = 0, int8_t w2 = 0,
+				int8_t x3 = 0, int8_t y3 = 0, int8_t z3 = 0, int8_t w3 = 0
+			) requires(std::same_as<T, int8_t> && std::same_as<vector_type, __m128i>) {
+				v = _mm_setr_epi8(x0, y0, z0, w0, x1, y1, z1, w1, x2, y2, z2, w2, x3, y3, z3, w3);
+			}
+
+			// Eight 16-bit lanes.
+			Vector(
+				int16_t x0 = 0, int16_t y0 = 0, int16_t z0 = 0, int16_t w0 = 0,
+				int16_t x1 = 0, int16_t y1 = 0, int16_t z1 = 0, int16_t w1 = 0
+			) requires(std::same_as<T, int16_t>&& std::same_as<vector_type, __m128i>) {
+				v = _mm_setr_epi16(x0, y0, z0, w0, x1, y1, z1, w1);
+			}
+
+			// Four 32-bit lanes.
+			Vector(int32_t x0 = 0, int32_t y0 = 0, int32_t z0 = 0, int32_t w0 = 0) requires(std::same_as<T, int32_t>&& std::same_as<vector_type, __m128i>) {
+				v = _mm_setr_epi32(x0, y0, z0, w0);
+			}
+
+			// Two 64-bit lanes. _mm_set_epi64x takes the high lane first, hence the swapped order.
+			Vector(int64_t x0 = 0, int64_t y0 = 0) requires(std::same_as<T, int64_t>&& std::same_as<vector_type, __m128i>) {
+				const int64_t highLane = y0;
+				const int64_t lowLane = x0;
+				v = _mm_set_epi64x(highLane, lowLane);
+			}
+
+			// Unsigned integer constructors for 128-bit vectors. They mirror the signed overloads:
+			// arguments are given in lane order (the first argument lands in lane 0), omitted
+			// values are 0, and the values are handed to the same integer set intrinsics.
+
+			// Sixteen 8-bit lanes.
+			Vector(
+				uint8_t x0 = 0, uint8_t y0 = 0, uint8_t z0 = 0, uint8_t w0 = 0,
+				uint8_t x1 = 0, uint8_t y1 = 0, uint8_t z1 = 0, uint8_t w1 = 0,
+				uint8_t x2 = 0, uint8_t y2 = 0, uint8_t z2 = 0, uint8_t w2 = 0,
+				uint8_t x3 = 0, uint8_t y3 = 0, uint8_t z3 = 0, uint8_t w3 = 0
+			) requires(std::same_as<T, uint8_t> && std::same_as<vector_type, __m128i>) {
+				v = _mm_setr_epi8(x0, y0, z0, w0, x1, y1, z1, w1, x2, y2, z2, w2, x3, y3, z3, w3);
+			}
+
+			// Eight 16-bit lanes.
+			Vector(
+				uint16_t x0 = 0, uint16_t y0 = 0, uint16_t z0 = 0, uint16_t w0 = 0,
+				uint16_t x1 = 0, uint16_t y1 = 0, uint16_t z1 = 0, uint16_t w1 = 0
+			) requires(std::same_as<T, uint16_t>&& std::same_as<vector_type, __m128i>) {
+				v = _mm_setr_epi16(x0, y0, z0, w0, x1, y1, z1, w1);
+			}
+
+			// Four 32-bit lanes.
+			Vector(uint32_t x0 = 0, uint32_t y0 = 0, uint32_t z0 = 0, uint32_t w0 = 0) requires(std::same_as<T, uint32_t>&& std::same_as<vector_type, __m128i>) {
+				v = _mm_setr_epi32(x0, y0, z0, w0);
+			}
+
+			// Two 64-bit lanes. _mm_set_epi64x takes the high lane first, hence the swapped order.
+			Vector(uint64_t x0 = 0, uint64_t y0 = 0) requires(std::same_as<T, uint64_t>&& std::same_as<vector_type, __m128i>) {
+				const uint64_t highLane = y0;
+				const uint64_t lowLane = x0;
+				v = _mm_set_epi64x(highLane, lowLane);
+			}
+
+			// Returns a vector with every lane set to 0. Only available when the backing register is __m128.
+			static Vector<T, len> Zero() requires(std::same_as<vector_type, __m128>) {
+				const __m128 zeros = _mm_setzero_ps();
+				return Vector<T, len>(zeros);
+			}
+
+			// Store: writes the complete 128-bit register to data with an unaligned store.
+			// data must have room for a full register (8 halves / 4 floats / 2 doubles),
+			// regardless of len.
+
+			// Half precision.
+			void Store(T* data) const requires(std::same_as<vector_type, __m128h>) {
+				void* const destination = reinterpret_cast<void*>(data);
+				_mm_storeu_ph(destination, reinterpret_cast<__m128h>(v));
+			}
+			// Single precision.
+			void Store(T* data) const requires(std::same_as<vector_type, __m128>) {
+				float* const destination = reinterpret_cast<float*>(data);
+				_mm_storeu_ps(destination, reinterpret_cast<__m128>(v));
+			}
+			// Double precision.
+			void Store(T* data) const requires(std::same_as<vector_type, __m128d>) {
+				const __m128d value = reinterpret_cast<__m128d>(v);
+				_mm_storeu_pd(data, value);
+			}
+			// Integer Store overloads: write the complete 128-bit register to data with an
+			// unaligned store. data must have room for all 16 bytes regardless of len.
+			// Each overload accepts the signed and the unsigned element type of its width, so the
+			// unsigned vectors that the constructors can build can also be stored (and therefore
+			// printed by ToString). The stored bit pattern is the same for both signednesses.
+
+			// 8-bit elements.
+			void Store(T* data) const requires((std::same_as<T, int8_t> || std::same_as<T, uint8_t>) && std::same_as<vector_type, __m128i>) {
+				_mm_storeu_epi8(reinterpret_cast<void*>(data), reinterpret_cast<__m128i>(v));
+			}
+			// 16-bit elements.
+			void Store(T* data) const requires((std::same_as<T, int16_t> || std::same_as<T, uint16_t>) && std::same_as<vector_type, __m128i>) {
+				_mm_storeu_epi16(reinterpret_cast<void*>(data), reinterpret_cast<__m128i>(v));
+			}
+			// 32-bit elements.
+			void Store(T* data) const requires((std::same_as<T, int32_t> || std::same_as<T, uint32_t>) && std::same_as<vector_type, __m128i>) {
+				_mm_storeu_epi32(reinterpret_cast<void*>(data), reinterpret_cast<__m128i>(v));
+			}
+			// 64-bit elements.
+			void Store(T* data) const requires((std::same_as<T, int64_t> || std::same_as<T, uint64_t>) && std::same_as<vector_type, __m128i>) {
+				_mm_storeu_epi64(reinterpret_cast<void*>(data), reinterpret_cast<__m128i>(v));
+			}
+
+			// Lane-wise arithmetic for 128-bit half-precision vectors.
+			// b may have a different length; the result always has this vector's length.
+			template <uint32_t blen>
+			Vector<T, len> operator+(Vector<T, blen> b) requires(std::same_as<vector_type, __m128h> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m128h lhs = reinterpret_cast<__m128h>(v);
+				const __m128h rhs = reinterpret_cast<__m128h>(b.v);
+				return Vector<T, len>(_mm_add_ph(lhs, rhs));
+			}
+			template <uint32_t blen>
+			Vector<T, len> operator-(Vector<T, blen> b) requires(std::same_as<vector_type, __m128h> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m128h lhs = reinterpret_cast<__m128h>(v);
+				const __m128h rhs = reinterpret_cast<__m128h>(b.v);
+				return Vector<T, len>(_mm_sub_ph(lhs, rhs));
+			}
+			template <uint32_t blen>
+			Vector<T, len> operator*(Vector<T, blen> b) requires(std::same_as<vector_type, __m128h> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m128h lhs = reinterpret_cast<__m128h>(v);
+				const __m128h rhs = reinterpret_cast<__m128h>(b.v);
+				return Vector<T, len>(_mm_mul_ph(lhs, rhs));
+			}
+			template <uint32_t blen>
+			Vector<T, len> operator/(Vector<T, blen> b) requires(std::same_as<vector_type, __m128h> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m128h lhs = reinterpret_cast<__m128h>(v);
+				const __m128h rhs = reinterpret_cast<__m128h>(b.v);
+				return Vector<T, len>(_mm_div_ph(lhs, rhs));
+			}
+
+			// Lane-wise arithmetic for 128-bit single-precision vectors.
+			// b may have a different length; the result always has this vector's length.
+			template <uint32_t blen>
+			Vector<T, len> operator+(Vector<T, blen> b) requires(std::same_as<vector_type, __m128> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m128 lhs = reinterpret_cast<__m128>(v);
+				const __m128 rhs = reinterpret_cast<__m128>(b.v);
+				return Vector<T, len>(_mm_add_ps(lhs, rhs));
+			}
+			template <uint32_t blen>
+			Vector<T, len> operator-(Vector<T, blen> b) requires(std::same_as<vector_type, __m128> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m128 lhs = reinterpret_cast<__m128>(v);
+				const __m128 rhs = reinterpret_cast<__m128>(b.v);
+				return Vector<T, len>(_mm_sub_ps(lhs, rhs));
+			}
+			template <uint32_t blen>
+			Vector<T, len> operator*(Vector<T, blen> b) requires(std::same_as<vector_type, __m128> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m128 lhs = reinterpret_cast<__m128>(v);
+				const __m128 rhs = reinterpret_cast<__m128>(b.v);
+				return Vector<T, len>(_mm_mul_ps(lhs, rhs));
+			}
+			template <uint32_t blen>
+			Vector<T, len> operator/(Vector<T, blen> b) requires(std::same_as<vector_type, __m128> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m128 lhs = reinterpret_cast<__m128>(v);
+				const __m128 rhs = reinterpret_cast<__m128>(b.v);
+				return Vector<T, len>(_mm_div_ps(lhs, rhs));
+			}
+
+			// Lane-wise arithmetic for 128-bit double-precision vectors.
+			// b may have a different length; the result always has this vector's length.
+			template <uint32_t blen>
+			Vector<T, len> operator+(Vector<T, blen> b) requires(std::same_as<vector_type, __m128d> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m128d lhs = reinterpret_cast<__m128d>(v);
+				const __m128d rhs = reinterpret_cast<__m128d>(b.v);
+				return Vector<T, len>(_mm_add_pd(lhs, rhs));
+			}
+			template <uint32_t blen>
+			Vector<T, len> operator-(Vector<T, blen> b) requires(std::same_as<vector_type, __m128d> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m128d lhs = reinterpret_cast<__m128d>(v);
+				const __m128d rhs = reinterpret_cast<__m128d>(b.v);
+				return Vector<T, len>(_mm_sub_pd(lhs, rhs));
+			}
+			template <uint32_t blen>
+			Vector<T, len> operator*(Vector<T, blen> b) requires(std::same_as<vector_type, __m128d> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m128d lhs = reinterpret_cast<__m128d>(v);
+				const __m128d rhs = reinterpret_cast<__m128d>(b.v);
+				return Vector<T, len>(_mm_mul_pd(lhs, rhs));
+			}
+			template <uint32_t blen>
+			Vector<T, len> operator/(Vector<T, blen> b) requires(std::same_as<vector_type, __m128d> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m128d lhs = reinterpret_cast<__m128d>(v);
+				const __m128d rhs = reinterpret_cast<__m128d>(b.v);
+				return Vector<T, len>(_mm_div_pd(lhs, rhs));
+			}
+
+			// Lane-wise addition and subtraction for 128-bit vectors of int8_t.
+			// b may have a different length; the result always has this vector's length.
+			template <uint32_t blen>
+			Vector<T, len> operator+(Vector<T, blen> b) requires(std::same_as<T, int8_t> && std::same_as<vector_type, __m128i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m128i lhs = reinterpret_cast<__m128i>(v);
+				const __m128i rhs = reinterpret_cast<__m128i>(b.v);
+				return Vector<T, len>(_mm_add_epi8(lhs, rhs));
+			}
+			template <uint32_t blen>
+			Vector<T, len> operator-(Vector<T, blen> b) requires(std::same_as<T, int8_t> && std::same_as<vector_type, __m128i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m128i lhs = reinterpret_cast<__m128i>(v);
+				const __m128i rhs = reinterpret_cast<__m128i>(b.v);
+				return Vector<T, len>(_mm_sub_epi8(lhs, rhs));
+			}
+			// template <uint32_t blen>
+			// Vector<T, len> operator*(Vector<T, blen> b) requires(std::same_as<T, int8_t> && std::same_as<vector_type, __m512i>) {
+			// 	__m512i v = this->v;
+			// 	__m512i bv = b.v;
+			// 	return Vector<T, len>(_mm512_mul_epi8(v, bv));
+			// }
+			// template <uint32_t blen>
+			// Vector<T, len> operator/(Vector<T, blen> b) requires(std::same_as<T, int8_t> && std::same_as<vector_type, __m512i>) {
+			// 	__m512i v = this->v;
+			// 	__m512i bv = b.v;
+			// 	return Vector<T, len>(_mm512_div_epi8(v, bv));
+			// }
+
+			// Lane-wise addition and subtraction for 128-bit vectors of int16_t.
+			// b may have a different length; the result always has this vector's length.
+			template <uint32_t blen>
+			Vector<T, len> operator+(Vector<T, blen> b) requires(std::same_as<T, int16_t> && std::same_as<vector_type, __m128i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m128i lhs = reinterpret_cast<__m128i>(v);
+				const __m128i rhs = reinterpret_cast<__m128i>(b.v);
+				return Vector<T, len>(_mm_add_epi16(lhs, rhs));
+			}
+			template <uint32_t blen>
+			Vector<T, len> operator-(Vector<T, blen> b) requires(std::same_as<T, int16_t> && std::same_as<vector_type, __m128i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m128i lhs = reinterpret_cast<__m128i>(v);
+				const __m128i rhs = reinterpret_cast<__m128i>(b.v);
+				return Vector<T, len>(_mm_sub_epi16(lhs, rhs));
+			}
+			// template <uint32_t blen>
+			// Vector<T, len> operator*(Vector<T, blen> b) requires(std::same_as<T, int16_t> && std::same_as<vector_type, __m512i>) {
+			// 	__m512i v = this->v;
+			// 	__m512i bv = b.v;
+			// 	return Vector<T, len>(_mm512_mul_epi16(v, bv));
+			// }
+			// template <uint32_t blen>
+			// Vector<T, len> operator/(Vector<T, blen> b) requires(std::same_as<T, int16_t> && std::same_as<vector_type, __m512i>) {
+			// 	__m512i v = this->v;
+			// 	__m512i bv = b.v;
+			// 	return Vector<T, len>(_mm512_div_epi16(v, bv));
+			// }
+
+			template <uint32_t blen>
+			Vector<T, len> operator+(Vector<T, blen> b) requires(std::same_as<T, int32_t> && std::same_as<vector_type, __m128i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				return Vector<T, len>(_mm_add_epi32(reinterpret_cast<__m128i>(v), reinterpret_cast<__m128i>(b.v)));
+			}
+			// Lane-wise 32-bit integer subtraction (wrapping) on a 128-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator-(Vector<T, blen> b) requires(std::same_as<T, int32_t> && std::same_as<vector_type, __m128i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m128i lhs = reinterpret_cast<__m128i>(v);
+				const __m128i rhs = reinterpret_cast<__m128i>(b.v);
+				return Vector<T, len>(_mm_sub_epi32(lhs, rhs));
+			}
+			// Lane-wise 32-bit integer multiplication on a 128-bit register; each lane keeps
+			// the low 32 bits of its product.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator*(Vector<T, blen> b) requires(std::same_as<T, int32_t> && std::same_as<vector_type, __m128i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				// _mm_mul_epi32 multiplies only the even lanes and widens them to 64-bit,
+				// which is not an element-wise product; _mm_mullo_epi32 (SSE4.1) is.
+				return Vector<T, len>(_mm_mullo_epi32(reinterpret_cast<__m128i>(v), reinterpret_cast<__m128i>(b.v)));
+			}
+			// template <uint32_t blen>
+			// Vector<T, len> operator/(Vector<T, blen> b) requires(std::same_as<T, int32_t> && std::same_as<vector_type, __m512i>) {
+			// 	__m512i v = this->v;
+			// 	__m512i bv = b.v;
+			// 	return Vector<T, len>(_mm512_div_epi32(v, bv));
+			// }
+
+			// Lane-wise 64-bit integer addition (wrapping) on a 128-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator+(Vector<T, blen> b) requires(std::same_as<T, int64_t> && std::same_as<vector_type, __m128i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m128i lhs = reinterpret_cast<__m128i>(v);
+				const __m128i rhs = reinterpret_cast<__m128i>(b.v);
+				return Vector<T, len>(_mm_add_epi64(lhs, rhs));
+			}
+			// Lane-wise 64-bit integer subtraction (wrapping) on a 128-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator-(Vector<T, blen> b) requires(std::same_as<T, int64_t> && std::same_as<vector_type, __m128i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m128i lhs = reinterpret_cast<__m128i>(v);
+				const __m128i rhs = reinterpret_cast<__m128i>(b.v);
+				return Vector<T, len>(_mm_sub_epi64(lhs, rhs));
+			}
+			// template <uint32_t blen>
+			// Vector<T, len> operator*(Vector<T, blen> b) requires(std::same_as<T, int64_t> && std::same_as<vector_type, __m512i>) {
+			// 	__m512i v = this->v;
+			// 	__m512i bv = b.v;
+			// 	return Vector<T, len>(_mm512_mul_epi64(v, bv));
+			// }
+			// template <uint32_t blen>
+			// Vector<T, len> operator/(Vector<T, blen> b) requires(std::same_as<T, int64_t> && std::same_as<vector_type, __m512i>) {
+			// 	__m512i v = this->v;
+			// 	__m512i bv = b.v;
+			// 	return Vector<T, len>(_mm512_div_epi64(v, bv));
+			// }
+
+			// NOTE(review): unimplemented stub. The body is empty although the declared
+			// return type is Vector<T, len>, so any call flows off the end of a non-void
+			// function (undefined behaviour). `blen` cannot be deduced from the empty
+			// parameter list and would have to be supplied explicitly.
+			// TODO: implement or remove before use.
+			template <uint32_t blen>
+			Vector<T, len> AddMask() {
+
+			}
+#pragma endregion
+#pragma region 256
+			// Builds a 256-bit half-precision vector from up to sixteen scalars;
+			// omitted arguments default to 0.
+			Vector(
+				const __fp16& x0 = 0, const __fp16& y0 = 0, const __fp16& z0 = 0, const __fp16& w0 = 0,
+				const __fp16& x1 = 0, const __fp16& y1 = 0, const __fp16& z1 = 0, const __fp16& w1 = 0,
+				const __fp16& x2 = 0, const __fp16& y2 = 0, const __fp16& z2 = 0, const __fp16& w2 = 0,
+				const __fp16& x3 = 0, const __fp16& y3 = 0, const __fp16& z3 = 0, const __fp16& w3 = 0
+			) requires(std::same_as<T, __fp16>&& std::same_as<vector_type, __m256h>) {
+				// NOTE(review): temp[0] is w0, so lane 0 receives w0, whereas the
+				// float/double/integer 256-bit constructors put x0 in lane 0 — confirm
+				// which ordering is intended.
+				const __fp16 temp[]{ w0,z0,y0,x0,w1,z1,y1,x1,w2,z2,y2,x2,w3,z3,y3,x3 };
+				// The stack array has no 32-byte alignment guarantee, so the aligned
+				// _mm256_load_ph could fault; use the unaligned load instead.
+				v = _mm256_loadu_ph(temp);
+			}
+
+			// Builds a 256-bit float vector; x0 lands in lane 0 and w1 in lane 7.
+			// Omitted arguments default to 0.
+			Vector(
+				float x0 = 0, float y0 = 0, float z0 = 0, float w0 = 0,
+				float x1 = 0, float y1 = 0, float z1 = 0, float w1 = 0
+			) requires(std::same_as<T, float>&& std::same_as<vector_type, __m256>) {
+				// setr takes the lanes in ascending order, i.e. in declaration order.
+				v = _mm256_setr_ps(
+					x0, y0, z0, w0,
+					x1, y1, z1, w1
+				);
+			}
+
+			// Builds a 256-bit double vector; x0 lands in lane 0 and w0 in lane 3.
+			// Omitted arguments default to 0.
+			Vector(double x0 = 0, double y0 = 0, double z0 = 0, double w0 = 0) requires(std::same_as<T, double>&& std::same_as<vector_type, __m256d>) {
+				// setr takes the lanes in ascending order, i.e. in declaration order.
+				v = _mm256_setr_pd(x0, y0, z0, w0);
+			}
+
+			// Builds a 256-bit vector of thirty-two 8-bit integers; x0 lands in lane 0
+			// and w7 in lane 31. Omitted arguments default to 0.
+			Vector(
+				int8_t x0 = 0, int8_t y0 = 0, int8_t z0 = 0, int8_t w0 = 0,
+				int8_t x1 = 0, int8_t y1 = 0, int8_t z1 = 0, int8_t w1 = 0,
+				int8_t x2 = 0, int8_t y2 = 0, int8_t z2 = 0, int8_t w2 = 0,
+				int8_t x3 = 0, int8_t y3 = 0, int8_t z3 = 0, int8_t w3 = 0,
+				int8_t x4 = 0, int8_t y4 = 0, int8_t z4 = 0, int8_t w4 = 0,
+				int8_t x5 = 0, int8_t y5 = 0, int8_t z5 = 0, int8_t w5 = 0,
+				int8_t x6 = 0, int8_t y6 = 0, int8_t z6 = 0, int8_t w6 = 0,
+				int8_t x7 = 0, int8_t y7 = 0, int8_t z7 = 0, int8_t w7 = 0
+			) requires(std::same_as<T, int8_t>&& std::same_as<vector_type, __m256i>) {
+				// setr takes the lanes in ascending order, i.e. in declaration order.
+				v = _mm256_setr_epi8(
+					x0, y0, z0, w0,
+					x1, y1, z1, w1,
+					x2, y2, z2, w2,
+					x3, y3, z3, w3,
+					x4, y4, z4, w4,
+					x5, y5, z5, w5,
+					x6, y6, z6, w6,
+					x7, y7, z7, w7
+				);
+			}
+
+			// Builds a 256-bit vector of sixteen 16-bit integers; x0 lands in lane 0
+			// and w3 in lane 15. Omitted arguments default to 0.
+			Vector(
+				int16_t x0 = 0, int16_t y0 = 0, int16_t z0 = 0, int16_t w0 = 0,
+				int16_t x1 = 0, int16_t y1 = 0, int16_t z1 = 0, int16_t w1 = 0,
+				int16_t x2 = 0, int16_t y2 = 0, int16_t z2 = 0, int16_t w2 = 0,
+				int16_t x3 = 0, int16_t y3 = 0, int16_t z3 = 0, int16_t w3 = 0
+			) requires(std::same_as<T, int16_t>&& std::same_as<vector_type, __m256i>) {
+				// setr takes the lanes in ascending order, i.e. in declaration order.
+				v = _mm256_setr_epi16(
+					x0, y0, z0, w0,
+					x1, y1, z1, w1,
+					x2, y2, z2, w2,
+					x3, y3, z3, w3
+				);
+			}
+
+			// Builds a 256-bit vector of eight 32-bit integers; x0 lands in lane 0
+			// and w1 in lane 7. Omitted arguments default to 0.
+			Vector(
+				int32_t x0 = 0, int32_t y0 = 0, int32_t z0 = 0, int32_t w0 = 0,
+				int32_t x1 = 0, int32_t y1 = 0, int32_t z1 = 0, int32_t w1 = 0
+			) requires(std::same_as<T, int32_t>&& std::same_as<vector_type, __m256i>) {
+				// setr takes the lanes in ascending order, i.e. in declaration order.
+				v = _mm256_setr_epi32(
+					x0, y0, z0, w0,
+					x1, y1, z1, w1
+				);
+			}
+
+			// Builds a 256-bit vector of four 64-bit integers; x0 lands in lane 0 and
+			// w0 in lane 3. Omitted arguments default to 0.
+			// The constraint previously named __m256 (the float register type), which an
+			// int64_t vector never uses, so this constructor could not be selected.
+			Vector(int64_t x0 = 0, int64_t y0 = 0, int64_t z0 = 0, int64_t w0 = 0) requires(std::same_as<T, int64_t>&& std::same_as<vector_type, __m256i>) {
+				v = _mm256_set_epi64x(w0, z0, y0, x0);
+			}
+
+			// Lane-wise half-precision addition on a 256-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator+(Vector<T, blen> b) requires(std::same_as<vector_type, __m256h> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m256h lhs = reinterpret_cast<__m256h>(v);
+				const __m256h rhs = reinterpret_cast<__m256h>(b.v);
+				return Vector<T, len>(_mm256_add_ph(lhs, rhs));
+			}
+			// Lane-wise half-precision subtraction on a 256-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator-(Vector<T, blen> b) requires(std::same_as<vector_type, __m256h> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m256h lhs = reinterpret_cast<__m256h>(v);
+				const __m256h rhs = reinterpret_cast<__m256h>(b.v);
+				return Vector<T, len>(_mm256_sub_ph(lhs, rhs));
+			}
+			// Lane-wise half-precision multiplication on a 256-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator*(Vector<T, blen> b) requires(std::same_as<vector_type, __m256h> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m256h lhs = reinterpret_cast<__m256h>(v);
+				const __m256h rhs = reinterpret_cast<__m256h>(b.v);
+				return Vector<T, len>(_mm256_mul_ph(lhs, rhs));
+			}
+			// Lane-wise half-precision division on a 256-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator/(Vector<T, blen> b) requires(std::same_as<vector_type, __m256h> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m256h numerator = reinterpret_cast<__m256h>(v);
+				const __m256h denominator = reinterpret_cast<__m256h>(b.v);
+				return Vector<T, len>(_mm256_div_ph(numerator, denominator));
+			}
+
+			// Lane-wise single-precision addition on a 256-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator+(Vector<T, blen> b) requires(std::same_as<vector_type, __m256> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m256 lhs = reinterpret_cast<__m256>(v);
+				const __m256 rhs = reinterpret_cast<__m256>(b.v);
+				return Vector<T, len>(_mm256_add_ps(lhs, rhs));
+			}
+
+			// Lane-wise single-precision subtraction on a 256-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator-(Vector<T, blen> b) requires(std::same_as<vector_type, __m256> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m256 lhs = reinterpret_cast<__m256>(v);
+				const __m256 rhs = reinterpret_cast<__m256>(b.v);
+				return Vector<T, len>(_mm256_sub_ps(lhs, rhs));
+			}
+			// Lane-wise single-precision multiplication on a 256-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator*(Vector<T, blen> b) requires(std::same_as<vector_type, __m256> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m256 lhs = reinterpret_cast<__m256>(v);
+				const __m256 rhs = reinterpret_cast<__m256>(b.v);
+				return Vector<T, len>(_mm256_mul_ps(lhs, rhs));
+			}
+			// Lane-wise single-precision division on a 256-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator/(Vector<T, blen> b) requires(std::same_as<vector_type, __m256> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m256 numerator = reinterpret_cast<__m256>(v);
+				const __m256 denominator = reinterpret_cast<__m256>(b.v);
+				return Vector<T, len>(_mm256_div_ps(numerator, denominator));
+			}
+
+			// Lane-wise double-precision addition on a 256-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator+(Vector<T, blen> b) requires(std::same_as<vector_type, __m256d> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m256d lhs = reinterpret_cast<__m256d>(v);
+				const __m256d rhs = reinterpret_cast<__m256d>(b.v);
+				return Vector<T, len>(_mm256_add_pd(lhs, rhs));
+			}
+			// Lane-wise double-precision subtraction on a 256-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator-(Vector<T, blen> b) requires(std::same_as<vector_type, __m256d> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m256d lhs = reinterpret_cast<__m256d>(v);
+				const __m256d rhs = reinterpret_cast<__m256d>(b.v);
+				return Vector<T, len>(_mm256_sub_pd(lhs, rhs));
+			}
+			// Lane-wise double-precision multiplication on a 256-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator*(Vector<T, blen> b) requires(std::same_as<vector_type, __m256d> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m256d lhs = reinterpret_cast<__m256d>(v);
+				const __m256d rhs = reinterpret_cast<__m256d>(b.v);
+				return Vector<T, len>(_mm256_mul_pd(lhs, rhs));
+			}
+			// Lane-wise double-precision division on a 256-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator/(Vector<T, blen> b) requires(std::same_as<vector_type, __m256d> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m256d numerator = reinterpret_cast<__m256d>(v);
+				const __m256d denominator = reinterpret_cast<__m256d>(b.v);
+				return Vector<T, len>(_mm256_div_pd(numerator, denominator));
+			}
+
+			// Lane-wise 8-bit integer addition (wrapping) on a 256-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator+(Vector<T, blen> b) requires(std::same_as<T, int8_t> && std::same_as<vector_type, __m256i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m256i lhs = reinterpret_cast<__m256i>(v);
+				const __m256i rhs = reinterpret_cast<__m256i>(b.v);
+				return Vector<T, len>(_mm256_add_epi8(lhs, rhs));
+			}
+			// Lane-wise 8-bit integer subtraction (wrapping) on a 256-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator-(Vector<T, blen> b) requires(std::same_as<T, int8_t> && std::same_as<vector_type, __m256i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m256i lhs = reinterpret_cast<__m256i>(v);
+				const __m256i rhs = reinterpret_cast<__m256i>(b.v);
+				return Vector<T, len>(_mm256_sub_epi8(lhs, rhs));
+			}
+			// template <uint32_t blen>
+			// Vector<T, len> operator*(Vector<T, blen> b) requires(std::same_as<T, int8_t> && std::same_as<vector_type, __m512i>) {
+			// 	__m512i v = this->v;
+			// 	__m512i bv = b.v;
+			// 	return Vector<T, len>(_mm512_mul_epi8(v, bv));
+			// }
+			// template <uint32_t blen>
+			// Vector<T, len> operator/(Vector<T, blen> b) requires(std::same_as<T, int8_t> && std::same_as<vector_type, __m512i>) {
+			// 	__m512i v = this->v;
+			// 	__m512i bv = b.v;
+			// 	return Vector<T, len>(_mm512_div_epi8(v, bv));
+			// }
+
+			// Lane-wise 16-bit integer addition (wrapping) on a 256-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator+(Vector<T, blen> b) requires(std::same_as<T, int16_t> && std::same_as<vector_type, __m256i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m256i lhs = reinterpret_cast<__m256i>(v);
+				const __m256i rhs = reinterpret_cast<__m256i>(b.v);
+				return Vector<T, len>(_mm256_add_epi16(lhs, rhs));
+			}
+			// Lane-wise 16-bit integer subtraction (wrapping) on a 256-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator-(Vector<T, blen> b) requires(std::same_as<T, int16_t> && std::same_as<vector_type, __m256i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m256i lhs = reinterpret_cast<__m256i>(v);
+				const __m256i rhs = reinterpret_cast<__m256i>(b.v);
+				return Vector<T, len>(_mm256_sub_epi16(lhs, rhs));
+			}
+			// template <uint32_t blen>
+			// Vector<T, len> operator*(Vector<T, blen> b) requires(std::same_as<T, int16_t> && std::same_as<vector_type, __m512i>) {
+			// 	__m512i v = this->v;
+			// 	__m512i bv = b.v;
+			// 	return Vector<T, len>(_mm512_mul_epi16(v, bv));
+			// }
+			// template <uint32_t blen>
+			// Vector<T, len> operator/(Vector<T, blen> b) requires(std::same_as<T, int16_t> && std::same_as<vector_type, __m512i>) {
+			// 	__m512i v = this->v;
+			// 	__m512i bv = b.v;
+			// 	return Vector<T, len>(_mm512_div_epi16(v, bv));
+			// }
+
+			// Lane-wise 32-bit integer addition (wrapping) on a 256-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator+(Vector<T, blen> b) requires(std::same_as<T, int32_t> && std::same_as<vector_type, __m256i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m256i lhs = reinterpret_cast<__m256i>(v);
+				const __m256i rhs = reinterpret_cast<__m256i>(b.v);
+				return Vector<T, len>(_mm256_add_epi32(lhs, rhs));
+			}
+			// Lane-wise 32-bit integer subtraction (wrapping) on a 256-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator-(Vector<T, blen> b) requires(std::same_as<T, int32_t> && std::same_as<vector_type, __m256i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m256i lhs = reinterpret_cast<__m256i>(v);
+				const __m256i rhs = reinterpret_cast<__m256i>(b.v);
+				return Vector<T, len>(_mm256_sub_epi32(lhs, rhs));
+			}
+			// Lane-wise 32-bit integer multiplication on a 256-bit register; each lane keeps
+			// the low 32 bits of its product.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator*(Vector<T, blen> b) requires(std::same_as<T, int32_t> && std::same_as<vector_type, __m256i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				// _mm256_mul_epi32 multiplies only the even lanes and widens them to 64-bit,
+				// which is not an element-wise product; _mm256_mullo_epi32 (AVX2) is.
+				return Vector<T, len>(_mm256_mullo_epi32(reinterpret_cast<__m256i>(v), reinterpret_cast<__m256i>(b.v)));
+			}
+			// template <uint32_t blen>
+			// Vector<T, len> operator/(Vector<T, blen> b) requires(std::same_as<T, int32_t> && std::same_as<vector_type, __m512i>) {
+			// 	__m512i v = this->v;
+			// 	__m512i bv = b.v;
+			// 	return Vector<T, len>(_mm512_div_epi32(v, bv));
+			// }
+
+			// Lane-wise 64-bit integer addition (wrapping) on a 256-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator+(Vector<T, blen> b) requires(std::same_as<T, int64_t> && std::same_as<vector_type, __m256i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m256i lhs = reinterpret_cast<__m256i>(v);
+				const __m256i rhs = reinterpret_cast<__m256i>(b.v);
+				return Vector<T, len>(_mm256_add_epi64(lhs, rhs));
+			}
+			// Lane-wise 64-bit integer subtraction (wrapping) on a 256-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator-(Vector<T, blen> b) requires(std::same_as<T, int64_t> && std::same_as<vector_type, __m256i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m256i lhs = reinterpret_cast<__m256i>(v);
+				const __m256i rhs = reinterpret_cast<__m256i>(b.v);
+				return Vector<T, len>(_mm256_sub_epi64(lhs, rhs));
+			}
+			// template <uint32_t blen>
+			// Vector<T, len> operator*(Vector<T, blen> b) requires(std::same_as<T, int64_t> && std::same_as<vector_type, __m512i>) {
+			// 	__m512i v = this->v;
+			// 	__m512i bv = b.v;
+			// 	return Vector<T, len>(_mm512_mul_epi64(v, bv));
+			// }
+			// template <uint32_t blen>
+			// Vector<T, len> operator/(Vector<T, blen> b) requires(std::same_as<T, int64_t> && std::same_as<vector_type, __m512i>) {
+			// 	__m512i v = this->v;
+			// 	__m512i bv = b.v;
+			// 	return Vector<T, len>(_mm512_div_epi64(v, bv));
+			// }
+
+			// Copies all 32 bytes (sixteen half-precision lanes) to `data`; no alignment required.
+			void Store(T* data) const requires(std::same_as<vector_type, __m256h>) {
+				void* const destination = reinterpret_cast<void*>(data);
+				const __m256h lanes = reinterpret_cast<__m256h>(v);
+				_mm256_storeu_ph(destination, lanes);
+			}
+			// Copies all 32 bytes (eight float lanes) to `data`; no alignment required.
+			void Store(T* data) const requires(std::same_as<vector_type, __m256>) {
+				const __m256 lanes = reinterpret_cast<__m256>(v);
+				_mm256_storeu_ps(data, lanes);
+			}
+			// Copies all 32 bytes (four double lanes) to `data`; no alignment required.
+			void Store(T* data) const requires(std::same_as<vector_type, __m256d>) {
+				const __m256d lanes = reinterpret_cast<__m256d>(v);
+				_mm256_storeu_pd(data, lanes);
+			}
+			// Copies all 32 bytes (thirty-two 8-bit lanes) to `data`; no alignment required.
+			void Store(T* data) const requires(std::same_as<T, int8_t> && std::same_as<vector_type, __m256i>) {
+				void* const destination = reinterpret_cast<void*>(data);
+				const __m256i lanes = reinterpret_cast<__m256i>(v);
+				_mm256_storeu_epi8(destination, lanes);
+			}
+			// Copies all 32 bytes (sixteen 16-bit lanes) to `data`; no alignment required.
+			void Store(T* data) const requires(std::same_as<T, int16_t> && std::same_as<vector_type, __m256i>) {
+				void* const destination = reinterpret_cast<void*>(data);
+				const __m256i lanes = reinterpret_cast<__m256i>(v);
+				_mm256_storeu_epi16(destination, lanes);
+			}
+			// Copies all 32 bytes (eight 32-bit lanes) to `data`; no alignment required.
+			void Store(T* data) const requires(std::same_as<T, int32_t> && std::same_as<vector_type, __m256i>) {
+				void* const destination = reinterpret_cast<void*>(data);
+				const __m256i lanes = reinterpret_cast<__m256i>(v);
+				_mm256_storeu_epi32(destination, lanes);
+			}
+			// Copies all 32 bytes (four 64-bit lanes) to `data`; no alignment required.
+			void Store(T* data) const requires(std::same_as<T, int64_t> && std::same_as<vector_type, __m256i>) {
+				void* const destination = reinterpret_cast<void*>(data);
+				const __m256i lanes = reinterpret_cast<__m256i>(v);
+				_mm256_storeu_epi64(destination, lanes);
+			}
+#pragma endregion
+#pragma region 512
+			// Builds a 512-bit half-precision vector from up to thirty-two scalars;
+			// omitted arguments default to 0.
+			Vector(
+				const __fp16& x0 = 0, const __fp16& y0 = 0, const __fp16& z0 = 0, const __fp16& w0 = 0,
+				const __fp16& x1 = 0, const __fp16& y1 = 0, const __fp16& z1 = 0, const __fp16& w1 = 0,
+				const __fp16& x2 = 0, const __fp16& y2 = 0, const __fp16& z2 = 0, const __fp16& w2 = 0,
+				const __fp16& x3 = 0, const __fp16& y3 = 0, const __fp16& z3 = 0, const __fp16& w3 = 0,
+				const __fp16& x4 = 0, const __fp16& y4 = 0, const __fp16& z4 = 0, const __fp16& w4 = 0,
+				const __fp16& x5 = 0, const __fp16& y5 = 0, const __fp16& z5 = 0, const __fp16& w5 = 0,
+				const __fp16& x6 = 0, const __fp16& y6 = 0, const __fp16& z6 = 0, const __fp16& w6 = 0,
+				const __fp16& x7 = 0, const __fp16& y7 = 0, const __fp16& z7 = 0, const __fp16& w7 = 0
+			) requires(std::same_as<T, __fp16>&& std::same_as<vector_type, __m512h>) {
+				// NOTE(review): temp[0] is w0, so lane 0 receives w0, whereas the
+				// float/double/integer 512-bit constructors put x0 in lane 0 — confirm
+				// which ordering is intended.
+				const __fp16 temp[]{
+					w0,z0,y0,x0, w1,z1,y1,x1, w2,z2,y2,x2, w3,z3,y3,x3,
+					w4,z4,y4,x4, w5,z5,y5,x5, w6,z6,y6,x6, w7,z7,y7,x7
+				};
+				// The stack array has no 64-byte alignment guarantee, so the aligned
+				// _mm512_load_ph could fault; use the unaligned load instead.
+				v = _mm512_loadu_ph(temp);
+			}
+
+			// Builds a 512-bit float vector; x0 lands in lane 0 and w3 in lane 15.
+			// Omitted arguments default to 0.
+			Vector(
+				float x0 = 0, float y0 = 0, float z0 = 0, float w0 = 0,
+				float x1 = 0, float y1 = 0, float z1 = 0, float w1 = 0,
+				float x2 = 0, float y2 = 0, float z2 = 0, float w2 = 0,
+				float x3 = 0, float y3 = 0, float z3 = 0, float w3 = 0
+			) requires(std::same_as<T, float>&& std::same_as<vector_type, __m512>) {
+				// setr takes the lanes in ascending order, i.e. in declaration order.
+				v = _mm512_setr_ps(
+					x0, y0, z0, w0,
+					x1, y1, z1, w1,
+					x2, y2, z2, w2,
+					x3, y3, z3, w3
+				);
+			}
+
+			// Builds a 512-bit double vector; x0 lands in lane 0 and w1 in lane 7.
+			// Omitted arguments default to 0.
+			Vector(
+				double x0 = 0, double y0 = 0, double z0 = 0, double w0 = 0,
+				double x1 = 0, double y1 = 0, double z1 = 0, double w1 = 0
+			) requires(std::same_as<T, double>&& std::same_as<vector_type, __m512d>) {
+				// setr takes the lanes in ascending order, i.e. in declaration order.
+				v = _mm512_setr_pd(
+					x0, y0, z0, w0,
+					x1, y1, z1, w1
+				);
+			}
+
+			// Builds a 512-bit vector of sixty-four 8-bit integers; omitted arguments
+			// default to 0. _mm512_set_epi8 takes its arguments from the highest lane
+			// down, so the list below is the parameter list reversed: x0 ends up in
+			// lane 0 and w15 in lane 63.
+			Vector(
+				int8_t x0 = 0, int8_t y0 = 0, int8_t z0 = 0, int8_t w0 = 0,
+				int8_t x1 = 0, int8_t y1 = 0, int8_t z1 = 0, int8_t w1 = 0,
+				int8_t x2 = 0, int8_t y2 = 0, int8_t z2 = 0, int8_t w2 = 0,
+				int8_t x3 = 0, int8_t y3 = 0, int8_t z3 = 0, int8_t w3 = 0,
+				int8_t x4 = 0, int8_t y4 = 0, int8_t z4 = 0, int8_t w4 = 0,
+				int8_t x5 = 0, int8_t y5 = 0, int8_t z5 = 0, int8_t w5 = 0,
+				int8_t x6 = 0, int8_t y6 = 0, int8_t z6 = 0, int8_t w6 = 0,
+				int8_t x7 = 0, int8_t y7 = 0, int8_t z7 = 0, int8_t w7 = 0,
+				int8_t x8 = 0, int8_t y8 = 0, int8_t z8 = 0, int8_t w8 = 0,
+				int8_t x9 = 0, int8_t y9 = 0, int8_t z9 = 0, int8_t w9 = 0,
+				int8_t x10 = 0, int8_t y10 = 0, int8_t z10 = 0, int8_t w10 = 0,
+				int8_t x11 = 0, int8_t y11 = 0, int8_t z11 = 0, int8_t w11 = 0,
+				int8_t x12 = 0, int8_t y12 = 0, int8_t z12 = 0, int8_t w12 = 0,
+				int8_t x13 = 0, int8_t y13 = 0, int8_t z13 = 0, int8_t w13 = 0,
+				int8_t x14 = 0, int8_t y14 = 0, int8_t z14 = 0, int8_t w14 = 0,
+				int8_t x15 = 0, int8_t y15 = 0, int8_t z15 = 0, int8_t w15 = 0
+			) requires(std::same_as<T, int8_t>&& std::same_as<vector_type, __m512i>) {
+				v = _mm512_set_epi8(
+					w15, z15, y15, x15,
+					w14, z14, y14, x14,
+					w13, z13, y13, x13,
+					w12, z12, y12, x12,
+					w11, z11, y11, x11,
+					w10, z10, y10, x10,
+					w9, z9, y9, x9,
+					w8, z8, y8, x8,
+					w7, z7, y7, x7,
+					w6, z6, y6, x6,
+					w5, z5, y5, x5,
+					w4, z4, y4, x4,
+					w3, z3, y3, x3,
+					w2, z2, y2, x2,
+					w1, z1, y1, x1,
+					w0, z0, y0, x0
+				);
+			}
+
+			// Builds a 512-bit vector of thirty-two 16-bit integers; omitted arguments
+			// default to 0. _mm512_set_epi16 takes its arguments from the highest lane
+			// down, so the list below is the parameter list reversed: x0 ends up in
+			// lane 0 and w7 in lane 31.
+			Vector(
+				int16_t x0 = 0, int16_t y0 = 0, int16_t z0 = 0, int16_t w0 = 0,
+				int16_t x1 = 0, int16_t y1 = 0, int16_t z1 = 0, int16_t w1 = 0,
+				int16_t x2 = 0, int16_t y2 = 0, int16_t z2 = 0, int16_t w2 = 0,
+				int16_t x3 = 0, int16_t y3 = 0, int16_t z3 = 0, int16_t w3 = 0,
+				int16_t x4 = 0, int16_t y4 = 0, int16_t z4 = 0, int16_t w4 = 0,
+				int16_t x5 = 0, int16_t y5 = 0, int16_t z5 = 0, int16_t w5 = 0,
+				int16_t x6 = 0, int16_t y6 = 0, int16_t z6 = 0, int16_t w6 = 0,
+				int16_t x7 = 0, int16_t y7 = 0, int16_t z7 = 0, int16_t w7 = 0
+			) requires(std::same_as<T, int16_t>&& std::same_as<vector_type, __m512i>) {
+				v = _mm512_set_epi16(
+					w7, z7, y7, x7,
+					w6, z6, y6, x6,
+					w5, z5, y5, x5,
+					w4, z4, y4, x4,
+					w3, z3, y3, x3,
+					w2, z2, y2, x2,
+					w1, z1, y1, x1,
+					w0, z0, y0, x0
+				);
+			}
+
+			// Builds a 512-bit vector of sixteen 32-bit integers; x0 lands in lane 0
+			// and w3 in lane 15. Omitted arguments default to 0.
+			Vector(
+				int32_t x0 = 0, int32_t y0 = 0, int32_t z0 = 0, int32_t w0 = 0,
+				int32_t x1 = 0, int32_t y1 = 0, int32_t z1 = 0, int32_t w1 = 0,
+				int32_t x2 = 0, int32_t y2 = 0, int32_t z2 = 0, int32_t w2 = 0,
+				int32_t x3 = 0, int32_t y3 = 0, int32_t z3 = 0, int32_t w3 = 0
+			) requires(std::same_as<T, int32_t>&& std::same_as<vector_type, __m512i>) {
+				// setr takes the lanes in ascending order, i.e. in declaration order.
+				v = _mm512_setr_epi32(
+					x0, y0, z0, w0,
+					x1, y1, z1, w1,
+					x2, y2, z2, w2,
+					x3, y3, z3, w3
+				);
+			}
+
+			// Builds a 512-bit vector of eight 64-bit integers; x0 lands in lane 0
+			// and w1 in lane 7. Omitted arguments default to 0.
+			Vector(
+				int64_t x0 = 0, int64_t y0 = 0, int64_t z0 = 0, int64_t w0 = 0,
+				int64_t x1 = 0, int64_t y1 = 0, int64_t z1 = 0, int64_t w1 = 0
+			) requires(std::same_as<T, int64_t>&& std::same_as<vector_type, __m512i>) {
+				// setr takes the lanes in ascending order, i.e. in declaration order.
+				v = _mm512_setr_epi64(
+					x0, y0, z0, w0,
+					x1, y1, z1, w1
+				);
+			}
+
+			// Lane-wise half-precision addition on a 512-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator+(Vector<T, blen> b) requires(std::same_as<vector_type, __m512h> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				// _mm512_add_ph takes __m512h operands; the previous cast to __m512
+				// handed it the float register type instead.
+				return Vector<T, len>(_mm512_add_ph(reinterpret_cast<__m512h>(v), reinterpret_cast<__m512h>(b.v)));
+			}
+			// Lane-wise half-precision subtraction on a 512-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator-(Vector<T, blen> b) requires(std::same_as<vector_type, __m512h> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				// _mm512_sub_ph takes __m512h operands; the previous cast to __m512
+				// handed it the float register type instead.
+				return Vector<T, len>(_mm512_sub_ph(reinterpret_cast<__m512h>(v), reinterpret_cast<__m512h>(b.v)));
+			}
+			// Lane-wise half-precision multiplication on a 512-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator*(Vector<T, blen> b) requires(std::same_as<vector_type, __m512h> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				// _mm512_mul_ph takes __m512h operands; the previous cast to __m512
+				// handed it the float register type instead.
+				return Vector<T, len>(_mm512_mul_ph(reinterpret_cast<__m512h>(v), reinterpret_cast<__m512h>(b.v)));
+			}
+			// Lane-wise half-precision division on a 512-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator/(Vector<T, blen> b) requires(std::same_as<vector_type, __m512h> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				// _mm512_div_ph takes __m512h operands; the previous cast to __m512
+				// handed it the float register type instead.
+				return Vector<T, len>(_mm512_div_ph(reinterpret_cast<__m512h>(v), reinterpret_cast<__m512h>(b.v)));
+			}
+
+			// Lane-wise single-precision addition on a 512-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator+(Vector<T, blen> b) requires(std::same_as<vector_type, __m512> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m512 lhs = reinterpret_cast<__m512>(v);
+				const __m512 rhs = reinterpret_cast<__m512>(b.v);
+				return Vector<T, len>(_mm512_add_ps(lhs, rhs));
+			}
+			// Lane-wise single-precision subtraction on a 512-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator-(Vector<T, blen> b) requires(std::same_as<vector_type, __m512> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m512 lhs = reinterpret_cast<__m512>(v);
+				const __m512 rhs = reinterpret_cast<__m512>(b.v);
+				return Vector<T, len>(_mm512_sub_ps(lhs, rhs));
+			}
+			// Lane-wise single-precision multiplication on a 512-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator*(Vector<T, blen> b) requires(std::same_as<vector_type, __m512> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m512 lhs = reinterpret_cast<__m512>(v);
+				const __m512 rhs = reinterpret_cast<__m512>(b.v);
+				return Vector<T, len>(_mm512_mul_ps(lhs, rhs));
+			}
+			// Lane-wise single-precision division on a 512-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator/(Vector<T, blen> b) requires(std::same_as<vector_type, __m512> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m512 numerator = reinterpret_cast<__m512>(v);
+				const __m512 denominator = reinterpret_cast<__m512>(b.v);
+				return Vector<T, len>(_mm512_div_ps(numerator, denominator));
+			}
+
+			// Lane-wise double-precision addition on a 512-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator+(Vector<T, blen> b) requires(std::same_as<vector_type, __m512d> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m512d lhs = reinterpret_cast<__m512d>(v);
+				const __m512d rhs = reinterpret_cast<__m512d>(b.v);
+				return Vector<T, len>(_mm512_add_pd(lhs, rhs));
+			}
+			// Lane-wise double-precision subtraction on a 512-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator-(Vector<T, blen> b) requires(std::same_as<vector_type, __m512d> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m512d lhs = reinterpret_cast<__m512d>(v);
+				const __m512d rhs = reinterpret_cast<__m512d>(b.v);
+				return Vector<T, len>(_mm512_sub_pd(lhs, rhs));
+			}
+			// Lane-wise double-precision multiplication on a 512-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator*(Vector<T, blen> b) requires(std::same_as<vector_type, __m512d> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m512d lhs = reinterpret_cast<__m512d>(v);
+				const __m512d rhs = reinterpret_cast<__m512d>(b.v);
+				return Vector<T, len>(_mm512_mul_pd(lhs, rhs));
+			}
+			// Lane-wise double-precision division on a 512-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator/(Vector<T, blen> b) requires(std::same_as<vector_type, __m512d> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m512d numerator = reinterpret_cast<__m512d>(v);
+				const __m512d denominator = reinterpret_cast<__m512d>(b.v);
+				return Vector<T, len>(_mm512_div_pd(numerator, denominator));
+			}
+
+			// Lane-wise 8-bit integer addition (wrapping) on a 512-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator+(Vector<T, blen> b) requires(std::same_as<T, int8_t> && std::same_as<vector_type, __m512i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m512i lhs = reinterpret_cast<__m512i>(v);
+				const __m512i rhs = reinterpret_cast<__m512i>(b.v);
+				return Vector<T, len>(_mm512_add_epi8(lhs, rhs));
+			}
+			// Lane-wise 8-bit integer subtraction (wrapping) on a 512-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator-(Vector<T, blen> b) requires(std::same_as<T, int8_t> && std::same_as<vector_type, __m512i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m512i lhs = reinterpret_cast<__m512i>(v);
+				const __m512i rhs = reinterpret_cast<__m512i>(b.v);
+				return Vector<T, len>(_mm512_sub_epi8(lhs, rhs));
+			}
+			// template <uint32_t blen>
+			// Vector<T, len> operator*(Vector<T, blen> b) requires(std::same_as<T, int8_t> && std::same_as<vector_type, __m512i>) {
+			// 	__m512i v = this->v;
+			// 	__m512i bv = b.v;
+			// 	return Vector<T, len>(_mm512_mul_epi8(v, bv));
+			// }
+			// template <uint32_t blen>
+			// Vector<T, len> operator/(Vector<T, blen> b) requires(std::same_as<T, int8_t> && std::same_as<vector_type, __m512i>) {
+			// 	__m512i v = this->v;
+			// 	__m512i bv = b.v;
+			// 	return Vector<T, len>(_mm512_div_epi8(v, bv));
+			// }
+
+			// Lane-wise 16-bit integer addition (wrapping) on a 512-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator+(Vector<T, blen> b) requires(std::same_as<T, int16_t> && std::same_as<vector_type, __m512i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m512i lhs = reinterpret_cast<__m512i>(v);
+				const __m512i rhs = reinterpret_cast<__m512i>(b.v);
+				return Vector<T, len>(_mm512_add_epi16(lhs, rhs));
+			}
+			// Lane-wise 16-bit integer subtraction (wrapping) on a 512-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator-(Vector<T, blen> b) requires(std::same_as<T, int16_t> && std::same_as<vector_type, __m512i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m512i lhs = reinterpret_cast<__m512i>(v);
+				const __m512i rhs = reinterpret_cast<__m512i>(b.v);
+				return Vector<T, len>(_mm512_sub_epi16(lhs, rhs));
+			}
+			// template <uint32_t blen>
+			// Vector<T, len> operator*(Vector<T, blen> b) requires(std::same_as<T, int16_t> && std::same_as<vector_type, __m512i>) {
+			// 	__m512i v = this->v;
+			// 	__m512i bv = b.v;
+			// 	return Vector<T, len>(_mm512_mul_epi16(v, bv));
+			// }
+			// template <uint32_t blen>
+			// Vector<T, len> operator/(Vector<T, blen> b) requires(std::same_as<T, int16_t> && std::same_as<vector_type, __m512i>) {
+			// 	__m512i v = this->v;
+			// 	__m512i bv = b.v;
+			// 	return Vector<T, len>(_mm512_div_epi16(v, bv));
+			// }
+
+			// Lane-wise 32-bit integer addition (wrapping) on a 512-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator+(Vector<T, blen> b) requires(std::same_as<T, int32_t> && std::same_as<vector_type, __m512i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m512i lhs = reinterpret_cast<__m512i>(v);
+				const __m512i rhs = reinterpret_cast<__m512i>(b.v);
+				return Vector<T, len>(_mm512_add_epi32(lhs, rhs));
+			}
+			// Lane-wise 32-bit integer subtraction (wrapping) on a 512-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator-(Vector<T, blen> b) requires(std::same_as<T, int32_t> && std::same_as<vector_type, __m512i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m512i lhs = reinterpret_cast<__m512i>(v);
+				const __m512i rhs = reinterpret_cast<__m512i>(b.v);
+				return Vector<T, len>(_mm512_sub_epi32(lhs, rhs));
+			}
+			// Lane-wise 32-bit integer multiplication on a 512-bit register; each lane keeps
+			// the low 32 bits of its product.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator*(Vector<T, blen> b) requires(std::same_as<T, int32_t> && std::same_as<vector_type, __m512i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				// _mm512_mul_epi32 multiplies only the even lanes and widens them to 64-bit,
+				// which is not an element-wise product; _mm512_mullo_epi32 (AVX-512F) is.
+				return Vector<T, len>(_mm512_mullo_epi32(reinterpret_cast<__m512i>(v), reinterpret_cast<__m512i>(b.v)));
+			}
+			// template <uint32_t blen>
+			// Vector<T, len> operator/(Vector<T, blen> b) requires(std::same_as<T, int32_t> && std::same_as<vector_type, __m512i>) {
+			// 	__m512i v = this->v;
+			// 	__m512i bv = b.v;
+			// 	return Vector<T, len>(_mm512_div_epi32(v, bv));
+			// }
+
+			// Lane-wise 64-bit integer addition (wrapping) on a 512-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator+(Vector<T, blen> b) requires(std::same_as<T, int64_t> && std::same_as<vector_type, __m512i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m512i lhs = reinterpret_cast<__m512i>(v);
+				const __m512i rhs = reinterpret_cast<__m512i>(b.v);
+				return Vector<T, len>(_mm512_add_epi64(lhs, rhs));
+			}
+			// Lane-wise 64-bit integer subtraction (wrapping) on a 512-bit register.
+			// `b` may have a different element count as long as GetVectorAlignedSize() matches.
+			template <uint32_t blen>
+			Vector<T, len> operator-(Vector<T, blen> b) requires(std::same_as<T, int64_t> && std::same_as<vector_type, __m512i> && Vector<T, blen>::GetVectorAlignedSize() == GetVectorAlignedSize()) {
+				const __m512i lhs = reinterpret_cast<__m512i>(v);
+				const __m512i rhs = reinterpret_cast<__m512i>(b.v);
+				return Vector<T, len>(_mm512_sub_epi64(lhs, rhs));
+			}
+			// template <uint32_t blen>
+			// Vector<T, len> operator*(Vector<T, blen> b) requires(std::same_as<T, int64_t> && std::same_as<vector_type, __m512i>) {
+			// 	__m512i v = this->v;
+			// 	__m512i bv = b.v;
+			// 	return Vector<T, len>(_mm512_mul_epi64(v, bv));
+			// }
+			// template <uint32_t blen>
+			// Vector<T, len> operator/(Vector<T, blen> b) requires(std::same_as<T, int64_t> && std::same_as<vector_type, __m512i>) {
+			// 	__m512i v = this->v;
+			// 	__m512i bv = b.v;
+			// 	return Vector<T, len>(_mm512_div_epi64(v, bv));
+			// }
+
+			// Copies all 64 bytes (thirty-two half-precision lanes) to `data`; no alignment required.
+			void Store(T* data) const requires(std::same_as<vector_type, __m512h>) {
+				void* const destination = reinterpret_cast<void*>(data);
+				const __m512h lanes = reinterpret_cast<__m512h>(v);
+				_mm512_storeu_ph(destination, lanes);
+			}
+			// Copies all 64 bytes (sixteen float lanes) to `data`; no alignment required.
+			void Store(T* data) const requires(std::same_as<vector_type, __m512>) {
+				void* const destination = reinterpret_cast<void*>(data);
+				const __m512 lanes = reinterpret_cast<__m512>(v);
+				_mm512_storeu_ps(destination, lanes);
+			}
+			// Copies all 64 bytes (eight double lanes) to `data`; no alignment required.
+			void Store(T* data) const requires(std::same_as<vector_type, __m512d>) {
+				const __m512d lanes = reinterpret_cast<__m512d>(v);
+				_mm512_storeu_pd(data, lanes);
+			}
+			// Copies all 64 bytes (sixty-four 8-bit lanes) to `data`; no alignment required.
+			void Store(T* data) const requires(std::same_as<T, int8_t> && std::same_as<vector_type, __m512i>) {
+				void* const destination = reinterpret_cast<void*>(data);
+				const __m512i lanes = reinterpret_cast<__m512i>(v);
+				_mm512_storeu_epi8(destination, lanes);
+			}
+			// Copies all 64 bytes (thirty-two 16-bit lanes) to `data`; no alignment required.
+			void Store(T* data) const requires(std::same_as<T, int16_t> && std::same_as<vector_type, __m512i>) {
+				void* const destination = reinterpret_cast<void*>(data);
+				const __m512i lanes = reinterpret_cast<__m512i>(v);
+				_mm512_storeu_epi16(destination, lanes);
+			}
+			// Copies all 64 bytes (sixteen 32-bit lanes) to `data`; no alignment required.
+			void Store(T* data) const requires(std::same_as<T, int32_t> && std::same_as<vector_type, __m512i>) {
+				void* const destination = reinterpret_cast<void*>(data);
+				const __m512i lanes = reinterpret_cast<__m512i>(v);
+				_mm512_storeu_epi32(destination, lanes);
+			}
+			// Copies all 64 bytes (eight 64-bit lanes) to `data`; no alignment required.
+			void Store(T* data) const requires(std::same_as<T, int64_t> && std::same_as<vector_type, __m512i>) {
+				void* const destination = reinterpret_cast<void*>(data);
+				const __m512i lanes = reinterpret_cast<__m512i>(v);
+				_mm512_storeu_epi64(destination, lanes);
+			}
+#pragma endregion
+	};
+
+	// Several `len`-element vectors packed side by side into the single register
+	// owned by the Vector<T, len*vectorLenght> base class.
+	export template <typename T, uint32_t len, uint32_t vectorLenght>
+	class VectorVector : public Vector<T, len*vectorLenght> {
+		public:
+			// Joins two 128-bit half-precision registers: v0 fills the low half, v1 the high half.
+			VectorVector(__m128h v0, __m128h v1) requires(std::same_as<T, __fp16> && vectorLenght*Vector<T, len>::GetVectorAlignedSize() == Vector<T, len*vectorLenght>::GetVectorAlignedSize()) {
+				// The 128-bit insert is only provided for ps/pd/si registers, so route the
+				// halves through bit-preserving ph<->ps casts; passing __m128h straight to
+				// the ps intrinsics is a type mismatch.
+				// NOTE(review): assumes the base register type is __m256h here — confirm.
+				const __m256 lowHalf = _mm256_castps128_ps256(_mm_castph_ps(v0));
+				this->v = _mm256_castps_ph(_mm256_insertf128_ps(lowHalf, _mm_castph_ps(v1), 1));
+			}
+			// Joins four 128-bit float registers; v0 occupies the lowest 128 bits and
+			// v3 the highest.
+			VectorVector(__m128 v0, __m128 v1, __m128 v2, __m128 v3) requires(std::same_as<T, float> && vectorLenght*Vector<T, len>::GetVectorAlignedSize() == Vector<T, len*vectorLenght>::GetVectorAlignedSize()) {
+				// _mm512_insertfloatx4 is not an existing intrinsic; the AVX-512F
+				// 128-bit float insert is _mm512_insertf32x4.
+				__m512 packed = _mm512_castps256_ps512(_mm256_castps128_ps256(v0));
+				packed = _mm512_insertf32x4(packed, v1, 1);
+				packed = _mm512_insertf32x4(packed, v2, 2);
+				packed = _mm512_insertf32x4(packed, v3, 3);
+				this->v = packed;
+			}
+			// Wraps an already packed 512-bit register.
+			// TODO: the constraint is currently disabled:
+			// requires(std::same_as<T, float> && vectorLenght*Vector<T, len>::GetVectorAlignedSize() == Vector<T, len*vectorLenght>::GetVectorAlignedSize())
+			VectorVector(__m512 v) : Vector<T, len*vectorLenght>(v) {
+
+			}
+			// Tightly packed 3-component vectors (len == 3): sixteen floats forwarded to the
+			// base constructor in declaration order. Omitted arguments default to 0.
+			VectorVector(
+				float x0 = 0, float y0 = 0, float z0 = 0,
+				float x1 = 0, float y1 = 0, float z1 = 0,
+				float x2 = 0, float y2 = 0, float z2 = 0,
+				float x3 = 0, float y3 = 0, float z3 = 0,
+				float x4 = 0, float y4 = 0, float z4 = 0,
+				float x5 = 0
+				) requires(std::same_as<T, float> && vectorLenght*Vector<T, len>::GetVectorAlignedSize() == Vector<T, len*vectorLenght>::GetVectorAlignedSize() && len == 3) :
+					Vector<T, len*vectorLenght>(
+						x0,y0,z0,
+						x1,y1,z1,
+						x2,y2,z2,
+						x3,y3,z3,
+						x4,y4,z4,
+						x5)
+				{}
+
+			// Four 4-component vectors given as sixteen floats. Omitted arguments default to 0.
+			VectorVector(
+				float x0 = 0, float y0 = 0, float z0 = 0, float w0 = 0,
+				float x1 = 0, float y1 = 0, float z1 = 0, float w1 = 0,
+				float x2 = 0, float y2 = 0, float z2 = 0, float w2 = 0,
+				float x3 = 0, float y3 = 0, float z3 = 0, float w3 = 0
+			) :
+				// Forward in declaration order, like the len == 3 constructor above. The base
+				// constructor already performs the reversal _mm512_set_ps needs, so
+				// reversing here as well put w3 in lane 0 and flipped the whole register.
+				Vector<T, len*vectorLenght>(
+					x0, y0, z0, w0,
+					x1, y1, z1, w1,
+					x2, y2, z2, w2,
+					x3, y3, z3, w3)
+			{}
+	};
+
+	// Exported 4-lane constants. NOTE(review): the g_XM* names look like they follow
+	// DirectXMath's constants of the same names — confirm.
+	// Negated unit vectors: -1 in one component, 0 elsewhere.
+	export Vector<float, 4> g_XMNegIdentityR0(-1.0f, 0.0f, 0.0f, 0.0f);
+    export Vector<float, 4> g_XMNegIdentityR1(0.0f, -1.0f, 0.0f, 0.0f);
+    export Vector<float, 4> g_XMNegIdentityR2(0.0f, 0.0f, -1.0f, 0.0f);
+    export Vector<float, 4> g_XMNegIdentityR3(0.0f, 0.0f, 0.0f, -1.0f);
+	// Unit vectors: 1 in one component, 0 elsewhere.
+	export Vector<float, 4> g_XMIdentityR0(1.0f, 0.0f, 0.0f, 0.0f);
+    export Vector<float, 4> g_XMIdentityR1(0.0f, 1.0f, 0.0f, 0.0f);
+    export Vector<float, 4> g_XMIdentityR2(0.0f, 0.0f, 1.0f, 0.0f);
+    export Vector<float, 4> g_XMIdentityR3(0.0f, 0.0f, 0.0f, 1.0f);
+	// Component masks: all 32 bits set in the named components, zero in the others.
+	// NOTE(review): no uint32_t constructor of Vector is visible in this part of the
+	// file — confirm these instantiate as intended.
+	export Vector<uint32_t, 4> g_XMMaskXY(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000);
+    export Vector<uint32_t, 4> g_XMMask3(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000);
+    export Vector<uint32_t, 4> g_XMMaskX(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000);
+    export Vector<uint32_t, 4> g_XMMaskY(0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000);
+    export Vector<uint32_t, 4> g_XMMaskZ(0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000);
+    export Vector<uint32_t, 4> g_XMMaskW( 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF);
+}
--- a/Crafter.Math.cppm
+++ b/Crafter.Math.cppm
@ -0,0 +1,5 @@
+// Primary interface unit of the Crafter.Math module. It declares nothing of
+// its own; it re-exports the BasicTypes, Vector, Matrix and Misc partitions
+// so that a single `import Crafter.Math;` makes all of them visible.
+export module Crafter.Math;
+export import :BasicTypes;
+export import :Vector;
+export import :Matrix;
+export import :Misc;
--- a/bin/crafter-math
+++ b/bin/crafter-math
--- a/build/debug-lib/Crafter.Math-BasicTypes.o
+++ b/build/debug-lib/Crafter.Math-BasicTypes.o
--- a/build/debug-lib/Crafter.Math-Misc.o
+++ b/build/debug-lib/Crafter.Math-Misc.o
--- a/build/debug-lib/Crafter.Math-Vector.o
+++ b/build/debug-lib/Crafter.Math-Vector.o
--- a/build/debug/Crafter.Math-BasicTypes.o
+++ b/build/debug/Crafter.Math-BasicTypes.o
--- a/build/debug/Crafter.Math-BasicTypes.pcm
+++ b/build/debug/Crafter.Math-BasicTypes.pcm
--- a/build/debug/Crafter.Math-Matrix.o
+++ b/build/debug/Crafter.Math-Matrix.o
--- a/build/debug/Crafter.Math-Matrix.pcm
+++ b/build/debug/Crafter.Math-Matrix.pcm
--- a/build/debug/Crafter.Math-Misc.o
+++ b/build/debug/Crafter.Math-Misc.o
--- a/build/debug/Crafter.Math-Misc.pcm
+++ b/build/debug/Crafter.Math-Misc.pcm
--- a/build/debug/Crafter.Math-Vector.o
+++ b/build/debug/Crafter.Math-Vector.o
--- a/build/debug/Crafter.Math-Vector.pcm
+++ b/build/debug/Crafter.Math-Vector.pcm
--- a/build/debug/Crafter.Math.o
+++ b/build/debug/Crafter.Math.o
--- a/build/debug/Crafter.Math.pcm
+++ b/build/debug/Crafter.Math.pcm
--- a/build/debug/main_source.o
+++ b/build/debug/main_source.o
--- a/main.cpp
+++ b/main.cpp
@ -0,0 +1,21 @@
+#include <iostream>
+#include <chrono>
+#include <immintrin.h>
+#include <random>
+#include <format>
+
+import Crafter.Math;
+using namespace Crafter;
+
+
+// Smoke test for the math module: multiplies a 4x4 float matrix by a
+// 4-component vector and prints the textual form of the product.
+int main() {
+    // Identity pattern, except for the additional 1 passed as the 13th value.
+    Matrix<float, 4, 4, 1> transform(
+        1, 0, 0, 0,
+        0, 1, 0, 0,
+        0, 0, 1, 0,
+        1, 0, 0, 1
+    );
+    Vector<float, 4> input(0, 0, 0, 1);
+
+    // Matrix * vector product, written out through Vector::ToString().
+    Vector<float, 4> product = transform * input;
+    std::cout << product.ToString() << std::endl;
+    return 0;
+}
--- a/project.json
+++ b/project.json
@ -0,0 +1,35 @@
+{
+    "name": "crafter-math",
+    "configurations": [
+        {
+            "name": "base",
+            "standard": "c++26",
+            "source_files": [],
+            "module_files": ["Crafter.Math-Vector", "Crafter.Math-BasicTypes", "Crafter.Math-MatrixAMX", "Crafter.Math-Matrix", "Crafter.Math-Misc", "Crafter.Math"],
+            "build_dir": "./build",
+            "output_dir": "./bin"
+        },
+        {
+            "name": "debug",
+            "type": "executable",
+            "source_files": ["main"],
+            "optimization_level": "3",
+            "extends":["base"]
+        },
+        {
+            "name": "lib",
+            "extends": ["base"],
+            "type":"library"
+        },
+        {
+            "name": "debug-lib",
+            "extends": ["lib"],
+            "optimization_level": "0"
+        },
+        {
+            "name": "release-lib",
+            "extends": ["lib"],
+            "optimization_level": "3"
+        }
+    ]
+}