commit 59ea75710031f0afc82c2eef0060dba0e71e9a54
parent 4de55af2687bc2d3ade6b80219127ec049e605aa
Author: Robert Russell <robertrussell.72001@gmail.com>
Date: Thu, 1 Jun 2023 21:26:27 -0700
Add SIMD stuff
Diffstat:
| A | inc/simd.h | | | 224 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
1 file changed, 224 insertions(+), 0 deletions(-)
diff --git a/inc/simd.h b/inc/simd.h
@@ -0,0 +1,224 @@
+#pragma once
+
+/* Note: This is a work in progress. Bindings for instructions should be
+ * added as needed. */
+
+#ifdef __MMX__
+#define R_HAVE_MMX 1
+#endif
+
+#ifdef __SSE__
+#define R_HAVE_SSE 1
+#endif
+
+#ifdef __SSE2__
+#define R_HAVE_SSE2 1
+#endif
+
+#ifdef __SSE3__
+#define R_HAVE_SSE3 1
+#endif
+
+#ifdef __SSSE3__
+#define R_HAVE_SSSE3 1
+#endif
+
+#ifdef __SSE4_1__
+#define R_HAVE_SSE4_1 1
+#endif
+
+#ifdef __SSE4_2__
+#define R_HAVE_SSE4_2 1
+#endif
+
+#ifdef __AVX__
+#define R_HAVE_AVX 1
+#endif
+
+#ifdef __AVX2__
+#define R_HAVE_AVX2 1
+#endif
+
+/* TODO: AVX-512 */
+
+/* TODO: MMX
+typedef i8 v8i8 __attribute__((vector_size(8)));
+typedef u8 v8u8 __attribute__((vector_size(8)));
+typedef i16 v4i16 __attribute__((vector_size(8)));
+typedef u16 v4u16 __attribute__((vector_size(8)));
+typedef i32 v2i32 __attribute__((vector_size(8)));
+typedef u32 v2u32 __attribute__((vector_size(8)));
+*/
+
+/* 128 bit */
+typedef i8 v16i8 __attribute__((vector_size(16)));
+typedef u8 v16u8 __attribute__((vector_size(16)));
+typedef i16 v8i16 __attribute__((vector_size(16)));
+typedef u16 v8u16 __attribute__((vector_size(16)));
+typedef i32 v4i32 __attribute__((vector_size(16)));
+typedef u32 v4u32 __attribute__((vector_size(16)));
+typedef i64 v2i64 __attribute__((vector_size(16)));
+typedef u64 v2u64 __attribute__((vector_size(16)));
+/* These are for casting inputs/output of the GCC builtins. */
+typedef char r_v16qi_ __attribute__((vector_size(16)));
+typedef short r_v8hi_ __attribute__((vector_size(16)));
+typedef int r_v4si_ __attribute__((vector_size(16)));
+typedef long long r_v2di_ __attribute__((vector_size(16)));
+#define v16qi r_v16qi_
+#define v8hi r_v8hi_
+#define v4si r_v4si_
+#define v2di r_v2di_
+
+/* 256 bit */
+typedef i8 v32i8 __attribute__((vector_size(32)));
+typedef u8 v32u8 __attribute__((vector_size(32)));
+typedef i16 v16i16 __attribute__((vector_size(32)));
+typedef u16 v16u16 __attribute__((vector_size(32)));
+typedef i32 v8i32 __attribute__((vector_size(32)));
+typedef u32 v8u32 __attribute__((vector_size(32)));
+typedef i64 v4i64 __attribute__((vector_size(32)));
+typedef u64 v4u64 __attribute__((vector_size(32)));
+/* These are for casting inputs/output of the GCC builtins. */
+typedef char r_v32qi_ __attribute__((vector_size(32)));
+typedef short r_v16hi_ __attribute__((vector_size(32)));
+typedef int r_v8si_ __attribute__((vector_size(32)));
+typedef long long r_v4di_ __attribute__((vector_size(32)));
+#define v32qi r_v32qi_
+#define v16hi r_v16hi_
+#define v8si r_v8si_
+#define v4di r_v4di_
+
+#ifdef R_HAVE_SSE2
+static inline v16i8 v16i8_add(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_paddb128((v16qi)x, (v16qi)y); }
+static inline v16u8 v16u8_add(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_paddb128((v16qi)x, (v16qi)y); }
+static inline v8i16 v8i16_add(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_paddw128((v8hi)x, (v8hi)y); }
+static inline v8u16 v8u16_add(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_paddw128((v8hi)x, (v8hi)y); }
+static inline v4i32 v4i32_add(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_paddd128((v4si)x, (v4si)y); }
+static inline v4u32 v4u32_add(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_paddd128((v4si)x, (v4si)y); }
+static inline v2i64 v2i64_add(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_paddq128((v2di)x, (v2di)y); }
+static inline v2u64 v2u64_add(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_paddq128((v2di)x, (v2di)y); }
+
+static inline v16i8 v16i8_sub(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_psubb128((v16qi)x, (v16qi)y); }
+static inline v16u8 v16u8_sub(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_psubb128((v16qi)x, (v16qi)y); }
+static inline v8i16 v8i16_sub(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_psubw128((v8hi)x, (v8hi)y); }
+static inline v8u16 v8u16_sub(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_psubw128((v8hi)x, (v8hi)y); }
+static inline v4i32 v4i32_sub(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_psubd128((v4si)x, (v4si)y); }
+static inline v4u32 v4u32_sub(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_psubd128((v4si)x, (v4si)y); }
+static inline v2i64 v2i64_sub(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_psubq128((v2di)x, (v2di)y); }
+static inline v2u64 v2u64_sub(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_psubq128((v2di)x, (v2di)y); }
+
+static inline v16i8 v16i8_and(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pand128((v2di)x, (v2di)y); }
+static inline v16u8 v16u8_and(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pand128((v2di)x, (v2di)y); }
+static inline v8i16 v8i16_and(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pand128((v2di)x, (v2di)y); }
+static inline v8u16 v8u16_and(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pand128((v2di)x, (v2di)y); }
+static inline v4i32 v4i32_and(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pand128((v2di)x, (v2di)y); }
+static inline v4u32 v4u32_and(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pand128((v2di)x, (v2di)y); }
+static inline v2i64 v2i64_and(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pand128((v2di)x, (v2di)y); }
+static inline v2u64 v2u64_and(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pand128((v2di)x, (v2di)y); }
+
+static inline v16i8 v16i8_andnot(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
+static inline v16u8 v16u8_andnot(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
+static inline v8i16 v8i16_andnot(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
+static inline v8u16 v8u16_andnot(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
+static inline v4i32 v4i32_andnot(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
+static inline v4u32 v4u32_andnot(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
+static inline v2i64 v2i64_andnot(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
+static inline v2u64 v2u64_andnot(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
+
+static inline v16i8 v16i8_or(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_por128((v2di)x, (v2di)y); }
+static inline v16u8 v16u8_or(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_por128((v2di)x, (v2di)y); }
+static inline v8i16 v8i16_or(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_por128((v2di)x, (v2di)y); }
+static inline v8u16 v8u16_or(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_por128((v2di)x, (v2di)y); }
+static inline v4i32 v4i32_or(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_por128((v2di)x, (v2di)y); }
+static inline v4u32 v4u32_or(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_por128((v2di)x, (v2di)y); }
+static inline v2i64 v2i64_or(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_por128((v2di)x, (v2di)y); }
+static inline v2u64 v2u64_or(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_por128((v2di)x, (v2di)y); }
+
+static inline v16i8 v16i8_xor(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
+static inline v16u8 v16u8_xor(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
+static inline v8i16 v8i16_xor(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
+static inline v8u16 v8u16_xor(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
+static inline v4i32 v4i32_xor(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
+static inline v4u32 v4u32_xor(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
+static inline v2i64 v2i64_xor(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
+static inline v2u64 v2u64_xor(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
+
+static inline v8u16 v8u16_sl(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_psllw128((v8hi)x, (v8hi)y); }
+static inline v4u32 v4u32_sl(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pslld128((v4si)x, (v4si)y); }
+static inline v2u64 v2u64_sl(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_psllq128((v2di)x, (v2di)y); }
+
+static inline v8u16 v8u16_sr(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_psrlw128((v8hi)x, (v8hi)y); }
+static inline v4u32 v4u32_sr(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_psrld128((v4si)x, (v4si)y); }
+static inline v2u64 v2u64_sr(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_psrlq128((v2di)x, (v2di)y); }
+
+static inline v8i16 v8i16_sr(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_psraw128((v8hi)x, (v8hi)y); }
+static inline v4i32 v4i32_sr(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_psrad128((v4si)x, (v4si)y); }
+
+static inline v8u16 v8u16_sli(v8u16 x, int c) { return (v8u16)__builtin_ia32_psllwi128((v8hi)x, c); }
+static inline v4u32 v4u32_sli(v4u32 x, int c) { return (v4u32)__builtin_ia32_pslldi128((v4si)x, c); }
+static inline v2u64 v2u64_sli(v2u64 x, int c) { return (v2u64)__builtin_ia32_psllqi128((v2di)x, c); }
+
+static inline v8u16 v8u16_sri(v8u16 x, int c) { return (v8u16)__builtin_ia32_psrlwi128((v8hi)x, c); }
+static inline v4u32 v4u32_sri(v4u32 x, int c) { return (v4u32)__builtin_ia32_psrldi128((v4si)x, c); }
+static inline v2u64 v2u64_sri(v2u64 x, int c) { return (v2u64)__builtin_ia32_psrlqi128((v2di)x, c); }
+
+static inline v8i16 v8i16_sri(v8i16 x, int c) { return (v8i16)__builtin_ia32_psrawi128((v8hi)x, c); }
+static inline v4i32 v4i32_sri(v4i32 x, int c) { return (v4i32)__builtin_ia32_psradi128((v4si)x, c); }
+
+static inline v16i8 v16i8_cmpeq(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pcmpeqb128((v16qi)x, (v16qi)y); }
+static inline v16u8 v16u8_cmpeq(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pcmpeqb128((v16qi)x, (v16qi)y); }
+static inline v8i16 v8i16_cmpeq(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pcmpeqw128((v8hi)x, (v8hi)y); }
+static inline v8u16 v8u16_cmpeq(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pcmpeqw128((v8hi)x, (v8hi)y); }
+static inline v4i32 v4i32_cmpeq(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pcmpeqd128((v4si)x, (v4si)y); }
+static inline v4u32 v4u32_cmpeq(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pcmpeqd128((v4si)x, (v4si)y); }
+
+static inline v16i8 v16i8_cmpgt(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pcmpgtb128((v16qi)x, (v16qi)y); }
+static inline v16u8 v16u8_cmpgt(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pcmpgtb128((v16qi)x, (v16qi)y); }
+static inline v8i16 v8i16_cmpgt(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pcmpgtw128((v8hi)x, (v8hi)y); }
+static inline v8u16 v8u16_cmpgt(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pcmpgtw128((v8hi)x, (v8hi)y); }
+static inline v4i32 v4i32_cmpgt(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pcmpgtd128((v4si)x, (v4si)y); }
+static inline v4u32 v4u32_cmpgt(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pcmpgtd128((v4si)x, (v4si)y); }
+
+static inline uint v16u8_msb(v16u8 x) { return __builtin_ia32_pmovmskb128((v16qi)x); }
+#endif
+
+#ifdef R_HAVE_SSSE3
+static inline v16u8 v16u8_shuf(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pshufb128((v16qi)x, (v16qi)y); }
+
+static inline v16i8 v16i8_sign(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_psignb128((v16qi)x, (v16qi)y); }
+static inline v8i16 v8i16_sign(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_psignw128((v8hi)x, (v8hi)y); }
+static inline v4i32 v4i32_sign(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_psignd128((v4si)x, (v4si)y); }
+#endif
+
+#ifdef R_HAVE_SSE4_1
+static inline v2i64 v2i64_cmpeq(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pcmpeqq((v2di)x, (v2di)y); }
+static inline v2u64 v2u64_cmpeq(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pcmpeqq((v2di)x, (v2di)y); }
+
+static inline v8i16 v16i8_ext16(v16i8 x) { return (v8i16)__builtin_ia32_pmovsxbw128((v16qi)x); }
+static inline v4i32 v16i8_ext32(v16i8 x) { return (v4i32)__builtin_ia32_pmovsxbd128((v16qi)x); }
+static inline v2i64 v16i8_ext64(v16i8 x) { return (v2i64)__builtin_ia32_pmovsxbq128((v16qi)x); }
+static inline v4i32 v8i16_ext32(v8i16 x) { return (v4i32)__builtin_ia32_pmovsxwd128((v8hi)x); }
+static inline v2i64 v8i16_ext64(v8i16 x) { return (v2i64)__builtin_ia32_pmovsxwq128((v8hi)x); }
+static inline v2i64 v4i32_ext64(v4i32 x) { return (v2i64)__builtin_ia32_pmovsxdq128((v4si)x); }
+
+static inline v8u16 v16u8_ext16(v16u8 x) { return (v8u16)__builtin_ia32_pmovzxbw128((v16qi)x); }
+static inline v4u32 v16u8_ext32(v16u8 x) { return (v4u32)__builtin_ia32_pmovzxbd128((v16qi)x); }
+static inline v2u64 v16u8_ext64(v16u8 x) { return (v2u64)__builtin_ia32_pmovzxbq128((v16qi)x); }
+static inline v4u32 v8u16_ext32(v8u16 x) { return (v4u32)__builtin_ia32_pmovzxwd128((v8hi)x); }
+static inline v2u64 v8u16_ext64(v8u16 x) { return (v2u64)__builtin_ia32_pmovzxwq128((v8hi)x); }
+static inline v2u64 v4u32_ext64(v4u32 x) { return (v2u64)__builtin_ia32_pmovzxdq128((v4si)x); }
+#endif
+
+#ifdef R_HAVE_SSE4_2
+static inline v2i64 v2i64_cmpgt(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pcmpgtq((v2di)x, (v2di)y); }
+static inline v2u64 v2u64_cmpgt(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pcmpgtq((v2di)x, (v2di)y); }
+#endif
+
+#undef v2di
+#undef v4si
+#undef v8hi
+#undef v16qi
+#undef v4di
+#undef v8si
+#undef v16hi
+#undef v32qi