commit 7f03a23fdc342ebf76b3cd7c126eb6de7f5044f6
parent c7df073f1e7b916a7da36cf38b0df35218b56e6e
Author: Robert Russell <robertrussell.72001@gmail.com>
Date: Fri, 2 Jun 2023 16:00:51 -0700
Revert "Use portable Intel SIMD"
This reverts commit c7df073f1e7b916a7da36cf38b0df35218b56e6e.
See the new comment at the top of simd.h.
Diffstat:
| M | inc/simd.h | | | 277 | +++++++++++++++++++++++++++++++++++++++++++++++++------------------------------ |
1 file changed, 173 insertions(+), 104 deletions(-)
diff --git a/inc/simd.h b/inc/simd.h
@@ -1,11 +1,11 @@
#pragma once
-#include <x86intrin.h>
-
/* Note: This is a work in progress. Bindings for instructions should be
* added as needed. */
-/* TODO: MMX, AVX-512 */
+#ifdef __MMX__
+#define R_HAVE_MMX 1
+#endif
#ifdef __SSE__
#define R_HAVE_SSE 1
@@ -39,117 +39,186 @@
#define R_HAVE_AVX2 1
#endif
-typedef __m128i v128u1;
-typedef __m128i v16i8, v16u8;
-typedef __m128i v8i16, v8u16;
-typedef __m128i v4i32, v4u32;
-typedef __m128i v2i64, v2u64;
-
-typedef __m256i v256u1;
-typedef __m256i v32i8, v32u8;
-typedef __m256i v16i16, v16u16;
-typedef __m256i v8i32, v8u32;
-typedef __m256i v4i64, v4u64;
+/* TODO: AVX-512 */
+
+/* TODO: MMX
+typedef i8 v8i8 __attribute__((vector_size(8)));
+typedef u8 v8u8 __attribute__((vector_size(8)));
+typedef i16 v4i16 __attribute__((vector_size(8)));
+typedef u16 v4u16 __attribute__((vector_size(8)));
+typedef i32 v2i32 __attribute__((vector_size(8)));
+typedef u32 v2u32 __attribute__((vector_size(8)));
+*/
+
+/* 128 bit */
+typedef i8 v16i8 __attribute__((vector_size(16)));
+typedef u8 v16u8 __attribute__((vector_size(16)));
+typedef i16 v8i16 __attribute__((vector_size(16)));
+typedef u16 v8u16 __attribute__((vector_size(16)));
+typedef i32 v4i32 __attribute__((vector_size(16)));
+typedef u32 v4u32 __attribute__((vector_size(16)));
+typedef i64 v2i64 __attribute__((vector_size(16)));
+typedef u64 v2u64 __attribute__((vector_size(16)));
+/* These are for casting inputs/output of the GCC builtins. */
+typedef char r_v16qi_ __attribute__((vector_size(16)));
+typedef short r_v8hi_ __attribute__((vector_size(16)));
+typedef int r_v4si_ __attribute__((vector_size(16)));
+typedef long long r_v2di_ __attribute__((vector_size(16)));
+#define v16qi r_v16qi_
+#define v8hi r_v8hi_
+#define v4si r_v4si_
+#define v2di r_v2di_
+
+/* 256 bit */
+typedef i8 v32i8 __attribute__((vector_size(32)));
+typedef u8 v32u8 __attribute__((vector_size(32)));
+typedef i16 v16i16 __attribute__((vector_size(32)));
+typedef u16 v16u16 __attribute__((vector_size(32)));
+typedef i32 v8i32 __attribute__((vector_size(32)));
+typedef u32 v8u32 __attribute__((vector_size(32)));
+typedef i64 v4i64 __attribute__((vector_size(32)));
+typedef u64 v4u64 __attribute__((vector_size(32)));
+/* These are for casting inputs/output of the GCC builtins. */
+typedef char r_v32qi_ __attribute__((vector_size(32)));
+typedef short r_v16hi_ __attribute__((vector_size(32)));
+typedef int r_v8si_ __attribute__((vector_size(32)));
+typedef long long r_v4di_ __attribute__((vector_size(32)));
+#define v32qi r_v32qi_
+#define v16hi r_v16hi_
+#define v8si r_v8si_
+#define v4di r_v4di_
#ifdef R_HAVE_SSE2
-#define v128u1_loada _mm_load_si128
-#define v128u1_loadu _mm_loadu_si128
-
-#define v128u1_storea _mm_store_si128
-#define v128u1_storeu _mm_storeu_si128
-
-#define v16i8_set _mm_set_epi8
-#define v8i16_set _mm_set_epi16
-#define v4i32_set _mm_set_epi32
-#define v2i64_set _mm_set_epi64x
-
-#define v16i8_fill _mm_set1_epi8
-#define v8i16_fill _mm_set1_epi16
-#define v4i32_fill _mm_set1_epi32
-#define v2i64_fill _mm_set1_epi64x
-
-#define v16i8_add _mm_add_epi8
-#define v8i16_add _mm_add_epi16
-#define v4i32_add _mm_add_epi32
-#define v2i64_add _mm_add_epi64
-
-#define v16i8_sub _mm_sub_epi8
-#define v8i16_sub _mm_sub_epi16
-#define v4i32_sub _mm_sub_epi32
-#define v2i64_sub _mm_sub_epi64
-
-#define v128u1_and _mm_and_si128
-
-#define v128u1_andnot _mm_andnot_si128
-
-#define v128u1_or _mm_or_si128
-
-#define v128u1_xor _mm_xor_si128
-
-#define v8u16_sl _mm_sll_epi16
-#define v4u32_sl _mm_sll_epi32
-#define v2u64_sl _mm_sll_epi64
-
-#define v8u16_sr _mm_srl_epi16
-#define v4u32_sr _mm_srl_epi32
-#define v2u64_sr _mm_srl_epi64
-
-#define v8i16_sr _mm_sra_epi16
-#define v4i32_sr _mm_sra_epi32
-
-#define v8u16_sli _mm_slli_epi16
-#define v4u32_sli _mm_slli_epi32
-#define v2u64_sli _mm_slli_epi64
-
-#define v8u16_sri _mm_srli_epi16
-#define v4u32_sri _mm_srli_epi32
-#define v2u64_sri _mm_srli_epi64
-
-#define v8i16_sri _mm_srai_epi16
-#define v4i32_sri _mm_srai_epi32
-
-#define v16i8_cmplt _mm_cmplt_epi8
-#define v8i16_cmplt _mm_cmplt_epi16
-#define v4i32_cmplt _mm_cmplt_epi32
-
-#define v16i8_cmpeq _mm_cmpeq_epi8
-#define v8i16_cmpeq _mm_cmpeq_epi16
-#define v4i32_cmpeq _mm_cmpeq_epi32
-
-#define v16i8_cmpgt _mm_cmpgt_epi8
-#define v8i16_cmpgt _mm_cmpgt_epi16
-#define v4i32_cmpgt _mm_cmpgt_epi32
-
-#define v16u8_msb _mm_movemask_epi8
+static inline v16i8 v16i8_add(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_paddb128((v16qi)x, (v16qi)y); }
+static inline v16u8 v16u8_add(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_paddb128((v16qi)x, (v16qi)y); }
+static inline v8i16 v8i16_add(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_paddw128((v8hi)x, (v8hi)y); }
+static inline v8u16 v8u16_add(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_paddw128((v8hi)x, (v8hi)y); }
+static inline v4i32 v4i32_add(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_paddd128((v4si)x, (v4si)y); }
+static inline v4u32 v4u32_add(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_paddd128((v4si)x, (v4si)y); }
+static inline v2i64 v2i64_add(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_paddq128((v2di)x, (v2di)y); }
+static inline v2u64 v2u64_add(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_paddq128((v2di)x, (v2di)y); }
+
+static inline v16i8 v16i8_sub(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_psubb128((v16qi)x, (v16qi)y); }
+static inline v16u8 v16u8_sub(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_psubb128((v16qi)x, (v16qi)y); }
+static inline v8i16 v8i16_sub(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_psubw128((v8hi)x, (v8hi)y); }
+static inline v8u16 v8u16_sub(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_psubw128((v8hi)x, (v8hi)y); }
+static inline v4i32 v4i32_sub(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_psubd128((v4si)x, (v4si)y); }
+static inline v4u32 v4u32_sub(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_psubd128((v4si)x, (v4si)y); }
+static inline v2i64 v2i64_sub(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_psubq128((v2di)x, (v2di)y); }
+static inline v2u64 v2u64_sub(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_psubq128((v2di)x, (v2di)y); }
+
+static inline v16i8 v16i8_and(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pand128((v2di)x, (v2di)y); }
+static inline v16u8 v16u8_and(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pand128((v2di)x, (v2di)y); }
+static inline v8i16 v8i16_and(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pand128((v2di)x, (v2di)y); }
+static inline v8u16 v8u16_and(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pand128((v2di)x, (v2di)y); }
+static inline v4i32 v4i32_and(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pand128((v2di)x, (v2di)y); }
+static inline v4u32 v4u32_and(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pand128((v2di)x, (v2di)y); }
+static inline v2i64 v2i64_and(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pand128((v2di)x, (v2di)y); }
+static inline v2u64 v2u64_and(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pand128((v2di)x, (v2di)y); }
+
+static inline v16i8 v16i8_andnot(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
+static inline v16u8 v16u8_andnot(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
+static inline v8i16 v8i16_andnot(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
+static inline v8u16 v8u16_andnot(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
+static inline v4i32 v4i32_andnot(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
+static inline v4u32 v4u32_andnot(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
+static inline v2i64 v2i64_andnot(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
+static inline v2u64 v2u64_andnot(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pandn128((v2di)x, (v2di)y); }
+
+static inline v16i8 v16i8_or(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_por128((v2di)x, (v2di)y); }
+static inline v16u8 v16u8_or(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_por128((v2di)x, (v2di)y); }
+static inline v8i16 v8i16_or(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_por128((v2di)x, (v2di)y); }
+static inline v8u16 v8u16_or(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_por128((v2di)x, (v2di)y); }
+static inline v4i32 v4i32_or(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_por128((v2di)x, (v2di)y); }
+static inline v4u32 v4u32_or(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_por128((v2di)x, (v2di)y); }
+static inline v2i64 v2i64_or(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_por128((v2di)x, (v2di)y); }
+static inline v2u64 v2u64_or(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_por128((v2di)x, (v2di)y); }
+
+static inline v16i8 v16i8_xor(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
+static inline v16u8 v16u8_xor(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
+static inline v8i16 v8i16_xor(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
+static inline v8u16 v8u16_xor(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
+static inline v4i32 v4i32_xor(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
+static inline v4u32 v4u32_xor(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
+static inline v2i64 v2i64_xor(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
+static inline v2u64 v2u64_xor(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pxor128((v2di)x, (v2di)y); }
+
+static inline v8u16 v8u16_sl(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_psllw128((v8hi)x, (v8hi)y); }
+static inline v4u32 v4u32_sl(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pslld128((v4si)x, (v4si)y); }
+static inline v2u64 v2u64_sl(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_psllq128((v2di)x, (v2di)y); }
+
+static inline v8u16 v8u16_sr(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_psrlw128((v8hi)x, (v8hi)y); }
+static inline v4u32 v4u32_sr(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_psrld128((v4si)x, (v4si)y); }
+static inline v2u64 v2u64_sr(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_psrlq128((v2di)x, (v2di)y); }
+
+static inline v8i16 v8i16_sr(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_psraw128((v8hi)x, (v8hi)y); }
+static inline v4i32 v4i32_sr(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_psrad128((v4si)x, (v4si)y); }
+
+static inline v8u16 v8u16_sli(v8u16 x, int c) { return (v8u16)__builtin_ia32_psllwi128((v8hi)x, c); }
+static inline v4u32 v4u32_sli(v4u32 x, int c) { return (v4u32)__builtin_ia32_pslldi128((v4si)x, c); }
+static inline v2u64 v2u64_sli(v2u64 x, int c) { return (v2u64)__builtin_ia32_psllqi128((v2di)x, c); }
+
+static inline v8u16 v8u16_sri(v8u16 x, int c) { return (v8u16)__builtin_ia32_psrlwi128((v8hi)x, c); }
+static inline v4u32 v4u32_sri(v4u32 x, int c) { return (v4u32)__builtin_ia32_psrldi128((v4si)x, c); }
+static inline v2u64 v2u64_sri(v2u64 x, int c) { return (v2u64)__builtin_ia32_psrlqi128((v2di)x, c); }
+
+static inline v8i16 v8i16_sri(v8i16 x, int c) { return (v8i16)__builtin_ia32_psrawi128((v8hi)x, c); }
+static inline v4i32 v4i32_sri(v4i32 x, int c) { return (v4i32)__builtin_ia32_psradi128((v4si)x, c); }
+
+static inline v16i8 v16i8_cmpeq(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pcmpeqb128((v16qi)x, (v16qi)y); }
+static inline v16u8 v16u8_cmpeq(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pcmpeqb128((v16qi)x, (v16qi)y); }
+static inline v8i16 v8i16_cmpeq(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pcmpeqw128((v8hi)x, (v8hi)y); }
+static inline v8u16 v8u16_cmpeq(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pcmpeqw128((v8hi)x, (v8hi)y); }
+static inline v4i32 v4i32_cmpeq(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pcmpeqd128((v4si)x, (v4si)y); }
+static inline v4u32 v4u32_cmpeq(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pcmpeqd128((v4si)x, (v4si)y); }
+
+static inline v16i8 v16i8_cmpgt(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pcmpgtb128((v16qi)x, (v16qi)y); }
+static inline v16u8 v16u8_cmpgt(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pcmpgtb128((v16qi)x, (v16qi)y); }
+static inline v8i16 v8i16_cmpgt(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pcmpgtw128((v8hi)x, (v8hi)y); }
+static inline v8u16 v8u16_cmpgt(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pcmpgtw128((v8hi)x, (v8hi)y); }
+static inline v4i32 v4i32_cmpgt(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pcmpgtd128((v4si)x, (v4si)y); }
+static inline v4u32 v4u32_cmpgt(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pcmpgtd128((v4si)x, (v4si)y); }
+
+static inline uint v16u8_msb(v16u8 x) { return __builtin_ia32_pmovmskb128((v16qi)x); }
#endif
#ifdef R_HAVE_SSSE3
-#define v16u8_shuf _mm_shuffle_epi8
+static inline v16u8 v16u8_shuf(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pshufb128((v16qi)x, (v16qi)y); }
-#define v16i8_sign _mm_sign_epi8
-#define v8i16_sign _mm_sign_epi16
-#define v4i32_sign _mm_sign_epi32
+static inline v16i8 v16i8_sign(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_psignb128((v16qi)x, (v16qi)y); }
+static inline v8i16 v8i16_sign(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_psignw128((v8hi)x, (v8hi)y); }
+static inline v4i32 v4i32_sign(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_psignd128((v4si)x, (v4si)y); }
#endif
#ifdef R_HAVE_SSE4_1
-#define v2i64_cmpeq _mm_cmpeq_epi64
-
-#define v16i8_ext16 _mm_cvtepi8_epi16
-#define v16i8_ext32 _mm_cvtepi8_epi32
-#define v16i8_ext64 _mm_cvtepi8_epi64
-#define v8i16_ext32 _mm_cvtepi16_epi32
-#define v8i16_ext64 _mm_cvtepi16_epi64
-#define v4i32_ext64 _mm_cvtepi32_epi64
-
-#define v16u8_ext16 _mm_cvtepu8_epi16
-#define v16u8_ext32 _mm_cvtepu8_epi32
-#define v16u8_ext64 _mm_cvtepu8_epi64
-#define v8u16_ext32 _mm_cvtepu16_epi32
-#define v8u16_ext64 _mm_cvtepu16_epi64
-#define v4u32_ext64 _mm_cvtepu32_epi64
+static inline v2i64 v2i64_cmpeq(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pcmpeqq((v2di)x, (v2di)y); }
+static inline v2u64 v2u64_cmpeq(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pcmpeqq((v2di)x, (v2di)y); }
+
+static inline v8i16 v16i8_ext16(v16i8 x) { return (v8i16)__builtin_ia32_pmovsxbw128((v16qi)x); }
+static inline v4i32 v16i8_ext32(v16i8 x) { return (v4i32)__builtin_ia32_pmovsxbd128((v16qi)x); }
+static inline v2i64 v16i8_ext64(v16i8 x) { return (v2i64)__builtin_ia32_pmovsxbq128((v16qi)x); }
+static inline v4i32 v8i16_ext32(v8i16 x) { return (v4i32)__builtin_ia32_pmovsxwd128((v8hi)x); }
+static inline v2i64 v8i16_ext64(v8i16 x) { return (v2i64)__builtin_ia32_pmovsxwq128((v8hi)x); }
+static inline v2i64 v4i32_ext64(v4i32 x) { return (v2i64)__builtin_ia32_pmovsxdq128((v4si)x); }
+
+static inline v8u16 v16u8_ext16(v16u8 x) { return (v8u16)__builtin_ia32_pmovzxbw128((v16qi)x); }
+static inline v4u32 v16u8_ext32(v16u8 x) { return (v4u32)__builtin_ia32_pmovzxbd128((v16qi)x); }
+static inline v2u64 v16u8_ext64(v16u8 x) { return (v2u64)__builtin_ia32_pmovzxbq128((v16qi)x); }
+static inline v4u32 v8u16_ext32(v8u16 x) { return (v4u32)__builtin_ia32_pmovzxwd128((v8hi)x); }
+static inline v2u64 v8u16_ext64(v8u16 x) { return (v2u64)__builtin_ia32_pmovzxwq128((v8hi)x); }
+static inline v2u64 v4u32_ext64(v4u32 x) { return (v2u64)__builtin_ia32_pmovzxdq128((v4si)x); }
#endif
#ifdef R_HAVE_SSE4_2
-#define v2i64_cmpgt _mm_cmpgt_epi64
-#define v2i64_cmplt(x, y) v2i64_cmpgt(y, x)
+static inline v2i64 v2i64_cmpgt(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pcmpgtq((v2di)x, (v2di)y); }
+static inline v2u64 v2u64_cmpgt(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pcmpgtq((v2di)x, (v2di)y); }
#endif
+
+#undef v2di
+#undef v4si
+#undef v8hi
+#undef v16qi
+#undef v4di
+#undef v8si
+#undef v16hi
+#undef v32qi