rcx

library of miscellaneous bits of C code
git clone git://git.rr3.xyz/rcx
Log | Files | Refs | README | LICENSE

commit c7df073f1e7b916a7da36cf38b0df35218b56e6e
parent 59ea75710031f0afc82c2eef0060dba0e71e9a54
Author: Robert Russell <robertrussell.72001@gmail.com>
Date:   Thu,  1 Jun 2023 23:11:19 -0700

Use portable Intel SIMD

Diffstat:
Minc/simd.h | 277++++++++++++++++++++++++++++++-------------------------------------------------
1 file changed, 104 insertions(+), 173 deletions(-)

diff --git a/inc/simd.h b/inc/simd.h @@ -1,11 +1,11 @@ #pragma once +#include <x86intrin.h> + /* Note: This is a work in progress. Bindings for instructions should be * added as needed. */ -#ifdef __MMX__ -#define R_HAVE_MMX 1 -#endif +/* TODO: MMX, AVX-512 */ #ifdef __SSE__ #define R_HAVE_SSE 1 @@ -39,186 +39,117 @@ #define R_HAVE_AVX2 1 #endif -/* TODO: AVX-512 */ - -/* TODO: MMX -typedef i8 v8i8 __attribute__((vector_size(8))); -typedef u8 v8u8 __attribute__((vector_size(8))); -typedef i16 v4i16 __attribute__((vector_size(8))); -typedef u16 v4u16 __attribute__((vector_size(8))); -typedef i32 v2i32 __attribute__((vector_size(8))); -typedef u32 v2u32 __attribute__((vector_size(8))); -*/ - -/* 128 bit */ -typedef i8 v16i8 __attribute__((vector_size(16))); -typedef u8 v16u8 __attribute__((vector_size(16))); -typedef i16 v8i16 __attribute__((vector_size(16))); -typedef u16 v8u16 __attribute__((vector_size(16))); -typedef i32 v4i32 __attribute__((vector_size(16))); -typedef u32 v4u32 __attribute__((vector_size(16))); -typedef i64 v2i64 __attribute__((vector_size(16))); -typedef u64 v2u64 __attribute__((vector_size(16))); -/* These are for casting inputs/output of the GCC builtins. */ -typedef char r_v16qi_ __attribute__((vector_size(16))); -typedef short r_v8hi_ __attribute__((vector_size(16))); -typedef int r_v4si_ __attribute__((vector_size(16))); -typedef long long r_v2di_ __attribute__((vector_size(16))); -#define v16qi r_v16qi_ -#define v8hi r_v8hi_ -#define v4si r_v4si_ -#define v2di r_v2di_ - -/* 256 bit */ -typedef i8 v32i8 __attribute__((vector_size(32))); -typedef u8 v32u8 __attribute__((vector_size(32))); -typedef i16 v16i16 __attribute__((vector_size(32))); -typedef u16 v16u16 __attribute__((vector_size(32))); -typedef i32 v8i32 __attribute__((vector_size(32))); -typedef u32 v8u32 __attribute__((vector_size(32))); -typedef i64 v4i64 __attribute__((vector_size(32))); -typedef u64 v4u64 __attribute__((vector_size(32))); -/* These are for casting inputs/output of the GCC builtins. */ -typedef char r_v32qi_ __attribute__((vector_size(32))); -typedef short r_v16hi_ __attribute__((vector_size(32))); -typedef int r_v8si_ __attribute__((vector_size(32))); -typedef long long r_v4di_ __attribute__((vector_size(32))); -#define v32qi r_v32qi_ -#define v16hi r_v16hi_ -#define v8si r_v8si_ -#define v4di r_v4di_ +typedef __m128i v128u1; +typedef __m128i v16i8, v16u8; +typedef __m128i v8i16, v8u16; +typedef __m128i v4i32, v4u32; +typedef __m128i v2i64, v2u64; + +typedef __m256i v256u1; +typedef __m256i v32i8, v32u8; +typedef __m256i v16i16, v16u16; +typedef __m256i v8i32, v8u32; +typedef __m256i v4i64, v4u64; #ifdef R_HAVE_SSE2 -static inline v16i8 v16i8_add(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_paddb128((v16qi)x, (v16qi)y); } -static inline v16u8 v16u8_add(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_paddb128((v16qi)x, (v16qi)y); } -static inline v8i16 v8i16_add(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_paddw128((v8hi)x, (v8hi)y); } -static inline v8u16 v8u16_add(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_paddw128((v8hi)x, (v8hi)y); } -static inline v4i32 v4i32_add(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_paddd128((v4si)x, (v4si)y); } -static inline v4u32 v4u32_add(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_paddd128((v4si)x, (v4si)y); } -static inline v2i64 v2i64_add(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_paddq128((v2di)x, (v2di)y); } -static inline v2u64 v2u64_add(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_paddq128((v2di)x, (v2di)y); } - -static inline v16i8 v16i8_sub(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_psubb128((v16qi)x, (v16qi)y); } -static inline v16u8 v16u8_sub(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_psubb128((v16qi)x, (v16qi)y); } -static inline v8i16 v8i16_sub(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_psubw128((v8hi)x, (v8hi)y); } -static inline v8u16 v8u16_sub(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_psubw128((v8hi)x, (v8hi)y); } -static inline v4i32 v4i32_sub(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_psubd128((v4si)x, (v4si)y); } -static inline v4u32 v4u32_sub(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_psubd128((v4si)x, (v4si)y); } -static inline v2i64 v2i64_sub(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_psubq128((v2di)x, (v2di)y); } -static inline v2u64 v2u64_sub(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_psubq128((v2di)x, (v2di)y); } - -static inline v16i8 v16i8_and(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pand128((v2di)x, (v2di)y); } -static inline v16u8 v16u8_and(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pand128((v2di)x, (v2di)y); } -static inline v8i16 v8i16_and(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pand128((v2di)x, (v2di)y); } -static inline v8u16 v8u16_and(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pand128((v2di)x, (v2di)y); } -static inline v4i32 v4i32_and(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pand128((v2di)x, (v2di)y); } -static inline v4u32 v4u32_and(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pand128((v2di)x, (v2di)y); } -static inline v2i64 v2i64_and(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pand128((v2di)x, (v2di)y); } -static inline v2u64 v2u64_and(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pand128((v2di)x, (v2di)y); } - -static inline v16i8 v16i8_andnot(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pandn128((v2di)x, (v2di)y); } -static inline v16u8 v16u8_andnot(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pandn128((v2di)x, (v2di)y); } -static inline v8i16 v8i16_andnot(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pandn128((v2di)x, (v2di)y); } -static inline v8u16 v8u16_andnot(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pandn128((v2di)x, (v2di)y); } -static inline v4i32 v4i32_andnot(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pandn128((v2di)x, (v2di)y); } -static inline v4u32 v4u32_andnot(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pandn128((v2di)x, (v2di)y); } -static inline v2i64 v2i64_andnot(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pandn128((v2di)x, (v2di)y); } -static inline v2u64 v2u64_andnot(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pandn128((v2di)x, (v2di)y); } - -static inline v16i8 v16i8_or(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_por128((v2di)x, (v2di)y); } -static inline v16u8 v16u8_or(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_por128((v2di)x, (v2di)y); } -static inline v8i16 v8i16_or(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_por128((v2di)x, (v2di)y); } -static inline v8u16 v8u16_or(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_por128((v2di)x, (v2di)y); } -static inline v4i32 v4i32_or(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_por128((v2di)x, (v2di)y); } -static inline v4u32 v4u32_or(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_por128((v2di)x, (v2di)y); } -static inline v2i64 v2i64_or(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_por128((v2di)x, (v2di)y); } -static inline v2u64 v2u64_or(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_por128((v2di)x, (v2di)y); } - -static inline v16i8 v16i8_xor(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pxor128((v2di)x, (v2di)y); } -static inline v16u8 v16u8_xor(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pxor128((v2di)x, (v2di)y); } -static inline v8i16 v8i16_xor(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pxor128((v2di)x, (v2di)y); } -static inline v8u16 v8u16_xor(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pxor128((v2di)x, (v2di)y); } -static inline v4i32 v4i32_xor(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pxor128((v2di)x, (v2di)y); } -static inline v4u32 v4u32_xor(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pxor128((v2di)x, (v2di)y); } -static inline v2i64 v2i64_xor(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pxor128((v2di)x, (v2di)y); } -static inline v2u64 v2u64_xor(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pxor128((v2di)x, (v2di)y); } - -static inline v8u16 v8u16_sl(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_psllw128((v8hi)x, (v8hi)y); } -static inline v4u32 v4u32_sl(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pslld128((v4si)x, (v4si)y); } -static inline v2u64 v2u64_sl(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_psllq128((v2di)x, (v2di)y); } - -static inline v8u16 v8u16_sr(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_psrlw128((v8hi)x, (v8hi)y); } -static inline v4u32 v4u32_sr(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_psrld128((v4si)x, (v4si)y); } -static inline v2u64 v2u64_sr(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_psrlq128((v2di)x, (v2di)y); } - -static inline v8i16 v8i16_sr(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_psraw128((v8hi)x, (v8hi)y); } -static inline v4i32 v4i32_sr(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_psrad128((v4si)x, (v4si)y); } - -static inline v8u16 v8u16_sli(v8u16 x, int c) { return (v8u16)__builtin_ia32_psllwi128((v8hi)x, c); } -static inline v4u32 v4u32_sli(v4u32 x, int c) { return (v4u32)__builtin_ia32_pslldi128((v4si)x, c); } -static inline v2u64 v2u64_sli(v2u64 x, int c) { return (v2u64)__builtin_ia32_psllqi128((v2di)x, c); } - -static inline v8u16 v8u16_sri(v8u16 x, int c) { return (v8u16)__builtin_ia32_psrlwi128((v8hi)x, c); } -static inline v4u32 v4u32_sri(v4u32 x, int c) { return (v4u32)__builtin_ia32_psrldi128((v4si)x, c); } -static inline v2u64 v2u64_sri(v2u64 x, int c) { return (v2u64)__builtin_ia32_psrlqi128((v2di)x, c); } - -static inline v8i16 v8i16_sri(v8i16 x, int c) { return (v8i16)__builtin_ia32_psrawi128((v8hi)x, c); } -static inline v4i32 v4i32_sri(v4i32 x, int c) { return (v4i32)__builtin_ia32_psradi128((v4si)x, c); } - -static inline v16i8 v16i8_cmpeq(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pcmpeqb128((v16qi)x, (v16qi)y); } -static inline v16u8 v16u8_cmpeq(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pcmpeqb128((v16qi)x, (v16qi)y); } -static inline v8i16 v8i16_cmpeq(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pcmpeqw128((v8hi)x, (v8hi)y); } -static inline v8u16 v8u16_cmpeq(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pcmpeqw128((v8hi)x, (v8hi)y); } -static inline v4i32 v4i32_cmpeq(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pcmpeqd128((v4si)x, (v4si)y); } -static inline v4u32 v4u32_cmpeq(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pcmpeqd128((v4si)x, (v4si)y); } - -static inline v16i8 v16i8_cmpgt(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pcmpgtb128((v16qi)x, (v16qi)y); } -static inline v16u8 v16u8_cmpgt(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pcmpgtb128((v16qi)x, (v16qi)y); } -static inline v8i16 v8i16_cmpgt(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pcmpgtw128((v8hi)x, (v8hi)y); } -static inline v8u16 v8u16_cmpgt(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pcmpgtw128((v8hi)x, (v8hi)y); } -static inline v4i32 v4i32_cmpgt(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pcmpgtd128((v4si)x, (v4si)y); } -static inline v4u32 v4u32_cmpgt(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pcmpgtd128((v4si)x, (v4si)y); } - -static inline uint v16u8_msb(v16u8 x) { return __builtin_ia32_pmovmskb128((v16qi)x); } +#define v128u1_loada _mm_load_si128 +#define v128u1_loadu _mm_loadu_si128 + +#define v128u1_storea _mm_store_si128 +#define v128u1_storeu _mm_storeu_si128 + +#define v16i8_set _mm_set_epi8 +#define v8i16_set _mm_set_epi16 +#define v4i32_set _mm_set_epi32 +#define v2i64_set _mm_set_epi64x + +#define v16i8_fill _mm_set1_epi8 +#define v8i16_fill _mm_set1_epi16 +#define v4i32_fill _mm_set1_epi32 +#define v2i64_fill _mm_set1_epi64x + +#define v16i8_add _mm_add_epi8 +#define v8i16_add _mm_add_epi16 +#define v4i32_add _mm_add_epi32 +#define v2i64_add _mm_add_epi64 + +#define v16i8_sub _mm_sub_epi8 +#define v8i16_sub _mm_sub_epi16 +#define v4i32_sub _mm_sub_epi32 +#define v2i64_sub _mm_sub_epi64 + +#define v128u1_and _mm_and_si128 + +#define v128u1_andnot _mm_andnot_si128 + +#define v128u1_or _mm_or_si128 + +#define v128u1_xor _mm_xor_si128 + +#define v8u16_sl _mm_sll_epi16 +#define v4u32_sl _mm_sll_epi32 +#define v2u64_sl _mm_sll_epi64 + +#define v8u16_sr _mm_srl_epi16 +#define v4u32_sr _mm_srl_epi32 +#define v2u64_sr _mm_srl_epi64 + +#define v8i16_sr _mm_sra_epi16 +#define v4i32_sr _mm_sra_epi32 + +#define v8u16_sli _mm_slli_epi16 +#define v4u32_sli _mm_slli_epi32 +#define v2u64_sli _mm_slli_epi64 + +#define v8u16_sri _mm_srli_epi16 +#define v4u32_sri _mm_srli_epi32 +#define v2u64_sri _mm_srli_epi64 + +#define v8i16_sri _mm_srai_epi16 +#define v4i32_sri _mm_srai_epi32 + +#define v16i8_cmplt _mm_cmplt_epi8 +#define v8i16_cmplt _mm_cmplt_epi16 +#define v4i32_cmplt _mm_cmplt_epi32 + +#define v16i8_cmpeq _mm_cmpeq_epi8 +#define v8i16_cmpeq _mm_cmpeq_epi16 +#define v4i32_cmpeq _mm_cmpeq_epi32 + +#define v16i8_cmpgt _mm_cmpgt_epi8 +#define v8i16_cmpgt _mm_cmpgt_epi16 +#define v4i32_cmpgt _mm_cmpgt_epi32 + +#define v16u8_msb _mm_movemask_epi8 #endif #ifdef R_HAVE_SSSE3 -static inline v16u8 v16u8_shuf(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pshufb128((v16qi)x, (v16qi)y); } +#define v16u8_shuf _mm_shuffle_epi8 -static inline v16i8 v16i8_sign(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_psignb128((v16qi)x, (v16qi)y); } -static inline v8i16 v8i16_sign(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_psignw128((v8hi)x, (v8hi)y); } -static inline v4i32 v4i32_sign(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_psignd128((v4si)x, (v4si)y); } +#define v16i8_sign _mm_sign_epi8 +#define v8i16_sign _mm_sign_epi16 +#define v4i32_sign _mm_sign_epi32 #endif #ifdef R_HAVE_SSE4_1 -static inline v2i64 v2i64_cmpeq(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pcmpeqq((v2di)x, (v2di)y); } -static inline v2u64 v2u64_cmpeq(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pcmpeqq((v2di)x, (v2di)y); } - -static inline v8i16 v16i8_ext16(v16i8 x) { return (v8i16)__builtin_ia32_pmovsxbw128((v16qi)x); } -static inline v4i32 v16i8_ext32(v16i8 x) { return (v4i32)__builtin_ia32_pmovsxbd128((v16qi)x); } -static inline v2i64 v16i8_ext64(v16i8 x) { return (v2i64)__builtin_ia32_pmovsxbq128((v16qi)x); } -static inline v4i32 v8i16_ext32(v8i16 x) { return (v4i32)__builtin_ia32_pmovsxwd128((v8hi)x); } -static inline v2i64 v8i16_ext64(v8i16 x) { return (v2i64)__builtin_ia32_pmovsxwq128((v8hi)x); } -static inline v2i64 v4i32_ext64(v4i32 x) { return (v2i64)__builtin_ia32_pmovsxdq128((v4si)x); } - -static inline v8u16 v16u8_ext16(v16u8 x) { return (v8u16)__builtin_ia32_pmovzxbw128((v16qi)x); } -static inline v4u32 v16u8_ext32(v16u8 x) { return (v4u32)__builtin_ia32_pmovzxbd128((v16qi)x); } -static inline v2u64 v16u8_ext64(v16u8 x) { return (v2u64)__builtin_ia32_pmovzxbq128((v16qi)x); } -static inline v4u32 v8u16_ext32(v8u16 x) { return (v4u32)__builtin_ia32_pmovzxwd128((v8hi)x); } -static inline v2u64 v8u16_ext64(v8u16 x) { return (v2u64)__builtin_ia32_pmovzxwq128((v8hi)x); } -static inline v2u64 v4u32_ext64(v4u32 x) { return (v2u64)__builtin_ia32_pmovzxdq128((v4si)x); } +#define v2i64_cmpeq _mm_cmpeq_epi64 + +#define v16i8_ext16 _mm_cvtepi8_epi16 +#define v16i8_ext32 _mm_cvtepi8_epi32 +#define v16i8_ext64 _mm_cvtepi8_epi64 +#define v8i16_ext32 _mm_cvtepi16_epi32 +#define v8i16_ext64 _mm_cvtepi16_epi64 +#define v4i32_ext64 _mm_cvtepi32_epi64 + +#define v16u8_ext16 _mm_cvtepu8_epi16 +#define v16u8_ext32 _mm_cvtepu8_epi32 +#define v16u8_ext64 _mm_cvtepu8_epi64 +#define v8u16_ext32 _mm_cvtepu16_epi32 +#define v8u16_ext64 _mm_cvtepu16_epi64 +#define v4u32_ext64 _mm_cvtepu32_epi64 #endif #ifdef R_HAVE_SSE4_2 -static inline v2i64 v2i64_cmpgt(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pcmpgtq((v2di)x, (v2di)y); } -static inline v2u64 v2u64_cmpgt(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pcmpgtq((v2di)x, (v2di)y); } +#define v2i64_cmpgt _mm_cmpgt_epi64 +#define v2i64_cmplt(x, y) v2i64_cmpgt(y, x) #endif - -#undef v2di -#undef v4si -#undef v8hi -#undef v16qi -#undef v4di -#undef v8si -#undef v16hi -#undef v32qi