rcx

library of miscellaneous bits of C code
git clone git://git.rr3.xyz/rcx
Log | Files | Refs | README | LICENSE

commit 59ea75710031f0afc82c2eef0060dba0e71e9a54
parent 4de55af2687bc2d3ade6b80219127ec049e605aa
Author: Robert Russell <robertrussell.72001@gmail.com>
Date:   Thu,  1 Jun 2023 21:26:27 -0700

Add SIMD stuff

Diffstat:
A inc/simd.h | 224 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 224 insertions(+), 0 deletions(-)

diff --git a/inc/simd.h b/inc/simd.h @@ -0,0 +1,224 @@ +#pragma once + +/* Note: This is a work in progress. Bindings for instructions should be + * added as needed. */ + +#ifdef __MMX__ +#define R_HAVE_MMX 1 +#endif + +#ifdef __SSE__ +#define R_HAVE_SSE 1 +#endif + +#ifdef __SSE2__ +#define R_HAVE_SSE2 1 +#endif + +#ifdef __SSE3__ +#define R_HAVE_SSE3 1 +#endif + +#ifdef __SSSE3__ +#define R_HAVE_SSSE3 1 +#endif + +#ifdef __SSE4_1__ +#define R_HAVE_SSE4_1 1 +#endif + +#ifdef __SSE4_2__ +#define R_HAVE_SSE4_2 1 +#endif + +#ifdef __AVX__ +#define R_HAVE_AVX 1 +#endif + +#ifdef __AVX2__ +#define R_HAVE_AVX2 1 +#endif + +/* TODO: AVX-512 */ + +/* TODO: MMX +typedef i8 v8i8 __attribute__((vector_size(8))); +typedef u8 v8u8 __attribute__((vector_size(8))); +typedef i16 v4i16 __attribute__((vector_size(8))); +typedef u16 v4u16 __attribute__((vector_size(8))); +typedef i32 v2i32 __attribute__((vector_size(8))); +typedef u32 v2u32 __attribute__((vector_size(8))); +*/ + +/* 128 bit */ +typedef i8 v16i8 __attribute__((vector_size(16))); +typedef u8 v16u8 __attribute__((vector_size(16))); +typedef i16 v8i16 __attribute__((vector_size(16))); +typedef u16 v8u16 __attribute__((vector_size(16))); +typedef i32 v4i32 __attribute__((vector_size(16))); +typedef u32 v4u32 __attribute__((vector_size(16))); +typedef i64 v2i64 __attribute__((vector_size(16))); +typedef u64 v2u64 __attribute__((vector_size(16))); +/* These are for casting inputs/output of the GCC builtins. 
*/ +typedef char r_v16qi_ __attribute__((vector_size(16))); +typedef short r_v8hi_ __attribute__((vector_size(16))); +typedef int r_v4si_ __attribute__((vector_size(16))); +typedef long long r_v2di_ __attribute__((vector_size(16))); +#define v16qi r_v16qi_ +#define v8hi r_v8hi_ +#define v4si r_v4si_ +#define v2di r_v2di_ + +/* 256 bit */ +typedef i8 v32i8 __attribute__((vector_size(32))); +typedef u8 v32u8 __attribute__((vector_size(32))); +typedef i16 v16i16 __attribute__((vector_size(32))); +typedef u16 v16u16 __attribute__((vector_size(32))); +typedef i32 v8i32 __attribute__((vector_size(32))); +typedef u32 v8u32 __attribute__((vector_size(32))); +typedef i64 v4i64 __attribute__((vector_size(32))); +typedef u64 v4u64 __attribute__((vector_size(32))); +/* These are for casting inputs/output of the GCC builtins. */ +typedef char r_v32qi_ __attribute__((vector_size(32))); +typedef short r_v16hi_ __attribute__((vector_size(32))); +typedef int r_v8si_ __attribute__((vector_size(32))); +typedef long long r_v4di_ __attribute__((vector_size(32))); +#define v32qi r_v32qi_ +#define v16hi r_v16hi_ +#define v8si r_v8si_ +#define v4di r_v4di_ + +#ifdef R_HAVE_SSE2 +static inline v16i8 v16i8_add(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_paddb128((v16qi)x, (v16qi)y); } +static inline v16u8 v16u8_add(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_paddb128((v16qi)x, (v16qi)y); } +static inline v8i16 v8i16_add(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_paddw128((v8hi)x, (v8hi)y); } +static inline v8u16 v8u16_add(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_paddw128((v8hi)x, (v8hi)y); } +static inline v4i32 v4i32_add(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_paddd128((v4si)x, (v4si)y); } +static inline v4u32 v4u32_add(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_paddd128((v4si)x, (v4si)y); } +static inline v2i64 v2i64_add(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_paddq128((v2di)x, (v2di)y); } +static inline v2u64 v2u64_add(v2u64 x, v2u64 y) { 
return (v2u64)__builtin_ia32_paddq128((v2di)x, (v2di)y); } + +static inline v16i8 v16i8_sub(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_psubb128((v16qi)x, (v16qi)y); } +static inline v16u8 v16u8_sub(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_psubb128((v16qi)x, (v16qi)y); } +static inline v8i16 v8i16_sub(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_psubw128((v8hi)x, (v8hi)y); } +static inline v8u16 v8u16_sub(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_psubw128((v8hi)x, (v8hi)y); } +static inline v4i32 v4i32_sub(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_psubd128((v4si)x, (v4si)y); } +static inline v4u32 v4u32_sub(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_psubd128((v4si)x, (v4si)y); } +static inline v2i64 v2i64_sub(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_psubq128((v2di)x, (v2di)y); } +static inline v2u64 v2u64_sub(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_psubq128((v2di)x, (v2di)y); } + +static inline v16i8 v16i8_and(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pand128((v2di)x, (v2di)y); } +static inline v16u8 v16u8_and(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pand128((v2di)x, (v2di)y); } +static inline v8i16 v8i16_and(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pand128((v2di)x, (v2di)y); } +static inline v8u16 v8u16_and(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pand128((v2di)x, (v2di)y); } +static inline v4i32 v4i32_and(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pand128((v2di)x, (v2di)y); } +static inline v4u32 v4u32_and(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pand128((v2di)x, (v2di)y); } +static inline v2i64 v2i64_and(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pand128((v2di)x, (v2di)y); } +static inline v2u64 v2u64_and(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pand128((v2di)x, (v2di)y); } + +static inline v16i8 v16i8_andnot(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pandn128((v2di)x, (v2di)y); } +static inline v16u8 v16u8_andnot(v16u8 x, v16u8 y) { return 
(v16u8)__builtin_ia32_pandn128((v2di)x, (v2di)y); } +static inline v8i16 v8i16_andnot(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pandn128((v2di)x, (v2di)y); } +static inline v8u16 v8u16_andnot(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pandn128((v2di)x, (v2di)y); } +static inline v4i32 v4i32_andnot(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pandn128((v2di)x, (v2di)y); } +static inline v4u32 v4u32_andnot(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pandn128((v2di)x, (v2di)y); } +static inline v2i64 v2i64_andnot(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pandn128((v2di)x, (v2di)y); } +static inline v2u64 v2u64_andnot(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pandn128((v2di)x, (v2di)y); } + +static inline v16i8 v16i8_or(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_por128((v2di)x, (v2di)y); } +static inline v16u8 v16u8_or(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_por128((v2di)x, (v2di)y); } +static inline v8i16 v8i16_or(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_por128((v2di)x, (v2di)y); } +static inline v8u16 v8u16_or(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_por128((v2di)x, (v2di)y); } +static inline v4i32 v4i32_or(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_por128((v2di)x, (v2di)y); } +static inline v4u32 v4u32_or(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_por128((v2di)x, (v2di)y); } +static inline v2i64 v2i64_or(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_por128((v2di)x, (v2di)y); } +static inline v2u64 v2u64_or(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_por128((v2di)x, (v2di)y); } + +static inline v16i8 v16i8_xor(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pxor128((v2di)x, (v2di)y); } +static inline v16u8 v16u8_xor(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pxor128((v2di)x, (v2di)y); } +static inline v8i16 v8i16_xor(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pxor128((v2di)x, (v2di)y); } +static inline v8u16 v8u16_xor(v8u16 x, v8u16 y) { return 
(v8u16)__builtin_ia32_pxor128((v2di)x, (v2di)y); } +static inline v4i32 v4i32_xor(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pxor128((v2di)x, (v2di)y); } +static inline v4u32 v4u32_xor(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pxor128((v2di)x, (v2di)y); } +static inline v2i64 v2i64_xor(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pxor128((v2di)x, (v2di)y); } +static inline v2u64 v2u64_xor(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pxor128((v2di)x, (v2di)y); } + +static inline v8u16 v8u16_sl(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_psllw128((v8hi)x, (v8hi)y); } +static inline v4u32 v4u32_sl(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pslld128((v4si)x, (v4si)y); } +static inline v2u64 v2u64_sl(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_psllq128((v2di)x, (v2di)y); } + +static inline v8u16 v8u16_sr(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_psrlw128((v8hi)x, (v8hi)y); } +static inline v4u32 v4u32_sr(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_psrld128((v4si)x, (v4si)y); } +static inline v2u64 v2u64_sr(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_psrlq128((v2di)x, (v2di)y); } + +static inline v8i16 v8i16_sr(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_psraw128((v8hi)x, (v8hi)y); } +static inline v4i32 v4i32_sr(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_psrad128((v4si)x, (v4si)y); } + +static inline v8u16 v8u16_sli(v8u16 x, int c) { return (v8u16)__builtin_ia32_psllwi128((v8hi)x, c); } +static inline v4u32 v4u32_sli(v4u32 x, int c) { return (v4u32)__builtin_ia32_pslldi128((v4si)x, c); } +static inline v2u64 v2u64_sli(v2u64 x, int c) { return (v2u64)__builtin_ia32_psllqi128((v2di)x, c); } + +static inline v8u16 v8u16_sri(v8u16 x, int c) { return (v8u16)__builtin_ia32_psrlwi128((v8hi)x, c); } +static inline v4u32 v4u32_sri(v4u32 x, int c) { return (v4u32)__builtin_ia32_psrldi128((v4si)x, c); } +static inline v2u64 v2u64_sri(v2u64 x, int c) { return (v2u64)__builtin_ia32_psrlqi128((v2di)x, c); } + +static 
inline v8i16 v8i16_sri(v8i16 x, int c) { return (v8i16)__builtin_ia32_psrawi128((v8hi)x, c); } +static inline v4i32 v4i32_sri(v4i32 x, int c) { return (v4i32)__builtin_ia32_psradi128((v4si)x, c); } + +static inline v16i8 v16i8_cmpeq(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pcmpeqb128((v16qi)x, (v16qi)y); } +static inline v16u8 v16u8_cmpeq(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pcmpeqb128((v16qi)x, (v16qi)y); } +static inline v8i16 v8i16_cmpeq(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pcmpeqw128((v8hi)x, (v8hi)y); } +static inline v8u16 v8u16_cmpeq(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pcmpeqw128((v8hi)x, (v8hi)y); } +static inline v4i32 v4i32_cmpeq(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pcmpeqd128((v4si)x, (v4si)y); } +static inline v4u32 v4u32_cmpeq(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pcmpeqd128((v4si)x, (v4si)y); } + +static inline v16i8 v16i8_cmpgt(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_pcmpgtb128((v16qi)x, (v16qi)y); } +static inline v16u8 v16u8_cmpgt(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pcmpgtb128((v16qi)x, (v16qi)y); } +static inline v8i16 v8i16_cmpgt(v8i16 x, v8i16 y) { return (v8i16)__builtin_ia32_pcmpgtw128((v8hi)x, (v8hi)y); } +static inline v8u16 v8u16_cmpgt(v8u16 x, v8u16 y) { return (v8u16)__builtin_ia32_pcmpgtw128((v8hi)x, (v8hi)y); } +static inline v4i32 v4i32_cmpgt(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_pcmpgtd128((v4si)x, (v4si)y); } +static inline v4u32 v4u32_cmpgt(v4u32 x, v4u32 y) { return (v4u32)__builtin_ia32_pcmpgtd128((v4si)x, (v4si)y); } + +static inline uint v16u8_msb(v16u8 x) { return __builtin_ia32_pmovmskb128((v16qi)x); } +#endif + +#ifdef R_HAVE_SSSE3 +static inline v16u8 v16u8_shuf(v16u8 x, v16u8 y) { return (v16u8)__builtin_ia32_pshufb128((v16qi)x, (v16qi)y); } + +static inline v16i8 v16i8_sign(v16i8 x, v16i8 y) { return (v16i8)__builtin_ia32_psignb128((v16qi)x, (v16qi)y); } +static inline v8i16 v8i16_sign(v8i16 x, v8i16 y) { return 
(v8i16)__builtin_ia32_psignw128((v8hi)x, (v8hi)y); } +static inline v4i32 v4i32_sign(v4i32 x, v4i32 y) { return (v4i32)__builtin_ia32_psignd128((v4si)x, (v4si)y); } +#endif + +#ifdef R_HAVE_SSE4_1 +static inline v2i64 v2i64_cmpeq(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pcmpeqq((v2di)x, (v2di)y); } +static inline v2u64 v2u64_cmpeq(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pcmpeqq((v2di)x, (v2di)y); } + +static inline v8i16 v16i8_ext16(v16i8 x) { return (v8i16)__builtin_ia32_pmovsxbw128((v16qi)x); } +static inline v4i32 v16i8_ext32(v16i8 x) { return (v4i32)__builtin_ia32_pmovsxbd128((v16qi)x); } +static inline v2i64 v16i8_ext64(v16i8 x) { return (v2i64)__builtin_ia32_pmovsxbq128((v16qi)x); } +static inline v4i32 v8i16_ext32(v8i16 x) { return (v4i32)__builtin_ia32_pmovsxwd128((v8hi)x); } +static inline v2i64 v8i16_ext64(v8i16 x) { return (v2i64)__builtin_ia32_pmovsxwq128((v8hi)x); } +static inline v2i64 v4i32_ext64(v4i32 x) { return (v2i64)__builtin_ia32_pmovsxdq128((v4si)x); } + +static inline v8u16 v16u8_ext16(v16u8 x) { return (v8u16)__builtin_ia32_pmovzxbw128((v16qi)x); } +static inline v4u32 v16u8_ext32(v16u8 x) { return (v4u32)__builtin_ia32_pmovzxbd128((v16qi)x); } +static inline v2u64 v16u8_ext64(v16u8 x) { return (v2u64)__builtin_ia32_pmovzxbq128((v16qi)x); } +static inline v4u32 v8u16_ext32(v8u16 x) { return (v4u32)__builtin_ia32_pmovzxwd128((v8hi)x); } +static inline v2u64 v8u16_ext64(v8u16 x) { return (v2u64)__builtin_ia32_pmovzxwq128((v8hi)x); } +static inline v2u64 v4u32_ext64(v4u32 x) { return (v2u64)__builtin_ia32_pmovzxdq128((v4si)x); } +#endif + +#ifdef R_HAVE_SSE4_2 +static inline v2i64 v2i64_cmpgt(v2i64 x, v2i64 y) { return (v2i64)__builtin_ia32_pcmpgtq((v2di)x, (v2di)y); } +static inline v2u64 v2u64_cmpgt(v2u64 x, v2u64 y) { return (v2u64)__builtin_ia32_pcmpgtq((v2di)x, (v2di)y); } +#endif + +#undef v2di +#undef v4si +#undef v8hi +#undef v16qi +#undef v4di +#undef v8si +#undef v16hi +#undef v32qi