rcx

library of miscellaneous bits of C code
git clone git://git.rr3.xyz/rcx

commit 67b68bf991b3688a3657dfe382a92074afc454c4
parent a4fbab1bc49397ad153cd0d75db52315cd210628
Author: Robert Russell <robertrussell.72001@gmail.com>
Date:   Sat,  6 Jul 2024 16:25:23 -0700

Expand SIMD

Diffstat:
M inc/simd.h | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
1 file changed, 107 insertions(+), 35 deletions(-)

diff --git a/inc/simd.h b/inc/simd.h
@@ -11,6 +11,7 @@
 #endif
 
 /* TODO: MMX, AVX-512 */
+/* TODO: Unaligned 128 bit typedefs, and corresponding load/store intrinsics */
 
 /* 128 bit */
 typedef i8 v16i8 __attribute__((vector_size(16)));
@@ -22,33 +23,41 @@ typedef u32 v4u32 __attribute__((vector_size(16)));
 typedef i64 v2i64 __attribute__((vector_size(16)));
 typedef u64 v2u64 __attribute__((vector_size(16)));
 /* These are for casting inputs/output of the GCC builtins. */
-typedef char r_v16qi_ __attribute__((vector_size(16)));
-typedef short r_v8hi_ __attribute__((vector_size(16)));
-typedef int r_v4si_ __attribute__((vector_size(16)));
-typedef long long r_v2di_ __attribute__((vector_size(16)));
+typedef char r_v16qi_ __attribute__((vector_size(16)));
+typedef short r_v8hi_ __attribute__((vector_size(16)));
+typedef int r_v4si_ __attribute__((vector_size(16)));
+typedef long long r_v2di_ __attribute__((vector_size(16)));
 #define v16qi r_v16qi_
-#define v8hi r_v8hi_
-#define v4si r_v4si_
-#define v2di r_v2di_
+#define v8hi r_v8hi_
+#define v4si r_v4si_
+#define v2di r_v2di_
 
 /* 256 bit */
-typedef i8 v32i8 __attribute__((vector_size(32)));
-typedef u8 v32u8 __attribute__((vector_size(32)));
-typedef i16 v16i16 __attribute__((vector_size(32)));
-typedef u16 v16u16 __attribute__((vector_size(32)));
-typedef i32 v8i32 __attribute__((vector_size(32)));
-typedef u32 v8u32 __attribute__((vector_size(32)));
-typedef i64 v4i64 __attribute__((vector_size(32)));
-typedef u64 v4u64 __attribute__((vector_size(32)));
+typedef i8 v32i8 __attribute__((vector_size(32)));
+typedef u8 v32u8 __attribute__((vector_size(32)));
+typedef i16 v16i16 __attribute__((vector_size(32)));
+typedef u16 v16u16 __attribute__((vector_size(32)));
+typedef i32 v8i32 __attribute__((vector_size(32)));
+typedef u32 v8u32 __attribute__((vector_size(32)));
+typedef i64 v4i64 __attribute__((vector_size(32)));
+typedef u64 v4u64 __attribute__((vector_size(32)));
+typedef i8 v32i8a1 __attribute__((vector_size(32), aligned(1)));
+typedef u8 v32u8a1 __attribute__((vector_size(32), aligned(1)));
+typedef i16 v16i16a1 __attribute__((vector_size(32), aligned(1)));
+typedef u16 v16u16a1 __attribute__((vector_size(32), aligned(1)));
+typedef i32 v8i32a1 __attribute__((vector_size(32), aligned(1)));
+typedef u32 v8u32a1 __attribute__((vector_size(32), aligned(1)));
+typedef i64 v4i64a1 __attribute__((vector_size(32), aligned(1)));
+typedef u64 v4u64a1 __attribute__((vector_size(32), aligned(1)));
 /* These are for casting inputs/output of the GCC builtins. */
-typedef char r_v32qi_ __attribute__((vector_size(32)));
-typedef short r_v16hi_ __attribute__((vector_size(32)));
-typedef int r_v8si_ __attribute__((vector_size(32)));
-typedef long long r_v4di_ __attribute__((vector_size(32)));
+typedef char r_v32qi_ __attribute__((vector_size(32)));
+typedef short r_v16hi_ __attribute__((vector_size(32)));
+typedef int r_v8si_ __attribute__((vector_size(32)));
+typedef long long r_v4di_ __attribute__((vector_size(32)));
 #define v32qi r_v32qi_
 #define v16hi r_v16hi_
-#define v8si r_v8si_
-#define v4di r_v4di_
+#define v8si r_v8si_
+#define v4di r_v4di_
 
 #ifdef R_HAVE_SSE2
 static inline v16i8 v16i8_set(
@@ -74,8 +83,8 @@ static inline v4u32 v4u32_set(u32 x3, u32 x2, u32 x1, u32 x0) { return (v4u32){
 static inline v2i64 v2i64_set(i64 x1, i64 x0) { return (v2i64){ x1, x0 }; }
 static inline v2u64 v2u64_set(u64 x1, u64 x0) { return (v2u64){ x1, x0 }; }
 
-static inline v16i8 v16i8_fill(i8 x) { return v16i8_set(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); }
-static inline v16u8 v16u8_fill(u8 x) { return v16u8_set(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); }
+static inline v16i8 v16i8_fill(i8 x) { return v16i8_set(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); }
+static inline v16u8 v16u8_fill(u8 x) { return v16u8_set(x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x); }
 static inline v8i16 v8i16_fill(i16 x) { return v8i16_set(x, x, x, x, x, x, x, x); }
 static inline v8u16 v8u16_fill(u16 x) { return v8u16_set(x, x, x, x, x, x, x, x); }
 static inline v4i32 v4i32_fill(i32 x) { return v4i32_set(x, x, x, x); }
@@ -267,6 +276,42 @@ static inline v8u32 v8u32_fill(u32 x) { return v8u32_set(x, x, x, x, x, x, x,
 static inline v4i64 v4i64_fill(i64 x) { return v4i64_set(x, x, x, x); }
 static inline v4u64 v4u64_fill(u64 x) { return v4u64_set(x, x, x, x); }
 
+static inline v32i8 v32i8_load(v32i8 *p) { return *p; }
+static inline v32u8 v32u8_load(v32u8 *p) { return *p; }
+static inline v16i16 v16i16_load(v16i16 *p) { return *p; }
+static inline v16u16 v16u16_load(v16u16 *p) { return *p; }
+static inline v8i32 v8i32_load(v8i32 *p) { return *p; }
+static inline v8u32 v8u32_load(v8u32 *p) { return *p; }
+static inline v4i64 v4i64_load(v4i64 *p) { return *p; }
+static inline v4u64 v4u64_load(v4u64 *p) { return *p; }
+
+static inline v32i8 v32i8_loadu(v32i8a1 *p) { return *p; }
+static inline v32u8 v32u8_loadu(v32u8a1 *p) { return *p; }
+static inline v16i16 v16i16_loadu(v16i16a1 *p) { return *p; }
+static inline v16u16 v16u16_loadu(v16u16a1 *p) { return *p; }
+static inline v8i32 v8i32_loadu(v8i32a1 *p) { return *p; }
+static inline v8u32 v8u32_loadu(v8u32a1 *p) { return *p; }
+static inline v4i64 v4i64_loadu(v4i64a1 *p) { return *p; }
+static inline v4u64 v4u64_loadu(v4u64a1 *p) { return *p; }
+
+static inline void v32i8_store(v32i8 *p, v32i8 x) { *p = x; }
+static inline void v32u8_store(v32u8 *p, v32u8 x) { *p = x; }
+static inline void v16i16_store(v16i16 *p, v16i16 x) { *p = x; }
+static inline void v16u16_store(v16u16 *p, v16u16 x) { *p = x; }
+static inline void v8i32_store(v8i32 *p, v8i32 x) { *p = x; }
+static inline void v8u32_store(v8u32 *p, v8u32 x) { *p = x; }
+static inline void v4i64_store(v4i64 *p, v4i64 x) { *p = x; }
+static inline void v4u64_store(v4u64 *p, v4u64 x) { *p = x; }
+
+static inline void v32i8_storeu(v32i8a1 *p, v32i8 x) { *p = x; }
+static inline void v32u8_storeu(v32u8a1 *p, v32u8 x) { *p = x; }
+static inline void v16i16_storeu(v16i16a1 *p, v16i16 x) { *p = x; }
+static inline void v16u16_storeu(v16u16a1 *p, v16u16 x) { *p = x; }
+static inline void v8i32_storeu(v8i32a1 *p, v8i32 x) { *p = x; }
+static inline void v8u32_storeu(v8u32a1 *p, v8u32 x) { *p = x; }
+static inline void v4i64_storeu(v4i64a1 *p, v4i64 x) { *p = x; }
+static inline void v4u64_storeu(v4u64a1 *p, v4u64 x) { *p = x; }
+
 static inline v32i8 v32i8_add(v32i8 x, v32i8 y) { return (v32i8) __builtin_ia32_paddb256((v32qi)x, (v32qi)y); }
 static inline v32u8 v32u8_add(v32u8 x, v32u8 y) { return (v32u8) __builtin_ia32_paddb256((v32qi)x, (v32qi)y); }
 static inline v16i16 v16i16_add(v16i16 x, v16i16 y) { return (v16i16)__builtin_ia32_paddw256((v16hi)x, (v16hi)y); }
@@ -343,23 +388,50 @@ static inline v4u64 v4u64_sri(v4u64 x, uint c) { return (v4u64)__builtin_ia3
 static inline v16i16 v16i16_sri(v16i16 x, uint c) { return (v16i16)__builtin_ia32_psrawi256((v16hi)x, c); }
 static inline v8i32 v8i32_sri(v8i32 x, uint c) { return (v8i32)__builtin_ia32_psradi256((v8si)x, c); }
 
-static inline v32u8 v32i8_cmpeq(v32i8 x, v32i8 y) { return (v32u8)__builtin_ia32_pcmpeqb256((v32qi)x, (v32qi)y); }
-static inline v32u8 v32u8_cmpeq(v32u8 x, v32u8 y) { return (v32u8)__builtin_ia32_pcmpeqb256((v32qi)x, (v32qi)y); }
+static inline v32u8 v32i8_cmpeq(v32i8 x, v32i8 y) { return (v32u8)__builtin_ia32_pcmpeqb256((v32qi)x, (v32qi)y); }
+static inline v32u8 v32u8_cmpeq(v32u8 x, v32u8 y) { return (v32u8)__builtin_ia32_pcmpeqb256((v32qi)x, (v32qi)y); }
 static inline v16u16 v16i16_cmpeq(v16i16 x, v16i16 y) { return (v16u16)__builtin_ia32_pcmpeqw256((v16hi)x, (v16hi)y); }
 static inline v16u16 v16u16_cmpeq(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_pcmpeqw256((v16hi)x, (v16hi)y); }
-static inline v8u32 v8i32_cmpeq(v8i32 x, v8i32 y) { return (v8u32)__builtin_ia32_pcmpeqd256((v8si)x, (v8si)y); }
-static inline v8u32 v8u32_cmpeq(v8u32 x, v8u32 y) { return (v8u32)__builtin_ia32_pcmpeqd256((v8si)x, (v8si)y); }
-static inline v4u64 v4i64_cmpeq(v4i64 x, v4i64 y) { return (v4u64)__builtin_ia32_pcmpeqq256((v4di)x, (v4di)y); }
-static inline v4u64 v4u64_cmpeq(v4u64 x, v4u64 y) { return (v4u64)__builtin_ia32_pcmpeqq256((v4di)x, (v4di)y); }
+static inline v8u32 v8i32_cmpeq(v8i32 x, v8i32 y) { return (v8u32)__builtin_ia32_pcmpeqd256((v8si)x, (v8si)y); }
+static inline v8u32 v8u32_cmpeq(v8u32 x, v8u32 y) { return (v8u32)__builtin_ia32_pcmpeqd256((v8si)x, (v8si)y); }
+static inline v4u64 v4i64_cmpeq(v4i64 x, v4i64 y) { return (v4u64)__builtin_ia32_pcmpeqq256((v4di)x, (v4di)y); }
+static inline v4u64 v4u64_cmpeq(v4u64 x, v4u64 y) { return (v4u64)__builtin_ia32_pcmpeqq256((v4di)x, (v4di)y); }
 
-static inline v32u8 v32i8_cmpgt(v32i8 x, v32i8 y) { return (v32u8)__builtin_ia32_pcmpgtb256((v32qi)x, (v32qi)y); }
-static inline v32u8 v32u8_cmpgt(v32u8 x, v32u8 y) { return (v32u8)__builtin_ia32_pcmpgtb256((v32qi)x, (v32qi)y); }
+static inline v32u8 v32i8_cmpgt(v32i8 x, v32i8 y) { return (v32u8)__builtin_ia32_pcmpgtb256((v32qi)x, (v32qi)y); }
+static inline v32u8 v32u8_cmpgt(v32u8 x, v32u8 y) { return (v32u8)__builtin_ia32_pcmpgtb256((v32qi)x, (v32qi)y); }
 static inline v16u16 v16i16_cmpgt(v16i16 x, v16i16 y) { return (v16u16)__builtin_ia32_pcmpgtw256((v16hi)x, (v16hi)y); }
 static inline v16u16 v16u16_cmpgt(v16u16 x, v16u16 y) { return (v16u16)__builtin_ia32_pcmpgtw256((v16hi)x, (v16hi)y); }
-static inline v8u32 v8i32_cmpgt(v8i32 x, v8i32 y) { return (v8u32)__builtin_ia32_pcmpgtd256((v8si)x, (v8si)y); }
-static inline v8u32 v8u32_cmpgt(v8u32 x, v8u32 y) { return (v8u32)__builtin_ia32_pcmpgtd256((v8si)x, (v8si)y); }
-static inline v4u64 v4i64_cmpgt(v4i64 x, v4i64 y) { return (v4u64)__builtin_ia32_pcmpgtq256((v4di)x, (v4di)y); }
-static inline v4u64 v4u64_cmpgt(v4u64 x, v4u64 y) { return (v4u64)__builtin_ia32_pcmpgtq256((v4di)x, (v4di)y); }
+static inline v8u32 v8i32_cmpgt(v8i32 x, v8i32 y) { return (v8u32)__builtin_ia32_pcmpgtd256((v8si)x, (v8si)y); }
+static inline v8u32 v8u32_cmpgt(v8u32 x, v8u32 y) { return (v8u32)__builtin_ia32_pcmpgtd256((v8si)x, (v8si)y); }
+static inline v4u64 v4i64_cmpgt(v4i64 x, v4i64 y) { return (v4u64)__builtin_ia32_pcmpgtq256((v4di)x, (v4di)y); }
+static inline v4u64 v4u64_cmpgt(v4u64 x, v4u64 y) { return (v4u64)__builtin_ia32_pcmpgtq256((v4di)x, (v4di)y); }
+
+static inline int v32i8_testc(v32i8 x, v32i8 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
+static inline int v32u8_testc(v32u8 x, v32u8 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
+static inline int v16i16_testc(v16i16 x, v16i16 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
+static inline int v16u16_testc(v16u16 x, v16u16 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
+static inline int v8i32_testc(v8i32 x, v8i32 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
+static inline int v8u32_testc(v8u32 x, v8u32 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
+static inline int v4i64_testc(v4i64 x, v4i64 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
+static inline int v4u64_testc(v4u64 x, v4u64 y) { return __builtin_ia32_ptestc256((v4di)x, (v4di)y); }
+
+static inline int v32i8_testz(v32i8 x, v32i8 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
+static inline int v32u8_testz(v32u8 x, v32u8 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
+static inline int v16i16_testz(v16i16 x, v16i16 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
+static inline int v16u16_testz(v16u16 x, v16u16 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
+static inline int v8i32_testz(v8i32 x, v8i32 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
+static inline int v8u32_testz(v8u32 x, v8u32 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
+static inline int v4i64_testz(v4i64 x, v4i64 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
+static inline int v4u64_testz(v4u64 x, v4u64 y) { return __builtin_ia32_ptestz256((v4di)x, (v4di)y); }
+
+static inline int v32i8_testnzc(v32i8 x, v32i8 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
+static inline int v32u8_testnzc(v32u8 x, v32u8 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
+static inline int v16i16_testnzc(v16i16 x, v16i16 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
+static inline int v16u16_testnzc(v16u16 x, v16u16 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
+static inline int v8i32_testnzc(v8i32 x, v8i32 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
+static inline int v8u32_testnzc(v8u32 x, v8u32 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
+static inline int v4i64_testnzc(v4i64 x, v4i64 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
+static inline int v4u64_testnzc(v4u64 x, v4u64 y) { return __builtin_ia32_ptestnzc256((v4di)x, (v4di)y); }
 
 static inline uint v32u8_msb(v32u8 x) { return __builtin_ia32_pmovmskb256((v32qi)x); }
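
Usage note (illustration only, not part of the diff above): the new *_loadu/*_storeu wrappers take the aligned(1) vector typedefs, so they may be pointed at arbitrary byte offsets, while the ptest wrappers expose the VPTEST flags: v*_testz(x, y) returns 1 when x & y has no bits set, v*_testc(x, y) returns 1 when ~x & y has no bits set, and v*_testnzc(x, y) returns 1 only when neither condition holds. Below is a minimal sketch of how they might combine, assuming inc/simd.h is on the include path, the library's u8/uint typedefs, and a build with AVX2 enabled (the 256-bit wrappers expand to AVX/AVX2 builtins); blocks_equal_32 is a hypothetical helper name, and a size that is not a multiple of 32 would need a scalar tail, omitted here.

/* Sketch: compare two byte buffers 32 bytes at a time.
 * The buffers need not be 32-byte aligned. */
#include "simd.h"

static int blocks_equal_32(u8 *a, u8 *b, uint nblocks)
{
	for (uint i = 0; i < nblocks; i++) {
		/* Unaligned 256-bit loads via the *a1 typedefs. */
		v32u8 x = v32u8_loadu((v32u8a1 *)(a + 32 * i));
		v32u8 y = v32u8_loadu((v32u8a1 *)(b + 32 * i));
		v32u8 d = x ^ y; /* all zero iff the blocks match */
		/* ptestz(d, d) is 1 exactly when d has no bits set. */
		if (!v32u8_testz(d, d))
			return 0;
	}
	return 1;
}

An equivalent check could use v32u8_cmpeq plus v32u8_msb(eq) == 0xffffffff, but the xor-and-testz form avoids the extra scalar compare of the movemask result.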