rcx

library of miscellaneous bits of C code
git clone git://git.rr3.xyz/rcx

commit b0e120dd5dbafa7230ce7ca9c7b46c32f7ccbb47
parent 51071af719559df83b990c518e076a269dbae576
Author: Robert Russell <robertrussell.72001@gmail.com>
Date:   Thu,  8 Jun 2023 19:11:38 -0700

Remove AVX support

It was slower than SSE! Also, supporting two SIMD APIs just makes
the code even more of an ifdef mess.

Diffstat:
M inc/dict.h | 3 +--
M src/dict.c | 46 +++++++++++-----------------------------------
2 files changed, 12 insertions(+), 37 deletions(-)

diff --git a/inc/dict.h b/inc/dict.h
@@ -30,7 +30,7 @@ struct r_dict {
 	usize count;
 	usize ntombs;
 	void *mem;
-	u8 *data; /* Cache line-aligned pointer into mem */
+	u8 *data; /* Cache-aligned pointer into mem */
 };
 
 static inline bool r_dict_mem_eq(void *a, void *b, RDictSpec *spec) { return memcmp(a, b, spec->ksize) == 0; }
@@ -73,7 +73,6 @@ typedef struct D { \
 	.free = (RDictFreeFunc)(R_DICT_FREE), \
 }
 
-/* TODO: function for assigning without allocating */
 #define R_DICT_DECLARE(D, K, V, ...)\
 	static inline UNUSED usize R_DICT_METHOD(len,##__VA_ARGS__)(D *d) { return d->d.count; } \
 	static inline UNUSED void R_DICT_METHOD(set_default,##__VA_ARGS__)(D *d, V v) { d->default_val = v; } \
diff --git a/src/dict.c b/src/dict.c
@@ -140,10 +140,7 @@ scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *spec) {
 	usize h0 = h & meta_mask;
 	u8 h1 = hash1(h, d->b);
 
-#if PROBE_LEN == 32
-	v32u8 vh1 = v32u8_fill(h1);
-	v32u8 vempty = v32u8_fill(EMPTY);
-#elif PROBE_LEN == 16
+#ifdef USE_SIMD
 	v16u8 vh1 = v16u8_fill(h1);
 	v16u8 vempty = v16u8_fill(EMPTY);
 #else
@@ -160,12 +157,12 @@ scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *spec) {
 		usize i = mvi * PROBE_LEN;
 
 		/* We must load the metadata vector as little endian, so that low bits
-		 * in the vector represent earlier slots in the table and hence ctz
-		 * gives us earlier slots first. In the SIMD cases, we know we're on
-		 * x86, so a memcpy suffices. In the general fallback case, we use
-		 * readl64, which does a byte swap on big endian machines.
+		 * in the vector represent earlier slots in the table and hence
+		 * ctz/r8search gives us earlier slots first. In the SIMD cases, we
+		 * know we're on x86, so a memcpy suffices. In the portable case, we
+		 * use readl64, which does a byte swap on big endian machines.
 		 *
-		 * In the SIMD cases, m is a bit array indicating where vmeta equals
+		 * In the SIMD case, m is a bit array indicating where vmeta equals
 		 * h1. Thus, ctz(m) is the least j such that meta[j] == h1 (provided
 		 * m != 0).
 		 *
@@ -173,11 +170,7 @@ scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *spec) {
 		 * Thus, we can use r8search64 on z to find the indices j of h1 in
 		 * vmeta. After each iteration, we mask in an arbitrary bit into the
 		 * jth byte so that the next iteration gets a different j. */
-#if PROBE_LEN == 32
-		v32u8 vmeta; memcpy(&vmeta, meta+i, sizeof vmeta);
-		for (u32 m = v32u8_msb(v32u8_cmpeq(vmeta, vh1)); m != 0; m &= m - 1) {
-			int j = r_ctz32(m);
-#elif PROBE_LEN == 16
+#ifdef USE_SIMD
 		v16u8 vmeta; memcpy(&vmeta, meta+i, sizeof vmeta);
 		for (u16 m = v16u8_msb(v16u8_cmpeq(vmeta, vh1)); m != 0; m &= m - 1) {
 			int j = r_ctz16(m);
@@ -191,9 +184,7 @@ scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *spec) {
 			Page *p = pages + (match_idx - page_idx) * (spec->ksize + spec->vsize);
 			if likely (spec->eq(p + page_idx * spec->ksize, k, spec)) {
 				if (mode == DELETE) {
-#if PROBE_LEN == 32
-					if (v32u8_msb(v32u8_cmpeq(vmeta, vempty)) != 0) {
-#elif PROBE_LEN == 16
+#ifdef USE_SIMD
 					if (v16u8_msb(v16u8_cmpeq(vmeta, vempty)) != 0) {
 #else
 					if (r_r8search64(vmeta) != 8) {
@@ -211,9 +202,7 @@ scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *spec) {
 		}
 
 		/* Search for an EMPTY slot. */
-#if PROBE_LEN == 32
-		u32 m0 = v32u8_msb(v32u8_cmpeq(vmeta, vempty));
-#elif PROBE_LEN == 16
+#ifdef USE_SIMD
 		u16 m0 = v16u8_msb(v16u8_cmpeq(vmeta, vempty));
 #else
 		int j0 = r_r8search64(vmeta);
@@ -223,20 +212,7 @@ scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *spec) {
 		 * case we don't find an existing slot with the given key. We
 		 * prioritize filling TOMBSTONE slots to decrease the load factor. */
 		if (mode == CREATE && !create_slot_found) {
-#if PROBE_LEN == 32
-			v32u8 vtomb = v32u8_fill(TOMBSTONE);
-			u32 m1 = v32u8_msb(v32u8_cmpeq(vmeta, vtomb));
-			if (m1 != 0) {
-				create_slot_found = true;
-				create_idx = i + r_ctz32(m1);
-				create_replacing_tomb = true;
-			} else if (m0 != 0) {
-				create_slot_found = true;
-				create_idx = i + r_ctz32(m0);
-				create_replacing_tomb = false;
-				break;
-			}
-#elif PROBE_LEN == 16
+#ifdef USE_SIMD
 			v16u8 vtomb = v16u8_fill(TOMBSTONE);
 			u16 m1 = v16u8_msb(v16u8_cmpeq(vmeta, vtomb));
 			if (m1 != 0) {
@@ -266,7 +242,7 @@ scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *spec) {
 		}
 
 		/* If the page contains an EMPTY slot, the key can't be any further. */
-#if PROBE_LEN > 8
+#ifdef USE_SIMD
 		if (m0 != 0) break;
 #else
 		if (j0 != 8) break;
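
For reference, the SSE path that survives this commit boils down to a compare-and-movemask scan over one 16-byte probe group. Below is a minimal standalone sketch using raw SSE2 intrinsics in place of rcx's v16u8 wrappers; the wrapper-to-intrinsic mapping and the example values are assumptions, not code from this repo.

/* Sketch of the SSE probe step. Assumes v16u8_fill/v16u8_cmpeq/v16u8_msb
 * map onto _mm_set1_epi8/_mm_cmpeq_epi8/_mm_movemask_epi8. x86-64 only;
 * GCC/Clang for __builtin_ctz. */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
	/* One probe group: 16 metadata bytes, one per slot (made-up values). */
	uint8_t meta[16] = {3, 7, 42, 7, 0, 7, 9, 1, 7, 2, 2, 5, 6, 7, 8, 9};
	uint8_t h1 = 7; /* the 8-bit hash fragment we're probing for */

	__m128i vmeta, vh1 = _mm_set1_epi8((char)h1);
	memcpy(&vmeta, meta, sizeof vmeta); /* little-endian load: low byte = slot 0 */

	/* cmpeq sets each matching byte to 0xFF; movemask packs the bytes' high
	 * bits into a 16-bit mask, so bit j is set iff meta[j] == h1. */
	unsigned m = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(vmeta, vh1));

	/* Visit matches in slot order: ctz yields the lowest set bit (earliest
	 * slot), and m &= m - 1 clears it, as in scan()'s for loop. */
	for (; m != 0; m &= m - 1)
		printf("candidate slot %d\n", __builtin_ctz(m));
	return 0;
}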
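
The #else (portable) path leans on r_r8search64, which per the comments locates byte indices in a 64-bit metadata word loaded little endian by readl64. A plausible shape for such a helper is the classic SWAR zero-byte search sketched below; it is inferred from the call sites here (returning 8 when nothing matches, EMPTY presumably being the zero byte), not taken from rcx.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for r_r8search64: index of the first zero byte in
 * a 64-bit word (low byte = slot 0), or 8 if there is none. */
static int r8search64_sketch(uint64_t x) {
	/* Classic SWAR trick: (x - 0x01..01) & ~x & 0x80..80 sets the high bit
	 * of a byte wherever that byte of x is zero. Borrows can produce false
	 * positives only in bytes above the first zero byte, so taking the
	 * lowest set bit is always exact; finding later matches requires
	 * masking and re-searching, which matches what scan()'s comment
	 * describes. */
	uint64_t z = (x - 0x0101010101010101ull) & ~x & 0x8080808080808080ull;
	if (z == 0) return 8;           /* no zero byte in the word */
	return __builtin_ctzll(z) >> 3; /* bit index / 8 = byte index */
}

int main(void) {
	uint64_t meta = 0x09080706000407ffull; /* byte 3 (from the low end) is 0 */
	uint8_t h1 = 0x07;

	/* XORing with a broadcast of h1 turns h1 bytes into zero bytes, so the
	 * same search finds the first slot whose metadata equals h1; this is
	 * the "z" the scan() comment refers to. */
	uint64_t z = meta ^ (0x0101010101010101ull * h1);
	printf("first EMPTY slot: %d\n", r8search64_sketch(meta)); /* prints 3 */
	printf("first h1 match:   %d\n", r8search64_sketch(z));    /* prints 1 */
	return 0;
}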