rcx

library of miscellaneous bits of C code
git clone git://git.rr3.xyz/rcx
Log | Files | Refs | README | LICENSE

commit 7657930c27c7c96a5fa55334cfc87eb4404ba57f
parent ef66779c5a7e01591a175eb50a466cdb84679940
Author: Robert Russell <robertrussell.72001@gmail.com>
Date:   Sat,  3 Jun 2023 18:46:09 -0700

Progress on SIMDing dict

Not complete. Also, I need to rethink the dict structure. Storing
PAGE_LEN metas, then PAGE_LEN keys, and then PAGE_LEN vals does not
make much sense when PAGE_LEN is large. (The point before was that
the cache miss to read a meta/key would probably prefetch the
corresponding value, thus saving a cache miss.) It probably makes
more sense to have all meta bytes in their own array (but still same
allocation, of course) and stagger keys/vals in length 8 arrays
(like before).

Diffstat:
Msrc/dict.c | 159++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
1 file changed, 118 insertions(+), 41 deletions(-)

diff --git a/src/dict.c b/src/dict.c @@ -7,16 +7,21 @@ #include "rand.h" #include "rcx.h" -/* PAGE_LEN should be a multiple of the machine word size (i.e., 8 on 64-bit - * arches) for optimal packing and to prevent alignment issues. There are - * places where we assume it is exactly 8, so don't change this without - * auditing the code. */ +#if defined(__GNUC__) && defined(R_HAVE_AVX2) && 0 +#include "simd.h" +#define PAGE_LEN 32u +#elif defined(__GNUC__) && defined(R_HAVE_SSE2) && 0 +#include "simd.h" +#define PAGE_LEN 16u +#else #define PAGE_LEN 8u +#endif + #define LF_NUM 3u #define LF_DEN 4u #define EMPTY 0u #define TOMBSTONE 1u -#define MIN_HASH2 2u +#define MIN_HASH1 2u typedef struct r_dict_spec Spec; typedef struct r_dict Dict; @@ -39,7 +44,7 @@ struct r_dict { }; struct page { - u8 h2[PAGE_LEN]; + u8 meta[PAGE_LEN]; /* K key[PAGE_LEN]; */ /* V val[PAGE_LEN]; */ }; @@ -49,15 +54,10 @@ hash0(u64 h, u8 b) { return h & ((U64_C(1) << b) - 1u); } -static inline u64 -hash1(u64 h, u8 b) { - return (h >> b) & (PAGE_LEN - 1u); -} - static inline u8 -hash2(u64 h, u8 b) { +hash1(u64 h, u8 b) { u64 t = h >> 56; - if (t < MIN_HASH2) t += MIN_HASH2; + if (t < MIN_HASH1) t += MIN_HASH1; return t; } @@ -67,6 +67,8 @@ exceeds_lf(u64 n, u8 b) { return LF_DEN * n > LF_NUM * PAGE_LEN * (U64_C(1) << b); } +/* TODO: try computing psize when in dict.h instead. */ + /* Compute Page size *without* overflow checks. */ static inline usize psize_unsafe(Spec *spec) { @@ -103,20 +105,23 @@ val(Page *p, usize j, Spec *spec) { static void insert(Page *pages, u8 b, void *k, void *v, u64 h, Spec *spec) { - usize psize = psize_unsafe(spec); + assert(false); + usize psize = psize_unsafe(spec); u64 h0 = hash0(h, b); - u64 h1 = hash1(h, b); - u8 h2 = hash2(h, b); + u8 h1 = hash1(h, b); usize pn = h0; for (usize i = 1;; i++) { Page *p = (void *)((u8 *)pages + pn*psize); + /* TODO: this is wrong. Should probably just generalize scan and + * replace insert with scan. Otherwise we need a bunch of #ifs here. */ + usize j = h1; do { - if (p->h2[j] < MIN_HASH2) { - p->h2[j] = h2; + if (p->meta[j] < MIN_HASH1) { + p->meta[j] = h1; memcpy(key(p, j, spec), k, spec->ksize); memcpy(val(p, j, spec), v, spec->vsize); return; @@ -152,7 +157,7 @@ grow(Dict *d, Spec *spec) { Page *p = (void *)((u8 *)old + i*psize); for (usize j = 0; j < PAGE_LEN; j++) { - if (p->h2[j] < MIN_HASH2) continue; + if (p->meta[j] < MIN_HASH1) continue; void *k = key(p, j, spec); void *v = val(p, j, spec); u64 h = spec->hash(k, d->seed, spec); @@ -182,31 +187,55 @@ scan(Dict *d, void **v, void *k, u64 h, int mode, Spec *spec) { usize psize = psize_unsafe(spec); u64 h0 = hash0(h, d->b); - u64 h1 = hash1(h, d->b); - u8 h2 = hash2(h, d->b); + u8 h1 = hash1(h, d->b); + +#if PAGE_LEN == 32 + v32u8 vh1 = v32u8_fill(h1); + v32u8 vempty = v32u8_fill(EMPTY); +#elif PAGE_LEN == 16 + v16u8 vh1 = v16u8_fill(h1); + v16u8 vempty = v16u8_fill(EMPTY); +#else + u64 vh1; memset(&vh1, h1, sizeof vh1); +#endif usize pn = h0; Page *createp = 0; - usize createj = 0; /* XXX: this does not need to be initialized, but GCC doesn't realize that */ + usize createj; + bool createtomb; for (usize i = 1;; i++) { Page *p = (void *)((u8 *)d->pages + pn*psize); - bool seen_empty = false; - usize j = h1; - do { - if (p->h2[j] < MIN_HASH2) { - seen_empty = p->h2[j] == EMPTY; - if (!createp) { - createp = p; - createj = j; - } - } else if (p->h2[j] == h2 && spec->eq(key(p, j, spec), k, spec)) { - /* TODO: try branch predict hint true on eq */ + /* We must load the metadata vector as little endian, so that low bits + * in the vector represent earlier slots in the table and hence ctz + * gives us earlier slots first. In the SIMD cases, we know we're on + * x86, so a memcpy suffices. In the general fallback case, we use + * readl64, which does a byte swap on big endian machines. */ +#if PAGE_LEN == 32 + v32u8 meta; memcpy(&meta, &p->meta, sizeof meta); + for (u32 m = v32u8_msb(v32u8_cmpeq(meta, vh1)); m != 0; m &= m - 1) { + int j = r_ctz32(m); +#elif PAGE_LEN == 16 + v16u8 meta; memcpy(&meta, &p->meta, sizeof meta); + for (u16 m = v16u8_msb(v16u8_cmpeq(meta, vh1)); m != 0; m &= m - 1) { + int j = r_ctz16(m); +#else + u64 meta = r_readl64(&p->meta); + u64 z = vh1 ^ meta; + for (int j = r_r8search64(z); j < 8; z |= U64_C(1) << (8*j), j = r_r8search64(z)) { +#endif + if likely (spec->eq(key(p, j, spec), k, spec)) { if (mode == DELETE) { - if (seen_empty || r_r8search64(r_readh64(p->h2)) < 8) { - p->h2[j] = EMPTY; +#if PAGE_LEN == 32 + if (v32u8_msb(v32u8_cmpeq(meta, vempty)) != 0) { +#elif PAGE_LEN == 16 + if (v16u8_msb(v16u8_cmpeq(meta, vempty)) != 0) { +#else + if (r_r8search64(meta) < 8) { +#endif + p->meta[j] = EMPTY; } else { - p->h2[j] = TOMBSTONE; + p->meta[j] = TOMBSTONE; d->ntombs += 1; } d->count -= 1; @@ -214,18 +243,66 @@ scan(Dict *d, void **v, void *k, u64 h, int mode, Spec *spec) { *v = val(p, j, spec); return 1; } + } - j = (j + 1) & (PAGE_LEN - 1); - } while (j != h1); + /* Search for an EMPTY slot. */ +#if PAGE_LEN == 32 + u32 m0 = v32u8_msb(v32u8_cmpeq(meta, vempty)); +#elif PAGE_LEN == 16 + u16 m0 = v16u8_msb(v16u8_cmpeq(meta, vempty)); +#else + int j0 = r_r8search64(meta); +#endif + + /* In mode CREATE, we need to look for an EMPTY or TOMBSTONE slot in + * case we don't find an existing slot with the given key. We + * prioritize filling TOMBSTONE slots to decrease the load factor. */ + if (mode == CREATE && !createp) { +#if PAGE_LEN == 32 + v32u8 vtomb = v32u8_fill(TOMBSTONE); + u32 m1 = v32u8_msb(v32u8_cmpeq(meta, vtomb)); + if (m1 != 0) { + createp = p; createj = r_ctz32(m1); createtomb = true; + } else if (m0 != 0) { + createp = p; createj = r_ctz32(m0); createtomb = false; + break; + } +#elif PAGE_LEN == 16 + v16u8 vtomb = v16u8_fill(TOMBSTONE); + u16 m1 = v16u8_msb(v16u8_cmpeq(meta, vtomb)); + if (m1 != 0) { + createp = p; createj = r_ctz16(m1); createtomb = true; + } else if (m0 != 0) { + createp = p; createj = r_ctz16(m0); createtomb = false; + break; + } +#else + u64 vtomb; memset(&vtomb, TOMBSTONE, sizeof vtomb); + int j1 = r_r8search64(meta ^ vtomb); + if (j1 != 8) { + createp = p; createj = j1; createtomb = true; + } else if (j0 != 8) { + createp = p; createj = j0; createtomb = false; + break; + } +#endif + } - if (seen_empty) break; + /* If the page contains an EMPTY slot, the key can't be any further. */ +#if PAGE_LEN > 8 + if (m0 != 0) break; +#else + if (j0 != 8) break; +#endif + /* Quadratic probing between pages */ pn = (pn + i) & ((USIZE_C(1) << d->b) - 1u); } if (mode == CREATE) { d->count += 1; - createp->h2[createj] = h2; + if (createtomb) d->ntombs -= 1; + createp->meta[createj] = h1; /* TODO: try returning a pointer to the key slot instead, so that the * compiler can generate copy code when ksize is a constant */ memcpy(key(createp, createj, spec), k, spec->ksize); @@ -283,7 +360,7 @@ r_dict_clear(Dict *d, Spec *spec) { usize len = USIZE_C(1) << d->b; for (usize i = 0; i < len; i++) { Page *p = (void *)((u8 *)d->pages + i*psize); - memset(p->h2, EMPTY, sizeof p->h2); + memset(p->meta, EMPTY, sizeof p->meta); } d->count = 0;