rcx

library of miscellaneous bits of C code
git clone git://git.rr3.xyz/rcx

commit 58461ad3af99902e7ef1a93c40d90ae031f9d9b5
parent 798ef8714182bff677eb386c979d3b782b072667
Author: Robert Russell <robertrussell.72001@gmail.com>
Date:   Sun, 13 Aug 2023 15:15:52 -0700

Improve readability in dict.c

Diffstat:
M inc/def.h  |  18 +++++++++---------
M inc/dict.h |   2 +-
M src/dict.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
3 files changed, 116 insertions(+), 84 deletions(-)

diff --git a/inc/def.h b/inc/def.h
@@ -4,6 +4,11 @@
 #include <stddef.h>
 #include <stdint.h>
 
+/* Reasonable code assumes CHAR_BIT == 8. */
+#if CHAR_BIT != 8
+#error "Expected CHAR_BIT == 8"
+#endif
+
 #define JOIN_AUX(a,b) a##b
 #define JOIN(a,b) JOIN_AUX(a,b)
 
@@ -62,11 +67,6 @@
 #define STATIC(T, ...) ({static __typeof__(T) tmp = __VA_ARGS__; &tmp;})
 #endif
 
-/* Reasonable code assumes CHAR_BIT == 8. */
-#if CHAR_BIT != 8
-#error "Expected CHAR_BIT == 8"
-#endif
-
 #if !defined(R_LITTLE_ENDIAN) && !defined(R_BIG_ENDIAN)
 #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
 #define R_LITTLE_ENDIAN 1
@@ -83,10 +83,6 @@
 #endif
 #define R_CACHE_LINE_SIZE (1 << (R_CACHE_LINE_BITS))
 
-#ifdef __SIZEOF_INT128__
-#define R_HAVE_128 1
-#endif
-
 #ifdef __MMX__
 #define R_HAVE_MMX 1
 #endif
@@ -115,6 +111,10 @@
 #define R_HAVE_AVX2 1
 #endif
 
+#ifdef __SIZEOF_INT128__
+#define R_HAVE_128 1
+#endif
+
 /* Correct the mistakes of whoever named these macros */
 #define SHORT_MIN SHRT_MIN
 #define SHORT_MAX SHRT_MAX
diff --git a/inc/dict.h b/inc/dict.h
@@ -32,7 +32,7 @@ struct r_dict {
 	usize count;
 	usize ntombs;
 	void *mem;
-	void *data; /* Cache-aligned pointer into mem */
+	void *data; /* Cache-aligned pointer into mem; see dict.c for layout. */
 };
 
 static inline bool r_dict_mem_eq(void *a, void *b, RDictSpec *s) { return memcmp(a, b, s->ksize) == 0; }
diff --git a/src/dict.c b/src/dict.c
@@ -2,12 +2,30 @@
 #include <string.h>
 
 #include "bits.h"
+#include "debug.h"
 #include "dict.h"
 #include "internal/util.h"
 #include "log.h"
 #include "rand.h"
 #include "rcx.h"
 
+/* In a dict with n slots (i.e., the dict has room for n key-value pairs), the
+ * allocated dict data consists of n metadata bytes followed by n/PAGE_LEN
+ * pages, where a page consists of PAGE_LEN keys followed by PAGE_LEN values.
+ * A meta byte is either EMPTY, TOMBSTONE, or the most significant hash byte of
+ * the key stored in the corresponding slot (roughly speaking; see hash1). When
+ * scanning the dict, we use SIMD to probe PROBE_LEN cells at once. Thus, the
+ * dict size n is always at least MAX(PROBE_LEN, PAGE_LEN). Actually, we also
+ * require the dict size to be at least R_CACHE_LINE_SIZE, so that the meta
+ * bytes occupy a whole number of cache lines, and so that pages are
+ * cache-aligned in the common case of key types K and value types V such that
+ * sizeof(K) * PAGE_LEN and sizeof(V) * PAGE_LEN are both multiples of
+ * R_CACHE_LINE_SIZE. */
+
+/* TODO: At some point, we should replace SSE2 with AVX2... or should we?
+ * Longer probe vectors means higher chance of conflicts within one probe
+ * vector, so maybe SSE2 is actually better. Before replacing SSE2 with AVX2,
+ * be sure to benchmark and do some statistical analysis. */
 #if defined(__GNUC__) && defined(R_HAVE_SSE2)
 #include "simd.h"
 #define USE_SIMD 1
@@ -20,11 +38,14 @@
 
 /* PAGE_LEN should be a multiple of 8 for packing and alignment. */
 /* TODO: make these parameters programmer-controllable. PAGE_LEN == 8 is
- * a conservative choice. In most cases (i.e., ksize a multiple of 8),
- * PAGE_LEN == 1 causes no packing issues and would improve caching
- * performance with very large keys or values. */
+ * a conservative choice. In most cases (i.e., with the key size being a
+ * multiple of 8), PAGE_LEN == 1 causes no packing issues and would improve
+ * caching performance with very large keys or values. */
 #define PAGE_LEN 8u
 #define PAGE_LEN_BITS 3u
+
+/* The dict size doubles when count + ntombs exceeds the proportion of the
+ * dict size indicated by LF_NUM and LF_DEN. See exceeds_lf. */
 #define LF_NUM 3u
 #define LF_DEN 4u
 
@@ -32,8 +53,6 @@
 #define TOMBSTONE 1u
 #define MIN_HASH1 2u
 
-typedef u8 Page; /* For readability only; a page isn't really a u8. */
-
 enum scan_mode {
 	ACCESS = 0,
 	INSERT = 1,
@@ -56,12 +75,17 @@ exceeds_lf(u64 n, u8 b) {
 static int
 alloc_and_align(void **mem, void **data, u8 b, RDictSpec *s) {
 	usize len = USIZE_C(1) << b;
-	if (r_mul_will_overflow_(len, 1u + s->ksize + s->vsize)) goto fail;
-	usize dsize = len * (1u + s->ksize + s->vsize);
-	if (dsize > USIZE_MAX - (R_CACHE_LINE_SIZE - 1)) goto fail;
-	*mem = s->allocz((R_CACHE_LINE_SIZE - 1) + dsize);
+	/* XXX: We assume ksize and vsize are small, so computing bytes_per_slot
+	 * shouldn't overflow. */
+	usize bytes_per_slot = 1u + s->ksize + s->vsize;
+	if (r_mul_will_overflow_(len, bytes_per_slot)) goto fail;
+	usize data_size = len * bytes_per_slot;
+	if (data_size > USIZE_MAX - (R_CACHE_LINE_SIZE - 1)) goto fail;
+	usize mem_size = (R_CACHE_LINE_SIZE - 1) + data_size;
+	*mem = s->allocz(mem_size);
 	if (!*mem) goto fail;
-	*data = (void *)(((uptr)*mem + (R_CACHE_LINE_SIZE - 1)) & -(uptr)R_CACHE_LINE_SIZE);
+	*data = (void *)(((uptr)*mem + (R_CACHE_LINE_SIZE - 1))
+	                 & -(uptr)R_CACHE_LINE_SIZE);
 
 	return 0;
 
@@ -70,45 +94,46 @@ fail:
 	return -1;
 }
 
-/* This is a version of scan specialized for inserting while growing.
+/* grow_insert is a version of scan specialized for inserting while growing.
  * See the comments in scan for explanation. */
 static void
-grow_insert(u8 b, u8 *meta, Page *pages, void *k, void *v, u64 h, RDictSpec *s) {
-	usize meta_mask = (USIZE_C(1) << (b - PROBE_LEN_BITS)) - 1u;
+grow_insert(u8 b, u8 *meta, u8 *pages, void *k, void *v, u64 h, RDictSpec *s) {
+	usize mvi_mask = (USIZE_C(1) << (b - PROBE_LEN_BITS)) - 1u;
 
-	usize h0 = h & meta_mask;
+	usize h0 = h & mvi_mask;
 	u8 h1 = hash1(h, b);
 
 #ifdef USE_SIMD
 	v16u8 vempty = v16u8_fill(EMPTY);
 #endif
 
-	usize insert_idx;
-	for (usize mvi = h0, jump = 1;; mvi = (mvi + jump) & meta_mask, jump++) {
+	usize insert_slot;
+
+	for (usize mvi = h0, jump = 1;; mvi = (mvi + jump) & mvi_mask, jump++) {
 		usize i = mvi * PROBE_LEN;
 #ifdef USE_SIMD
 		v16u8 vmeta;
 		memcpy(&vmeta, meta+i, sizeof vmeta);
 		u16 m0 = v16u8_msb(v16u8_cmpeq(vmeta, vempty));
 		if (m0 != 0) {
-			insert_idx = i + r_ctz16(m0);
+			insert_slot = i + r_ctz16(m0);
 			break;
 		}
#else
 		u64 vmeta = r_readl64(meta+i);
 		int j0 = r_r8search64(vmeta);
 		if (j0 != 8) {
-			insert_idx = i + j0;
+			insert_slot = i + j0;
 			break;
 		}
#endif
 	}
 
-	meta[insert_idx] = h1;
-	usize page_idx = insert_idx % PAGE_LEN;
-	Page *p = pages + (insert_idx - page_idx) * (s->ksize + s->vsize);
-	memcpy(p + page_idx * s->ksize, k, s->ksize);
-	memcpy(p + PAGE_LEN * s->ksize + page_idx * s->vsize, v, s->vsize);
+	meta[insert_slot] = h1;
+	usize page_slot = insert_slot % PAGE_LEN;
+	u8 *page = pages + (insert_slot - page_slot) * (s->ksize + s->vsize);
+	memcpy(page + page_slot * s->ksize, k, s->ksize);
+	memcpy(page + PAGE_LEN * s->ksize + page_slot * s->vsize, v, s->vsize);
 }
 
 static int
@@ -117,22 +142,23 @@ grow(RDict *d, RDictSpec *s) {
 	usize oldlen = USIZE_C(1) << oldb;
 	void *oldmem = d->mem;
 	u8 *oldmeta = d->data;
-	Page *oldpages = (u8 *)d->data + oldlen;
+	u8 *oldpages = (u8 *)d->data + oldlen;
 
 	u8 newb = oldb + 1;
-	assert(newb < USIZE_BITS);
+	ASSERT(newb < USIZE_BITS);
 	usize newlen = USIZE_C(1) << newb;
 	void *newmem, *newdata;
 	if (alloc_and_align(&newmem, &newdata, newb, s) < 0) return -1;
 	u8 *newmeta = newdata;
-	Page *newpages = (u8 *)newdata + newlen;
-
-	for (usize i = 0; i < oldlen; i++) {
-		if (oldmeta[i] < MIN_HASH1) continue;
-		usize page_idx = i % PAGE_LEN;
-		Page *p = oldpages + (i - page_idx) * (s->ksize + s->vsize);
-		void *k = p + page_idx * s->ksize;
-		void *v = p + PAGE_LEN * s->ksize + page_idx * s->vsize;
+	u8 *newpages = (u8 *)newdata + newlen;
+
+	/* Rehash. */
+	for (usize slot = 0; slot < oldlen; slot++) {
+		if (oldmeta[slot] < MIN_HASH1) continue;
+		usize page_slot = slot % PAGE_LEN;
+		u8 *page = oldpages + (slot - page_slot) * (s->ksize + s->vsize);
+		void *k = page + page_slot * s->ksize;
+		void *v = page + PAGE_LEN * s->ksize + page_slot * s->vsize;
 		u64 h = s->hash(k, d->seed, s);
 		grow_insert(newb, newmeta, newpages, k, v, h, s);
 	}
@@ -147,14 +173,15 @@ grow(RDict *d, RDictSpec *s) {
 	return 0;
 }
 
+/* scan is inline, so that the mode argument becomes constant in all uses. */
 static inline int
 scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *s) {
-	/* meta_mask is a mask for reducing modulo the table size. */
-	usize meta_mask = (USIZE_C(1) << (d->b - PROBE_LEN_BITS)) - 1u;
 	u8 *meta = d->data;
-	Page *pages = (u8 *)d->data + (USIZE_C(1) << d->b);
+	u8 *pages = (u8 *)d->data + (USIZE_C(1) << d->b);
 
-	usize h0 = h & meta_mask;
+	usize mvi_mask = (USIZE_C(1) << (d->b - PROBE_LEN_BITS)) - 1u;
+
+	usize h0 = h & mvi_mask;
 	u8 h1 = hash1(h, d->b);
 
 #ifdef USE_SIMD
@@ -164,81 +191,85 @@ scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *s) {
 	u64 vh1;
 	memset(&vh1, h1, sizeof vh1);
 #endif
 
+	/* These variables should get optimized away when mode != INSERT. */
 	bool insert_slot_found = false;
-	usize insert_idx;
+	usize insert_slot;
 	bool insert_replacing_tomb;
+
 	/* mvi is the metadata vector index (where each metadata vector is a
 	 * u8[PROBE_LEN]), and i is the corresponding index in the meta table.
 	 * Note that mvi is increased quadratically. */
-	for (usize mvi = h0, jump = 1;; mvi = (mvi + jump) & meta_mask, jump++) {
+	for (usize mvi = h0, jump = 1;; mvi = (mvi + jump) & mvi_mask, jump++) {
 		usize i = mvi * PROBE_LEN;
 		/* We must load the metadata vector as little endian, so that low bits
 		 * in the vector represent earlier slots in the table and hence
-		 * ctz/r8search gives us earlier slots first. In the SIMD cases, we
+		 * ctz/r8search gives us earlier slots first. In the SIMD case, we
 		 * know we're on x86, so a memcpy suffices. In the portable case, we
-		 * use readl64, which does a byte swap on big endian machines.
-		 *
-		 * In the SIMD case, m is a bit array indicating where vmeta equals
-		 * h1. Thus, ctz(m) is the least j such that meta[j] == h1 (provided
-		 * m != 0).
-		 *
-		 * In the portable case, z initially has 0s exactly where vmeta has h1.
-		 * Thus, we can use r8search64 on z to find the indices j of h1 in
-		 * vmeta. After each iteration, we mask in an arbitrary bit into the
-		 * jth byte so that the next iteration gets a different j. */
+		 * use readl64, which does a byte swap on big endian machines. */
#ifdef USE_SIMD
 		v16u8 vmeta;
 		memcpy(&vmeta, meta+i, sizeof vmeta);
+		/* m is a bit array indicating where vmeta equals h1. Thus, ctz(m) is
+		 * the least j such that meta[j] == h1 (provided m != 0). */
 		for (u16 m = v16u8_msb(v16u8_cmpeq(vmeta, vh1)); m != 0; m &= m - 1) {
 			int j = r_ctz16(m);
#else
 		u64 vmeta = r_readl64(meta+i);
+		/* Initially, z has 0x00 exactly where vmeta has h1. Thus, we can use
+		 * r8search64 on z to find the indices j of h1 in vmeta. After each
+		 * iteration, we mask in an arbitrary bit into the jth byte so that
+		 * the next iteration gets a different j. */
 		u64 z = vmeta ^ vh1;
 		for (int j = r_r8search64(z); j != 8; z |= U64_C(1)<<(8*j), j = r_r8search64(z)) {
#endif
-			usize match_idx = i + j;
-			usize page_idx = match_idx % PAGE_LEN;
-			Page *p = pages + (match_idx - page_idx) * (s->ksize + s->vsize);
-			if likely (s->eq(p + page_idx * s->ksize, k, s)) {
+			usize slot = i + j;
+			usize page_slot = slot % PAGE_LEN;
+			u8 *page = pages + (slot - page_slot) * (s->ksize + s->vsize);
+			if likely (s->eq(page + page_slot * s->ksize, k, s)) {
 				if (mode == DELETE) {
+					/* If the probe vector contains an empty slot, then we
+					 * don't need to insert a tombstone, and fewer tombstones
+					 * means lower load factor. */
#ifdef USE_SIMD
 					if (v16u8_msb(v16u8_cmpeq(vmeta, vempty)) != 0) {
#else
 					if (r_r8search64(vmeta) != 8) {
#endif
-						meta[match_idx] = EMPTY;
+						meta[slot] = EMPTY;
 					} else {
-						meta[match_idx] = TOMBSTONE;
+						meta[slot] = TOMBSTONE;
 						d->ntombs += 1;
 					}
 					d->count -= 1;
 				}
-				*v = p + PAGE_LEN * s->ksize + page_idx * s->vsize;
+				*v = page + PAGE_LEN * s->ksize + page_slot * s->vsize;
 				return 1;
 			}
 		}
 
-		/* Search for an EMPTY slot. */
+		/* The key is not in this probe vector. Search for an empty slot to
+		 * determine if we need to check more probe vectors. */
#ifdef USE_SIMD
 		u16 m0 = v16u8_msb(v16u8_cmpeq(vmeta, vempty));
#else
 		int j0 = r_r8search64(vmeta);
#endif
-		/* In mode INSERT, we need to look for an EMPTY or TOMBSTONE slot in
-		 * case we don't find an existing slot with the given key. We
-		 * prioritize filling TOMBSTONE slots to decrease the load factor. */
+		/* When inserting, we need to remember the first empty slot or
+		 * tombstone seen in case we don't find an existing slot with the
+		 * given key. We prioritize replacing tombstones to decrease the
+		 * load factor. */
 		if (mode == INSERT && !insert_slot_found) {
#ifdef USE_SIMD
 			v16u8 vtomb = v16u8_fill(TOMBSTONE);
 			u16 m1 = v16u8_msb(v16u8_cmpeq(vmeta, vtomb));
 			if (m1 != 0) {
 				insert_slot_found = true;
-				insert_idx = i + r_ctz16(m1);
+				insert_slot = i + r_ctz16(m1);
 				insert_replacing_tomb = true;
 			} else if (m0 != 0) {
 				insert_slot_found = true;
-				insert_idx = i + r_ctz16(m0);
+				insert_slot = i + r_ctz16(m0);
 				insert_replacing_tomb = false;
 				break;
 			}
@@ -247,18 +278,17 @@ scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *s) {
 			int j1 = r_r8search64(vmeta ^ vtomb);
 			if (j1 != 8) {
 				insert_slot_found = true;
-				insert_idx = i + j1;
+				insert_slot = i + j1;
 				insert_replacing_tomb = true;
 			} else if (j0 != 8) {
 				insert_slot_found = true;
-				insert_idx = i + j0;
+				insert_slot = i + j0;
 				insert_replacing_tomb = false;
 				break;
 			}
#endif
 		}
 
-		/* If the page contains an EMPTY slot, the key can't be any further. */
#ifdef USE_SIMD
 		if (m0 != 0) break;
#else
@@ -267,13 +297,16 @@ scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *s) {
 	}
 
 	if (mode == INSERT) {
+		/* The key wasn't found, so insert in the first available slot found
+		 * along the probe sequence. */
+		ASSERT(insert_slot_found);
 		d->count += 1;
 		if (insert_replacing_tomb) d->ntombs -= 1;
-		meta[insert_idx] = h1;
-		usize page_idx = insert_idx % PAGE_LEN;
-		Page *p = pages + (insert_idx - page_idx) * (s->ksize + s->vsize);
-		memcpy(p + page_idx * s->ksize, k, s->ksize);
-		*v = p + PAGE_LEN * s->ksize + page_idx * s->vsize;
+		meta[insert_slot] = h1;
+		usize page_slot = insert_slot % PAGE_LEN;
+		u8 *page = pages + (insert_slot - page_slot) * (s->ksize + s->vsize);
+		memcpy(page + page_slot * s->ksize, k, s->ksize);
+		*v = page + PAGE_LEN * s->ksize + page_slot * s->vsize;
 	}
 
 	return 0;
@@ -289,7 +322,7 @@ r_dict_init(RDict *d, usize hint, RDictSpec *s) {
 	u8 b = MAX(R_CACHE_LINE_BITS, PROBE_LEN_BITS);
 	b = MAX(b, PAGE_LEN_BITS);
 	while (exceeds_lf(hint, b)) b += 1;
-	assert(b < USIZE_BITS);
+	ASSERT(b < USIZE_BITS);
 	d->b = b;
 
 	if (alloc_and_align(&d->mem, &d->data, b, s) < 0) return -1;
@@ -317,8 +350,7 @@ r_dict_access(RDict *d, void **v, void *k, u64 h, RDictSpec *s) {
 int
 r_dict_insert(RDict *d, void **v, void *k, u64 h, RDictSpec *s) {
 	if (exceeds_lf(d->count + d->ntombs, d->b)) {
-		if (grow(d, s) < 0)
-			return -1;
+		if (grow(d, s) < 0) return -1;
 	}
 	return scan(d, v, k, h, INSERT, s);
 }
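Note: the layout comment added to dict.c above implies fixed address
arithmetic for each slot, which the renamed page_slot and page variables
spell out. The following is a minimal illustrative sketch, not part of the
commit; slot_key and slot_val are hypothetical helpers, assuming one meta
byte per slot and PAGE_LEN == 8 as in dict.c.

    /* Illustrative sketch only; not rcx code. */
    #include <stddef.h>
    #include <stdint.h>

    #define PAGE_LEN 8u

    /* Pointer to the key stored in `slot`. `pages` points just past the n
     * meta bytes; each page holds PAGE_LEN keys followed by PAGE_LEN values. */
    static void *
    slot_key(uint8_t *pages, size_t slot, size_t ksize, size_t vsize)
    {
        size_t page_slot = slot % PAGE_LEN;  /* index within the page */
        /* (slot - page_slot) is the page index times PAGE_LEN, so this is
         * the byte offset of the page's base. */
        uint8_t *page = pages + (slot - page_slot) * (ksize + vsize);
        return page + page_slot * ksize;     /* keys come first in a page */
    }

    /* Pointer to the value stored in `slot`: skip the page's PAGE_LEN keys. */
    static void *
    slot_val(uint8_t *pages, size_t slot, size_t ksize, size_t vsize)
    {
        size_t page_slot = slot % PAGE_LEN;
        uint8_t *page = pages + (slot - page_slot) * (ksize + vsize);
        return page + PAGE_LEN * ksize + page_slot * vsize;
    }

For scale, the new LF_NUM/LF_DEN comment means a dict with 1 << b == 64 slots
doubles once count + ntombs exceeds 64 * 3 / 4 == 48.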