rcx

library of miscellaneous bits of C code
git clone git://git.rr3.xyz/rcx

commit ae1d9dd5a0d6ea1232b35d3001c88b7c67cc385a
parent eaa709cff6d7a23ede478704e902dc723b220fa0
Author: Robert Russell <robertrussell.72001@gmail.com>
Date:   Sun, 24 Sep 2023 19:40:41 -0700

Abstract dict implementation into generic table data structure

This will form the basis of a new hash map ("dict") and hash set
implementation. Table is not meant to be used directly; the methods
are not very ergonomic.
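
For illustration, a u32-keyed, u32-valued table could be instantiated through
the new inc/table/static.h roughly as in the sketch below (assuming inc/ is on
the include path). The u32tbl_ prefix and the driver function are
hypothetical; RTable, struct u32tbl_spec, r_table_mem_eq and r_table_mem_hash
come from the new table headers, and r_eallocz from alloc.h.

    #include <stdlib.h>             /* free */

    /* Only TABLE_METHOD is defined here, so every other setting travels in
     * the generated spec struct at run time. */
    #define TABLE_METHOD(name) u32tbl_##name
    #include "table/static.h"       /* static u32tbl_init/acc/ins/del/clr/free/len */

    static UNUSED int
    u32tbl_example(void) {
        struct u32tbl_spec spec = {
            .ksize  = sizeof(u32),
            .vsize  = sizeof(u32),
            .eq     = r_table_mem_eq,
            .hash   = r_table_mem_hash,
            .allocz = r_eallocz,
            .free   = free,
        };

        RTable t;
        if (u32tbl_init(&t, 100, &spec) < 0) return -1;

        /* The caller computes the hash and copies values through the
         * returned slot pointer; this is the unergonomic part the dict and
         * set wrappers are meant to hide. */
        u32 k = 42, v = 7;
        u64 h = r_table_mem_hash(&k, t.seed, sizeof k);
        void *slot;
        if (u32tbl_ins(&t, &slot, &k, h, &spec) >= 0)
            memcpy(slot, &v, sizeof v);

        if (u32tbl_acc(&t, &slot, &k, h, &spec) > 0)
            memcpy(&v, slot, sizeof v);

        u32tbl_free(&t, &spec);
        return 0;
    }

With the default TABLE_LF_NUM/TABLE_LF_DEN of 3/4, such a table doubles once
count + ntombs exceeds three quarters of the slots: at b = 8 (256 slots), once
count + ntombs reaches 193 the next insert grows the table to 512 slots and
drops tombstones during the rehash.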

Diffstat:
M Makefile | 2 --
M inc/all.h | 2 --
D inc/dict-defaults.h | 17 -----------------
D inc/dict.h | 115 -------------------------------------------------------------------------------
A inc/table/declare.h | 13 +++++++++++++
A inc/table/define.h | 15 +++++++++++++++
A inc/table/impl/common.h | 43 +++++++++++++++++++++++++++++++++++++++++++
A inc/table/impl/declare.h | 43 +++++++++++++++++++++++++++++++++++++++++++
A inc/table/impl/define.h | 355 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A inc/table/impl/macro.h | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A inc/table/impl/unmacro.h | 19 +++++++++++++++++++
A inc/table/static.h | 18 ++++++++++++++++++
D src/dict.c | 373 -------------------------------------------------------------------------------
13 files changed, 579 insertions(+), 509 deletions(-)
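
Of the new headers, inc/table/static.h instantiates the whole table as static
functions in a single translation unit (as in the sketch above), while
inc/table/declare.h and inc/table/define.h let a wrapper split declarations
from definitions; impl/unmacro.h clears the TABLE_* macros after every
inclusion, so each file sets them up again. A hypothetical mytbl wrapper might
look like:

    /* mytbl.h (hypothetical) */
    #define TABLE_METHOD(name) mytbl_##name
    #include "table/declare.h"    /* RTable, struct mytbl_spec, mytbl_* prototypes */

    /* mytbl.c (hypothetical) */
    #include "mytbl.h"
    #define TABLE_METHOD(name) mytbl_##name
    #include "table/define.h"     /* out-of-line mytbl_* definitions */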

diff --git a/Makefile b/Makefile @@ -6,7 +6,6 @@ SRC =\ src/alloc.c\ src/bench.c\ src/debug.c\ - src/dict.c\ src/log.c\ src/rand.c\ src/str.c\ @@ -26,7 +25,6 @@ librcx.a: $(SRC:.c=.o) src/alloc.o: src/alloc.c inc/alloc.h inc/def.h inc/log.h inc/rcx.h inc/internal/util.h config.mk src/bench.o: src/bench.c inc/bench.h inc/def.h inc/log.h inc/rcx.h config.mk src/debug.o: src/debug.c inc/debug.h inc/def.h inc/rcx.h config.mk -src/dict.o: src/dict.c inc/bits.h inc/debug.h inc/def.h inc/dict.h inc/dict-defaults.h inc/internal/util.h inc/log.h inc/rand.h inc/rcx.h inc/string.h config.mk src/log.o: src/log.c inc/def.h inc/log.h inc/rcx.h config.mk src/opt.o: src/opt.c inc/def.h inc/opt.h inc/rcx.h config.mk src/rand.o: src/rand.c inc/bits.h inc/def.h inc/rand.h inc/rcx.h inc/unix.h config.mk diff --git a/inc/all.h b/inc/all.h @@ -4,10 +4,8 @@ #include "bits.h" #include "debug.h" #include "deque.h" -#include "dict.h" #include "error.h" #include "log.h" -#include "opt.h" #include "rand.h" #include "rcx.h" #include "str.h" diff --git a/inc/dict-defaults.h b/inc/dict-defaults.h @@ -1,17 +0,0 @@ -#undef R_DICT_STATIC -#define R_DICT_STATIC - -#undef R_DICT_METHOD -#define R_DICT_METHOD(name, prefix) JOIN(JOIN(prefix,_),name) - -#undef R_DICT_EQ -#define R_DICT_EQ r_dict_mem_eq - -#undef R_DICT_HASH -#define R_DICT_HASH r_dict_mem_hash - -#undef R_DICT_ALLOCZ -#define R_DICT_ALLOCZ r_eallocz - -#undef R_DICT_FREE -#define R_DICT_FREE free diff --git a/inc/dict.h b/inc/dict.h @@ -1,114 +0,0 @@ -#pragma once - -#include <stdbool.h> -#include <string.h> - -#include "debug.h" -#include "def.h" -#include "rand.h" -#include "string.h" - -typedef struct r_dict_spec RDictSpec; -typedef struct r_dict RDict; -typedef bool (*RDictEqFunc)(void *a, void *b, RDictSpec *s); -typedef u64 (*RDictHashFunc)(void *k, u64 seed, RDictSpec *s); -typedef void *(*RDictAlloczFunc)(usize size); -typedef void (*RDictFreeFunc)(void *p); - -/* RDictSpec is for passing user settings in to the shared part of the - * implementation in dict.c. */ -struct r_dict_spec { - usize ksize; - usize vsize; - RDictEqFunc eq; - RDictHashFunc hash; - RDictAlloczFunc allocz; - RDictFreeFunc free; -}; - -struct r_dict { - u8 b; /* Log2 of number of slots */ - u32 niter; - u64 seed; - usize count; - usize ntombs; - void *mem; - void *data; /* Cache-aligned pointer into mem; see dict.c for layout. */ -}; - -static inline bool r_dict_mem_eq(void *a, void *b, RDictSpec *s) { return memcmp(a, b, s->ksize) == 0; } -static inline u64 r_dict_mem_hash(void *k, u64 seed, RDictSpec *s) { return r_hash(k, s->ksize, seed); } - -static inline bool r_dict_cstr_eq(char **a, char **b, RDictSpec *s) { return strcmp(*a, *b) == 0; } -static inline u64 r_dict_cstr_hash(char **k, u64 seed, RDictSpec *s) { return r_hash(*k, strlen(*k), seed); } - -static inline bool r_dict_rstr_eq(Str *a, Str *b, RDictSpec *s) { return str_eq(*a, *b); } -static inline u64 r_dict_rstr_hash(Str *k, u64 seed, RDictSpec *s) { return r_hash(str_bytes(*k), str_len(*k), seed); } - -/* TODO: add functions for shrinking dict and reserving space? 
*/ -int r_dict_init(RDict *d, usize hint, RDictSpec *s); -void r_dict_free(RDict *d, RDictSpec *s); -int r_dict_access(RDict *d, void **v, void *k, u64 h, RDictSpec *s); -int r_dict_insert(RDict *d, void **v, void *k, u64 h, RDictSpec *s); -int r_dict_delete(RDict *d, void **v, void *k, u64 h, RDictSpec *s); -void r_dict_clear(RDict *d, RDictSpec *s); - -#include "dict-defaults.h" - -#define R_DICT_TYPEDEF(D, K, V)\ -typedef struct D { \ - RDict d; \ - V default_val; \ -} D; - -#define R_DICT_SPEC_(K,V) &(RDictSpec){ \ - .ksize = sizeof(K), \ - .vsize = sizeof(V), \ - .eq = (RDictEqFunc)(R_DICT_EQ), \ - .hash = (RDictHashFunc)(R_DICT_HASH), \ - .allocz = (RDictAlloczFunc)(R_DICT_ALLOCZ), \ - .free = (RDictFreeFunc)(R_DICT_FREE), \ - } - -#define R_DICT_DECLARE(D, K, V, ...)\ -static inline UNUSED usize R_DICT_METHOD(len,##__VA_ARGS__)(D *d) { return d->d.count; } \ -static inline UNUSED void R_DICT_METHOD(set_default,##__VA_ARGS__)(D *d, V v) { d->default_val = v; } \ -static inline UNUSED V R_DICT_METHOD(get_default,##__VA_ARGS__)(D *d) { return d->default_val; } \ -static inline UNUSED int R_DICT_METHOD(init,##__VA_ARGS__)(D *d, usize hint) { \ - memset(&d->default_val, 0, sizeof(V)); \ - return r_dict_init(&d->d, hint, R_DICT_SPEC_(K,V)); \ -} \ -static inline UNUSED void R_DICT_METHOD(free,##__VA_ARGS__)(D *d) { r_dict_free(&d->d, R_DICT_SPEC_(K,V)); } \ -static inline UNUSED V *R_DICT_METHOD(ptr,##__VA_ARGS__)(D *d, K k) { \ - u64 h = R_DICT_HASH(&k, d->d.seed, R_DICT_SPEC_(K,V)); /* Allow hash to be inlined. */ \ - void *vp = 0; r_dict_access(&d->d, &vp, &k, h, R_DICT_SPEC_(K,V)); \ - return vp; \ -} \ -static inline UNUSED bool R_DICT_METHOD(get,##__VA_ARGS__)(D *d, V *v, K k) { \ - u64 h = R_DICT_HASH(&k, d->d.seed, R_DICT_SPEC_(K,V)); /* Allow hash to be inlined. */ \ - void *vp; int r = r_dict_access(&d->d, &vp, &k, h, R_DICT_SPEC_(K,V)); \ - if (v) *v = r > 0 ? *(V *)vp : d->default_val; \ - return r > 0; \ -} \ -static inline UNUSED bool R_DICT_METHOD(set,##__VA_ARGS__)(D *d, K k, V v) { \ - u64 h = R_DICT_HASH(&k, d->d.seed, R_DICT_SPEC_(K,V)); /* Allow hash to be inlined. */ \ - void *vp; int r = r_dict_access(&d->d, &vp, &k, h, R_DICT_SPEC_(K,V)); \ - if (r > 0) *(V *)vp = v; \ - return r > 0; \ -} \ -static inline UNUSED int R_DICT_METHOD(put,##__VA_ARGS__)(D *d, K k, V v) { \ - ASSERT(d->d.niter == 0, "can not call put while iterating through dict"); \ - u64 h = R_DICT_HASH(&k, d->d.seed, R_DICT_SPEC_(K,V)); /* Allow hash to be inlined. */ \ - void *vp; int r = r_dict_insert(&d->d, &vp, &k, h, R_DICT_SPEC_(K,V)); \ - if (r >= 0) *(V *)vp = v; \ - return r; \ -} \ -static inline UNUSED bool R_DICT_METHOD(del,##__VA_ARGS__)(D *d, V *v, K k) { \ - u64 h = R_DICT_HASH(&k, d->d.seed, R_DICT_SPEC_(K,V)); /* Allow hash to be inlined. */ \ - void *vp; int r = r_dict_delete(&d->d, &vp, &k, h, R_DICT_SPEC_(K,V)); \ - if (v) *v = r > 0 ? *(V *)vp : d->default_val; \ - return r > 0; \ -} \ -static inline UNUSED void R_DICT_METHOD(clr,##__VA_ARGS__)(D *d) { r_dict_clear(&d->d, R_DICT_SPEC_(K,V)); } - -#define R_DICT_DEFINE(D, K, V, ...) /* No op. Everything is static inline. 
*/ -\ No newline at end of file diff --git a/inc/table/declare.h b/inc/table/declare.h @@ -0,0 +1,13 @@ +#include <errno.h> +#include <stdbool.h> +#include <string.h> + +#include "../alloc.h" +#include "../def.h" +#include "../rand.h" +#include "../string.h" + +#include "impl/macro.h" +#include "impl/common.h" +#include "impl/declare.h" +#include "impl/unmacro.h" diff --git a/inc/table/define.h b/inc/table/define.h @@ -0,0 +1,15 @@ +#include <errno.h> +#include <stdbool.h> +#include <string.h> + +#include "../alloc.h" +#include "../bits.h" +#include "../debug.h" +#include "../def.h" +#include "../internal/util.h" +#include "../rand.h" +#include "../string.h" + +#include "impl/macro.h" +#include "impl/define.h" +#include "impl/unmacro.h" diff --git a/inc/table/impl/common.h b/inc/table/impl/common.h @@ -0,0 +1,43 @@ +#pragma once + +typedef bool (*RTableEqFunc)(void *a, void *b, usize ksize); +typedef u64 (*RTableHashFunc)(void *k, u64 seed, usize ksize); +typedef void *(*RTableAlloczFunc)(usize size); +typedef void (*RTableFreeFunc)(void *p); +typedef struct r_table RTable; + +struct r_table { + u8 b; /* Log2 of number of slots */ + u64 seed; + usize count; + usize ntombs; + void *mem; + void *data; /* Cache-aligned pointer into mem; see define.h for layout. */ +}; + +static inline bool +r_table_mem_eq(void *a, void *b, usize ksize) { + return memcmp(a, b, ksize) == 0; +} +static inline u64 +r_table_mem_hash(void *k, u64 seed, usize ksize) { + return r_hash(k, ksize, seed); +} + +static inline bool +r_table_cstr_eq(char **a, char **b, usize ksize) { + return strcmp(*a, *b) == 0; +} +static inline u64 +r_table_cstr_hash(char **k, u64 seed, usize ksize) { + return r_hash(*k, strlen(*k), seed); +} + +static inline bool +r_table_rstr_eq(Str *a, Str *b, usize ksize) { + return str_eq(*a, *b); +} +static inline u64 +r_table_rstr_hash(Str *k, u64 seed, usize ksize) { + return r_hash(str_bytes(*k), str_len(*k), seed); +} diff --git a/inc/table/impl/declare.h b/inc/table/impl/declare.h @@ -0,0 +1,43 @@ +#ifdef TABLE_SPEC +/* The spec(ification) struct is for passing around settings specific to a + * table instantiation. Fields are omitted or included according to whether + * certain TABLE_... macros are assigned non-default values when a table is + * specialized; see macro.h for details. */ +/* TODO: Add PAGE_LEN and LF_{NUM,DEN} to the spec struct so that they are + * controllable at run-time? */ +struct TABLE_METHOD(spec) { + TABLE_KSIZE_FIELD + TABLE_VSIZE_FIELD + TABLE_EQ_FIELD + TABLE_HASH_FIELD + TABLE_ALLOCZ_FIELD + TABLE_FREE_FIELD +}; +#endif + +/* TODO: Add methods for shrinking the table and reserving space? */ + +#ifndef TABLE_STATIC +#define TABLE_STATIC +#endif + +TABLE_STATIC UNUSED int TABLE_METHOD(init)(RTable *t, usize hint TABLE_SPEC_PARAM); +TABLE_STATIC UNUSED int TABLE_METHOD(acc)(RTable *t, void **v, void *k, u64 h TABLE_SPEC_PARAM); +TABLE_STATIC UNUSED int TABLE_METHOD(ins)(RTable *t, void **v, void *k, u64 h TABLE_SPEC_PARAM); +TABLE_STATIC UNUSED int TABLE_METHOD(del)(RTable *t, void **v, void *k, u64 h TABLE_SPEC_PARAM); +TABLE_STATIC UNUSED void TABLE_METHOD(clr)(RTable *t TABLE_SPEC_PARAM); + +#undef TABLE_STATIC + +static inline UNUSED void +TABLE_METHOD(free)(RTable *t TABLE_SPEC_PARAM) { + int e = errno; + TABLE_FREE(t->mem); + t->data = 0; /* Ensure hard error on use after free. 
*/ + errno = e; +} + +static inline UNUSED usize +TABLE_METHOD(len)(RTable *t TABLE_SPEC_PARAM) { + return t->count; +} diff --git a/inc/table/impl/define.h b/inc/table/impl/define.h @@ -0,0 +1,355 @@ +/* In a table with n slots (i.e., the table has room for n key-value pairs), + * the allocated table data consists of n metadata bytes followed by n/PAGE_LEN + * pages, where a page consists of PAGE_LEN keys followed by PAGE_LEN values. + * A meta byte is either EMPTY, TOMBSTONE, or the most significant hash byte of + * the key stored in the corresponding slot (roughly speaking; see hash1). When + * scanning the dict, we use SIMD to probe PROBE_LEN cells at once. Thus, the + * dict size n is always at least MAX(PROBE_LEN, PAGE_LEN). Actually, we also + * require the dict size to be at least R_CACHE_LINE_SIZE, so that the meta + * bytes occupy a whole number of cache lines, and so that pages are + * cache-aligned in the common case of key types K and value types V such that + * sizeof(K) * PAGE_LEN and sizeof(V) * PAGE_LEN are both multiples of + * R_CACHE_LINE_SIZE. */ + +/* PAGE_LEN should be a multiple of 8 for packing and alignment. */ +/* TODO: make these parameters programmer-controllable. PAGE_LEN == 8 is + * a conservative choice. In most cases (i.e., with the key size being a + * multiple of 8), PAGE_LEN == 1 causes no packing issues and would improve + * caching performance with very large keys or values. */ + +/* The dict size doubles when count + ntombs exceeds the proportion of the + * dict size indicated by LF_NUM and LF_DEN. See exceeds_lf. */ + +/* TODO: At some point, we should replace SSE2 with AVX2... or should we? + * Longer probe vectors means higher chance of conflicts within one probe + * vector, so maybe SSE2 is actually better. Before replacing SSE2 with AVX2, + * be sure to benchmark and do some statistical analysis. */ +#if defined(__GNUC__) && defined(R_HAVE_SSE2) +#include "../../simd.h" +#define USE_SIMD 1 +#define PROBE_LEN 16u +#define PROBE_LEN_BITS 4u +#else +#define PROBE_LEN 8u +#define PROBE_LEN_BITS 3u +#endif + +#define EMPTY 0u /* Must be 0. */ +#define TOMBSTONE 1u +#define MIN_HASH1 2u + +#define ACCESS 0 +#define INSERT 1 +#define DELETE 2 + +static inline u8 +TABLE_METHOD(hash1)(u64 h, u8 b) { + u64 h1 = h >> 56; + if (h1 < MIN_HASH1) h1 += MIN_HASH1; + return h1; +} + +static inline bool +TABLE_METHOD(exceeds_lf)(u64 n, u8 b) { + /* Note that the math here is 64-bit to avoid overflow. */ + return TABLE_LF_DEN * n > TABLE_LF_NUM * (U64_C(1) << b); +} + +static int +TABLE_METHOD(alloc_and_align)(void **mem, void **data, u8 b TABLE_SPEC_PARAM) { + usize len = USIZE_C(1) << b; + /* XXX: We assume KSIZE and VSIZE are small, so computing bytes_per_slot + * shouldn't overflow. */ + usize bytes_per_slot = 1u + TABLE_KSIZE + TABLE_VSIZE; + if (r_mul_will_overflow_(len, bytes_per_slot)) goto fail; + usize data_size = len * bytes_per_slot; + if (data_size > USIZE_MAX - (R_CACHE_LINE_SIZE - 1)) goto fail; + usize mem_size = (R_CACHE_LINE_SIZE - 1) + data_size; + *mem = TABLE_ALLOCZ(mem_size); + if (!*mem) goto fail; + *data = (void *)(((uptr)*mem + (R_CACHE_LINE_SIZE - 1)) + & -(uptr)R_CACHE_LINE_SIZE); + return 0; + +fail: + errno = ENOMEM; + return -1; +} + +/* grow_insert is a version of scan specialized for inserting while growing. + * See the comments in scan for explanation. 
*/ +static void +TABLE_METHOD(grow_insert)(u8 b, u8 *meta, u8 *pages, void *k, void *v, u64 h TABLE_SPEC_PARAM) { + usize mvi_mask = (USIZE_C(1) << (b - PROBE_LEN_BITS)) - 1u; + + usize h0 = h & mvi_mask; + u8 h1 = TABLE_METHOD(hash1)(h, b); + +#ifdef USE_SIMD + v16u8 vempty = v16u8_fill(EMPTY); +#endif + + usize insert_slot; + + for (usize mvi = h0, jump = 1;; mvi = (mvi + jump) & mvi_mask, jump++) { + usize i = mvi * PROBE_LEN; + + #ifdef USE_SIMD + v16u8 vmeta; memcpy(&vmeta, meta+i, sizeof vmeta); + u16 m0 = v16u8_msb(v16u8_cmpeq(vmeta, vempty)); + if (m0 != 0) { + insert_slot = i + r_ctz16(m0); + break; + } + #else + u64 vmeta = r_readl64(meta+i); + int j0 = r_rzb64(vmeta); + if (j0 != 8) { + insert_slot = i + j0; + break; + } + #endif + } + + meta[insert_slot] = h1; + usize page_slot = insert_slot % TABLE_PAGE_LEN; + u8 *page = pages + (insert_slot - page_slot) * (TABLE_KSIZE + TABLE_VSIZE); + memcpy(page + page_slot * TABLE_KSIZE, k, TABLE_KSIZE); + memcpy(page + TABLE_PAGE_LEN * TABLE_KSIZE + page_slot * TABLE_VSIZE, v, TABLE_VSIZE); +} + +static int +TABLE_METHOD(grow)(RTable *t TABLE_SPEC_PARAM) { + u8 oldb = t->b; + usize oldlen = USIZE_C(1) << oldb; + void *oldmem = t->mem; + u8 *oldmeta = t->data; + u8 *oldpages = (u8 *)t->data + oldlen; + + u8 newb = oldb + 1; + ASSERT(newb < USIZE_BITS); + usize newlen = USIZE_C(1) << newb; + void *newmem, *newdata; + if (TABLE_METHOD(alloc_and_align)(&newmem, &newdata, newb TABLE_SPEC) < 0) + return -1; + u8 *newmeta = newdata; + u8 *newpages = (u8 *)newdata + newlen; + + /* Rehash. */ + for (usize slot = 0; slot < oldlen; slot++) { + if (oldmeta[slot] < MIN_HASH1) continue; + usize page_slot = slot % TABLE_PAGE_LEN; + u8 *page = oldpages + (slot - page_slot) * (TABLE_KSIZE + TABLE_VSIZE); + void *k = page + page_slot * TABLE_KSIZE; + void *v = page + TABLE_PAGE_LEN * TABLE_KSIZE + page_slot * TABLE_VSIZE; + u64 h = TABLE_HASH(k, t->seed, TABLE_KSIZE); + TABLE_METHOD(grow_insert)(newb, newmeta, newpages, k, v, h TABLE_SPEC); + } + + TABLE_FREE(oldmem); + + t->b = newb; + t->ntombs = 0; + t->mem = newmem; + t->data = newdata; + + return 0; +} + +/* scan is inline, so that the mode argument becomes constant in all uses. */ +static inline int +TABLE_METHOD(scan)(RTable *t, void **v, void *k, u64 h, int mode TABLE_SPEC_PARAM) { + u8 *meta = t->data; + u8 *pages = (u8 *)t->data + (USIZE_C(1) << t->b); + + usize mvi_mask = (USIZE_C(1) << (t->b - PROBE_LEN_BITS)) - 1u; + + usize h0 = h & mvi_mask; + u8 h1 = TABLE_METHOD(hash1)(h, t->b); + +#ifdef USE_SIMD + v16u8 vh1 = v16u8_fill(h1); + v16u8 vempty = v16u8_fill(EMPTY); +#else + u64 vh1; memset(&vh1, h1, sizeof vh1); +#endif + + /* These variables should get optimized away when mode != INSERT. */ + bool insert_slot_found = false; + usize insert_slot; + bool insert_replacing_tomb; + + /* mvi is the metadata vector index (where each metadata vector is a + * u8[PROBE_LEN]), and i is the corresponding index in the meta table. + * Note that mvi is increased quadratically. */ + for (usize mvi = h0, jump = 1;; mvi = (mvi + jump) & mvi_mask, jump++) { + usize i = mvi * PROBE_LEN; + + /* We must load the metadata vector as little endian, so that low bits + * in the vector represent earlier slots in the table and hence + * ctz/rzb gives us earlier slots first. In the SIMD case, we know + * we're on x86, so a memcpy suffices. In the portable case, we use + * readl64, which does a byte swap on big endian machines. 
*/ + #ifdef USE_SIMD + v16u8 vmeta; memcpy(&vmeta, meta+i, sizeof vmeta); + /* m is a bit array indicating where vmeta equals h1. Thus, ctz(m) is + * the least j such that meta[j] == h1 (provided m != 0). */ + for (u16 m = v16u8_msb(v16u8_cmpeq(vmeta, vh1)); m != 0; m &= m - 1) { + int j = r_ctz16(m); + #else + u64 vmeta = r_readl64(meta+i); + /* Initially, z has 0x00 exactly where vmeta has h1. Thus, we can use + * rzb64 on z to find the indices j of h1 in vmeta. After each + * iteration, we mask in an arbitrary bit into the jth byte so that + * the next iteration gets a different j. */ + u64 z = vmeta ^ vh1; + for (int j = r_rzb64(z); j != 8; z |= U64_C(1)<<(8*j), j = r_rzb64(z)) { + #endif + usize slot = i + j; + usize page_slot = slot % TABLE_PAGE_LEN; + u8 *page = pages + (slot - page_slot) * (TABLE_KSIZE + TABLE_VSIZE); + if likely (TABLE_EQ(page + page_slot * TABLE_KSIZE, k, TABLE_KSIZE)) { + if (mode == DELETE) { + /* If the probe vector contains an empty slot, then we + * don't need to insert a tombstone, and fewer tombstones + * means lower load factor. */ + #ifdef USE_SIMD + if (v16u8_msb(v16u8_cmpeq(vmeta, vempty)) != 0) { + #else + if (r_rzb64(vmeta) != 8) { + #endif + meta[slot] = EMPTY; + } else { + meta[slot] = TOMBSTONE; + t->ntombs += 1; + } + t->count -= 1; + } + *v = page + TABLE_PAGE_LEN * TABLE_KSIZE + page_slot * TABLE_VSIZE; + return 1; + } + } + + /* The key is not in this probe vector. Search for an empty slot to + * determine if we need to check more probe vectors. */ + #ifdef USE_SIMD + u16 m0 = v16u8_msb(v16u8_cmpeq(vmeta, vempty)); + #else + int j0 = r_rzb64(vmeta); + #endif + + /* When inserting, we need to remember the first empty slot or + * tombstone seen in case we don't find an existing slot with the + * given key. We prioritize replacing tombstones to decrease the + * load factor. */ + if (mode == INSERT && !insert_slot_found) { + #ifdef USE_SIMD + v16u8 vtomb = v16u8_fill(TOMBSTONE); + u16 m1 = v16u8_msb(v16u8_cmpeq(vmeta, vtomb)); + if (m1 != 0) { + insert_slot_found = true; + insert_slot = i + r_ctz16(m1); + insert_replacing_tomb = true; + } else if (m0 != 0) { + insert_slot_found = true; + insert_slot = i + r_ctz16(m0); + insert_replacing_tomb = false; + break; + } + #else + u64 vtomb; memset(&vtomb, TOMBSTONE, sizeof vtomb); + int j1 = r_rzb64(vmeta ^ vtomb); + if (j1 != 8) { + insert_slot_found = true; + insert_slot = i + j1; + insert_replacing_tomb = true; + } else if (j0 != 8) { + insert_slot_found = true; + insert_slot = i + j0; + insert_replacing_tomb = false; + break; + } + #endif + } + + #ifdef USE_SIMD + if (m0 != 0) break; + #else + if (j0 != 8) break; + #endif + } + + if (mode == INSERT) { + /* The key wasn't found, so insert in the first available slot found + * along the probe sequence. */ + ASSERT(insert_slot_found); + t->count += 1; + if (insert_replacing_tomb) t->ntombs -= 1; + meta[insert_slot] = h1; + usize page_slot = insert_slot % TABLE_PAGE_LEN; + u8 *page = pages + (insert_slot - page_slot) * (TABLE_KSIZE + TABLE_VSIZE); + memcpy(page + page_slot * TABLE_KSIZE, k, TABLE_KSIZE); + *v = page + TABLE_PAGE_LEN * TABLE_KSIZE + page_slot * TABLE_VSIZE; + } + + return 0; +} + +int +TABLE_METHOD(init)(RTable *t, usize hint TABLE_SPEC_PARAM) { + memset(t, 0, sizeof *t); + + /* Impose maximum on hint to prevent overflow and the following while + * loop from being infinite. See exceeds_lf. 
*/ + hint = MIN(hint, (U64_MAX / TABLE_LF_DEN) >> 1); + u8 b = MAX(R_CACHE_LINE_BITS, PROBE_LEN_BITS); + b = MAX(b, TABLE_PAGE_LEN_BITS); + while (TABLE_METHOD(exceeds_lf)(hint, b)) b += 1; + ASSERT(b < USIZE_BITS); + t->b = b; + + if (TABLE_METHOD(alloc_and_align)(&t->mem, &t->data, b TABLE_SPEC) < 0) + return -1; + + t->seed = r_prand64(); + + return 0; +} + +int +TABLE_METHOD(acc)(RTable *t, void **v, void *k, u64 h TABLE_SPEC_PARAM) { + return TABLE_METHOD(scan)(t, v, k, h, ACCESS TABLE_SPEC); +} + +int +TABLE_METHOD(ins)(RTable *t, void **v, void *k, u64 h TABLE_SPEC_PARAM) { + if (TABLE_METHOD(exceeds_lf)(t->count + t->ntombs, t->b)) { + if (TABLE_METHOD(grow)(t TABLE_SPEC) < 0) return -1; + } + return TABLE_METHOD(scan)(t, v, k, h, INSERT TABLE_SPEC); +} + +int +TABLE_METHOD(del)(RTable *t, void **v, void *k, u64 h TABLE_SPEC_PARAM) { + return TABLE_METHOD(scan)(t, v, k, h, DELETE TABLE_SPEC); +} + +void +TABLE_METHOD(clr)(RTable *t TABLE_SPEC_PARAM) { + memset(t->data, EMPTY, USIZE_C(1) << t->b); + t->count = 0; + t->ntombs = 0; + + /* Re-seed to prevent some attacks. */ + t->seed = r_prand64(); +} + +#undef DELETE +#undef INSERT +#undef ACCESS +#undef MIN_HASH1 +#undef TOMBSTONE +#undef EMPTY +#undef PROBE_LEN_BITS +#undef PROBE_LEN +#undef USE_SIMD diff --git a/inc/table/impl/macro.h b/inc/table/impl/macro.h @@ -0,0 +1,73 @@ +#ifndef TABLE_KSIZE +#define TABLE_KSIZE s_->ksize +#define TABLE_KSIZE_FIELD usize ksize; +#define TABLE_USE_SPEC +#else +#define TABLE_KSIZE_FIELD +#endif + +#ifndef TABLE_VSIZE +#define TABLE_VSIZE s_->vsize +#define TABLE_VSIZE_FIELD usize vsize; +#define TABLE_USE_SPEC +#else +#define TABLE_VSIZE_FIELD +#endif + +#ifndef TABLE_EQ +#define TABLE_EQ s_->eq +#define TABLE_EQ_FIELD RTableEqFunc eq; +#define TABLE_USE_SPEC +#else +#define TABLE_EQ_FIELD +#endif + +#ifndef TABLE_HASH +#define TABLE_HASH s_->hash +#define TABLE_HASH_FIELD RTableHashFunc hash; +#define TABLE_USE_SPEC +#else +#define TABLE_HASH_FIELD +#endif + +#ifndef TABLE_ALLOCZ +#define TABLE_ALLOCZ s_->allocz +#define TABLE_ALLOCZ_FIELD RTableAlloczFunc allocz; +#define TABLE_USE_SPEC +#else +#define TABLE_ALLOCZ_FIELD +#endif + +#ifndef TABLE_FREE +#define TABLE_FREE s_->free +#define TABLE_FREE_FIELD RTableFreeFunc free; +#define TABLE_USE_SPEC +#else +#define TABLE_FREE_FIELD +#endif + +#ifdef TABLE_USE_SPEC +#define TABLE_SPEC_PARAM , struct TABLE_METHOD(spec) *s_ +#define TABLE_SPEC , s_ +#undef TABLE_USE_SPEC +#endif + +#ifndef TABLE_METHOD +#define TABLE_METHOD(name) name +#endif + +// TODO: document page len and lf num/den + +#ifndef TABLE_PAGE_LEN_BITS +#define TABLE_PAGE_LEN_BITS 3u +#endif + +#define TABLE_PAGE_LEN (1u << (TABLE_PAGE_LEN_BITS)) + +#ifndef TABLE_LF_NUM +#define TABLE_LF_NUM 3u +#endif + +#ifndef TABLE_LF_DEN +#define TABLE_LF_DEN 4u +#endif diff --git a/inc/table/impl/unmacro.h b/inc/table/impl/unmacro.h @@ -0,0 +1,19 @@ +#undef TABLE_LF_DEN +#undef TABLE_LF_NUM +#undef TABLE_PAGE_LEN +#undef TABLE_PAGE_LEN_BITS +#undef TABLE_METHOD +#undef TABLE_SPEC +#undef TABLE_SPEC_PARAM +#undef TABLE_FREE_FIELD +#undef TABLE_FREE +#undef TABLE_ALLOCZ_FIELD +#undef TABLE_ALLOCZ +#undef TABLE_HASH_FIELD +#undef TABLE_HASH +#undef TABLE_EQ_FIELD +#undef TABLE_EQ +#undef TABLE_VSIZE_FIELD +#undef TABLE_VSIZE +#undef TABLE_KSIZE_FIELD +#undef TABLE_KSIZE diff --git a/inc/table/static.h b/inc/table/static.h @@ -0,0 +1,18 @@ +#include <errno.h> +#include <stdbool.h> +#include <string.h> + +#include "../alloc.h" +#include "../bits.h" +#include "../debug.h" +#include "../def.h" 
+#include "../internal/util.h" +#include "../rand.h" +#include "../string.h" + +#include "impl/macro.h" +#include "impl/common.h" +#define TABLE_STATIC static +#include "impl/declare.h" +#include "impl/define.h" +#include "impl/unmacro.h" diff --git a/src/dict.c b/src/dict.c @@ -1,373 +0,0 @@ -#include <errno.h> -#include <string.h> - -#include "bits.h" -#include "debug.h" -#include "dict.h" -#include "internal/util.h" -#include "log.h" -#include "rand.h" -#include "rcx.h" - -/* In a dict with n slots (i.e., the dict has room for n key-value pairs), the - * allocated dict data consists of n metadata bytes followed by n/PAGE_LEN - * pages, where a page consists of PAGE_LEN keys followed by PAGE_LEN values. - * A meta byte is either EMPTY, TOMBSTONE, or the most significant hash byte of - * the key stored in the corresponding slot (roughly speaking; see hash1). When - * scanning the dict, we use SIMD to probe PROBE_LEN cells at once. Thus, the - * dict size n is always at least MAX(PROBE_LEN, PAGE_LEN). Actually, we also - * require the dict size to be at least R_CACHE_LINE_SIZE, so that the meta - * bytes occupy a whole number of cache lines, and so that pages are - * cache-aligned in the common case of key types K and value types V such that - * sizeof(K) * PAGE_LEN and sizeof(V) * PAGE_LEN are both multiples of - * R_CACHE_LINE_SIZE. */ - -/* TODO: At some point, we should replace SSE2 with AVX2... or should we? - * Longer probe vectors means higher chance of conflicts within one probe - * vector, so maybe SSE2 is actually better. Before replacing SSE2 with AVX2, - * be sure to benchmark and do some statistical analysis. */ -#if defined(__GNUC__) && defined(R_HAVE_SSE2) -#include "simd.h" -#define USE_SIMD 1 -#define PROBE_LEN 16u -#define PROBE_LEN_BITS 4u -#else -#define PROBE_LEN 8u -#define PROBE_LEN_BITS 3u -#endif - -/* PAGE_LEN should be a multiple of 8 for packing and alignment. */ -/* TODO: make these parameters programmer-controllable. PAGE_LEN == 8 is - * a conservative choice. In most cases (i.e., with the key size being a - * multiple of 8), PAGE_LEN == 1 causes no packing issues and would improve - * caching performance with very large keys or values. */ -#define PAGE_LEN 8u -#define PAGE_LEN_BITS 3u - -/* The dict size doubles when count + ntombs exceeds the proportion of the - * dict size indicated by LF_NUM and LF_DEN. See exceeds_lf. */ -#define LF_NUM 3u -#define LF_DEN 4u - -#define EMPTY 0u /* Must be 0. */ -#define TOMBSTONE 1u -#define MIN_HASH1 2u - -enum scan_mode { - ACCESS = 0, - INSERT = 1, - DELETE = 2, -}; - -static inline u8 -hash1(u64 h, u8 b) { - u64 h1 = h >> 56; - if (h1 < MIN_HASH1) h1 += MIN_HASH1; - return h1; -} - -static inline bool -exceeds_lf(u64 n, u8 b) { - /* Note that the math here is 64-bit to avoid overflow. */ - return LF_DEN * n > LF_NUM * (U64_C(1) << b); -} - -static int -alloc_and_align(void **mem, void **data, u8 b, RDictSpec *s) { - usize len = USIZE_C(1) << b; - /* XXX: We assume ksize and vsize are small, so computing bytes_per_slot - * shouldn't overflow. 
*/ - usize bytes_per_slot = 1u + s->ksize + s->vsize; - if (r_mul_will_overflow_(len, bytes_per_slot)) goto fail; - usize data_size = len * bytes_per_slot; - if (data_size > USIZE_MAX - (R_CACHE_LINE_SIZE - 1)) goto fail; - usize mem_size = (R_CACHE_LINE_SIZE - 1) + data_size; - *mem = s->allocz(mem_size); - if (!*mem) goto fail; - *data = (void *)(((uptr)*mem + (R_CACHE_LINE_SIZE - 1)) - & -(uptr)R_CACHE_LINE_SIZE); - - return 0; - -fail: - errno = ENOMEM; - return -1; -} - -/* grow_insert is a version of scan specialized for inserting while growing. - * See the comments in scan for explanation. */ -static void -grow_insert(u8 b, u8 *meta, u8 *pages, void *k, void *v, u64 h, RDictSpec *s) { - usize mvi_mask = (USIZE_C(1) << (b - PROBE_LEN_BITS)) - 1u; - - usize h0 = h & mvi_mask; - u8 h1 = hash1(h, b); - -#ifdef USE_SIMD - v16u8 vempty = v16u8_fill(EMPTY); -#endif - - usize insert_slot; - - for (usize mvi = h0, jump = 1;; mvi = (mvi + jump) & mvi_mask, jump++) { - usize i = mvi * PROBE_LEN; - -#ifdef USE_SIMD - v16u8 vmeta; memcpy(&vmeta, meta+i, sizeof vmeta); - u16 m0 = v16u8_msb(v16u8_cmpeq(vmeta, vempty)); - if (m0 != 0) { - insert_slot = i + r_ctz16(m0); - break; - } -#else - u64 vmeta = r_readl64(meta+i); - int j0 = r_r8search64(vmeta); - if (j0 != 8) { - insert_slot = i + j0; - break; - } -#endif - } - - meta[insert_slot] = h1; - usize page_slot = insert_slot % PAGE_LEN; - u8 *page = pages + (insert_slot - page_slot) * (s->ksize + s->vsize); - memcpy(page + page_slot * s->ksize, k, s->ksize); - memcpy(page + PAGE_LEN * s->ksize + page_slot * s->vsize, v, s->vsize); -} - -static int -grow(RDict *d, RDictSpec *s) { - u8 oldb = d->b; - usize oldlen = USIZE_C(1) << oldb; - void *oldmem = d->mem; - u8 *oldmeta = d->data; - u8 *oldpages = (u8 *)d->data + oldlen; - - u8 newb = oldb + 1; - ASSERT(newb < USIZE_BITS); - usize newlen = USIZE_C(1) << newb; - void *newmem, *newdata; - if (alloc_and_align(&newmem, &newdata, newb, s) < 0) return -1; - u8 *newmeta = newdata; - u8 *newpages = (u8 *)newdata + newlen; - - /* Rehash. */ - for (usize slot = 0; slot < oldlen; slot++) { - if (oldmeta[slot] < MIN_HASH1) continue; - usize page_slot = slot % PAGE_LEN; - u8 *page = oldpages + (slot - page_slot) * (s->ksize + s->vsize); - void *k = page + page_slot * s->ksize; - void *v = page + PAGE_LEN * s->ksize + page_slot * s->vsize; - u64 h = s->hash(k, d->seed, s); - grow_insert(newb, newmeta, newpages, k, v, h, s); - } - - s->free(oldmem); - - d->b = newb; - d->ntombs = 0; - d->mem = newmem; - d->data = newdata; - - return 0; -} - -/* scan is inline, so that the mode argument becomes constant in all uses. */ -static inline int -scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *s) { - u8 *meta = d->data; - u8 *pages = (u8 *)d->data + (USIZE_C(1) << d->b); - - usize mvi_mask = (USIZE_C(1) << (d->b - PROBE_LEN_BITS)) - 1u; - - usize h0 = h & mvi_mask; - u8 h1 = hash1(h, d->b); - -#ifdef USE_SIMD - v16u8 vh1 = v16u8_fill(h1); - v16u8 vempty = v16u8_fill(EMPTY); -#else - u64 vh1; memset(&vh1, h1, sizeof vh1); -#endif - - /* These variables should get optimized away when mode != INSERT. */ - bool insert_slot_found = false; - usize insert_slot; - bool insert_replacing_tomb; - - /* mvi is the metadata vector index (where each metadata vector is a - * u8[PROBE_LEN]), and i is the corresponding index in the meta table. - * Note that mvi is increased quadratically. 
*/ - for (usize mvi = h0, jump = 1;; mvi = (mvi + jump) & mvi_mask, jump++) { - usize i = mvi * PROBE_LEN; - - /* We must load the metadata vector as little endian, so that low bits - * in the vector represent earlier slots in the table and hence - * ctz/r8search gives us earlier slots first. In the SIMD case, we - * know we're on x86, so a memcpy suffices. In the portable case, we - * use readl64, which does a byte swap on big endian machines. */ -#ifdef USE_SIMD - v16u8 vmeta; memcpy(&vmeta, meta+i, sizeof vmeta); - /* m is a bit array indicating where vmeta equals h1. Thus, ctz(m) is - * the least j such that meta[j] == h1 (provided m != 0). */ - for (u16 m = v16u8_msb(v16u8_cmpeq(vmeta, vh1)); m != 0; m &= m - 1) { - int j = r_ctz16(m); -#else - u64 vmeta = r_readl64(meta+i); - /* Initially, z has 0x00 exactly where vmeta has h1. Thus, we can use - * r8search64 on z to find the indices j of h1 in vmeta. After each - * iteration, we mask in an arbitrary bit into the jth byte so that - * the next iteration gets a different j. */ - u64 z = vmeta ^ vh1; - for (int j = r_r8search64(z); j != 8; z |= U64_C(1)<<(8*j), j = r_r8search64(z)) { -#endif - usize slot = i + j; - usize page_slot = slot % PAGE_LEN; - u8 *page = pages + (slot - page_slot) * (s->ksize + s->vsize); - if likely (s->eq(page + page_slot * s->ksize, k, s)) { - if (mode == DELETE) { - /* If the probe vector contains an empty slot, then we - * don't need to insert a tombstone, and fewer tombstones - * means lower load factor. */ -#ifdef USE_SIMD - if (v16u8_msb(v16u8_cmpeq(vmeta, vempty)) != 0) { -#else - if (r_r8search64(vmeta) != 8) { -#endif - meta[slot] = EMPTY; - } else { - meta[slot] = TOMBSTONE; - d->ntombs += 1; - } - d->count -= 1; - } - *v = page + PAGE_LEN * s->ksize + page_slot * s->vsize; - return 1; - } - } - - /* The key is not in this probe vector. Search for an empty slot to - * determine if we need to check more probe vectors. */ -#ifdef USE_SIMD - u16 m0 = v16u8_msb(v16u8_cmpeq(vmeta, vempty)); -#else - int j0 = r_r8search64(vmeta); -#endif - - /* When inserting, we need to remember the first empty slot or - * tombstone seen in case we don't find an existing slot with the - * given key. We prioritize replacing tombstones to decrease the - * load factor. */ - if (mode == INSERT && !insert_slot_found) { -#ifdef USE_SIMD - v16u8 vtomb = v16u8_fill(TOMBSTONE); - u16 m1 = v16u8_msb(v16u8_cmpeq(vmeta, vtomb)); - if (m1 != 0) { - insert_slot_found = true; - insert_slot = i + r_ctz16(m1); - insert_replacing_tomb = true; - } else if (m0 != 0) { - insert_slot_found = true; - insert_slot = i + r_ctz16(m0); - insert_replacing_tomb = false; - break; - } -#else - u64 vtomb; memset(&vtomb, TOMBSTONE, sizeof vtomb); - int j1 = r_r8search64(vmeta ^ vtomb); - if (j1 != 8) { - insert_slot_found = true; - insert_slot = i + j1; - insert_replacing_tomb = true; - } else if (j0 != 8) { - insert_slot_found = true; - insert_slot = i + j0; - insert_replacing_tomb = false; - break; - } -#endif - } - -#ifdef USE_SIMD - if (m0 != 0) break; -#else - if (j0 != 8) break; -#endif - } - - if (mode == INSERT) { - /* The key wasn't found, so insert in the first available slot found - * along the probe sequence. 
*/ - ASSERT(insert_slot_found); - d->count += 1; - if (insert_replacing_tomb) d->ntombs -= 1; - meta[insert_slot] = h1; - usize page_slot = insert_slot % PAGE_LEN; - u8 *page = pages + (insert_slot - page_slot) * (s->ksize + s->vsize); - memcpy(page + page_slot * s->ksize, k, s->ksize); - *v = page + PAGE_LEN * s->ksize + page_slot * s->vsize; - } - - return 0; -} - -int -r_dict_init(RDict *d, usize hint, RDictSpec *s) { - memset(d, 0, sizeof *d); - - /* Impose maximum on hint to prevent overflow and the following while - * loop from being infinite. See exceeds_lf. */ - hint = MIN(hint, (U64_MAX / LF_DEN) >> 1); - u8 b = MAX(R_CACHE_LINE_BITS, PROBE_LEN_BITS); - b = MAX(b, PAGE_LEN_BITS); - while (exceeds_lf(hint, b)) b += 1; - ASSERT(b < USIZE_BITS); - d->b = b; - - if (alloc_and_align(&d->mem, &d->data, b, s) < 0) return -1; - - d->seed = r_prand64(); - - return 0; -} - -void -r_dict_free(RDict *d, RDictSpec *s) { - int e = errno; - if (d->niter > 0) - r_warnf("dict freed with %"PRId32" iterators alive", d->niter); - s->free(d->mem); - d->data = 0; /* Ensure hard error on use after free. */ - errno = e; -} - -int -r_dict_access(RDict *d, void **v, void *k, u64 h, RDictSpec *s) { - return scan(d, v, k, h, ACCESS, s); -} - -int -r_dict_insert(RDict *d, void **v, void *k, u64 h, RDictSpec *s) { - if (exceeds_lf(d->count + d->ntombs, d->b)) { - if (grow(d, s) < 0) return -1; - } - return scan(d, v, k, h, INSERT, s); -} - -int -r_dict_delete(RDict *d, void **v, void *k, u64 h, RDictSpec *s) { - return scan(d, v, k, h, DELETE, s); -} - -void -r_dict_clear(RDict *d, RDictSpec *s) { - memset(d->data, EMPTY, USIZE_C(1) << d->b); - d->count = 0; - d->ntombs = 0; - - /* Re-seed to prevent some attacks. */ - d->seed = r_prand64(); -} - -// TODO: iter