commit ae1d9dd5a0d6ea1232b35d3001c88b7c67cc385a
parent eaa709cff6d7a23ede478704e902dc723b220fa0
Author: Robert Russell <robertrussell.72001@gmail.com>
Date: Sun, 24 Sep 2023 19:40:41 -0700
Abstract dict implementation to generic table data structure
This will form the basis of a new hash map ("dict") and hash set
implementation. Table is not meant to be used directly; the methods
are not very ergonomic.
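For illustration, instantiating the table with every parameter fixed at
compile time might look like the following sketch (not part of this commit;
the u32tab_ prefix is hypothetical, and r_eallocz/free follow the defaults
the old dict-defaults.h used):

    #include <stdlib.h> /* free */

    #define TABLE_KSIZE sizeof(u32)
    #define TABLE_VSIZE sizeof(u32)
    #define TABLE_EQ r_table_mem_eq
    #define TABLE_HASH r_table_mem_hash
    #define TABLE_ALLOCZ r_eallocz
    #define TABLE_FREE free
    #define TABLE_METHOD(name) u32tab_##name
    #include "table/static.h"

    static int
    example(void) {
        RTable t;
        if (u32tab_init(&t, 100) < 0) return -1; /* ENOMEM */
        u32 k = 42, v = 7;
        void *vp;
        /* Hash at the call site so the hash can be inlined. */
        u64 h = r_table_mem_hash(&k, t.seed, sizeof k);
        /* ins returns 1 if the key existed, 0 if inserted, -1 on error;
         * on success vp points at the value slot. */
        if (u32tab_ins(&t, &vp, &k, h) >= 0) memcpy(vp, &v, sizeof v);
        u32tab_free(&t);
        return 0;
    }

With all six TABLE_ parameters defined, no spec struct is generated and the
trailing spec argument drops out of every method.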
Diffstat:
13 files changed, 583 insertions(+), 509 deletions(-)
diff --git a/Makefile b/Makefile
@@ -6,7 +6,6 @@ SRC =\
src/alloc.c\
src/bench.c\
src/debug.c\
- src/dict.c\
src/log.c\
src/rand.c\
src/str.c\
@@ -26,7 +25,6 @@ librcx.a: $(SRC:.c=.o)
src/alloc.o: src/alloc.c inc/alloc.h inc/def.h inc/log.h inc/rcx.h inc/internal/util.h config.mk
src/bench.o: src/bench.c inc/bench.h inc/def.h inc/log.h inc/rcx.h config.mk
src/debug.o: src/debug.c inc/debug.h inc/def.h inc/rcx.h config.mk
-src/dict.o: src/dict.c inc/bits.h inc/debug.h inc/def.h inc/dict.h inc/dict-defaults.h inc/internal/util.h inc/log.h inc/rand.h inc/rcx.h inc/string.h config.mk
src/log.o: src/log.c inc/def.h inc/log.h inc/rcx.h config.mk
src/opt.o: src/opt.c inc/def.h inc/opt.h inc/rcx.h config.mk
src/rand.o: src/rand.c inc/bits.h inc/def.h inc/rand.h inc/rcx.h inc/unix.h config.mk
diff --git a/inc/all.h b/inc/all.h
@@ -4,10 +4,8 @@
#include "bits.h"
#include "debug.h"
#include "deque.h"
-#include "dict.h"
#include "error.h"
#include "log.h"
-#include "opt.h"
#include "rand.h"
#include "rcx.h"
#include "str.h"
diff --git a/inc/dict-defaults.h b/inc/dict-defaults.h
@@ -1,17 +0,0 @@
-#undef R_DICT_STATIC
-#define R_DICT_STATIC
-
-#undef R_DICT_METHOD
-#define R_DICT_METHOD(name, prefix) JOIN(JOIN(prefix,_),name)
-
-#undef R_DICT_EQ
-#define R_DICT_EQ r_dict_mem_eq
-
-#undef R_DICT_HASH
-#define R_DICT_HASH r_dict_mem_hash
-
-#undef R_DICT_ALLOCZ
-#define R_DICT_ALLOCZ r_eallocz
-
-#undef R_DICT_FREE
-#define R_DICT_FREE free
diff --git a/inc/dict.h b/inc/dict.h
@@ -1,114 +0,0 @@
-#pragma once
-
-#include <stdbool.h>
-#include <string.h>
-
-#include "debug.h"
-#include "def.h"
-#include "rand.h"
-#include "string.h"
-
-typedef struct r_dict_spec RDictSpec;
-typedef struct r_dict RDict;
-typedef bool (*RDictEqFunc)(void *a, void *b, RDictSpec *s);
-typedef u64 (*RDictHashFunc)(void *k, u64 seed, RDictSpec *s);
-typedef void *(*RDictAlloczFunc)(usize size);
-typedef void (*RDictFreeFunc)(void *p);
-
-/* RDictSpec is for passing user settings in to the shared part of the
- * implementation in dict.c. */
-struct r_dict_spec {
- usize ksize;
- usize vsize;
- RDictEqFunc eq;
- RDictHashFunc hash;
- RDictAlloczFunc allocz;
- RDictFreeFunc free;
-};
-
-struct r_dict {
- u8 b; /* Log2 of number of slots */
- u32 niter;
- u64 seed;
- usize count;
- usize ntombs;
- void *mem;
- void *data; /* Cache-aligned pointer into mem; see dict.c for layout. */
-};
-
-static inline bool r_dict_mem_eq(void *a, void *b, RDictSpec *s) { return memcmp(a, b, s->ksize) == 0; }
-static inline u64 r_dict_mem_hash(void *k, u64 seed, RDictSpec *s) { return r_hash(k, s->ksize, seed); }
-
-static inline bool r_dict_cstr_eq(char **a, char **b, RDictSpec *s) { return strcmp(*a, *b) == 0; }
-static inline u64 r_dict_cstr_hash(char **k, u64 seed, RDictSpec *s) { return r_hash(*k, strlen(*k), seed); }
-
-static inline bool r_dict_rstr_eq(Str *a, Str *b, RDictSpec *s) { return str_eq(*a, *b); }
-static inline u64 r_dict_rstr_hash(Str *k, u64 seed, RDictSpec *s) { return r_hash(str_bytes(*k), str_len(*k), seed); }
-
-/* TODO: add functions for shrinking dict and reserving space? */
-int r_dict_init(RDict *d, usize hint, RDictSpec *s);
-void r_dict_free(RDict *d, RDictSpec *s);
-int r_dict_access(RDict *d, void **v, void *k, u64 h, RDictSpec *s);
-int r_dict_insert(RDict *d, void **v, void *k, u64 h, RDictSpec *s);
-int r_dict_delete(RDict *d, void **v, void *k, u64 h, RDictSpec *s);
-void r_dict_clear(RDict *d, RDictSpec *s);
-
-#include "dict-defaults.h"
-
-#define R_DICT_TYPEDEF(D, K, V)\
-typedef struct D { \
- RDict d; \
- V default_val; \
-} D;
-
-#define R_DICT_SPEC_(K,V) &(RDictSpec){ \
- .ksize = sizeof(K), \
- .vsize = sizeof(V), \
- .eq = (RDictEqFunc)(R_DICT_EQ), \
- .hash = (RDictHashFunc)(R_DICT_HASH), \
- .allocz = (RDictAlloczFunc)(R_DICT_ALLOCZ), \
- .free = (RDictFreeFunc)(R_DICT_FREE), \
- }
-
-#define R_DICT_DECLARE(D, K, V, ...)\
-static inline UNUSED usize R_DICT_METHOD(len,##__VA_ARGS__)(D *d) { return d->d.count; } \
-static inline UNUSED void R_DICT_METHOD(set_default,##__VA_ARGS__)(D *d, V v) { d->default_val = v; } \
-static inline UNUSED V R_DICT_METHOD(get_default,##__VA_ARGS__)(D *d) { return d->default_val; } \
-static inline UNUSED int R_DICT_METHOD(init,##__VA_ARGS__)(D *d, usize hint) { \
- memset(&d->default_val, 0, sizeof(V)); \
- return r_dict_init(&d->d, hint, R_DICT_SPEC_(K,V)); \
-} \
-static inline UNUSED void R_DICT_METHOD(free,##__VA_ARGS__)(D *d) { r_dict_free(&d->d, R_DICT_SPEC_(K,V)); } \
-static inline UNUSED V *R_DICT_METHOD(ptr,##__VA_ARGS__)(D *d, K k) { \
- u64 h = R_DICT_HASH(&k, d->d.seed, R_DICT_SPEC_(K,V)); /* Allow hash to be inlined. */ \
- void *vp = 0; r_dict_access(&d->d, &vp, &k, h, R_DICT_SPEC_(K,V)); \
- return vp; \
-} \
-static inline UNUSED bool R_DICT_METHOD(get,##__VA_ARGS__)(D *d, V *v, K k) { \
- u64 h = R_DICT_HASH(&k, d->d.seed, R_DICT_SPEC_(K,V)); /* Allow hash to be inlined. */ \
- void *vp; int r = r_dict_access(&d->d, &vp, &k, h, R_DICT_SPEC_(K,V)); \
- if (v) *v = r > 0 ? *(V *)vp : d->default_val; \
- return r > 0; \
-} \
-static inline UNUSED bool R_DICT_METHOD(set,##__VA_ARGS__)(D *d, K k, V v) { \
- u64 h = R_DICT_HASH(&k, d->d.seed, R_DICT_SPEC_(K,V)); /* Allow hash to be inlined. */ \
- void *vp; int r = r_dict_access(&d->d, &vp, &k, h, R_DICT_SPEC_(K,V)); \
- if (r > 0) *(V *)vp = v; \
- return r > 0; \
-} \
-static inline UNUSED int R_DICT_METHOD(put,##__VA_ARGS__)(D *d, K k, V v) { \
- ASSERT(d->d.niter == 0, "can not call put while iterating through dict"); \
- u64 h = R_DICT_HASH(&k, d->d.seed, R_DICT_SPEC_(K,V)); /* Allow hash to be inlined. */ \
- void *vp; int r = r_dict_insert(&d->d, &vp, &k, h, R_DICT_SPEC_(K,V)); \
- if (r >= 0) *(V *)vp = v; \
- return r; \
-} \
-static inline UNUSED bool R_DICT_METHOD(del,##__VA_ARGS__)(D *d, V *v, K k) { \
- u64 h = R_DICT_HASH(&k, d->d.seed, R_DICT_SPEC_(K,V)); /* Allow hash to be inlined. */ \
- void *vp; int r = r_dict_delete(&d->d, &vp, &k, h, R_DICT_SPEC_(K,V)); \
- if (v) *v = r > 0 ? *(V *)vp : d->default_val; \
- return r > 0; \
-} \
-static inline UNUSED void R_DICT_METHOD(clr,##__VA_ARGS__)(D *d) { r_dict_clear(&d->d, R_DICT_SPEC_(K,V)); }
-
-#define R_DICT_DEFINE(D, K, V, ...) /* No op. Everything is static inline. */
-\ No newline at end of file
diff --git a/inc/table/declare.h b/inc/table/declare.h
@@ -0,0 +1,13 @@
+#include <errno.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include "../alloc.h"
+#include "../def.h"
+#include "../rand.h"
+#include "../string.h"
+
+#include "impl/macro.h"
+#include "impl/common.h"
+#include "impl/declare.h"
+#include "impl/unmacro.h"
diff --git a/inc/table/define.h b/inc/table/define.h
@@ -0,0 +1,15 @@
+#include <errno.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include "../alloc.h"
+#include "../bits.h"
+#include "../debug.h"
+#include "../def.h"
+#include "../internal/util.h"
+#include "../rand.h"
+#include "../string.h"
+
+#include "impl/macro.h"
+#include "impl/define.h"
+#include "impl/unmacro.h"
diff --git a/inc/table/impl/common.h b/inc/table/impl/common.h
@@ -0,0 +1,43 @@
+#pragma once
+
+typedef bool (*RTableEqFunc)(void *a, void *b, usize ksize);
+typedef u64 (*RTableHashFunc)(void *k, u64 seed, usize ksize);
+typedef void *(*RTableAlloczFunc)(usize size);
+typedef void (*RTableFreeFunc)(void *p);
+typedef struct r_table RTable;
+
+struct r_table {
+ u8 b; /* Log2 of number of slots */
+ u64 seed;
+ usize count;
+ usize ntombs;
+ void *mem;
+ void *data; /* Cache-aligned pointer into mem; see define.h for layout. */
+};
+
+static inline bool
+r_table_mem_eq(void *a, void *b, usize ksize) {
+ return memcmp(a, b, ksize) == 0;
+}
+static inline u64
+r_table_mem_hash(void *k, u64 seed, usize ksize) {
+ return r_hash(k, ksize, seed);
+}
+
+static inline bool
+r_table_cstr_eq(char **a, char **b, usize ksize) {
+ return strcmp(*a, *b) == 0;
+}
+static inline u64
+r_table_cstr_hash(char **k, u64 seed, usize ksize) {
+ return r_hash(*k, strlen(*k), seed);
+}
+
+static inline bool
+r_table_rstr_eq(Str *a, Str *b, usize ksize) {
+ return str_eq(*a, *b);
+}
+static inline u64
+r_table_rstr_hash(Str *k, u64 seed, usize ksize) {
+ return r_hash(str_bytes(*k), str_len(*k), seed);
+}
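As a sketch of how the helpers above are meant to be plugged in, a char *
keyed table could define the eq/hash parameters function-style; the casts are
an assumption here, since the cstr helpers take char ** while the table
passes void *:

    #define TABLE_KSIZE sizeof(char *)
    #define TABLE_VSIZE sizeof(int)
    #define TABLE_EQ(a, b, n) r_table_cstr_eq((char **)(a), (char **)(b), (n))
    #define TABLE_HASH(k, seed, n) r_table_cstr_hash((char **)(k), (seed), (n))

then continue as with any other instantiation (TABLE_ALLOCZ, TABLE_FREE,
TABLE_METHOD, and an entry-point include). Note that the table stores only
the char * itself, so the pointed-to strings must outlive their entries.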
diff --git a/inc/table/impl/declare.h b/inc/table/impl/declare.h
@@ -0,0 +1,43 @@
+#ifdef TABLE_USE_SPEC
+/* The spec(ification) struct is for passing around settings specific to a
+ * table instantiation. Fields are omitted or included according to whether
+ * certain TABLE_... macros are assigned non-default values when a table is
+ * specialized; see macro.h for details. */
+/* TODO: Add PAGE_LEN and LF_{NUM,DEN} to the spec struct so that they are
+ * controllable at run-time? */
+struct TABLE_METHOD(spec) {
+ TABLE_KSIZE_FIELD
+ TABLE_VSIZE_FIELD
+ TABLE_EQ_FIELD
+ TABLE_HASH_FIELD
+ TABLE_ALLOCZ_FIELD
+ TABLE_FREE_FIELD
+};
+#endif
+
+/* TODO: Add methods for shrinking the table and reserving space? */
+
+#ifndef TABLE_STATIC
+#define TABLE_STATIC
+#endif
+
+TABLE_STATIC UNUSED int TABLE_METHOD(init)(RTable *t, usize hint TABLE_SPEC_PARAM);
+TABLE_STATIC UNUSED int TABLE_METHOD(acc)(RTable *t, void **v, void *k, u64 h TABLE_SPEC_PARAM);
+TABLE_STATIC UNUSED int TABLE_METHOD(ins)(RTable *t, void **v, void *k, u64 h TABLE_SPEC_PARAM);
+TABLE_STATIC UNUSED int TABLE_METHOD(del)(RTable *t, void **v, void *k, u64 h TABLE_SPEC_PARAM);
+TABLE_STATIC UNUSED void TABLE_METHOD(clr)(RTable *t TABLE_SPEC_PARAM);
+
+#undef TABLE_STATIC
+
+static inline UNUSED void
+TABLE_METHOD(free)(RTable *t TABLE_SPEC_PARAM) {
+ int e = errno;
+ TABLE_FREE(t->mem);
+ t->data = 0; /* Ensure hard error on use after free. */
+ errno = e;
+}
+
+static inline UNUSED usize
+TABLE_METHOD(len)(RTable *t TABLE_SPEC_PARAM) {
+ return t->count;
+}
diff --git a/inc/table/impl/define.h b/inc/table/impl/define.h
@@ -0,0 +1,355 @@
+/* In a table with n slots (i.e., the table has room for n key-value pairs),
+ * the allocated table data consists of n metadata bytes followed by n/PAGE_LEN
+ * pages, where a page consists of PAGE_LEN keys followed by PAGE_LEN values.
+ * A meta byte is either EMPTY, TOMBSTONE, or the most significant hash byte of
+ * the key stored in the corresponding slot (roughly speaking; see hash1). When
+ * scanning the dict, we use SIMD to probe PROBE_LEN cells at once. Thus, the
+ * dict size n is always at least MAX(PROBE_LEN, PAGE_LEN). Actually, we also
+ * require the dict size to be at least R_CACHE_LINE_SIZE, so that the meta
+ * bytes occupy a whole number of cache lines, and so that pages are
+ * cache-aligned in the common case of key types K and value types V such that
+ * sizeof(K) * PAGE_LEN and sizeof(V) * PAGE_LEN are both multiples of
+ * R_CACHE_LINE_SIZE. */
+
+/* PAGE_LEN (i.e., TABLE_PAGE_LEN) should be a multiple of 8 for packing and
+ * alignment. It is controllable per instantiation via TABLE_PAGE_LEN_BITS
+ * (see macro.h); the default of 8 is a conservative choice. In most cases
+ * (i.e., with the key size being a multiple of 8), PAGE_LEN == 1 causes no
+ * packing issues and would improve caching with very large keys or values. */
+
+/* The table size doubles when count + ntombs exceeds the proportion of the
+ * table size indicated by TABLE_LF_NUM and TABLE_LF_DEN. See exceeds_lf. */
+
+/* TODO: At some point, we should replace SSE2 with AVX2... or should we?
+ * Longer probe vectors mean a higher chance of conflicts within one probe
+ * vector, so maybe SSE2 is actually better. Before replacing SSE2 with AVX2,
+ * be sure to benchmark and do some statistical analysis. */
+#if defined(__GNUC__) && defined(R_HAVE_SSE2)
+#include "../../simd.h"
+#define USE_SIMD 1
+#define PROBE_LEN 16u
+#define PROBE_LEN_BITS 4u
+#else
+#define PROBE_LEN 8u
+#define PROBE_LEN_BITS 3u
+#endif
+
+#define EMPTY 0u /* Must be 0. */
+#define TOMBSTONE 1u
+#define MIN_HASH1 2u
+
+#define ACCESS 0
+#define INSERT 1
+#define DELETE 2
+
+static inline u8
+TABLE_METHOD(hash1)(u64 h, u8 b) {
+ u64 h1 = h >> 56;
+ if (h1 < MIN_HASH1) h1 += MIN_HASH1;
+ return h1;
+}
+
+static inline bool
+TABLE_METHOD(exceeds_lf)(u64 n, u8 b) {
+ /* Note that the math here is 64-bit to avoid overflow. */
+ return TABLE_LF_DEN * n > TABLE_LF_NUM * (U64_C(1) << b);
+}
+
+static int
+TABLE_METHOD(alloc_and_align)(void **mem, void **data, u8 b TABLE_SPEC_PARAM) {
+ usize len = USIZE_C(1) << b;
+ /* XXX: We assume KSIZE and VSIZE are small, so computing bytes_per_slot
+ * shouldn't overflow. */
+ usize bytes_per_slot = 1u + TABLE_KSIZE + TABLE_VSIZE;
+ if (r_mul_will_overflow_(len, bytes_per_slot)) goto fail;
+ usize data_size = len * bytes_per_slot;
+ if (data_size > USIZE_MAX - (R_CACHE_LINE_SIZE - 1)) goto fail;
+ usize mem_size = (R_CACHE_LINE_SIZE - 1) + data_size;
+ *mem = TABLE_ALLOCZ(mem_size);
+ if (!*mem) goto fail;
+ *data = (void *)(((uptr)*mem + (R_CACHE_LINE_SIZE - 1))
+ & -(uptr)R_CACHE_LINE_SIZE);
+ return 0;
+
+fail:
+ errno = ENOMEM;
+ return -1;
+}
+
+/* grow_insert is a version of scan specialized for inserting while growing.
+ * See the comments in scan for explanation. */
+static void
+TABLE_METHOD(grow_insert)(u8 b, u8 *meta, u8 *pages, void *k, void *v, u64 h TABLE_SPEC_PARAM) {
+ usize mvi_mask = (USIZE_C(1) << (b - PROBE_LEN_BITS)) - 1u;
+
+ usize h0 = h & mvi_mask;
+ u8 h1 = TABLE_METHOD(hash1)(h, b);
+
+#ifdef USE_SIMD
+ v16u8 vempty = v16u8_fill(EMPTY);
+#endif
+
+ usize insert_slot;
+
+ for (usize mvi = h0, jump = 1;; mvi = (mvi + jump) & mvi_mask, jump++) {
+ usize i = mvi * PROBE_LEN;
+
+ #ifdef USE_SIMD
+ v16u8 vmeta; memcpy(&vmeta, meta+i, sizeof vmeta);
+ u16 m0 = v16u8_msb(v16u8_cmpeq(vmeta, vempty));
+ if (m0 != 0) {
+ insert_slot = i + r_ctz16(m0);
+ break;
+ }
+ #else
+ u64 vmeta = r_readl64(meta+i);
+ int j0 = r_rzb64(vmeta);
+ if (j0 != 8) {
+ insert_slot = i + j0;
+ break;
+ }
+ #endif
+ }
+
+ meta[insert_slot] = h1;
+ usize page_slot = insert_slot % TABLE_PAGE_LEN;
+ u8 *page = pages + (insert_slot - page_slot) * (TABLE_KSIZE + TABLE_VSIZE);
+ memcpy(page + page_slot * TABLE_KSIZE, k, TABLE_KSIZE);
+ memcpy(page + TABLE_PAGE_LEN * TABLE_KSIZE + page_slot * TABLE_VSIZE, v, TABLE_VSIZE);
+}
+
+static int
+TABLE_METHOD(grow)(RTable *t TABLE_SPEC_PARAM) {
+ u8 oldb = t->b;
+ usize oldlen = USIZE_C(1) << oldb;
+ void *oldmem = t->mem;
+ u8 *oldmeta = t->data;
+ u8 *oldpages = (u8 *)t->data + oldlen;
+
+ u8 newb = oldb + 1;
+ ASSERT(newb < USIZE_BITS);
+ usize newlen = USIZE_C(1) << newb;
+ void *newmem, *newdata;
+ if (TABLE_METHOD(alloc_and_align)(&newmem, &newdata, newb TABLE_SPEC) < 0)
+ return -1;
+ u8 *newmeta = newdata;
+ u8 *newpages = (u8 *)newdata + newlen;
+
+ /* Rehash. */
+ for (usize slot = 0; slot < oldlen; slot++) {
+ if (oldmeta[slot] < MIN_HASH1) continue;
+ usize page_slot = slot % TABLE_PAGE_LEN;
+ u8 *page = oldpages + (slot - page_slot) * (TABLE_KSIZE + TABLE_VSIZE);
+ void *k = page + page_slot * TABLE_KSIZE;
+ void *v = page + TABLE_PAGE_LEN * TABLE_KSIZE + page_slot * TABLE_VSIZE;
+ u64 h = TABLE_HASH(k, t->seed, TABLE_KSIZE);
+ TABLE_METHOD(grow_insert)(newb, newmeta, newpages, k, v, h TABLE_SPEC);
+ }
+
+ TABLE_FREE(oldmem);
+
+ t->b = newb;
+ t->ntombs = 0;
+ t->mem = newmem;
+ t->data = newdata;
+
+ return 0;
+}
+
+/* scan is inline, so that the mode argument becomes constant in all uses. */
+static inline int
+TABLE_METHOD(scan)(RTable *t, void **v, void *k, u64 h, int mode TABLE_SPEC_PARAM) {
+ u8 *meta = t->data;
+ u8 *pages = (u8 *)t->data + (USIZE_C(1) << t->b);
+
+ usize mvi_mask = (USIZE_C(1) << (t->b - PROBE_LEN_BITS)) - 1u;
+
+ usize h0 = h & mvi_mask;
+ u8 h1 = TABLE_METHOD(hash1)(h, t->b);
+
+#ifdef USE_SIMD
+ v16u8 vh1 = v16u8_fill(h1);
+ v16u8 vempty = v16u8_fill(EMPTY);
+#else
+ u64 vh1; memset(&vh1, h1, sizeof vh1);
+#endif
+
+ /* These variables should get optimized away when mode != INSERT. */
+ bool insert_slot_found = false;
+ usize insert_slot;
+ bool insert_replacing_tomb;
+
+ /* mvi is the metadata vector index (where each metadata vector is a
+ * u8[PROBE_LEN]), and i is the corresponding index in the meta table.
+ * Note that mvi is increased quadratically. */
+ for (usize mvi = h0, jump = 1;; mvi = (mvi + jump) & mvi_mask, jump++) {
+ usize i = mvi * PROBE_LEN;
+
+ /* We must load the metadata vector as little endian, so that low bits
+ * in the vector represent earlier slots in the table and hence
+ * ctz/rzb gives us earlier slots first. In the SIMD case, we know
+ * we're on x86, so a memcpy suffices. In the portable case, we use
+ * readl64, which does a byte swap on big endian machines. */
+ #ifdef USE_SIMD
+ v16u8 vmeta; memcpy(&vmeta, meta+i, sizeof vmeta);
+ /* m is a bit array indicating where vmeta equals h1. Thus, ctz(m) is
+ * the least j such that meta[j] == h1 (provided m != 0). */
+ for (u16 m = v16u8_msb(v16u8_cmpeq(vmeta, vh1)); m != 0; m &= m - 1) {
+ int j = r_ctz16(m);
+ #else
+ u64 vmeta = r_readl64(meta+i);
+ /* Initially, z has 0x00 exactly where vmeta has h1. Thus, we can use
+ * rzb64 on z to find the indices j of h1 in vmeta. After each
+ * iteration, we mask in an arbitrary bit into the jth byte so that
+ * the next iteration gets a different j. */
+ u64 z = vmeta ^ vh1;
+ for (int j = r_rzb64(z); j != 8; z |= U64_C(1)<<(8*j), j = r_rzb64(z)) {
+ #endif
+ usize slot = i + j;
+ usize page_slot = slot % TABLE_PAGE_LEN;
+ u8 *page = pages + (slot - page_slot) * (TABLE_KSIZE + TABLE_VSIZE);
+ if likely (TABLE_EQ(page + page_slot * TABLE_KSIZE, k, TABLE_KSIZE)) {
+ if (mode == DELETE) {
+ /* If the probe vector contains an empty slot, then we
+ * don't need to insert a tombstone, and fewer tombstones
+ * means lower load factor. */
+ #ifdef USE_SIMD
+ if (v16u8_msb(v16u8_cmpeq(vmeta, vempty)) != 0) {
+ #else
+ if (r_rzb64(vmeta) != 8) {
+ #endif
+ meta[slot] = EMPTY;
+ } else {
+ meta[slot] = TOMBSTONE;
+ t->ntombs += 1;
+ }
+ t->count -= 1;
+ }
+ *v = page + TABLE_PAGE_LEN * TABLE_KSIZE + page_slot * TABLE_VSIZE;
+ return 1;
+ }
+ }
+
+ /* The key is not in this probe vector. Search for an empty slot to
+ * determine if we need to check more probe vectors. */
+ #ifdef USE_SIMD
+ u16 m0 = v16u8_msb(v16u8_cmpeq(vmeta, vempty));
+ #else
+ int j0 = r_rzb64(vmeta);
+ #endif
+
+ /* When inserting, we need to remember the first empty slot or
+ * tombstone seen in case we don't find an existing slot with the
+ * given key. We prioritize replacing tombstones to decrease the
+ * load factor. */
+ if (mode == INSERT && !insert_slot_found) {
+ #ifdef USE_SIMD
+ v16u8 vtomb = v16u8_fill(TOMBSTONE);
+ u16 m1 = v16u8_msb(v16u8_cmpeq(vmeta, vtomb));
+ if (m1 != 0) {
+ insert_slot_found = true;
+ insert_slot = i + r_ctz16(m1);
+ insert_replacing_tomb = true;
+ } else if (m0 != 0) {
+ insert_slot_found = true;
+ insert_slot = i + r_ctz16(m0);
+ insert_replacing_tomb = false;
+ break;
+ }
+ #else
+ u64 vtomb; memset(&vtomb, TOMBSTONE, sizeof vtomb);
+ int j1 = r_rzb64(vmeta ^ vtomb);
+ if (j1 != 8) {
+ insert_slot_found = true;
+ insert_slot = i + j1;
+ insert_replacing_tomb = true;
+ } else if (j0 != 8) {
+ insert_slot_found = true;
+ insert_slot = i + j0;
+ insert_replacing_tomb = false;
+ break;
+ }
+ #endif
+ }
+
+ #ifdef USE_SIMD
+ if (m0 != 0) break;
+ #else
+ if (j0 != 8) break;
+ #endif
+ }
+
+ if (mode == INSERT) {
+ /* The key wasn't found, so insert in the first available slot found
+ * along the probe sequence. */
+ ASSERT(insert_slot_found);
+ t->count += 1;
+ if (insert_replacing_tomb) t->ntombs -= 1;
+ meta[insert_slot] = h1;
+ usize page_slot = insert_slot % TABLE_PAGE_LEN;
+ u8 *page = pages + (insert_slot - page_slot) * (TABLE_KSIZE + TABLE_VSIZE);
+ memcpy(page + page_slot * TABLE_KSIZE, k, TABLE_KSIZE);
+ *v = page + TABLE_PAGE_LEN * TABLE_KSIZE + page_slot * TABLE_VSIZE;
+ }
+
+ return 0;
+}
+
+int
+TABLE_METHOD(init)(RTable *t, usize hint TABLE_SPEC_PARAM) {
+ memset(t, 0, sizeof *t);
+
+ /* Impose maximum on hint to prevent overflow and the following while
+ * loop from being infinite. See exceeds_lf. */
+ hint = MIN(hint, (U64_MAX / TABLE_LF_DEN) >> 1);
+ u8 b = MAX(R_CACHE_LINE_BITS, PROBE_LEN_BITS);
+ b = MAX(b, TABLE_PAGE_LEN_BITS);
+ while (TABLE_METHOD(exceeds_lf)(hint, b)) b += 1;
+ ASSERT(b < USIZE_BITS);
+ t->b = b;
+
+ if (TABLE_METHOD(alloc_and_align)(&t->mem, &t->data, b TABLE_SPEC) < 0)
+ return -1;
+
+ t->seed = r_prand64();
+
+ return 0;
+}
+
+int
+TABLE_METHOD(acc)(RTable *t, void **v, void *k, u64 h TABLE_SPEC_PARAM) {
+ return TABLE_METHOD(scan)(t, v, k, h, ACCESS TABLE_SPEC);
+}
+
+int
+TABLE_METHOD(ins)(RTable *t, void **v, void *k, u64 h TABLE_SPEC_PARAM) {
+ if (TABLE_METHOD(exceeds_lf)(t->count + t->ntombs, t->b)) {
+ if (TABLE_METHOD(grow)(t TABLE_SPEC) < 0) return -1;
+ }
+ return TABLE_METHOD(scan)(t, v, k, h, INSERT TABLE_SPEC);
+}
+
+int
+TABLE_METHOD(del)(RTable *t, void **v, void *k, u64 h TABLE_SPEC_PARAM) {
+ return TABLE_METHOD(scan)(t, v, k, h, DELETE TABLE_SPEC);
+}
+
+void
+TABLE_METHOD(clr)(RTable *t TABLE_SPEC_PARAM) {
+ memset(t->data, EMPTY, USIZE_C(1) << t->b);
+ t->count = 0;
+ t->ntombs = 0;
+
+ /* Re-seed to prevent some attacks. */
+ t->seed = r_prand64();
+}
+
+#undef DELETE
+#undef INSERT
+#undef ACCESS
+#undef MIN_HASH1
+#undef TOMBSTONE
+#undef EMPTY
+#undef PROBE_LEN_BITS
+#undef PROBE_LEN
+#undef USE_SIMD
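To make the page arithmetic in define.h concrete, here is a worked example
under the defaults (TABLE_PAGE_LEN == 8; KSIZE == VSIZE == 8 are assumed, so
each page is exactly two 64-byte cache lines):

    /* n = 64 slots:
     *   meta  = data        64 meta bytes (one cache line)
     *   pages = data + 64   8 pages, each 8*8 key bytes then 8*8 value bytes
     * For slot = 19:
     *   page_slot = 19 % 8                      = 3
     *   page      = pages + (19 - 3) * (8 + 8)  = pages + 256
     *   key       = page + 3*8                  = page + 24
     *   value     = page + 8*8 + 3*8            = page + 88
     */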
diff --git a/inc/table/impl/macro.h b/inc/table/impl/macro.h
@@ -0,0 +1,76 @@
+#ifndef TABLE_KSIZE
+#define TABLE_KSIZE s_->ksize
+#define TABLE_KSIZE_FIELD usize ksize;
+#define TABLE_USE_SPEC
+#else
+#define TABLE_KSIZE_FIELD
+#endif
+
+#ifndef TABLE_VSIZE
+#define TABLE_VSIZE s_->vsize
+#define TABLE_VSIZE_FIELD usize vsize;
+#define TABLE_USE_SPEC
+#else
+#define TABLE_VSIZE_FIELD
+#endif
+
+#ifndef TABLE_EQ
+#define TABLE_EQ s_->eq
+#define TABLE_EQ_FIELD RTableEqFunc eq;
+#define TABLE_USE_SPEC
+#else
+#define TABLE_EQ_FIELD
+#endif
+
+#ifndef TABLE_HASH
+#define TABLE_HASH s_->hash
+#define TABLE_HASH_FIELD RTableHashFunc hash;
+#define TABLE_USE_SPEC
+#else
+#define TABLE_HASH_FIELD
+#endif
+
+#ifndef TABLE_ALLOCZ
+#define TABLE_ALLOCZ s_->allocz
+#define TABLE_ALLOCZ_FIELD RTableAlloczFunc allocz;
+#define TABLE_USE_SPEC
+#else
+#define TABLE_ALLOCZ_FIELD
+#endif
+
+#ifndef TABLE_FREE
+#define TABLE_FREE s_->free
+#define TABLE_FREE_FIELD RTableFreeFunc free;
+#define TABLE_USE_SPEC
+#else
+#define TABLE_FREE_FIELD
+#endif
+
+#ifdef TABLE_USE_SPEC
+#define TABLE_SPEC_PARAM , struct TABLE_METHOD(spec) *s_
+#define TABLE_SPEC , s_
+#else
+/* Every parameter is fixed at compile time; the spec vanishes entirely. */
+#define TABLE_SPEC_PARAM
+#define TABLE_SPEC
+#endif
+
+#ifndef TABLE_METHOD
+#define TABLE_METHOD(name) name
+#endif
+
+/* TODO: Document TABLE_PAGE_LEN_BITS and TABLE_LF_{NUM,DEN}. */
+
+#ifndef TABLE_PAGE_LEN_BITS
+#define TABLE_PAGE_LEN_BITS 3u
+#endif
+
+#define TABLE_PAGE_LEN (1u << (TABLE_PAGE_LEN_BITS))
+
+#ifndef TABLE_LF_NUM
+#define TABLE_LF_NUM 3u
+#endif
+
+#ifndef TABLE_LF_DEN
+#define TABLE_LF_DEN 4u
+#endif
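The page length and load factor can be tuned per instantiation by
pre-defining the macros above, e.g. (a sketch; the bigtab_ prefix is
hypothetical):

    #define TABLE_PAGE_LEN_BITS 4u /* 16-slot pages */
    #define TABLE_LF_NUM 1u        /* grow at 1/2 full instead of 3/4 */
    #define TABLE_LF_DEN 2u
    #define TABLE_METHOD(name) bigtab_##name
    #include "table/static.h"

Any parameter left undefined (here: the sizes, eq, hash, allocz, and free)
falls back to a field of the generated struct bigtab_spec, and every method
takes a trailing struct bigtab_spec * argument.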
diff --git a/inc/table/impl/unmacro.h b/inc/table/impl/unmacro.h
@@ -0,0 +1,20 @@
+#undef TABLE_LF_DEN
+#undef TABLE_LF_NUM
+#undef TABLE_PAGE_LEN
+#undef TABLE_PAGE_LEN_BITS
+#undef TABLE_METHOD
+#undef TABLE_SPEC
+#undef TABLE_SPEC_PARAM
+#undef TABLE_USE_SPEC
+#undef TABLE_FREE_FIELD
+#undef TABLE_FREE
+#undef TABLE_ALLOCZ_FIELD
+#undef TABLE_ALLOCZ
+#undef TABLE_HASH_FIELD
+#undef TABLE_HASH
+#undef TABLE_EQ_FIELD
+#undef TABLE_EQ
+#undef TABLE_VSIZE_FIELD
+#undef TABLE_VSIZE
+#undef TABLE_KSIZE_FIELD
+#undef TABLE_KSIZE
diff --git a/inc/table/static.h b/inc/table/static.h
@@ -0,0 +1,18 @@
+#include <errno.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include "../alloc.h"
+#include "../bits.h"
+#include "../debug.h"
+#include "../def.h"
+#include "../internal/util.h"
+#include "../rand.h"
+#include "../string.h"
+
+#include "impl/macro.h"
+#include "impl/common.h"
+#define TABLE_STATIC static
+#include "impl/declare.h"
+#include "impl/define.h"
+#include "impl/unmacro.h"
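The three entry points mirror the usual header/source split: table/declare.h
in a project header, table/define.h in exactly one translation unit, and
table/static.h for a self-contained instantiation. Because unmacro.h clears
the configuration after every include, both sides repeat it; a sketch with a
hypothetical mytab_ prefix:

    /* mytab.h */
    #include <stdlib.h> /* free */
    #define TABLE_KSIZE sizeof(u64)
    #define TABLE_VSIZE sizeof(u64)
    #define TABLE_EQ r_table_mem_eq
    #define TABLE_HASH r_table_mem_hash
    #define TABLE_ALLOCZ r_eallocz
    #define TABLE_FREE free
    #define TABLE_METHOD(name) mytab_##name
    #include "table/declare.h"

    /* mytab.c -- repeats the configuration, then emits definitions once. */
    #include "mytab.h"
    #define TABLE_KSIZE sizeof(u64)
    #define TABLE_VSIZE sizeof(u64)
    #define TABLE_EQ r_table_mem_eq
    #define TABLE_HASH r_table_mem_hash
    #define TABLE_ALLOCZ r_eallocz
    #define TABLE_FREE free
    #define TABLE_METHOD(name) mytab_##name
    #include "table/define.h"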
diff --git a/src/dict.c b/src/dict.c
@@ -1,373 +0,0 @@
-#include <errno.h>
-#include <string.h>
-
-#include "bits.h"
-#include "debug.h"
-#include "dict.h"
-#include "internal/util.h"
-#include "log.h"
-#include "rand.h"
-#include "rcx.h"
-
-/* In a dict with n slots (i.e., the dict has room for n key-value pairs), the
- * allocated dict data consists of n metadata bytes followed by n/PAGE_LEN
- * pages, where a page consists of PAGE_LEN keys followed by PAGE_LEN values.
- * A meta byte is either EMPTY, TOMBSTONE, or the most significant hash byte of
- * the key stored in the corresponding slot (roughly speaking; see hash1). When
- * scanning the dict, we use SIMD to probe PROBE_LEN cells at once. Thus, the
- * dict size n is always at least MAX(PROBE_LEN, PAGE_LEN). Actually, we also
- * require the dict size to be at least R_CACHE_LINE_SIZE, so that the meta
- * bytes occupy a whole number of cache lines, and so that pages are
- * cache-aligned in the common case of key types K and value types V such that
- * sizeof(K) * PAGE_LEN and sizeof(V) * PAGE_LEN are both multiples of
- * R_CACHE_LINE_SIZE. */
-
-/* TODO: At some point, we should replace SSE2 with AVX2... or should we?
- * Longer probe vectors means higher chance of conflicts within one probe
- * vector, so maybe SSE2 is actually better. Before replacing SSE2 with AVX2,
- * be sure to benchmark and do some statistical analysis. */
-#if defined(__GNUC__) && defined(R_HAVE_SSE2)
-#include "simd.h"
-#define USE_SIMD 1
-#define PROBE_LEN 16u
-#define PROBE_LEN_BITS 4u
-#else
-#define PROBE_LEN 8u
-#define PROBE_LEN_BITS 3u
-#endif
-
-/* PAGE_LEN should be a multiple of 8 for packing and alignment. */
-/* TODO: make these parameters programmer-controllable. PAGE_LEN == 8 is
- * a conservative choice. In most cases (i.e., with the key size being a
- * multiple of 8), PAGE_LEN == 1 causes no packing issues and would improve
- * caching performance with very large keys or values. */
-#define PAGE_LEN 8u
-#define PAGE_LEN_BITS 3u
-
-/* The dict size doubles when count + ntombs exceeds the proportion of the
- * dict size indicated by LF_NUM and LF_DEN. See exceeds_lf. */
-#define LF_NUM 3u
-#define LF_DEN 4u
-
-#define EMPTY 0u /* Must be 0. */
-#define TOMBSTONE 1u
-#define MIN_HASH1 2u
-
-enum scan_mode {
- ACCESS = 0,
- INSERT = 1,
- DELETE = 2,
-};
-
-static inline u8
-hash1(u64 h, u8 b) {
- u64 h1 = h >> 56;
- if (h1 < MIN_HASH1) h1 += MIN_HASH1;
- return h1;
-}
-
-static inline bool
-exceeds_lf(u64 n, u8 b) {
- /* Note that the math here is 64-bit to avoid overflow. */
- return LF_DEN * n > LF_NUM * (U64_C(1) << b);
-}
-
-static int
-alloc_and_align(void **mem, void **data, u8 b, RDictSpec *s) {
- usize len = USIZE_C(1) << b;
- /* XXX: We assume ksize and vsize are small, so computing bytes_per_slot
- * shouldn't overflow. */
- usize bytes_per_slot = 1u + s->ksize + s->vsize;
- if (r_mul_will_overflow_(len, bytes_per_slot)) goto fail;
- usize data_size = len * bytes_per_slot;
- if (data_size > USIZE_MAX - (R_CACHE_LINE_SIZE - 1)) goto fail;
- usize mem_size = (R_CACHE_LINE_SIZE - 1) + data_size;
- *mem = s->allocz(mem_size);
- if (!*mem) goto fail;
- *data = (void *)(((uptr)*mem + (R_CACHE_LINE_SIZE - 1))
- & -(uptr)R_CACHE_LINE_SIZE);
-
- return 0;
-
-fail:
- errno = ENOMEM;
- return -1;
-}
-
-/* grow_insert is a version of scan specialized for inserting while growing.
- * See the comments in scan for explanation. */
-static void
-grow_insert(u8 b, u8 *meta, u8 *pages, void *k, void *v, u64 h, RDictSpec *s) {
- usize mvi_mask = (USIZE_C(1) << (b - PROBE_LEN_BITS)) - 1u;
-
- usize h0 = h & mvi_mask;
- u8 h1 = hash1(h, b);
-
-#ifdef USE_SIMD
- v16u8 vempty = v16u8_fill(EMPTY);
-#endif
-
- usize insert_slot;
-
- for (usize mvi = h0, jump = 1;; mvi = (mvi + jump) & mvi_mask, jump++) {
- usize i = mvi * PROBE_LEN;
-
-#ifdef USE_SIMD
- v16u8 vmeta; memcpy(&vmeta, meta+i, sizeof vmeta);
- u16 m0 = v16u8_msb(v16u8_cmpeq(vmeta, vempty));
- if (m0 != 0) {
- insert_slot = i + r_ctz16(m0);
- break;
- }
-#else
- u64 vmeta = r_readl64(meta+i);
- int j0 = r_r8search64(vmeta);
- if (j0 != 8) {
- insert_slot = i + j0;
- break;
- }
-#endif
- }
-
- meta[insert_slot] = h1;
- usize page_slot = insert_slot % PAGE_LEN;
- u8 *page = pages + (insert_slot - page_slot) * (s->ksize + s->vsize);
- memcpy(page + page_slot * s->ksize, k, s->ksize);
- memcpy(page + PAGE_LEN * s->ksize + page_slot * s->vsize, v, s->vsize);
-}
-
-static int
-grow(RDict *d, RDictSpec *s) {
- u8 oldb = d->b;
- usize oldlen = USIZE_C(1) << oldb;
- void *oldmem = d->mem;
- u8 *oldmeta = d->data;
- u8 *oldpages = (u8 *)d->data + oldlen;
-
- u8 newb = oldb + 1;
- ASSERT(newb < USIZE_BITS);
- usize newlen = USIZE_C(1) << newb;
- void *newmem, *newdata;
- if (alloc_and_align(&newmem, &newdata, newb, s) < 0) return -1;
- u8 *newmeta = newdata;
- u8 *newpages = (u8 *)newdata + newlen;
-
- /* Rehash. */
- for (usize slot = 0; slot < oldlen; slot++) {
- if (oldmeta[slot] < MIN_HASH1) continue;
- usize page_slot = slot % PAGE_LEN;
- u8 *page = oldpages + (slot - page_slot) * (s->ksize + s->vsize);
- void *k = page + page_slot * s->ksize;
- void *v = page + PAGE_LEN * s->ksize + page_slot * s->vsize;
- u64 h = s->hash(k, d->seed, s);
- grow_insert(newb, newmeta, newpages, k, v, h, s);
- }
-
- s->free(oldmem);
-
- d->b = newb;
- d->ntombs = 0;
- d->mem = newmem;
- d->data = newdata;
-
- return 0;
-}
-
-/* scan is inline, so that the mode argument becomes constant in all uses. */
-static inline int
-scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *s) {
- u8 *meta = d->data;
- u8 *pages = (u8 *)d->data + (USIZE_C(1) << d->b);
-
- usize mvi_mask = (USIZE_C(1) << (d->b - PROBE_LEN_BITS)) - 1u;
-
- usize h0 = h & mvi_mask;
- u8 h1 = hash1(h, d->b);
-
-#ifdef USE_SIMD
- v16u8 vh1 = v16u8_fill(h1);
- v16u8 vempty = v16u8_fill(EMPTY);
-#else
- u64 vh1; memset(&vh1, h1, sizeof vh1);
-#endif
-
- /* These variables should get optimized away when mode != INSERT. */
- bool insert_slot_found = false;
- usize insert_slot;
- bool insert_replacing_tomb;
-
- /* mvi is the metadata vector index (where each metadata vector is a
- * u8[PROBE_LEN]), and i is the corresponding index in the meta table.
- * Note that mvi is increased quadratically. */
- for (usize mvi = h0, jump = 1;; mvi = (mvi + jump) & mvi_mask, jump++) {
- usize i = mvi * PROBE_LEN;
-
- /* We must load the metadata vector as little endian, so that low bits
- * in the vector represent earlier slots in the table and hence
- * ctz/r8search gives us earlier slots first. In the SIMD case, we
- * know we're on x86, so a memcpy suffices. In the portable case, we
- * use readl64, which does a byte swap on big endian machines. */
-#ifdef USE_SIMD
- v16u8 vmeta; memcpy(&vmeta, meta+i, sizeof vmeta);
- /* m is a bit array indicating where vmeta equals h1. Thus, ctz(m) is
- * the least j such that meta[j] == h1 (provided m != 0). */
- for (u16 m = v16u8_msb(v16u8_cmpeq(vmeta, vh1)); m != 0; m &= m - 1) {
- int j = r_ctz16(m);
-#else
- u64 vmeta = r_readl64(meta+i);
- /* Initially, z has 0x00 exactly where vmeta has h1. Thus, we can use
- * r8search64 on z to find the indices j of h1 in vmeta. After each
- * iteration, we mask in an arbitrary bit into the jth byte so that
- * the next iteration gets a different j. */
- u64 z = vmeta ^ vh1;
- for (int j = r_r8search64(z); j != 8; z |= U64_C(1)<<(8*j), j = r_r8search64(z)) {
-#endif
- usize slot = i + j;
- usize page_slot = slot % PAGE_LEN;
- u8 *page = pages + (slot - page_slot) * (s->ksize + s->vsize);
- if likely (s->eq(page + page_slot * s->ksize, k, s)) {
- if (mode == DELETE) {
- /* If the probe vector contains an empty slot, then we
- * don't need to insert a tombstone, and fewer tombstones
- * means lower load factor. */
-#ifdef USE_SIMD
- if (v16u8_msb(v16u8_cmpeq(vmeta, vempty)) != 0) {
-#else
- if (r_r8search64(vmeta) != 8) {
-#endif
- meta[slot] = EMPTY;
- } else {
- meta[slot] = TOMBSTONE;
- d->ntombs += 1;
- }
- d->count -= 1;
- }
- *v = page + PAGE_LEN * s->ksize + page_slot * s->vsize;
- return 1;
- }
- }
-
- /* The key is not in this probe vector. Search for an empty slot to
- * determine if we need to check more probe vectors. */
-#ifdef USE_SIMD
- u16 m0 = v16u8_msb(v16u8_cmpeq(vmeta, vempty));
-#else
- int j0 = r_r8search64(vmeta);
-#endif
-
- /* When inserting, we need to remember the first empty slot or
- * tombstone seen in case we don't find an existing slot with the
- * given key. We prioritize replacing tombstones to decrease the
- * load factor. */
- if (mode == INSERT && !insert_slot_found) {
-#ifdef USE_SIMD
- v16u8 vtomb = v16u8_fill(TOMBSTONE);
- u16 m1 = v16u8_msb(v16u8_cmpeq(vmeta, vtomb));
- if (m1 != 0) {
- insert_slot_found = true;
- insert_slot = i + r_ctz16(m1);
- insert_replacing_tomb = true;
- } else if (m0 != 0) {
- insert_slot_found = true;
- insert_slot = i + r_ctz16(m0);
- insert_replacing_tomb = false;
- break;
- }
-#else
- u64 vtomb; memset(&vtomb, TOMBSTONE, sizeof vtomb);
- int j1 = r_r8search64(vmeta ^ vtomb);
- if (j1 != 8) {
- insert_slot_found = true;
- insert_slot = i + j1;
- insert_replacing_tomb = true;
- } else if (j0 != 8) {
- insert_slot_found = true;
- insert_slot = i + j0;
- insert_replacing_tomb = false;
- break;
- }
-#endif
- }
-
-#ifdef USE_SIMD
- if (m0 != 0) break;
-#else
- if (j0 != 8) break;
-#endif
- }
-
- if (mode == INSERT) {
- /* The key wasn't found, so insert in the first available slot found
- * along the probe sequence. */
- ASSERT(insert_slot_found);
- d->count += 1;
- if (insert_replacing_tomb) d->ntombs -= 1;
- meta[insert_slot] = h1;
- usize page_slot = insert_slot % PAGE_LEN;
- u8 *page = pages + (insert_slot - page_slot) * (s->ksize + s->vsize);
- memcpy(page + page_slot * s->ksize, k, s->ksize);
- *v = page + PAGE_LEN * s->ksize + page_slot * s->vsize;
- }
-
- return 0;
-}
-
-int
-r_dict_init(RDict *d, usize hint, RDictSpec *s) {
- memset(d, 0, sizeof *d);
-
- /* Impose maximum on hint to prevent overflow and the following while
- * loop from being infinite. See exceeds_lf. */
- hint = MIN(hint, (U64_MAX / LF_DEN) >> 1);
- u8 b = MAX(R_CACHE_LINE_BITS, PROBE_LEN_BITS);
- b = MAX(b, PAGE_LEN_BITS);
- while (exceeds_lf(hint, b)) b += 1;
- ASSERT(b < USIZE_BITS);
- d->b = b;
-
- if (alloc_and_align(&d->mem, &d->data, b, s) < 0) return -1;
-
- d->seed = r_prand64();
-
- return 0;
-}
-
-void
-r_dict_free(RDict *d, RDictSpec *s) {
- int e = errno;
- if (d->niter > 0)
- r_warnf("dict freed with %"PRId32" iterators alive", d->niter);
- s->free(d->mem);
- d->data = 0; /* Ensure hard error on use after free. */
- errno = e;
-}
-
-int
-r_dict_access(RDict *d, void **v, void *k, u64 h, RDictSpec *s) {
- return scan(d, v, k, h, ACCESS, s);
-}
-
-int
-r_dict_insert(RDict *d, void **v, void *k, u64 h, RDictSpec *s) {
- if (exceeds_lf(d->count + d->ntombs, d->b)) {
- if (grow(d, s) < 0) return -1;
- }
- return scan(d, v, k, h, INSERT, s);
-}
-
-int
-r_dict_delete(RDict *d, void **v, void *k, u64 h, RDictSpec *s) {
- return scan(d, v, k, h, DELETE, s);
-}
-
-void
-r_dict_clear(RDict *d, RDictSpec *s) {
- memset(d->data, EMPTY, USIZE_C(1) << d->b);
- d->count = 0;
- d->ntombs = 0;
-
- /* Re-seed to prevent some attacks. */
- d->seed = r_prand64();
-}
-
-// TODO: iter