rcx

library of miscellaneous bits of C code
git clone git://git.rr3.xyz/rcx

commit 58461ad3af99902e7ef1a93c40d90ae031f9d9b5
parent 798ef8714182bff677eb386c979d3b782b072667
Author: Robert Russell <robertrussell.72001@gmail.com>
Date:   Sun, 13 Aug 2023 15:15:52 -0700

Improve readability in dict.c

Diffstat:
M inc/def.h  |  18 +++++++++---------
M inc/dict.h |   2 +-
M src/dict.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
3 files changed, 116 insertions(+), 84 deletions(-)

diff --git a/inc/def.h b/inc/def.h
@@ -4,6 +4,11 @@
 #include <stddef.h>
 #include <stdint.h>
 
+/* Reasonable code assumes CHAR_BIT == 8. */
+#if CHAR_BIT != 8
+#error "Expected CHAR_BIT == 8"
+#endif
+
 #define JOIN_AUX(a,b) a##b
 #define JOIN(a,b) JOIN_AUX(a,b)
 
@@ -62,11 +67,6 @@
 #define STATIC(T, ...) ({static __typeof__(T) tmp = __VA_ARGS__; &tmp;})
 #endif
 
-/* Reasonable code assumes CHAR_BIT == 8. */
-#if CHAR_BIT != 8
-#error "Expected CHAR_BIT == 8"
-#endif
-
 #if !defined(R_LITTLE_ENDIAN) && !defined(R_BIG_ENDIAN)
 #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
 #define R_LITTLE_ENDIAN 1
@@ -83,10 +83,6 @@
 #endif
 #define R_CACHE_LINE_SIZE (1 << (R_CACHE_LINE_BITS))
 
-#ifdef __SIZEOF_INT128__
-#define R_HAVE_128 1
-#endif
-
 #ifdef __MMX__
 #define R_HAVE_MMX 1
 #endif
@@ -115,6 +111,10 @@
 #define R_HAVE_AVX2 1
 #endif
 
+#ifdef __SIZEOF_INT128__
+#define R_HAVE_128 1
+#endif
+
 /* Correct the mistakes of whoever named these macros */
 #define SHORT_MIN SHRT_MIN
 #define SHORT_MAX SHRT_MAX
diff --git a/inc/dict.h b/inc/dict.h
@@ -32,7 +32,7 @@ struct r_dict {
 	usize count;
 	usize ntombs;
 	void *mem;
-	void *data; /* Cache-aligned pointer into mem */
+	void *data; /* Cache-aligned pointer into mem; see dict.c for layout. */
 };
 
 static inline bool r_dict_mem_eq(void *a, void *b, RDictSpec *s) { return memcmp(a, b, s->ksize) == 0; }
diff --git a/src/dict.c b/src/dict.c
@@ -2,12 +2,30 @@
 #include <string.h>
 
 #include "bits.h"
+#include "debug.h"
 #include "dict.h"
 #include "internal/util.h"
 #include "log.h"
 #include "rand.h"
 #include "rcx.h"
 
+/* In a dict with n slots (i.e., the dict has room for n key-value pairs), the
+ * allocated dict data consists of n metadata bytes followed by n/PAGE_LEN
+ * pages, where a page consists of PAGE_LEN keys followed by PAGE_LEN values.
+ * A meta byte is either EMPTY, TOMBSTONE, or the most significant hash byte of
+ * the key stored in the corresponding slot (roughly speaking; see hash1). When
+ * scanning the dict, we use SIMD to probe PROBE_LEN cells at once. Thus, the
+ * dict size n is always at least MAX(PROBE_LEN, PAGE_LEN). Actually, we also
+ * require the dict size to be at least R_CACHE_LINE_SIZE, so that the meta
+ * bytes occupy a whole number of cache lines, and so that pages are
+ * cache-aligned in the common case of key types K and value types V such that
+ * sizeof(K) * PAGE_LEN and sizeof(V) * PAGE_LEN are both multiples of
+ * R_CACHE_LINE_SIZE. */
+
+/* TODO: At some point, we should replace SSE2 with AVX2... or should we?
+ * Longer probe vectors means higher chance of conflicts within one probe
+ * vector, so maybe SSE2 is actually better. Before replacing SSE2 with AVX2,
+ * be sure to benchmark and do some statistical analysis. */
 #if defined(__GNUC__) && defined(R_HAVE_SSE2)
 #include "simd.h"
 #define USE_SIMD 1
@@ -20,11 +38,14 @@
 
 /* PAGE_LEN should be a multiple of 8 for packing and alignment. */
 /* TODO: make these parameters programmer-controllable. PAGE_LEN == 8 is
- * a conservative choice. In most cases (i.e., ksize a multiple of 8),
- * PAGE_LEN == 1 causes no packing issues and would improve caching
- * performance with very large keys or values. */
+ * a conservative choice. In most cases (i.e., with the key size being a
+ * multiple of 8), PAGE_LEN == 1 causes no packing issues and would improve
+ * caching performance with very large keys or values. */
 #define PAGE_LEN 8u
 #define PAGE_LEN_BITS 3u
+
+/* The dict size doubles when count + ntombs exceeds the proportion of the
+ * dict size indicated by LF_NUM and LF_DEN. See exceeds_lf. */
 #define LF_NUM 3u
 #define LF_DEN 4u
 
@@ -32,8 +53,6 @@
 #define TOMBSTONE 1u
 #define MIN_HASH1 2u
 
-typedef u8 Page; /* For readability only; a page isn't really a u8. */
-
 enum scan_mode {
 	ACCESS = 0,
 	INSERT = 1,
@@ -56,12 +75,17 @@ exceeds_lf(u64 n, u8 b) {
 static int
 alloc_and_align(void **mem, void **data, u8 b, RDictSpec *s) {
 	usize len = USIZE_C(1) << b;
-	if (r_mul_will_overflow_(len, 1u + s->ksize + s->vsize)) goto fail;
-	usize dsize = len * (1u + s->ksize + s->vsize);
-	if (dsize > USIZE_MAX - (R_CACHE_LINE_SIZE - 1)) goto fail;
-	*mem = s->allocz((R_CACHE_LINE_SIZE - 1) + dsize);
+	/* XXX: We assume ksize and vsize are small, so computing bytes_per_slot
+	 * shouldn't overflow. */
+	usize bytes_per_slot = 1u + s->ksize + s->vsize;
+	if (r_mul_will_overflow_(len, bytes_per_slot)) goto fail;
+	usize data_size = len * bytes_per_slot;
+	if (data_size > USIZE_MAX - (R_CACHE_LINE_SIZE - 1)) goto fail;
+	usize mem_size = (R_CACHE_LINE_SIZE - 1) + data_size;
+	*mem = s->allocz(mem_size);
 	if (!*mem) goto fail;
-	*data = (void *)(((uptr)*mem + (R_CACHE_LINE_SIZE - 1)) & -(uptr)R_CACHE_LINE_SIZE);
+	*data = (void *)(((uptr)*mem + (R_CACHE_LINE_SIZE - 1))
+	                 & -(uptr)R_CACHE_LINE_SIZE);
 
 	return 0;
 
@@ -70,45 +94,46 @@ fail:
 	return -1;
 }
 
-/* This is a version of scan specialized for inserting while growing.
+/* grow_insert is a version of scan specialized for inserting while growing.
  * See the comments in scan for explanation. */
 static void
-grow_insert(u8 b, u8 *meta, Page *pages, void *k, void *v, u64 h, RDictSpec *s) {
-	usize meta_mask = (USIZE_C(1) << (b - PROBE_LEN_BITS)) - 1u;
+grow_insert(u8 b, u8 *meta, u8 *pages, void *k, void *v, u64 h, RDictSpec *s) {
+	usize mvi_mask = (USIZE_C(1) << (b - PROBE_LEN_BITS)) - 1u;
 
-	usize h0 = h & meta_mask;
+	usize h0 = h & mvi_mask;
 	u8 h1 = hash1(h, b);
 
 #ifdef USE_SIMD
 	v16u8 vempty = v16u8_fill(EMPTY);
 #endif
 
-	usize insert_idx;
-	for (usize mvi = h0, jump = 1;; mvi = (mvi + jump) & meta_mask, jump++) {
+	usize insert_slot;
+
+	for (usize mvi = h0, jump = 1;; mvi = (mvi + jump) & mvi_mask, jump++) {
 		usize i = mvi * PROBE_LEN;
 #ifdef USE_SIMD
 		v16u8 vmeta;
 		memcpy(&vmeta, meta+i, sizeof vmeta);
 		u16 m0 = v16u8_msb(v16u8_cmpeq(vmeta, vempty));
 		if (m0 != 0) {
-			insert_idx = i + r_ctz16(m0);
+			insert_slot = i + r_ctz16(m0);
 			break;
 		}
#else
 		u64 vmeta = r_readl64(meta+i);
 		int j0 = r_r8search64(vmeta);
 		if (j0 != 8) {
-			insert_idx = i + j0;
+			insert_slot = i + j0;
 			break;
 		}
#endif
 	}
 
-	meta[insert_idx] = h1;
-	usize page_idx = insert_idx % PAGE_LEN;
-	Page *p = pages + (insert_idx - page_idx) * (s->ksize + s->vsize);
-	memcpy(p + page_idx * s->ksize, k, s->ksize);
-	memcpy(p + PAGE_LEN * s->ksize + page_idx * s->vsize, v, s->vsize);
+	meta[insert_slot] = h1;
+	usize page_slot = insert_slot % PAGE_LEN;
+	u8 *page = pages + (insert_slot - page_slot) * (s->ksize + s->vsize);
+	memcpy(page + page_slot * s->ksize, k, s->ksize);
+	memcpy(page + PAGE_LEN * s->ksize + page_slot * s->vsize, v, s->vsize);
 }
 
 static int
@@ -117,22 +142,23 @@ grow(RDict *d, RDictSpec *s) {
 	usize oldlen = USIZE_C(1) << oldb;
 	void *oldmem = d->mem;
 	u8 *oldmeta = d->data;
-	Page *oldpages = (u8 *)d->data + oldlen;
+	u8 *oldpages = (u8 *)d->data + oldlen;
 
 	u8 newb = oldb + 1;
-	assert(newb < USIZE_BITS);
+	ASSERT(newb < USIZE_BITS);
 	usize newlen = USIZE_C(1) << newb;
 	void *newmem, *newdata;
 	if (alloc_and_align(&newmem, &newdata, newb, s) < 0) return -1;
 	u8 *newmeta = newdata;
-	Page *newpages = (u8 *)newdata + newlen;
-
-	for (usize i = 0; i < oldlen; i++) {
-		if (oldmeta[i] < MIN_HASH1) continue;
-		usize page_idx = i % PAGE_LEN;
-		Page *p = oldpages + (i - page_idx) * (s->ksize + s->vsize);
-		void *k = p + page_idx * s->ksize;
-		void *v = p + PAGE_LEN * s->ksize + page_idx * s->vsize;
+	u8 *newpages = (u8 *)newdata + newlen;
+
+	/* Rehash. */
+	for (usize slot = 0; slot < oldlen; slot++) {
+		if (oldmeta[slot] < MIN_HASH1) continue;
+		usize page_slot = slot % PAGE_LEN;
+		u8 *page = oldpages + (slot - page_slot) * (s->ksize + s->vsize);
+		void *k = page + page_slot * s->ksize;
+		void *v = page + PAGE_LEN * s->ksize + page_slot * s->vsize;
 		u64 h = s->hash(k, d->seed, s);
 		grow_insert(newb, newmeta, newpages, k, v, h, s);
 	}
@@ -147,14 +173,15 @@ grow(RDict *d, RDictSpec *s) {
 	return 0;
 }
 
+/* scan is inline, so that the mode argument becomes constant in all uses. */
 static inline int
 scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *s) {
-	/* meta_mask is a mask for reducing modulo the table size. */
-	usize meta_mask = (USIZE_C(1) << (d->b - PROBE_LEN_BITS)) - 1u;
 	u8 *meta = d->data;
-	Page *pages = (u8 *)d->data + (USIZE_C(1) << d->b);
+	u8 *pages = (u8 *)d->data + (USIZE_C(1) << d->b);
 
-	usize h0 = h & meta_mask;
+	usize mvi_mask = (USIZE_C(1) << (d->b - PROBE_LEN_BITS)) - 1u;
+
+	usize h0 = h & mvi_mask;
 	u8 h1 = hash1(h, d->b);
 
 #ifdef USE_SIMD
@@ -164,81 +191,85 @@ scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *s) {
 	u64 vh1;
 	memset(&vh1, h1, sizeof vh1);
 #endif
 
+	/* These variables should get optimized away when mode != INSERT. */
 	bool insert_slot_found = false;
-	usize insert_idx;
+	usize insert_slot;
 	bool insert_replacing_tomb;
+
 	/* mvi is the metadata vector index (where each metadata vector is a
 	 * u8[PROBE_LEN]), and i is the corresponding index in the meta table.
 	 * Note that mvi is increased quadratically. */
-	for (usize mvi = h0, jump = 1;; mvi = (mvi + jump) & meta_mask, jump++) {
+	for (usize mvi = h0, jump = 1;; mvi = (mvi + jump) & mvi_mask, jump++) {
 		usize i = mvi * PROBE_LEN;
 		/* We must load the metadata vector as little endian, so that low bits
 		 * in the vector represent earlier slots in the table and hence
-		 * ctz/r8search gives us earlier slots first. In the SIMD cases, we
+		 * ctz/r8search gives us earlier slots first. In the SIMD case, we
 		 * know we're on x86, so a memcpy suffices. In the portable case, we
-		 * use readl64, which does a byte swap on big endian machines.
-		 *
-		 * In the SIMD case, m is a bit array indicating where vmeta equals
-		 * h1. Thus, ctz(m) is the least j such that meta[j] == h1 (provided
-		 * m != 0).
-		 *
-		 * In the portable case, z initially has 0s exactly where vmeta has h1.
-		 * Thus, we can use r8search64 on z to find the indices j of h1 in
-		 * vmeta. After each iteration, we mask in an arbitrary bit into the
-		 * jth byte so that the next iteration gets a different j. */
+		 * use readl64, which does a byte swap on big endian machines. */
#ifdef USE_SIMD
 		v16u8 vmeta;
 		memcpy(&vmeta, meta+i, sizeof vmeta);
+		/* m is a bit array indicating where vmeta equals h1. Thus, ctz(m) is
+		 * the least j such that meta[j] == h1 (provided m != 0). */
 		for (u16 m = v16u8_msb(v16u8_cmpeq(vmeta, vh1)); m != 0; m &= m - 1) {
 			int j = r_ctz16(m);
#else
 		u64 vmeta = r_readl64(meta+i);
+		/* Initially, z has 0x00 exactly where vmeta has h1. Thus, we can use
+		 * r8search64 on z to find the indices j of h1 in vmeta. After each
+		 * iteration, we mask in an arbitrary bit into the jth byte so that
+		 * the next iteration gets a different j. */
 		u64 z = vmeta ^ vh1;
 		for (int j = r_r8search64(z); j != 8; z |= U64_C(1)<<(8*j), j = r_r8search64(z)) {
#endif
-			usize match_idx = i + j;
-			usize page_idx = match_idx % PAGE_LEN;
-			Page *p = pages + (match_idx - page_idx) * (s->ksize + s->vsize);
-			if likely (s->eq(p + page_idx * s->ksize, k, s)) {
+			usize slot = i + j;
+			usize page_slot = slot % PAGE_LEN;
+			u8 *page = pages + (slot - page_slot) * (s->ksize + s->vsize);
+			if likely (s->eq(page + page_slot * s->ksize, k, s)) {
 				if (mode == DELETE) {
+					/* If the probe vector contains an empty slot, then we
+					 * don't need to insert a tombstone, and fewer tombstones
+					 * means lower load factor. */
#ifdef USE_SIMD
 					if (v16u8_msb(v16u8_cmpeq(vmeta, vempty)) != 0) {
#else
 					if (r_r8search64(vmeta) != 8) {
#endif
-						meta[match_idx] = EMPTY;
+						meta[slot] = EMPTY;
 					} else {
-						meta[match_idx] = TOMBSTONE;
+						meta[slot] = TOMBSTONE;
 						d->ntombs += 1;
 					}
 					d->count -= 1;
 				}
-				*v = p + PAGE_LEN * s->ksize + page_idx * s->vsize;
+				*v = page + PAGE_LEN * s->ksize + page_slot * s->vsize;
 				return 1;
 			}
 		}
 
-		/* Search for an EMPTY slot. */
+		/* The key is not in this probe vector. Search for an empty slot to
+		 * determine if we need to check more probe vectors. */
#ifdef USE_SIMD
 		u16 m0 = v16u8_msb(v16u8_cmpeq(vmeta, vempty));
#else
 		int j0 = r_r8search64(vmeta);
#endif
-		/* In mode INSERT, we need to look for an EMPTY or TOMBSTONE slot in
-		 * case we don't find an existing slot with the given key. We
-		 * prioritize filling TOMBSTONE slots to decrease the load factor. */
+		/* When inserting, we need to remember the first empty slot or
+		 * tombstone seen in case we don't find an existing slot with the
+		 * given key. We prioritize replacing tombstones to decrease the
+		 * load factor. */
 		if (mode == INSERT && !insert_slot_found) {
#ifdef USE_SIMD
 			v16u8 vtomb = v16u8_fill(TOMBSTONE);
 			u16 m1 = v16u8_msb(v16u8_cmpeq(vmeta, vtomb));
 			if (m1 != 0) {
 				insert_slot_found = true;
-				insert_idx = i + r_ctz16(m1);
+				insert_slot = i + r_ctz16(m1);
 				insert_replacing_tomb = true;
 			} else if (m0 != 0) {
 				insert_slot_found = true;
-				insert_idx = i + r_ctz16(m0);
+				insert_slot = i + r_ctz16(m0);
 				insert_replacing_tomb = false;
 				break;
 			}
@@ -247,18 +278,17 @@ scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *s) {
 			int j1 = r_r8search64(vmeta ^ vtomb);
 			if (j1 != 8) {
 				insert_slot_found = true;
-				insert_idx = i + j1;
+				insert_slot = i + j1;
 				insert_replacing_tomb = true;
 			} else if (j0 != 8) {
 				insert_slot_found = true;
-				insert_idx = i + j0;
+				insert_slot = i + j0;
 				insert_replacing_tomb = false;
 				break;
 			}
#endif
 		}
 
-		/* If the page contains an EMPTY slot, the key can't be any further. */
#ifdef USE_SIMD
 		if (m0 != 0) break;
#else
@@ -267,13 +297,16 @@ scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *s) {
 	}
 
 	if (mode == INSERT) {
+		/* The key wasn't found, so insert in the first available slot found
+		 * along the probe sequence. */
+		ASSERT(insert_slot_found);
 		d->count += 1;
 		if (insert_replacing_tomb) d->ntombs -= 1;
-		meta[insert_idx] = h1;
-		usize page_idx = insert_idx % PAGE_LEN;
-		Page *p = pages + (insert_idx - page_idx) * (s->ksize + s->vsize);
-		memcpy(p + page_idx * s->ksize, k, s->ksize);
-		*v = p + PAGE_LEN * s->ksize + page_idx * s->vsize;
+		meta[insert_slot] = h1;
+		usize page_slot = insert_slot % PAGE_LEN;
+		u8 *page = pages + (insert_slot - page_slot) * (s->ksize + s->vsize);
+		memcpy(page + page_slot * s->ksize, k, s->ksize);
+		*v = page + PAGE_LEN * s->ksize + page_slot * s->vsize;
 	}
 
 	return 0;
@@ -289,7 +322,7 @@ r_dict_init(RDict *d, usize hint, RDictSpec *s) {
 	u8 b = MAX(R_CACHE_LINE_BITS, PROBE_LEN_BITS);
 	b = MAX(b, PAGE_LEN_BITS);
 	while (exceeds_lf(hint, b)) b += 1;
-	assert(b < USIZE_BITS);
+	ASSERT(b < USIZE_BITS);
 	d->b = b;
 
 	if (alloc_and_align(&d->mem, &d->data, b, s) < 0) return -1;
@@ -317,8 +350,7 @@ r_dict_access(RDict *d, void **v, void *k, u64 h, RDictSpec *s) {
 int
 r_dict_insert(RDict *d, void **v, void *k, u64 h, RDictSpec *s) {
 	if (exceeds_lf(d->count + d->ntombs, d->b)) {
-		if (grow(d, s) < 0)
-			return -1;
+		if (grow(d, s) < 0) return -1;
 	}
 	return scan(d, v, k, h, INSERT, s);
 }
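Note: the layout comment added to dict.c above implies fixed address
arithmetic for each slot, which the renamed page_slot and page variables
spell out. The following is a minimal illustrative sketch, not part of the
commit; slot_key and slot_val are hypothetical helpers, assuming one meta
byte per slot and PAGE_LEN == 8 as in dict.c.

    /* Illustrative sketch only; not rcx code. */
    #include <stddef.h>
    #include <stdint.h>

    #define PAGE_LEN 8u

    /* Pointer to the key stored in `slot`. `pages` points just past the n
     * meta bytes; each page holds PAGE_LEN keys followed by PAGE_LEN values. */
    static void *
    slot_key(uint8_t *pages, size_t slot, size_t ksize, size_t vsize)
    {
        size_t page_slot = slot % PAGE_LEN;  /* index within the page */
        /* (slot - page_slot) is the page index times PAGE_LEN, so this is
         * the byte offset of the page's base. */
        uint8_t *page = pages + (slot - page_slot) * (ksize + vsize);
        return page + page_slot * ksize;     /* keys come first in a page */
    }

    /* Pointer to the value stored in `slot`: skip the page's PAGE_LEN keys. */
    static void *
    slot_val(uint8_t *pages, size_t slot, size_t ksize, size_t vsize)
    {
        size_t page_slot = slot % PAGE_LEN;
        uint8_t *page = pages + (slot - page_slot) * (ksize + vsize);
        return page + PAGE_LEN * ksize + page_slot * vsize;
    }

For scale, the new LF_NUM/LF_DEN comment means a dict with 1 << b == 64 slots
doubles once count + ntombs exceeds 64 * 3 / 4 == 48.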