Progress on SIMDing dict - rcx - library of miscellaneous bits of C code

commit 7657930c27c7c96a5fa55334cfc87eb4404ba57f
parent ef66779c5a7e01591a175eb50a466cdb84679940
Author: Robert Russell <robertrussell.72001@gmail.com>
Date:   Sat,  3 Jun 2023 18:46:09 -0700

Progress on SIMDing dict

Not complete. Also, I need to rethink the dict structure. Storing
PAGE_LEN metas, then PAGE_LEN keys, and then PAGE_LEN vals does not
make much sense when PAGE_LEN is large. (The point before was that
the cache miss to read a meta/key would probably prefetch the
corresponding value, thus saving a cache miss.) It probably makes
more sense to have all meta bytes in their own array (but still same
allocation, of course) and stagger keys/vals in length 8 arrays
(like before).

Diffstat:
M src/dict.c  | 159 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------

1 file changed, 118 insertions(+), 41 deletions(-)
diff --git a/src/dict.c b/src/dict.c
@@ -7,16 +7,21 @@
 #include "rand.h"
 #include "rcx.h"
 
-/* PAGE_LEN should be a multiple of the machine word size (i.e., 8 on 64-bit
- * arches) for optimal packing and to prevent alignment issues. There are
- * places where we assume it is exactly 8, so don't change this without
- * auditing the code. */
+#if defined(__GNUC__) && defined(R_HAVE_AVX2) && 0
+#include "simd.h"
+#define PAGE_LEN 32u
+#elif defined(__GNUC__) && defined(R_HAVE_SSE2) && 0
+#include "simd.h"
+#define PAGE_LEN 16u
+#else
 #define PAGE_LEN 8u
+#endif
+
 #define LF_NUM 3u
 #define LF_DEN 4u
 #define EMPTY 0u
 #define TOMBSTONE 1u
-#define MIN_HASH2 2u
+#define MIN_HASH1 2u
 
 typedef struct r_dict_spec Spec;
 typedef struct r_dict Dict;
@@ -39,7 +44,7 @@ struct r_dict {
 };
 
 struct page {
-	u8 h2[PAGE_LEN];
+	u8 meta[PAGE_LEN];
 	/* K key[PAGE_LEN]; */
 	/* V val[PAGE_LEN]; */
 };
@@ -49,15 +54,10 @@ hash0(u64 h, u8 b) {
 	return h & ((U64_C(1) << b) - 1u);
 }
 
-static inline u64
-hash1(u64 h, u8 b) {
-	return (h >> b) & (PAGE_LEN - 1u);
-}
-
 static inline u8
-hash2(u64 h, u8 b) {
+hash1(u64 h, u8 b) {
 	u64 t = h >> 56;
-	if (t < MIN_HASH2) t += MIN_HASH2;
+	if (t < MIN_HASH1) t += MIN_HASH1;
 	return t;
 }
 
@@ -67,6 +67,8 @@ exceeds_lf(u64 n, u8 b) {
 	return LF_DEN * n > LF_NUM * PAGE_LEN * (U64_C(1) << b);
 }
 
+/* TODO: try computing psize when in dict.h instead. */
+
 /* Compute Page size *without* overflow checks. */
 static inline usize
 psize_unsafe(Spec *spec) {
@@ -103,20 +105,23 @@ val(Page *p, usize j, Spec *spec) {
 
 static void
 insert(Page *pages, u8 b, void *k, void *v, u64 h, Spec *spec) {
-	usize psize = psize_unsafe(spec);
+	assert(false);
 
+	usize psize = psize_unsafe(spec);
 	u64 h0 = hash0(h, b);
-	u64 h1 = hash1(h, b);
-	u8 h2 = hash2(h, b);
+	u8 h1 = hash1(h, b);
 
 	usize pn = h0;
 	for (usize i = 1;; i++) {
 		Page *p = (void *)((u8 *)pages + pn*psize);
 
+		/* TODO: this is wrong. Should probably just generalize scan and
+		 * replace insert with scan. Otherwise we need a bunch of #ifs here. */
+
 		usize j = h1;
 		do {
-			if (p->h2[j] < MIN_HASH2) {
-				p->h2[j] = h2;
+			if (p->meta[j] < MIN_HASH1) {
+				p->meta[j] = h1;
 				memcpy(key(p, j, spec), k, spec->ksize);
 				memcpy(val(p, j, spec), v, spec->vsize);
 				return;
@@ -152,7 +157,7 @@ grow(Dict *d, Spec *spec) {
 		Page *p = (void *)((u8 *)old + i*psize);
 
 		for (usize j = 0; j < PAGE_LEN; j++) {
-			if (p->h2[j] < MIN_HASH2) continue;
+			if (p->meta[j] < MIN_HASH1) continue;
 			void *k = key(p, j, spec);
 			void *v = val(p, j, spec);
 			u64 h = spec->hash(k, d->seed, spec);
@@ -182,31 +187,55 @@ scan(Dict *d, void **v, void *k, u64 h, int mode, Spec *spec) {
 
 	usize psize = psize_unsafe(spec);
 	u64 h0 = hash0(h, d->b);
-	u64 h1 = hash1(h, d->b);
-	u8 h2 = hash2(h, d->b);
+	u8 h1 = hash1(h, d->b);
+
+#if PAGE_LEN == 32
+	v32u8 vh1 = v32u8_fill(h1);
+	v32u8 vempty = v32u8_fill(EMPTY);
+#elif PAGE_LEN == 16
+	v16u8 vh1 = v16u8_fill(h1);
+	v16u8 vempty = v16u8_fill(EMPTY);
+#else
+	u64 vh1; memset(&vh1, h1, sizeof vh1);
+#endif
 
 	usize pn = h0;
 	Page *createp = 0;
-	usize createj = 0; /* XXX: this does not need to be initialized, but GCC doesn't realize that */
+	usize createj;
+	bool createtomb;
 	for (usize i = 1;; i++) {
 		Page *p = (void *)((u8 *)d->pages + pn*psize);
 
-		bool seen_empty = false;
-		usize j = h1;
-		do {
-			if (p->h2[j] < MIN_HASH2) {
-				seen_empty = p->h2[j] == EMPTY;
-				if (!createp) {
-					createp = p;
-					createj = j;
-				}
-			} else if (p->h2[j] == h2 && spec->eq(key(p, j, spec), k, spec)) {
-				/* TODO: try branch predict hint true on eq */
+		/* We must load the metadata vector as little endian, so that low bits
+		 * in the vector represent earlier slots in the table and hence ctz
+		 * gives us earlier slots first. In the SIMD cases, we know we're on
+		 * x86, so a memcpy suffices. In the general fallback case, we use
+		 * readl64, which does a byte swap on big endian machines. */
+#if PAGE_LEN == 32
+		v32u8 meta; memcpy(&meta, &p->meta, sizeof meta);
+		for (u32 m = v32u8_msb(v32u8_cmpeq(meta, vh1)); m != 0; m &= m - 1) {
+			int j = r_ctz32(m);
+#elif PAGE_LEN == 16
+		v16u8 meta; memcpy(&meta, &p->meta, sizeof meta);
+		for (u16 m = v16u8_msb(v16u8_cmpeq(meta, vh1)); m != 0; m &= m - 1) {
+			int j = r_ctz16(m);
+#else
+		u64 meta = r_readl64(&p->meta);
+		u64 z = vh1 ^ meta;
+		for (int j = r_r8search64(z); j < 8; z |= U64_C(1) << (8*j), j = r_r8search64(z)) {
+#endif
+			if likely (spec->eq(key(p, j, spec), k, spec)) {
 				if (mode == DELETE) {
-					if (seen_empty || r_r8search64(r_readh64(p->h2)) < 8) {
-						p->h2[j] = EMPTY;
+#if PAGE_LEN == 32
+					if (v32u8_msb(v32u8_cmpeq(meta, vempty)) != 0) {
+#elif PAGE_LEN == 16
+					if (v16u8_msb(v16u8_cmpeq(meta, vempty)) != 0) {
+#else
+					if (r_r8search64(meta) < 8) {
+#endif
+						p->meta[j] = EMPTY;
 					} else {
-						p->h2[j] = TOMBSTONE;
+						p->meta[j] = TOMBSTONE;
 						d->ntombs += 1;
 					}
 					d->count -= 1;
@@ -214,18 +243,66 @@ scan(Dict *d, void **v, void *k, u64 h, int mode, Spec *spec) {
 				*v = val(p, j, spec);
 				return 1;
 			}
+		}
 
-			j = (j + 1) & (PAGE_LEN - 1);
-		} while (j != h1);
+		/* Search for an EMPTY slot. */
+#if PAGE_LEN == 32
+		u32 m0 = v32u8_msb(v32u8_cmpeq(meta, vempty));
+#elif PAGE_LEN == 16
+		u16 m0 = v16u8_msb(v16u8_cmpeq(meta, vempty));
+#else
+		int j0 = r_r8search64(meta);
+#endif
+
+		/* In mode CREATE, we need to look for an EMPTY or TOMBSTONE slot in
+		 * case we don't find an existing slot with the given key. We
+		 * prioritize filling TOMBSTONE slots to decrease the load factor. */
+		if (mode == CREATE && !createp) {
+#if PAGE_LEN == 32
+			v32u8 vtomb = v32u8_fill(TOMBSTONE);
+			u32 m1 = v32u8_msb(v32u8_cmpeq(meta, vtomb));
+			if (m1 != 0) {
+				createp = p; createj = r_ctz32(m1); createtomb = true;
+			} else if (m0 != 0) {
+				createp = p; createj = r_ctz32(m0); createtomb = false;
+				break;
+			}
+#elif PAGE_LEN == 16
+			v16u8 vtomb = v16u8_fill(TOMBSTONE);
+			u16 m1 = v16u8_msb(v16u8_cmpeq(meta, vtomb));
+			if (m1 != 0) {
+				createp = p; createj = r_ctz16(m1); createtomb = true;
+			} else if (m0 != 0) {
+				createp = p; createj = r_ctz16(m0); createtomb = false;
+				break;
+			}
+#else
+			u64 vtomb; memset(&vtomb, TOMBSTONE, sizeof vtomb);
+			int j1 = r_r8search64(meta ^ vtomb);
+			if (j1 != 8) {
+				createp = p; createj = j1; createtomb = true;
+			} else if (j0 != 8) {
+				createp = p; createj = j0; createtomb = false;
+				break;
+			}
+#endif
+		}
 
-		if (seen_empty) break;
+		/* If the page contains an EMPTY slot, the key can't be any further. */
+#if PAGE_LEN > 8
+		if (m0 != 0) break;
+#else
+		if (j0 != 8) break;
+#endif
 
+		/* Quadratic probing between pages */
 		pn = (pn + i) & ((USIZE_C(1) << d->b) - 1u);
 	}
 
 	if (mode == CREATE) {
 		d->count += 1;
-		createp->h2[createj] = h2;
+		if (createtomb) d->ntombs -= 1;
+		createp->meta[createj] = h1;
 		/* TODO: try returning a pointer to the key slot instead, so that the
 		 * compiler can generate copy code when ksize is a constant */
 		memcpy(key(createp, createj, spec), k, spec->ksize);
@@ -283,7 +360,7 @@ r_dict_clear(Dict *d, Spec *spec) {
 	usize len = USIZE_C(1) << d->b;
 	for (usize i = 0; i < len; i++) {
 		Page *p = (void *)((u8 *)d->pages + i*psize);
-		memset(p->h2, EMPTY, sizeof p->h2);
+		memset(p->meta, EMPTY, sizeof p->meta);
 	}
 
 	d->count = 0;

	rcx library of miscellaneous bits of C code
	git clone git://git.rr3.xyz/rcx
	Log \| Files \| Refs \| README \| LICENSE