commit 7657930c27c7c96a5fa55334cfc87eb4404ba57f
parent ef66779c5a7e01591a175eb50a466cdb84679940
Author: Robert Russell <robertrussell.72001@gmail.com>
Date: Sat, 3 Jun 2023 18:46:09 -0700
Progress on SIMDing dict
Not complete. Also, I need to rethink the dict structure. Storing
PAGE_LEN metas, then PAGE_LEN keys, and then PAGE_LEN vals does not
make much sense when PAGE_LEN is large. (The point before was that
the cache miss to read a meta/key would probably prefetch the
corresponding value, thus saving a cache miss.) It probably makes
more sense to have all meta bytes in their own array (but still same
allocation, of course) and stagger keys/vals in length 8 arrays
(like before).
Diffstat:
| M | src/dict.c | | | 159 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------- |
1 file changed, 118 insertions(+), 41 deletions(-)
diff --git a/src/dict.c b/src/dict.c
@@ -7,16 +7,21 @@
#include "rand.h"
#include "rcx.h"
-/* PAGE_LEN should be a multiple of the machine word size (i.e., 8 on 64-bit
- * arches) for optimal packing and to prevent alignment issues. There are
- * places where we assume it is exactly 8, so don't change this without
- * auditing the code. */
+#if defined(__GNUC__) && defined(R_HAVE_AVX2) && 0
+#include "simd.h"
+#define PAGE_LEN 32u
+#elif defined(__GNUC__) && defined(R_HAVE_SSE2) && 0
+#include "simd.h"
+#define PAGE_LEN 16u
+#else
#define PAGE_LEN 8u
+#endif
+
#define LF_NUM 3u
#define LF_DEN 4u
#define EMPTY 0u
#define TOMBSTONE 1u
-#define MIN_HASH2 2u
+#define MIN_HASH1 2u
typedef struct r_dict_spec Spec;
typedef struct r_dict Dict;
@@ -39,7 +44,7 @@ struct r_dict {
};
struct page {
- u8 h2[PAGE_LEN];
+ u8 meta[PAGE_LEN];
/* K key[PAGE_LEN]; */
/* V val[PAGE_LEN]; */
};
@@ -49,15 +54,10 @@ hash0(u64 h, u8 b) {
return h & ((U64_C(1) << b) - 1u);
}
-static inline u64
-hash1(u64 h, u8 b) {
- return (h >> b) & (PAGE_LEN - 1u);
-}
-
static inline u8
-hash2(u64 h, u8 b) {
+hash1(u64 h, u8 b) {
u64 t = h >> 56;
- if (t < MIN_HASH2) t += MIN_HASH2;
+ if (t < MIN_HASH1) t += MIN_HASH1;
return t;
}
@@ -67,6 +67,8 @@ exceeds_lf(u64 n, u8 b) {
return LF_DEN * n > LF_NUM * PAGE_LEN * (U64_C(1) << b);
}
+/* TODO: try computing psize when in dict.h instead. */
+
/* Compute Page size *without* overflow checks. */
static inline usize
psize_unsafe(Spec *spec) {
@@ -103,20 +105,23 @@ val(Page *p, usize j, Spec *spec) {
static void
insert(Page *pages, u8 b, void *k, void *v, u64 h, Spec *spec) {
- usize psize = psize_unsafe(spec);
+ assert(false);
+ usize psize = psize_unsafe(spec);
u64 h0 = hash0(h, b);
- u64 h1 = hash1(h, b);
- u8 h2 = hash2(h, b);
+ u8 h1 = hash1(h, b);
usize pn = h0;
for (usize i = 1;; i++) {
Page *p = (void *)((u8 *)pages + pn*psize);
+ /* TODO: this is wrong. Should probably just generalize scan and
+ * replace insert with scan. Otherwise we need a bunch of #ifs here. */
+
usize j = h1;
do {
- if (p->h2[j] < MIN_HASH2) {
- p->h2[j] = h2;
+ if (p->meta[j] < MIN_HASH1) {
+ p->meta[j] = h1;
memcpy(key(p, j, spec), k, spec->ksize);
memcpy(val(p, j, spec), v, spec->vsize);
return;
@@ -152,7 +157,7 @@ grow(Dict *d, Spec *spec) {
Page *p = (void *)((u8 *)old + i*psize);
for (usize j = 0; j < PAGE_LEN; j++) {
- if (p->h2[j] < MIN_HASH2) continue;
+ if (p->meta[j] < MIN_HASH1) continue;
void *k = key(p, j, spec);
void *v = val(p, j, spec);
u64 h = spec->hash(k, d->seed, spec);
@@ -182,31 +187,55 @@ scan(Dict *d, void **v, void *k, u64 h, int mode, Spec *spec) {
usize psize = psize_unsafe(spec);
u64 h0 = hash0(h, d->b);
- u64 h1 = hash1(h, d->b);
- u8 h2 = hash2(h, d->b);
+ u8 h1 = hash1(h, d->b);
+
+#if PAGE_LEN == 32
+ v32u8 vh1 = v32u8_fill(h1);
+ v32u8 vempty = v32u8_fill(EMPTY);
+#elif PAGE_LEN == 16
+ v16u8 vh1 = v16u8_fill(h1);
+ v16u8 vempty = v16u8_fill(EMPTY);
+#else
+ u64 vh1; memset(&vh1, h1, sizeof vh1);
+#endif
usize pn = h0;
Page *createp = 0;
- usize createj = 0; /* XXX: this does not need to be initialized, but GCC doesn't realize that */
+ usize createj;
+ bool createtomb;
for (usize i = 1;; i++) {
Page *p = (void *)((u8 *)d->pages + pn*psize);
- bool seen_empty = false;
- usize j = h1;
- do {
- if (p->h2[j] < MIN_HASH2) {
- seen_empty = p->h2[j] == EMPTY;
- if (!createp) {
- createp = p;
- createj = j;
- }
- } else if (p->h2[j] == h2 && spec->eq(key(p, j, spec), k, spec)) {
- /* TODO: try branch predict hint true on eq */
+ /* We must load the metadata vector as little endian, so that low bits
+ * in the vector represent earlier slots in the table and hence ctz
+ * gives us earlier slots first. In the SIMD cases, we know we're on
+ * x86, so a memcpy suffices. In the general fallback case, we use
+ * readl64, which does a byte swap on big endian machines. */
+#if PAGE_LEN == 32
+ v32u8 meta; memcpy(&meta, &p->meta, sizeof meta);
+ for (u32 m = v32u8_msb(v32u8_cmpeq(meta, vh1)); m != 0; m &= m - 1) {
+ int j = r_ctz32(m);
+#elif PAGE_LEN == 16
+ v16u8 meta; memcpy(&meta, &p->meta, sizeof meta);
+ for (u16 m = v16u8_msb(v16u8_cmpeq(meta, vh1)); m != 0; m &= m - 1) {
+ int j = r_ctz16(m);
+#else
+ u64 meta = r_readl64(&p->meta);
+ u64 z = vh1 ^ meta;
+ for (int j = r_r8search64(z); j < 8; z |= U64_C(1) << (8*j), j = r_r8search64(z)) {
+#endif
+ if likely (spec->eq(key(p, j, spec), k, spec)) {
if (mode == DELETE) {
- if (seen_empty || r_r8search64(r_readh64(p->h2)) < 8) {
- p->h2[j] = EMPTY;
+#if PAGE_LEN == 32
+ if (v32u8_msb(v32u8_cmpeq(meta, vempty)) != 0) {
+#elif PAGE_LEN == 16
+ if (v16u8_msb(v16u8_cmpeq(meta, vempty)) != 0) {
+#else
+ if (r_r8search64(meta) < 8) {
+#endif
+ p->meta[j] = EMPTY;
} else {
- p->h2[j] = TOMBSTONE;
+ p->meta[j] = TOMBSTONE;
d->ntombs += 1;
}
d->count -= 1;
@@ -214,18 +243,66 @@ scan(Dict *d, void **v, void *k, u64 h, int mode, Spec *spec) {
*v = val(p, j, spec);
return 1;
}
+ }
- j = (j + 1) & (PAGE_LEN - 1);
- } while (j != h1);
+ /* Search for an EMPTY slot. */
+#if PAGE_LEN == 32
+ u32 m0 = v32u8_msb(v32u8_cmpeq(meta, vempty));
+#elif PAGE_LEN == 16
+ u16 m0 = v16u8_msb(v16u8_cmpeq(meta, vempty));
+#else
+ int j0 = r_r8search64(meta);
+#endif
+
+ /* In mode CREATE, we need to look for an EMPTY or TOMBSTONE slot in
+ * case we don't find an existing slot with the given key. We
+ * prioritize filling TOMBSTONE slots to decrease the load factor. */
+ if (mode == CREATE && !createp) {
+#if PAGE_LEN == 32
+ v32u8 vtomb = v32u8_fill(TOMBSTONE);
+ u32 m1 = v32u8_msb(v32u8_cmpeq(meta, vtomb));
+ if (m1 != 0) {
+ createp = p; createj = r_ctz32(m1); createtomb = true;
+ } else if (m0 != 0) {
+ createp = p; createj = r_ctz32(m0); createtomb = false;
+ break;
+ }
+#elif PAGE_LEN == 16
+ v16u8 vtomb = v16u8_fill(TOMBSTONE);
+ u16 m1 = v16u8_msb(v16u8_cmpeq(meta, vtomb));
+ if (m1 != 0) {
+ createp = p; createj = r_ctz16(m1); createtomb = true;
+ } else if (m0 != 0) {
+ createp = p; createj = r_ctz16(m0); createtomb = false;
+ break;
+ }
+#else
+ u64 vtomb; memset(&vtomb, TOMBSTONE, sizeof vtomb);
+ int j1 = r_r8search64(meta ^ vtomb);
+ if (j1 != 8) {
+ createp = p; createj = j1; createtomb = true;
+ } else if (j0 != 8) {
+ createp = p; createj = j0; createtomb = false;
+ break;
+ }
+#endif
+ }
- if (seen_empty) break;
+ /* If the page contains an EMPTY slot, the key can't be any further. */
+#if PAGE_LEN > 8
+ if (m0 != 0) break;
+#else
+ if (j0 != 8) break;
+#endif
+ /* Quadratic probing between pages */
pn = (pn + i) & ((USIZE_C(1) << d->b) - 1u);
}
if (mode == CREATE) {
d->count += 1;
- createp->h2[createj] = h2;
+ if (createtomb) d->ntombs -= 1;
+ createp->meta[createj] = h1;
/* TODO: try returning a pointer to the key slot instead, so that the
* compiler can generate copy code when ksize is a constant */
memcpy(key(createp, createj, spec), k, spec->ksize);
@@ -283,7 +360,7 @@ r_dict_clear(Dict *d, Spec *spec) {
usize len = USIZE_C(1) << d->b;
for (usize i = 0; i < len; i++) {
Page *p = (void *)((u8 *)d->pages + i*psize);
- memset(p->h2, EMPTY, sizeof p->h2);
+ memset(p->meta, EMPTY, sizeof p->meta);
}
d->count = 0;