commit b0e120dd5dbafa7230ce7ca9c7b46c32f7ccbb47
parent 51071af719559df83b990c518e076a269dbae576
Author: Robert Russell <robertrussell.72001@gmail.com>
Date: Thu, 8 Jun 2023 19:11:38 -0700
Remove AVX support
It was slower than SSE! Also, supporting two SIMD APIs just makes
the code even more of an ifdef mess.
Diffstat:
2 files changed, 12 insertions(+), 37 deletions(-)
diff --git a/inc/dict.h b/inc/dict.h
@@ -30,7 +30,7 @@ struct r_dict {
usize count;
usize ntombs;
void *mem;
- u8 *data; /* Cache line-aligned pointer into mem */
+ u8 *data; /* Cache-aligned pointer into mem */
};
static inline bool r_dict_mem_eq(void *a, void *b, RDictSpec *spec) { return memcmp(a, b, spec->ksize) == 0; }
@@ -73,7 +73,6 @@ typedef struct D { \
.free = (RDictFreeFunc)(R_DICT_FREE), \
}
-/* TODO: function for assigning without allocating */
#define R_DICT_DECLARE(D, K, V, ...)\
static inline UNUSED usize R_DICT_METHOD(len,##__VA_ARGS__)(D *d) { return d->d.count; } \
static inline UNUSED void R_DICT_METHOD(set_default,##__VA_ARGS__)(D *d, V v) { d->default_val = v; } \
diff --git a/src/dict.c b/src/dict.c
@@ -140,10 +140,7 @@ scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *spec) {
usize h0 = h & meta_mask;
u8 h1 = hash1(h, d->b);
-#if PROBE_LEN == 32
- v32u8 vh1 = v32u8_fill(h1);
- v32u8 vempty = v32u8_fill(EMPTY);
-#elif PROBE_LEN == 16
+#ifdef USE_SIMD
v16u8 vh1 = v16u8_fill(h1);
v16u8 vempty = v16u8_fill(EMPTY);
#else
@@ -160,12 +157,12 @@ scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *spec) {
usize i = mvi * PROBE_LEN;
/* We must load the metadata vector as little endian, so that low bits
- * in the vector represent earlier slots in the table and hence ctz
- * gives us earlier slots first. In the SIMD cases, we know we're on
- * x86, so a memcpy suffices. In the general fallback case, we use
- * readl64, which does a byte swap on big endian machines.
+ * in the vector represent earlier slots in the table and hence
+ * ctz/r8search gives us earlier slots first. In the SIMD case, we
+ * know we're on x86, so a memcpy suffices. In the portable case, we
+ * use readl64, which does a byte swap on big endian machines.
*
- * In the SIMD cases, m is a bit array indicating where vmeta equals
+ * In the SIMD case, m is a bit array indicating where vmeta equals
* h1. Thus, ctz(m) is the least j such that meta[j] == h1 (provided
* m != 0).
*
@@ -173,11 +170,7 @@ scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *spec) {
* Thus, we can use r8search64 on z to find the indices j of h1 in
* vmeta. After each iteration, we mask an arbitrary bit into the
* jth byte so that the next iteration gets a different j. */
-#if PROBE_LEN == 32
- v32u8 vmeta; memcpy(&vmeta, meta+i, sizeof vmeta);
- for (u32 m = v32u8_msb(v32u8_cmpeq(vmeta, vh1)); m != 0; m &= m - 1) {
- int j = r_ctz32(m);
-#elif PROBE_LEN == 16
+#ifdef USE_SIMD
v16u8 vmeta; memcpy(&vmeta, meta+i, sizeof vmeta);
for (u16 m = v16u8_msb(v16u8_cmpeq(vmeta, vh1)); m != 0; m &= m - 1) {
int j = r_ctz16(m);
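
The v16u8_* helpers used above are not defined in this diff; on x86 they presumably wrap SSE2 intrinsics. A minimal standalone sketch of the idiom the comment describes (broadcast h1, byte-wise compare, movemask, then walk the set bits from low to high), with illustrative names and <emmintrin.h> intrinsics standing in for the project's wrappers:

#include <emmintrin.h>
#include <stdint.h>

static void
visit_h1_matches(const uint8_t meta[16], uint8_t h1)
{
    __m128i vmeta = _mm_loadu_si128((const __m128i *)meta);
    __m128i vh1 = _mm_set1_epi8((char)h1);
    /* Bytes where meta[j] == h1 become 0xFF; movemask keeps one bit per
     * byte, so bit j of m corresponds to slot j of this probe group. */
    uint16_t m = (uint16_t)_mm_movemask_epi8(_mm_cmpeq_epi8(vmeta, vh1));
    for (; m != 0; m &= m - 1) {    /* clear the lowest set bit each pass */
        int j = __builtin_ctz(m);   /* earliest candidate slot first */
        /* ...full key comparison for slot j would go here... */
        (void)j;
    }
}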
@@ -191,9 +184,7 @@ scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *spec) {
Page *p = pages + (match_idx - page_idx) * (spec->ksize + spec->vsize);
if likely (spec->eq(p + page_idx * spec->ksize, k, spec)) {
if (mode == DELETE) {
-#if PROBE_LEN == 32
- if (v32u8_msb(v32u8_cmpeq(vmeta, vempty)) != 0) {
-#elif PROBE_LEN == 16
+#ifdef USE_SIMD
if (v16u8_msb(v16u8_cmpeq(vmeta, vempty)) != 0) {
#else
if (r_r8search64(vmeta) != 8) {
@@ -211,9 +202,7 @@ scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *spec) {
}
/* Search for an EMPTY slot. */
-#if PROBE_LEN == 32
- u32 m0 = v32u8_msb(v32u8_cmpeq(vmeta, vempty));
-#elif PROBE_LEN == 16
+#ifdef USE_SIMD
u16 m0 = v16u8_msb(v16u8_cmpeq(vmeta, vempty));
#else
int j0 = r_r8search64(vmeta);
@@ -223,20 +212,7 @@ scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *spec) {
* case we don't find an existing slot with the given key. We
* prioritize filling TOMBSTONE slots to decrease the load factor. */
if (mode == CREATE && !create_slot_found) {
-#if PROBE_LEN == 32
- v32u8 vtomb = v32u8_fill(TOMBSTONE);
- u32 m1 = v32u8_msb(v32u8_cmpeq(vmeta, vtomb));
- if (m1 != 0) {
- create_slot_found = true;
- create_idx = i + r_ctz32(m1);
- create_replacing_tomb = true;
- } else if (m0 != 0) {
- create_slot_found = true;
- create_idx = i + r_ctz32(m0);
- create_replacing_tomb = false;
- break;
- }
-#elif PROBE_LEN == 16
+#ifdef USE_SIMD
v16u8 vtomb = v16u8_fill(TOMBSTONE);
u16 m1 = v16u8_msb(v16u8_cmpeq(vmeta, vtomb));
if (m1 != 0) {
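
The deleted AVX branch above spells out the full create-slot policy that the surviving SSE and portable branches share: reuse the first TOMBSTONE seen, but keep scanning in case the key exists later in the probe sequence; otherwise take the first EMPTY slot. A compact sketch of that policy with illustrative names (the actual code threads create_slot_found, create_idx and create_replacing_tomb through the outer probe loop):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct create_slot { bool found, replacing_tomb; size_t idx; };

/* Record where a new entry would go, given one probe group's match masks.
 * m_tomb and m_empty have one bit per slot (movemask-style, as above);
 * base is the table index of the group's first slot. */
static void
pick_create_slot(struct create_slot *cs, size_t base,
                 uint16_t m_tomb, uint16_t m_empty)
{
    if (cs->found)
        return;                 /* an earlier group already gave us a slot */
    if (m_tomb != 0) {          /* prefer refilling a tombstone */
        cs->idx = base + (size_t)__builtin_ctz(m_tomb);
        cs->replacing_tomb = true;
        cs->found = true;
    } else if (m_empty != 0) {  /* otherwise the first empty slot */
        cs->idx = base + (size_t)__builtin_ctz(m_empty);
        cs->replacing_tomb = false;
        cs->found = true;
    }
}

Independently of which slot is picked, the probe loop still stops at the first group that contains an EMPTY slot (final hunk below): an insert would have used that slot, so the key cannot occur any later.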
@@ -266,7 +242,7 @@ scan(RDict *d, void **v, void *k, u64 h, int mode, RDictSpec *spec) {
}
/* If the page contains an EMPTY slot, the key can't be any further. */
-#if PROBE_LEN > 8
+#ifdef USE_SIMD
if (m0 != 0) break;
#else
if (j0 != 8) break;
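
r_r8search64 and readl64 are not part of this diff. From the way they are used, readl64 presumably loads a group's eight metadata bytes as a little-endian u64, and r8search64 presumably returns the index of that word's lowest zero byte, or 8 if there is none: it is called directly on the word to look for EMPTY (which would make EMPTY the all-zero metadata byte), and on z, presumably the word XORed with a broadcast of h1, to look for h1. A sketch of a portable implementation under those assumptions, using the classic SWAR zero-byte test:

#include <stdint.h>

/* Index (0-7) of the lowest zero byte of a little-endian u64, or 8 if none.
 * Zero bytes set the 0x80 bit of the corresponding mask byte; a borrow chain
 * can only start at a zero byte, so the lowest set bit of the mask always
 * lands in the lowest zero byte. */
static int
r8search64_sketch(uint64_t v)
{
    uint64_t mask = (v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL;
    return mask ? __builtin_ctzll(mask) / 8 : 8;
}

/* Broadcasting h1 to every byte turns "find h1" into "find a zero byte". */
static int
find_h1_sketch(uint64_t meta_le, uint8_t h1)
{
    uint64_t z = meta_le ^ (0x0101010101010101ULL * h1);
    return r8search64_sketch(z);
}

As the comment in the scan loop says, masking an arbitrary bit into the byte just found (e.g. z |= 1ULL << (8 * j)) makes the next call report the next occurrence.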