commit faa22537775db1efe4cd1a0321e5ef3b67f779e4
parent 855375dd9931deb337fcac9c8e60f10896fd2e15
Author: Robert Russell <robertrussell.72001@gmail.com>
Date: Sun, 28 May 2023 19:44:28 -0700
Add ctz functions
And explain the technique, which is very clever!
Diffstat:
| M | inc/bits.h | | | 94 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- |
1 file changed, 92 insertions(+), 2 deletions(-)
diff --git a/inc/bits.h b/inc/bits.h
@@ -16,7 +16,7 @@ static inline int r_popcnt64(u64 n) { return __builtin_popcountll(n); }
#else
-/* See Henry Warren, "Hacker's Delight", 2 ed., sec. 5.1. */
+/* See Warren, "Hacker's Delight", 2 ed., sec. 5.1. */
static inline int
r_popcnt8(u8 n) {
@@ -49,6 +49,96 @@ r_popcnt64(u64 n) {
#endif
+/* The ctz functions are implemented with the following theory: Let n be a
+ * b bit integer. We handle the case n == 0 separately, so assume n != 0. Then
+ * ctzb(n) is the index of least significant 1 bit in n. We can isolate this
+ * bit as n & -n. Thus, we have reduced computing ctzb to determining the index
+ * of a single bit. For this, we construct a minimal perfect hash function H;
+ * i.e., given a b bit integer with a single bit set, H outputs a integer in
+ * [0..b). We then apply a suitable permutation P to the output of H to get the
+ * bit index.
+ *
+ * The hash function H is constructed with the notion of de Bruijn cycles: Let
+ * A be an alphabet with k symbols and let l > 0. An (A,l)-de Bruijn cycle is a
+ * length k^l cylic string over A which has all length l strings over A as a
+ * substring. It is a fact that de Bruijn cycles exist for any A and l. For our
+ * application, we take A = {0,1} and l = lg(b). For example, here is de Bruijn
+ * cycle with b = 8 (hence l = 3):
+ *
+ * 0 0 0 1 1 1 0 1 = 0x1d
+ * ---------------
+ * 0 0 0
+ * 0 0 1
+ * 0 1 1
+ * 1 1 1
+ * 1 1 0
+ * 1 0 1
+ * 0 0 1 (wrap-around)
+ * 0 0 1 (wrap-around)
+ *
+ * Clearly, when such a de Bruijn cycle is rotated left by 0,1,...,b-1, all
+ * possible integers [0..b) appear on its top l bits. If the de Bruijn cycle
+ * has l 0's on the left (as in the above example), then the top l bits are the
+ * same whether we rotate left or shift left (because left shift brings in 0's
+ * on the right). (Of course, any de Bruijn cycle can be rotated until l 0's
+ * appear at the left.) But shifting left is equivalent to multiplying by an
+ * integer with exactly 1 bit set. It is now clear how to define H:
+ *
+ * H(n) = (c * n) >> (b - l)
+ *
+ * where c is any ({0,1},l)-de Bruijn cycle starting with l 0's. Once c is
+ * fixed, the desired permutation P is easily computed. */
+
+/* Somehow, the de Bruijn cycle method beats the code generated by
+ * __builtin_clz{,l,ll} (using the tzcnt instruction) unless an appropriate
+ * -march is specified, in which case GCC recognizes that the de Bruijn code
+ * implements ctz and it regresses to using the tzcnt instruction, thereby
+ * tying with the builtin. Ugh. */
+
+static inline int
+r_ctz8(u8 n) {
+ static u8 cycle = U8_C(0x1d);
+ static u8 perm[8] = {0, 1, 6, 2, 7, 5, 4, 3};
+ if (n == 0) return 8;
+ return perm[(u8)((n & -n) * cycle) >> (8 - 3)];
+}
+
+static inline int
+r_ctz16(u16 n) {
+ static u16 cycle = U16_C(0x0f65);
+ static u8 perm[16] = {
+ 0, 1, 11, 2, 14, 12, 8, 3, 15, 10, 13, 7, 9, 6, 5, 4,
+ };
+ if (n == 0) return 16;
+ return perm[(u16)((n & -n) * cycle) >> (16 - 4)];
+}
+
+static inline int
+r_ctz32(u32 n) {
+ /* cycle and perm are from Warren, "Hacker's Delight", 2 ed., p. 112. */
+ static u32 cycle = U32_C(0x04d7651f);
+ static u8 perm[32] = {
+ 0, 1, 2, 24, 3, 19, 6, 25, 22, 4, 20, 10, 16, 7, 12, 26,
+ 31, 23, 18, 5, 21, 9, 15, 11, 30, 17, 8, 14, 29, 13, 28, 27,
+ };
+ if (n == 0) return 32;
+ return perm[(u32)((n & -n) * cycle) >> (32 - 5)];
+}
+
+static inline int
+r_ctz64(u64 n) {
+ /* cycle and perm are from Knuth, TAOCP, vol. 4A, p. 142. */
+ static u64 cycle = U64_C(0x03f79d71b4ca8b09);
+ static u8 perm[64] = {
+ 0, 1, 56, 2, 57, 49, 28, 3, 61, 58, 42, 50, 38, 29, 17, 4,
+ 62, 47, 59, 36, 45, 43, 51, 22, 53, 39, 33, 30, 24, 18, 12, 5,
+ 63, 55, 48, 27, 60, 41, 37, 16, 46, 35, 44, 21, 52, 32, 23, 11,
+ 54, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19, 9, 13, 8, 7, 6,
+ };
+ if (n == 0) return 64;
+ return perm[(u64)((n & -n) * cycle) >> (64 - 6)];
+}
+
/* Perform a full-width multiply of x and y, storing the upper (resp., lower)
* 64 bits of the product in *h (resp., *l). */
static inline void
@@ -89,4 +179,4 @@ static inline u64 r_read64b(u8 *p) { return ((u64)r_read32b(p+0) << 32) | (u64)r
static inline u64 r_read64l(u8 *p) { return ((u64)r_read32l(p+4) << 32) | (u64)r_read32l(p+0); }
static inline u64 r_read64h(u8 *p) { return le64toh(r_read64l(p)); }
-// TODO: r_write{16,32,64}{b,l,h}
+/* TODO: r_write{16,32,64}{b,l,h} */