rcx

library of miscellaneous bits of C code
git clone git://git.rr3.xyz/rcx
Log | Files | Refs | README | LICENSE

commit faa22537775db1efe4cd1a0321e5ef3b67f779e4
parent 855375dd9931deb337fcac9c8e60f10896fd2e15
Author: Robert Russell <robertrussell.72001@gmail.com>
Date:   Sun, 28 May 2023 19:44:28 -0700

Add ctz functions

And explain the technique, which is very clever!

Diffstat:
Minc/bits.h | 94+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 92 insertions(+), 2 deletions(-)

diff --git a/inc/bits.h b/inc/bits.h @@ -16,7 +16,7 @@ static inline int r_popcnt64(u64 n) { return __builtin_popcountll(n); } #else -/* See Henry Warren, "Hacker's Delight", 2 ed., sec. 5.1. */ +/* See Warren, "Hacker's Delight", 2 ed., sec. 5.1. */ static inline int r_popcnt8(u8 n) { @@ -49,6 +49,96 @@ r_popcnt64(u64 n) { #endif +/* The ctz functions are implemented with the following theory: Let n be a + * b bit integer. We handle the case n == 0 separately, so assume n != 0. Then + * ctzb(n) is the index of least significant 1 bit in n. We can isolate this + * bit as n & -n. Thus, we have reduced computing ctzb to determining the index + * of a single bit. For this, we construct a minimal perfect hash function H; + * i.e., given a b bit integer with a single bit set, H outputs a integer in + * [0..b). We then apply a suitable permutation P to the output of H to get the + * bit index. + * + * The hash function H is constructed with the notion of de Bruijn cycles: Let + * A be an alphabet with k symbols and let l > 0. An (A,l)-de Bruijn cycle is a + * length k^l cylic string over A which has all length l strings over A as a + * substring. It is a fact that de Bruijn cycles exist for any A and l. For our + * application, we take A = {0,1} and l = lg(b). For example, here is de Bruijn + * cycle with b = 8 (hence l = 3): + * + * 0 0 0 1 1 1 0 1 = 0x1d + * --------------- + * 0 0 0 + * 0 0 1 + * 0 1 1 + * 1 1 1 + * 1 1 0 + * 1 0 1 + * 0 0 1 (wrap-around) + * 0 0 1 (wrap-around) + * + * Clearly, when such a de Bruijn cycle is rotated left by 0,1,...,b-1, all + * possible integers [0..b) appear on its top l bits. If the de Bruijn cycle + * has l 0's on the left (as in the above example), then the top l bits are the + * same whether we rotate left or shift left (because left shift brings in 0's + * on the right). (Of course, any de Bruijn cycle can be rotated until l 0's + * appear at the left.) But shifting left is equivalent to multiplying by an + * integer with exactly 1 bit set. It is now clear how to define H: + * + * H(n) = (c * n) >> (b - l) + * + * where c is any ({0,1},l)-de Bruijn cycle starting with l 0's. Once c is + * fixed, the desired permutation P is easily computed. */ + +/* Somehow, the de Bruijn cycle method beats the code generated by + * __builtin_clz{,l,ll} (using the tzcnt instruction) unless an appropriate + * -march is specified, in which case GCC recognizes that the de Bruijn code + * implements ctz and it regresses to using the tzcnt instruction, thereby + * tying with the builtin. Ugh. */ + +static inline int +r_ctz8(u8 n) { + static u8 cycle = U8_C(0x1d); + static u8 perm[8] = {0, 1, 6, 2, 7, 5, 4, 3}; + if (n == 0) return 8; + return perm[(u8)((n & -n) * cycle) >> (8 - 3)]; +} + +static inline int +r_ctz16(u16 n) { + static u16 cycle = U16_C(0x0f65); + static u8 perm[16] = { + 0, 1, 11, 2, 14, 12, 8, 3, 15, 10, 13, 7, 9, 6, 5, 4, + }; + if (n == 0) return 16; + return perm[(u16)((n & -n) * cycle) >> (16 - 4)]; +} + +static inline int +r_ctz32(u32 n) { + /* cycle and perm are from Warren, "Hacker's Delight", 2 ed., p. 112. */ + static u32 cycle = U32_C(0x04d7651f); + static u8 perm[32] = { + 0, 1, 2, 24, 3, 19, 6, 25, 22, 4, 20, 10, 16, 7, 12, 26, + 31, 23, 18, 5, 21, 9, 15, 11, 30, 17, 8, 14, 29, 13, 28, 27, + }; + if (n == 0) return 32; + return perm[(u32)((n & -n) * cycle) >> (32 - 5)]; +} + +static inline int +r_ctz64(u64 n) { + /* cycle and perm are from Knuth, TAOCP, vol. 4A, p. 142. */ + static u64 cycle = U64_C(0x03f79d71b4ca8b09); + static u8 perm[64] = { + 0, 1, 56, 2, 57, 49, 28, 3, 61, 58, 42, 50, 38, 29, 17, 4, + 62, 47, 59, 36, 45, 43, 51, 22, 53, 39, 33, 30, 24, 18, 12, 5, + 63, 55, 48, 27, 60, 41, 37, 16, 46, 35, 44, 21, 52, 32, 23, 11, + 54, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19, 9, 13, 8, 7, 6, + }; + if (n == 0) return 64; + return perm[(u64)((n & -n) * cycle) >> (64 - 6)]; +} + /* Perform a full-width multiply of x and y, storing the upper (resp., lower) * 64 bits of the product in *h (resp., *l). */ static inline void @@ -89,4 +179,4 @@ static inline u64 r_read64b(u8 *p) { return ((u64)r_read32b(p+0) << 32) | (u64)r static inline u64 r_read64l(u8 *p) { return ((u64)r_read32l(p+4) << 32) | (u64)r_read32l(p+0); } static inline u64 r_read64h(u8 *p) { return le64toh(r_read64l(p)); } -// TODO: r_write{16,32,64}{b,l,h} +/* TODO: r_write{16,32,64}{b,l,h} */