rcx

library of miscellaneous bits of C code
git clone git://git.rr3.xyz/rcx

commit c7f3417b06d4b75ba61f7f1d8e37c7d519c405fa
parent 9f64d5767fb44e69e67de22d9e385016eddec2ea
Author: Robert Russell <robertrussell.72001@gmail.com>
Date:   Mon, 29 May 2023 12:06:06 -0700

More bit stuff

Diffstat:
M inc/bits.h | 171 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
M src/rand.c | 4 ++--
2 files changed, 155 insertions(+), 20 deletions(-)

diff --git a/inc/bits.h b/inc/bits.h
@@ -1,9 +1,12 @@
 #pragma once
-#include <endian.h>
+#include <string.h>
 #include "def.h"
+
+/* ----- Bit rotates ----- */
+
 static inline u8 r_rotl8 (u8 n, uint c) { return (n << (c& 7u)) | (n >> (-c& 7u)); }
 static inline u16 r_rotl16(u16 n, uint c) { return (n << (c&15u)) | (n >> (-c&15u)); }
 static inline u32 r_rotl32(u32 n, uint c) { return (n << (c&31u)) | (n >> (-c&31u)); }
@@ -14,9 +17,13 @@ static inline u16 r_rotr16(u16 n, uint c) { return (n >> (c&15u)) | (n << (-c&15
 static inline u32 r_rotr32(u32 n, uint c) { return (n >> (c&31u)) | (n << (-c&31u)); }
 static inline u64 r_rotr64(u64 n, uint c) { return (n >> (c&63u)) | (n << (-c&63u)); }
+
+/* ----- Population count ----- */
+
 /* The GCC builtin's do not use hardware popcnt unless an appropriate -march
  * is specified, in which case __POPCNT__ is defined. Otherwise, GCC emits
  * calls to libgcc which are slower than the below custom implementions. */
+
 #if defined(__GNUC__) && defined(__POPCNT__)
 static inline int r_popcnt8 (u8 n) { return __builtin_popcount(n); }
@@ -59,6 +66,9 @@ r_popcnt64(u64 n) {
 #endif
+
+/* ----- Count trailing zeros ----- */
+
 /* The ctz functions are implemented with the following theory: Let n be a
  * b bit integer. We handle the case n == 0 separately, so assume n != 0. Then
  * ctzb(n) is the index of least significant 1 bit in n. We can isolate this
@@ -70,10 +80,10 @@ r_popcnt64(u64 n) {
  *
  * The hash function H is constructed with the notion of de Bruijn cycles: Let
  * A be an alphabet with k symbols and let l > 0. An (A,l)-de Bruijn cycle is a
- * length k^l cylic string over A which has all length l strings over A as a
+ * length k^l cyclic string over A which has all length l strings over A as a
  * substring. It is a fact that de Bruijn cycles exist for any A and l. For our
- * application, we take A = {0,1} and l = lg(b). For example, here is de Bruijn
- * cycle with b = 8 (hence l = 3):
+ * application, we take A = {0,1} and l = lg(b). For example, here is a
+ * de Bruijn cycle with b = 8 (hence l = 3):
  *
  * 0 0 0 1 1 1 0 1 = 0x1d
  * ---------------
@@ -102,8 +112,8 @@ r_popcnt64(u64 n) {
 /* Somehow, the de Bruijn cycle method beats the code generated by
  * __builtin_clz{,l,ll} (using the tzcnt instruction) unless an appropriate
  * -march is specified, in which case GCC recognizes that the de Bruijn code
- * implements ctz and it regresses to using the tzcnt instruction, thereby
- * tying with the builtin. Ugh. */
+ * (at least in the 32-bit and 64-bit cases) implements ctz and it regresses
+ * to using the tzcnt instruction, thereby tying with the builtin. Ugh. */
 static inline int
 r_ctz8(u8 n) {
@@ -149,8 +159,30 @@ r_ctz64(u64 n) {
 	return perm[(u64)((n & -n) * cycle) >> (64 - 6)];
 }
-/* Perform a full-width multiply of x and y, storing the upper (resp., lower)
- * 64 bits of the product in *h (resp., *l). */
+
+/* ----- Full width multiply ----- */
+
+static inline void
+r_mul8(u8 *h, u8 *l, u8 x, u8 y) {
+	u16 xy = (u16)x * (u16)y;
+	*l = xy;
+	*h = xy >> 8;
+}
+
+static inline void
+r_mul16(u16 *h, u16 *l, u16 x, u16 y) {
+	u32 xy = (u32)x * (u32)y;
+	*l = xy;
+	*h = xy >> 16;
+}
+
+static inline void
+r_mul32(u32 *h, u32 *l, u32 x, u32 y) {
+	u64 xy = (u64)x * (u64)y;
+	*l = xy;
+	*h = xy >> 32;
+}
+
 static inline void
 r_mul64(u64 *h, u64 *l, u64 x, u64 y) {
 #ifdef R_HAVE_128
@@ -177,16 +209,119 @@ r_mul64(u64 *h, u64 *l, u64 x, u64 y) {
 #endif
 }
-static inline u16 r_read16b(u8 *p) { return ((u16)p[0] << 8) | ((u16)p[1] << 0); }
-static inline u16 r_read16l(u8 *p) { return ((u16)p[1] << 8) | ((u16)p[0] << 0); }
-static inline u16 r_read16h(u8 *p) { return le16toh(r_read16l(p)); }
-static inline u32 r_read32b(u8 *p) { return ((u32)r_read16b(p+0) << 16) | (u32)r_read16b(p+2); }
-static inline u32 r_read32l(u8 *p) { return ((u32)r_read16l(p+2) << 16) | (u32)r_read16l(p+0); }
-static inline u32 r_read32h(u8 *p) { return le32toh(r_read32l(p)); }
+/* ----- Byte swaps/reversals ----- */
-static inline u64 r_read64b(u8 *p) { return ((u64)r_read32b(p+0) << 32) | (u64)r_read32b(p+4); }
-static inline u64 r_read64l(u8 *p) { return ((u64)r_read32l(p+4) << 32) | (u64)r_read32l(p+0); }
-static inline u64 r_read64h(u8 *p) { return le64toh(r_read64l(p)); }
+#ifdef __GNUC__
-/* TODO: r_write{16,32,64}{b,l,h} */
+static inline u16 r_swap16(u16 n) { return __builtin_bswap16(n); }
+static inline u32 r_swap32(u32 n) { return __builtin_bswap32(n); }
+static inline u64 r_swap64(u64 n) { return __builtin_bswap64(n); }
+
+#else
+
+/* See Warren, "Hacker's Delight", 2 ed., sec. 7.1. */
+
+static inline u16
+r_swap16(u16 n) {
+	return (n << 8) | (n >> 8);
+}
+
+static inline u32
+r_swap32(u32 n) {
+	n = ((n & U32_C(0x00ff00ff)) << 8) | ((n >> 8) & U32_C(0x00ff00ff));
+	return (n << 16) | (n >> 16);
+}
+
+static inline u64
+r_swap64(u64 n) {
+	n = ((n & U64_C(0x00ff00ff00ff00ff)) << 8) | ((n >> 8) & U64_C(0x00ff00ff00ff00ff));
+	n = ((n & U64_C(0x0000ffff0000ffff)) << 16) | ((n >> 16) & U64_C(0x0000ffff0000ffff));
+	return (n << 32) | (n >> 32);
+}
+
+#endif
+
+
+/* ----- Endian conversions ----- */
+
+/* There is 2x redundancy here (e.g., ltoh = htol), but this allows code using
+ * these functions to make the intent clear. */
+
+#if defined(R_LITTLE_ENDIAN)
+static inline u16 r_ltoh16(u16 n) { return n; }
+static inline u32 r_ltoh32(u32 n) { return n; }
+static inline u64 r_ltoh64(u64 n) { return n; }
+static inline u16 r_btoh16(u16 n) { return r_swap16(n); }
+static inline u32 r_btoh32(u32 n) { return r_swap32(n); }
+static inline u64 r_btoh64(u64 n) { return r_swap64(n); }
+static inline u16 r_htol16(u16 n) { return n; }
+static inline u32 r_htol32(u32 n) { return n; }
+static inline u64 r_htol64(u64 n) { return n; }
+static inline u16 r_htob16(u16 n) { return r_swap16(n); }
+static inline u32 r_htob32(u32 n) { return r_swap32(n); }
+static inline u64 r_htob64(u64 n) { return r_swap64(n); }
+#elif defined(R_BIG_ENDIAN)
+static inline u16 r_btoh16(u16 n) { return n; }
+static inline u32 r_btoh32(u32 n) { return n; }
+static inline u64 r_btoh64(u64 n) { return n; }
+static inline u16 r_ltoh16(u16 n) { return r_swap16(n); }
+static inline u32 r_ltoh32(u32 n) { return r_swap32(n); }
+static inline u64 r_ltoh64(u64 n) { return r_swap64(n); }
+static inline u16 r_htob16(u16 n) { return n; }
+static inline u32 r_htob32(u32 n) { return n; }
+static inline u64 r_htob64(u64 n) { return n; }
+static inline u16 r_htol16(u16 n) { return r_swap16(n); }
+static inline u32 r_htol32(u32 n) { return r_swap32(n); }
+static inline u64 r_htol64(u64 n) { return r_swap64(n); }
+#endif
+
+
+/* ----- Binary decoding ----- */
+
+#if defined(R_LITTLE_ENDIAN)
+static inline u16 r_readl16(void *p) { u16 v; memcpy(&v, p, 2); return v; }
+static inline u32 r_readl32(void *p) { u32 v; memcpy(&v, p, 4); return v; }
+static inline u64 r_readl64(void *p) { u64 v; memcpy(&v, p, 8); return v; }
+static inline u16 r_readb16(void *p) { return r_swap16(r_readl16(p)); }
+static inline u32 r_readb32(void *p) { return r_swap32(r_readl32(p)); }
+static inline u64 r_readb64(void *p) { return r_swap64(r_readl64(p)); }
+static inline u16 r_readh16(void *p) { return r_readl16(p); }
+static inline u32 r_readh32(void *p) { return r_readl32(p); }
+static inline u64 r_readh64(void *p) { return r_readl64(p); }
+#elif defined(R_BIG_ENDIAN)
+static inline u16 r_readb16(void *p) { u16 v; memcpy(&v, p, 2); return v; }
+static inline u32 r_readb32(void *p) { u32 v; memcpy(&v, p, 4); return v; }
+static inline u64 r_readb64(void *p) { u64 v; memcpy(&v, p, 8); return v; }
+static inline u16 r_readl16(void *p) { return r_swap16(r_readb16(p)); }
+static inline u32 r_readl32(void *p) { return r_swap32(r_readb32(p)); }
+static inline u64 r_readl64(void *p) { return r_swap64(r_readb64(p)); }
+static inline u16 r_readh16(void *p) { return r_readb16(p); }
+static inline u32 r_readh32(void *p) { return r_readb32(p); }
+static inline u64 r_readh64(void *p) { return r_readb64(p); }
+#endif
+
+
+/* ----- Binary encoding ----- */
+
+#if defined(R_LITTLE_ENDIAN)
+static inline void r_writel16(void *p, u16 v) { memcpy(p, &v, 2); }
+static inline void r_writel32(void *p, u32 v) { memcpy(p, &v, 4); }
+static inline void r_writel64(void *p, u64 v) { memcpy(p, &v, 8); }
+static inline void r_writeb16(void *p, u16 v) { r_writel16(p, r_swap16(v)); }
+static inline void r_writeb32(void *p, u32 v) { r_writel32(p, r_swap32(v)); }
+static inline void r_writeb64(void *p, u64 v) { r_writel64(p, r_swap64(v)); }
+static inline void r_writeh16(void *p, u16 v) { r_writel16(p, v); }
+static inline void r_writeh32(void *p, u32 v) { r_writel32(p, v); }
+static inline void r_writeh64(void *p, u64 v) { r_writel64(p, v); }
+#elif defined(R_BIG_ENDIAN)
+static inline void r_writeb16(void *p, u16 v) { memcpy(p, &v, 2); }
+static inline void r_writeb32(void *p, u32 v) { memcpy(p, &v, 4); }
+static inline void r_writeb64(void *p, u64 v) { memcpy(p, &v, 8); }
+static inline void r_writel16(void *p, u16 v) { r_writeb16(p, r_swap16(v)); }
+static inline void r_writel32(void *p, u32 v) { r_writeb32(p, r_swap32(v)); }
+static inline void r_writel64(void *p, u64 v) { r_writeb64(p, r_swap64(v)); }
+static inline void r_writeh16(void *p, u16 v) { r_writeb16(p, v); }
+static inline void r_writeh32(void *p, u32 v) { r_writeb32(p, v); }
+static inline void r_writeh64(void *p, u64 v) { r_writeb64(p, v); }
+#endif
diff --git a/src/rand.c b/src/rand.c
@@ -8,8 +8,8 @@
 #include "unix.h"
 #define mix r_wymix_
-#define r4(p) ((u64)r_read32h(p))
-#define r8(p) ((u64)r_read64h(p))
+#define r4(p) ((u64)r_readh32(p))
+#define r8(p) ((u64)r_readh64(p))
 u64 r_seed = 0;
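
A rough usage sketch of the new helpers, not part of the commit itself. It
assumes the header is included as "bits.h", that def.h supplies the
u8/u32/u64 typedefs, and that R_LITTLE_ENDIAN or R_BIG_ENDIAN is defined so
the read/write functions are compiled in:

    #include <stdio.h>
    #include "bits.h"

    int main(void) {
    	u64 h, l;
    	u8 buf[4];

    	/* 2^32 * 2^32 = 2^64, so the product lands entirely in the high word. */
    	r_mul64(&h, &l, (u64)1 << 32, (u64)1 << 32);
    	printf("h=%llu l=%llu\n", (unsigned long long)h, (unsigned long long)l);

    	/* Round-trip a value through a big-endian byte buffer. */
    	r_writeb32(buf, 0x11223344u);
    	printf("%02x %02x %02x %02x -> %08x\n",
    	       buf[0], buf[1], buf[2], buf[3], (unsigned)r_readb32(buf));
    	return 0;
    }

Expected output on any host: "h=1 l=0" followed by "11 22 33 44 -> 11223344",
since the b variants always store and load most-significant byte first.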