commit c7f3417b06d4b75ba61f7f1d8e37c7d519c405fa
parent 9f64d5767fb44e69e67de22d9e385016eddec2ea
Author: Robert Russell <robertrussell.72001@gmail.com>
Date: Mon, 29 May 2023 12:06:06 -0700
More bit stuff
Diffstat:
| M | inc/bits.h | | | 171 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------- |
| M | src/rand.c | | | 4 | ++-- |
2 files changed, 155 insertions(+), 20 deletions(-)
diff --git a/inc/bits.h b/inc/bits.h
@@ -1,9 +1,12 @@
#pragma once
-#include <endian.h>
+#include <string.h>
#include "def.h"
+
+/* ----- Bit rotates ----- */
+
static inline u8 r_rotl8 (u8 n, uint c) { return (n << (c& 7u)) | (n >> (-c& 7u)); }
static inline u16 r_rotl16(u16 n, uint c) { return (n << (c&15u)) | (n >> (-c&15u)); }
static inline u32 r_rotl32(u32 n, uint c) { return (n << (c&31u)) | (n >> (-c&31u)); }
@@ -14,9 +17,13 @@ static inline u16 r_rotr16(u16 n, uint c) { return (n >> (c&15u)) | (n << (-c&15
static inline u32 r_rotr32(u32 n, uint c) { return (n >> (c&31u)) | (n << (-c&31u)); }
static inline u64 r_rotr64(u64 n, uint c) { return (n >> (c&63u)) | (n << (-c&63u)); }
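/* Illustrative (not part of the patch): masking the count keeps both shifts
 * in range, so c == 0 and c >= the width are well defined, e.g.
 *
 *   r_rotl32(0x80000001u, 1) == 0x00000003u
 *   r_rotl32(n, 0) == n and r_rotl32(n, 32) == n (the count is taken mod 32)
 */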
+
+/* ----- Population count ----- */
+
/* The GCC builtins do not use hardware popcnt unless an appropriate -march
 * is specified, in which case __POPCNT__ is defined. Otherwise, GCC emits
 * calls to libgcc which are slower than the custom implementations below. */
+
#if defined(__GNUC__) && defined(__POPCNT__)
static inline int r_popcnt8 (u8 n) { return __builtin_popcount(n); }
@@ -59,6 +66,9 @@ r_popcnt64(u64 n) {
#endif
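/* The custom implementations referred to above are elided from this hunk.
 * For reference, a typical SWAR popcount takes this shape (an illustrative
 * sketch with a hypothetical name, not the file's actual code): */

static inline int
swar_popcnt32(u32 n) {
	n = n - ((n >> 1) & U32_C(0x55555555));                       /* 2-bit counts */
	n = (n & U32_C(0x33333333)) + ((n >> 2) & U32_C(0x33333333)); /* 4-bit counts */
	n = (n + (n >> 4)) & U32_C(0x0f0f0f0f);                       /* 8-bit counts */
	return (int)((n * U32_C(0x01010101)) >> 24);                  /* sum the bytes */
}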
+
+/* ----- Count trailing zeros ----- */
+
/* The ctz functions are implemented with the following theory: Let n be a
 * b-bit integer. We handle the case n == 0 separately, so assume n != 0. Then
 * ctzb(n) is the index of the least significant 1 bit in n. We can isolate this
@@ -70,10 +80,10 @@ r_popcnt64(u64 n) {
*
* The hash function H is constructed with the notion of de Bruijn cycles: Let
* A be an alphabet with k symbols and let l > 0. An (A,l)-de Bruijn cycle is a
- * length k^l cylic string over A which has all length l strings over A as a
+ * length k^l cyclic string over A which has all length l strings over A as a
* substring. It is a fact that de Bruijn cycles exist for any A and l. For our
- * application, we take A = {0,1} and l = lg(b). For example, here is de Bruijn
- * cycle with b = 8 (hence l = 3):
+ * application, we take A = {0,1} and l = lg(b). For example, here is a
+ * de Bruijn cycle with b = 8 (hence l = 3):
*
* 0 0 0 1 1 1 0 1 = 0x1d
* ---------------
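/* The full walkthrough of this cycle's length-3 windows is elided from this
 * hunk. As a worked illustration of the resulting code for b = 8 (hypothetical
 * helper name, table derived from the 0x1d cycle above; the file's own r_ctz8
 * follows the same pattern):
 *
 *   static inline int
 *   ctz8_sketch(u8 n) {
 *       static const u8 perm[8] = { 0, 1, 6, 2, 7, 5, 4, 3 };
 *       if (n == 0)
 *           return 8;
 *       // n & -n isolates the lowest set bit, i.e. 2^ctz8(n); multiplying by
 *       // the cycle shifts it so the top lg(8) = 3 bits identify its index.
 *       return perm[(u8)((n & -n) * 0x1d) >> (8 - 3)];
 *   }
 */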
@@ -102,8 +112,8 @@ r_popcnt64(u64 n) {
/* Somehow, the de Bruijn cycle method beats the code generated by
 * __builtin_ctz{,l,ll} (using the tzcnt instruction) unless an appropriate
* -march is specified, in which case GCC recognizes that the de Bruijn code
- * implements ctz and it regresses to using the tzcnt instruction, thereby
- * tying with the builtin. Ugh. */
+ * (at least in the 32-bit and 64-bit cases) implements ctz and it regresses
+ * to using the tzcnt instruction, thereby tying with the builtin. Ugh. */
static inline int
r_ctz8(u8 n) {
@@ -149,8 +159,30 @@ r_ctz64(u64 n) {
return perm[(u64)((n & -n) * cycle) >> (64 - 6)];
}
-/* Perform a full-width multiply of x and y, storing the upper (resp., lower)
- * 64 bits of the product in *h (resp., *l). */
+
+/* ----- Full width multiply ----- */
+
+static inline void
+r_mul8(u8 *h, u8 *l, u8 x, u8 y) {
+ u16 xy = (u16)x * (u16)y;
+ *l = xy;
+ *h = xy >> 8;
+}
+
+static inline void
+r_mul16(u16 *h, u16 *l, u16 x, u16 y) {
+ u32 xy = (u32)x * (u32)y;
+ *l = xy;
+ *h = xy >> 16;
+}
+
+static inline void
+r_mul32(u32 *h, u32 *l, u32 x, u32 y) {
+ u64 xy = (u64)x * (u64)y;
+ *l = xy;
+ *h = xy >> 32;
+}
+
static inline void
r_mul64(u64 *h, u64 *l, u64 x, u64 y) {
#ifdef R_HAVE_128
@@ -177,16 +209,119 @@ r_mul64(u64 *h, u64 *l, u64 x, u64 y) {
#endif
}
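/* Example (illustrative): the pair (*h, *l) is the exact double-width
 * product, e.g. 200 * 100 = 20000 = 0x4e20, so after
 *
 *   u8 h, l;
 *   r_mul8(&h, &l, 200, 100);
 *
 * h == 0x4e and l == 0x20. r_mul64 has the same contract; lacking a 128-bit
 * type it takes the non-R_HAVE_128 branch elided from this hunk. */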
-static inline u16 r_read16b(u8 *p) { return ((u16)p[0] << 8) | ((u16)p[1] << 0); }
-static inline u16 r_read16l(u8 *p) { return ((u16)p[1] << 8) | ((u16)p[0] << 0); }
-static inline u16 r_read16h(u8 *p) { return le16toh(r_read16l(p)); }
-static inline u32 r_read32b(u8 *p) { return ((u32)r_read16b(p+0) << 16) | (u32)r_read16b(p+2); }
-static inline u32 r_read32l(u8 *p) { return ((u32)r_read16l(p+2) << 16) | (u32)r_read16l(p+0); }
-static inline u32 r_read32h(u8 *p) { return le32toh(r_read32l(p)); }
+/* ----- Byte swaps/reversals ----- */
-static inline u64 r_read64b(u8 *p) { return ((u64)r_read32b(p+0) << 32) | (u64)r_read32b(p+4); }
-static inline u64 r_read64l(u8 *p) { return ((u64)r_read32l(p+4) << 32) | (u64)r_read32l(p+0); }
-static inline u64 r_read64h(u8 *p) { return le64toh(r_read64l(p)); }
+#ifdef __GNUC__
-/* TODO: r_write{16,32,64}{b,l,h} */
+static inline u16 r_swap16(u16 n) { return __builtin_bswap16(n); }
+static inline u32 r_swap32(u32 n) { return __builtin_bswap32(n); }
+static inline u64 r_swap64(u64 n) { return __builtin_bswap64(n); }
+
+#else
+
+/* See Warren, "Hacker's Delight", 2nd ed., sec. 7.1. */
+
+static inline u16
+r_swap16(u16 n) {
+ return (n << 8) | (n >> 8);
+}
+
+static inline u32
+r_swap32(u32 n) {
+ n = ((n & U32_C(0x00ff00ff)) << 8) | ((n >> 8) & U32_C(0x00ff00ff));
+ return (n << 16) | (n >> 16);
+}
+
+static inline u64
+r_swap64(u64 n) {
+ n = ((n & U64_C(0x00ff00ff00ff00ff)) << 8) | ((n >> 8) & U64_C(0x00ff00ff00ff00ff));
+ n = ((n & U64_C(0x0000ffff0000ffff)) << 16) | ((n >> 16) & U64_C(0x0000ffff0000ffff));
+ return (n << 32) | (n >> 32);
+}
+
+#endif
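/* E.g., r_swap32(U32_C(0x11223344)) == U32_C(0x44332211): in the fallback,
 * the first line swaps the bytes within each 16-bit half and the final
 * rotate swaps the halves. */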
+
+
+/* ----- Endian conversions ----- */
+
+/* There is 2x redundancy here (e.g., ltoh = htol), but this allows code using
+ * these functions to make the intent clear. */
+
+#if defined(R_LITTLE_ENDIAN)
+static inline u16 r_ltoh16(u16 n) { return n; }
+static inline u32 r_ltoh32(u32 n) { return n; }
+static inline u64 r_ltoh64(u64 n) { return n; }
+static inline u16 r_btoh16(u16 n) { return r_swap16(n); }
+static inline u32 r_btoh32(u32 n) { return r_swap32(n); }
+static inline u64 r_btoh64(u64 n) { return r_swap64(n); }
+static inline u16 r_htol16(u16 n) { return n; }
+static inline u32 r_htol32(u32 n) { return n; }
+static inline u64 r_htol64(u64 n) { return n; }
+static inline u16 r_htob16(u16 n) { return r_swap16(n); }
+static inline u32 r_htob32(u32 n) { return r_swap32(n); }
+static inline u64 r_htob64(u64 n) { return r_swap64(n); }
+#elif defined(R_BIG_ENDIAN)
+static inline u16 r_btoh16(u16 n) { return n; }
+static inline u32 r_btoh32(u32 n) { return n; }
+static inline u64 r_btoh64(u64 n) { return n; }
+static inline u16 r_ltoh16(u16 n) { return r_swap16(n); }
+static inline u32 r_ltoh32(u32 n) { return r_swap32(n); }
+static inline u64 r_ltoh64(u64 n) { return r_swap64(n); }
+static inline u16 r_htob16(u16 n) { return n; }
+static inline u32 r_htob32(u32 n) { return n; }
+static inline u64 r_htob64(u64 n) { return n; }
+static inline u16 r_htol16(u16 n) { return r_swap16(n); }
+static inline u32 r_htol32(u32 n) { return r_swap32(n); }
+static inline u64 r_htol64(u64 n) { return r_swap64(n); }
+#endif
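/* E.g., on a little-endian host r_htob32(U32_C(0x00000100)) ==
 * U32_C(0x00010000), i.e. the value with its bytes in big-endian (network)
 * order. */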
+
+
+/* ----- Binary decoding ----- */
+
+#if defined(R_LITTLE_ENDIAN)
+static inline u16 r_readl16(void *p) { u16 v; memcpy(&v, p, 2); return v; }
+static inline u32 r_readl32(void *p) { u32 v; memcpy(&v, p, 4); return v; }
+static inline u64 r_readl64(void *p) { u64 v; memcpy(&v, p, 8); return v; }
+static inline u16 r_readb16(void *p) { return r_swap16(r_readl16(p)); }
+static inline u32 r_readb32(void *p) { return r_swap32(r_readl32(p)); }
+static inline u64 r_readb64(void *p) { return r_swap64(r_readl64(p)); }
+static inline u16 r_readh16(void *p) { return r_readl16(p); }
+static inline u32 r_readh32(void *p) { return r_readl32(p); }
+static inline u64 r_readh64(void *p) { return r_readl64(p); }
+#elif defined(R_BIG_ENDIAN)
+static inline u16 r_readb16(void *p) { u16 v; memcpy(&v, p, 2); return v; }
+static inline u32 r_readb32(void *p) { u32 v; memcpy(&v, p, 4); return v; }
+static inline u64 r_readb64(void *p) { u64 v; memcpy(&v, p, 8); return v; }
+static inline u16 r_readl16(void *p) { return r_swap16(r_readb16(p)); }
+static inline u32 r_readl32(void *p) { return r_swap32(r_readb32(p)); }
+static inline u64 r_readl64(void *p) { return r_swap64(r_readb64(p)); }
+static inline u16 r_readh16(void *p) { return r_readb16(p); }
+static inline u32 r_readh32(void *p) { return r_readb32(p); }
+static inline u64 r_readh64(void *p) { return r_readb64(p); }
+#endif
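/* The memcpy reads avoid the alignment and strict-aliasing pitfalls of
 * pointer casts and typically compile to a single load. Example
 * (illustrative buffer): the b variants parse big-endian fields on any host:
 *
 *   u8 buf[4] = { 0x00, 0x00, 0x01, 0x00 };
 *   u32 len = r_readb32(buf);   // 256
 */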
+
+
+/* ----- Binary encoding ----- */
+
+#if defined(R_LITTLE_ENDIAN)
+static inline void r_writel16(void *p, u16 v) { memcpy(p, &v, 2); }
+static inline void r_writel32(void *p, u32 v) { memcpy(p, &v, 4); }
+static inline void r_writel64(void *p, u64 v) { memcpy(p, &v, 8); }
+static inline void r_writeb16(void *p, u16 v) { r_writel16(p, r_swap16(v)); }
+static inline void r_writeb32(void *p, u32 v) { r_writel32(p, r_swap32(v)); }
+static inline void r_writeb64(void *p, u64 v) { r_writel64(p, r_swap64(v)); }
+static inline void r_writeh16(void *p, u16 v) { r_writel16(p, v); }
+static inline void r_writeh32(void *p, u32 v) { r_writel32(p, v); }
+static inline void r_writeh64(void *p, u64 v) { r_writel64(p, v); }
+#elif defined(R_BIG_ENDIAN)
+static inline void r_writeb16(void *p, u16 v) { memcpy(p, &v, 2); }
+static inline void r_writeb32(void *p, u32 v) { memcpy(p, &v, 4); }
+static inline void r_writeb64(void *p, u64 v) { memcpy(p, &v, 8); }
+static inline void r_writel16(void *p, u16 v) { r_writeb16(p, r_swap16(v)); }
+static inline void r_writel32(void *p, u32 v) { r_writeb32(p, r_swap32(v)); }
+static inline void r_writel64(void *p, u64 v) { r_writeb64(p, r_swap64(v)); }
+static inline void r_writeh16(void *p, u16 v) { r_writeb16(p, v); }
+static inline void r_writeh32(void *p, u32 v) { r_writeb32(p, v); }
+static inline void r_writeh64(void *p, u64 v) { r_writeb64(p, v); }
+#endif
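/* Example (illustrative): a write and a read of the same width and order
 * round-trip, e.g.
 *
 *   u8 buf[8];
 *   r_writeb64(buf, U64_C(0x0102030405060708));
 *   // buf == { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08 }
 *   // r_readb64(buf) == U64_C(0x0102030405060708)
 */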
diff --git a/src/rand.c b/src/rand.c
@@ -8,8 +8,8 @@
#include "unix.h"
#define mix r_wymix_
-#define r4(p) ((u64)r_read32h(p))
-#define r8(p) ((u64)r_read64h(p))
+#define r4(p) ((u64)r_readh32(p))
+#define r8(p) ((u64)r_readh64(p))
u64 r_seed = 0;