commit 840e43fa9a1ba629c16d76b65d84b8ba9076620a
parent ae1cedbd986cc171d082577490ad38bf6fc04f11
Author: Robert Russell <robert@rr3.xyz>
Date: Sun, 12 Jan 2025 19:16:12 -0800
Change add/sub with carry to arbitrary ternary add/sub
See comment in source code.
This is not tested at all yet.
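Until real tests exist, a quick way to gain some confidence is to check the
64-bit paths against known double-carry/double-borrow cases. A minimal sketch
(assuming the header is reachable as "bits.h" and that it, or a header it pulls
in, provides the u64 typedef; the file name is made up):

    /* sanity.c -- throwaway check, not part of this commit */
    #include <assert.h>
    #include <stdio.h>
    #include "bits.h"

    int main(void) {
        u64 h, l;
        /* 3 * (2^64 - 1): both internal adds carry, so the high word is 2 */
        r_add64(&h, &l, ~(u64)0, ~(u64)0, ~(u64)0);
        assert(h == 2 && l == ~(u64)0 - 2);
        /* 0 - (2^64 - 1) - (2^64 - 1): double borrow, high word is 2^64 - 2 */
        r_sub64(&h, &l, 0, ~(u64)0, ~(u64)0);
        assert(h == ~(u64)1 && l == 2);
        puts("ok");
        return 0;
    }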
Diffstat:
 M inc/bits.h | 134 ++++++++++++++++++++++++++++++++++++++-----------------------------------------
1 file changed, 64 insertions(+), 70 deletions(-)
diff --git a/inc/bits.h b/inc/bits.h
@@ -205,92 +205,86 @@ r_rzb64(u64 n) {
}
-/* ----- Add/subtract with carry/borrow ----- */
-
-/* TODO: Use addc/subc intrinsics for non-64 bit adds/subs as well? */
-#if __GNUC__ >= 14
- #if __SIZEOF_INT__ == 8
- #define HAVE_ADDC_SUBC 1
- #define ADDC64 __builtin_addc
- #define SUBC64 __builtin_subc
- #elif __SIZEOF_LONG__ == 8
- #define HAVE_ADDC_SUBC 1
- #define ADDC64 __builtin_addcl
- #define SUBC64 __builtin_subcl
- #elif __SIZEOF_LONG_LONG__ == 8
- #define HAVE_ADDC_SUBC 1
- #define ADDC64 __builtin_addcll
- #define SUBC64 __builtin_subcll
- #endif
-#endif
+/* ----- Ternary add and subtract ----- */
+
+/* We implement ternary add/sub on arbitrary unsigned integers, rather than
+ * constraining the third argument to a 1-bit carry/borrow, because compilers
+ * (or at least GCC) suck at generating good code with carries/borrows, so we
+ * might as well take the extra flexibility. In fact, the GCC 14.2 code gen for
+ * these functions is slightly better than for the new __builtin_addc{,l,ll}
+ * intrinsics on my machine (according to a microbenchmark). */
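+/* For example (usage sketch only; the limbs a0/a1/b0/b1 are hypothetical): a
+ * two-limb add can feed the high word of the low-limb add straight back in as
+ * the third operand of the high-limb add, with no separate carry flag:
+ *
+ *   u64 h0, l0, h1, l1;
+ *   r_add64(&h0, &l0, a0, b0, 0);   // low limbs
+ *   r_add64(&h1, &l1, a1, b1, h0);  // high limbs plus carry word
+ */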
static inline void
-r_add8(u8 *co, u8 *z, u8 x, u8 y, u8 ci) {
- u16 s = (u16)x + (u16)y + (u16)ci;
- *z = s;
- *co = s >> 8;
+r_add8(u8 *h, u8 *l, u8 x, u8 y, u8 z) {
+ u16 hl = (u16)x + (u16)y + (u16)z;
+ *l = hl;
+ *h = hl >> 8;
}
static inline void
-r_add16(u16 *co, u16 *z, u16 x, u16 y, u16 ci) {
- u32 s = (u32)x + (u32)y + (u32)ci;
- *z = s;
- *co = s >> 16;
+r_add16(u16 *h, u16 *l, u16 x, u16 y, u16 z) {
+ u32 hl = (u32)x + (u32)y + (u32)z;
+ *l = hl;
+ *h = hl >> 16;
}
static inline void
-r_add32(u32 *co, u32 *z, u32 x, u32 y, u32 ci) {
- u64 s = (u64)x + (u64)y + (u64)ci;
- *z = s;
- *co = s >> 32;
+r_add32(u32 *h, u32 *l, u32 x, u32 y, u32 z) {
+ u64 hl = (u64)x + (u64)y + (u64)z;
+ *l = hl;
+ *h = hl >> 32;
}
static inline void
-r_add64(u64 *co, u64 *z, u64 x, u64 y, u64 ci) {
-#ifdef HAVE_ADDC_SUBC
- *z = ADDC64(x, y, ci, co);
+r_add64(u64 *h, u64 *l, u64 x, u64 y, u64 z) {
+#ifdef R_HAVE_128
+ u128 hl = (u128)x + (u128)y + (u128)z;
+ *l = hl;
+ *h = hl >> 64;
#else
u64 s = x + y;
- u64 co0 = s < x;
- u64 t = s + ci;
- u64 co1 = t < s;
- *z = t;
- *co = co0 | co1;
+ bool c0 = s < x;
+ u64 t = s + z;
+ bool c1 = t < s;
+ *l = t;
+ *h = c0 + c1;
#endif
}
static inline void
-r_sub8(u8 *bo, u8 *z, u8 x, u8 y, u8 bi) {
- u16 d = (u16)x - (u16)y - (u16)bi;
- *z = d;
- *bo = -(d >> 8);
+r_sub8(u8 *h, u8 *l, u8 x, u8 y, u8 z) {
+ u16 hl = (u16)x - (u16)y - (u16)z;
+ *l = hl;
+ *h = hl >> 8;
}
static inline void
-r_sub16(u16 *bo, u16 *z, u16 x, u16 y, u16 bi) {
- u32 d = (u32)x + (u32)y + (u32)bi;
- *z = d;
- *bo = -(d >> 16);
+r_sub16(u16 *h, u16 *l, u16 x, u16 y, u16 z) {
+ u32 hl = (u32)x - (u32)y - (u32)z;
+ *l = hl;
+ *h = hl >> 16;
}
static inline void
-r_sub32(u32 *bo, u32 *z, u32 x, u32 y, u32 bi) {
- u64 d = (u64)x + (u64)y + (u64)bi;
- *z = d;
- *bo = -(d >> 32);
+r_sub32(u32 *h, u32 *l, u32 x, u32 y, u32 z) {
+ u64 hl = (u64)x - (u64)y - (u64)z;
+ *l = hl;
+ *h = hl >> 32;
}
static inline void
-r_sub64(u64 *bo, u64 *z, u64 x, u64 y, u64 bi) {
-#ifdef HAVE_ADDC_SUBC
- *z = SUBC64(x, y, bi, bo);
+r_sub64(u64 *h, u64 *l, u64 x, u64 y, u64 z) {
+#ifdef R_HAVE_128
+ u128 hl = (u128)x - (u128)y - (u128)z;
+ *l = hl;
+ *h = hl >> 64;
#else
u64 s = x - y;
- u64 bo0 = s > x;
- u64 t = s - bi;
- u64 bo1 = t > s;
- *z = t;
- *bo = bo0 | bo1;
+ bool c0 = s > x;
+ u64 t = s - z;
+ bool c1 = t > s;
+ *l = t;
+ *h = -c0 - c1;
#endif
}
@@ -303,31 +297,31 @@ r_sub64(u64 *bo, u64 *z, u64 x, u64 y, u64 bi) {
static inline void
r_mul8(u8 *h, u8 *l, u8 x, u8 y) {
- u16 xy = (u16)x * (u16)y;
- *l = xy;
- *h = xy >> 8;
+ u16 hl = (u16)x * (u16)y;
+ *l = hl;
+ *h = hl >> 8;
}
static inline void
r_mul16(u16 *h, u16 *l, u16 x, u16 y) {
- u32 xy = (u32)x * (u32)y;
- *l = xy;
- *h = xy >> 16;
+ u32 hl = (u32)x * (u32)y;
+ *l = hl;
+ *h = hl >> 16;
}
static inline void
r_mul32(u32 *h, u32 *l, u32 x, u32 y) {
- u64 xy = (u64)x * (u64)y;
- *l = xy;
- *h = xy >> 32;
+ u64 hl = (u64)x * (u64)y;
+ *l = hl;
+ *h = hl >> 32;
}
static inline void
r_mul64(u64 *h, u64 *l, u64 x, u64 y) {
#ifdef R_HAVE_128
- u128 xy = (u128)x * (u128)y;
- *l = xy;
- *h = xy >> 64;
+ u128 hl = (u128)x * (u128)y;
+ *l = hl;
+ *h = hl >> 64;
#else
const u64 m = (U64_C(1)<<32) - 1;