commit 840e43fa9a1ba629c16d76b65d84b8ba9076620a
parent ae1cedbd986cc171d082577490ad38bf6fc04f11
Author: Robert Russell <robert@rr3.xyz>
Date: Sun, 12 Jan 2025 19:16:12 -0800
Change add/sub with carry to arbitrary ternary add/sub
See comment in source code.
This is not tested at all yet.
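Until real tests exist, a quick way to gain some confidence is to check the
64-bit paths against known double-carry/double-borrow cases. A minimal sketch
(assuming the header is reachable as "bits.h" and that it, or a header it pulls
in, provides the u64 typedef; the file name is made up):

    /* sanity.c -- throwaway check, not part of this commit */
    #include <assert.h>
    #include <stdio.h>
    #include "bits.h"

    int main(void) {
        u64 h, l;
        /* 3 * (2^64 - 1): both internal adds carry, so the high word is 2 */
        r_add64(&h, &l, ~(u64)0, ~(u64)0, ~(u64)0);
        assert(h == 2 && l == ~(u64)0 - 2);
        /* 0 - (2^64 - 1) - (2^64 - 1): double borrow, high word is 2^64 - 2 */
        r_sub64(&h, &l, 0, ~(u64)0, ~(u64)0);
        assert(h == ~(u64)1 && l == 2);
        puts("ok");
        return 0;
    }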
Diffstat:
 M inc/bits.h | 134 ++++++++++++++++++++++++++++++++++++++-----------------------------------------
1 file changed, 64 insertions(+), 70 deletions(-)
diff --git a/inc/bits.h b/inc/bits.h
@@ -205,92 +205,86 @@ r_rzb64(u64 n) {
}
-/* ----- Add/subtract with carry/borrow ----- */
-
-/* TODO: Use addc/subc intrinsics for non-64 bit adds/subs as well? */
-#if __GNUC__ >= 14
- #if __SIZEOF_INT__ == 8
- #define HAVE_ADDC_SUBC 1
- #define ADDC64 __builtin_addc
- #define SUBC64 __builtin_subc
- #elif __SIZEOF_LONG__ == 8
- #define HAVE_ADDC_SUBC 1
- #define ADDC64 __builtin_addcl
- #define SUBC64 __builtin_subcl
- #elif __SIZEOF_LONG_LONG__ == 8
- #define HAVE_ADDC_SUBC 1
- #define ADDC64 __builtin_addcll
- #define SUBC64 __builtin_subcll
- #endif
-#endif
+/* ----- Ternary add and subtract ----- */
+
+/* We implement ternary add/sub on arbitrary unsigned integers, rather than
+ * constraining the third argument to a 1-bit carry/borrow, because compilers
+ * (or at least GCC) suck at generating good code with carries/borrows, so we
+ * might as well take the extra flexibility. In fact, the GCC 14.2 code gen for
+ * these functions is slightly better than for the new __builtin_addc{,l,ll}
+ * intrinsics on my machine (according to a microbenchmark). */
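+/* For example (usage sketch only; the limbs a0/a1/b0/b1 are hypothetical): a
+ * two-limb add can feed the high word of the low-limb add straight back in as
+ * the third operand of the high-limb add, with no separate carry flag:
+ *
+ *   u64 h0, l0, h1, l1;
+ *   r_add64(&h0, &l0, a0, b0, 0);   // low limbs
+ *   r_add64(&h1, &l1, a1, b1, h0);  // high limbs plus carry word
+ */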
static inline void
-r_add8(u8 *co, u8 *z, u8 x, u8 y, u8 ci) {
- u16 s = (u16)x + (u16)y + (u16)ci;
- *z = s;
- *co = s >> 8;
+r_add8(u8 *h, u8 *l, u8 x, u8 y, u8 z) {
+ u16 hl = (u16)x + (u16)y + (u16)z;
+ *l = hl;
+ *h = hl >> 8;
}
static inline void
-r_add16(u16 *co, u16 *z, u16 x, u16 y, u16 ci) {
- u32 s = (u32)x + (u32)y + (u32)ci;
- *z = s;
- *co = s >> 16;
+r_add16(u16 *h, u16 *l, u16 x, u16 y, u16 z) {
+ u32 hl = (u32)x + (u32)y + (u32)z;
+ *l = hl;
+ *h = hl >> 16;
}
static inline void
-r_add32(u32 *co, u32 *z, u32 x, u32 y, u32 ci) {
- u64 s = (u64)x + (u64)y + (u64)ci;
- *z = s;
- *co = s >> 32;
+r_add32(u32 *h, u32 *l, u32 x, u32 y, u32 z) {
+ u64 hl = (u64)x + (u64)y + (u64)z;
+ *l = hl;
+ *h = hl >> 32;
}
static inline void
-r_add64(u64 *co, u64 *z, u64 x, u64 y, u64 ci) {
-#ifdef HAVE_ADDC_SUBC
- *z = ADDC64(x, y, ci, co);
+r_add64(u64 *h, u64 *l, u64 x, u64 y, u64 z) {
+#ifdef R_HAVE_128
+ u128 hl = (u128)x + (u128)y + (u128)z;
+ *l = hl;
+ *h = hl >> 64;
#else
u64 s = x + y;
- u64 co0 = s < x;
- u64 t = s + ci;
- u64 co1 = t < s;
- *z = t;
- *co = co0 | co1;
+ bool c0 = s < x;
+ u64 t = s + z;
+ bool c1 = t < s;
+ *l = t;
+ *h = c0 + c1;
#endif
}
static inline void
-r_sub8(u8 *bo, u8 *z, u8 x, u8 y, u8 bi) {
- u16 d = (u16)x - (u16)y - (u16)bi;
- *z = d;
- *bo = -(d >> 8);
+r_sub8(u8 *h, u8 *l, u8 x, u8 y, u8 z) {
+ u16 hl = (u16)x - (u16)y - (u16)z;
+ *l = hl;
+ *h = hl >> 8;
}
static inline void
-r_sub16(u16 *bo, u16 *z, u16 x, u16 y, u16 bi) {
- u32 d = (u32)x + (u32)y + (u32)bi;
- *z = d;
- *bo = -(d >> 16);
+r_sub16(u16 *h, u16 *l, u16 x, u16 y, u16 z) {
+ u32 hl = (u32)x - (u32)y - (u32)z;
+ *l = hl;
+ *h = hl >> 16;
}
static inline void
-r_sub32(u32 *bo, u32 *z, u32 x, u32 y, u32 bi) {
- u64 d = (u64)x + (u64)y + (u64)bi;
- *z = d;
- *bo = -(d >> 32);
+r_sub32(u32 *h, u32 *l, u32 x, u32 y, u32 z) {
+ u64 hl = (u64)x - (u64)y - (u64)z;
+ *l = hl;
+ *h = hl >> 32;
}
static inline void
-r_sub64(u64 *bo, u64 *z, u64 x, u64 y, u64 bi) {
-#ifdef HAVE_ADDC_SUBC
- *z = SUBC64(x, y, bi, bo);
+r_sub64(u64 *h, u64 *l, u64 x, u64 y, u64 z) {
+#ifdef R_HAVE_128
+ u128 hl = (u128)x - (u128)y - (u128)z;
+ *l = hl;
+ *h = hl >> 64;
#else
u64 s = x - y;
- u64 bo0 = s > x;
- u64 t = s - bi;
- u64 bo1 = t > s;
- *z = t;
- *bo = bo0 | bo1;
+ bool c0 = s > x;
+ u64 t = s - z;
+ bool c1 = t > s;
+ *l = t;
+ *h = -c0 - c1;
#endif
}
@@ -303,31 +297,31 @@ r_sub64(u64 *bo, u64 *z, u64 x, u64 y, u64 bi) {
static inline void
r_mul8(u8 *h, u8 *l, u8 x, u8 y) {
- u16 xy = (u16)x * (u16)y;
- *l = xy;
- *h = xy >> 8;
+ u16 hl = (u16)x * (u16)y;
+ *l = hl;
+ *h = hl >> 8;
}
static inline void
r_mul16(u16 *h, u16 *l, u16 x, u16 y) {
- u32 xy = (u32)x * (u32)y;
- *l = xy;
- *h = xy >> 16;
+ u32 hl = (u32)x * (u32)y;
+ *l = hl;
+ *h = hl >> 16;
}
static inline void
r_mul32(u32 *h, u32 *l, u32 x, u32 y) {
- u64 xy = (u64)x * (u64)y;
- *l = xy;
- *h = xy >> 32;
+ u64 hl = (u64)x * (u64)y;
+ *l = hl;
+ *h = hl >> 32;
}
static inline void
r_mul64(u64 *h, u64 *l, u64 x, u64 y) {
#ifdef R_HAVE_128
- u128 xy = (u128)x * (u128)y;
- *l = xy;
- *h = xy >> 64;
+ u128 hl = (u128)x * (u128)y;
+ *l = hl;
+ *h = hl >> 64;
#else
const u64 m = (U64_C(1)<<32) - 1;