rcx

miscellaneous C library
git clone git://git.rr3.xyz/rcx

commit 840e43fa9a1ba629c16d76b65d84b8ba9076620a
parent ae1cedbd986cc171d082577490ad38bf6fc04f11
Author: Robert Russell <robert@rr3.xyz>
Date:   Sun, 12 Jan 2025 19:16:12 -0800

Change add/sub with carry to arbitrary ternary add/sub

See comment in source code.

This is not tested at all yet.
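Since the change is untested, here is a minimal, hypothetical sanity check (not part of the commit) that compares the new ternary r_add64/r_sub64 against unsigned __int128 reference arithmetic; it assumes inc/bits.h (or a header it includes) provides the u64 typedef and that the compiler is GCC or Clang on a 64-bit target:

/* Hypothetical test, not part of the commit: check r_add64/r_sub64 against
 * unsigned __int128. Assumes inc/bits.h provides the u64 typedef. */
#include <assert.h>
#include <stdio.h>
#include "inc/bits.h"

int
main(void) {
	u64 v[] = { 0, 1, 2, 0x8000000000000000u, 0xffffffffffffffffu, 1234567890123456789u };
	int n = sizeof v / sizeof v[0];
	for (int i = 0; i < n; i++)
	for (int j = 0; j < n; j++)
	for (int k = 0; k < n; k++) {
		u64 h, l;
		unsigned __int128 r;

		/* Ternary add: (h, l) must equal the full 128-bit sum. */
		r = (unsigned __int128)v[i] + v[j] + v[k];
		r_add64(&h, &l, v[i], v[j], v[k]);
		assert(l == (u64)r && h == (u64)(r >> 64));

		/* Ternary sub: (h, l) must equal the full 128-bit difference. */
		r = (unsigned __int128)v[i] - v[j] - v[k];
		r_sub64(&h, &l, v[i], v[j], v[k]);
		assert(l == (u64)r && h == (u64)(r >> 64));
	}
	puts("ternary add/sub: ok");
	return 0;
}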

Diffstat:
M inc/bits.h | 134 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------------------
1 file changed, 64 insertions(+), 70 deletions(-)

diff --git a/inc/bits.h b/inc/bits.h
@@ -205,92 +205,86 @@ r_rzb64(u64 n) {
 }
 
-/* ----- Add/subtract with carry/borrow ----- */
-
-/* TODO: Use addc/subc intrinsics for non-64 bit adds/subs as well? */
-#if __GNUC__ >= 14
-	#if __SIZEOF_INT__ == 8
-		#define HAVE_ADDC_SUBC 1
-		#define ADDC64 __builtin_addc
-		#define SUBC64 __builtin_subc
-	#elif __SIZEOF_LONG__ == 8
-		#define HAVE_ADDC_SUBC 1
-		#define ADDC64 __builtin_addcl
-		#define SUBC64 __builtin_subcl
-	#elif __SIZEOF_LONG_LONG == 8
-		#define HAVE_ADDC_SUBC 1
-		#define ADDC64 __builtin_addcll
-		#define SUBC64 __builtin_subcll
-	#endif
-#endif
+/* ----- Ternary add and subtract ----- */
+
+/* We implement ternary add/sub on arbitrary unsigned integers instead of with
+ * the third argument constrained to be a 1 bit carry/borrow, because compilers
+ * (or at least GCC) suck at generating good code with carries/borrows, so we
+ * might as well take the extra flexibility. Actually, the GCC 14.2 code gen
+ * for these functions is slightly better than the new __builtin_addc{,l,ll}
+ * intrinsics on my machine (according to a microbenchmark). */
 
 static inline void
-r_add8(u8 *co, u8 *z, u8 x, u8 y, u8 ci) {
-	u16 s = (u16)x + (u16)y + (u16)ci;
-	*z = s;
-	*co = s >> 8;
+r_add8(u8 *h, u8 *l, u8 x, u8 y, u8 z) {
+	u16 hl = (u16)x + (u16)y + (u16)z;
+	*l = hl;
+	*h = hl >> 8;
 }
 
 static inline void
-r_add16(u16 *co, u16 *z, u16 x, u16 y, u16 ci) {
-	u32 s = (u32)x + (u32)y + (u32)ci;
-	*z = s;
-	*co = s >> 16;
+r_add16(u16 *h, u16 *l, u16 x, u16 y, u16 z) {
+	u32 hl = (u32)x + (u32)y + (u32)z;
+	*l = hl;
+	*h = hl >> 16;
 }
 
 static inline void
-r_add32(u32 *co, u32 *z, u32 x, u32 y, u32 ci) {
-	u64 s = (u64)x + (u64)y + (u64)ci;
-	*z = s;
-	*co = s >> 32;
+r_add32(u32 *h, u32 *l, u32 x, u32 y, u32 z) {
+	u64 hl = (u64)x + (u64)y + (u64)z;
+	*l = hl;
+	*h = hl >> 32;
 }
 
 static inline void
-r_add64(u64 *co, u64 *z, u64 x, u64 y, u64 ci) {
-#ifdef HAVE_ADDC_SUBC
-	*z = ADDC64(x, y, ci, co);
+r_add64(u64 *h, u64 *l, u64 x, u64 y, u64 z) {
+#ifdef R_HAVE_128
+	u128 hl = (u128)x + (u128)y + (u128)z;
+	*l = hl;
+	*h = hl >> 64;
 #else
 	u64 s = x + y;
-	u64 co0 = s < x;
-	u64 t = s + ci;
-	u64 co1 = t < s;
-	*z = t;
-	*co = co0 | co1;
+	bool c0 = s < x;
+	u64 t = s + z;
+	bool c1 = t < s;
+	*l = t;
+	*h = c0 + c1;
 #endif
 }
 
 static inline void
-r_sub8(u8 *bo, u8 *z, u8 x, u8 y, u8 bi) {
-	u16 d = (u16)x - (u16)y - (u16)bi;
-	*z = d;
-	*bo = -(d >> 8);
+r_sub8(u8 *h, u8 *l, u8 x, u8 y, u8 z) {
+	u16 hl = (u16)x - (u16)y - (u16)z;
+	*l = hl;
+	*h = hl >> 8;
 }
 
 static inline void
-r_sub16(u16 *bo, u16 *z, u16 x, u16 y, u16 bi) {
-	u32 d = (u32)x + (u32)y + (u32)bi;
-	*z = d;
-	*bo = -(d >> 16);
+r_sub16(u16 *h, u16 *l, u16 x, u16 y, u16 z) {
+	u32 hl = (u32)x - (u32)y - (u32)z;
+	*l = hl;
+	*h = hl >> 16;
 }
 
 static inline void
-r_sub32(u32 *bo, u32 *z, u32 x, u32 y, u32 bi) {
-	u64 d = (u64)x + (u64)y + (u64)bi;
-	*z = d;
-	*bo = -(d >> 32);
+r_sub32(u32 *h, u32 *l, u32 x, u32 y, u32 z) {
+	u64 hl = (u64)x - (u64)y - (u64)z;
+	*l = hl;
+	*h = hl >> 32;
 }
 
 static inline void
-r_sub64(u64 *bo, u64 *z, u64 x, u64 y, u64 bi) {
-#ifdef HAVE_ADDC_SUBC
-	*z = SUBC64(x, y, bi, bo);
+r_sub64(u64 *h, u64 *l, u64 x, u64 y, u64 z) {
+#ifdef R_HAVE_128
+	u128 hl = (u128)x - (u128)y - (u128)z;
+	*l = hl;
+	*h = hl >> 64;
 #else
 	u64 s = x - y;
-	u64 bo0 = s > x;
-	u64 t = s - bi;
-	u64 bo1 = t > s;
-	*z = t;
-	*bo = bo0 | bo1;
+	bool c0 = s > x;
+	u64 t = s - z;
+	bool c1 = t > s;
+	*l = t;
+	*h = -c0 - c1;
 #endif
 }
@@ -303,31 +297,31 @@ r_sub64(u64 *bo, u64 *z, u64 x, u64 y, u64 bi) {
 
 static inline void
 r_mul8(u8 *h, u8 *l, u8 x, u8 y) {
-	u16 xy = (u16)x * (u16)y;
-	*l = xy;
-	*h = xy >> 8;
+	u16 hl = (u16)x * (u16)y;
+	*l = hl;
+	*h = hl >> 8;
 }
 
 static inline void
 r_mul16(u16 *h, u16 *l, u16 x, u16 y) {
-	u32 xy = (u32)x * (u32)y;
-	*l = xy;
-	*h = xy >> 16;
+	u32 hl = (u32)x * (u32)y;
+	*l = hl;
+	*h = hl >> 16;
 }
 
 static inline void
 r_mul32(u32 *h, u32 *l, u32 x, u32 y) {
-	u64 xy = (u64)x * (u64)y;
-	*l = xy;
-	*h = xy >> 32;
+	u64 hl = (u64)x * (u64)y;
+	*l = hl;
+	*h = hl >> 32;
}
 
 static inline void
 r_mul64(u64 *h, u64 *l, u64 x, u64 y) {
 #ifdef R_HAVE_128
-	u128 xy = (u128)x * (u128)y;
-	*l = xy;
-	*h = xy >> 64;
+	u128 hl = (u128)x * (u128)y;
+	*l = hl;
+	*h = hl >> 64;
 #else
 	const u64 m = (U64_C(1)<<32) - 1;
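
The comment added above argues for taking an arbitrary u64 third operand instead of a 1 bit carry. As a hypothetical usage sketch (the function add256 and its limb layout are mine, not part of the library), ripple-carry addition of 256-bit integers falls out of the new ternary r_add64 by feeding each step's high word back in as the next step's third operand:

/* Hypothetical example, not part of the commit: add two little-endian 256-bit
 * integers limb by limb with the ternary r_add64. The carry chain is just the
 * high word fed back as the third operand; because that operand is a full u64
 * rather than a 1-bit carry, the same call also fits accumulation loops where
 * the "carry" can exceed 1. */
#include "inc/bits.h"

static u64
add256(u64 z[4], const u64 x[4], const u64 y[4]) {
	u64 c = 0;
	for (int i = 0; i < 4; i++)
		r_add64(&c, &z[i], x[i], y[i], c);
	return c; /* carry out of the whole 256-bit sum (0 or 1 here) */
}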