bigmul

big multiplication in C
git clone git://git.rr3.xyz/bigmul
Log | Files | Refs | README | LICENSE

commit ea49702480864306556b6818008f555fcab651ba
parent be1311cb7ebfa8ce641861617a4f0c5f739badd5
Author: Robert Russell <robert@rr3.xyz>
Date:   Thu,  2 Jan 2025 01:36:45 -0800

Use __builtin_{add,sub}cl

Diffstat:
Mbigmul.c | 88+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
Mconfig.mk | 2+-
2 files changed, 53 insertions(+), 37 deletions(-)

diff --git a/bigmul.c b/bigmul.c @@ -3,6 +3,9 @@ #include <stdio.h> #include <unistd.h> +// TODO: Add (slow) fallback for __builtin_{add,sub}cl when not using clang or +// GCC 14. Maybe that should go in rcx. + #define KARATSUBA_THRESH 32 // Best power of 2 determined via benchmarking struct nat { @@ -16,25 +19,19 @@ typedef struct nat Nat[1]; /* ----- Wide u64 math ----- */ -// TODO: Wide sub doesn't make much sense. -#define WIDE(rh, rl, x, op, y) do { \ - u128 r = (u128)(x) op (u128)(y); \ - *(rh) = r >> 64; \ - *(rl) = r; \ - } while (0) -inline void add64(u64 *rh, u64 *rl, u64 x, u64 y) { WIDE(rh, rl, x, +, y); } -inline void sub64(u64 *rh, u64 *rl, u64 x, u64 y) { WIDE(rh, rl, x, -, y); } -inline void mul64(u64 *rh, u64 *rl, u64 x, u64 y) { WIDE(rh, rl, x, *, y); } -#undef WIDE +inline void +mul64(u64 *rh, u64 *rl, u64 x, u64 y) { + u128 r = (u128)x * (u128)y; + *rh = r >> 64; + *rl = r; +} inline void fmaa64(u64 *rh, u64 *rl, u64 w, u64 x, u64 y, u64 z) { - u64 h0, h1, h2, l; - mul64(&h0, &l, w, x); // h0:l = w * x - add64(&h1, &l, l, y); // h1:l = l + y - add64(&h2, &l, l, z); // h2:l = l + z - *rh = h0 + h1 + h2; - *rl = l; + u64 h0, h1, l; + mul64(&h0, &l, w, x); // h0:l = w * x + *rl = __builtin_addcl(l, y, z, &h1); // h1:rl = l + y + z + *rh = h0 + h1; } @@ -44,15 +41,26 @@ fmaa64(u64 *rh, u64 *rl, u64 w, u64 x, u64 y, u64 z) { u64 add(u64 *r, u64 *x, usize m, u64 *y, usize n) { u64 c = 0; - for (usize i = 0; i < n; i++) { - u64 h0, h1, l; - add64(&h0, &l, x[i], y[i]); // h0:l = x[i] + y[i] - add64(&h1, &l, l, c); // h1:l = l + c - c = h0 + h1; - r[i] = l; + usize i = 0; + + for (; i + 3 < n; i += 4) { + r[i + 0] = __builtin_addcl(x[i + 0], y[i + 0], c, &c); + r[i + 1] = __builtin_addcl(x[i + 1], y[i + 1], c, &c); + r[i + 2] = __builtin_addcl(x[i + 2], y[i + 2], c, &c); + r[i + 3] = __builtin_addcl(x[i + 3], y[i + 3], c, &c); + } + for (; i < n; i++) + r[i] = __builtin_addcl(x[i], y[i], c, &c); + + for (; i + 3 < m; i += 4) { + r[i + 0] = __builtin_addcl(x[i + 0], 0, c, &c); + r[i + 1] = __builtin_addcl(x[i + 1], 0, c, &c); + r[i + 2] = __builtin_addcl(x[i + 2], 0, c, &c); + r[i + 3] = __builtin_addcl(x[i + 3], 0, c, &c); } - for (usize i = n; i < m; i++) - add64(&c, &r[i], x[i], c); // c:r[i] = x[i] + c + for (; i < m; i++) + r[i] = __builtin_addcl(x[i], 0, c, &c); + return c; } @@ -61,20 +69,28 @@ add(u64 *r, u64 *x, usize m, u64 *y, usize n) { // ("sub" backwards) for when m < n. u64 sub(u64 *r, u64 *x, usize m, u64 *y, usize n) { - u64 b = 0; - for (usize i = 0; i < n; i++) { - u64 h0, h1, l; - sub64(&h0, &l, x[i], y[i]); // h0:l = x[i] - y[i] - sub64(&h1, &l, l, b); // h1:l = l - b - b = -h0 + -h1; - r[i] = l; + u64 c = 0; + usize i = 0; + + for (; i + 3 < n; i += 4) { + r[i + 0] = __builtin_subcl(x[i + 0], y[i + 0], c, &c); + r[i + 1] = __builtin_subcl(x[i + 1], y[i + 1], c, &c); + r[i + 2] = __builtin_subcl(x[i + 2], y[i + 2], c, &c); + r[i + 3] = __builtin_subcl(x[i + 3], y[i + 3], c, &c); } - for (usize i = n; i < m; i++) { - u64 h; - sub64(&h, &r[i], x[i], b); - b = -h; + for (; i < n; i++) + r[i] = __builtin_subcl(x[i], y[i], c, &c); + + for (; i + 3 < m; i += 4) { + r[i + 0] = __builtin_subcl(x[i + 0], 0, c, &c); + r[i + 1] = __builtin_subcl(x[i + 1], 0, c, &c); + r[i + 2] = __builtin_subcl(x[i + 2], 0, c, &c); + r[i + 3] = __builtin_subcl(x[i + 3], 0, c, &c); } - return b; + for (; i < m; i++) + r[i] = __builtin_subcl(x[i], 0, c, &c); + + return c; } // Precondition: r does not intersect x nor y diff --git a/config.mk b/config.mk @@ -1,3 +1,3 @@ CFLAGS = -Wall -O3 -march=native -CC = cc +CC = gcc-14.2