commit ea49702480864306556b6818008f555fcab651ba
parent be1311cb7ebfa8ce641861617a4f0c5f739badd5
Author: Robert Russell <robert@rr3.xyz>
Date: Thu, 2 Jan 2025 01:36:45 -0800
Use __builtin_{add,sub}cl
Diffstat:
 M bigmul.c  | 88 +++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
 M config.mk |  2 +-
2 files changed, 53 insertions(+), 37 deletions(-)
diff --git a/bigmul.c b/bigmul.c
@@ -3,6 +3,9 @@
#include <stdio.h>
#include <unistd.h>
+// TODO: Add (slow) fallback for __builtin_{add,sub}cl when not using clang or
+// GCC 14. Maybe that should go in rcx.
+
#define KARATSUBA_THRESH 32 // Best power of 2 determined via benchmarking
struct nat {
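The TODO above asks for a slow fallback when the carry builtins are missing. A rough sketch of what that could look like (untested; u64 is the repo's typedef, and the *_fallback names are hypothetical, not part of this patch):

    // Hypothetical fallback for __builtin_addcl: returns x + y + cin and
    // stores the carry in *cout. Summing c1 + c2 keeps it exact even when
    // cin is a full u64 (fmaa64 below passes z as the carry-in).
    static inline u64
    addcl_fallback(u64 x, u64 y, u64 cin, u64 *cout) {
        u64 s = x + y;
        u64 c1 = s < x;   // carry out of x + y
        s += cin;
        u64 c2 = s < cin; // carry out of (x + y) + cin
        *cout = c1 + c2;
        return s;
    }

    // Hypothetical fallback for __builtin_subcl: returns x - y - bin and
    // stores the borrow in *bout. The two partial borrows can never both
    // be 1 when bin is 0 or 1, which is how add() and sub() below use it.
    static inline u64
    subcl_fallback(u64 x, u64 y, u64 bin, u64 *bout) {
        u64 d = x - y;
        u64 b1 = x < y;   // borrow out of x - y
        u64 b2 = d < bin; // borrow out of (x - y) - bin
        *bout = b1 + b2;
        return d - bin;
    }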
@@ -16,25 +19,19 @@ typedef struct nat Nat[1];
/* ----- Wide u64 math ----- */
-// TODO: Wide sub doesn't make much sense.
-#define WIDE(rh, rl, x, op, y) do { \
- u128 r = (u128)(x) op (u128)(y); \
- *(rh) = r >> 64; \
- *(rl) = r; \
- } while (0)
-inline void add64(u64 *rh, u64 *rl, u64 x, u64 y) { WIDE(rh, rl, x, +, y); }
-inline void sub64(u64 *rh, u64 *rl, u64 x, u64 y) { WIDE(rh, rl, x, -, y); }
-inline void mul64(u64 *rh, u64 *rl, u64 x, u64 y) { WIDE(rh, rl, x, *, y); }
-#undef WIDE
+inline void
+mul64(u64 *rh, u64 *rl, u64 x, u64 y) {
+ u128 r = (u128)x * (u128)y;
+ *rh = r >> 64;
+ *rl = r;
+}
inline void
fmaa64(u64 *rh, u64 *rl, u64 w, u64 x, u64 y, u64 z) {
- u64 h0, h1, h2, l;
- mul64(&h0, &l, w, x); // h0:l = w * x
- add64(&h1, &l, l, y); // h1:l = l + y
- add64(&h2, &l, l, z); // h2:l = l + z
- *rh = h0 + h1 + h2;
- *rl = l;
+ u64 h0, h1, l;
+ mul64(&h0, &l, w, x); // h0:l = w * x
+ *rl = __builtin_addcl(l, y, z, &h1); // h1:rl = l + y + z
+ *rh = h0 + h1;
}
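One subtlety in the new fmaa64: the builtin's third argument is nominally a carry-in, and here it is a full 64-bit z rather than a 0/1 flag, so the low-word carry h1 can reach 2. GCC 14 documents the carry-out as the sum of the two partial overflow flags, which keeps this exact; a compiler that ORs them instead would drop a carry. A hypothetical self-check against direct u128 arithmetic (not part of the patch):

    // Hypothetical check; the first row forces the low-word carry of 2
    // (1 * (2^64 - 1) leaves l = 2^64 - 1, then y = z = 2^64 - 1).
    int
    fmaa64_check(void) {
        u64 cases[][4] = {
            {1, ~0ull, ~0ull, ~0ull},
            {~0ull, ~0ull, ~0ull, ~0ull},
            {123, 456, 789, 10},
        };
        for (usize i = 0; i < sizeof cases / sizeof *cases; i++) {
            u64 *t = cases[i], rh, rl;
            fmaa64(&rh, &rl, t[0], t[1], t[2], t[3]);
            u128 want = (u128)t[0] * t[1] + t[2] + t[3];
            if (rh != (u64)(want >> 64) || rl != (u64)want)
                return 0;
        }
        return 1; // all cases match
    }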
@@ -44,15 +41,26 @@ fmaa64(u64 *rh, u64 *rl, u64 w, u64 x, u64 y, u64 z) {
u64
add(u64 *r, u64 *x, usize m, u64 *y, usize n) {
u64 c = 0;
- for (usize i = 0; i < n; i++) {
- u64 h0, h1, l;
- add64(&h0, &l, x[i], y[i]); // h0:l = x[i] + y[i]
- add64(&h1, &l, l, c); // h1:l = l + c
- c = h0 + h1;
- r[i] = l;
+ usize i = 0;
+
+ for (; i + 3 < n; i += 4) {
+ r[i + 0] = __builtin_addcl(x[i + 0], y[i + 0], c, &c);
+ r[i + 1] = __builtin_addcl(x[i + 1], y[i + 1], c, &c);
+ r[i + 2] = __builtin_addcl(x[i + 2], y[i + 2], c, &c);
+ r[i + 3] = __builtin_addcl(x[i + 3], y[i + 3], c, &c);
+ }
+ for (; i < n; i++)
+ r[i] = __builtin_addcl(x[i], y[i], c, &c);
+
+ for (; i + 3 < m; i += 4) {
+ r[i + 0] = __builtin_addcl(x[i + 0], 0, c, &c);
+ r[i + 1] = __builtin_addcl(x[i + 1], 0, c, &c);
+ r[i + 2] = __builtin_addcl(x[i + 2], 0, c, &c);
+ r[i + 3] = __builtin_addcl(x[i + 3], 0, c, &c);
}
- for (usize i = n; i < m; i++)
- add64(&c, &r[i], x[i], c); // c:r[i] = x[i] + c
+ for (; i < m; i++)
+ r[i] = __builtin_addcl(x[i], 0, c, &c);
+
return c;
}
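The rewritten add assumes m >= n: the first two loops add the overlapping limbs, the trailing two propagate the carry through x's remaining limbs, and the 4-way unrolling presumably lets the compiler keep the carry in the flags register across iterations instead of rebuilding it each time. A hypothetical usage sketch, with little-endian limbs as the loops imply:

    // x = 2^128 - 1 as two limbs, y = 1: the carry ripples all the way
    // out, so r ends up {0, 0} and add() returns 1.
    int
    add_demo(void) {
        u64 x[2] = {~0ull, ~0ull}, y[1] = {1}, r[2];
        u64 c = add(r, x, 2, y, 1);
        return c == 1 && r[0] == 0 && r[1] == 0; // expect 1
    }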
@@ -61,20 +69,28 @@ add(u64 *r, u64 *x, usize m, u64 *y, usize n) {
// ("sub" backwards) for when m < n.
u64
sub(u64 *r, u64 *x, usize m, u64 *y, usize n) {
- u64 b = 0;
- for (usize i = 0; i < n; i++) {
- u64 h0, h1, l;
- sub64(&h0, &l, x[i], y[i]); // h0:l = x[i] - y[i]
- sub64(&h1, &l, l, b); // h1:l = l - b
- b = -h0 + -h1;
- r[i] = l;
+ u64 c = 0;
+ usize i = 0;
+
+ for (; i + 3 < n; i += 4) {
+ r[i + 0] = __builtin_subcl(x[i + 0], y[i + 0], c, &c);
+ r[i + 1] = __builtin_subcl(x[i + 1], y[i + 1], c, &c);
+ r[i + 2] = __builtin_subcl(x[i + 2], y[i + 2], c, &c);
+ r[i + 3] = __builtin_subcl(x[i + 3], y[i + 3], c, &c);
}
- for (usize i = n; i < m; i++) {
- u64 h;
- sub64(&h, &r[i], x[i], b);
- b = -h;
+ for (; i < n; i++)
+ r[i] = __builtin_subcl(x[i], y[i], c, &c);
+
+ for (; i + 3 < m; i += 4) {
+ r[i + 0] = __builtin_subcl(x[i + 0], 0, c, &c);
+ r[i + 1] = __builtin_subcl(x[i + 1], 0, c, &c);
+ r[i + 2] = __builtin_subcl(x[i + 2], 0, c, &c);
+ r[i + 3] = __builtin_subcl(x[i + 3], 0, c, &c);
}
- return b;
+ for (; i < m; i++)
+ r[i] = __builtin_subcl(x[i], 0, c, &c);
+
+ return c;
}
// Precondition: r does not intersect x nor y
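sub mirrors add exactly, with c now holding a borrow rather than a carry (the diff renames the old b in the process); per the comment above it, the m < n case is handled by a separate bus ("sub" backwards). A matching hypothetical sketch:

    // x = {0, 1} is 2^64; subtracting y = {1} borrows from the high limb,
    // leaving r = {2^64 - 1, 0} with no borrow out.
    int
    sub_demo(void) {
        u64 x[2] = {0, 1}, y[1] = {1}, r[2];
        u64 b = sub(r, x, 2, y, 1);
        return b == 0 && r[0] == ~0ull && r[1] == 0; // expect 1
    }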
diff --git a/config.mk b/config.mk
@@ -1,3 +1,3 @@
CFLAGS = -Wall -O3 -march=native
-CC = cc
+CC = gcc-14.2