commit 6929edbb9449c0929418f06fc6137aafc4b342cf
Author: Robert Russell <robert@rr3.xyz>
Date: Tue, 31 Dec 2024 19:13:37 -0800
Add apparently working naive and Karatsuba multipliers
Diffstat:
5 files changed, 179 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+bigmul
+\ No newline at end of file
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,15 @@
+ISC License
+
+Copyright (c) 2024, Robert Russell
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+\ No newline at end of file
diff --git a/Makefile b/Makefile
@@ -0,0 +1,11 @@
+.POSIX:
+
+include config.mk
+
+bigmul: bigmul.c config.mk
+ $(CC) $(CFLAGS) -o $@ $< -lrcx
+
+clean:
+ rm -f bigmul
+
+.PHONY: clean
diff --git a/bigmul.c b/bigmul.c
@@ -0,0 +1,147 @@
+#include <rcx/all.h>
+#include <stdio.h>
+#include <unistd.h>
+
+struct nat {
+ usize cap;
+ usize len;
+ u64 dat[];
+};
+
+typedef struct nat Nat[1];
+
+// TODO: Make sure that GCC inlines and compiles {add,sub}64 well.
+// Try __builtin_{add,sub}c{,l,ll}.
+
+inline void
+add64(u64 *co, u64 *r, u64 x, u64 y, u64 ci) {
+ u128 cr = (u128)x + (u128)y + (u128)ci;
+ *co = cr >> 64;
+ *r = cr;
+}
+
+inline void
+sub64(u64 *bo, u64 *r, u64 x, u64 y, u64 bi) {
+ u128 br = (u128)x - (u128)y - (u128)bi;
+ *bo = -(br >> 64);
+ *r = br;
+}
+
+inline void
+mul64(u64 *rh, u64 *rl, u64 x, u64 y) {
+ u128 r = (u128)x * (u128)y;
+ *rh = r >> 64;
+ *rl = r;
+}
+
+/*
+inline void
+fma64(u64 *rh, u64 *rl, u64 x, u64 y, u64 z) {
+ u64 h, l, c;
+ mul64(&h, &l, x, y);
+ add64(&c, rl, l, z, 0);
+ *rh = h + c;
+}
+*/
+
+inline void
+fmaa64(u64 *rh, u64 *rl, u64 w, u64 x, u64 y, u64 z) {
+ u64 h, l, c, d;
+ mul64(&h, &l, w, x); // [h, l] = w * x
+ add64(&c, &l, l, y, 0); // [c, l] = l + y
+ add64(&d, rl, l, z, 0); // [d, rl] = l + z
+ *rh = h + c + d; // rh = h + c + d
+}
+
+// Precondition: m >= n
+void
+add(u64 *r, u64 *x, usize m, u64 *y, usize n) {
+ u64 c = 0;
+ for (usize i = 0; i < n; i++)
+ add64(&c, &r[i], x[i], y[i], c);
+ for (usize i = n; i < m; i++)
+ add64(&c, &r[i], x[i], c, 0);
+ r[m] = c;
+}
+
+// Precondition: m >= n
+// TODO: sub is not commutative like add, so we need a "bus" operation
+// ("sub" backwards) for when m < n.
+void
+sub(u64 *r, u64 *x, usize m, u64 *y, usize n) {
+ u64 b = 0;
+ for (usize i = 0; i < n; i++)
+ sub64(&b, &r[i], x[i], y[i], b);
+ for (usize i = n; i < m; i++)
+ sub64(&b, &r[i], x[i], b, 0);
+ r[m] = -b;
+}
+
+// Precondition: r does not intersect x nor y
+void
+mul_quadratic(u64 *r, u64 *x, usize m, u64 *y, usize n) {
+ memset(r, 0, (m + n) * sizeof *r);
+ for (usize j = 0; j < n; j++) {
+ u64 c = 0;
+ for (usize i = 0; i < m; i++)
+ fmaa64(&c, &r[i + j], x[i], y[j], r[i + j], c);
+ r[m + j] = c;
+ }
+}
+
+// Precondition: r does not intersect x nor y
+void
+mul_karatsuba(u64 *r, u64 *x, usize m, u64 *y, usize n) {
+ // If we allow m < 4 and n < 4, then the recursion is not well-founded.
+ // TODO: Determine best threshold for quadratic.
+ if (m < 4 && n < 4) {
+ mul_quadratic(r, x, m, y, n);
+ return;
+ }
+
+ usize k = (MAX(m, n) + 1) / 2;
+
+ // 1. Split x
+ usize mh = m > k ? m - k : 0, ml = MIN(k, m);
+ u64 *xh = x + ml, *xl = x; // x = xh * b^k + xl
+
+ // 2. Split y
+ usize nh = n > k ? n - k : 0, nl = MIN(k, n);
+ u64 *yh = y + nl, *yl = y; // y = yh * b^k + yl
+
+ // 3. Assign/allocate memory
+ // Note that we use the output buffer r as temporary storage for
+ // intermediate results.
+ // TODO: Accept capacity of r as an argument, and use excess memory for t
+ // if it's big enough instead of allocating. Or at least allocate once in
+ // the first recursive instances of the function, and pass the memory down.
+ usize rw = m + n;
+ usize pw = MAX(ml, mh) + 1; u64 *p = r;
+ usize qw = MAX(nl, nh) + 1; u64 *q = r + pw;
+ usize tw = pw + qw; u64 *t = r_eallocn(tw, sizeof *t);
+ usize uw = ml + nl; u64 *u = r;
+ usize sw = mh + mh; u64 *s = r + 2 * k;
+
+ // 4. Arithmetic
+ // TODO: Explain algorithm.
+ add(p, xl, ml, xh, mh); // p = xl + xh
+ add(q, yl, nl, yh, nh); // q = yl + yh
+ mul_karatsuba(t, p, pw, q, qw); // t = p * q
+ mul_karatsuba(u, xl, ml, yl, nl); // u = xl * yl
+ mul_karatsuba(s, xh, mh, yh, nh); // s = xh * yh
+ sub(t, t, tw, u, uw); // t -= u
+ sub(t, t, tw, s, sw); // t -= s
+ add(r + k, r + k, rw - k, t, tw); // r[k..] += t
+
+ free(t);
+}
+
+int
+main(void) {
+ u64 x[] = { 0x1234123412341234, 0x5678567856785678, 0x89ab89ab89ab89ab, 0xcdefcdefcdefcdef };
+ u64 y[] = { 0x4321432143214321, 0x8765876587658765, 0xba98ba98ba98ba98, 0xfedcfedcfedcfedc };
+ u64 r0[LEN(x) + LEN(y)]; mul_quadratic(r0, x, LEN(x), y, LEN(y));
+ u64 r1[LEN(x) + LEN(y)]; mul_karatsuba(r1, x, LEN(x), y, LEN(y));
+ printf("0x%016lx%016lx%016lx%016lx%016lx%016lx%016lx%016lx\n", r0[7], r0[6], r0[5], r0[4], r0[3], r0[2], r0[1], r0[0]);
+ printf("0x%016lx%016lx%016lx%016lx%016lx%016lx%016lx%016lx\n", r1[7], r1[6], r1[5], r1[4], r1[3], r1[2], r1[1], r1[0]);
+}
diff --git a/config.mk b/config.mk
@@ -0,0 +1,3 @@
+CFLAGS = -Wall -O3 -march=native
+
+CC = cc