commit 353a8eb15242460ce42434860a678efadf9d4c87
parent 7b0840047035241b7d2596c216111f84c47fac1e
Author: Robert Russell <robert@rr3.xyz>
Date: Thu, 2 Jan 2025 18:18:54 -0800
Add README and clean up more
Diffstat:
2 files changed, 13 insertions(+), 8 deletions(-)
diff --git a/README b/README
@@ -0,0 +1,8 @@
+This is an implementation of arbitrary width multiplication in C. The main goal
+here was to learn what CPU hardware features might benefit arbitrary width
+multiplication (similar to how an ADDC (add with carry) instruction makes
+arbitrary width addition efficient). I think the main takeaway is that an
+FMAA (fused multiply-add-add) instruction would be useful, especially
+considering Dadda multipliers (and probably other hardware multiplier designs)
+can be extended with a relatively small amount of circuitry to support FMAA
+without increasing latency.
diff --git a/bigmul.c b/bigmul.c
@@ -11,14 +11,6 @@
// karatsuba).
#define KARATSUBA_THRESH 32
-struct nat {
- usize cap;
- usize len;
- u64 dat[];
-};
-
-typedef struct nat Nat[1];
-
/* ----- Wide u64 math ----- */
@@ -98,6 +90,8 @@ sub(u64 *r, u64 *x, usize m, u64 *y, usize n) {
return c;
}
+// Precondition: capacity(r) >= m + n, capacity(x) = m, capactiy(y) = n
+// Precondition: r is disjoint with x and y
void
fma_quadratic(u64 *r, u64 *x, usize m, u64 *y, usize n) {
for (usize j = 0; j < n; j++) {
@@ -246,6 +240,9 @@ mul_karatsuba(u64 *r, u64 *x, usize m, u64 *y, usize n) {
(n < KARATSUBA_THRESH ? fma_quadratic : fma_karatsuba)(r, x, m, y, n);
}
+
+/* ----- Benchmarks ----- */
+
u64 x[4096];
u64 y[4096];
u64 r[8192];