commit 40dbe5ef3ef2f69f0ab4135b3b9a7237dedf70a7
Author: Robert Russell <robert@rr3.xyz>
Date: Sun, 17 Aug 2025 17:36:40 -0700
Initial commit
Diffstat:
| A | .gitignore | | | 2 | ++ |
| A | Makefile | | | 8 | ++++++++ |
| A | func.c | | | 117 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | func.h | | | 6 | ++++++ |
| A | main.c | | | 29 | +++++++++++++++++++++++++++++ |
5 files changed, 162 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+func.o
+main
diff --git a/Makefile b/Makefile
@@ -0,0 +1,8 @@
+main: main.c func.c func.h
+ gcc -c -Wall -flax-vector-conversions -O3 -march=native func.c -lrcx
+ gcc -o main -Wall -O3 -march=native main.c func.o -lrcx
+
+.PHONY: clean
+clean:
+ rm -f func.o main
+
diff --git a/func.c b/func.c
@@ -0,0 +1,117 @@
+#include <rcx/all.h>
+
+// GCC unrolls this completely on -O3.
+u128
+reduce_scalar1024(u64 *x) {
+ u128 s = 0;
+ for (usize i = 0; i < 16; i++)
+ s += (u128)x[i];
+ return s;
+}
+
+// GCC does not unroll this on -O3.
+u128
+reduce_scalar2048(u64 *x) {
+ u128 s = 0;
+ for (usize i = 0; i < 32; i++)
+ s += (u128)x[i];
+ return s;
+}
+
+u128
+reduce_avx1024(u64 *x) {
+ // 1. Load input
+ v4u64 x_0_1_2_3; memcpy(&x_0_1_2_3, x+0x0, 32);
+ v4u64 x_4_5_6_7; memcpy(&x_4_5_6_7, x+0x4, 32);
+ v4u64 x_8_9_A_B; memcpy(&x_8_9_A_B, x+0x8, 32);
+ v4u64 x_C_D_E_F; memcpy(&x_C_D_E_F, x+0xc, 32);
+
+ // 2. Reduce to 2x256 bits
+ v4u64 l_04_15_26_37 = x_0_1_2_3 + x_4_5_6_7;
+ v4u64 h_04_15_26_37 = (l_04_15_26_37 < x_0_1_2_3) >> 63;
+ v4u64 l_8C_9D_AE_BF = x_8_9_A_B + x_C_D_E_F;
+ v4u64 h_8C_9D_AE_BF = (l_8C_9D_AE_BF < x_8_9_A_B) >> 63;
+ v4u64 l_048C_159D_26AE_37BF = l_04_15_26_37 + l_8C_9D_AE_BF;
+ v4u64 h_048C_159D_26AE_37BF = ((l_048C_159D_26AE_37BF < l_04_15_26_37) >> 63)
+ + h_04_15_26_37 + h_8C_9D_AE_BF;
+
+ // 3. Split to 128-bit halves
+ v2u64 l_048C_159D = { l_048C_159D_26AE_37BF[0], l_048C_159D_26AE_37BF[1] };
+ v2u64 h_048C_159D = { h_048C_159D_26AE_37BF[0], h_048C_159D_26AE_37BF[1] };
+ v2u64 l_26AE_37BF = { l_048C_159D_26AE_37BF[2], l_048C_159D_26AE_37BF[3] };
+ v2u64 h_26AE_37BF = { h_048C_159D_26AE_37BF[2], h_048C_159D_26AE_37BF[3] };
+
+ // 4. Reduce to 2x128 bits
+ v2u64 l_02468ACE_13579BDF = l_048C_159D + l_26AE_37BF;
+ v2u64 h_02468ACE_13579BDF = ((l_02468ACE_13579BDF < l_048C_159D) >> 63)
+ + h_048C_159D + h_26AE_37BF;
+
+ // 5. Split to 64-bits halves
+ u64 l_02468ACE = l_02468ACE_13579BDF[0];
+ u64 h_02468ACE = h_02468ACE_13579BDF[0];
+ u64 l_13579BDF = l_02468ACE_13579BDF[1];
+ u64 h_13579BDF = h_02468ACE_13579BDF[1];
+
+ // 6. Reduce to 2x64 bits
+ u64 l_0123456789ABCDEF = l_02468ACE + l_13579BDF;
+ u64 h_0123456789ABCDEF = (l_0123456789ABCDEF < l_02468ACE)
+ + h_02468ACE + h_13579BDF;
+
+ return ((u128)h_0123456789ABCDEF << 64) | (u128)l_0123456789ABCDEF;
+}
+
+u128
+reduce_avx2048(u64 *x) {
+ // 1. Load input
+ v4u64 x_0_1_2_3; memcpy(&x_0_1_2_3, x+0x00, 32);
+ v4u64 x_4_5_6_7; memcpy(&x_4_5_6_7, x+0x04, 32);
+ v4u64 x_8_9_A_B; memcpy(&x_8_9_A_B, x+0x08, 32);
+ v4u64 x_C_D_E_F; memcpy(&x_C_D_E_F, x+0x0c, 32);
+ v4u64 x_G_H_I_J; memcpy(&x_G_H_I_J, x+0x10, 32);
+ v4u64 x_K_L_M_N; memcpy(&x_K_L_M_N, x+0x14, 32);
+ v4u64 x_O_P_Q_R; memcpy(&x_O_P_Q_R, x+0x18, 32);
+ v4u64 x_S_T_U_V; memcpy(&x_S_T_U_V, x+0x1c, 32);
+
+ // 2. Reduce to 2x256 bits
+ v4u64 l_04_15_26_37 = x_0_1_2_3 + x_4_5_6_7;
+ v4u64 h_04_15_26_37 = (l_04_15_26_37 < x_0_1_2_3) >> 63;
+ v4u64 l_8C_9D_AE_BF = x_8_9_A_B + x_C_D_E_F;
+ v4u64 h_8C_9D_AE_BF = (l_8C_9D_AE_BF < x_8_9_A_B) >> 63;
+ v4u64 l_048C_159D_26AE_37BF = l_04_15_26_37 + l_8C_9D_AE_BF;
+ v4u64 h_048C_159D_26AE_37BF = ((l_048C_159D_26AE_37BF < l_04_15_26_37) >> 63)
+ + h_04_15_26_37 + h_8C_9D_AE_BF;
+ v4u64 l_GK_HL_IM_JN = x_G_H_I_J + x_K_L_M_N;
+ v4u64 h_GK_HL_IM_JN = (l_GK_HL_IM_JN < x_G_H_I_J) >> 63;
+ v4u64 l_OS_PT_QU_RV = x_O_P_Q_R + x_S_T_U_V;
+ v4u64 h_OS_PT_QU_RV = (l_OS_PT_QU_RV < x_O_P_Q_R) >> 63;
+ v4u64 l_GKOS_HLPT_IMQU_JNRV = l_GK_HL_IM_JN + l_OS_PT_QU_RV;
+ v4u64 h_GKOS_HLPT_IMQU_JNRV = ((l_GKOS_HLPT_IMQU_JNRV < l_GK_HL_IM_JN) >> 63)
+ + h_GK_HL_IM_JN + h_OS_PT_QU_RV;
+ v4u64 l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV = l_048C_159D_26AE_37BF + l_GKOS_HLPT_IMQU_JNRV;
+ v4u64 h_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV = ((l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV < l_048C_159D_26AE_37BF) >> 63)
+ + h_048C_159D_26AE_37BF + h_GKOS_HLPT_IMQU_JNRV;
+
+ // 3. Split to 128-bit halves
+ v2u64 l_048CGKOS_159DHLPT = { l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[0], l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[1] };
+ v2u64 h_048CGKOS_159DHLPT = { h_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[0], h_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[1] };
+ v2u64 l_26AEIMQU_37BFJNRV = { l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[2], l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[3] };
+ v2u64 h_26AEIMQU_37BFJNRV = { h_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[2], h_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[3] };
+
+ // 4. Reduce to 2x128 bits
+ v2u64 l_02468ACEGIKMOQSU_13579BDFHJLNPRTV = l_048CGKOS_159DHLPT + l_26AEIMQU_37BFJNRV;
+ v2u64 h_02468ACEGIKMOQSU_13579BDFHJLNPRTV = ((l_02468ACEGIKMOQSU_13579BDFHJLNPRTV < l_048CGKOS_159DHLPT) >> 63)
+ + h_048CGKOS_159DHLPT + h_26AEIMQU_37BFJNRV;
+
+ // 5. Split to 64-bits halves
+ u64 l_02468ACEGIKMOQSU = l_02468ACEGIKMOQSU_13579BDFHJLNPRTV[0];
+ u64 h_02468ACEGIKMOQSU = h_02468ACEGIKMOQSU_13579BDFHJLNPRTV[0];
+ u64 l_13579BDFHJLNPRTV = l_02468ACEGIKMOQSU_13579BDFHJLNPRTV[1];
+ u64 h_13579BDFHJLNPRTV = h_02468ACEGIKMOQSU_13579BDFHJLNPRTV[1];
+
+ // 6. Reduce to 2x64 bits
+ u64 l_0123456789ABCDEFGHIJKLMNOPQRSTUV = l_02468ACEGIKMOQSU + l_13579BDFHJLNPRTV;
+ u64 h_0123456789ABCDEFGHIJKLMNOPQRSTUV = (l_0123456789ABCDEFGHIJKLMNOPQRSTUV < l_02468ACEGIKMOQSU)
+ + h_02468ACEGIKMOQSU + h_13579BDFHJLNPRTV;
+
+ return ((u128)h_0123456789ABCDEFGHIJKLMNOPQRSTUV << 64) | (u128)l_0123456789ABCDEFGHIJKLMNOPQRSTUV;
+}
diff --git a/func.h b/func.h
@@ -0,0 +1,6 @@
+#include <rcx/def.h>
+
+u128 reduce_scalar1024(u64 *x);
+u128 reduce_scalar2048(u64 *x);
+u128 reduce_avx1024(u64 *x);
+u128 reduce_avx2048(u64 *x);
diff --git a/main.c b/main.c
@@ -0,0 +1,29 @@
+#include <rcx/all.h>
+#include <rcx/bench.h>
+
+#include "func.h"
+
+u64 in[16];
+volatile u128 out;
+
+#define BENCHMARK(f) \
+ void \
+ benchmark_##f(u64 N) { \
+ r_bench_start(); \
+ for (u64 n = 0; n < N; n++) \
+ out = f(in); \
+ r_bench_stop(); \
+ }
+
+BENCHMARK(reduce_scalar1024)
+BENCHMARK(reduce_scalar2048)
+BENCHMARK(reduce_avx1024)
+BENCHMARK(reduce_avx2048)
+
+int
+main(void) {
+ r_bench(benchmark_reduce_scalar1024, 3000);
+ r_bench(benchmark_reduce_scalar2048, 3000);
+ r_bench(benchmark_reduce_avx1024, 3000);
+ r_bench(benchmark_reduce_avx2048, 3000);
+}