simd-reduce

SIMD reduce implementation and benchmark
git clone git://git.rr3.xyz/simd-reduce
Log | Files | Refs

commit 40dbe5ef3ef2f69f0ab4135b3b9a7237dedf70a7
Author: Robert Russell <robert@rr3.xyz>
Date:   Sun, 17 Aug 2025 17:36:40 -0700

Initial commit

Diffstat:
A .gitignore | 2 ++
A Makefile | 8 ++++++++
A func.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A func.h | 6 ++++++
A main.c | 29 +++++++++++++++++++++++++++++
5 files changed, 162 insertions(+), 0 deletions(-)

diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+func.o
+main
diff --git a/Makefile b/Makefile
@@ -0,0 +1,8 @@
+main: main.c func.c func.h
+	gcc -c -Wall -flax-vector-conversions -O3 -march=native func.c
+	gcc -o main -Wall -O3 -march=native main.c func.o -lrcx
+
+.PHONY: clean
+clean:
+	rm -f func.o main
+
diff --git a/func.c b/func.c
@@ -0,0 +1,117 @@
+#include <rcx/all.h>
+
+// GCC unrolls this completely on -O3.
+u128
+reduce_scalar1024(u64 *x) {
+	u128 s = 0;
+	for (usize i = 0; i < 16; i++)
+		s += (u128)x[i];
+	return s;
+}
+
+// GCC does not unroll this on -O3.
+u128
+reduce_scalar2048(u64 *x) {
+	u128 s = 0;
+	for (usize i = 0; i < 32; i++)
+		s += (u128)x[i];
+	return s;
+}
+
+u128
+reduce_avx1024(u64 *x) {
+	// 1. Load input
+	v4u64 x_0_1_2_3; memcpy(&x_0_1_2_3, x+0x0, 32);
+	v4u64 x_4_5_6_7; memcpy(&x_4_5_6_7, x+0x4, 32);
+	v4u64 x_8_9_A_B; memcpy(&x_8_9_A_B, x+0x8, 32);
+	v4u64 x_C_D_E_F; memcpy(&x_C_D_E_F, x+0xc, 32);
+
+	// 2. Reduce to 2x256 bits. Vector < yields 0/-1 in a SIGNED vector;
+	// cast to unsigned before >> 63 so the shift is logical and carries are 0/1.
+	v4u64 l_04_15_26_37 = x_0_1_2_3 + x_4_5_6_7;
+	v4u64 h_04_15_26_37 = ((v4u64)(l_04_15_26_37 < x_0_1_2_3)) >> 63;
+	v4u64 l_8C_9D_AE_BF = x_8_9_A_B + x_C_D_E_F;
+	v4u64 h_8C_9D_AE_BF = ((v4u64)(l_8C_9D_AE_BF < x_8_9_A_B)) >> 63;
+	v4u64 l_048C_159D_26AE_37BF = l_04_15_26_37 + l_8C_9D_AE_BF;
+	v4u64 h_048C_159D_26AE_37BF = (((v4u64)(l_048C_159D_26AE_37BF < l_04_15_26_37)) >> 63)
+		+ h_04_15_26_37 + h_8C_9D_AE_BF;
+
+	// 3. Split to 128-bit halves
+	v2u64 l_048C_159D = { l_048C_159D_26AE_37BF[0], l_048C_159D_26AE_37BF[1] };
+	v2u64 h_048C_159D = { h_048C_159D_26AE_37BF[0], h_048C_159D_26AE_37BF[1] };
+	v2u64 l_26AE_37BF = { l_048C_159D_26AE_37BF[2], l_048C_159D_26AE_37BF[3] };
+	v2u64 h_26AE_37BF = { h_048C_159D_26AE_37BF[2], h_048C_159D_26AE_37BF[3] };
+
+	// 4. Reduce to 2x128 bits
+	v2u64 l_02468ACE_13579BDF = l_048C_159D + l_26AE_37BF;
+	v2u64 h_02468ACE_13579BDF = (((v2u64)(l_02468ACE_13579BDF < l_048C_159D)) >> 63)
+		+ h_048C_159D + h_26AE_37BF;
+
+	// 5. Split to 64-bit halves
+	u64 l_02468ACE = l_02468ACE_13579BDF[0];
+	u64 h_02468ACE = h_02468ACE_13579BDF[0];
+	u64 l_13579BDF = l_02468ACE_13579BDF[1];
+	u64 h_13579BDF = h_02468ACE_13579BDF[1];
+
+	// 6. Reduce to 2x64 bits (scalar < already yields 0/1)
+	u64 l_0123456789ABCDEF = l_02468ACE + l_13579BDF;
+	u64 h_0123456789ABCDEF = (l_0123456789ABCDEF < l_02468ACE)
+		+ h_02468ACE + h_13579BDF;
+
+	return ((u128)h_0123456789ABCDEF << 64) | (u128)l_0123456789ABCDEF;
+}
+
+u128
+reduce_avx2048(u64 *x) {
+	// 1. Load input
+	v4u64 x_0_1_2_3; memcpy(&x_0_1_2_3, x+0x00, 32);
+	v4u64 x_4_5_6_7; memcpy(&x_4_5_6_7, x+0x04, 32);
+	v4u64 x_8_9_A_B; memcpy(&x_8_9_A_B, x+0x08, 32);
+	v4u64 x_C_D_E_F; memcpy(&x_C_D_E_F, x+0x0c, 32);
+	v4u64 x_G_H_I_J; memcpy(&x_G_H_I_J, x+0x10, 32);
+	v4u64 x_K_L_M_N; memcpy(&x_K_L_M_N, x+0x14, 32);
+	v4u64 x_O_P_Q_R; memcpy(&x_O_P_Q_R, x+0x18, 32);
+	v4u64 x_S_T_U_V; memcpy(&x_S_T_U_V, x+0x1c, 32);
+
+	// 2. Reduce to 2x256 bits (cast to unsigned so >> 63 is a logical shift)
+	v4u64 l_04_15_26_37 = x_0_1_2_3 + x_4_5_6_7;
+	v4u64 h_04_15_26_37 = ((v4u64)(l_04_15_26_37 < x_0_1_2_3)) >> 63;
+	v4u64 l_8C_9D_AE_BF = x_8_9_A_B + x_C_D_E_F;
+	v4u64 h_8C_9D_AE_BF = ((v4u64)(l_8C_9D_AE_BF < x_8_9_A_B)) >> 63;
+	v4u64 l_048C_159D_26AE_37BF = l_04_15_26_37 + l_8C_9D_AE_BF;
+	v4u64 h_048C_159D_26AE_37BF = (((v4u64)(l_048C_159D_26AE_37BF < l_04_15_26_37)) >> 63)
+		+ h_04_15_26_37 + h_8C_9D_AE_BF;
+	v4u64 l_GK_HL_IM_JN = x_G_H_I_J + x_K_L_M_N;
+	v4u64 h_GK_HL_IM_JN = ((v4u64)(l_GK_HL_IM_JN < x_G_H_I_J)) >> 63;
+	v4u64 l_OS_PT_QU_RV = x_O_P_Q_R + x_S_T_U_V;
+	v4u64 h_OS_PT_QU_RV = ((v4u64)(l_OS_PT_QU_RV < x_O_P_Q_R)) >> 63;
+	v4u64 l_GKOS_HLPT_IMQU_JNRV = l_GK_HL_IM_JN + l_OS_PT_QU_RV;
+	v4u64 h_GKOS_HLPT_IMQU_JNRV = (((v4u64)(l_GKOS_HLPT_IMQU_JNRV < l_GK_HL_IM_JN)) >> 63)
+		+ h_GK_HL_IM_JN + h_OS_PT_QU_RV;
+	v4u64 l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV = l_048C_159D_26AE_37BF + l_GKOS_HLPT_IMQU_JNRV;
+	v4u64 h_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV =
+		(((v4u64)(l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV < l_048C_159D_26AE_37BF)) >> 63)
+		+ h_048C_159D_26AE_37BF + h_GKOS_HLPT_IMQU_JNRV;
+
+	// 3. Split to 128-bit halves
+	v2u64 l_048CGKOS_159DHLPT = { l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[0], l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[1] };
+	v2u64 h_048CGKOS_159DHLPT = { h_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[0], h_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[1] };
+	v2u64 l_26AEIMQU_37BFJNRV = { l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[2], l_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[3] };
+	v2u64 h_26AEIMQU_37BFJNRV = { h_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[2], h_048CGKOS_159DHLPT_26AEIMQU_37BFJNRV[3] };
+
+	// 4. Reduce to 2x128 bits
+	v2u64 l_02468ACEGIKMOQSU_13579BDFHJLNPRTV = l_048CGKOS_159DHLPT + l_26AEIMQU_37BFJNRV;
+	v2u64 h_02468ACEGIKMOQSU_13579BDFHJLNPRTV = (((v2u64)(l_02468ACEGIKMOQSU_13579BDFHJLNPRTV < l_048CGKOS_159DHLPT)) >> 63)
+		+ h_048CGKOS_159DHLPT + h_26AEIMQU_37BFJNRV;
+
+	// 5. Split to 64-bit halves
+	u64 l_02468ACEGIKMOQSU = l_02468ACEGIKMOQSU_13579BDFHJLNPRTV[0];
+	u64 h_02468ACEGIKMOQSU = h_02468ACEGIKMOQSU_13579BDFHJLNPRTV[0];
+	u64 l_13579BDFHJLNPRTV = l_02468ACEGIKMOQSU_13579BDFHJLNPRTV[1];
+	u64 h_13579BDFHJLNPRTV = h_02468ACEGIKMOQSU_13579BDFHJLNPRTV[1];
+
+	// 6. Reduce to 2x64 bits (scalar < already yields 0/1)
+	u64 l_0123456789ABCDEFGHIJKLMNOPQRSTUV = l_02468ACEGIKMOQSU + l_13579BDFHJLNPRTV;
+	u64 h_0123456789ABCDEFGHIJKLMNOPQRSTUV = (l_0123456789ABCDEFGHIJKLMNOPQRSTUV < l_02468ACEGIKMOQSU)
+		+ h_02468ACEGIKMOQSU + h_13579BDFHJLNPRTV;
+
+	return ((u128)h_0123456789ABCDEFGHIJKLMNOPQRSTUV << 64) | (u128)l_0123456789ABCDEFGHIJKLMNOPQRSTUV;
+}
diff --git a/func.h b/func.h
@@ -0,0 +1,6 @@
+#include <rcx/def.h>
+
+u128 reduce_scalar1024(u64 *x);
+u128 reduce_scalar2048(u64 *x);
+u128 reduce_avx1024(u64 *x);
+u128 reduce_avx2048(u64 *x);
diff --git a/main.c b/main.c
@@ -0,0 +1,29 @@
+#include <rcx/all.h>
+#include <rcx/bench.h>
+
+#include "func.h"
+
+u64 in[32];
+volatile u128 out;
+
+#define BENCHMARK(f) \
+	void \
+	benchmark_##f(u64 N) { \
+		r_bench_start(); \
+		for (u64 n = 0; n < N; n++) \
+			out = f(in); \
+		r_bench_stop(); \
+	}
+
+BENCHMARK(reduce_scalar1024)
+BENCHMARK(reduce_scalar2048)
+BENCHMARK(reduce_avx1024)
+BENCHMARK(reduce_avx2048)
+
+int
+main(void) {
+	r_bench(benchmark_reduce_scalar1024, 3000);
+	r_bench(benchmark_reduce_scalar2048, 3000);
+	r_bench(benchmark_reduce_avx1024, 3000);
+	r_bench(benchmark_reduce_avx2048, 3000);
+}