simd-scan

SIMD scan implementation and benchmark
git clone git://git.rr3.xyz/simd-scan
Log | Files | Refs

commit b3b7fcc9d17a0a71eadd20f9d24790515da6d2b7
Author: Robert Russell <robert@rr3.xyz>
Date:   Mon, 22 Dec 2025 11:55:28 -0800

Initial commit

Diffstat:
A.gitignore | 3+++
AMakefile | 8++++++++
Afunc.c | 70++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Afunc.h | 7+++++++
Amain.c | 47+++++++++++++++++++++++++++++++++++++++++++++++
Awhoa.nasm | 65+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 200 insertions(+), 0 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -0,0 +1,3 @@ +func.o +whoa.o +main diff --git a/Makefile b/Makefile @@ -0,0 +1,8 @@ +main: main.c func.c func.h whoa.nasm + nasm -f elf64 whoa.nasm + gcc -c -Wall -O3 -march=native func.c -lrcx + gcc -o main -Wall -O3 -march=native main.c func.o whoa.o -lrcx + +.PHONY: clean +clean: + rm -f func.o main diff --git a/func.c b/func.c @@ -0,0 +1,70 @@ +#include <rcx/all.h> + +// TODO: Try also with 128 bit partial sums. + +u64 +scan_scalar1024(u64 *y, u64 *x) { + u64 s = 0; + for (usize i = 0; i < 16; i++) { + s += x[i]; + y[i] = s; + } + return s; +} + +u64 +scan_scalar2048(u64 *y, u64 *x) { + u64 s = 0; + for (usize i = 0; i < 32; i++) { + s += x[i]; + y[i] = s; + } + return s; +} + +u64 +scan_scalar2048_unrolled(u64 *y, u64 *x) { + u64 s = 0; + for (usize i = 0; i < 8; i++) { + s += x[i+0]; y[i+0] = s; + s += x[i+1]; y[i+1] = s; + s += x[i+2]; y[i+2] = s; + s += x[i+3]; y[i+3] = s; + } + return s; +} + +u64 +scan_avx1024_serial(u64 *y, u64 *x) { + v4u64 s = { 0, 0, 0, 0 }; + for (usize i = 0; i < 4; i++) { + u64 x0 = x[i+0]; + u64 x1 = x0 + x[i+1]; + u64 x2 = x1 + x[i+2]; + u64 x3 = x2 + x[i+3]; + v4u64 v = { x0, x1, x2, x3 }; + v += s; + memcpy(&y[i], &v, 32); + s = __builtin_shufflevector(v, v, 3, 3, 3, 3); + } + return s[0]; +} + +u64 +scan_avx1024(u64 *y, u64 *x) { + v4u64 z = { 0, 0, 0, 0 }; + + v4u64 s = z; + for (usize i = 0; i < 4; i++) { + v4u64 x_0_1_2_3; memcpy(&x_0_1_2_3, x+(4u*i), 32); + v4u64 x_z_0_1_2 = __builtin_shufflevector(z, x_0_1_2_3, 0, 4, 5, 6); + v4u64 x_0_01_12_23 = x_0_1_2_3 + x_z_0_1_2; + v4u64 x_z_z_0_01 = __builtin_shufflevector(z, x_0_01_12_23, 0, 0, 4, 5); + v4u64 x_0_01_012_0123 = x_0_01_12_23 + x_z_z_0_01; + v4u64 x_s0_s01_s012_s0123 = s + x_0_01_012_0123; + memcpy(y+(4u*i), &x_s0_s01_s012_s0123, 32); + s = __builtin_shufflevector(x_s0_s01_s012_s0123, z, 3, 3, 3, 3); + } + + return s[3]; +} diff --git a/func.h b/func.h @@ -0,0 +1,7 @@ +#include <rcx/def.h> + +u64 scan_scalar1024(u64 *y, u64 
*x); +u64 scan_scalar2048(u64 *y, u64 *x); +u64 scan_scalar2048_unrolled(u64 *y, u64 *x); +u64 scan_avx1024_serial(u64 *y, u64 *x); +u64 scan_avx1024(u64 *y, u64 *x); diff --git a/main.c b/main.c @@ -0,0 +1,47 @@ +#include <rcx/all.h> +#include <rcx/bench.h> +#include <stdio.h> + +#include "func.h" + +extern u64 scan_avx1024_whoa(u64 *y, u64 *x); + +u64 in[32], out[32]; +volatile u64 out_total; + +#define BENCHMARK(f) \ + void \ + benchmark_##f(u64 N) { \ + r_bench_start(); \ + for (u64 n = 0; n < N; n++) \ + out_total = f(out, in); \ + r_bench_stop(); \ + } + +BENCHMARK(scan_scalar1024) +BENCHMARK(scan_scalar2048) +BENCHMARK(scan_scalar2048_unrolled) +BENCHMARK(scan_avx1024_serial) +BENCHMARK(scan_avx1024) +BENCHMARK(scan_avx1024_whoa) + +int +main(void) { + printf("%lu\n", scan_avx1024_whoa(out, (u64[16]){ + 0x0, 0x1, 0x2, 0x3, + 0x4, 0x5, 0x6, 0x7, + 0x8, 0x9, 0xa, 0xb, + 0xc, 0xd, 0xe, 0xf, + })); + printf("%lu %lu %lu %lu\n", out[0], out[1], out[2], out[3]); + printf("%lu %lu %lu %lu\n", out[4], out[5], out[6], out[7]); + printf("%lu %lu %lu %lu\n", out[8], out[9], out[10], out[11]); + printf("%lu %lu %lu %lu\n", out[12], out[13], out[14], out[15]); + // return 0; + r_bench(benchmark_scan_scalar1024, 3000); + r_bench(benchmark_scan_scalar2048, 3000); + r_bench(benchmark_scan_scalar2048_unrolled, 3000); + r_bench(benchmark_scan_avx1024_serial, 3000); + r_bench(benchmark_scan_avx1024, 3000); + r_bench(benchmark_scan_avx1024_whoa, 3000); +} diff --git a/whoa.nasm b/whoa.nasm @@ -0,0 +1,65 @@ +default rel + +section .text + +; RAX RDI RSI +; u64 scan_avx1024_whoa(u64 *y, u64 *x); +global scan_avx1024_whoa +scan_avx1024_whoa: + ; 1. Load + vmovdqu ymm0, [rsi+00] + vmovdqu ymm1, [rsi+32] + vmovdqu ymm2, [rsi+64] + vmovdqu ymm3, [rsi+96] + + ; 2. 
Transpose
+	vpunpcklqdq ymm4, ymm0, ymm1
+	vpunpckhqdq ymm5, ymm0, ymm1
+	vpunpcklqdq ymm6, ymm2, ymm3
+	vpunpckhqdq ymm7, ymm2, ymm3
+	vperm2i128 ymm0, ymm4, ymm6, 0x20
+	vperm2i128 ymm1, ymm5, ymm7, 0x20
+	vperm2i128 ymm2, ymm4, ymm6, 0x31
+	vperm2i128 ymm3, ymm5, ymm7, 0x31
+
+	; 3. Scan blocks
+	vpaddq ymm1, ymm1, ymm0
+	vpaddq ymm2, ymm2, ymm1
+	vpaddq ymm3, ymm3, ymm2
+	; ymm3 now contains block sums
+
+	; 4. Scan sums (intra-register exclusive scan)
+	vpslldq ymm4, ymm3, 8
+	vpaddq ymm4, ymm3, ymm4
+	vpermq ymm5, ymm4, 0x50
+	vpxor ymm6, ymm6, ymm6
+	vblendpd ymm5, ymm5, ymm6, 3
+	vpaddq ymm4, ymm4, ymm5
+	vpsubq ymm4, ymm4, ymm3
+
+	; 5. Add in scanned block sums
+	vpaddq ymm0, ymm0, ymm4
+	vpaddq ymm1, ymm1, ymm4
+	vpaddq ymm2, ymm2, ymm4
+	vpaddq ymm3, ymm3, ymm4
+
+	; 6. Transpose
+	vpunpcklqdq ymm4, ymm0, ymm1
+	vpunpckhqdq ymm5, ymm0, ymm1
+	vpunpcklqdq ymm6, ymm2, ymm3
+	vpunpckhqdq ymm7, ymm2, ymm3
+	vperm2i128 ymm0, ymm4, ymm6, 0x20
+	vperm2i128 ymm1, ymm5, ymm7, 0x20
+	vperm2i128 ymm2, ymm4, ymm6, 0x31
+	vperm2i128 ymm3, ymm5, ymm7, 0x31
+
+	; 7. Store
+	vmovdqu [rdi+00], ymm0
+	vmovdqu [rdi+32], ymm1
+	vmovdqu [rdi+64], ymm2
+	vmovdqu [rdi+96], ymm3
+
+	; 8. Return total: after step 6, ymm3 = { y[12], y[13], y[14], y[15] }
+	vextracti128 xmm4, ymm3, 1
+	vpextrq rax, xmm4, 1
+	ret