commit d6047242df53f4cc704d0fde4b4df5df365efa6e
Author: Fabian Wermelinger <info@0xfab.ch>
Date: Fri, 27 Dec 2024 14:11:21 +0100
Add Fortran/C benchmark code and kernels
Diffstat:
10 files changed, 252 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+*.o
+c_*
+fort_*
diff --git a/Makefile b/Makefile
@@ -0,0 +1,35 @@
+# Builds the benchmark driver (main.c) once per kernel variant.
+#
+# The driver itself is always compiled with -O2. The optimisation level under
+# test (O0..O3) comes from the pre-built kernel object that is linked in, and
+# -DKERNEL=<symbol> selects which routine of that object the driver calls.
+#
+# NOTE(review): the kernel objects under fort/sgemv and c/sgemv are neither
+# prerequisites of `all` nor built by any rule in this file; presumably they
+# have to be built beforehand with the Makefiles in those directories --
+# confirm, or add them as prerequisites.
+CC = gcc
+CFLAGS = -g -Wall -Wextra -Wpedantic
+LIBS = -lpapi -lm
+
+# NOTE(review): `all` does not produce a file named `all` but is not listed
+# as phony here -- consider adding it.
+.PHONY: clean
+
+# Default target: one executable per (language, kernel, optimisation level).
+# Fortran symbols carry a trailing underscore (sgemv_), matching the -DKERNEL
+# value used for the fort_* binaries.
+all: main.c
+ # fortran
+ $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_ -o fort_O0 $< $(LIBS) fort/sgemv/sgemv_O0.o
+ $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_ -o fort_O1 $< $(LIBS) fort/sgemv/sgemv_O1.o
+ $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_ -o fort_O2 $< $(LIBS) fort/sgemv/sgemv_O2.o
+ $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_ -o fort_O3 $< $(LIBS) fort/sgemv/sgemv_O3.o
+
+ # c
+ $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_noalias -o c_noalias_O0 $< $(LIBS) c/sgemv/sgemv_O0.o
+ $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_alias -o c_alias_O0 $< $(LIBS) c/sgemv/sgemv_O0.o
+ $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_temporary -o c_temporary_O0 $< $(LIBS) c/sgemv/sgemv_O0.o
+
+ $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_noalias -o c_noalias_O1 $< $(LIBS) c/sgemv/sgemv_O1.o
+ $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_alias -o c_alias_O1 $< $(LIBS) c/sgemv/sgemv_O1.o
+ $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_temporary -o c_temporary_O1 $< $(LIBS) c/sgemv/sgemv_O1.o
+
+ $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_noalias -o c_noalias_O2 $< $(LIBS) c/sgemv/sgemv_O2.o
+ $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_alias -o c_alias_O2 $< $(LIBS) c/sgemv/sgemv_O2.o
+ $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_temporary -o c_temporary_O2 $< $(LIBS) c/sgemv/sgemv_O2.o
+
+ $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_noalias -o c_noalias_O3 $< $(LIBS) c/sgemv/sgemv_O3.o
+ $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_alias -o c_alias_O3 $< $(LIBS) c/sgemv/sgemv_O3.o
+ $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_temporary -o c_temporary_O3 $< $(LIBS) c/sgemv/sgemv_O3.o
+
+# Single ad-hoc build. `kernel` and `kernel_lib` are not defined in this file
+# and must be supplied by the caller, e.g.
+#   make main kernel=sgemv_alias kernel_lib=c/sgemv/sgemv_O2.o
+main: main.c
+ $(CC) -O2 $(CFLAGS) -DKERNEL=$(kernel) -o $@ $< $(LIBS) $(kernel_lib)
+
+# Removes the executables only; kernel objects in the sub-directories are
+# left in place.
+clean:
+ rm -f main fort_* c_*
diff --git a/c/sgemv/Makefile b/c/sgemv/Makefile
@@ -0,0 +1,20 @@
+# Compiles sgemv.c into one object file per optimisation level (-O0 .. -O3).
+# -c stops after compilation (no linking); -g keeps debug information.
+CC = gcc
+CFLAGS = -g -c
+.PHONY: clean
+
+# Default target: all four optimisation levels.
+all: sgemv_O0.o sgemv_O1.o sgemv_O2.o sgemv_O3.o
+
+# Each rule differs only in the -O level placed in front of $(CFLAGS).
+sgemv_O0.o: sgemv.c
+ $(CC) -O0 $(CFLAGS) -o $@ $<
+
+sgemv_O1.o : sgemv.c
+ $(CC) -O1 $(CFLAGS) -o $@ $<
+
+sgemv_O2.o : sgemv.c
+ $(CC) -O2 $(CFLAGS) -o $@ $<
+
+sgemv_O3.o : sgemv.c
+ $(CC) -O3 $(CFLAGS) -o $@ $<
+
+# The glob also matches anything else in this directory starting with sgemv_O.
+clean:
+ rm -f sgemv_O*
diff --git a/c/sgemv/sgemv.c b/c/sgemv/sgemv.c
@@ -0,0 +1,36 @@
+#include <stdlib.h>
+
+/*
+ * Accumulates a square matrix-vector product into y:
+ *
+ *     y[j] += sum over i of A[j * N + i] * x[i],   0 <= j < N
+ *
+ * A is indexed as a flat N*N array with row j stored contiguously, x and y
+ * are indexed 0..N-1, and N is read from *n. y is updated in place, so the
+ * caller decides its initial contents.
+ *
+ * This variant qualifies y with __restrict__ (the GCC spelling of C99
+ * restrict), which tells the compiler that the elements written through y
+ * are not also reached through A or x.
+ *
+ * The size is passed by pointer; presumably so the same prototype can also
+ * be bound to a Fortran routine, which receives its arguments by reference
+ * -- confirm against the driver.
+ *
+ * NOTE(review): j * N + i is evaluated in int, so with a 32-bit int it
+ * overflows once N exceeds 46340.
+ */
+void sgemv_noalias(const float *A,
+ const float *x,
+ float *__restrict__ y,
+ const int *n)
+{
+ const int N = *n;
+ for (int j = 0; j < N; ++j) {
+ for (int i = 0; i < N; ++i) {
+ // y[j] is read and written on every inner iteration
+ y[j] += A[j * N + i] * x[i];
+ }
+ }
+}
+
+/*
+ * Accumulates a square matrix-vector product into y:
+ *
+ *     y[j] += sum over i of A[j * N + i] * x[i],   0 <= j < N
+ *
+ * Same loop nest as sgemv_noalias, but none of the pointers is
+ * restrict-qualified, so the compiler has to allow for y overlapping A or x.
+ * N is read from *n and y is updated in place.
+ *
+ * NOTE(review): j * N + i is evaluated in int, so with a 32-bit int it
+ * overflows once N exceeds 46340.
+ */
+void sgemv_alias(const float *A, const float *x, float *y, const int *n)
+{
+ const int N = *n;
+ for (int j = 0; j < N; ++j) {
+ for (int i = 0; i < N; ++i) {
+ // y[j] is read and written on every inner iteration
+ y[j] += A[j * N + i] * x[i];
+ }
+ }
+}
+
+/*
+ * Accumulates a square matrix-vector product into y:
+ *
+ *     y[j] += sum over i of A[j * N + i] * x[i],   0 <= j < N
+ *
+ * Unlike sgemv_alias, each row sum is built up in a local float and y[j] is
+ * touched only once per row. No pointer is restrict-qualified. N is read
+ * from *n and y is updated in place.
+ *
+ * Because the row sum starts from 0.0f and is added to y[j] at the end, the
+ * floating-point rounding can differ from the variants that accumulate
+ * directly into y[j].
+ *
+ * NOTE(review): j * N + i is evaluated in int, so with a 32-bit int it
+ * overflows once N exceeds 46340.
+ */
+void sgemv_temporary(const float *A, const float *x, float *y, const int *n)
+{
+ const int N = *n;
+ for (int j = 0; j < N; ++j) {
+ // row accumulator; being a local, it cannot overlap A, x or y
+ float temp = 0.0f;
+ for (int i = 0; i < N; ++i) {
+ temp += A[j * N + i] * x[i];
+ }
+ // single update of y[j] per row
+ y[j] += temp;
+ }
+}
diff --git a/fort/blas/.gitignore b/fort/blas/.gitignore
@@ -0,0 +1,3 @@
+*
+!README
+!blas-3.12.0.tgz
diff --git a/fort/blas/README b/fort/blas/README
@@ -0,0 +1 @@
+https://netlib.org/blas/#_reference_blas_version_3_12_0
diff --git a/fort/blas/blas-3.12.0.tgz b/fort/blas/blas-3.12.0.tgz
Binary files differ.
diff --git a/fort/sgemv/Makefile b/fort/sgemv/Makefile
@@ -0,0 +1,20 @@
+# Compiles sgemv.f into one object file per optimisation level (-O0 .. -O3).
+# -cpp runs the C preprocessor over the Fortran source, which sgemv.f needs
+# for its #ifdef TRANSPOSE block; -DTRANSPOSE defines that symbol.
+# -c stops after compilation (no linking); -g keeps debug information.
+FC = gfortran
+FFLAGS = -cpp -g -c -DTRANSPOSE
+.PHONY: clean
+
+# Default target: all four optimisation levels.
+all: sgemv_O0.o sgemv_O1.o sgemv_O2.o sgemv_O3.o
+
+# Each rule differs only in the -O level placed in front of $(FFLAGS).
+sgemv_O0.o: sgemv.f
+ $(FC) -O0 $(FFLAGS) -o $@ $<
+
+sgemv_O1.o : sgemv.f
+ $(FC) -O1 $(FFLAGS) -o $@ $<
+
+sgemv_O2.o : sgemv.f
+ $(FC) -O2 $(FFLAGS) -o $@ $<
+
+sgemv_O3.o : sgemv.f
+ $(FC) -O3 $(FFLAGS) -o $@ $<
+
+# The glob also matches anything else in this directory starting with sgemv_O.
+clean:
+ rm -f sgemv_O*
diff --git a/fort/sgemv/sgemv.f b/fort/sgemv/sgemv.f
@@ -0,0 +1,24 @@
+! Single-precision matrix-vector update on a square N-by-N matrix.
+!
+!   A : REAL(4), declared A(N,*), i.e. leading dimension N
+!   X : REAL(4) input vector, elements 1..N are read
+!   Y : REAL(4) vector, elements 1..N are updated in place
+!   N : INTEGER(4) problem size
+!
+! Exactly one of the two loop nests below is compiled, selected by the
+! preprocessor symbol TRANSPOSE.
+ SUBROUTINE SGEMV(A,X,Y,N)
+ INTEGER(4) N,I,J
+ REAL(4) A(N,*),X(*),Y(*)
+ REAL(4) TEMP
+#ifdef TRANSPOSE
+! Form y := A^T*x + y.
+! The inner loop runs over the first index of A, which is the contiguous
+! one in Fortran storage order; each column sum is collected in TEMP and
+! added to Y(J) once.
+ DO 100 J = 1,N
+ TEMP = 0.0
+ DO 90 I = 1,N
+ TEMP = TEMP + A(I,J)*X(I)
+ 90 CONTINUE
+ Y(J) = Y(J) + TEMP
+ 100 CONTINUE
+#else
+! Form y := A*x + y.
+! Column-oriented update: X(J) is held in TEMP and column J of A, scaled
+! by it, is added to the whole of Y.
+ DO 60 J = 1,N
+ TEMP = X(J)
+ DO 50 I = 1,N
+ Y(I) = Y(I) + TEMP*A(I,J)
+ 50 CONTINUE
+ 60 CONTINUE
+#endif
+ RETURN
+ END
diff --git a/main.c b/main.c
@@ -0,0 +1,110 @@
+#include <errno.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "papi.h"
+
+// Cache sizes of the machine the benchmark was written for. They are
+// hard-coded, not detected at run time, and are only echoed in the report.
+// Model name: Intel(R) Core(TM) i7-8700K CPU @ 3.70GHz
+// L1d cache: 32K
+// L1i cache: 32K
+// L2 cache: 256K
+// L3 cache: 12288K
+#define L1_SIZE_KB 32
+#define L2_SIZE_KB 256
+#define L3_SIZE_KB 12288
+
+// Symbol name of the routine to benchmark. The build normally supplies it
+// with -DKERNEL=<symbol>; sgemv_alias is the fallback when it does not.
+#ifndef KERNEL
+#define KERNEL sgemv_alias
+#endif /* KERNEL */
+
+// The kernel lives in a separately compiled object file. It receives the
+// matrix A, the input vector x, the in/out vector y and the problem size by
+// pointer.
+extern void KERNEL(const float *A, const float *x, float *y, const int *n);
+
+/*
+ * Terminates the benchmark with a diagnostic when a PAPI call did not return
+ * the expected status code, so that counters are never read after a failed
+ * set-up.
+ */
+static void require_papi(const char *call, int rc, int expected)
+{
+    if (rc != expected) {
+        fprintf(stderr, "%s failed (PAPI return code %d)\n", call, rc);
+        exit(EXIT_FAILURE);
+    }
+}
+
+/*
+ * Benchmark driver.
+ *
+ * Usage: <program> [n]
+ *
+ * Allocates an n-by-n matrix and two vectors of length n (n defaults to
+ * 10000), runs KERNEL once as warm-up and once under PAPI hardware counters
+ * and a wall-clock timer, then prints the counters and derived metrics.
+ *
+ * Returns 0 on success. Exits with EXIT_FAILURE on an invalid size argument,
+ * on allocation failure, or when a PAPI call fails.
+ */
+int main(int argc, char *argv[])
+{
+    int n = 10000;
+    if (argc > 1) {
+        // strtol instead of atoi: malformed, non-positive or out-of-range
+        // sizes are rejected instead of silently turning into 0 or garbage
+        char *end = NULL;
+        errno = 0;
+        const long parsed = strtol(argv[1], &end, 10);
+        if (end == argv[1] || *end != '\0' || errno == ERANGE || parsed < 1 ||
+            parsed > INT_MAX) {
+            fprintf(stderr, "usage: %s [n]  (1 <= n <= %d)\n", argv[0],
+                    INT_MAX);
+            return EXIT_FAILURE;
+        }
+        n = (int)parsed;
+    }
+
+    // all element counts are computed in size_t; n * n in int overflows for
+    // n > 46340
+    const size_t dim = (size_t)n;
+    if (dim + 2 > (size_t)-1 / sizeof(float) / dim) {
+        fprintf(stderr, "problem size n = %d does not fit in size_t\n", n);
+        return EXIT_FAILURE;
+    }
+    const size_t n_mat = dim * dim;
+
+    float *A = malloc(n_mat * sizeof *A);
+    float *x = malloc(dim * sizeof *x);
+    float *y = malloc(dim * sizeof *y);
+    if (A == NULL || x == NULL || y == NULL) {
+        fprintf(stderr, "out of memory for n = %d\n", n);
+        free(A);
+        free(x);
+        free(y);
+        return EXIT_FAILURE;
+    }
+
+    for (size_t i = 0; i < n_mat; i++) {
+        A[i] = 0.1f;
+    }
+    for (size_t i = 0; i < dim; i++) {
+        x[i] = 1.0f;
+        y[i] = 0.001f;
+    }
+
+    // Initialize PAPI
+    int event_set = PAPI_NULL;
+    int events[4] = {PAPI_TOT_CYC, PAPI_TOT_INS, PAPI_LST_INS, PAPI_L1_DCM};
+    long long int counters[4] = {0};
+    require_papi("PAPI_library_init", PAPI_library_init(PAPI_VER_CURRENT),
+                 PAPI_VER_CURRENT);
+    require_papi("PAPI_create_eventset", PAPI_create_eventset(&event_set),
+                 PAPI_OK);
+    require_papi("PAPI_add_events", PAPI_add_events(event_set, events, 4),
+                 PAPI_OK);
+
+    // warm up (note: this already accumulates into y once)
+    KERNEL(A, x, y, &n);
+
+    // start PAPI measurement
+    require_papi("PAPI_start", PAPI_start(event_set), PAPI_OK);
+
+    // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and
+    // PAPI_TOT_INS slightly, neglected here)
+    const long long int t0 = PAPI_get_real_nsec();
+
+    // run code to be measured
+    KERNEL(A, x, y, &n);
+
+    // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and
+    // PAPI_TOT_INS slightly, neglected here)
+    const long long int t1 = PAPI_get_real_nsec();
+
+    // stop PAPI and get counter values
+    require_papi("PAPI_stop", PAPI_stop(event_set, counters), PAPI_OK);
+
+    // clang-format off
+    const long long total_cycles       = counters[0]; // cpu cycles
+    const long long total_instructions = counters[1]; // any instruction
+    const long long total_load_stores  = counters[2]; // load/store instructions
+    const long long total_l1d_misses   = counters[3]; // L1d misses
+    // clang-format on
+
+    // 2LL forces 64-bit arithmetic; 2 * n * n in int overflows for n >= 32768
+    const long long flops = 2LL * n * n + n;
+    const long long mem_ops = 2LL * n * n + 2LL * n;
+    const double twall = ((double)t1 - t0) * 1.0e-9; // seconds
+    const double IPC = (double)total_instructions / total_cycles;
+    const double OI = (double)flops / (total_load_stores * sizeof(float));
+    const double OI_theory = (double)flops / (mem_ops * sizeof(float));
+    const double float_perf = flops / twall * 1.0e-9; // GFlop/s
+
+    // checksum over y so the kernel result is observable
+    double sum = 0.0;
+    for (size_t i = 0; i < dim; i++) {
+        sum += y[i];
+    }
+
+    free(A);
+    free(x);
+    free(y);
+
+    // clang-format off
+    printf("Result: %.1f\n", sum);
+    printf("Total cycles: %lld\n", total_cycles);
+    printf("Total instructions: %lld\n", total_instructions);
+    printf("Instructions per cycle (IPC): %.2f\n", IPC);
+    printf("L1 cache size: %d KB\n", L1_SIZE_KB);
+    printf("L2 cache size: %d KB\n", L2_SIZE_KB);
+    printf("L3 cache size: %d KB\n", L3_SIZE_KB);
+    // %zu: the expression has type size_t
+    printf("Total problem size: %zu KB\n",
+           (n_mat + 2 * dim) * sizeof(float) / 1024);
+    printf("Total L1 data misses: %lld\n", total_l1d_misses);
+    printf("Total load/store: %lld (expected: %lld)\n",
+           total_load_stores, mem_ops);
+    printf("Operational intensity: %e (expected: %e)\n", OI, OI_theory);
+    printf("Performance [GFlop/s]: %e\n", float_perf);
+    printf("Wall-time [micro-seconds]: %e\n", twall * 1.0e6);
+    // clang-format on
+
+    return 0;
+}