main.c (1783B)
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>

#define N (1 << 11) // both vectors fit in cache
// #define N (1 << 26) // vectors too large to fit in cache -> bandwidth bound

// compute kernels (declared here, implemented outside this file)
// NOTE(review): the names suggest y <- a * x + y over n elements; confirm
// against the kernel sources.
void saxpy(float *x, float *y, float a, size_t n);
void saxpy_SSE(float *x, float *y, float a, size_t n);
void saxpy_SSE_FMA(float *x, float *y, float a, size_t n);

// Common signature shared by all benchmarked kernels.
typedef void (*saxpy_fn)(float *x, float *y, float a, size_t n);

/**
 * Set every element of a vector to 1.0f.
 *
 * @param x  vector to fill; must point to at least n floats
 * @param n  number of elements to write
 */
void initialize(float *x, size_t n)
{
    for (size_t i = 0; i < n; ++i) {
        x[i] = 1.0f;
    }
}

/**
 * Measure the average wall-clock time of one kernel invocation.
 *
 * The kernel is called once untimed (cache warm-up) and then n_sample times
 * between two omp_get_wtime() readings.
 *
 * @param kernel    kernel under test
 * @param x, y, a   arguments forwarded unchanged to the kernel
 * @param n         number of elements forwarded to the kernel
 * @param n_sample  number of timed invocations; must be > 0
 * @return          elapsed seconds divided by n_sample
 */
static double time_kernel(saxpy_fn kernel, float *x, float *y, float a,
                          size_t n, int n_sample)
{
    kernel(x, y, a, n); // warm-up cache

    const double t0 = omp_get_wtime();
    for (int i = 0; i < n_sample; ++i) {
        kernel(x, y, a, n);
    }
    const double t1 = omp_get_wtime();

    return (t1 - t0) / n_sample;
}

/**
 * Benchmark driver: times the scalar, SSE and SSE+FMA kernels on the same
 * pair of vectors and prints each average time, with the speedup of the two
 * SIMD variants relative to the scalar run.
 *
 * @return 0 on success, EXIT_FAILURE if the vectors cannot be allocated
 */
int main(void)
{
    const int n_sample = 50;
    const float a = 2.0f;

    // allocate aligned vectors (16 bytes = one 128-bit SSE register):
    float *x = _mm_malloc(N * sizeof *x, 16);
    float *y = _mm_malloc(N * sizeof *y, 16);
    if (x == NULL || y == NULL) {
        fprintf(stderr, "failed to allocate %zu bytes per vector\n",
                N * sizeof(float));
        _mm_free(x);
        _mm_free(y);
        return EXIT_FAILURE;
    }

    // initialize data:
    initialize(x, N);
    initialize(y, N);

    // scalar (reference timing for the speedup figures):
    const double t_gold = time_kernel(saxpy, x, y, a, N, n_sample);
    printf("Scalar saxpy: %e sec\n", t_gold);

    // SIMD (SSE, 128-bit):
    const double t_sse = time_kernel(saxpy_SSE, x, y, a, N, n_sample);
    printf("SSE saxpy: %e sec (%.2fx speedup)\n", t_sse, t_gold / t_sse);

    // SIMD w/ FMA (SSE, 128-bit):
    const double t_fma = time_kernel(saxpy_SSE_FMA, x, y, a, N, n_sample);
    // speedup must be computed from t_fma, not from the plain SSE timing
    printf("SSE FMA saxpy: %e sec (%.2fx speedup)\n", t_fma, t_gold / t_fma);

    // clean up:
    _mm_free(x);
    _mm_free(y);
    return 0;
}