cs205-lecture-examples

Example codes used during Harvard CS205 lectures
git clone https://git.0xfab.ch/cs205-lecture-examples.git
Log | Files | Refs | README | LICENSE

main.c (1783B)


      1 #include <omp.h>
      2 #include <stdio.h>
      3 #include <x86intrin.h>
      4 
      5 #define N (1 << 11) // both vectors fit in cache
      6 // #define N (1 << 26) // vectors too large to fit in cache -> bandwidth bound
      7 
      8 // compute kernels: only declared here, their definitions are not in this file; each takes two float vectors x and y, a scalar a and the element count n (presumably y <- a * x + y -- confirm against the kernel sources)
      9 void saxpy(float *x, float *y, float a, size_t n);
     10 void saxpy_SSE(float *x, float *y, float a, size_t n);
     11 void saxpy_SSE_FMA(float *x, float *y, float a, size_t n);
     12 
     13 void initialize(float *x, size_t n)
     14 {
     15     for (size_t i = 0; i < n; ++i) {
     16         x[i] = 1.0f;
     17     }
     18 }
     19 
     20 int main(void)
     21 {
     22     const int n_sample = 50;
     23 
     24     // allocate aligned vectors:
     25     float *x = (float *)_mm_malloc(N * sizeof(float), 16);
     26     float *y = (float *)_mm_malloc(N * sizeof(float), 16);
     27 
     28     // initialize data:
     29     initialize(x, N);
     30     initialize(y, N);
     31 
     32     // scalar:
     33     saxpy(x, y, 2.0, N); // warm-up cache
     34     double t0 = omp_get_wtime();
     35     for (int i = 0; i < n_sample; ++i) {
     36         saxpy(x, y, 2.0, N);
     37     }
     38     double t1 = omp_get_wtime();
     39     const double t_gold = (t1 - t0) / n_sample;
     40     printf("Scalar saxpy: %e sec\n", t_gold);
     41 
     42     // SIMD (SSE, 128-bit):
     43     saxpy_SSE(x, y, 2.0, N); // warm-up cache
     44     t0 = omp_get_wtime();
     45     for (int i = 0; i < n_sample; ++i) {
     46         saxpy_SSE(x, y, 2.0, N);
     47     }
     48     t1 = omp_get_wtime();
     49     const double t_sse = (t1 - t0) / n_sample;
     50     printf("SSE saxpy: %e sec (%.2fx speedup)\n", t_sse, t_gold / t_sse);
     51 
     52     // SIMD w/ FMA (SSE, 128-bit):
     53     saxpy_SSE_FMA(x, y, 2.0, N); // warm-up cache
     54     t0 = omp_get_wtime();
     55     for (int i = 0; i < n_sample; ++i) {
     56         saxpy_SSE_FMA(x, y, 2.0, N);
     57     }
     58     t1 = omp_get_wtime();
     59     const double t_fma = (t1 - t0) / n_sample;
     60     printf("SSE FMA saxpy: %e sec (%.2fx speedup)\n", t_fma, t_gold / t_sse);
     61 
     62     // clean up:
     63     _mm_free(x);
     64     _mm_free(y);
     65     return 0;
     66 }