cs205-lecture-examples

Example codes used during Harvard CS205 lectures
git clone https://git.0xfab.ch/cs205-lecture-examples.git
Log | Files | Refs | README | LICENSE

dgemm.cpp (4722B)


      1 #include "papi.h"
      2 #include <cstdlib>
      3 #include <iostream>
      4 #include <numeric>
      5 #include <vector>
      6 
      7 #ifdef CBLAS
      8 #include <cblas.h>
      9 #elif defined(EIGEN)
     10 #include <eigen3/Eigen/Dense>
     11 // Wrapper to map pointer to Eigen matrix type (needed in benchmark)
     12 template <typename T>
     13 using EMat = Eigen::Map<
     14     Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
     15 #endif /* CBLAS */
     16 
     17 // Broadwell CPU on cluster, you can get one with
     18 //   salloc -N1 -c32 -t 01:00:00
     19 //
     20 // Model name: Intel(R) Xeon(R) CPU E5-2683 v4 @ 2.10GHz
     21 // L1d cache:  32K
     22 // L1i cache:  32K
     23 // L2 cache:   256K
     24 // L3 cache:   40960K
     25 #define L1_SIZE_KB 32
     26 #define L2_SIZE_KB 256
     27 #define L3_SIZE_KB 40960
     28 
     29 typedef double Real;
     30 
     31 void dgemm(const Real *A, const Real *B, Real *__restrict__ C, const size_t n)
     32 {
     33 #ifdef CBLAS
     34     cblas_dgemm(CblasRowMajor,
     35                 CblasNoTrans,
     36                 CblasNoTrans,
     37                 n,
     38                 n,
     39                 n,
     40                 1.0,
     41                 A,
     42                 n,
     43                 B,
     44                 n,
     45                 1.0,
     46                 C,
     47                 n);
     48 #elif defined(EIGEN)
     49     // Matrix wrappers
     50     const EMat<Real> Ap(const_cast<Real *>(A), n, n);
     51     const EMat<Real> Bp(const_cast<Real *>(B), n, n);
     52     EMat<Real> Cp(C, n, n);
     53     Cp.noalias() += Ap * Bp;
     54 #else
     55     for (size_t i = 0; i < n; ++i) {
     56         for (size_t k = 0; k < n; ++k) {
     57             for (size_t j = 0; j < n; ++j) {
     58                 C[i * n + j] += A[i * n + k] * B[k * n + j];
     59             }
     60         }
     61     }
     62 #endif /* CBLAS */
     63 }
     64 
     65 int main(int argc, char *argv[])
     66 {
     67     int n = 1000;
     68     if (argc > 1) {
     69         n = atoi(argv[1]);
     70     }
     71     std::vector<Real> A(n * n, 0.1);
     72     std::vector<Real> B(n * n, 0.2);
     73     std::vector<Real> C(n * n, 0.3);
     74 
     75     // Initialize PAPI
     76     int event_set = PAPI_NULL;
     77     int events[4] = {PAPI_TOT_CYC, PAPI_TOT_INS, PAPI_LST_INS, PAPI_L1_DCM};
     78     long long int counters[4];
     79     PAPI_library_init(PAPI_VER_CURRENT);
     80     PAPI_create_eventset(&event_set);
     81     PAPI_add_events(event_set, events, 4);
     82 
     83     // warm up
     84     dgemm(A.data(), B.data(), C.data(), n);
     85 
     86     // start PAPI measurement
     87     PAPI_start(event_set);
     88 
     89     // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and
     90     // PAPI_TOT_INS slightly, neglected here)
     91     const long long int t0 = PAPI_get_real_nsec();
     92 
     93     // run code to be measured
     94     dgemm(A.data(), B.data(), C.data(), n);
     95 
     96     // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and
     97     // PAPI_TOT_INS slightly, neglected here)
     98     const long long int t1 = PAPI_get_real_nsec();
     99 
    100     // stop PAPI and get counter values
    101     PAPI_stop(event_set, counters);
    102 
    103     // clang-format off
    104     const long long total_cycles = counters[0];       // cpu cycles
    105     const long long total_instructions = counters[1]; // any
    106     const long long total_load_stores = counters[2];  // number of such instructions
    107     const long long total_l1d_misses = counters[3];   // number of access request to cache line
    108     // clang-format on
    109 
    110     const size_t flops = 2 * n * n * n + n * n;
    111     const size_t mem_ops = 4 * n * n;
    112     const double twall = (static_cast<double>(t1) - t0) * 1.0e-9; // seconds
    113     const double IPC = static_cast<double>(total_instructions) / total_cycles;
    114     const double OI =
    115         static_cast<double>(flops) / (total_load_stores * sizeof(Real));
    116     const double OI_theory =
    117         static_cast<double>(flops) / (mem_ops * sizeof(Real));
    118     const double float_perf = flops / twall * 1.0e-9; // Gflop/s
    119     const double sum = std::accumulate(C.begin(), C.end(), 0.0);
    120 
    121     std::cout << "Result:                       " << sum << '\n';
    122     std::cout << "Total cycles:                 " << total_cycles << '\n';
    123     std::cout << "Total instructions:           " << total_instructions << '\n';
    124     std::cout << "Instructions per cycle (IPC): " << IPC << '\n';
    125     std::cout << "L1 cache size:                " << L1_SIZE_KB << " KB\n";
    126     std::cout << "L2 cache size:                " << L2_SIZE_KB << " KB\n";
    127     std::cout << "L3 cache size:                " << L3_SIZE_KB << " KB\n";
    128     std::cout << "Total problem size:           "
    129               << 3 * n * n * sizeof(Real) / 1024 << " KB\n";
    130     std::cout << "Total L1 data misses:         " << total_l1d_misses << '\n';
    131     std::cout << "Total load/store:             " << total_load_stores
    132               << " (expected: " << mem_ops << ")\n";
    133     std::cout << "Operational intensity:        " << std::scientific << OI
    134               << " (expected: " << OI_theory << ")\n";
    135     std::cout << "Performance [Gflop/s]:        " << float_perf << '\n';
    136     std::cout << "Wall-time   [micro-seconds]:  " << twall * 1.0e6 << '\n';
    137 
    138     return 0;
    139 }