dgemm.cpp (4722B)
1 #include "papi.h" 2 #include <cstdlib> 3 #include <iostream> 4 #include <numeric> 5 #include <vector> 6 7 #ifdef CBLAS 8 #include <cblas.h> 9 #elif defined(EIGEN) 10 #include <eigen3/Eigen/Dense> 11 // Wrapper to map pointer to Eigen matrix type (needed in benchmark) 12 template <typename T> 13 using EMat = Eigen::Map< 14 Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>; 15 #endif /* CBLAS */ 16 17 // Broadwell CPU on cluster, you can get one with 18 // salloc -N1 -c32 -t 01:00:00 19 // 20 // Model name: Intel(R) Xeon(R) CPU E5-2683 v4 @ 2.10GHz 21 // L1d cache: 32K 22 // L1i cache: 32K 23 // L2 cache: 256K 24 // L3 cache: 40960K 25 #define L1_SIZE_KB 32 26 #define L2_SIZE_KB 256 27 #define L3_SIZE_KB 40960 28 29 typedef double Real; 30 31 void dgemm(const Real *A, const Real *B, Real *__restrict__ C, const size_t n) 32 { 33 #ifdef CBLAS 34 cblas_dgemm(CblasRowMajor, 35 CblasNoTrans, 36 CblasNoTrans, 37 n, 38 n, 39 n, 40 1.0, 41 A, 42 n, 43 B, 44 n, 45 1.0, 46 C, 47 n); 48 #elif defined(EIGEN) 49 // Matrix wrappers 50 const EMat<Real> Ap(const_cast<Real *>(A), n, n); 51 const EMat<Real> Bp(const_cast<Real *>(B), n, n); 52 EMat<Real> Cp(C, n, n); 53 Cp.noalias() += Ap * Bp; 54 #else 55 for (size_t i = 0; i < n; ++i) { 56 for (size_t k = 0; k < n; ++k) { 57 for (size_t j = 0; j < n; ++j) { 58 C[i * n + j] += A[i * n + k] * B[k * n + j]; 59 } 60 } 61 } 62 #endif /* CBLAS */ 63 } 64 65 int main(int argc, char *argv[]) 66 { 67 int n = 1000; 68 if (argc > 1) { 69 n = atoi(argv[1]); 70 } 71 std::vector<Real> A(n * n, 0.1); 72 std::vector<Real> B(n * n, 0.2); 73 std::vector<Real> C(n * n, 0.3); 74 75 // Initialize PAPI 76 int event_set = PAPI_NULL; 77 int events[4] = {PAPI_TOT_CYC, PAPI_TOT_INS, PAPI_LST_INS, PAPI_L1_DCM}; 78 long long int counters[4]; 79 PAPI_library_init(PAPI_VER_CURRENT); 80 PAPI_create_eventset(&event_set); 81 PAPI_add_events(event_set, events, 4); 82 83 // warm up 84 dgemm(A.data(), B.data(), C.data(), n); 85 86 // start PAPI measurement 87 PAPI_start(event_set); 88 89 // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and 90 // PAPI_TOT_INS slightly, neglected here) 91 const long long int t0 = PAPI_get_real_nsec(); 92 93 // run code to be measured 94 dgemm(A.data(), B.data(), C.data(), n); 95 96 // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and 97 // PAPI_TOT_INS slightly, neglected here) 98 const long long int t1 = PAPI_get_real_nsec(); 99 100 // stop PAPI and get counter values 101 PAPI_stop(event_set, counters); 102 103 // clang-format off 104 const long long total_cycles = counters[0]; // cpu cycles 105 const long long total_instructions = counters[1]; // any 106 const long long total_load_stores = counters[2]; // number of such instructions 107 const long long total_l1d_misses = counters[3]; // number of access request to cache line 108 // clang-format on 109 110 const size_t flops = 2 * n * n * n + n * n; 111 const size_t mem_ops = 4 * n * n; 112 const double twall = (static_cast<double>(t1) - t0) * 1.0e-9; // seconds 113 const double IPC = static_cast<double>(total_instructions) / total_cycles; 114 const double OI = 115 static_cast<double>(flops) / (total_load_stores * sizeof(Real)); 116 const double OI_theory = 117 static_cast<double>(flops) / (mem_ops * sizeof(Real)); 118 const double float_perf = flops / twall * 1.0e-9; // Gflop/s 119 const double sum = std::accumulate(C.begin(), C.end(), 0.0); 120 121 std::cout << "Result: " << sum << '\n'; 122 std::cout << "Total cycles: " << total_cycles << '\n'; 123 std::cout << "Total instructions: " << total_instructions << '\n'; 124 std::cout << "Instructions per cycle (IPC): " << IPC << '\n'; 125 std::cout << "L1 cache size: " << L1_SIZE_KB << " KB\n"; 126 std::cout << "L2 cache size: " << L2_SIZE_KB << " KB\n"; 127 std::cout << "L3 cache size: " << L3_SIZE_KB << " KB\n"; 128 std::cout << "Total problem size: " 129 << 3 * n * n * sizeof(Real) / 1024 << " KB\n"; 130 std::cout << "Total L1 data misses: " << total_l1d_misses << '\n'; 131 std::cout << "Total load/store: " << total_load_stores 132 << " (expected: " << mem_ops << ")\n"; 133 std::cout << "Operational intensity: " << std::scientific << OI 134 << " (expected: " << OI_theory << ")\n"; 135 std::cout << "Performance [Gflop/s]: " << float_perf << '\n'; 136 std::cout << "Wall-time [micro-seconds]: " << twall * 1.0e6 << '\n'; 137 138 return 0; 139 }