main.c (3599B)
1 #include <stdio.h> 2 #include <stdlib.h> 3 4 #include "papi.h" 5 6 // Model name: Intel(R) Core(TM) i7-8700K CPU @ 3.70GHz 7 // L1d cache: 32K 8 // L1i cache: 32K 9 // L2 cache: 256K 10 // L3 cache: 12288K 11 #define L1_SIZE_KB 32 12 #define L2_SIZE_KB 256 13 #define L3_SIZE_KB 12288 14 15 #ifndef KERNEL 16 #define KERNEL sgemv_alias 17 #endif /* KERNEL */ 18 19 extern void KERNEL(const float *A, const float *x, float *y, const int *n); 20 21 int main(int argc, char *argv[]) 22 { 23 int n = 10000; 24 if (argc > 1) { 25 n = atoi(argv[1]); 26 } 27 28 float *A = (float *)malloc(n * n * sizeof(float)); 29 float *x = (float *)malloc(n * sizeof(float)); 30 float *y = (float *)malloc(n * sizeof(float)); 31 32 for (int i = 0; i < n * n; i++) { 33 A[i] = 0.1f; 34 } 35 for (int i = 0; i < n; i++) { 36 x[i] = 1.0f; 37 y[i] = 0.001f; 38 } 39 40 // Initialize PAPI 41 int event_set = PAPI_NULL; 42 int events[4] = {PAPI_TOT_CYC, PAPI_TOT_INS, PAPI_LST_INS, PAPI_L1_DCM}; 43 long long int counters[4]; 44 PAPI_library_init(PAPI_VER_CURRENT); 45 PAPI_create_eventset(&event_set); 46 PAPI_add_events(event_set, events, 4); 47 48 // warm up 49 KERNEL(A, x, y, &n); 50 51 // start PAPI measurement 52 PAPI_start(event_set); 53 54 // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and 55 // PAPI_TOT_INS slightly, neglected here) 56 const long long int t0 = PAPI_get_real_nsec(); 57 58 // run code to be measured 59 KERNEL(A, x, y, &n); 60 61 // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and 62 // PAPI_TOT_INS slightly, neglected here) 63 const long long int t1 = PAPI_get_real_nsec(); 64 65 // stop PAPI and get counter values 66 PAPI_stop(event_set, counters); 67 68 // clang-format off 69 const long long total_cycles = counters[0]; // cpu cycles 70 const long long total_instructions = counters[1]; // any instruction 71 const long long total_load_stores = counters[2]; // load/store instructions 72 const long long total_l1d_misses = counters[3]; // L1d misses 73 // clang-format on 74 75 const long long flops = 2 * n * n + n; 76 const long long mem_ops = 2 * n * n + 2 * n; 77 const double twall = ((double)t1 - t0) * 1.0e-9; // seconds 78 const double IPC = (double)total_instructions / total_cycles; 79 const double OI = (double)flops / (total_load_stores * sizeof(float)); 80 const double OI_theory = (double)flops / (mem_ops * sizeof(float)); 81 const double float_perf = flops / twall * 1.0e-9; // GFlop/s 82 double sum = 0.0; 83 for (long long int i = 0; i < n; i++) { 84 sum += y[i]; 85 } 86 87 free(A); 88 free(x); 89 free(y); 90 91 // clang-format off 92 printf("Result: %.1f\n", sum); 93 printf("Total cycles: %lld\n", total_cycles); 94 printf("Total instructions: %lld\n", total_instructions); 95 printf("Instructions per cycle (IPC): %.2f\n", IPC); 96 printf("L1 cache size: %d KB\n", L1_SIZE_KB); 97 printf("L2 cache size: %d KB\n", L2_SIZE_KB); 98 printf("L3 cache size: %d KB\n", L3_SIZE_KB); 99 printf("Total problem size: %ld KB\n", 100 (n * n + 2 * n) * sizeof(float) / 1024); 101 printf("Total L1 data misses: %lld\n", total_l1d_misses); 102 printf("Total load/store: %lld (expected: %lld)\n", 103 total_load_stores, mem_ops); 104 printf("Operational intensity: %e (expected: %e)\n", OI, OI_theory); 105 printf("Performance [GFlop/s]: %e\n", float_perf); 106 printf("Wall-time [micro-seconds]: %e\n", twall * 1.0e6); 107 // clang-format on 108 109 return 0; 110 }