cs205-lecture-examples

Example code used during Harvard CS205 lectures
git clone https://git.0xfab.ch/cs205-lecture-examples.git
Log | Files | Refs | README | LICENSE

add_kernel_grid.cu (1436B)


      1 // File       : add_kernel_grid.cu
      2 // Description: Simple vector add kernel (launch grid: 4096 blocks of 256 threads each)
      3 // Copyright 2022 Harvard University. All Rights Reserved.
      4 #include <cmath>
      5 #include <iostream>
      6 
      // Kernel function to add the elements of two arrays in place:
      //     y[i] = x[i] + y[i]   for every i in [0, n).
      //
      // Parameters:
      //   n - number of elements to process; nothing is written when n <= 0.
      //   x - input array, read only; must hold at least n floats.
      //   y - input/output array; must hold at least n floats.
      //
      // Launch layout: only the x dimension of the grid and block is used.  Each
      // thread starts at its global index and advances by the total number of
      // launched threads (blockDim.x * gridDim.x), so any 1D launch configuration
      // covers all n elements, including launches with fewer threads than n.
      __global__ void add(int n, float *x, float *y)
      {
          // The index and stride are kept in 64 bits.  The built-in index
          // variables are 32-bit unsigned, so their product can reach or exceed
          // 2^31 for large launches; narrowing that to int would give a negative
          // (or zero) stride, and "i += stride" could overflow a signed int when
          // n is close to INT_MAX.
          const long long stride = static_cast<long long>(blockDim.x) * gridDim.x;
          long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

          for (; i < n; i += stride)
              y[i] = x[i] + y[i];
      }
     15 
     // Reports a failed CUDA runtime call on stderr.
     // Returns true when status is cudaSuccess, false otherwise.
     static bool cuda_ok(cudaError_t status, const char *what)
     {
         if (status == cudaSuccess)
             return true;
         std::cerr << "CUDA error in " << what << ": "
                   << cudaGetErrorString(status) << std::endl;
         return false;
     }

     // Adds two vectors of 1M floats on the GPU and prints the maximum deviation
     // from the expected result (3.0f) to stdout.
     // Returns 0 on success and 1 if any CUDA runtime call fails.
     int main(void)
     {
         int N = 1 << 20;
         float *x = 0;
         float *y = 0;

         // Allocate Unified Memory - accessible from CPU or GPU.  The pointers
         // start out null so the failure path can free them unconditionally
         // (cudaFree on a null pointer is a no-op).
         if (!cuda_ok(cudaMallocManaged(&x, N * sizeof(float)), "cudaMallocManaged(x)") ||
             !cuda_ok(cudaMallocManaged(&y, N * sizeof(float)), "cudaMallocManaged(y)")) {
             cudaFree(x);
             cudaFree(y);
             return 1;
         }

         // initialize x and y arrays on the host
         for (int i = 0; i < N; i++) {
             x[i] = 1.0f;
             y[i] = 2.0f;
         }

         // Run kernel on 1M elements on the GPU; the block count is rounded up so
         // that n_blocks * block_size >= N.
         int block_size = 256;
         int n_blocks = (N + block_size - 1) / block_size;
         add<<<n_blocks, block_size>>>(N, x, y);

         // A kernel launch returns nothing: configuration errors are reported by
         // cudaGetLastError(), execution errors by the next synchronizing call.
         // CUDA kernel calls do not block the calling CPU thread, so wait for the
         // GPU to finish before accessing the results on the host.
         if (!cuda_ok(cudaGetLastError(), "add kernel launch") ||
             !cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) {
             cudaFree(x);
             cudaFree(y);
             return 1;
         }

         // Check for errors (all values should be 3.0f)
         float maxError = 0.0f;
         for (int i = 0; i < N; i++)
             maxError = std::fmax(maxError, std::fabs(y[i] - 3.0f));
         std::cout << "Max error: " << maxError << std::endl;

         // Free memory; both buffers are released even if the first free fails.
         bool freed_x = cuda_ok(cudaFree(x), "cudaFree(x)");
         bool freed_y = cuda_ok(cudaFree(y), "cudaFree(y)");

         return (freed_x && freed_y) ? 0 : 1;
     }