add_kernel_grid.cu (1436B)
// File       : add_kernel_grid.cu
// Description: Simple vector add kernel (launch grid: 4096 blocks, @256)
// Copyright 2022 Harvard University. All Rights Reserved.
#include <cmath>
#include <cstdlib>
#include <iostream>

// Evaluate a CUDA runtime call and terminate the program with a diagnostic
// (file, line, error string) if it does not return cudaSuccess.  Without this
// a failed allocation or launch would go unnoticed and the program would carry
// on with invalid pointers or stale data.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        const cudaError_t err_ = (call);                                     \
        if (err_ != cudaSuccess) {                                           \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__     \
                      << ": " << cudaGetErrorString(err_) << std::endl;      \
            std::exit(EXIT_FAILURE);                                         \
        }                                                                    \
    } while (0)

// Kernel function to add the elements of two arrays: y[i] = x[i] + y[i].
//
//   n : number of elements to process; nothing is done when n <= 0
//   x : input array, read only by this kernel (at least n elements)
//   y : input/output array, updated in place (at least n elements)
//
// Each thread starts at its flat global index and advances by the total number
// of launched threads (grid-stride loop), so every element in [0, n) is
// processed for any 1D grid/block configuration.
__global__ void add(int n, float *x, float *y)
{
    if (n <= 0)
        return;

    // Index arithmetic is done in size_t: with int, `i += stride` can overflow
    // (undefined behavior) when n is close to INT_MAX or the grid is large.
    const size_t count = static_cast<size_t>(n);
    const size_t tid =
        static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;

    for (size_t i = tid; i < count; i += stride)
        y[i] = x[i] + y[i];
}

// Fills two 1M-element arrays with 1.0f and 2.0f, adds them on the GPU and
// prints the maximum absolute deviation of the result from 3.0f.
int main(void)
{
    const int N = 1 << 20;
    float *x = nullptr;
    float *y = nullptr;

    // Allocate Unified Memory - accessible from CPU or GPU
    CUDA_CHECK(cudaMallocManaged(&x, N * sizeof(float)));
    CUDA_CHECK(cudaMallocManaged(&y, N * sizeof(float)));

    // initialize x and y arrays on the host
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    // Run kernel on 1M elements on the GPU.  The block count is rounded up so
    // that n_blocks * block_size >= N.
    const int block_size = 256;
    const int n_blocks = (N + block_size - 1) / block_size;
    add<<<n_blocks, block_size>>>(N, x, y);

    // A kernel launch returns no status itself; an invalid launch
    // configuration is reported through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // Wait for GPU to finish before accessing on host. CUDA kernel calls do
    // not block the calling CPU thread.  Errors raised while the kernel was
    // executing are returned by this call.
    CUDA_CHECK(cudaDeviceSynchronize());

    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
        maxError = std::fmax(maxError, std::fabs(y[i] - 3.0f));
    std::cout << "Max error: " << maxError << std::endl;

    // Free memory
    CUDA_CHECK(cudaFree(x));
    CUDA_CHECK(cudaFree(y));

    return 0;
}