Here’s the same vector addition example that we used for OpenMP, now written with CUDA.
#include <cuda.h>
#include <cuda_runtime.h>
// Query the properties of GPU device 0; the bare trailing expression
// makes the notebook (cling) cell print the device's name.
// NOTE(review): the cudaGetDeviceProperties return code is unchecked —
// with no device present, `prop` would be read uninitialized.
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
prop.name
// Serial CPU baseline: element-wise out[i] = a[i] + b[i] for i in [0, n).
// `out`, `a`, and `b` must each point to at least n elements.
template <typename T>
void vector_add(T* out, T* a, T *b, size_t n)
{ // Use a size_t index to match n's type — an int index triggers a
  // signed/unsigned comparison warning and would overflow for
  // n > INT_MAX.
  for(size_t i = 0; i < n; i++)
    out[i] = a[i] + b[i];
}
// Problem size: ~1e8 doubles, i.e. roughly 0.8 GB per array.
const size_t N{100'000'007};
double *a, *b, *out;
// Allocate host memory.
// NOTE(review): the malloc results are unchecked — on allocation
// failure the init loop below would dereference a null pointer.
a = (double*)malloc(sizeof(double) * N);
b = (double*)malloc(sizeof(double) * N);
out = (double*)malloc(sizeof(double) * N);
// Initialize arrays. Use double literals for double arrays (the
// original 1.0f/2.0f convert exactly here, but say what we mean) and
// a size_t index to match N's type.
for(size_t i = 0; i < N; i++)
{ a[i] = 1.0; b[i] = 2.0;
}
%%timeit
// Benchmark the serial CPU baseline over all N elements.
vector_add(out, a, b, N);
Output
401 ms +- 2.15 ms per loop (mean +- std. dev. of 7 runs, 1 loop each)
out[1]
Output
3.0000000
// GPU element-wise addition: out[i] = a[i] + b[i] for i in [0, n).
// Written as a grid-stride loop, so the kernel is correct for ANY
// launch configuration (including grids that don't cover n, and n not
// a multiple of the block size); each thread handles indices
// idx, idx + stride, idx + 2*stride, ...
template <typename T>
__global__ void cuda_vector_add(T *out, T *a, T *b, size_t n)
{ // Promote to size_t before multiplying: blockDim.x * blockIdx.x is
  // unsigned-int arithmetic and can silently overflow on huge grids.
  size_t stride = (size_t)gridDim.x * blockDim.x;
  for(size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x; idx < n; idx += stride)
    out[idx] = a[idx] + b[idx];
}
double *d_a, *d_b, *d_out;
// (Bug fix: the original re-malloc'd `a` at this point, which leaked
// the host array and replaced its initialized contents with
// uninitialized memory right before the host-to-device copy.)
// Allocate device memory.
// NOTE(review): cudaMalloc/cudaMemcpy return codes are unchecked — a
// failed allocation here would surface only as garbage results later.
cudaMalloc((void**)&d_a, sizeof(double) * N);
cudaMalloc((void**)&d_b, sizeof(double) * N);
cudaMalloc((void**)&d_out, sizeof(double) * N);
// Transfer data from host to device memory (destination first).
cudaMemcpy(d_a, a, sizeof(double) * N, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, sizeof(double) * N, cudaMemcpyHostToDevice);
const size_t threads_per_block{256};
// Ceiling division: enough blocks to cover all N elements, without the
// extra wasted block that `N / threads_per_block + 1` launches when N
// is an exact multiple of the block size.
size_t blocks_in_grid{(N + threads_per_block - 1) / threads_per_block};
%%timeit
// Benchmark the GPU version. Kernel launches are asynchronous, so the
// device-wide sync is required for the timing to include completion.
// NOTE(review): no cudaGetLastError() after the launch — an invalid
// launch configuration would go unnoticed here.
cuda_vector_add<<<blocks_in_grid, threads_per_block>>>(d_out, d_a, d_b, N);
cudaDeviceSynchronize();
// Transfer results from device back to host memory.
// Bug fix: cudaMemcpy takes (dst, src, ...). The original passed
// (d_out, out, ..., cudaMemcpyDeviceToHost) — source and destination
// swapped — so the call fails and `out` is never written; reading
// out[1] below would show stale host data, not the GPU result.
cudaMemcpy(out, d_out, sizeof(double) * N, cudaMemcpyDeviceToHost);
out[1]
// Release every buffer now that the results are back on the host:
// host allocations first, then their device counterparts. The frees
// are independent, so the order is a style choice only.
free(out);
free(b);
free(a);
cudaFree(d_out);
cudaFree(d_b);
cudaFree(d_a);