Here’s the same vector addition example we used for OpenMP, this time written in CUDA.
#include <cuda.h>
#include <cuda_runtime.h>

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
prop.name
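The runtime API reports failures through return codes rather than exceptions, so it is worth checking them as you go. A minimal sketch of a checking helper (the CUDA_CHECK macro is our own naming, not part of the CUDA API):

#include <cstdio>
#include <cstdlib>

// Hypothetical helper: abort with a readable message if a CUDA
// runtime call returns anything other than cudaSuccess.
#define CUDA_CHECK(call)                                             \
    do {                                                             \
        cudaError_t err_ = (call);                                   \
        if (err_ != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error %s at %s:%d\n",              \
                    cudaGetErrorString(err_), __FILE__, __LINE__);   \
            exit(EXIT_FAILURE);                                      \
        }                                                            \
    } while (0)

// Usage: CUDA_CHECK(cudaGetDeviceProperties(&prop, 0));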
template <typename T>
void vector_add(T* out, T* a, T* b, size_t n)
{
    // Plain serial loop on the CPU, used as the reference timing.
    for (size_t i = 0; i < n; i++)
        out[i] = a[i] + b[i];
}

const size_t N{100'000'007};
double *a, *b, *out; 
// Allocate memory
a   = (double*)malloc(sizeof(double) * N);
b   = (double*)malloc(sizeof(double) * N);
out = (double*)malloc(sizeof(double) * N);
// Initialize array
for (size_t i = 0; i < N; i++)
{
    a[i] = 1.0;
    b[i] = 2.0;
}

%%timeit
vector_add(out, a, b, N);

Output
401 ms +- 2.15 ms per loop (mean +- std. dev. of 7 runs, 1 loop each)

out[1]

Output
3.0000000

template <typename T>
__global__ void cuda_vector_add(T* out, T* a, T* b, size_t n)
{
    // Each thread handles one element: its global index is the
    // block offset plus its position within the block.
    auto idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < n)
        out[idx] = a[idx] + b[idx];
}
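One common generalization of this kernel, not used in the original example, is the grid-stride loop: each thread processes several elements, so a fixed-size grid can handle arrays of any length. A sketch:

// Sketch of a grid-stride variant: each thread strides through the
// array by the total number of threads in the grid, so the launch
// configuration no longer has to cover n exactly.
template <typename T>
__global__ void cuda_vector_add_strided(T* out, T* a, T* b, size_t n)
{
    size_t stride = (size_t)blockDim.x * gridDim.x;
    for (size_t i = blockDim.x * blockIdx.x + threadIdx.x; i < n; i += stride)
        out[i] = a[i] + b[i];
}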
double *d_a, *d_b, *d_out;
// Allocate device memory
cudaMalloc((void**)&d_a,   sizeof(double) * N);
cudaMalloc((void**)&d_b,   sizeof(double) * N);
cudaMalloc((void**)&d_out, sizeof(double) * N);
// Transfer data from host to device memory
cudaMemcpy(d_a, a, sizeof(double) * N, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, sizeof(double) * N, cudaMemcpyHostToDevice);
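These copies come from ordinary malloc'd (pageable) memory, which the driver stages through an internal pinned buffer. Allocating the host arrays as page-locked memory usually speeds up transfers; a sketch of that variant (our assumption, not something measured here):

// Sketch: allocate pinned host memory with cudaMallocHost so the
// DMA engine can copy directly, then release it with cudaFreeHost.
double* a_pinned;
cudaMallocHost((void**)&a_pinned, sizeof(double) * N);
// ... fill a_pinned ...
cudaMemcpy(d_a, a_pinned, sizeof(double) * N, cudaMemcpyHostToDevice);
cudaFreeHost(a_pinned);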
const size_t threads_per_block{256};
// One extra block so the grid covers all N elements; the kernel's
// bounds check (idx < n) discards the overshoot.
size_t blocks_in_grid{N / threads_per_block + 1};
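Keep in mind that kernel launches are asynchronous: the cudaDeviceSynchronize() in the timing cell below is what makes the measurement meaningful. To time only the kernel itself, CUDA events are the usual tool; a sketch:

// Sketch: timing the kernel alone with CUDA events, which avoids
// folding host-side launch overhead into the measurement.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
cuda_vector_add<<<blocks_in_grid, threads_per_block>>>(d_out, d_a, d_b, N);
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);  // elapsed time in milliseconds

cudaEventDestroy(start);
cudaEventDestroy(stop);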
%%timeit
cuda_vector_add<<<blocks_in_grid, threads_per_block>>>(d_out, d_a, d_b, N);
cudaDeviceSynchronize();
// Transfer data from device to host memory
// (note the argument order: destination first, then source)
cudaMemcpy(out, d_out, sizeof(double) * N, cudaMemcpyDeviceToHost);

out[1]
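Spot-checking a single element can miss indexing errors elsewhere in the array. A quick full-array check in the same notebook style (our addition; 1.0 + 2.0 is exact in double, so direct comparison is safe):

// Sketch: count mismatches against the expected value 3.0 across
// the whole array instead of inspecting one element.
size_t errors = 0;
for (size_t i = 0; i < N; i++)
    if (out[i] != 3.0) errors++;
errors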
// Cleanup after kernel execution
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_out);
free(a);
free(b);
free(out);
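As a closing aside, unified memory can replace the explicit allocations and copies entirely. A sketch of the same computation with cudaMallocManaged (our variant, not part of the original example):

// Sketch: unified memory version. cudaMallocManaged returns a pointer
// usable from both host and device; the runtime migrates pages on demand.
double *u_a, *u_b, *u_out;
cudaMallocManaged(&u_a,   sizeof(double) * N);
cudaMallocManaged(&u_b,   sizeof(double) * N);
cudaMallocManaged(&u_out, sizeof(double) * N);

for (size_t i = 0; i < N; i++) { u_a[i] = 1.0; u_b[i] = 2.0; }

cuda_vector_add<<<blocks_in_grid, threads_per_block>>>(u_out, u_a, u_b, N);
cudaDeviceSynchronize();  // wait before touching u_out on the host

cudaFree(u_a);
cudaFree(u_b);
cudaFree(u_out);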