Skip to article frontmatterSkip to article content

Here’s the same vector addition example the we used for OpenMP with CUDA.

#include <iostream>

// CUDA headers
#include <cuda.h>
#include <cuda_runtime.h>
cudaDeviceProp devProp;
cudaError_t cudaStatus = cudaGetDeviceProperties(&devProp, 0);
if (cudaStatus == cudaSuccess)
    std::cout << "  Name\t\t\t\t: " << devProp.name << '\n'
              << "  Total global memory\t\t: " << devProp.totalGlobalMem << " bytes\n"
              << "  Total shared memory per block\t: " << devProp.sharedMemPerBlock << " bytes\n"
              << "  Total registers per block\t: " << devProp.regsPerBlock << '\n'
              << "  Warp size\t\t\t: " << devProp.warpSize << '\n'
              << "  Maximum threads per block\t: " << devProp.maxThreadsPerBlock << '\n'
              << "  Number of multiprocessors\t: " << devProp.multiProcessorCount << '\n'
              << "  CUDA Capability\t\t: " << devProp.major << '.' << devProp.minor << std::endl;
else
    std::cerr << "cudaGetDeviceProperties failed: " << cudaGetErrorString(cudaStatus) << std::endl;

int runtimeVersion;
cudaStatus = cudaRuntimeGetVersion(&runtimeVersion);
if (cudaStatus == cudaSuccess)
{
    int major = runtimeVersion / 1000;
    int minor = (runtimeVersion % 1000) / 10;
    int patch = runtimeVersion % 10;
    std::cout << "  CUDA Runtime Version\t\t: " << major << '.' << minor << '.' << patch << std::endl;
}
else
    std::cerr << "Error getting CUDA Runtime Version: " << cudaGetErrorString(cudaStatus) << std::endl;
template <typename T>
void vector_add(T* out, T* a, T *b, size_t n)
{    for(int i = 0; i < n; i++)
        out[i] = a[i] + b[i];
}
const size_t N{100'000'007};

double *a, *b, *out; 

// Allocate memory
a   = (double*)malloc(sizeof(double) * N);
b   = (double*)malloc(sizeof(double) * N);
out = (double*)malloc(sizeof(double) * N);

// Initialize array
for(int i = 0; i < N; i++)
{    a[i] = 1.0f; b[i] = 2.0f;
}
%%timeit
vector_add(out, a, b, N);
Output
310 ms +- 46.9 ms per loop (mean +- std. dev. of 7 runs 1 loop each)
out[1]
Output
3.0000000
template <typename T>
__global__ void cuda_vector_add(T *out, T *a, T *b, size_t n)
{   auto idx = blockDim.x * blockIdx.x + threadIdx.x;
    if(idx < n) out[idx] = a[idx] + b[idx];
}
double *d_a, *d_b, *d_out;

a = (double*)malloc(sizeof(double) * N);

// Allocate device memory for a
cudaMalloc((void**)&d_a,   sizeof(double) * N);
cudaMalloc((void**)&d_b,   sizeof(double) * N);
cudaMalloc((void**)&d_out, sizeof(double) * N);

// Transfer data from host to device memory
cudaMemcpy(d_a, a, sizeof(double) * N, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, sizeof(double) * N, cudaMemcpyHostToDevice);
const size_t threads_per_block{256};
size_t blocks_in_grid{N / threads_per_block + 1};
%%timeit
cuda_vector_add<<<blocks_in_grid, threads_per_block>>>(d_out, d_a, d_b, N);
cudaDeviceSynchronize();
// Transfer data from device to host memory
cudaMemcpy(d_out, out, sizeof(double) * N, cudaMemcpyDeviceToHost);
out[1]
// Cleanup after kernel execution
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_out);
free(a);
free(b);
free(out);