CUDA - Start

Accelerator programming for NVIDIA GPUs.

CUDA: Quick reference, home page, tutorial (CUDA C Programming Guide, C. Zeller [NVIDIA], videovon Alfthan)
Have a look at CUDA Toolkit documentation.

Compiling Code:

How to CUDA-parallelize the inner product:

Original code for inner product:
double scalar(const int N, const double x[], const double y[])
{
 double sum = 0.0;
 for (int i=0; i<N; ++i)
 {
    sum += x[i]*y[i];
 }
 return sum;
}

int main()
{
...
double s = scalar(n,a,b);
...
}


CUDA code for inner product:
//    kernel function on device  
__global__ void scalar(double *sg, const double *x, const double *y, int N)
{
  extern __shared__ double sdata[];
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = gridDim.x*blockDim.x;
  
  double s = 0.0;
  for (int i = idx; i < N; i += str)
      s+= x[i]*y[i];
  sdata[threadIdx.x] = s;
  
  __syncthreads();
  for (int s=blockDim.x >> 1; s>0; s>>=1) {
     if (threadIdx.x < s) {
         sdata[threadIdx.x] += sdata[threadIdx.x + s];
      }
  __syncthreads();
  }
  if (threadIdx.x == 0)  sg[blockIdx.x] = sdata[0];
}


//    kernel function for addition on  o n e  block on device  
__global__ void add_1_block(double *s, int N)
{
  if (N>blockDim.x) return;
  extern __shared__ double sdata[];
  const int tid = threadIdx.x;
  if (tid<N) sdata[tid]=s[tid];
  else       sdata[tid]=0.0;
  
  __syncthreads();
  for (int s=blockDim.x/2; s>0; s>>=1) {
     if (tid < s) {
         sdata[tid] += sdata[tid + s];
      }
  __syncthreads();
  }
  if (tid == 0)  s[0] = sdata[0];
}

//-------------------------------------------------------------
// Host function. Inner product calculated with device vectors
double dscapr_GPU(const int N, const double x_d[], const double y_d[]) { cudaDeviceProp prop; cudaGetDeviceProperties (&prop, 0); const int blocksize = 4 * 64, gridsize = prop.multiProcessorCount, sBytes = blocksize * sizeof(double); dim3 dimBlock(blocksize); dim3 dimGrid(gridsize); double sg, *s_d; // device vector storing the partial sums cudaMalloc((void **) &s_d, blocksize * sizeof(double)); // temp. memory on device // call the kernel function with dimGrid.x * dimBlock.x threads scalar <<< dimGrid, dimBlock, sBytes>>>(s_d, x_d, y_d, N); // power of 2 as number of treads per block const unsigned int oneBlock = 2 << (int)ceil(log(dimGrid.x + 0.0) / log(2.0)); add_1_block <<< 1, oneBlock, oneBlock *sizeof(double)>>>(s_d, dimGrid.x); // copy data: device --> host cudaMemcpy(&sg, s_d, sizeof(double), cudaMemcpyDeviceToHost); cudaFree(s_d); return sg; }
//-------------------------------------------------------------
 // main function on host int main() { double *x_h, *y_h; // host data double *x_d, *y_d; // device data double sum, tstart, tgpu; const int N = 14000000, nBytes = N*sizeof(double); const int LOOP = 100; cout << endl << gridsize << " x " << blocksize << " Threads\n"; x_h = new double [N]; y_h = new double [N]; // allocate memory on device cudaMalloc((void **) &x_d, nBytes); cudaMalloc((void **) &y_d, nBytes); for (int i=0; i<N; i++) { x_h[i] = (i % 137)+1; y_h[i] = 1.0/x_h[i];} // copy data: host --> device cudaMemcpy(x_d, x_h, nBytes, cudaMemcpyHostToDevice); cudaMemcpy(y_d, y_h, nBytes, cudaMemcpyHostToDevice);
 for (int k=0; k<LOOP; ++k)
{
sum = dscapr_GPU(N, x_d, y_d); } delete [] x_h; delete [] y_h; cudaFree(x_d); cudaFree(y_d); return 0; }

and  compile

Compilation problem with CUDA 6.5 and gcc-4.9 (and higher) in Ubuntu 14.10 [Feb. 2015]: