CUDA+MPI - Start

Programming with multiple NVIDIA GPUs.

CUDA: Quick reference, home page, tutorial (von Alfthan)
Have a look at the CUDA Toolkit documentation and the CUDA C Programming Guide.
MPI: Quick reference (C++), documentation, home page, tutorial (LLNL, MPI-book)

Compiling Code:

How to CUDA-parallelize the inner product:

Original code for inner product:
double scalar(const int N, const double x[], const double y[])
{
 double sum = 0.0;
 for (int i=0; i<N; ++i)
 {
    sum += x[i]*y[i];
 }
 return sum;
}

int main()
{
...
double s = scalar(n,a,b);
...
}
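Before adding CUDA, it may help to look at the MPI-only parallelization of the same routine, since the CUDA+MPI code below reuses exactly this reduction step: each process computes the inner product of its local vector pieces, and the partial results are summed with MPI_Allreduce. A minimal sketch (the names scalar_MPI, n_loc, x_loc, y_loc are chosen here and do not appear in the original code):

#include <mpi.h>

// MPI-only inner product: each rank holds local chunks x_loc, y_loc of
// length n_loc; the local partial sums are combined with MPI_Allreduce.
double scalar_MPI(const int n_loc, const double x_loc[], const double y_loc[])
{
    double s_loc = 0.0;
    for (int i = 0; i < n_loc; ++i)
    {
        s_loc += x_loc[i] * y_loc[i];
    }
    double s_glob;
    MPI_Allreduce(&s_loc, &s_glob, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    return s_glob;
}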


CUDA + MPI code for inner product:
#include <mpi.h>
#include <cuda_runtime.h>
#include <cmath>
#include <iostream>
using namespace std;
// kernel function on device:
// block-wise partial inner products via grid-stride loop + shared-memory reduction
__global__ void scalar(double *sg, const double *x, const double *y, int N)
{
    extern __shared__ double sdata[];
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int str = gridDim.x * blockDim.x;

    double s = 0.0;
    for (int i = idx; i < N; i += str)
        s += x[i] * y[i];
    sdata[threadIdx.x] = s;
    __syncthreads();

    // tree reduction in shared memory (blockDim.x must be a power of two)
    for (int s2 = blockDim.x >> 1; s2 > 0; s2 >>= 1)
    {
        if (threadIdx.x < s2)
        {
            sdata[threadIdx.x] += sdata[threadIdx.x + s2];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) sg[blockIdx.x] = sdata[0];
}

// kernel function for the final addition on one block on device
__global__ void add_1_block(double *s, int N)
{
    if (N > blockDim.x) return;            // all N values must fit into one block
    extern __shared__ double sdata[];
    const int tid = threadIdx.x;
    if (tid < N) sdata[tid] = s[tid];
    else         sdata[tid] = 0.0;         // pad with zeros up to blockDim.x
    __syncthreads();

    for (int s2 = blockDim.x / 2; s2 > 0; s2 >>= 1)
    {
        if (tid < s2)
        {
            sdata[tid] += sdata[tid + s2];
        }
        __syncthreads();
    }
    if (tid == 0) s[0] = sdata[0];         // final result in s[0]
}
//-------------------------------------------------------------
// Host function. Inner product calculated with device vectors
double dscapr_GPU(const int N, const double x_d[], const double y_d[])
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    const int blocksize = 4 * 64,                     // threads per block (power of two)
              gridsize  = prop.multiProcessorCount,   // one block per multiprocessor
              sBytes    = blocksize * sizeof(double); // dynamic shared memory per block
    dim3 dimBlock(blocksize);
    dim3 dimGrid(gridsize);

    double sg, *s_d;                                  // device vector storing the partial sums
    cudaMalloc((void **) &s_d, gridsize * sizeof(double)); // temp. memory on device: one partial sum per block

    // call the kernel function with dimGrid.x * dimBlock.x threads
    scalar<<<dimGrid, dimBlock, sBytes>>>(s_d, x_d, y_d, N);

    // power of 2 as number of threads per block for the final reduction
    const unsigned int oneBlock = 2 << (int)ceil(log(dimGrid.x + 0.0) / log(2.0));
    add_1_block<<<1, oneBlock, oneBlock * sizeof(double)>>>(s_d, dimGrid.x);

    // copy data: device --> host
    cudaMemcpy(&sg, s_d, sizeof(double), cudaMemcpyDeviceToHost);
    cudaFree(s_d);
    return sg;
}
//-------------------------------------------------------------
// main function on host
int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    double *x_h, *y_h;           // host data
    double *x_d, *y_d;           // device data
    double sum;                  // global inner product
    const int N = 14000000, nBytes = N * sizeof(double);
    const int LOOP = 100;
    cout << endl << "N = " << N << ",  LOOP = " << LOOP << endl;

    x_h = new double [N];
    y_h = new double [N];
    // allocate memory on device
    cudaMalloc((void **) &x_d, nBytes);
    cudaMalloc((void **) &y_d, nBytes);

    for (int i = 0; i < N; i++)
    {
        x_h[i] = (i % 137) + 1;
        y_h[i] = 1.0 / x_h[i];   // ==> inner product of x and y equals N
    }
    // copy data: host --> device
    cudaMemcpy(x_d, x_h, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(y_d, y_h, nBytes, cudaMemcpyHostToDevice);
    // LOOP repetitions: the GPU computes the process-local inner product,
    // MPI_Allreduce combines the partial results of all processes
    for (int k = 0; k < LOOP; ++k)
    {
        double loc_sum = dscapr_GPU(N, x_d, y_d);
        MPI_Allreduce(&loc_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    }

    // free host and device memory
    delete [] x_h;
    delete [] y_h;
    cudaFree(x_d);
    cudaFree(y_d);

    MPI_Finalize();
    return 0;
}
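Because y is initialized as the elementwise reciprocal of x, every local inner product equals N (up to rounding), which gives a cheap correctness check. A sketch that could be placed after the loop (myrank and numprocs are introduced here and are not part of the listing above):

    // sanity check (sketch, not part of the original listing):
    // each rank computes the full N-element product of x and 1/x, so the
    // all-reduced sum should be numprocs * N up to rounding error.
    int myrank, numprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    if (myrank == 0)
        cout << "sum = " << sum << "   (expected " << double(numprocs) * N << ")" << endl;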

Compile and link the code via  make COMPILER=MPI_NVCC_  or via  make NVCC_MPI_ .
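If the course Makefile is not at hand, a manual two-step build along the following lines should behave similarly; the file name scalar_mpi.cu, the wrapper mpicxx, and the CUDA_HOME path are assumptions, not taken from this page:

nvcc -c -O2 scalar_mpi.cu -o scalar_mpi.o          # compile the CUDA+MPI source with nvcc
mpicxx scalar_mpi.o -o scalar_mpi -L${CUDA_HOME}/lib64 -lcudart   # link with the MPI wrapper
mpirun -np 2 ./scalar_mpi                          # run with two MPI processes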

The assignment of GPU devices to the MPI processes has to be done as follows:

#include <mpi.h>
...
{
int myrank;
MPI_Comm_rank(icomm, &myrank);        // my MPI rank (icomm: the communicator used, e.g. MPI_COMM_WORLD)
int deviceCount;
cudaGetDeviceCount(&deviceCount);     // How many GPUs?
int device_id = myrank % deviceCount;
cudaSetDevice(device_id);             // Map MPI-process to a GPU

}
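When several MPI processes run per node and the global ranks are not packed node by node, mapping via the global rank can put two processes of one node onto the same GPU while another GPU stays idle. A possible refinement (MPI-3 feature, not from the original page) derives a node-local rank first and uses it for the mapping:

// sketch: map each process to a GPU based on its node-local rank
MPI_Comm node_comm;
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &node_comm);
int local_rank;
MPI_Comm_rank(node_comm, &local_rank);      // rank within the node
int deviceCount;
cudaGetDeviceCount(&deviceCount);
cudaSetDevice(local_rank % deviceCount);    // node-local mapping to GPUs
MPI_Comm_free(&node_comm);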


Installation problem with OpenMPI and CUDA 6.5 on Ubuntu 14.10 [Feb. 2015]: