Programming with multiple NVIDIA GPUs.
CUDA: Quick reference, home page, tutorial (von Alfthan)
Compiling the code:
> nvcc -O3 -arch sm_30 --ptxas-options=-v -lineinfo skalar.cu -o main.NVCC_
> ./main.NVCC_
> optirun ./main.NVCC_
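Before looking for errors in the code it pays to check that the CUDA runtime actually sees a GPU, in particular on Optimus laptops started via optirun. A minimal, self-contained check using only the standard CUDA runtime API; the file name check_gpu.cu is just an example:
// check_gpu.cu : verify that the CUDA runtime sees at least one device
// compile, e.g., with: nvcc check_gpu.cu -o check_gpu
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
  int n = 0;
  cudaError_t err = cudaGetDeviceCount(&n);      // number of visible CUDA devices
  if (err != cudaSuccess || n == 0) {
    printf("No CUDA device visible: %s\n", cudaGetErrorString(err));
    return 1;
  }
  for (int d = 0; d < n; ++d) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, d);
    printf("device %d : %s (cc %d.%d, %d SMs)\n",
           d, prop.name, prop.major, prop.minor, prop.multiProcessorCount);
  }
  return 0;
}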
How to CUDA-parallelize the inner product:
Original code for the inner product:
double scalar(const int N, const double x[], const double y[])
{
  double sum = 0.0;
  for (int i = 0; i < N; ++i) {
    sum += x[i] * y[i];
  }
  return sum;
}
int main()
{
...
double s = scalar(n,a,b);
...
}
CUDA version with MPI (each process computes the inner product on its own GPU; the results of all processes are summed with MPI_Allreduce):
#include <iostream>
#include <cmath>
#include <mpi.h>
using namespace std;
// kernel function on device
__global__ void scalar(double *sg, const double *x, const double *y, int N)
{
  extern __shared__ double sdata[];
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int str = gridDim.x * blockDim.x;
  double s = 0.0;
  for (int i = idx; i < N; i += str)  s += x[i] * y[i];
  sdata[threadIdx.x] = s;                            // partial sum of this thread
  __syncthreads();
  // tree reduction in shared memory (blockDim.x must be a power of 2)
  for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
    if (threadIdx.x < stride) {
      sdata[threadIdx.x] += sdata[threadIdx.x + stride];
    }
    __syncthreads();
  }
  if (threadIdx.x == 0)  sg[blockIdx.x] = sdata[0];  // partial sum of this block
}

// kernel function for the addition on one block on device
__global__ void add_1_block(double *s, int N)
{
  if (N > blockDim.x)  return;
  extern __shared__ double sdata[];
  const int tid = threadIdx.x;
  if (tid < N)  sdata[tid] = s[tid];
  else          sdata[tid] = 0.0;
  __syncthreads();
  for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
    if (tid < stride) {
      sdata[tid] += sdata[tid + stride];
    }
    __syncthreads();
  }
  if (tid == 0)  s[0] = sdata[0];
}
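The reduction loop halves the number of active threads in every step. The following host-only sketch (plain C++, with a hypothetical array of 8 partial sums) mirrors what a single thread block does in shared memory:
#include <cstdio>

int main()
{
  double sdata[8] = {1, 2, 3, 4, 5, 6, 7, 8};    // partial sums of 8 "threads"
  // same tree reduction as in the kernel: stride 4, 2, 1
  for (int stride = 8 / 2; stride > 0; stride >>= 1) {
    for (int tid = 0; tid < stride; ++tid) {     // on the device these run in parallel
      sdata[tid] += sdata[tid + stride];
    }
    // __syncthreads() would be required here on the device
  }
  printf("sum = %g\n", sdata[0]);                // prints 36
  return 0;
}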
//-------------------------------------------------------------
// Host function. Inner product calculated with device vectors
double dscapr_GPU(const int N, const double x_d[], const double y_d[])
{
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);
  const int blocksize = 4 * 64,
            gridsize  = prop.multiProcessorCount,
            sBytes    = blocksize * sizeof(double);
  dim3 dimBlock(blocksize);
  dim3 dimGrid(gridsize);
  double sg, *s_d;                                        // device vector storing the partial sums
  cudaMalloc((void **) &s_d, gridsize * sizeof(double));  // temp. memory on device: one partial sum per block
  // call the kernel function with dimGrid.x * dimBlock.x threads
  scalar <<< dimGrid, dimBlock, sBytes >>>(s_d, x_d, y_d, N);
  // power of 2 as number of threads per block
  const unsigned int oneBlock = 2 << (int) ceil(log(dimGrid.x + 0.0) / log(2.0));
  add_1_block <<< 1, oneBlock, oneBlock * sizeof(double) >>>(s_d, dimGrid.x);
  // copy data: device --> host
  cudaMemcpy(&sg, s_d, sizeof(double), cudaMemcpyDeviceToHost);
  cudaFree(s_d);
  return sg;
}
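Since the build commands below link -lcublas anyway, the result of dscapr_GPU can be cross-checked against cuBLAS. A minimal sketch, assuming the cuBLAS v2 API and device vectors x_d, y_d of length N; the helper name dscapr_cublas is just an example:
#include <cublas_v2.h>

// inner product of two device vectors via cublasDdot
double dscapr_cublas(const int N, const double x_d[], const double y_d[])
{
  cublasHandle_t handle;
  cublasCreate(&handle);
  double result = 0.0;
  // computes sum_i x_d[i]*y_d[i]; the scalar result is written to host memory
  cublasDdot(handle, N, x_d, 1, y_d, 1, &result);
  cublasDestroy(handle);
  return result;
}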
//-------------------------------------------------------------
// main function on host
int main(int argc, char *argv[])
{
  MPI_Init(&argc, &argv);
  double *x_h, *y_h;                    // host data
  double *x_d, *y_d;                    // device data
  double sum, tstart, tgpu;             // tstart/tgpu are used in the timing sketch after main
  const int N = 14000000, nBytes = N * sizeof(double);
  const int LOOP = 100;
  x_h = new double [N];
  y_h = new double [N];
  // allocate memory on device
  cudaMalloc((void **) &x_d, nBytes);
  cudaMalloc((void **) &y_d, nBytes);
  for (int i = 0; i < N; i++) {
    x_h[i] = (i % 137) + 1;
    y_h[i] = 1.0 / x_h[i];              // x[i]*y[i] ~= 1, so the inner product is (up to rounding) N
  }
  // copy data: host --> device
  cudaMemcpy(x_d, x_h, nBytes, cudaMemcpyHostToDevice);
  cudaMemcpy(y_d, y_h, nBytes, cudaMemcpyHostToDevice);
  for (int k = 0; k < LOOP; ++k)
  {
    double loc_sum = dscapr_GPU(N, x_d, y_d);
    MPI_Allreduce(&loc_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  }

  delete [] x_h;  delete [] y_h;
  cudaFree(x_d);  cudaFree(y_d);

  MPI_Finalize();
  return 0;
}
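The LOOP repetitions are clearly meant for timing. One way to measure the average time per inner product, a sketch using MPI_Wtime around the loop above and the tstart/tgpu variables declared in main:
// timing sketch for the LOOP above
tstart = MPI_Wtime();                          // wall clock time on this rank
for (int k = 0; k < LOOP; ++k) {
  double loc_sum = dscapr_GPU(N, x_d, y_d);
  MPI_Allreduce(&loc_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
}
tgpu = (MPI_Wtime() - tstart) / LOOP;          // average wall clock time per inner product
// each local sum is (up to rounding) N, hence the global sum is close to numprocs*N,
// where numprocs comes from MPI_Comm_size(MPI_COMM_WORLD, &numprocs)
cout << "time per inner product: " << tgpu << " s,  sum = " << sum << endl;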
The nvcc compiler needs the MPI compile options (mpicxx -showme:compile | sed 's/-pthread//g') and the link options (mpicxx -showme:link | sed 's/-pthread//g'):
> nvcc -c -O3 -arch sm_30 -I/usr/lib/openmpi/include -I/usr/lib/openmpi/include/openmpi *.cu [*.cpp]
> nvcc -lm -lcudart -lcublas -L/usr/lib/openmpi/lib -lmpi_cxx -lmpi -lopen-rte -lopen-pal -ldl -lnsl -lutil -lm -ldl *.o -o main.MPI_NVCC_
> mpirun -np 4 main.MPI_NVCC_
or via make NVCC_MPI_, i.e., letting the mpicxx wrapper call nvcc:
> export OMPI_CXX=nvcc
> export OMPI_CXXFLAGS="-I/usr/lib/openmpi/include -I/usr/lib/openmpi/include/openmpi"
  (= mpicxx -showme:compile | sed 's/-pthread//g')
> export OMPI_LDFLAGS="-L/usr/lib/openmpi/lib -lmpi_cxx -lmpi -lopen-rte -lopen-pal -ldl -lnsl -lutil -lm -ldl"
  (= mpicxx -showme:link | sed 's/-pthread//g')
> export OMPI_LIBS=
  (otherwise the nvcc linker cannot handle the option -Wl,--export-dynamic)
> mpicxx -c -O3 -arch sm_30 *.cu [*.cpp]
> mpicxx -lm -lcudart -lcublas *.o -o main.NVCC_MPI_
> mpirun -np 4 main.NVCC_MPI_
The assignment of devices to the MPI processes has to be done as follows:
#include <mpi.h>
...
{
int myrank;
MPI_Comm_rank(icomm, &myrank); // my MPI rank
int deviceCount;
cudaGetDeviceCount(&deviceCount);        // How many GPUs?
int device_id = myrank % deviceCount;
cudaSetDevice(device_id);                // Map MPI process to a GPU
}
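A small self-contained test of this mapping prints which GPU each rank was assigned to (a sketch; the file name gpu_map.cu is just an example):
// gpu_map.cu : every MPI rank reports the GPU it was mapped to
#include <cstdio>
#include <mpi.h>
#include <cuda_runtime.h>

int main(int argc, char *argv[])
{
  MPI_Init(&argc, &argv);
  int myrank, numprocs;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);

  int deviceCount = 0;
  cudaGetDeviceCount(&deviceCount);              // GPUs visible to this process
  int device_id = myrank % deviceCount;          // round-robin mapping as above
  cudaSetDevice(device_id);

  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device_id);
  printf("rank %d of %d uses GPU %d (%s)\n", myrank, numprocs, device_id, prop.name);

  MPI_Finalize();
  return 0;
}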
Installation problem with OpenMPI and CUDA 6.5 on Ubuntu 14.10 [Feb. 2015]: install OpenMPI without the recommended packages:
> sudo apt-get install openmpi-bin openmpi-doc libopenmpi-dev --no-install-recommends