Install the RDMA connection manager library:

> sudo apt install librdmacm1
Compiling the code:

> pgcc -Mmpi=mpich -fast -acc -gopt -ta=nvidia:cc2+,fastmath,keep,time skalar.cpp -o main.PGI_MPI_

Running with 4 MPI processes:

> /opt/pgi/linux86-64/2016/mpi/mpich/bin/mpirun -np 4 ./main.PGI_MPI_

or, on an Optimus laptop via optirun:

> /opt/pgi/linux86-64/2016/mpi/mpich/bin/mpirun -np 4 optirun -v ./main.PGI_MPI_
How to parallelize the inner product with OpenACC+MPI:

Original (sequential) code for the inner product:

double scalar(const int N, const double x[], const double y[])
{
    double sum = 0.0;
    for (int i = 0; i < N; ++i) {
        sum += x[i] * y[i];
    }
    return sum;
}

int main()
{
    ...
    double s = scalar(n, a, b);
    ...
}
OpenACC+MPI version:

#include <mpi.h>

// local sequential inner product, offloaded to the GPU with OpenACC
double scalar(const int N, const double x[], const double y[])
{
    double sum = 0.0;
    #pragma acc kernels loop present_or_copyin(x[0:N], y[0:N]) independent reduction(+:sum)
    for (int i = 0; i < N; ++i)
        sum += x[i] * y[i];
    return sum;
}

// MPI inner product: local inner product followed by a global reduction
double scalar(const int n, const double x[], const double y[], const MPI_Comm icomm)
{
    const double s = scalar(n, x, y);                        // call sequential inner product
    double sg;
    MPI_Allreduce(&s, &sg, 1, MPI_DOUBLE, MPI_SUM, icomm);   // sum the local results over all processes
    return sg;
}

int main(int argc, char* argv[])
{
    ...
    MPI_Init(&argc, &argv);
    #pragma acc data copyin(a[0:nnode], b[0:nnode])
    {
        ...
        double s = scalar(n, a, b, MPI_COMM_WORLD);
        ...
    }
    MPI_Finalize();
    ...
}
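For orientation, here is a minimal, self-contained sketch of how the fragments above could be assembled into skalar.cpp; the local vector length nnode and the test data are assumptions for illustration, not part of the original code:

#include <iostream>
#include <mpi.h>

double scalar(const int N, const double x[], const double y[]);                        // OpenACC version above
double scalar(const int n, const double x[], const double y[], const MPI_Comm icomm);  // MPI version above

int main(int argc, char* argv[])
{
    MPI_Init(&argc, &argv);

    const int nnode = 1000000;                // assumed local vector length per process
    double* a = new double[nnode];
    double* b = new double[nnode];
    for (int i = 0; i < nnode; ++i) {         // assumed test data
        a[i] = 1.0;
        b[i] = 2.0;
    }

    // keep the local vectors on the GPU for the whole computation
    #pragma acc data copyin(a[0:nnode], b[0:nnode])
    {
        const double s = scalar(nnode, a, b, MPI_COMM_WORLD);
        int myrank;
        MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
        if (myrank == 0) std::cout << "global inner product = " << s << "\n";
    }

    delete[] a;
    delete[] b;
    MPI_Finalize();
    return 0;
}

Run with 4 processes as shown above: each rank computes its local partial sum on its GPU, and MPI_Allreduce delivers the global inner product to every rank.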
Compilation with pgc++:

> pgc++ -Mmpi=mpich -fast -acc -ta=nvidia:cc2+,cuda5.5,fastmath skalar.cpp -o main.PGI_MPI_
The assignment of GPU devices to the MPI processes has to be done as follows:
#ifdef _OPENACC
#include <accel.h>                    // OpenACC
#endif
#include <iostream>
#include <mpi.h>
...
{
    int myrank;
    MPI_Comm_rank(icomm, &myrank);                                 // my MPI rank
#ifdef _OPENACC
    acc_init(acc_device_nvidia);                                   // initialize the OpenACC runtime
    const int num_dev = acc_get_num_devices(acc_device_nvidia);    // number of GPUs on the node
    const int dev_id  = myrank % num_dev;                          // round-robin mapping of ranks to GPUs
    acc_set_device_num(dev_id, acc_device_nvidia);                 // assign this GPU to the MPI process
    std::cout << "MPI process " << myrank << " is assigned to GPU " << dev_id << "\n";
#endif
}
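As a usage sketch, this device selection can be wrapped in a small helper (the name assign_gpu_to_rank is hypothetical) and called right after MPI_Init, before any OpenACC data region, so that all subsequent device allocations land on the chosen GPU:

#ifdef _OPENACC
#include <accel.h>      // OpenACC runtime API, as in the snippet above
#endif
#include <iostream>
#include <mpi.h>

// hypothetical helper that wraps the device selection shown above
void assign_gpu_to_rank(const MPI_Comm icomm)
{
    int myrank;
    MPI_Comm_rank(icomm, &myrank);
#ifdef _OPENACC
    acc_init(acc_device_nvidia);
    const int num_dev = acc_get_num_devices(acc_device_nvidia);
    const int dev_id  = myrank % num_dev;
    acc_set_device_num(dev_id, acc_device_nvidia);
    std::cout << "MPI process " << myrank << " is assigned to GPU " << dev_id << "\n";
#endif
}

int main(int argc, char* argv[])
{
    MPI_Init(&argc, &argv);
    assign_gpu_to_rank(MPI_COMM_WORLD);   // choose the GPU before any data is moved to the device
    // ... OpenACC data regions and the inner product as above ...
    MPI_Finalize();
    return 0;
}

With more MPI processes than GPUs on a node, the modulo mapping simply shares each GPU among several ranks.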