Shared memory programming on Intel Xeon Phi, best practice.
Quick reference (Fortran), home page, tutorial. Tools:
> icpc -c -openmp -guide -parallel *.cpp 2> gap.txt
> make gap_par_report
#pragma ivdep
> export OFFLOAD_REPORT=3
> export OMP_NUM_THREADS=4
> export MIC_ENV_PREFIX=MIC; export MIC_OMP_NUM_THREADS=60
> export MIC_KMP_AFFINITY=balanced
How to OpenMP-parallelize the inner product:
/* Original code for inner product:
 * Serial dot product: returns sum over i of x[i]*y[i] for i in [0, N). */
double scalar(const int N, const double x[], const double y[])
{
    double result = 0.0;
    for (int k = 0; k < N; ++k)
        result += x[k] * y[k];
    return result;
}
/* Driver sketch (slide excerpt): the elided `...` parts are assumed to
 * allocate/initialize n, a and b before the call — TODO confirm against
 * the full example source. */
int main()
{
...
/* Serial inner product of a and b, each of length n. */
double s = scalar(n,a,b);
...
}
/* OpenMP-parallel inner product: returns sum over i of x[i]*y[i] for
 * i in [0, N).
 *
 * Fixes vs. the slide version: N is `const int` to match the serial
 * version and its caller (the slide changed it to `unsigned int`, an
 * inconsistent signature inviting signed/unsigned mixups); the loop
 * index is declared in the for-init (C99, as in the serial version), so
 * the redundant private(i) clause is dropped — the iteration variable
 * of a `parallel for` loop is predetermined private anyway. */
double scalar(const int N, const double x[], const double y[])
{
    double sum = 0.0;
    /* Each thread accumulates a private partial sum; reduction(+:sum)
     * combines them into the final result. */
    #pragma omp parallel for shared(x, y) schedule(static) reduction(+:sum)
    for (int i = 0; i < N; ++i) {
        sum += x[i] * y[i];
    }
    return sum;
}
/* Offload driver sketch (slide excerpt): the elided `...` parts are
 * assumed to initialize N, x and y on the host — TODO confirm against
 * the full example source. */
int main(int argc, char **argv)
{
...
/* Run the inner product inside a target region; map(to:...) copies the
 * N-element x and y arrays to the device. NOTE(review): s is declared
 * inside the target region, so the result is not mapped back to the
 * host — presumably intentional for this slide; verify. */
#pragma omp target map(to:y[0:N],x[0:N])
{
double s = scalar(N, x, y);
}
...
}
(g++ -fopenmp -fopenmp-simd skalar.cpp)
icpc -openmp -offload-attribute-target=mic skalar.cpp
Hint: Phi Programming for CUDA developers,