Shared memory programming on Intel Xeon Phi, best practice.
Quick reference (Fortran), home page, tutorial.

Tools:
> icpc -c -openmp -guide -parallel *.cpp 2> gap.txt
make gap_par_report
#pragma ivdep
> export OFFLOAD_REPORT=3
> export OMP_NUM_THREADS=4
> export MIC_ENV_PREFIX=MIC; export MIC_OMP_NUM_THREADS=60
> export MIC_KMP_AFFINITY=balanced
How to OpenMP-parallelize the inner product:
Original code for inner product:double scalar(const int N, const double x[], const double y[]) { double sum = 0.0; for (int i=0; i<N; ++i) { sum += x[i]*y[i]; } return sum; }
// Caller sketch for the serial version: computes the inner product of
// arrays a and b of length n ("..." marks elided setup/teardown code).
int main()
{
...
double s = scalar(n,a,b);
...
}
double scalar(const unsigned int N, const double x[], const double y[]) { double sum = 0.0; unsigned int i; #pragma omp parallel for private(i) shared(x,y) schedule(static) reduction(+:sum) for (i = 0; i < N; ++i) { sum += x[i] * y[i]; } return sum; }
// Offload caller sketch: maps x and y (N elements each) to the coprocessor
// and evaluates the inner product inside the target region.
// NOTE(review): the opening brace of a "target" region must be on the line
// AFTER the pragma, and scalar() would need "#pragma omp declare target"
// to be callable inside the region -- confirm against the full example.
int main(int argc, char **argv) {
...
#pragma omp target map(to:y[0:N],x[0:N]) {
double s = scalar(N, x, y);
}
...
}
Compile with GCC:
> g++ -fopenmp -fopenmp-simd skalar.cpp
or with the Intel compiler (offload build):
> icpc -openmp -offload-attribute-target=mic skalar.cpp
Hint: see "Phi Programming for CUDA Developers".