sheet 5
This commit is contained in:
parent
c13fca92db
commit
e35f3d4488
5 changed files with 473 additions and 0 deletions
99
Sheet_5/bsp_5_1/check_env.h
Normal file
99
Sheet_5/bsp_5_1/check_env.h
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#ifdef _OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
#include <unordered_map>
|
||||
|
||||
//#####################################
|
||||
// G.Haase
|
||||
// See https://sourceforge.net/p/predef/wiki/Compilers/
|
||||
// http://www.cplusplus.com/doc/tutorial/preprocessor/
|
||||
// also: export OMP_DISPLAY_ENV=VERBOSE
|
||||
//#####################################
|
||||
/** Prints a banner describing the build environment to std::cout.
 *
 *  The banner contains the command line, the compiler and its version,
 *  the value of __cplusplus, the parallel programming models that were
 *  enabled at compile time (MPI, OpenMP, OpenACC) and the compilation
 *  date and time. With OpenMP enabled, the number of threads of a
 *  parallel region is printed as well.
 *
 *  @tparam    T    type of the argument counter (usually int)
 *  @param[in] argc number of command line arguments
 *  @param[in] argv command line arguments as array of C-strings
 */
template <class T>
void check_env(T argc, char const *argv[])
{
    std::cout << "\n#######################################################################\n";
    std::cout << "Code :";
    for (T k = 0; k < argc; ++k) std::cout << " " << argv[k];
    std::cout << std::endl;

    // compiler: https://sourceforge.net/p/predef/wiki/Compilers/
    std::cout << "Compiler: ";
#if defined __INTEL_COMPILER
#pragma message(" ########## INTEL ###############")
    std::cout << "INTEL " << __INTEL_COMPILER;
    // Ignore warnings for unrecognized #pragma acc
#pragma warning disable 161
    // Ignore warnings for unrecognized #pragma omp
#pragma warning disable 3180

#elif defined __PGI
#pragma message(" ########## PGI ###############")
    std::cout << "PGI " << __PGIC__ << "." << __PGIC_MINOR__ << "." << __PGIC_PATCHLEVEL__;
#elif defined __clang__
    // must be tested before __GNUC__ because clang defines both macros
#pragma message(" ########## CLANG ###############")
    std::cout << "CLANG " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__;
#elif defined __GNUC__
#pragma message(" ########## Gnu ###############")
    std::cout << "Gnu " << __GNUC__ << "." << __GNUC_MINOR__ << "." << __GNUC_PATCHLEVEL__;
#else
#pragma message(" ########## unknown Compiler ###############")
    std::cout << "unknown";
#endif
    std::cout << " C++ standard: " << __cplusplus << std::endl;

    // Parallel environments
    std::cout << "Parallel: ";
#if defined MPI_VERSION
#pragma message(" ########## MPI ###############")
#ifdef OPEN_MPI
    std::cout << "OpenMPI ";
#else
    std::cout << "MPI ";
#endif
    std::cout << MPI_VERSION << "." << MPI_SUBVERSION << " ";
#endif

#ifdef _OPENMP
    // _OPENMP expands to the release date (yyyymm) of the supported specification.
    // https://www.openmp.org/specifications/
    // https://stackoverflow.com/questions/1304363/how-to-check-the-version-of-openmp-on-linux
    std::unordered_map<unsigned, char const *> const omp_versions{
        {200505, "2.5"}, {200805, "3.0"}, {201107, "3.1"}, {201307, "4.0"}, {201511, "4.5"},
        {201611, "5.0"}, {201811, "5.0"}, {202011, "5.1"}, {202111, "5.2"}, {202411, "6.0"}};
#pragma message(" ########## OPENMP ###############")
    std::cout << "OpenMP ";
    // Plain lookup instead of exceptions: an unknown date is an expected case.
    auto const version = omp_versions.find(_OPENMP);
    if (version != omp_versions.end()) {
        std::cout << version->second;
    }
    else {
        std::cout << _OPENMP;          // unknown release: print the raw date
    }
#pragma omp parallel
    {
#pragma omp master
        {
            const int nn = omp_get_num_threads();          // OpenMP
            std::cout << " ---> " << nn << " Threads ";
        }
#pragma omp barrier
    }

#endif
#ifdef _OPENACC
#pragma message(" ########## OPENACC ###############")
    std::cout << "OpenACC ";
#endif
    std::cout << std::endl;
    std::cout << "Date : " << __DATE__ << " " << __TIME__;
    std::cout << "\n#######################################################################\n";
}
|
||||
// HG
|
||||
|
||||
133
Sheet_5/bsp_5_1/main.cpp
Normal file
133
Sheet_5/bsp_5_1/main.cpp
Normal file
|
|
@ -0,0 +1,133 @@
|
|||
#include "check_env.h"
|
||||
#include "mylib.h"
|
||||
#include <cstdlib> // atoi()
|
||||
#include <cstring> // strncmp()
|
||||
#include <ctime>
|
||||
#include <iostream>
|
||||
#include <omp.h> // OpenMP
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char const *argv[])
|
||||
{
|
||||
//int const NLOOPS = 5; // chose a value such that the benchmark runs at least 10 sec.
|
||||
//unsigned int N = 500000001; // to less memory :(
|
||||
int const NLOOPS = 50; // chose a value such that the benchmark runs at least 10 sec.
|
||||
unsigned int N = 50000001;
|
||||
//##########################################################################
|
||||
// Read Parameter from command line (C++ style)
|
||||
cout << "Checking command line parameters for: -n <number> " << endl;
|
||||
for (int i = 1; i < argc; i++)
|
||||
{
|
||||
cout << " arg[" << i << "] = " << argv[i] << endl;
|
||||
string ss(argv[i]);
|
||||
if ("-n"==ss && i + 1 < argc) // found "-n" followed by another parameter
|
||||
{
|
||||
N = static_cast<unsigned int>(atoi(argv[i + 1]));
|
||||
}
|
||||
else
|
||||
{
|
||||
cout << "Corect call: " << argv[0] << " -n <number>\n";
|
||||
}
|
||||
}
|
||||
|
||||
cout << "\nN = " << N << endl;
|
||||
|
||||
check_env(argc, argv);
|
||||
//########################################################################
|
||||
|
||||
//omp_set_num_threads(16);
|
||||
|
||||
int nthreads; // OpenMP
|
||||
int numproc = omp_get_num_procs();
|
||||
cout << "Es stehen maximal " << numproc << " Kerne zur Verfuegung" << endl;
|
||||
#pragma omp parallel default(none) shared(cout,nthreads)
|
||||
{
|
||||
int const th_id = omp_get_thread_num(); // OpenMP
|
||||
int const nthrds = omp_get_num_threads(); // OpenMP
|
||||
stringstream ss;
|
||||
ss << "C++: Hello World from thread " << th_id << " / " << nthrds << endl;
|
||||
#pragma omp critical
|
||||
{
|
||||
cout << ss.str(); // output to a shared ressource
|
||||
}
|
||||
#pragma omp master
|
||||
nthreads = nthrds; // transfer nn to to master thread
|
||||
}
|
||||
cout << " " << nthreads << " threads have been started." << endl;
|
||||
|
||||
//##########################################################################
|
||||
// Memory allocation
|
||||
cout << "Memory allocation\n";
|
||||
|
||||
vector<double> x(N), y(N);
|
||||
|
||||
cout.precision(2);
|
||||
cout << 2.0 * N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
|
||||
cout.precision(6);
|
||||
|
||||
//##########################################################################
|
||||
// Data initialization
|
||||
// Special: x_i = i+1; y_i = 1/x_i ==> <x,y> == N
|
||||
for (unsigned int i = 0; i < N; ++i)
|
||||
{
|
||||
x[i] = i + 1;
|
||||
y[i] = 1.0 / x[i];
|
||||
}
|
||||
|
||||
//##########################################################################
|
||||
cout << "\nStart Benchmarking\n";
|
||||
|
||||
// Do calculation
|
||||
double tstart = omp_get_wtime(); // OpenMP
|
||||
|
||||
double sk(0.0);
|
||||
for (int i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
sk = scalar(x, y);
|
||||
sk = scalar_trans(x, y);
|
||||
//sk = norm(x);
|
||||
//sk = scalar_manual(x,y);
|
||||
}
|
||||
|
||||
double t1 = omp_get_wtime() - tstart; // OpenMP
|
||||
t1 /= NLOOPS; // divide by number of function calls
|
||||
|
||||
//##########################################################################
|
||||
// Check the correct result
|
||||
cout << "\n <x,y> = " << sk << endl;
|
||||
if (static_cast<unsigned int>(sk) != N)
|
||||
{
|
||||
cout << " !! W R O N G result !!\n";
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
//##########################################################################
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
cout << "Timing in sec. : " << t1 << endl;
|
||||
cout << "GFLOPS : " << 2.0 * N / t1 / 1024 / 1024 / 1024 << endl;
|
||||
cout << "GiByte/s : " << 2.0 * N / t1 / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
|
||||
|
||||
//#########################################################################
|
||||
|
||||
cout << "\n Try the reduction with an STL-vektor (adding)\n";
|
||||
|
||||
auto vr = reduction_vec(100);
|
||||
cout << "done\n";
|
||||
cout << vr << endl;
|
||||
|
||||
|
||||
//#########################################################################
|
||||
|
||||
cout << "\n Try the reduction with an STL-vektor (appending)\n";
|
||||
|
||||
auto vra = reduction_vec_append(10);
|
||||
cout << "done\n";
|
||||
cout << vra << endl;
|
||||
|
||||
|
||||
return 0;
|
||||
} // memory for x and y will be deallocated their destructors
|
||||
140
Sheet_5/bsp_5_1/mylib.cpp
Normal file
140
Sheet_5/bsp_5_1/mylib.cpp
Normal file
|
|
@ -0,0 +1,140 @@
|
|||
#include "mylib.h"
#include <algorithm>         // transform()
#include <cassert>           // assert()
#include <cmath>
#include <functional>        // multiplies<>{}
#include <iostream>
#include <iterator>          // begin(), cbegin(), cend()
#include <list>
#include <numeric>           // iota()
#ifdef _OPENMP
#include <omp.h>
#endif
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
// Inner product <x,y> of two vectors of equal length.
// The loop iterations are distributed over the OpenMP threads; the partial
// sums of the threads are combined by the reduction clause.
double scalar(std::vector<double> const &x, std::vector<double> const &y)
{
    assert(x.size() == y.size());     // switch off via compile flag: -DNDEBUG
    std::size_t const N = x.size();
    double sum = 0.0;
#pragma omp parallel for default(none) shared(x,y,N) reduction(+:sum)
    for (std::size_t k = 0; k < N; ++k)
    {
        double const product = x[k] * y[k];
        sum += product;
    }
    return sum;
}
|
||||
|
||||
|
||||
// Inner product <x,y> with a hand-written work distribution.
// Each thread sums a contiguous index range [start,end) into a private
// subtotal and adds that subtotal atomically to the shared result.
// Without OpenMP the code degenerates to one "thread" handling all indices.
double scalar_manual(std::vector<double> const &x, std::vector<double> const &y)
{
    assert(x.size() == y.size());
    std::size_t const N = x.size();
    double sum = 0.0;

#pragma omp parallel default(none) shared(x,y,N,sum)
    {
#ifdef _OPENMP
        std::size_t const tid = static_cast<std::size_t>(omp_get_thread_num());
        std::size_t const nt  = static_cast<std::size_t>(omp_get_num_threads());
#else
        // serial build: <omp.h> is not included by this file, use a single chunk
        std::size_t const tid = 0;
        std::size_t const nt  = 1;
#endif

        // manual splitting of the index area; the chunks tile [0,N) without gaps
        std::size_t const start = (N * tid) / nt;
        std::size_t const end   = (N * (tid + 1)) / nt;

        double local_sum = 0.0;
        for (std::size_t i = start; i < end; ++i) {
            local_sum += x[i] * y[i];
        }

        // local subtotal combined to global sum
#pragma omp atomic
        sum += local_sum;
    }

    return sum;
}
|
||||
|
||||
|
||||
// Euclidean (l2) norm of x, i.e. sqrt(sum_i x_i^2).
// The sum of squares is accumulated in parallel via an OpenMP reduction;
// the square root is taken once afterwards. An empty vector yields 0.
double norm(std::vector<double> const &x)
{
    std::size_t const N = x.size();
    double sum = 0.0;
#pragma omp parallel for default(none) shared(x,N) reduction(+:sum)
    for (std::size_t i = 0; i < N; ++i)
    {
        sum += x[i]*x[i];
    }
    return std::sqrt(sum);      // the sum of squares alone is the squared norm
}
|
||||
|
||||
|
||||
|
||||
vector<int> reduction_vec(int n)
|
||||
{
|
||||
vector<int> vec(n);
|
||||
#pragma omp parallel default(none) shared(cout) reduction(VecAdd:vec)
|
||||
{
|
||||
#pragma omp barrier
|
||||
#pragma omp critical
|
||||
cout << omp_get_thread_num() << " : " << vec.size() << endl;
|
||||
#pragma omp barrier
|
||||
iota( vec.begin(),vec.end(), omp_get_thread_num() );
|
||||
#pragma omp barrier
|
||||
|
||||
}
|
||||
return vec;
|
||||
}
|
||||
|
||||
|
||||
vector<int> reduction_vec_append(int n)
|
||||
{
|
||||
vector<int> vec;
|
||||
|
||||
#pragma omp parallel default(none) shared(n,cout) reduction(VecAppend:vec)
|
||||
{
|
||||
std::vector<int> local(n); // local vector of each thread
|
||||
|
||||
// consecutive numbers starting from thread ID
|
||||
iota(local.begin(), local.end(), omp_get_thread_num());
|
||||
|
||||
// output for checking
|
||||
#pragma omp critical
|
||||
cout << "Thread " << omp_get_thread_num() << " local size = " << local.size() << std::endl;
|
||||
|
||||
// append local to vec
|
||||
vec.insert(vec.end(), local.begin(), local.end());
|
||||
}
|
||||
|
||||
return vec;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Inner product <x,y> computed in two steps:
//   1. std::transform stores the componentwise products x_i*y_i in a temporary z,
//   2. an OpenMP reduction over iterators sums the entries of z.
// The temporary costs x.size() doubles of extra memory per call.
double scalar_trans(std::vector<double> const &x, std::vector<double> const &y)
{
    assert(x.size() == y.size());     // switch off via compile flag: -DNDEBUG
    std::vector<double> z(x.size());
    //list<double> z(x.size());       // parallel for-loop on iterators not possible (missing 'operator-')

    std::transform(std::cbegin(x), std::cend(x), std::cbegin(y), std::begin(z), std::multiplies<>{});

    double sum = 0.0;
    // c++-20 CLANG_, ONEAPI_: condition of OpenMP for loop must be a relational comparison,
    // hence '<' instead of '!=' (valid because vector iterators are random access)
#pragma omp parallel for default(none) shared(z) reduction(+:sum)
    for (auto pi = std::cbegin(z); pi < std::cend(z); ++pi)
    {
        sum += *pi;
    }
    return sum;
}
|
||||
|
||||
|
||||
|
||||
93
Sheet_5/bsp_5_1/mylib.h
Normal file
93
Sheet_5/bsp_5_1/mylib.h
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
#pragma once
|
||||
#include <cassert>
|
||||
#include <iomanip> // setw()
|
||||
#include <iostream>
|
||||
#include <omp.h>
|
||||
#include <vector>
|
||||
|
||||
/** Inner product
|
||||
@param[in] x vector
|
||||
@param[in] y vector
|
||||
@return resulting Euclidian inner product <x,y>
|
||||
*/
|
||||
double scalar(std::vector<double> const &x, std::vector<double> const &y);
|
||||
double scalar_manual(std::vector<double> const &x, std::vector<double> const &y);
|
||||
double scalar_trans(std::vector<double> const &x, std::vector<double> const &y);
|
||||
|
||||
|
||||
/** l2-norm
|
||||
@param[in] x vector
|
||||
@return resulting Euclidian norm
|
||||
*/
|
||||
double norm(std::vector<double> const &x);
|
||||
|
||||
/** Componentwise addition of vector @p b onto vector @p a .
 *
 *  Both vectors must have the same length (checked by assert()).
 *
 *  @param[in,out] a vector that receives the sums
 *  @param[in]     b vector whose elements are added
 *  @return reference to @p a after a+=b componentwise
 */
template<class T>
std::vector<T> &operator+=(std::vector<T> &a, std::vector<T> const &b)
{
    assert(a.size()==b.size());
    auto summand = b.begin();
    for (auto target = a.begin(); target != a.end(); ++target, ++summand) {
        *target += *summand;
    }
    return a;
}
|
||||
|
||||
// Declare the reduction operation in OpenMP for an STL-vector
|
||||
// omp_out += omp_in requires operator+=(vector<int> &, vector<int> const &) from above
|
||||
// ------------------------------------------------------------
|
||||
// https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm
|
||||
// https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c
|
||||
// https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size
|
||||
// see also p.74ff in https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf
|
||||
// The private copies start with the neutral element of the respective operation,
// so that the content of the original vector enters the result exactly once:
//   VecAdd    : zero vector of the same length as the original,
//   VecAppend : empty vector.
#pragma omp declare reduction(VecAdd : std::vector<int> : omp_out += omp_in) \
        initializer (omp_priv = std::vector<int>(omp_orig.size(), 0))

#pragma omp declare reduction (VecAppend : std::vector<int> : \
        omp_out.insert(omp_out.end(), omp_in.begin(), omp_in.end())) \
        initializer (omp_priv = std::vector<int>())
|
||||
|
||||
// Templates are n o t possible, i.e. the reduction has to be declared for a specific type.
|
||||
//template <class T>
|
||||
//#pragma omp declare reduction(VecAdd : std::vector<T> : omp_out += omp_in) initializer (omp_priv(omp_orig))
|
||||
// MS: template nach #pragma !?
|
||||
|
||||
// ------------------------------------------------------------
|
||||
|
||||
|
||||
/** Test for vector reduction.
|
||||
*
|
||||
* The thread-private vectors of size @p n are initialized via @f$v_k^{tID}=tID+k@f$.
|
||||
* Afterwards these vectors are accumulated, i.e.,
|
||||
* @f$v_k= \sum_{tID=0}^{numThreads-1} v_k^{tID}@f$.
|
||||
*
|
||||
* @param[in] n size of global/private vector
|
||||
* @return resulting global vector.
|
||||
*/
|
||||
std::vector<int> reduction_vec(int n);
|
||||
|
||||
|
||||
/** Test for vector reduction with appending the local vectors to a global
|
||||
* size of global vector: n*#threads
|
||||
*
|
||||
* @param[in] n size of local/private vector
|
||||
* @return resulting global vector
|
||||
*/
|
||||
std::vector<int> reduction_vec_append(int n);
|
||||
|
||||
|
||||
/** Writes all elements of a vector to an output stream.
 *
 *  Each element is printed with a field width of 4 and followed by one blank.
 *
 *  @param[in,out] s output stream
 *  @param[in]     x vector
 *  @return modified output stream
 */
template <class T>
std::ostream &operator<<(std::ostream &s, std::vector<T> const &x)
{
    for (auto entry = x.begin(); entry != x.end(); ++entry) {
        s << std::setw(4) << *entry;
        s << " ";
    }
    return s;
}
|
||||
|
||||
8
Sheet_5/bsp_5_1/results_speedup.txt
Normal file
8
Sheet_5/bsp_5_1/results_speedup.txt
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
Results of the speedup test
|
||||
threads | time | factor of speedup
|
||||
1 | 0.35s |
|
||||
2 | 0.3s | 1.167
|
||||
4 | 0.31s | 1.129
|
||||
6 | 0.31s | 1.129
|
||||
8 | 0.3s | 1.167
|
||||
16 | 0.31s | 1.129 (overthreading)
|
||||
Loading…
Add table
Add a link
Reference in a new issue