From e35f3d4488eedbffc4e6f58cf3891a485cfaaae6 Mon Sep 17 00:00:00 2001
From: "g.mandl"
Date: Wed, 10 Dec 2025 17:11:35 +0100
Subject: [PATCH] sheet 5

---
Reviewer notes (text in this section is ignored when the patch is applied):
 * This patch was recovered from a copy in which all line breaks were
   collapsed and every <...> span had been removed.  Line structure was
   rebuilt from the '+' markers; the per-file line counts match the
   diffstat below.  Indentation is conventional, not original.
 * Include targets and template arguments were re-derived from the names
   each file uses; please confirm them against the working tree.  String
   literals that lost text (e.g. "-n ", "\n = ") and the author address
   were left as found.
 * Code changes: main.cpp consumes the value after "-n" (argv[++i]) so a
   correct call no longer prints the usage hint; check_env.h knows the
   OpenMP 5.1/5.2 version macros.  Everything else is comment-only.
 * The 'index' blob ids below predate these edits.

 Sheet_5/bsp_5_1/check_env.h         |  99 ++++++++++++++++++++
 Sheet_5/bsp_5_1/main.cpp            | 133 ++++++++++++++++++++++++++
 Sheet_5/bsp_5_1/mylib.cpp           | 140 ++++++++++++++++++++++++++++
 Sheet_5/bsp_5_1/mylib.h             |  93 ++++++++++++++++++
 Sheet_5/bsp_5_1/results_speedup.txt |   8 ++
 5 files changed, 473 insertions(+)
 create mode 100644 Sheet_5/bsp_5_1/check_env.h
 create mode 100644 Sheet_5/bsp_5_1/main.cpp
 create mode 100644 Sheet_5/bsp_5_1/mylib.cpp
 create mode 100644 Sheet_5/bsp_5_1/mylib.h
 create mode 100644 Sheet_5/bsp_5_1/results_speedup.txt

diff --git a/Sheet_5/bsp_5_1/check_env.h b/Sheet_5/bsp_5_1/check_env.h
new file mode 100644
index 0000000..41bd99d
--- /dev/null
+++ b/Sheet_5/bsp_5_1/check_env.h
@@ -0,0 +1,99 @@
+#pragma once
+
+#include <iostream>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#include <unordered_map>
+
+//#####################################
+// G.Haase
+// See https://sourceforge.net/p/predef/wiki/Compilers/
+// http://www.cplusplus.com/doc/tutorial/preprocessor/
+// also: export OMP_DISPLAY_ENV=VERBOSE
+//#####################################
+/** Checks for compilers, their versions, threads etc.
+ *
+    @param[in] argc number of command line arguments
+    @param[in] argv command line arguments as array of C-strings
+*/
+template <class T>
+void check_env(T argc, char const *argv[])
+{
+    std::cout << "\n#######################################################################\n";
+    std::cout << "Code :";
+    for (T k = 0; k < argc; ++k) std::cout << " " << argv[k];
+    std::cout << std::endl;
+
+    // compiler: https://sourceforge.net/p/predef/wiki/Compilers/
+    std::cout << "Compiler: ";
+#if defined __INTEL_COMPILER
+#pragma message(" ########## INTEL ###############")
+    std::cout << "INTEL " << __INTEL_COMPILER;
+    // Ignore warnings for unrecognized #pragma acc
+#pragma warning disable 161
+    // Ignore warnings for unrecognized #pragma omp
+#pragma warning disable 3180
+
+#elif defined __PGI
+#pragma message(" ########## PGI ###############")
+    std::cout << "PGI " << __PGIC__ << "." << __PGIC_MINOR__ << "." << __PGIC_PATCHLEVEL__;
+#elif defined __clang__
+#pragma message(" ########## CLANG ###############")
+    std::cout << "CLANG " << __clang_major__ << "." << __clang_minor__ << "."; // << __clang_patchlevel__;
+#elif defined __GNUC__
+#pragma message(" ########## Gnu ###############")
+    std::cout << "Gnu " << __GNUC__ << "." << __GNUC_MINOR__ << "." << __GNUC_PATCHLEVEL__;
+#else
+#pragma message(" ########## unknown Compiler ###############")
+    std::cout << "unknown";
+#endif
+    std::cout << " C++ standard: " << __cplusplus << std::endl;
+
+    // Parallel environments
+    std::cout << "Parallel: ";
+#if defined MPI_VERSION
+#pragma message(" ########## MPI ###############")
+#ifdef OPEN_MPI
+    std::cout << "OpenMPI ";
+#else
+    std::cout << "MPI ";
+#endif
+    std::cout << MPI_VERSION << "." << MPI_SUBVERSION << " ";
+#endif
+
+#ifdef _OPENMP
+//https://www.openmp.org/specifications/
+//https://stackoverflow.com/questions/1304363/how-to-check-the-version-of-openmp-on-linux
+    std::unordered_map<int, std::string> const map{
+        {200505, "2.5"}, {200805, "3.0"}, {201107, "3.1"}, {201307, "4.0"}, {201511, "4.5"}, {201611, "5.0"}, {201811, "5.0"}, {202011, "5.1"}, {202111, "5.2"}};
+#pragma message(" ########## OPENMP ###############")
+    //std::cout << _OPENMP;
+    std::cout << "OpenMP ";
+    try {
+        std::cout << map.at(_OPENMP);
+    }
+    catch (...) {
+        std::cout << _OPENMP;   // unknown version macro: print the raw value
+    }
+    #pragma omp parallel
+    {
+        #pragma omp master
+        {
+            const int nn = omp_get_num_threads(); // OpenMP
+            std::cout << " ---> " << nn << " Threads ";
+        }
+        #pragma omp barrier
+    }
+
+#endif
+#ifdef _OPENACC
+#pragma message(" ########## OPENACC ###############")
+    std::cout << "OpenACC ";
+#endif
+    std::cout << std::endl;
+    std::cout << "Date : " << __DATE__ << " " << __TIME__;
+    std::cout << "\n#######################################################################\n";
+}
+// HG
+
diff --git a/Sheet_5/bsp_5_1/main.cpp b/Sheet_5/bsp_5_1/main.cpp
new file mode 100644
index 0000000..d98ee98
--- /dev/null
+++ b/Sheet_5/bsp_5_1/main.cpp
@@ -0,0 +1,133 @@
+#include "check_env.h"
+#include "mylib.h"
+#include <cstdlib> // atoi()
+#include <cstring> // strncmp()
+#include <iostream>
+#include <vector>
+#include <omp.h> // OpenMP
+#include <sstream>
+#include <string>
+using namespace std;
+
+int main(int argc, char const *argv[])
+{
+    //int const NLOOPS = 5; // choose a value such that the benchmark runs at least 10 sec.
+    //unsigned int N = 500000001; // too little memory :(
+    int const NLOOPS = 50; // choose a value such that the benchmark runs at least 10 sec.
+    unsigned int N = 50000001;
+//##########################################################################
+// Read Parameter from command line (C++ style)
+    cout << "Checking command line parameters for: -n " << endl;
+    for (int i = 1; i < argc; i++)
+    {
+        cout << " arg[" << i << "] = " << argv[i] << endl;
+        string ss(argv[i]);
+        if ("-n"==ss && i + 1 < argc) // found "-n" followed by another parameter
+        {
+            N = static_cast<unsigned int>(atoi(argv[++i])); // ++i consumes the value so it is not re-parsed as an option
+        }
+        else
+        {
+            cout << "Corect call: " << argv[0] << " -n \n";
+        }
+    }
+
+    cout << "\nN = " << N << endl;
+
+    check_env(argc, argv);
+//########################################################################
+
+    //omp_set_num_threads(16);
+
+    int nthreads; // OpenMP
+    int numproc = omp_get_num_procs();
+    cout << "Es stehen maximal " << numproc << " Kerne zur Verfuegung" << endl;
+    #pragma omp parallel default(none) shared(cout,nthreads)
+    {
+        int const th_id = omp_get_thread_num(); // OpenMP
+        int const nthrds = omp_get_num_threads(); // OpenMP
+        stringstream ss;
+        ss << "C++: Hello World from thread " << th_id << " / " << nthrds << endl;
+        #pragma omp critical
+        {
+            cout << ss.str(); // output to a shared resource
+        }
+        #pragma omp master
+        nthreads = nthrds; // master thread stores the team size in the shared variable
+    }
+    cout << " " << nthreads << " threads have been started." << endl;
+
+//##########################################################################
+// Memory allocation
+    cout << "Memory allocation\n";
+
+    vector<double> x(N), y(N);
+
+    cout.precision(2);
+    cout << 2.0 * N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
+    cout.precision(6);
+
+//##########################################################################
+// Data initialization
+// Special: x_i = i+1; y_i = 1/x_i ==> <x,y> == N
+    for (unsigned int i = 0; i < N; ++i)
+    {
+        x[i] = i + 1;
+        y[i] = 1.0 / x[i];
+    }
+
+//##########################################################################
+    cout << "\nStart Benchmarking\n";
+
+// Do calculation
+    double tstart = omp_get_wtime(); // OpenMP
+
+    double sk(0.0);
+    for (int i = 0; i < NLOOPS; ++i)
+    {
+        sk = scalar(x, y);
+        sk = scalar_trans(x, y);
+        //sk = norm(x);
+        //sk = scalar_manual(x,y);
+    }
+
+    double t1 = omp_get_wtime() - tstart; // OpenMP
+    t1 /= NLOOPS; // average per loop iteration (one scalar + one scalar_trans call each)
+
+//##########################################################################
+// Check the correct result
+    cout << "\n = " << sk << endl;
+    if (static_cast<unsigned int>(sk) != N)
+    {
+        cout << " !! W R O N G result !!\n";
+    }
+    cout << endl;
+
+//##########################################################################
+// Timings and Performance
+    cout << endl;
+    cout.precision(2);
+    cout << "Timing in sec. : " << t1 << endl;
+    cout << "GFLOPS : " << 2.0 * N / t1 / 1024 / 1024 / 1024 << endl;
+    cout << "GiByte/s : " << 2.0 * N / t1 / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
+
+//#########################################################################
+
+    cout << "\n Try the reduction with an STL-vektor (adding)\n";
+
+    auto vr = reduction_vec(100);
+    cout << "done\n";
+    cout << vr << endl;
+
+
+//#########################################################################
+
+    cout << "\n Try the reduction with an STL-vektor (appending)\n";
+
+    auto vra = reduction_vec_append(10);
+    cout << "done\n";
+    cout << vra << endl;
+
+
+    return 0;
+} // memory for x and y will be deallocated by their destructors
diff --git a/Sheet_5/bsp_5_1/mylib.cpp b/Sheet_5/bsp_5_1/mylib.cpp
new file mode 100644
index 0000000..332f4c3
--- /dev/null
+++ b/Sheet_5/bsp_5_1/mylib.cpp
@@ -0,0 +1,140 @@
+#include "mylib.h"
+#include <cassert> // assert()
+#include <algorithm>
+#include <iostream>
+#include <functional> // multiplies<>{}
+#include <iterator>
+#include <numeric> // iota()
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#include <vector>
+using namespace std;
+
+double scalar(vector<double> const &x, vector<double> const &y)
+{
+    assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
+    size_t const N = x.size();
+    double sum = 0.0;
+// int check; // for using omp_in_parallel
+//#pragma omp parallel for default(none) shared(x,y,N,check) reduction(+:sum)
+#pragma omp parallel for default(none) shared(x,y,N) reduction(+:sum)
+    for (size_t i = 0; i < N; ++i)
+    {
+        sum += x[i] * y[i];
+// check = omp_in_parallel();
+        //sum += exp(x[i])*log(y[i]);
+    }
+// cout << "in parallel: " << check << endl;
+    return sum;
+}
+
+
+double scalar_manual(vector<double> const &x, vector<double> const &y)
+{
+    assert(x.size() == y.size());
+    size_t const N = x.size();
+    double sum = 0.0;
+
+    #pragma omp parallel default(none) shared(x,y,N,sum)
+    {
+        int tid = omp_get_thread_num();
+        int nt = omp_get_num_threads();
+
+        // manual splitting of the index range: thread tid handles [start, end)
+        size_t start = (N * tid) / nt;
+        size_t end = (N * (tid+1)) / nt;
+
+        double local_sum = 0.0;
+        for (size_t i = start; i < end; ++i) {
+            local_sum += x[i] * y[i];
+        }
+
+        // each thread adds its local subtotal to the shared sum; atomic avoids a data race
+        #pragma omp atomic
+        sum += local_sum;
+    }
+
+    return sum;
+}
+
+
+double norm(vector<double> const &x)
+{
+    size_t const N = x.size();
+    double sum = 0.0;
+#pragma omp parallel for default(none) shared(x,N) reduction(+:sum)
+    for (size_t i = 0; i < N; ++i)
+    {
+        sum += x[i]*x[i];
+    }
+    return sum;
+}
+
+
+
+vector<int> reduction_vec(int n)
+{
+    vector<int> vec(n);
+#pragma omp parallel default(none) shared(cout) reduction(VecAdd:vec)
+    {
+        #pragma omp barrier
+        #pragma omp critical
+        cout << omp_get_thread_num() << " : " << vec.size() << endl;
+        #pragma omp barrier
+        iota( vec.begin(),vec.end(), omp_get_thread_num() );
+        #pragma omp barrier
+
+    }
+    return vec;
+}
+
+
+vector<int> reduction_vec_append(int n)
+{
+    vector<int> vec;
+
+#pragma omp parallel default(none) shared(n,cout) reduction(VecAppend:vec)
+    {
+        std::vector<int> local(n); // local vector of each thread
+
+        // consecutive numbers starting from thread ID
+        iota(local.begin(), local.end(), omp_get_thread_num());
+
+        // output for checking
+        #pragma omp critical
+        cout << "Thread " << omp_get_thread_num() << " local size = " << local.size() << std::endl;
+
+        // append local to the thread-private copy of vec (copies are combined by VecAppend)
+        vec.insert(vec.end(), local.begin(), local.end());
+    }
+
+    return vec;
+}
+
+
+
+double scalar_trans(vector<double> const &x, vector<double> const &y)
+{
+    assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
+    vector<double> z(x.size());
+    //list<double> z(x.size()); // parallel for-loop on iterators not possible (missing 'operator-')
+    // c++-20 CLANG_, ONEAPI_:condition of OpenMP for loop must be a relational comparison
+
+    transform(cbegin(x),cend(x),cbegin(y),begin(z),std::multiplies<>{});
+
+    double sum = 0.0;
+#pragma omp parallel for default(none) shared(z) reduction(+:sum)
+    for (auto pi = cbegin(z); pi!=cend(z); ++pi)
+    {
+        sum += *pi;
+    }
+    //for (auto val: z)
+    //{
+    //sum += val;
+    //}
+    return sum;
+}
+
+
+
diff --git a/Sheet_5/bsp_5_1/mylib.h b/Sheet_5/bsp_5_1/mylib.h
new file mode 100644
index 0000000..0b59849
--- /dev/null
+++ b/Sheet_5/bsp_5_1/mylib.h
@@ -0,0 +1,93 @@
+#pragma once
+#include <cassert>
+#include <iomanip> // setw()
+#include <iostream>
+#include <ostream>
+#include <vector>
+
+/** Inner product
+    @param[in] x vector
+    @param[in] y vector
+    @return resulting Euclidean inner product
+*/
+double scalar(std::vector<double> const &x, std::vector<double> const &y);
+double scalar_manual(std::vector<double> const &x, std::vector<double> const &y);
+double scalar_trans(std::vector<double> const &x, std::vector<double> const &y);
+
+
+/** Squared l2-norm (the implementation takes no square root)
+    @param[in] x vector
+    @return sum of the squared entries of @p x
+*/
+double norm(std::vector<double> const &x);
+
+/** Vector @p b adds its elements to vector @p a .
+    @param[in] a vector
+    @param[in] b vector
+    @return a+=b componentwise
+*/
+template <class T>
+std::vector<T> &operator+=(std::vector<T> &a, std::vector<T> const &b)
+{
+    assert(a.size()==b.size());
+    for (size_t k = 0; k < a.size(); ++k) {
+        a[k] += b[k];
+    }
+    return a;
+}
+
+// Declare the reduction operation in OpenMP for an STL-vector
+// omp_out += omp_in requires operator+=(vector<int> &, vector<int> const &) from above
+// ------------------------------------------------------------
+// https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm
+// https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c
+// https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size
+// see also p.74ff in https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf
+#pragma omp declare reduction(VecAdd : std::vector<int> : omp_out += omp_in) \
+    initializer (omp_priv=omp_orig)
+
+#pragma omp declare reduction (VecAppend : std::vector<int> : \
+    omp_out.insert(omp_out.end(), omp_in.begin(), omp_in.end())) \
+    initializer (omp_priv = omp_orig)
+
+// Templates are n o t possible, i.e. the reduction has to be declared for a specific type.
+//template <class T>
+//#pragma omp declare reduction(VecAdd : std::vector<T> : omp_out += omp_in) initializer (omp_priv(omp_orig))
+// MS: template after #pragma !?
+
+// ------------------------------------------------------------
+
+
+/** Test for vector reduction.
+ *
+ * The thread-private vectors of size @p n are initialized via @f$v_k^{tID}=tID+k@f$.
+ * Afterwards these vectors are accumulated, i.e.,
+ * @f$v_k= \sum_{tID=0}^{numThreads-1} v_k^{tID}@f$.
+ *
+ * @param[in] n size of global/private vector
+ * @return resulting global vector.
+*/
+std::vector<int> reduction_vec(int n);
+
+
+/** Test for vector reduction with appending the local vectors to a global
+ * size of global vector: n*#threads
+ *
+ * @param[in] n size of local/private vector
+ * @return resulting global vector
+ */
+std::vector<int> reduction_vec_append(int n);
+
+
+/** Output of a vector.
+    @param[in,out] s output stream
+    @param[in] x vector
+    @return modified output stream
+*/
+template <class T>
+std::ostream &operator<<(std::ostream &s, std::vector<T> const &x)
+{
+    for (auto const &v : x) s << std::setw(4) << v << " ";
+    return s;
+}
+
diff --git a/Sheet_5/bsp_5_1/results_speedup.txt b/Sheet_5/bsp_5_1/results_speedup.txt
new file mode 100644
index 0000000..6ada9a8
--- /dev/null
+++ b/Sheet_5/bsp_5_1/results_speedup.txt
@@ -0,0 +1,8 @@
+Results of the speedup test
+threads | time  | factor of speedup
+      1 | 0.35s |
+      2 | 0.3s  | 1.167
+      4 | 0.31s | 1.129
+      6 | 0.31s | 1.129
+      8 | 0.3s  | 1.167
+     16 | 0.31s | 1.129 (overthreading)
\ No newline at end of file