diff --git a/ex5/ex5_1/Makefile b/ex5/ex5_1/Makefile new file mode 100644 index 0000000..f5bc097 --- /dev/null +++ b/ex5/ex5_1/Makefile @@ -0,0 +1,30 @@ +# +# use GNU-Compiler tools +COMPILER=GCC_ +# alternatively from the shell +# export COMPILER=GCC_ +# or, alternatively from the shell +# make COMPILER=GCC_ + +# use Intel compilers +#COMPILER=ICC_ + +# use PGI compilers +# COMPILER=PGI_ + + +SOURCES = main.cpp mylib.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +PROGRAM = main.${COMPILER} + +# uncomment the next to lines for debugging and detailed performance analysis +CXXFLAGS += -g +LINKFLAGS += -g +# do not use -pg with PGI compilers + +ifndef COMPILER + COMPILER=GCC_ +endif + +include ../${COMPILER}default.mk diff --git a/ex5/ex5_1/check_env.h b/ex5/ex5_1/check_env.h new file mode 100644 index 0000000..41bd99d --- /dev/null +++ b/ex5/ex5_1/check_env.h @@ -0,0 +1,99 @@ +#pragma once + +#include +#ifdef _OPENMP +#include +#endif +#include + +//##################################### +// G.Haase +// See https://sourceforge.net/p/predef/wiki/Compilers/ +// http://www.cplusplus.com/doc/tutorial/preprocessor/ +// also: export OMP_DISPLAY_ENV=VERBOSE +//##################################### +/** Checks for compilers, its versions, threads etc. + * + @param[in] argc number of command line arguemnts + @param[in] argv command line arguments as array of C-strings +*/ +template +void check_env(T argc, char const *argv[]) +{ + std::cout << "\n#######################################################################\n"; + std::cout << "Code :"; + for (T k = 0; k < argc; ++k) std::cout << " " << argv[k]; + std::cout << std::endl; + + // compiler: https://sourceforge.net/p/predef/wiki/Compilers/ + std::cout << "Compiler: "; +#if defined __INTEL_COMPILER +#pragma message(" ########## INTEL ###############") + std::cout << "INTEL " << __INTEL_COMPILER; + // Ignore warnings for #pragma acc unrecognice +#pragma warning disable 161 + // Ignore warnings for #pragma omp unrecognice +#pragma warning disable 3180 + +#elif defined __PGI +#pragma message(" ########## PGI ###############") + std::cout << "PGI " << __PGIC__ << "." << __PGIC_MINOR__ << "." << __PGIC_PATCHLEVEL__; +#elif defined __clang__ +#pragma message(" ########## CLANG ###############") + std::cout << "CLANG " << __clang_major__ << "." << __clang_minor__ << "."; // << __clang_patchlevel__; +#elif defined __GNUC__ +#pragma message(" ########## Gnu ###############") + std::cout << "Gnu " << __GNUC__ << "." << __GNUC_MINOR__ << "." << __GNUC_PATCHLEVEL__; +#else +#pragma message(" ########## unknown Compiler ###############") + std::cout << "unknown"; +#endif + std::cout << " C++ standard: " << __cplusplus << std::endl; + + // Parallel environments + std::cout << "Parallel: "; +#if defined MPI_VERSION +#pragma message(" ########## MPI ###############") +#ifdef OPEN_MPI + std::cout << "OpenMPI "; +#else + std::cout << "MPI "; +#endif + std::cout << MPI_VERSION << "." << MPI_SUBVERSION << " "; +#endif + +#ifdef _OPENMP +//https://www.openmp.org/specifications/ +//https://stackoverflow.com/questions/1304363/how-to-check-the-version-of-openmp-on-linux + std::unordered_map const map{ + {200505, "2.5"}, {200805, "3.0"}, {201107, "3.1"}, {201307, "4.0"}, {201511, "4.5"}, {201611, "5.0"}, {201811, "5.0"}}; +#pragma message(" ########## OPENMP ###############") + //std::cout << _OPENMP; + std::cout << "OpenMP "; + try { + std::cout << map.at(_OPENMP); + } + catch (...) { + std::cout << _OPENMP; + } + #pragma omp parallel + { + #pragma omp master + { + const int nn = omp_get_num_threads(); // OpenMP + std::cout << " ---> " << nn << " Threads "; + } + #pragma omp barrier + } + +#endif +#ifdef _OPENACC +#pragma message(" ########## OPENACC ###############") + std::cout << "OpenACC "; +#endif + std::cout << std::endl; + std::cout << "Date : " << __DATE__ << " " << __TIME__; + std::cout << "\n#######################################################################\n"; +} +// HG + diff --git a/ex5/ex5_1/main.cpp b/ex5/ex5_1/main.cpp new file mode 100644 index 0000000..c261c29 --- /dev/null +++ b/ex5/ex5_1/main.cpp @@ -0,0 +1,142 @@ +#include "check_env.h" +#include "mylib.h" +#include // atoi() +#include // strncmp() +#include +#include +#include // OpenMP +#include +#include +using namespace std; + +int main(int argc, char const *argv[]) +{ + omp_set_schedule(omp_sched_static, 2000000); + //omp_set_schedule(omp_sched_dynamic, 1000000); + //omp_set_schedule(omp_sched_guided, 1000000); + //omp_set_schedule(omp_sched_auto, 1); // chunk size does not matter for auto + + + // Speedup for different number of cores (incl. hyperthreading) + omp_set_num_threads(8); + + // Print number of available processors + cout << "Number of available processors: " << omp_get_num_procs() << endl; + + // Currently executing parallel code? -> no + cout << "Currently in parallel? " << omp_in_parallel() << endl; + + + int const NLOOPS = 10; // chose a value such that the benchmark runs at least 10 sec. + unsigned int N = 500000001; + + +//########################################################################## +// Read Parameter from command line (C++ style) + cout << "Checking command line parameters for: -n " << endl; + for (int i = 1; i < argc; i++) + { + cout << " arg[" << i << "] = " << argv[i] << endl; + string ss(argv[i]); + if ("-n"==ss && i + 1 < argc) // found "-n" followed by another parameter + { + N = static_cast(atoi(argv[i + 1])); + } + else + { + cout << "Corect call: " << argv[0] << " -n \n"; + } + } + + cout << "\nN = " << N << endl; + + check_env(argc, argv); +//######################################################################## + int nthreads; // OpenMP + #pragma omp parallel default(none) shared(cout,nthreads) + { + stringstream inparallel; + inparallel << "Currently in parallel? " << omp_in_parallel() << endl; + + + int const th_id = omp_get_thread_num(); // OpenMP + int const nthrds = omp_get_num_threads(); // OpenMP + stringstream ss; + ss << "C++: Hello World from thread " << th_id << " / " << nthrds << endl; + #pragma omp critical + { + cout << ss.str(); // output to a shared ressource + cout << inparallel.str() << endl; + } + #pragma omp master + nthreads = nthrds; // transfer nn to to master thread + } + cout << " " << nthreads << " threads have been started." << endl; + +//########################################################################## +// Memory allocation + cout << "Memory allocation\n"; + + vector x(N), y(N); + + cout.precision(2); + cout << 2.0 * N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n"; + cout.precision(6); + +//########################################################################## +// Data initialization +// Special: x_i = i+1; y_i = 1/x_i ==> == N + for (unsigned int i = 0; i < N; ++i) + { + x[i] = i + 1; + y[i] = 1.0 / x[i]; + } + +//########################################################################## + cout << "\nStart Benchmarking\n"; + +// Do calculation + double tstart = omp_get_wtime(); // OpenMP + + double sk(0.0); + for (int i = 0; i < NLOOPS; ++i) + { + //sk = scalar(x, y); + sk = scalar_parallel(x, y); + //sk = scalar_trans(x, y); + //sk = norm(x); + } + + double t1 = omp_get_wtime() - tstart; // OpenMP + + t1 /= NLOOPS; // divide by number of function calls + +//########################################################################## +// Check the correct result + cout << "\n = " << sk << endl; + if (static_cast(sk) != N) + { + cout << " !! W R O N G result !!\n"; + } + cout << endl; + +//########################################################################## +// Timings and Performance + cout << endl; + cout.precision(2); + cout << "Total benchmarking time: " << t1*NLOOPS << endl; + cout << "Timing in sec. : " << t1 << endl; + cout << "GFLOPS : " << 2.0 * N / t1 / 1024 / 1024 / 1024 << endl; + cout << "GiByte/s : " << 2.0 * N / t1 / 1024 / 1024 / 1024 * sizeof(x[0]) << endl; + +//######################################################################### + + cout << "\n Try the reduction with an STL-vektor \n"; + + auto vr = reduction_vec_append(5); + cout << "done\n"; + cout << vr << endl; + + + return 0; +} // memory for x and y will be deallocated their destructors diff --git a/ex5/ex5_1/mylib.cpp b/ex5/ex5_1/mylib.cpp new file mode 100644 index 0000000..b5f2697 --- /dev/null +++ b/ex5/ex5_1/mylib.cpp @@ -0,0 +1,137 @@ +#include "mylib.h" +#include // assert() +#include +#include +#include // multiplies<>{} +#include +#include // iota() +#ifdef _OPENMP +#include +#endif +#include +using namespace std; + + +double scalar_parallel(vector const &x, vector const &y) +{ + assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG + size_t const N = x.size(); + double sum = 0.0; +#pragma omp parallel default(none) shared(x,y,N, cout) reduction(+:sum) + { + const size_t nthreads = omp_get_num_threads(); + const size_t threadnum = omp_get_thread_num(); + const size_t chunksize = N/nthreads; + + + size_t start = threadnum*chunksize; + size_t end = start + chunksize; + if (threadnum == nthreads - 1) + end = N; + + + for (size_t i = start; i < end; ++i) + { + sum += x[i] * y[i]; + } + + } + return sum; +} + + +vector reduction_vec_append(int n) +{ + vector vec(n); +#pragma omp parallel default(none) shared(cout) reduction(VecAppend:vec) + { + #pragma omp barrier + #pragma omp critical + cout << omp_get_thread_num() << " : " << vec.size() << endl; + #pragma omp barrier + iota( vec.begin(),vec.end(), omp_get_thread_num() ); + #pragma omp barrier + + } + return vec; +} + + + + + + + + + + + +double scalar(vector const &x, vector const &y) +{ + assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG + size_t const N = x.size(); + double sum = 0.0; +#pragma omp parallel for default(none) shared(x,y,N) reduction(+:sum) schedule(runtime) // added schedule(runtime) + for (size_t i = 0; i < N; ++i) + { + sum += x[i] * y[i]; + //sum += exp(x[i])*log(y[i]); + } + return sum; +} + + +double norm(vector const &x) +{ + size_t const N = x.size(); + double sum = 0.0; +#pragma omp parallel for default(none) shared(x,N) reduction(+:sum) schedule(runtime) // added schedule(runtime) + for (size_t i = 0; i < N; ++i) + { + sum += x[i]*x[i]; + } + return sum; +} + + +vector reduction_vec(int n) +{ + vector vec(n); +#pragma omp parallel default(none) shared(cout) reduction(VecAdd:vec) + { + #pragma omp barrier + #pragma omp critical + cout << omp_get_thread_num() << " : " << vec.size() << endl; + #pragma omp barrier + iota( vec.begin(),vec.end(), omp_get_thread_num() ); + #pragma omp barrier + + } + return vec; +} + + +double scalar_trans(vector const &x, vector const &y) +{ + assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG + vector z(x.size()); + //list z(x.size()); // parallel for-loop on iterators not possible (missing 'operator-') + // c++-20 CLANG_, ONEAPI_:condition of OpenMP for loop must be a relational comparison + + transform(cbegin(x),cend(x),cbegin(y),begin(z),std::multiplies<>{}); + + double sum = 0.0; +#pragma omp parallel for default(none) shared(z) reduction(+:sum) schedule(runtime) // added schedule(runtime) + for (auto pi = cbegin(z); pi!=cend(z); ++pi) + { + sum += *pi; + } + //for (auto val: z) + //{ + //sum += val; + //} + return sum; +} + + + diff --git a/ex5/ex5_1/mylib.h b/ex5/ex5_1/mylib.h new file mode 100644 index 0000000..fe1948a --- /dev/null +++ b/ex5/ex5_1/mylib.h @@ -0,0 +1,88 @@ +#pragma once +#include +#include // setw() +#include +#include +#include + +/** Inner product + @param[in] x vector + @param[in] y vector + @return resulting Euclidian inner product +*/ +double scalar_parallel(std::vector const &x, std::vector const &y); +double scalar(std::vector const &x, std::vector const &y); +double scalar_trans(std::vector const &x, std::vector const &y); + + + +// Declare additional reduction operation in OpenMP for STL-vector +#pragma omp declare reduction(VecAppend : std::vector : omp_out.insert(omp_out.end(), omp_in.begin(), omp_in.end())) \ + initializer (omp_priv=omp_orig) + +std::vector reduction_vec_append(int n); + + +/** l2-norm + @param[in] x vector + @return resulting Euclidian norm +*/ +double norm(std::vector const &x); + +/** Vector @p b adds its elements to vector @p a . + @param[in] a vector + @param[in] b vector + @return a+=b componentwise +*/ +template +std::vector &operator+=(std::vector &a, std::vector const &b) +{ + assert(a.size()==b.size()); + for (size_t k = 0; k < a.size(); ++k) { + a[k] += b[k]; + } + return a; +} + +// Declare the reduction operation in OpenMP for an STL-vector +// omp_out += omp_in requires operator+=(vector &, vector const &) from above +// ------------------------------------------------------------ +// https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm +// https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c +// https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size +// see also p.74ff in https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf +#pragma omp declare reduction(VecAdd : std::vector : omp_out += omp_in) \ + initializer (omp_priv=omp_orig) + +// Templates are n o t possible, i.e. the reduction has to be declared fore a specified type. +//template +//#pragma omp declare reduction(VecAdd : std::vector : omp_out += omp_in) initializer (omp_priv(omp_orig)) +// MS: template nach #pragma !? + +// ------------------------------------------------------------ + +/** Test for vector reduction. + * + * The thread-private vectors of size @p n are initialized via @f$v_k^{tID}=tID+k@f$. + * Afterwards these vectors are accumulated, i.e., + * @f$v_k= \sum_{tID=0}^{numThreads} v_k^{tID}@f$. + * + * @param[in] n size of global/private vector + * @return resulting global vector. +*/ +std::vector reduction_vec(int n); + + + +/** Output of a vector. + @param[in,out] s output stream + @param[in] x vector + @return modified output stream +*/ +template +std::ostream &operator<<(std::ostream &s, std::vector const &x) +{ + for (auto const &v : x) s << std::setw(4) << v << " "; + return s; +} + diff --git a/ex5/ex5_1/timing.h b/ex5/ex5_1/timing.h new file mode 100644 index 0000000..11d8da2 --- /dev/null +++ b/ex5/ex5_1/timing.h @@ -0,0 +1,70 @@ +#pragma once +#include // timing +#include + +using Clock = std::chrono::system_clock; //!< The wall clock timer chosen +//using Clock = std::chrono::high_resolution_clock; +using TPoint= std::chrono::time_point; + +// [Galowicz, C++17 STL Cookbook, p. 29] +inline +std::stack MyStopWatch; //!< starting time of stopwatch + +/** Starts stopwatch timer. + * Use as @code tic(); myfunction(...) ; double tsec = toc(); @endcode + * + * The timining is allowed to be nested and the recent time is stored on top of the stack. + * + * @return recent time + * @see toc + */ +inline auto tic() +{ + MyStopWatch.push(Clock::now()); + return MyStopWatch.top(); +} + +/** Returns the elapsed time from stopwatch. + * + * The time from top of the stack is used + * if time point @p t_b is not passed as input parameter. + * Use as @code tic(); myfunction(...) ; double tsec = toc(); @endcode + * or as @code auto t_b = tic(); myfunction(...) ; double tsec = toc(t_b); @endcode + * The last option is to be used in the case of + * non-nested but overlapping time measurements. + * + * @param[in] t_b start time of some stop watch + * @return elapsed time in seconds. + * +*/ +inline double toc(TPoint const &t_b = MyStopWatch.top()) +{ + // https://en.cppreference.com/w/cpp/chrono/treat_as_floating_point + using Unit = std::chrono::seconds; + using FpSeconds = std::chrono::duration; + auto t_e = Clock::now(); + MyStopWatch.pop(); + return FpSeconds(t_e-t_b).count(); +} + +#include +#include +/** Executes function @p f and measures/prints elapsed wall clock time in seconds + * + * Call as + * @code measure("Time for (b = b + 1)", [&]() { + thrust::transform(b.begin(), b.end(), b.begin(), increment()); + }); @endcode + * + * @param[in] label additional string to be printed with the measurement. + * @param[in] f function to execute. + * @author Therese Bösmüller, 2025 + * +*/ +auto measure = [](const std::string& label, auto&& f) { + auto start = std::chrono::high_resolution_clock::now(); + f(); + auto stop = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(stop - start).count(); + std::cout << label << ": " << duration << " microseconds" << std::endl; +}; // ';' is needed for a visible documentation of this lambda-function diff --git a/ex5/ex5_2/Makefile b/ex5/ex5_2/Makefile new file mode 100644 index 0000000..7363b62 --- /dev/null +++ b/ex5/ex5_2/Makefile @@ -0,0 +1,31 @@ +# +# use GNU-Compiler tools +COMPILER=GCC_ +# alternatively from the shell +# export COMPILER=GCC_ +# or, alternatively from the shell +# make COMPILER=GCC_ + +# use Intel compilers +#COMPILER=ICC_ + +# use PGI compilers +# COMPILER=PGI_ + + +SOURCES = main.cpp mylib.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +PROGRAM = main.${COMPILER} + +# uncomment the next to lines for debugging and detailed performance analysis +CXXFLAGS += -g +LINKFLAGS += -g +# do not use -pg with PGI compilers + +ifndef COMPILER + COMPILER=GCC_ +endif + + +include ../${COMPILER}default.mk diff --git a/ex5/ex5_2/data_1.txt b/ex5/ex5_2/data_1.txt new file mode 100644 index 0000000..0cc4bb8 --- /dev/null +++ b/ex5/ex5_2/data_1.txt @@ -0,0 +1,501 @@ +141 +261 +87 +430 +258 +298 +425 +120 +496 +707 +244 +786 +75 +394 +4 +221 +2 +190 +143 +269 +175 +139 +599 +902 +940 +222 +483 +377 +524 +265 +69 +437 +174 +27 +955 +431 +962 +763 +8 +681 +706 +646 +553 +219 +773 +229 +371 +891 +857 +403 +319 +609 +911 +910 +592 +333 +854 +443 +905 +34 +533 +717 +180 +337 +188 +322 +404 +549 +49 +553 +275 +242 +244 +155 +957 +936 +819 +729 +176 +361 +189 +2 +317 +700 +626 +544 +440 +288 +502 +762 +763 +577 +748 +646 +124 +505 +348 +93 +148 +199 +673 +432 +695 +257 +10 +533 +280 +947 +907 +393 +25 +672 +838 +972 +57 +451 +583 +687 +720 +651 +727 +374 +582 +117 +58 +980 +285 +595 +963 +186 +194 +342 +933 +391 +274 +152 +398 +375 +132 +436 +92 +615 +11 +574 +790 +236 +449 +570 +62 +497 +643 +222 +838 +972 +847 +506 +279 +747 +237 +958 +621 +601 +173 +91 +256 +859 +912 +700 +726 +230 +577 +811 +404 +989 +90 +321 +512 +61 +726 +557 +530 +830 +859 +790 +318 +453 +753 +110 +110 +270 +525 +973 +711 +312 +292 +851 +912 +640 +256 +89 +839 +585 +949 +62 +585 +286 +828 +191 +443 +394 +827 +677 +208 +319 +134 +672 +571 +170 +148 +477 +909 +553 +33 +54 +806 +452 +383 +790 +365 +533 +712 +872 +329 +651 +975 +76 +588 +414 +310 +264 +759 +996 +187 +782 +196 +993 +803 +425 +729 +499 +809 +357 +74 +591 +911 +194 +433 +750 +40 +947 +764 +559 +184 +498 +518 +995 +855 +963 +679 +404 +935 +480 +232 +397 +706 +559 +757 +996 +963 +536 +964 +116 +52 +305 +581 +531 +902 +541 +432 +543 +713 +17 +801 +143 +479 +257 +370 +662 +170 +279 +199 +196 +327 +881 +472 +404 +180 +969 +408 +845 +616 +377 +878 +785 +465 +814 +899 +430 +335 +597 +902 +703 +378 +735 +955 +543 +541 +312 +72 +182 +93 +464 +10 +916 +643 +2 +31 +209 +455 +128 +9 +728 +355 +781 +437 +437 +50 +50 +92 +595 +242 +842 +858 +964 +489 +221 +227 +537 +763 +348 +462 +640 +918 +162 +716 +578 +434 +885 +394 +179 +634 +625 +328 +803 +1000 +981 +128 +233 +24 +608 +111 +408 +885 +549 +370 +209 +441 +957 +125 +471 +857 +44 +692 +979 +284 +134 +686 +910 +611 +900 +194 +755 +347 +419 +156 +820 +625 +739 +806 +68 +951 +498 +756 +743 +832 +157 +458 +619 +933 +836 +896 +583 +583 +855 +35 +886 +408 +37 +747 +155 +144 +606 +255 +325 +402 +407 +387 +610 +167 +189 +95 +324 +770 +235 +741 +693 +825 +828 +294 +310 +524 +326 +832 +811 +557 +263 +681 +234 +457 +385 +539 +992 +756 +981 +235 +529 +52 +757 +602 +858 +989 +930 +410 +1 +541 +208 +220 +326 +96 +748 +749 +544 +339 +833 +553 +958 +893 +357 +547 +347 +623 +797 +746 +126 +823 +26 +415 +732 +782 +368 + diff --git a/ex5/ex5_2/main.GCC_ b/ex5/ex5_2/main.GCC_ new file mode 100755 index 0000000..ef16f39 Binary files /dev/null and b/ex5/ex5_2/main.GCC_ differ diff --git a/ex5/ex5_2/main.cpp b/ex5/ex5_2/main.cpp new file mode 100644 index 0000000..1bea099 --- /dev/null +++ b/ex5/ex5_2/main.cpp @@ -0,0 +1,130 @@ +#include "mylib.h" +#include +#include +#include +#include +using namespace std; + + +int main() +{ + // read vector from file + vector data_vector = {}; + + ifstream input_stream("data_1.txt"); + + size_t line; + while(input_stream >> line) + { + data_vector.push_back(line); + } + data_vector.shrink_to_fit(); + + + + + // specify loops + size_t NLOOPS = 10000; + + + + // ############# Parallelization with openMP ############# + // calculate arithmetic mean, geometric mean and harmonic mean + double am_omp, gm_omp, hm_omp; + + double tstart = omp_get_wtime(); + + for (size_t i = 0; i < NLOOPS; ++i) + means_omp(data_vector, am_omp, gm_omp, hm_omp); + + double t_means_omp = (omp_get_wtime() - tstart)/NLOOPS; + + // calculate minimum and maximum + size_t min, max; + + tstart = omp_get_wtime(); + + for (size_t i = 0; i < NLOOPS; ++i) + minmax_omp(data_vector, min, max); + + double t_minmax_omp = (omp_get_wtime() - tstart)/NLOOPS; + + + + + + + // ############# Parallelization with C++ algorithms ############# + // calculate arithmetic mean, geometric mean and harmonic mean + double am_cpp, gm_cpp, hm_cpp; + + tstart = omp_get_wtime(); + + for (size_t i = 0; i < NLOOPS; ++i) + means_cpp(data_vector, am_cpp, gm_cpp, hm_cpp); + + double t_means_cpp = (omp_get_wtime() - tstart)/NLOOPS; + + // calculate minimum and maximum + size_t min_cpp, max_cpp; + + tstart = omp_get_wtime(); + + for (size_t i = 0; i < NLOOPS; ++i) + minmax_cpp(data_vector, min_cpp, max_cpp); + + double t_minmax_cpp = (omp_get_wtime() - tstart)/NLOOPS; + + + + + + // print results + cout << "####### OpenMP #######" << endl; + cout << "minimum: " << min << endl; + cout << "maximum: " << max << endl; + cout << "duration: " << t_minmax_omp << endl << endl; + + cout << "arithmetic mean: " << am_omp << endl; + cout << "geometric mean: " << gm_omp << endl; + cout << "harmonic mean: " << hm_omp << endl; + cout << "duration: " << t_means_omp << endl << endl; + + + cout << "####### C++ #######" << endl; + cout << "minimum: " << min_cpp << endl; + cout << "maximum: " << max_cpp << endl; + cout << "duration: " << t_minmax_cpp << endl << endl; + + cout << "arithmetic mean: " << am_cpp << endl; + cout << "geometric mean: " << gm_cpp << endl; + cout << "harmonic mean: " << hm_cpp << endl; + cout << "duration: " << t_means_cpp << endl << endl; + + + + // ####### OpenMP ####### + // minimum: 1 + // maximum: 1000 + // duration: 3.52086e-06 + + // arithmetic mean: 498.184 + // geometric mean: 364.412 + // harmonic mean: 95.6857 + // duration: 5.90171e-06 + + // ####### C++ ####### + // minimum: 1 + // maximum: 1000 + // duration: 1.76816e-05 + + // arithmetic mean: 498.184 + // geometric mean: 364.412 + // harmonic mean: 95.6857 + // duration: 2.35728e-05 + + // --> the openMP variant is faster in both cases + + + return 0; +} diff --git a/ex5/ex5_2/main.o b/ex5/ex5_2/main.o new file mode 100644 index 0000000..0a6cbf3 Binary files /dev/null and b/ex5/ex5_2/main.o differ diff --git a/ex5/ex5_2/mylib.cpp b/ex5/ex5_2/mylib.cpp new file mode 100644 index 0000000..34e5992 --- /dev/null +++ b/ex5/ex5_2/mylib.cpp @@ -0,0 +1,103 @@ +#include "mylib.h" +#include +#include +#include +#include +#include +#include + +using namespace std; + + +void means_omp(const std::vector numbers, double &am, double &gm, double &hm) +{ + size_t const n = numbers.size(); + + am = 0.; + gm = 0.; + hm = 0.; + +#pragma omp parallel for shared(numbers, n, cout) reduction(+:am, gm, hm) + for (size_t i = 0; i < n; ++i) + { + am += numbers[i]; + gm += log(numbers[i]); + hm += 1.0/numbers[i]; + + // #pragma omp critical + // { + // cout << "Thread number " << omp_get_thread_num() << " processes value " << numbers[i] << endl; + // } + } + + am /= n; + gm = exp(gm/n); + hm = n/hm; +} + + +void minmax_omp(const std::vector numbers, size_t &global_min, size_t &global_max) +{ + size_t const n = numbers.size(); + + global_min = -1; // gives the maximum size_t value + global_max = 0; + +#pragma omp parallel shared(numbers, n, global_min, global_max) + { + const size_t nthreads = omp_get_num_threads(); + const size_t threadnum = omp_get_thread_num(); + const size_t chunksize = n/nthreads; + + + size_t start = threadnum*chunksize; + size_t end = start + chunksize; + if (threadnum == nthreads - 1) + end = n; + + + size_t local_min = -1; + size_t local_max = 0; + for (size_t i = start; i < end ; ++i) + { + if (numbers[i] < local_min) + local_min = numbers[i]; + + if (numbers[i] > local_max) + local_max = numbers[i]; + } + + #pragma omp critical + { + if (local_min < global_min) + global_min = local_min; + + if (local_max > global_max) + global_max = local_max; + } + } +} + + +void means_cpp(const std::vector numbers, double &am, double &gm, double &hm) +{ + size_t const n = numbers.size(); + + am = reduce(std::execution::par, numbers.begin(), numbers.end()); + gm = transform_reduce(std::execution::par, numbers.begin(), numbers.end(), 0.0, plus{}, [] (size_t x) -> double { return log(x); } ); + hm = transform_reduce(std::execution::par, numbers.begin(), numbers.end(), 0.0, plus{}, [] (size_t x) -> double { return 1.0/x; }); + + am /= n; + gm = exp(gm/n); + hm = n/hm; +} + + +void minmax_cpp(const std::vector numbers, size_t &global_min, size_t &global_max) +{ + auto min_it = min_element(std::execution::par, numbers.begin(), numbers.end()); + auto max_it = max_element(std::execution::par, numbers.begin(), numbers.end()); + + global_min = *min_it; + global_max = *max_it; +} \ No newline at end of file diff --git a/ex5/ex5_2/mylib.h b/ex5/ex5_2/mylib.h new file mode 100644 index 0000000..01d2fa9 --- /dev/null +++ b/ex5/ex5_2/mylib.h @@ -0,0 +1,42 @@ +#include + +/** + This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector. + Uses openMP parallelization. + @param[in] numbers vector containing integers + @param[out] am arithmetic mean + @param[out] gm geometric mean + @param[out] hm harmonic mean +*/ +void means_omp(const std::vector numbers, double &am, double &gm, double &hm); + + +/** + This function calculates the minimum and maximum of a vector. + Uses openMP parallelization. + @param[in] numbers vector containing integers + @param[out] global_min minimum + @param[out] global_max maximum +*/ +void minmax_omp(const std::vector numbers, size_t &global_min, size_t &global_max); + + +/** + This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector. + Uses C++ parallelization. + @param[in] numbers vector containing integers + @param[out] am arithmetic mean + @param[out] gm geometric mean + @param[out] hm harmonic mean +*/ +void means_cpp(const std::vector numbers, double &am, double &gm, double &hm); + + +/** + This function calculates the minimum and maximum of a vector. + Uses C++ parallelization. + @param[in] numbers vector containing integers + @param[out] global_min minimum + @param[out] global_max maximum +*/ +void minmax_cpp(const std::vector numbers, size_t &global_min, size_t &global_max); \ No newline at end of file diff --git a/ex5/ex5_2/mylib.o b/ex5/ex5_2/mylib.o new file mode 100644 index 0000000..7b34810 Binary files /dev/null and b/ex5/ex5_2/mylib.o differ diff --git a/ex5/ex5_3/Makefile b/ex5/ex5_3/Makefile new file mode 100644 index 0000000..4a714a3 --- /dev/null +++ b/ex5/ex5_3/Makefile @@ -0,0 +1,30 @@ +# +# use GNU-Compiler tools +COMPILER=GCC_ +# alternatively from the shell +# export COMPILER=GCC_ +# or, alternatively from the shell +# make COMPILER=GCC_ + +# use Intel compilers +#COMPILER=ICC_ + +# use PGI compilers +# COMPILER=PGI_ + + +SOURCES = main.cpp goldbach.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +PROGRAM = main.${COMPILER} + +# uncomment the next to lines for debugging and detailed performance analysis +CXXFLAGS += -g +LINKFLAGS += -g +# do not use -pg with PGI compilers + +ifndef COMPILER + COMPILER=GCC_ +endif + +include ../${COMPILER}default.mk diff --git a/ex5/ex5_3/goldbach.cpp b/ex5/ex5_3/goldbach.cpp new file mode 100644 index 0000000..55709c3 --- /dev/null +++ b/ex5/ex5_3/goldbach.cpp @@ -0,0 +1,46 @@ +#include "goldbach.h" +#include +#include +#include + +size_t single_goldbach(size_t k) +{ + const std::vector relevant_primes = get_primes(k); + size_t m = relevant_primes.size(); + + size_t counter = 0; + +#pragma omp parallel for shared(relevant_primes, m, k) reduction(+:counter) + for(size_t i = 0; i < m; ++i) + { + for(size_t j = i; j < m; ++j) + { + if(relevant_primes[i] + relevant_primes[j] == k) + ++counter; + } + } + + return counter; +} + + +std::vector count_goldbach(size_t n) +{ + const std::vector relevant_primes = get_primes(n); + size_t m = relevant_primes.size(); + + std::vector counter_vector(n + 1, 0); + +#pragma omp parallel for shared(relevant_primes, m, n) reduction(VecAdd:counter_vector) + for(size_t i = 0; i < m; ++i) + { + for(size_t j = i; j < m; ++j) + { + size_t sum = relevant_primes[i] + relevant_primes[j]; + if(sum <= n) + ++counter_vector[relevant_primes[i] + relevant_primes[j]]; + } + } + + return counter_vector; +} diff --git a/ex5/ex5_3/goldbach.h b/ex5/ex5_3/goldbach.h new file mode 100644 index 0000000..15df6d4 --- /dev/null +++ b/ex5/ex5_3/goldbach.h @@ -0,0 +1,45 @@ +#pragma once +#include "mayer_primes.h" +#include +#include + + +/** + This function returns the number of possible decompositions of an integer into a sum of two prime numbers. + @param[in] k first integer + @param[out] count number of decompositions +*/ +size_t single_goldbach(size_t k); + + +/** + This function returns the number of possible decompositions into a sum of two prime numbers of all even integers in the interval [4,n]. + @param[in] n upper integer bound + @param[out] count_vector vector containing the number of decompositions for a natural number the corresponding index +*/ +std::vector count_goldbach(size_t n); + + +/** Vector @p b adds its elements to vector @p a . + @param[in] a vector + @param[in] b vector + @return a+=b componentwise +*/ +template +std::vector &operator+=(std::vector &a, std::vector const &b) +{ + assert(a.size()==b.size()); + for (size_t k = 0; k < a.size(); ++k) { + a[k] += b[k]; + } + return a; +} + +// Declare the reduction operation in OpenMP for an STL-vector +// omp_out += omp_in requires operator+=(vector &, vector const &) from above +// ------------------------------------------------------------ +// https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm +// https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c +// https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size +// see also p.74ff in https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf +#pragma omp declare reduction(VecAdd : std::vector : omp_out += omp_in) initializer (omp_priv=omp_orig) \ No newline at end of file diff --git a/ex5/ex5_3/main.cpp b/ex5/ex5_3/main.cpp new file mode 100644 index 0000000..1be01c2 --- /dev/null +++ b/ex5/ex5_3/main.cpp @@ -0,0 +1,45 @@ +#include "goldbach.h" +#include +#include +#include +using namespace std; + + +int main() +{ + cout << "Check: 694 has "<< single_goldbach(694) << " decompositions." << endl << "----------------------------------------" << endl; + + for(size_t n : {10000, 100000, 400000, 1000000, 2000000}) + { + double t_start = omp_get_wtime(); + + auto goldbach_vector = count_goldbach(n); + + auto max_it = max_element(goldbach_vector.begin(), goldbach_vector.end()); + size_t max_number = distance(goldbach_vector.begin(), max_it); + + double t_end = omp_get_wtime() - t_start; + + cout << "The number " << max_number << " has " << *max_it << " decompositions. Duration: " << t_end << endl; + } + + /* + ###### WITHOUT PARALLELIZATION ###### + The number 9240 has 329 decompositions. Duration: 0.00307696 + The number 99330 has 2168 decompositions. Duration: 0.189839 + The number 390390 has 7094 decompositions. Duration: 1.3042 + The number 990990 has 15594 decompositions. Duration: 5.45034 + The number 1981980 has 27988 decompositions. Duration: 47.1807 + + + ###### WITH PARALLELIZATION ###### + The number 9240 has 329 decompositions. Duration: 0.000734854 + The number 99330 has 2168 decompositions. Duration: 0.0251322 + The number 390390 has 7094 decompositions. Duration: 0.487375 + The number 990990 has 15594 decompositions. Duration: 6.16972 + The number 1981980 has 27988 decompositions. Duration: 31.5699 + */ + + + return 0; +} \ No newline at end of file diff --git a/ex5/ex5_3/mayer_primes.h b/ex5/ex5_3/mayer_primes.h new file mode 100644 index 0000000..6c6dc23 --- /dev/null +++ b/ex5/ex5_3/mayer_primes.h @@ -0,0 +1,73 @@ +#pragma once + +#include //memset +#include +//using namespace std; + +/** \brief Determines all prime numbers in interval [2, @p max]. + * + * The sieve of Eratosthenes is used. + * + * The implementation originates from Florian Mayer. + * + * \param[in] max end of interval for the prime number search. + * \return vector of prime numbers @f$2,3,5, ..., p<=max @f$. + * + * \copyright + * Copyright (c) 2008 Florian Mayer (adapted by Gundolf Haase 2018) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ +template +std::vector get_primes(T max) +{ + std::vector primes; + char *sieve; + sieve = new char[max / 8 + 1]; + // Fill sieve with 1 + memset(sieve, 0xFF, (max / 8 + 1) * sizeof(char)); + for (T x = 2; x <= max; x++) + { + if (sieve[x / 8] & (0x01 << (x % 8))) { + primes.push_back(x); + // Is prime. Mark multiplicates. + for (T j = 2 * x; j <= max; j += x) + { + sieve[j / 8] &= ~(0x01 << (j % 8)); + } + } + } + delete[] sieve; + return primes; +} + +//--------------------------------------------------------------- +//int main() // by Florian Mayer +//{g++ -O3 -std=c++14 -fopenmp main.cpp && ./a.out +// vector primes; +// primes = get_primes(10000000); +// // return 0; +// // Print out result. +// vector::iterator it; +// for(it=primes.begin(); it < primes.end(); it++) +// cout << *it << " "; +// +// cout << endl; +// return 0; +//} diff --git a/ex5/ex5_4/Makefile b/ex5/ex5_4/Makefile new file mode 100644 index 0000000..b2129d8 --- /dev/null +++ b/ex5/ex5_4/Makefile @@ -0,0 +1,30 @@ +# +# use GNU-Compiler tools +COMPILER=GCC_ +# alternatively from the shell +# export COMPILER=GCC_ +# or, alternatively from the shell +# make COMPILER=GCC_ + +# use Intel compilers +#COMPILER=ICC_ + +# use PGI compilers +# COMPILER=PGI_ + + +SOURCES = main.cpp benchmarks.cpp benchmark_tests.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +PROGRAM = main.${COMPILER} + +# uncomment the next to lines for debugging and detailed performance analysis +CXXFLAGS += -g +LINKFLAGS += -g +# do not use -pg with PGI compilers + +ifndef COMPILER + COMPILER=GCC_ +endif + +include ../${COMPILER}default.mk diff --git a/ex5/ex5_4/benchmark_tests.cpp b/ex5/ex5_4/benchmark_tests.cpp new file mode 100644 index 0000000..35ed87b --- /dev/null +++ b/ex5/ex5_4/benchmark_tests.cpp @@ -0,0 +1,375 @@ +#include "benchmark_tests.h" +#include "benchmarks.h" +#include +#include +#include +using namespace std::chrono; + +vector test_A(const size_t &NLOOPS, const size_t &N) +{ + cout << "#################### (A) ####################" << endl; + cout << "\nLOOPS = " << NLOOPS << endl; + cout << "\nN = " << N << endl; + + +// Memory allocation + cout << "Memory allocation\n"; + + vector x(N), y(N); + + cout.precision(2); + cout << 2.0*N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n"; + cout.precision(6); + + +// Data initialization +// Special: x_i = i+1; y_i = 1/x_i ==> == N + + for (size_t i = 0; i < N; ++i) + { + x[i] = i % 219 + 1; + y[i] = 1.0/x[i]; + } + + + cout << "\nStart Benchmarking scalar\n"; + + auto t1 = system_clock::now(); // start timer +// Do calculation + double check(0.0),ss(0.0); + for (size_t i = 0; i < NLOOPS; ++i) + { + check = scalar_parallel(x, y); + ss += check; // prevents the optimizer from removing unused calculation results. + } + + auto t2 = system_clock::now(); // stop timer + auto duration = duration_cast(t2 - t1); // duration in microseconds + double t_diff = static_cast(duration.count()) / 1e6; // overall duration in seconds + t_diff = t_diff/NLOOPS; // duration per loop seconds + + + +// Check the correct result + cout << "\n = " << check << endl; + if (static_cast(check) != N) + cout << " !! W R O N G result !!\n"; + cout << endl; + + +// Timings and Performance + cout << endl; + cout.precision(2); + + + double Gflops = 2.0*N / t_diff / 1024 / 1024 / 1024; + double MemBandwidth = 2.0*N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]); + + cout << "Total duration : " << t_diff*NLOOPS << endl; + cout << "Timing in sec. : " << t_diff << endl; + cout << "GFLOPS : " << Gflops << endl; + cout << "GiByte/s : " << MemBandwidth << endl; + + + return vector{t_diff, Gflops, MemBandwidth}; +} + +vector test_A_sum(const size_t &NLOOPS, const size_t &N) +{ + cout << "#################### (A) sum ####################" << endl; + cout << "\nLOOPS = " << NLOOPS << endl; + cout << "\nN = " << N << endl; + + +// Memory allocation + cout << "Memory allocation\n"; + + vector x(N); + + cout.precision(2); + cout << 1.0*N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n"; + cout.precision(6); + + +// Data initialization + + for (size_t i = 0; i < N; ++i) + { + x[i] = 1; + } + + + cout << "\nStart Benchmarking sum\n"; + + auto t1 = system_clock::now(); // start timer +// Do calculation + double check(0.0),ss(0.0); + for (size_t i = 0; i < NLOOPS; ++i) + { + check = sum(x); + ss += check; // prevents the optimizer from removing unused calculation results. + } + + auto t2 = system_clock::now(); // stop timer + auto duration = duration_cast(t2 - t1); // duration in microseconds + double t_diff = static_cast(duration.count()) / 1e6; // overall duration in seconds + t_diff = t_diff/NLOOPS; // duration per loop seconds + + + +// Check the correct result + cout << "\n = " << check << endl; + if (static_cast(check) != N) + cout << " !! W R O N G result !!\n"; + cout << endl; + + +// Timings and Performance + cout << endl; + cout.precision(2); + + + double Gflops = 1.0*N / t_diff / 1024 / 1024 / 1024; + double MemBandwidth = 1.0*N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]); + + cout << "Total duration : " << t_diff*NLOOPS << endl; + cout << "Timing in sec. : " << t_diff << endl; + cout << "GFLOPS : " << Gflops << endl; + cout << "GiByte/s : " << MemBandwidth << endl; + + + return vector{t_diff, Gflops, MemBandwidth}; +} + + +vector test_B(const size_t &NLOOPS, const size_t &N, const size_t &M) +{ + cout << "#################### (B) ####################" << endl; + + cout << "\nLOOPS = " << NLOOPS << endl; + cout << "\nN = " << N << endl; + cout << "\nM = " << M << endl; + +// Memory allocation + cout << "Memory allocation\n"; + + vector A(M*N); + vector x(N); + + cout.precision(2); + cout << (1.0*M*N + N) * sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n"; + cout.precision(6); + +// Data initialization + + for (size_t i = 0; i < M; ++i) + for (size_t j = 0; j < N; ++j) + A[N*i + j] = (i + j) % 219 + 1; + + + for (size_t j = 0; j < N; ++j) + { + x[j] = 1.0/A[N*17 + j]; + } + + cout << "\nStart Benchmarking MatVec\n"; + + auto t1 = system_clock::now(); // start timer +// Do calculation + vector b(M); + + for (size_t i = 0; i < NLOOPS; ++i) + { + b = MatVec_parallel(A, x); + } + + auto t2 = system_clock::now(); // stop timer + auto duration = duration_cast(t2 - t1); // duration in microseconds + double t_diff = static_cast(duration.count()) / 1e6; // overall duration in seconds + t_diff = t_diff/NLOOPS; // duration per loop seconds + + +// Check the correct result + cout << "\n = " << b[17] << endl; + if (static_cast(b[17]) != N) + { + cout << " !! W R O N G result !!\n"; + } + cout << endl; + + +// Timings and Performance + cout << endl; + cout.precision(2); + + double Gflops = (2.0*N*M) / t_diff / 1024 / 1024 / 1024; + double MemBandwidth = (2.0*N*M + M)/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]); + + cout << "Total duration : " << t_diff*NLOOPS << endl; + cout << "Timing in sec. : " << t_diff << endl; + cout << "GFLOPS : " << Gflops << endl; + cout << "GiByte/s : " << MemBandwidth << endl; + + + + return vector{t_diff, Gflops, MemBandwidth}; +} + + +vector test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N) +{ + cout << "#################### (C) ####################" << endl; + cout << "\nLOOPS = " << NLOOPS << endl; + cout << "\nL = " << L << endl; + cout << "\nM = " << M << endl; + cout << "\nN = " << N << endl; + + +// Memory allocation + cout << "Memory allocation\n"; + + vector A(M*L); + vector B(L*N); + + cout.precision(2); + cout << (1.0*M*L + L*N) *sizeof(A[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n"; + cout.precision(6); + + +// Data initialization + + for (size_t i = 0; i < M; ++i) + for (size_t k = 0; k < L; ++k) + A[L*i + k] = (i + k) % 219 + 1; + + for (size_t k = 0; k < L; ++k) + for (size_t j = 0; j < N; ++j) + B[N*k + j] = 1.0/A[L*17 + k]; + + + cout << "\nStart Benchmarking MatMat\n"; + + auto t1 = system_clock::now(); // start timer +// Do calculation + vector C(M*N); + double check; + double check_sum = 0; + + for (size_t i = 0; i < NLOOPS; ++i) + { + C = MatMat_parallel(A, B, L); + + check = C[N*17]; + check_sum += check; // prevents the optimizer from removing unused calculation results. + } + cout << check_sum; + + auto t2 = system_clock::now(); // stop timer + auto duration = duration_cast(t2 - t1); // duration in microseconds + double t_diff = static_cast(duration.count()) / 1e6; // overall duration in seconds + t_diff = t_diff/NLOOPS; // duration per loop seconds + + +// Check the correct result + cout << "\n C[17,0] = " << check << endl; + if (static_cast(check) != L) + { + cout << " !! W R O N G result !!, should be " << L <<"\n"; + } + cout << endl; + +// Timings and Performance + cout << endl; + cout.precision(2); + + + double Gflops = (2.0*L*N*M) / t_diff / 1024 / 1024 / 1024; + double MemBandwidth = (2.0*L*N*M + M*N)/ t_diff / 1024 / 1024 / 1024 * sizeof(A[0]); + + cout << "Total duration : " << t_diff*NLOOPS << endl; + cout << "Timing in sec. : " << t_diff << endl; + cout << "GFLOPS : " << Gflops << endl; + cout << "GiByte/s : " << MemBandwidth << endl; + + + + return vector{t_diff, Gflops, MemBandwidth}; +} + + +vector test_D(const size_t &NLOOPS, const size_t &N, const size_t &p) +{ + cout << "#################### (D) ####################" << endl; + cout << "\nLOOPS = " << NLOOPS << endl; + cout << "\nN = " << N << endl; + cout << "\np = " << p << endl; + +// Memory allocation + cout << "Memory allocation\n"; + + vector a(p + 1, 0); + vector x(N); + + cout.precision(2); + cout << (1.0*(p + 1) + N) *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n"; + cout.precision(6); + +// Data initialization + + for (size_t j = 0; j < N; ++j) + x[j] = 1.0*j; + + for (size_t k = 0; k < p + 1; ++k) + a[k] = pow(-1.0, k); // poly(x) = 1 - x + x^2 - x^3 + x^4 - ... + + + + cout << "\nStart Benchmarking poly\n"; + + auto t1 = system_clock::now(); // start timer +// Do calculation + vector y(N); + double check; + double check_sum; + + for (size_t i = 0; i < NLOOPS; ++i) + { + y = poly_parallel(a, x); + check = y[0]; + + check_sum += check; // prevents the optimizer from removing unused calculation results. + } + + auto t2 = system_clock::now(); // stop timer + auto duration = duration_cast(t2 - t1); // duration in microseconds + double t_diff = static_cast(duration.count()) / 1e6; // overall duration in seconds + t_diff = t_diff/NLOOPS; // duration per loop seconds + + + +// Check the correct result + cout << "\n poly(" << x[0] << ") = " << check << endl; + if (abs(check - 1.0) > 1.0/1e6) + { + cout << " !! W R O N G result !!\n"; + } + cout << endl; + + +// Timings and Performance + cout << endl; + cout.precision(2); + + + double Gflops = (N*(p + 1)*3.0) / t_diff / 1024 / 1024 / 1024; + double MemBandwidth = (N*(2.0 + 3.0*(p + 1)))/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]); + + cout << "Total duration : " << t_diff*NLOOPS << endl; + cout << "Timing in sec. : " << t_diff << endl; + cout << "GFLOPS : " << Gflops << endl; + cout << "GiByte/s : " << MemBandwidth << endl; + + + + return vector{t_diff, Gflops, MemBandwidth}; +} \ No newline at end of file diff --git a/ex5/ex5_4/benchmark_tests.h b/ex5/ex5_4/benchmark_tests.h new file mode 100644 index 0000000..45c64a9 --- /dev/null +++ b/ex5/ex5_4/benchmark_tests.h @@ -0,0 +1,13 @@ +#pragma once +#include +using namespace std; + +vector test_A(const size_t &NLOOPS, const size_t &N); + +vector test_A_sum(const size_t &NLOOPS, const size_t &N); + +vector test_B(const size_t &NLOOPS, const size_t &N, const size_t &M); + +vector test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N); + +vector test_D(const size_t &NLOOPS, const size_t &N, const size_t &p); \ No newline at end of file diff --git a/ex5/ex5_4/benchmarks.cpp b/ex5/ex5_4/benchmarks.cpp new file mode 100644 index 0000000..c0eb7d9 --- /dev/null +++ b/ex5/ex5_4/benchmarks.cpp @@ -0,0 +1,141 @@ +#include "benchmarks.h" +#include // assert() +#include +#include +#include +#include + +// (A) Inner product of two vectors (from skalar_stl) +double scalar_parallel(vector const &x, vector const &y) +{ + assert(x.size() == y.size()); + size_t const N = x.size(); + double sum = 0.0; +//#pragma omp parallel for default(none) shared(x, y, N) reduction(+:sum) schedule(runtime) +#pragma omp parallel for shared(x, y, N) reduction(+:sum) + for (size_t i = 0; i < N; ++i) + { + sum += x[i] * y[i]; + } + return sum; +} + +// (A) Vector entry sum +double sum(vector const &x) +{ + double sum = 0.0; +#pragma omp parallel for shared(x) reduction(+:sum) + for (size_t i = 0; i < x.size(); ++i) + { + sum += x[i]; + } + return sum; +} + + +// (B) Matrix-vector product (from intro_vector_densematrix) +vector MatVec_parallel(vector const &A, vector const &x) +{ + size_t const nelem = A.size(); + size_t const N = x.size(); + assert(nelem % N == 0); // make sure multiplication is possible + size_t const M = nelem/N; + + vector b(M); + +#pragma omp parallel for shared(A, x, N, M, b) + for (size_t i = 0; i < M; ++i) + { + double tmp = 0.0; + for (size_t j = 0; j < N; ++j) + tmp += A[N*i + j] * x[j]; + b[i] = tmp; + } + + return b; +} + + +// (C) Matrix-matrix product +vector MatMat_parallel(vector const &A, vector const &B, size_t const &L) +{ + size_t const nelem_A = A.size(); + size_t const nelem_B = B.size(); + + assert(nelem_A % L == 0 && nelem_B % L == 0); + + size_t const M = nelem_A/L; + size_t const N = nelem_B/L; + + + vector C(M*N); + + +#pragma omp parallel for shared(A, B, M, N, L, C) + for (size_t i = 0; i < M; ++i) + { + for (size_t k = 0; k < L; ++k) + { + for (size_t j = 0; j < N; ++j) + { + C[N*i + j] += A[L*i + k]*B[N*k + j]; + } + + } + } + + return C; +} + + +// (D) Evaluation of a polynomial function +vector poly_parallel(vector const &a, vector const &x) +{ + size_t const N = x.size(); + size_t const p = a.size() - 1; + vector y(N, 0); + +#pragma omp parallel for shared(a, x, N, p, y) + for (size_t i = 0; i < N; ++i) + { + double x_temp = x[i]; + double y_temp = 0; + for (size_t k = 0; k < p + 1; ++k) + { + y_temp += x_temp*y_temp + a[p - k]; + } + y[i] = y_temp; + } + + return y; +} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ex5/ex5_4/benchmarks.h b/ex5/ex5_4/benchmarks.h new file mode 100644 index 0000000..2b55d2b --- /dev/null +++ b/ex5/ex5_4/benchmarks.h @@ -0,0 +1,55 @@ +#pragma once +#include +using namespace std; + +/** (A) Inner product of two vectors (from skalar_stl) + @param[in] x vector + @param[in] y vector + @return resulting Euclidian inner product +*/ +double scalar_parallel(vector const &x, vector const &y); + + +/** (A) Sum entries of vector + @param[in] x vector + @return sum +*/ +double sum(vector const &x); + + +/** (B) Matrix-vector product (from intro_vector_densematrix) + * @param[in] A dense matrix (1D access) + * @param[in] u vector + * + * @return resulting vector +*/ +vector MatVec_parallel(vector const &A, vector const &x); + + +/** (C) Matrix-matrix product + * @param[in] A MxL dense matrix (1D access) + * @param[in] B LxN dense matrix (1D access) + * @param[in] shared_dim shared dimension L + * + * @return resulting MxN matrix +*/ +vector MatMat_parallel(vector const &A, vector const &B, size_t const &shared_dim); + + +/** (D) Evaluation of a polynomial function using Horner's scheme + * @param[in] a coefficient vector + * @param[in] x vector with input values + * + * @return vector with output values +*/ +vector poly_parallel(vector const &a, vector const &x); + + + + + + + + + + diff --git a/ex5/ex5_4/main.cpp b/ex5/ex5_4/main.cpp new file mode 100644 index 0000000..d8fa63d --- /dev/null +++ b/ex5/ex5_4/main.cpp @@ -0,0 +1,84 @@ +#include "benchmark_tests.h" +#include +#include + +int main() +{ + vector> results_scalar; + results_scalar.push_back(test_A(2000000, pow(10,3))); + results_scalar.push_back(test_A(1000000, pow(10,4))); + results_scalar.push_back(test_A(100000, pow(10,5))); + results_scalar.push_back(test_A(10000, pow(10,6))); + results_scalar.push_back(test_A(750, pow(10,7))); + results_scalar.push_back(test_A(125, pow(10,8))); + + + vector> results_sum; + results_sum.push_back(test_A_sum(3000000, pow(10,3))); + results_sum.push_back(test_A_sum(2000000, pow(10,4))); + results_sum.push_back(test_A_sum(1000000, pow(10,5))); + results_sum.push_back(test_A_sum(50000, pow(10,6))); + results_sum.push_back(test_A_sum(2000, pow(10,7))); + results_sum.push_back(test_A_sum(250, pow(10,8))); + + + test_B(100, 20000, 10000); + + test_C(25, 500, 1000, 1500); + + test_D(100, 100, 1000000); + + + + cout << endl << "###### Scalar ######" << endl; + cout << "Timing\tGFLOPS\tGiByte/s" << endl; + cout << "------------------------------" << endl; + for (size_t i = 0; i < results_scalar.size(); ++i) + cout << results_scalar[i][0] << "\t" << results_scalar[i][1] << "\t" << results_scalar[i][2] << endl; + + cout << endl << "###### Sum ######" << endl; + cout << "Timing\tGFLOPS\tGiByte/s" << endl; + cout << "------------------------------" << endl; + for (size_t i = 0; i < results_sum.size(); ++i) + cout << results_sum[i][0] << "\t" << results_sum[i][1] << "\t" << results_sum[i][2] << endl; + + + + + // ###### Scalar ###### + // Timing GFLOPS GiByte/s + // ------------------------------ + // 3.4e-06 0.54 4.3 + // 4.6e-06 4 32 + // 1.6e-05 12 95 + // 0.0011 1.7 13 + // 0.0097 1.9 15 + // 0.075 2.5 20 + + + // ###### Sum ###### + // Timing GFLOPS GiByte/s + // ------------------------------ + // 5.5e-06 0.17 1.3 + // 5.4e-06 1.7 14 + // 1.5e-05 6.1 49 + // 0.00013 7.2 57 + // 0.0033 2.8 23 + // 0.032 2.9 23 + + + + + + // ######### NOT PARALLEL (from exercise sheet 2) ######### + // Timing GFLOPS GiByte/s + // ---------------------------------- + // (A) 0.038 2.5 20 + // (B) 0.13 2.9 23 + // (C) 0.44 3.2 25 + // (D) 0.19 1.5 12 + + + + return 0; +}