#include "check_env.h"
#include "mylib.h"
#include <cstdlib>          // atoi()
#include <cstring>          // strncmp()
#include <ctime>
#include <iostream>
#include <omp.h>            // OpenMP
#include <sstream>
#include <string>
#include <vector>
using namespace std;

/** Calls the scalar-product kernel @p NLOOPS times on the given vectors.
 *
 *  The accumulated result is only there to give every call a consumer; it is
 *  discarded afterwards.
 *
 *  @param[in] x       first vector
 *  @param[in] y       second vector
 *  @param[in] N       unused here (the kernel receives the whole vectors);
 *                     kept so that existing callers remain valid
 *  @param[in] NLOOPS  number of kernel invocations
 */
void benchmark(vector<double> &x, vector<double> &y, unsigned int N, unsigned int NLOOPS)
{
    (void)N;
    double sk = 0.0;
    for (unsigned int i = 0; i < NLOOPS; ++i)       // unsigned index: same type as NLOOPS
    {
        sk += scalar(x, y);     // or scalar_trans(x,y) / norm(x) if you want
    }
    (void)sk;
}

/** Selects an OpenMP runtime schedule and measures one benchmark() run.
 *
 *  NOTE(review): omp_set_schedule() only influences loops declared with
 *  schedule(runtime); whether the kernels in mylib do so is not visible
 *  here -- confirm.
 *
 *  @param[in] kind    schedule kind handed to omp_set_schedule()
 *  @param[in] chunk   chunk size handed to omp_set_schedule()
 *  @param[in] x       first vector
 *  @param[in] y       second vector
 *  @param[in] N       forwarded to benchmark()
 *  @param[in] NLOOPS  number of kernel invocations
 *  @return elapsed wall-clock seconds per kernel invocation
 */
static double time_benchmark(omp_sched_t kind, int chunk,
                             vector<double> &x, vector<double> &y,
                             unsigned int N, unsigned int NLOOPS)
{
    omp_set_schedule(kind, chunk);
    double const tstart = omp_get_wtime();
    benchmark(x, y, N, NLOOPS);
    // Subtract first, then normalise: dividing the absolute wall-clock value
    // by NLOOPS before subtracting tstart yields a meaningless number.
    return (omp_get_wtime() - tstart) / NLOOPS;
}

/** Benchmark driver: reads the vector length from "-n", reports the OpenMP
 *  team size, times scalar()/scalar_trans(), checks the result against N and
 *  then compares the OpenMP runtime schedules for 1..omp_get_num_procs()
 *  threads.
 */
int main(int argc, char const *argv[])
{
    unsigned int N = 5000001;
    int const NLOOPS = 5;        // choose a value such that the benchmark runs at least 10 sec.
//##########################################################################
//   Read Parameter from command line (C++ style)
    cout << "Checking command line parameters for: -n " << endl;
    for (int i = 1; i < argc; i++)
    {
        cout << " arg[" << i << "] = " << argv[i] << endl;
        string ss(argv[i]);
        if ("-n" == ss && i + 1 < argc)     // found "-n" followed by another parameter
        {
            int const requested = atoi(argv[i + 1]);
            if (requested > 0)
            {
                N = static_cast<unsigned int>(requested);
            }
            else
            {
                // A value <= 0 would wrap to a huge unsigned number (or give
                // empty vectors); keep the default instead.
                cout << "Ignoring invalid value for -n: " << argv[i + 1] << "\n";
            }
            ++i;    // the value is consumed; do not report it as an unknown argument
        }
        else
        {
            cout << "Corect call: " << argv[0] << " -n \n";
        }
    }

    cout << "\nN = " << N << endl;

    check_env(argc, argv);
//########################################################################
    int nthreads = 0;                                  // OpenMP
    #pragma omp parallel default(none) shared(cout,nthreads)
    {
        int const th_id  = omp_get_thread_num();       // OpenMP
        int const nthrds = omp_get_num_threads();      // OpenMP
        stringstream ss;
        ss << "C++: Hello World from thread " << th_id << " / " << nthrds << endl;
        #pragma omp critical
        {
            cout << ss.str();                          // output to a shared resource
        }
        #pragma omp master
        nthreads = nthrds;                             // hand the team size to the code after the region
    }
    cout << " " << nthreads << " threads have been started." << endl;

//##########################################################################
//  Memory allocation
    cout << "Memory allocation\n";

    vector<double> x(N), y(N);

    cout.precision(2);
    cout << 2.0 * N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
    cout.precision(6);

//##########################################################################
//  Data initialization
//  Special: x_i = i+1;  y_i = 1/x_i, so every product x_i*y_i is 1 and the
//  result checked below is expected to equal N.
    for (unsigned int i = 0; i < N; ++i)
    {
        x[i] = i + 1;
        y[i] = 1.0 / x[i];
    }

//##########################################################################
    cout << "\nStart Benchmarking\n";

//  Do calculation
    double tstart = omp_get_wtime();                   // OpenMP

    double sk(0.0);
    for (int i = 0; i < NLOOPS; ++i)
    {
        sk = scalar(x, y);
        sk = scalar_trans(x, y);
        //sk = norm(x);
    }

    double t1 = omp_get_wtime() - tstart;              // OpenMP
    // NOTE(review): two kernels run per iteration, so this is the time per
    // scalar()+scalar_trans() pair, not per single function call -- confirm
    // which one the figures below are meant to report.
    t1 /= NLOOPS;

//##########################################################################
//  Check the correct result
    cout << "\n = " << sk << endl;
    if (static_cast<unsigned int>(sk) != N)
    {
        cout << "  !!   W R O N G  result   !!\n";
    }
    cout << endl;

//##########################################################################
//  Timings  and Performance
    cout << endl;
    cout.precision(2);
    cout << "Timing in sec. : " << t1 << endl;
    // NOTE(review): the factor 1024^3 is binary (Gi); GFLOPS conventionally uses 1e9.
    cout << "GFLOPS : " << 2.0 * N / t1 / 1024 / 1024 / 1024 << endl;
    cout << "GiByte/s : " << 2.0 * N / t1 / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;

//#########################################################################
    cout << "\n  Try the reduction with an STL-vektor \n";

    auto vr = reduction_vec(100);
    cout << "done\n";
    cout << vr << endl;

    N = 2;
    // Data (re-)initialization for the first N entries
    for (unsigned int i = 0; i < N; ++i)
    {
        x[i] = i + 1;
        y[i] = 1.0 / x[i];
    }

    int const proc_count = omp_get_num_procs();
    cout << "Number of available processors: " << proc_count << endl;

    for (int j = 1; j <= proc_count; j++)
    {
        omp_set_num_threads(j);
        cout << "used threads: " << j << endl;

        cout << "static (chunk 0) "
             << time_benchmark(omp_sched_static, 0, x, y, N, NLOOPS) << endl;

        for (int i = 0; i <= 5; i++)
        {
            int const chunk = 1 << i;                  // chunk sizes 1, 2, 4, ..., 32
            cout << "chunk size: " << chunk << endl;

            // STATIC
            std::cout << "static: "
                      << time_benchmark(omp_sched_static, chunk, x, y, N, NLOOPS) << " s\n";
            // DYNAMIC
            std::cout << "dynamic: "
                      << time_benchmark(omp_sched_dynamic, chunk, x, y, N, NLOOPS) << " s\n";
            // GUIDED
            std::cout << "guided: "
                      << time_benchmark(omp_sched_guided, chunk, x, y, N, NLOOPS) << " s\n";
            // AUTO
            std::cout << "auto: "
                      << time_benchmark(omp_sched_auto, chunk, x, y, N, NLOOPS) << " s\n";

            cout << endl;
        }

        cout << endl;
    }

    cout << scalar_parrallel_env(x, y) << endl;

    // NOTE(review): assumes reduction_vec_append(N) returns at least N elements -- confirm.
    auto const vec = reduction_vec_append(N);
    for (unsigned int i = 0; i < N; i++)
    {
        cout << vec[i] << ", ";
    }

    return 0;
}  // memory for x and y will be deallocated by their destructors