This commit is contained in:
dino.celebic 2025-12-05 23:13:00 +01:00
commit 95b3017475
28 changed files with 5800 additions and 0 deletions

31
ex5/code/Makefile Normal file
View file

@ -0,0 +1,31 @@
# Build configuration: every *.cpp in this directory is compiled and linked into ${PROGRAM}.
PROGRAM = main
SOURCES = $(wildcard *.cpp)
OBJECTS = ${SOURCES:.cpp=.o}
CXX = g++
LINKER = g++
WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
-Wredundant-decls -fmax-errors=1
CXXFLAGS = -g -flto -O3 -ffast-math -march=native -fopenmp ${WARNINGS}
LINKFLAGS = -g -flto -O3 -lopenblas -llapacke -fopenmp

# None of these targets creates a file of the same name.
.PHONY: all clean run

all: ${PROGRAM}
# Object files are produced by make's built-in %.o: %.cpp rule (uses CXX and CXXFLAGS).
# %.o: %.cpp
# ${CXX} ${CXXFLAGS} -c $< -o $@
${PROGRAM}: ${OBJECTS}
	$(LINKER) ${OBJECTS} ${LINKFLAGS} -o ${PROGRAM}
# -f on every file: cleaning must not fail when the program has not been run yet
# and out_1.txt therefore does not exist.
clean:
	rm -f ${OBJECTS} ${PROGRAM}
	rm -f out_1.txt
run: ${PROGRAM}
# run: clean ${PROGRAM}
	./${PROGRAM}

500
ex5/code/data_1.txt Normal file
View file

@ -0,0 +1,500 @@
141
261
87
430
258
298
425
120
496
707
244
786
75
394
4
221
2
190
143
269
175
139
599
902
940
222
483
377
524
265
69
437
174
27
955
431
962
763
8
681
706
646
553
219
773
229
371
891
857
403
319
609
911
910
592
333
854
443
905
34
533
717
180
337
188
322
404
549
49
553
275
242
244
155
957
936
819
729
176
361
189
2
317
700
626
544
440
288
502
762
763
577
748
646
124
505
348
93
148
199
673
432
695
257
10
533
280
947
907
393
25
672
838
972
57
451
583
687
720
651
727
374
582
117
58
980
285
595
963
186
194
342
933
391
274
152
398
375
132
436
92
615
11
574
790
236
449
570
62
497
643
222
838
972
847
506
279
747
237
958
621
601
173
91
256
859
912
700
726
230
577
811
404
989
90
321
512
61
726
557
530
830
859
790
318
453
753
110
110
270
525
973
711
312
292
851
912
640
256
89
839
585
949
62
585
286
828
191
443
394
827
677
208
319
134
672
571
170
148
477
909
553
33
54
806
452
383
790
365
533
712
872
329
651
975
76
588
414
310
264
759
996
187
782
196
993
803
425
729
499
809
357
74
591
911
194
433
750
40
947
764
559
184
498
518
995
855
963
679
404
935
480
232
397
706
559
757
996
963
536
964
116
52
305
581
531
902
541
432
543
713
17
801
143
479
257
370
662
170
279
199
196
327
881
472
404
180
969
408
845
616
377
878
785
465
814
899
430
335
597
902
703
378
735
955
543
541
312
72
182
93
464
10
916
643
2
31
209
455
128
9
728
355
781
437
437
50
50
92
595
242
842
858
964
489
221
227
537
763
348
462
640
918
162
716
578
434
885
394
179
634
625
328
803
1000
981
128
233
24
608
111
408
885
549
370
209
441
957
125
471
857
44
692
979
284
134
686
910
611
900
194
755
347
419
156
820
625
739
806
68
951
498
756
743
832
157
458
619
933
836
896
583
583
855
35
886
408
37
747
155
144
606
255
325
402
407
387
610
167
189
95
324
770
235
741
693
825
828
294
310
524
326
832
811
557
263
681
234
457
385
539
992
756
981
235
529
52
757
602
858
989
930
410
1
541
208
220
326
96
748
749
544
339
833
553
958
893
357
547
347
623
797
746
126
823
26
415
732
782
368

215
ex5/code/main.cpp Normal file
View file

@ -0,0 +1,215 @@
// clang-tidy *.cpp -checks=llvm-*,-llvm-header-guard -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp" -- *.cpp
#include "task_2.h"
#include "task_3.h"
#include "task_4.h"
#include "timing.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <execution>
#include <iomanip>
#include <iostream>
#include <omp.h>
#include <sstream>
#include <vector>
/// Task 2: reads data_1.txt, computes minimum, maximum, the three means and the
/// standard deviation in parallel, prints them and writes them to out_1.txt.
void task_2() {
    printf("\n\n-------------- Task 2 --------------\n\n");

    int num_threads = 4;
    omp_set_num_threads(num_threads);
    cout << omp_get_max_threads() << " threads have been started." << endl;

    // Load the input values.
    vector<double> values;
    read_vector_from_file("data_1.txt", values);

    tic();

    // Extrema via the parallel standard algorithms.
    // (Alternative: auto [min, max] = min_max_par(values);)
    auto smallest = *min_element(std::execution::par, values.begin(), values.end());
    auto largest  = *max_element(std::execution::par, values.begin(), values.end());

    // Arithmetic, geometric and harmonic mean.
    auto means = means_par(values);
    double arithmetic = std::get<0>(means);
    double geometric  = std::get<1>(means);
    double harmonic   = std::get<2>(means);

    // Standard deviation around the arithmetic mean.
    double squared_sum(0.0);
    #pragma omp parallel for shared(arithmetic,values) reduction(+:squared_sum)
    for (long unsigned int idx = 0; idx < values.size(); idx++) {
        squared_sum += pow(arithmetic - values.at(idx), 2);
    }
    double deviation = sqrt(squared_sum / static_cast<double>(values.size()));

    double elapsed = toc();

    printf("Minimum: %f\n", smallest);
    printf("Maximum: %f\n", largest);
    printf("Arithmetic: %f\n", arithmetic);
    printf("Geometric: %f\n", geometric);
    printf("Harmonic: %f\n", harmonic);
    printf("Deviation: %f\n", deviation);
    printf("Execution time: %f\n", elapsed);

    // Store the six results, one per line.
    vector<double> results = {smallest, largest, arithmetic, geometric, harmonic, deviation};
    write_vector_to_file("out_1.txt", results);
}
/// Task 3: Goldbach decompositions. Prints the number of decompositions of one
/// fixed k, lists them, and then times count_goldbach_par for growing limits n.
void task_3() {
    printf("\n\n-------------- Task 3 --------------\n\n");

    int num_threads = 4;
    omp_set_num_threads(num_threads);
    cout << omp_get_max_threads() << " threads have been started." << endl;

    // Single number: count and list its decompositions.
    int k = 694;
    printf("single_goldbach(k = %d) = %d\n", k, single_goldbach_par(k));
    print_decomps(k);

    // All numbers up to n: report the index with the most decompositions.
    // (n = 10'000'000 is left out of the list.)
    for (int n : {10'000, 100'000, 400'000, 1'000'000, 2'000'000}) {
        tic();
        vector<int> counts = count_goldbach_par(n);
        double seconds = toc();

        auto best = max_element(counts.begin(), counts.end());
        printf("count_goldbach(n = %d): k = %ld, decompositions = %d, time elapsed: %f milliseconds\n", n, best - counts.begin(), *best, seconds * 1000);
    }

    printf("Should be: k = 9240, 99330, 390390, 990990, 1981980, 9699690\n");
    printf(" decompositions = 329, 2168, 7094, 15594, 27988, 124180\n\n");
}
// Task 4: runs the benchmarks (B) matrix-vector, (C) matrix-matrix and (D) polynomial
// evaluation, prints their performance figures, then measures summation and scalar
// product for 1..T threads and prints both speedup tables.
void task_4() {
printf("\n\n-------------- Task 4 --------------\n\n");
int threads = 32;
omp_set_num_threads(threads);
cout << omp_get_max_threads() << " threads have been started." << endl;
size_t M, N, L, p, NLOOPS;
{ // Matrix-Vector product
printf("----- Benchmark (B) -----\n");
// Initialization
M = 8'000;
N = 12'000;
NLOOPS = 30;
auto [A,x] = init_B(M,N);
// Benchmark
tic();
benchmark_B(A, x, NLOOPS, false);
// average time of one loop
double sec = toc() / NLOOPS;
// Timings and Performance
// memory is a number of elements; print_performance multiplies it by the element size
size_t memory = M*N + M + N;
size_t flops = 2 * M * N;
print_performance(sec, memory, flops, sizeof(A[0]));
printf("-------------------------\n");
}
{ // Matrix-Matrix product
printf("----- Benchmark (C) -----\n");
// Initialization
M = 1'000;
N = 2'000;
L = 500;
NLOOPS = 20;
auto [A,B] = init_C(M,N,L);
// Benchmark
tic();
benchmark_C(A, B, L, NLOOPS, false);
// average time of one loop
double sec = toc() / NLOOPS;
// Timings and Performance
size_t memory = M*L + L*N + M*N;
size_t flops = M * 2*L * N;
print_performance(sec, memory, flops, sizeof(A[0]));
printf("-------------------------\n");
}
{ // Polynomial evaluation
printf("----- Benchmark (D) -----\n");
// Initialization
N = 1'000'000;
p = 200;
NLOOPS = 20;
auto [x,a] = init_D(N,p);
// Benchmark
tic();
benchmark_D(x, a, NLOOPS);
// average time of one loop
double sec = toc() / NLOOPS;
// Timings and Performance
// the double-valued products below are converted implicitly to size_t
size_t memory = 2.0 * N;
size_t flops = 2.0 * N * p;
print_performance(sec, memory, flops, sizeof(x[0]));
printf("-------------------------\n");
}
// Timing
// For each problem size (outer loop) and each thread count 1..T (inner loop) the
// average time per loop is stored at index k*T+t.
NLOOPS = 50;
int K=9, T=16;
vector<double> speedup_sum((K-3+1)*T), speedup_scalar((K-3+1)*T);
for (int k=0; k<(K-3+1); ++k) {
// NOTE(review): this gives N = 10^k for k = 0..K-3, while the tables below label
// the rows k = 3..K -- confirm whether N = 10^(k+3) was intended.
N = pow(10,k);
auto [x,y] = init_A(N);
for (int t=0; t<T; t++) {
omp_set_num_threads(t+1);
tic();
benchmark_summation(x, NLOOPS);
speedup_sum[k*T+t] = toc() / NLOOPS;
tic();
benchmark_A(x, y, NLOOPS, false);
speedup_scalar[k*T+t] = toc() / NLOOPS;
}
}
// Calculating speedup
// Each time is replaced in place by (time with 1 thread) / (time with t+1 threads);
// the single-thread times are saved first because entry k*T is overwritten too.
for (int k=0; k<(K-3+1); ++k) {
double t0 = speedup_sum[k*T];
double t00 = speedup_scalar[k*T];
for (int t=0; t<T; t++){
speedup_sum[k*T+t] = t0/speedup_sum[k*T+t];
speedup_scalar[k*T+t] = t00/speedup_scalar[k*T+t];
}
}
// Printing tables
// One row per problem size (labelled k = 3..K), one column per thread count.
cout << fixed << setprecision(4);
cout << "\n\nSpeedup: summation" << endl;
cout << "k \\ threads | ";
for (int t=0; t<T; t++) {cout << setw(2) << t+1 << " | ";}
cout << endl;
for (int k=3; k<K+1; ++k) {
cout << " " << k << " |";
for (int t=0; t<T; t++) {
cout << speedup_sum[(k-3)*T+t] << "|";
}
cout << endl;
}
cout << "\n\nSpeedup: scalar" << endl;
cout << "k \\ threads | ";
for (int t=0; t<T; t++) {cout << setw(2) << t+1 << " | ";}
cout << endl;
for (int k=3; k<K+1; ++k) {
cout << " " << k << " |";
for (int t=0; t<T; t++) {
cout << speedup_scalar[(k-3)*T+t] << "|";
}
cout << endl;
}
}
/// Program entry: executes the three exercise parts one after another.
int main()
{
    task_2();   // statistics of data_1.txt
    task_3();   // Goldbach decompositions
    task_4();   // linear-algebra benchmarks and speedup tables

    return 0;
}

73
ex5/code/mayer_primes.h Normal file
View file

@ -0,0 +1,73 @@
#pragma once
#include <cstring> //memset
#include <vector>
//using namespace std;
/** \brief Determines all prime numbers in interval [2, @p max].
*
* The sieve of Eratosthenes is used.
*
* The implementation originates from <a href="http://code.activestate.com/recipes/576559-fast-prime-generator/">Florian Mayer</a>.
*
* \param[in] max end of interval for the prime number search.
* \return vector of prime numbers @f$2,3,5, ..., p<=max @f$.
*
* \copyright
* Copyright (c) 2008 Florian Mayer (adapted by Gundolf Haase 2018)
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
*/
template <class T>
std::vector<T> get_primes(T max)
{
    std::vector<T> primes;
    // There is no prime below 2; returning early also keeps the sieve size non-negative.
    if (max < 2) {
        return primes;
    }

    // Bit sieve: bit (x % 8) of byte (x / 8) is set while x is still a prime candidate.
    // The vector owns the memory, so nothing leaks if push_back throws.
    std::vector<unsigned char> sieve(static_cast<std::size_t>(max) / 8 + 1, 0xFF);

    // The loop exit is tested before the increment so that x never passes max,
    // which would overflow T when max is the largest value of T.
    for (T x = 2; ; ++x)
    {
        const std::size_t pos = static_cast<std::size_t>(x);
        if (sieve[pos / 8] & (1u << (pos % 8))) {
            primes.push_back(x);
            // Is prime. Clear all multiples 2x, 3x, ... <= max.
            // j <= max - x guarantees that j + x still fits into T.
            for (T j = x; j <= max - x; j += x)
            {
                const std::size_t multiple = static_cast<std::size_t>(j) + pos;
                sieve[multiple / 8] &= static_cast<unsigned char>(~(1u << (multiple % 8)));
            }
        }
        if (x == max) {
            break;
        }
    }
    return primes;
}
//---------------------------------------------------------------
//int main() // by Florian Mayer
//{g++ -O3 -std=c++14 -fopenmp main.cpp && ./a.out
// vector<unsigned long> primes;
// primes = get_primes(10000000);
// // return 0;
// // Print out result.
// vector<unsigned long>::iterator it;
// for(it=primes.begin(); it < primes.end(); it++)
// cout << *it << " ";
//
// cout << endl;
// return 0;
//}

6
ex5/code/out_1.txt Normal file
View file

@ -0,0 +1,6 @@
1
1000
498.184
364.412
95.6857
287.905

93
ex5/code/task_2.cpp Normal file
View file

@ -0,0 +1,93 @@
#include "task_2.h"
#include <cassert> // assert
#include <cmath>
#include <fstream>
#include <omp.h>
/// Determines the smallest and largest entry of @p v with OpenMP min/max reductions.
/// @param[in] v non-empty vector of values
/// @return tuple (minimum, maximum)
std::tuple<double, double> min_max_par(const std::vector<double> &v) {
    assert(!v.empty() && "min_max_par needs at least one value");
    // The running extrema must be double: an int here would truncate the data.
    double min_val = v[0];
    double max_val = v[0];
    #pragma omp parallel for reduction(min:min_val) reduction(max:max_val)
    for (size_t i = 0; i < v.size(); ++i) {
        if (v[i] < min_val) min_val = v[i];
        if (v[i] > max_val) max_val = v[i];
    }
    return std::make_tuple(min_val, max_val);
}
/// Computes arithmetic, geometric and harmonic mean of @p v in one parallel pass.
/// The geometric mean is obtained from the sum of logarithms, the harmonic mean
/// from the sum of reciprocals.
/// @param[in] v vector of values
/// @return tuple (arithmetic mean, geometric mean, harmonic mean)
std::tuple<double, double, double> means_par(const std::vector<double>& v){
    size_t count = v.size();
    double total = 0;
    double log_total = 0;
    double reciprocal_total = 0;

    #pragma omp parallel for shared(v,count) reduction(+:total, log_total, reciprocal_total)
    for (size_t idx = 0; idx < count; ++idx) {
        const double value = v[idx];
        total += value;
        log_total += log(value);
        reciprocal_total += 1.0/value;
    }

    const double count_d = static_cast<double>(count);
    double arithmetic = total / count_d;
    double geometric = exp(1.0/count_d * log_total);
    double harmonic = count_d / reciprocal_total;
    return std::make_tuple(arithmetic, geometric, harmonic);
}
// Appends all double values that can be extracted from istr to v.
// A stream in bad state raises std::runtime_error; any other premature stop only
// prints a message and resets the stream's error flags.
void fill_vector(std::istream& istr, std::vector<double>& v)
{
    // Read values until an extraction fails.
    for (double value = 0; istr >> value; ) {
        v.push_back(value);
    }

    // Stopping anywhere but at the end of the input is an error.
    const bool reached_end = istr.eof();
    if (!reached_end)
    {
        std::cout << " Error handling \n";
        if (istr.bad()) {
            throw std::runtime_error("Schwerer Fehler in istr");
        }
        if (istr.fail()) { // try to clean up
            std::cout << " Failed in reading all data.\n";
            istr.clear();
        }
    }

    v.shrink_to_fit(); // C++11
}
// Opens file_name as text, empties v and fills it with the doubles found in the file.
// If the file cannot be opened a message is printed and the assertion fires.
void read_vector_from_file(const std::string& file_name, std::vector<double>& v)
{
    std::ifstream fin(file_name); // open the file in text mode

    if (!fin.is_open()) // file not found
    {
        std::cout << "\nFile " << file_name << " has not been found.\n\n" ;
        assert( fin.is_open() && "File not found." ); // exception handling for the poor programmer
        return;
    }

    // file found: discard old content, then read
    v.clear();
    fill_vector(fin, v);
}
// Creates/overwrites file_name and writes the entries of v, one per line.
// If the file cannot be opened a message is printed and the assertion fires.
void write_vector_to_file(const std::string& file_name, const std::vector<double>& v)
{
    std::ofstream fout(file_name); // open the file in text mode

    if (!fout.is_open())
    {
        std::cout << "\nFile " << file_name << " has not been opened.\n\n" ;
        assert( fout.is_open() && "File not opened." ); // exception handling for the poor programmer
        return;
    }

    for (const double value : v)
    {
        fout << value << std::endl;
    }
}

38
ex5/code/task_2.h Normal file
View file

@ -0,0 +1,38 @@
#pragma once
#include <iostream>
#include <vector>
using namespace std;
/**
Determines the minimum and the maximum of the entries of @p v in parallel.
@param[in] v vector of values (must not be empty, element 0 is read unconditionally)
@return tuple (minimum, maximum)
*/
tuple<double, double> min_max_par(const vector<double> &v);
/**
Computes the arithmetic, geometric and harmonic mean of the entries of @p v in parallel.
@param[in] v vector of values
@return tuple (arithmetic mean, geometric mean, harmonic mean)
*/
tuple<double, double, double> means_par(const vector<double>& v);
/**
This function opens the ASCII-file named @p file_name and reads the
double data into the C++ vector @p v. Previous content of @p v is discarded.
If the file @p file_name does not exist then a message is printed and an assertion fails.
@param[in] file_name name of the ASCII-file
@param[out] v C++ vector with double values
*/
void read_vector_from_file(const string& file_name, vector<double>& v);
/**
This function opens the ASCII-file named @p file_name and rewrites it with the
double data from the C++ vector @p v, one value per line.
If there are problems in opening/generating file @p file_name
then a message is printed and an assertion fails.
@param[in] file_name name of the ASCII-file
@param[in] v C++ vector with double values
*/
void write_vector_to_file(const string& file_name, const vector<double>& v);
/**
Appends data from an input stream @p istr to the double-vector @p v until no further
value can be extracted. Existing entries of @p v are kept; its memory is
allocated automatically.
@param[in] istr input stream
@param[in,out] v C++ vector with double values
@warning An exception (std::runtime_error) is thrown only if the stream is in a bad state.
If reading stops before the end of the input for another reason (e.g. wrong data
format), a message is printed and the error state of the stream is cleared.
*/
void fill_vector(istream& istr, vector<double>& v);

73
ex5/code/task_3.cpp Normal file
View file

@ -0,0 +1,73 @@
#include "task_3.h"
#include "mayer_primes.h"
#include "timing.h"
#include <algorithm>
#include <cassert>
#include <iostream>
#include <omp.h>
#include <vector>
using namespace std;
// Counts the pairs of primes (p, q) with p <= q and p + q == k.
// All pairs of primes up to k are tested; the outer loop is distributed over
// the OpenMP threads and the hits are combined by a sum reduction.
int single_goldbach_par(int k) {
    const std::vector<int> primes = get_primes(k);
    const size_t num_primes = primes.size();

    int hits = 0;
    #pragma omp parallel for reduction(+:hits)
    for (size_t lo = 0; lo < num_primes; lo++) {
        const int first = primes[lo];
        for (size_t hi = lo; hi < num_primes; hi++) {
            if (first + primes[hi] != k) {
                continue;
            }
            hits++;
        }
    }
    return hits;
}
// Returns a vector of size n+1 whose entry s is the number of pairs (p, q), p <= q,
// with p + q == s, where p and q are taken from get_primes(n) starting at its
// second entry (index 0 is skipped by the outer loop).
// Every thread counts into its own local vector; the VecAdd reduction merges the
// per-thread results into counts.
std::vector<int> count_goldbach_par(int n) {
    const std::vector<int> primes = get_primes(n);
    std::vector<int> counts(n+1);

    #pragma omp parallel reduction(VecAdd:counts)
    {
        // thread-private tally
        std::vector<int> local_counts(n+1, 0);

        #pragma omp for
        for (size_t i = 1; i < primes.size(); i++) {
            const int first = primes[i];
            for (size_t j = i; j < primes.size(); j++) {
                const int sum = first + primes[j];
                if (sum > n) {
                    continue;   // outside the counted range
                }
                local_counts[sum]++;
            }
        }

        // Add this thread's tally to its private copy of counts.
        // (A critical section with an element-wise loop would be an alternative
        // to the reduction.)
        counts += local_counts;
    }
    return counts;
}
// Prints every pair of primes (p, q), p <= q, with p + q == k on one line.
void print_decomps(int k) {
    const std::vector<int> primes = get_primes(k);
    const size_t num_primes = primes.size();

    std::cout << "\nDecompositions for k = " << k << ": ";
    for (size_t lo = 0; lo < num_primes; lo++) {
        const int first = primes[lo];
        for (size_t hi = lo; hi < num_primes; hi++) {
            const int second = primes[hi];
            if (first + second != k) {
                continue;
            }
            std::cout << first << " + " << second << ", ";
        }
    }
    std::cout << std::endl;
}

33
ex5/code/task_3.h Normal file
View file

@ -0,0 +1,33 @@
#pragma once
#include <cassert>
#include <vector>
using namespace std;
// Counts the number of possible decompositions of k into two primes p <= q with p + q == k.
// @param[in] k number to decompose
// @return number of decompositions
int single_goldbach_par(int k);
// Counts the number of possible decompositions with 2 primes that sum up to k for all even numbers k \in {4,...,n}.
// @param[in] n upper limit
// @return vector of size n+1; entry k holds the count for k
// NOTE(review): the implementation starts at the second prime returned by get_primes,
// so sums involving the first one (presumably 2) are not counted -- confirm.
vector<int> count_goldbach_par(int n);
// Prints all decompositions of k into two primes to standard output.
void print_decomps(int k);
/** Vector @p b adds its elements to vector @p a .
@param[in] a vector
@param[in] b vector
@return a+=b componentwise
*/
template<class T>
std::vector<T> &operator+=(std::vector<T> &a, std::vector<T> const &b)
{
    // Both operands must have the same length.
    assert(a.size()==b.size());

    // Walk through b in step with the elements of a.
    auto source = b.begin();
    for (auto &target : a) {
        target += *source;
        ++source;
    }
    return a;
}
// User-defined OpenMP reduction "VecAdd" for std::vector<int>: partial results are
// combined element-wise with operator+=. Each private copy is initialized as a copy
// of the original variable (omp_priv=omp_orig), which gives it the same size.
// NOTE(review): because the private copies start with the original's values, the
// reduced variable should contain only zeros before the parallel region -- confirm at call sites.
#pragma omp declare reduction(VecAdd : std::vector<int> : omp_out += omp_in) \
initializer (omp_priv=omp_orig)

232
ex5/code/task_4.cpp Normal file
View file

@ -0,0 +1,232 @@
#include "task_4.h"
#include "timing.h"
#include <cassert>
#include <cblas.h> // cBLAS Library
#include <cmath>
#include <iostream>
#include <utility>
#include <vector>
using namespace std;
// Dense matrix-vector product b = A*x.
// A is stored row by row as an M x N matrix with N = x.size() and M = A.size()/N.
// The rows are distributed over the OpenMP threads.
// Returns b with M entries; an empty x yields an empty result.
std::vector<double> matrix_vec(std::vector<double> const &A, std::vector<double> const &x) {
    size_t const N = x.size();
    if (N == 0) {
        return {};      // no columns: the row count below would divide by zero
    }
    size_t const M = A.size() / N;
    std::vector<double> b(M);

    #pragma omp parallel for shared(A,x,N,M,b)
    for (size_t i = 0; i < M; ++i) {
        // accumulate in a local so the shared result is written once per row
        double row_sum = 0.0;
        for (size_t j = 0; j < N; ++j) {
            row_sum += A[i*N + j] * x[j];
        }
        b[i] = row_sum;
    }
    return b;
}
// Dense matrix-matrix product C = A*B with row-wise storage.
// A is M x L with L = A.size()/M, B is L x N with N = B.size()/L; C is M x N.
// The rows of C are distributed over the OpenMP threads; the loop order i-k-j
// walks through B and C row by row.
// Returns an empty vector if M is 0 or A is empty (no dimensions can be derived).
std::vector<double> matrix_matrix(std::vector<double> const &A, std::vector<double> const &B, size_t const &M) {
    if (M == 0) {
        return {};      // avoids the division by zero below
    }
    size_t const L = A.size() / M;
    if (L == 0) {
        return {};      // avoids the division by zero below
    }
    size_t const N = B.size() / L;
    std::vector<double> C(M*N, 0);

    #pragma omp parallel for shared(A,B,M,L,N,C)
    for (size_t i = 0; i < M; ++i) {
        for (size_t k = 0; k < L; ++k) {
            double const a_ik = A[i*L + k];
            for (size_t j = 0; j < N; ++j) {
                C[i*N + j] += a_ik * B[k*N + j];
            }
        }
    }
    return C;
}
// Evaluates the polynomial a[0] + a[1]*t + ... + a[p-1]*t^(p-1) at every t = x[i]
// with Horner's scheme; the evaluation points are distributed over the OpenMP threads.
// Returns one value per entry of x. An empty coefficient vector yields zeros.
std::vector<double> poly(std::vector<double> const &x, std::vector<double> const &a) {
    size_t const N = x.size();
    size_t const p = a.size();
    std::vector<double> y(N);   // zero-initialised
    if (p == 0) {
        return y;
    }

    #pragma omp parallel for shared(x,a,N,p,y)
    for (size_t i = 0; i < N; ++i) {
        // Start with the highest coefficient a[p-1] (a[p] would be out of bounds)
        // and work down to a[0] inclusive.
        double value = a[p-1];
        for (size_t k = p-1; k-- > 0; ) {
            value = value*x[i] + a[k];
        }
        y[i] = value;
    }
    return y;
}
// Inner product of x and y (equal length is asserted), computed with an OpenMP
// sum reduction.
double scalar(std::vector<double> const &x, std::vector<double> const &y) {
    assert(x.size() == y.size());

    size_t const length = x.size();
    double dot = 0.0;
    #pragma omp parallel for shared(x,y,length) reduction(+:dot)
    for (size_t idx = 0; idx < length; ++idx) {
        dot += x[idx] * y[idx];
    }
    return dot;
}
// Sum of all entries of x, computed with an OpenMP sum reduction.
double summation(std::vector<double> const &x){
    size_t const length = x.size();
    double total = 0.0;
    #pragma omp parallel for shared(x,length) reduction(+:total)
    for (size_t idx = 0; idx < length; ++idx) {
        total += x[idx];
    }
    return total;
}
// ##########################################################################
// Prints the performance figures of one benchmark.
//   sec    duration of one loop in seconds
//   memory number of data elements involved
//   flops  number of floating point operations per loop
//   size   size of one data element in bytes
// All "giga" quantities are scaled by 1024^3.
void print_performance(double sec, size_t memory, size_t flops, unsigned int size) {
    // 1024^3 is a power of two, so dividing by it once gives the same value as
    // dividing by 1024 three times.
    double const giga = 1024.0 * 1024.0 * 1024.0;
    double const elements = 1.0 * memory;
    double const operations = 1.0 * flops;

    printf("Memory allocated : %.3f GByte\n", elements / giga * size);
    printf("Duration per loop : %.3f sec\n", sec);
    printf("GFLOPS : %.3f\n", operations / sec / giga);
    printf("GiByte/s : %.3f\n", elements / sec / giga * size);
}
// Test data for the scalar product: x[i] = i%219 + 1 and y[i] = 1/x[i],
// so that every product x[i]*y[i] is (approximately) 1.
// Returns the tuple (x, y); the vectors are moved into the result instead of copied.
std::tuple<std::vector<double>, std::vector<double>> init_A(size_t N) {
    std::vector<double> x(N), y(N);
    for (size_t i = 0; i < N; ++i) {
        x[i] = i%219 + 1.0;
        y[i] = 1.0 / x[i];
    }
    return std::make_tuple(std::move(x), std::move(y));
}
// Runs the scalar product of x and y NLOOPS times, either with the own
// implementation scalar() (cblas == false) or with cblas_ddot (cblas == true),
// and prints a warning if the accumulated result is not N*NLOOPS.
void benchmark_A(std::vector<double> const &x, std::vector<double> const &y, size_t NLOOPS, bool cblas) {
    size_t const N = x.size();
    double sum(0.0);
    if (cblas) {
        for (size_t i = 0; i < NLOOPS; ++i) {
            sum += cblas_ddot(N, x.data(), 1, y.data(), 1);
        }
    } else {
        for (size_t i = 0; i < NLOOPS; ++i) {
            sum += scalar(x, y);
        }
    }
    // Check correctness.
    // The sum is rounded to the nearest integer: truncating would report a wrong
    // result as soon as floating point rounding leaves the sum slightly below N*NLOOPS.
    if (std::llround(sum) != static_cast<long long>(N*NLOOPS)) {printf(" !! W R O N G result !!\n");}
}
// Test data for the matrix-vector product: A is M x N (row-wise) with
// A[i][j] = (i+j)%219 + 1, and x[j] is the reciprocal of the row-17 entry
// (17+j)%219 + 1, so that (A*x)[17] is (approximately) N.
// x is computed from the formula instead of being read from A, which avoids an
// out-of-bounds access when A has fewer than 18 rows.
// Returns the tuple (A, x); the vectors are moved into the result instead of copied.
std::tuple<std::vector<double>, std::vector<double>> init_B(size_t M, size_t N) {
    std::vector<double> A(M*N), x(N);
    for (size_t i = 0; i < M; ++i) {
        for (size_t j = 0; j < N; ++j) {
            A[i*N + j] = (i+j)%219 + 1.0;
        }
    }
    for (size_t j = 0; j < N; ++j) {
        x[j] = 1.0/((17+j)%219 + 1.0);
    }
    return std::make_tuple(std::move(A), std::move(x));
}
// Runs the matrix-vector product A*x NLOOPS times, either with the own
// implementation matrix_vec() (cblas == false) or with cblas_dgemv (cblas == true),
// and prints a warning if the accumulated entry 17 of the result is not N*NLOOPS.
// A must have more than 17 rows because entry 17 of the result is checked.
void benchmark_B(std::vector<double> const &A, std::vector<double> const &x, size_t NLOOPS, bool cblas) {
    size_t const N = x.size();
    assert(N > 0 && "benchmark_B needs a non-empty vector x");
    size_t const M = A.size() / N;
    assert(M > 17 && "benchmark_B checks entry 17 of the result");
    std::vector<double> b(M);
    double sum(0.0);
    if (cblas) {
        for (size_t i = 0; i < NLOOPS; ++i) {
            cblas_dgemv(CblasRowMajor, CblasNoTrans, M, N, 1.0, A.data(), N, x.data(), 1, 0, b.data(), 1);
            sum += b[17];
        }
    } else {
        for (size_t i = 0; i < NLOOPS; ++i) {
            b = matrix_vec(A,x);
            sum += b[17];
        }
    }
    // Check correctness.
    // The sum is rounded to the nearest integer: truncating would report a wrong
    // result as soon as floating point rounding leaves the sum slightly below N*NLOOPS.
    if (std::llround(sum) != static_cast<long long>(N*NLOOPS)) {printf(" !! W R O N G result !!\n");}
}
// Test data for the matrix-matrix product: A is M x L and B is L x N (row-wise),
// both filled with (row+column)%219 + 1, except for column 17 of B.
// B is chosen such that C[0,17] = L: B[i,17] = 1/A[0,i] with A[0,i] = i%219 + 1.
// The reciprocal is computed from the formula instead of being read from A, which
// avoids an out-of-bounds access when A has no rows.
// Returns the tuple (A, B); the vectors are moved into the result instead of copied.
std::tuple<std::vector<double>, std::vector<double>> init_C(size_t M, size_t N, size_t L) {
    std::vector<double> A(M*L), B(L*N);
    for (size_t i = 0; i < M; ++i) {
        for (size_t j = 0; j < L; ++j) {
            A[i*L + j] = (i+j)%219 + 1.0;
        }
    }
    for (size_t i = 0; i < L; ++i) {
        double const first_row_entry = i%219 + 1.0;     // value of A[0,i]
        for (size_t j = 0; j < N; ++j) {
            B[i*N + j] = (j == 17) ? 1.0/first_row_entry : (i+j)%219 + 1.0;
        }
    }
    return std::make_tuple(std::move(A), std::move(B));
}
// Runs the matrix-matrix product A*B NLOOPS times, either with the own
// implementation matrix_matrix() (cblas == false) or with cblas_dgemm (cblas == true),
// and prints a warning if the accumulated entry 17 of the result is not L*NLOOPS.
// A is M x L and B is L x N with M = A.size()/L and N = B.size()/L; the result
// must have more than 17 entries because entry 17 is checked.
void benchmark_C(std::vector<double> const &A, std::vector<double> const &B, size_t L, size_t NLOOPS, bool cblas) {
    assert(L > 0 && "benchmark_C needs a positive inner dimension");
    size_t const M = A.size() / L;
    size_t const N = B.size() / L;
    assert(M*N > 17 && "benchmark_C checks entry 17 of the result");
    std::vector<double> C(M*N);
    double sum(0.0);
    if (cblas) {
        for (size_t i = 0; i < NLOOPS; ++i) {
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, L, 1.0, A.data(), L, B.data(), N, 0.0, C.data(), N);
            sum += C[17];
        }
    } else {
        for (size_t i = 0; i < NLOOPS; ++i) {
            C = matrix_matrix(A,B,M);
            sum += C[17];
        }
    }
    // Check correctness.
    // The sum is rounded to the nearest integer: truncating would report a wrong
    // result as soon as floating point rounding leaves the sum slightly below L*NLOOPS.
    if (std::llround(sum) != static_cast<long long>(L*NLOOPS)) {printf(" !! W R O N G result !!\n");}
}
// Test data for the polynomial evaluation:
//   x_i = i/N for i=0,...,N-1   (evaluation points)
//   a_j = 1   for j=0,...,p-1   (coefficients)
// Returns the tuple (x, a); the vectors are moved into the result instead of copied.
std::tuple<std::vector<double>, std::vector<double>> init_D(size_t N, size_t p) {
    std::vector<double> x(N);
    for (size_t i = 0; i < N; ++i) {
        x[i] = static_cast<double>(i) / N;
    }
    std::vector<double> a(p, 1.0);
    return std::make_tuple(std::move(x), std::move(a));
}
// Runs the polynomial evaluation poly(x, a) NLOOPS times and prints a warning if
// the accumulated first result value is not NLOOPS.
// x must not be empty because the first entry of the result is checked.
void benchmark_D(std::vector<double> const &x, std::vector<double> const &a, size_t NLOOPS) {
    size_t const N = x.size();
    assert(N > 0 && "benchmark_D checks entry 0 of the result");
    std::vector<double> y(N);
    double sum(0.0);
    for (size_t i = 0; i < NLOOPS; ++i) {
        y = poly(x,a);
        sum += y[0];
    }
    // Check correctness.
    // The sum is rounded to the nearest integer, so that floating point rounding
    // slightly below NLOOPS is not reported as a wrong result.
    if (std::llround(sum) != static_cast<long long>(NLOOPS)) {printf(" !! W R O N G result sum = %f !!\n", sum);}
}
// Runs summation(x) NLOOPS times; the results are accumulated but not checked.
void benchmark_summation(std::vector<double> const &x, size_t NLOOPS) {
    double partial(0.0), total(0.0);
    size_t loop = 0;
    while (loop < NLOOPS) {
        partial = summation(x);
        total += partial;
        ++loop;
    }
}

22
ex5/code/task_4.h Normal file
View file

@ -0,0 +1,22 @@
#pragma once
#include <vector>
using namespace std;
// ---- Kernels (parallelised with OpenMP) ----
// Matrix-vector product b = A*x; A is stored row-wise with x.size() columns.
vector<double> matrix_vec(vector<double> const &A, vector<double> const &x);
// Matrix-matrix product C = A*B; A has M rows, both matrices are stored row-wise.
vector<double> matrix_matrix(vector<double> const &A, vector<double> const &B, size_t const &M);
// Evaluates a polynomial with coefficient vector a at every entry of x; returns one value per entry.
vector<double> poly(vector<double> const &x, vector<double> const &a);
// Scalar (inner) product of x and y; equal sizes are asserted.
double scalar(vector<double> const &x, vector<double> const &y);
// Sum of all entries of x.
double summation(vector<double> const &x);
// Prints memory, duration per loop, GFLOPS and GiByte/s for a benchmark;
// memory is a number of elements and size the number of bytes per element.
void print_performance(double sec, size_t memory, size_t flops, unsigned int size);
// ---- Test data ----
// Vectors x, y of length N with y[i] = 1/x[i].
tuple<vector<double>, vector<double>> init_A(size_t N);
// M x N matrix A and vector x of length N holding the reciprocals of row 17 of A.
tuple<vector<double>, vector<double>> init_B(size_t M, size_t N);
// M x L matrix A and L x N matrix B; column 17 of B holds the reciprocals of row 0 of A.
tuple<vector<double>, vector<double>> init_C(size_t M, size_t N, size_t L);
// N evaluation points x[i] = i/N and p coefficients equal to 1.
tuple<vector<double>, vector<double>> init_D(size_t N, size_t p);
// ---- Benchmarks: each runs its kernel NLOOPS times ----
// For A, B and C the flag cblas selects the cBLAS routine instead of the own kernel.
// A, B, C and D print a warning if the accumulated result is not the expected one.
void benchmark_A(vector<double> const &x, vector<double> const &y, size_t NLOOPS, bool cblas);
void benchmark_B(vector<double> const &A, vector<double> const &x, size_t NLOOPS, bool cblas);
void benchmark_C(vector<double> const &A, vector<double> const &B, size_t L, size_t NLOOPS, bool cblas);
void benchmark_D(vector<double> const &x, vector<double> const &a, size_t NLOOPS);
void benchmark_summation(vector<double> const &x, size_t NLOOPS);

51
ex5/code/timing.h Normal file
View file

@ -0,0 +1,51 @@
//
// Gundolf Haase, Oct 18 2024
//
#pragma once
#include <chrono> // timing
#include <stack>
//using Clock = std::chrono::system_clock; //!< The wall clock timer chosen
using Clock = std::chrono::high_resolution_clock;   //!< The timer chosen
using TPoint= std::chrono::time_point<Clock>;
// [Galowicz, C++17 STL Cookbook, p. 29]
inline
std::stack<TPoint> MyStopWatch; //!< starting time of stopwatch
/** Starts stopwatch timer.
* Use as @code tic(); myfunction(...) ; double tsec = toc(); @endcode
*
* The timing can be nested and the recent time point is stored on top of the stack.
*
* @return recent time point
* @see toc
*/
inline auto tic()
{
    MyStopWatch.push(Clock::now());
    return MyStopWatch.top();
}
/** Returns the elapsed time from stopwatch and removes the top time point from the stack.
*
* The time point from top of the stack is used
* if time point @p t_b is not passed as input parameter.
* Use as @code tic(); myfunction(...) ; double tsec = toc(); @endcode
* or as @code auto t_b = tic(); myfunction(...) ; double tsec = toc(t_b); @endcode
* The last option is to be used in the case of
* non-nested but overlapping time measurements.
*
* @param[in] t_b start time of some stop watch
* @return elapsed time in seconds.
* @warning Calling toc() without argument requires a preceding tic(): the default
* argument reads the top of the stack.
*
*/
inline double toc(TPoint const &t_b = MyStopWatch.top())
{
    // https://en.cppreference.com/w/cpp/chrono/treat_as_floating_point
    using Unit = std::chrono::seconds;
    using FpSeconds = std::chrono::duration<double, Unit::period>;
    auto const t_e = Clock::now();
    // With the default argument t_b refers to the element on top of the stack,
    // so the elapsed time has to be computed before pop() destroys that element.
    double const elapsed = FpSeconds(t_e-t_b).count();
    if (!MyStopWatch.empty()) {
        MyStopWatch.pop();
    }
    return elapsed;
}