This commit is contained in:
Markus Schmidt 2025-11-12 00:14:28 +01:00
commit 2195a9db0a
51 changed files with 13038 additions and 0 deletions

6
sheet3/345/.vscode/settings.json vendored Normal file
View file

@ -0,0 +1,6 @@
{
"files.associations": {
"ostream": "cpp",
"iostream": "cpp"
}
}

2563
sheet3/345/Doxyfile Normal file

File diff suppressed because it is too large Load diff

View file

30
sheet3/345/Makefile Normal file
View file

@ -0,0 +1,30 @@
#
# Build description for the sheet 3 benchmark program (main.cpp, mylib.cpp, benchmark.cpp).
# The compiler-specific rules come from ../${COMPILER}default.mk, included at the end.
#
# use GNU-Compiler tools
COMPILER=GCC_
# alternatively from the shell
# export COMPILER=GCC_
# or, alternatively from the shell
# make COMPILER=GCC_
# use Intel compilers
#COMPILER=ICC_
# use PGI compilers
# COMPILER=PGI_
SOURCES = main.cpp mylib.cpp benchmark.cpp
OBJECTS = $(SOURCES:.cpp=.o)
PROGRAM = main.${COMPILER}
# The next two lines enable debug symbols for debugging and detailed performance analysis.
# They are currently active; comment them out for a build without -g.
CXXFLAGS += -g
LINKFLAGS += -g
# do not use -pg with PGI compilers
# Fallback default. NOTE: COMPILER is already assigned unconditionally above,
# so this branch only takes effect if that assignment is removed.
ifndef COMPILER
COMPILER=GCC_
endif
include ../${COMPILER}default.mk

View file

127
sheet3/345/benchmark.cpp Normal file
View file

@ -0,0 +1,127 @@
#include <iostream>
#include <vector>
#include <cmath>
using namespace std;
/**
 * Benchmark A: inner product of two vectors.
 *
 * The number of terms is taken from x; y is read at the same positions and
 * therefore has to provide at least x.size() elements.
 *
 * @param[in] x first vector
 * @param[in] y second vector
 * @return sum of x[i]*y[i], accumulated from the first to the last element
 */
double benchmark_A(const vector<double> &x, const vector<double> &y)
{
    double acc = 0.0;
    auto rhs = y.begin();
    for (auto lhs = x.begin(); lhs != x.end(); ++lhs, ++rhs)
    {
        acc += (*lhs) * (*rhs);
    }
    return acc;
}
/**
 * Benchmark B: matrix-vector product b = A * x.
 *
 * A is a dense matrix stored row by row in one contiguous vector; the column
 * count N is x.size() and the row count M is derived as A.size() / N.
 *
 * @param[in] A matrix entries, element (i,j) at A[i*N+j]
 * @param[in] x vector with N entries
 * @return vector b with M entries; empty if x is empty (the row count cannot
 *         be derived then, and the division would otherwise be by zero)
 */
vector<double> benchmark_B(const vector<double> &A, const vector<double> &x)
{
    // Index with the container's size type so that i*N+j cannot wrap around
    // for matrices with more than 2^32 entries.
    using Index = vector<double>::size_type;

    const Index N = x.size();
    if (N == 0)
    {
        return vector<double>();
    }
    const Index M = A.size() / N;

    vector<double> b(M, 0.0);
    for (Index i = 0; i < M; ++i)
    {
        const Index rowStart = i * N;
        double bi = 0.0;
        for (Index j = 0; j < N; ++j)
        {
            bi += A[rowStart + j] * x[j];
        }
        b[i] = bi;
    }
    return b;
}
/**
 * Benchmark C: matrix-matrix product C = A * B.
 *
 * All matrices are dense and stored row by row in contiguous vectors.
 * A has M rows and L = A.size() / M columns, B has L rows and
 * N = B.size() / L columns, and the result has M rows and N columns.
 *
 * @param[in] A left factor, element (i,k) at A[i*L+k]
 * @param[in] B right factor, element (k,j) at B[k*N+j]
 * @param[in] M number of rows of A (and of the result)
 * @return product with element (i,j) at C[i*N+j]; empty if M is zero or A
 *         holds fewer than M entries, because the remaining dimensions
 *         cannot be derived then (the divisions would be by zero)
 */
vector<double> benchmark_C(const vector<double> &A, const vector<double> &B, unsigned int M)
{
    // Index with the container's size type so that the index products cannot
    // wrap around in 32-bit arithmetic.
    using Index = vector<double>::size_type;

    const Index rows = M;
    if (rows == 0)
    {
        return vector<double>();
    }
    const Index L = A.size() / rows;
    if (L == 0)
    {
        return vector<double>();
    }
    const Index N = B.size() / L;

    vector<double> C(rows * N, 0.0);
    for (Index i = 0; i < rows; ++i)
    {
        for (Index j = 0; j < N; ++j)
        {
            double sum = 0.0;
            for (Index k = 0; k < L; ++k)
            {
                sum += A[i * L + k] * B[k * N + j];
            }
            C[i * N + j] = sum;
        }
    }
    return C;
}
/**
 * Benchmark D: evaluates one polynomial at many points with Horner's scheme.
 *
 * The polynomial is coeff[0] + coeff[1]*t + ... + coeff[p-1]*t^(p-1) with
 * p = coeff.size().
 *
 * @param[in] coeff polynomial coefficients, lowest order first
 * @param[in] x     evaluation points
 * @return vector y with y[i] = polynomial evaluated at x[i]; for an empty
 *         coefficient vector every y[i] is 0.0
 */
vector<double> benchmark_D(const vector<double>& coeff, const vector<double>& x)
{
    using Index = vector<double>::size_type;

    const Index N = x.size();
    vector<double> y(N); // value-initialised, i.e. all entries are 0.0

    // Without coefficients there is no leading term to start from; reading
    // coeff[p-1] would be out of bounds.
    if (coeff.empty())
    {
        return y;
    }

    for (Index i = 0; i < N; ++i)
    {
        const double xi = x[i];
        // Walk the coefficients from the highest order down to coeff[0].
        auto c = coeff.rbegin();
        double yi = *c;
        for (++c; c != coeff.rend(); ++c)
        {
            yi = yi * xi + *c;
        }
        y[i] = yi;
    }
    return y;
}
//TASK 5
/**
 * Euclidean (L2) norm of a vector.
 *
 * @param[in] x vector
 * @return square root of the sum of the squared entries, summed from the
 *         first to the last element
 */
double norm2(const vector<double>& x)
{
    double squares = 0.0;
    for (const double entry : x)
    {
        squares += entry * entry;
    }
    return std::sqrt(squares);
}
/**
 * Inner product of two vectors with compensated (Kahan) summation.
 *
 * Next to the running total, a correction term carries the rounding error of
 * the previous addition and is subtracted from the next product before that
 * product is added.
 *
 * @param[in] x first vector; its length determines the number of terms
 * @param[in] y second vector; read at the same positions as x
 * @return compensated sum of x[i]*y[i]
 */
double scalar_kahan(const vector<double>& x, const vector<double>& y)
{
    double total = 0.0;
    double correction = 0.0;
    const vector<double>::size_type count = x.size();
    for (vector<double>::size_type k = 0; k != count; ++k)
    {
        const double product = x[k] * y[k];
        const double adjusted = product - correction;
        const double updated = total + adjusted;
        // (updated - total) is what was actually added; the difference to
        // 'adjusted' is the rounding error carried into the next step.
        correction = (updated - total) - adjusted;
        total = updated;
    }
    return total;
}
/**
 * Matrix-matrix product C = A * B with A stored column by column.
 *
 * A has M rows and L = A.size() / M columns and is stored column-major, so
 * element (i,k) is at A[k*M+i]. B (L rows, N = B.size() / L columns) and the
 * result C (M rows, N columns) are stored row by row.
 *
 * The leading dimension of a column-major M x L matrix is M. Indexing with L
 * instead is only in bounds for square matrices, where both strides coincide.
 *
 * @param[in] A left factor, column-major, element (i,k) at A[k*M+i]
 * @param[in] B right factor, row-major, element (k,j) at B[k*N+j]
 * @param[in] M number of rows of A (and of the result)
 * @return product with element (i,j) at C[i*N+j]; empty if M is zero or A
 *         holds fewer than M entries, because the remaining dimensions
 *         cannot be derived then (the divisions would be by zero)
 */
vector<double> matrixMultColumnWise(const vector<double> &A, const vector<double> &B, unsigned int M)
{
    // Index with the container's size type so that the index products cannot
    // wrap around in 32-bit arithmetic.
    using Index = vector<double>::size_type;

    const Index rows = M;
    if (rows == 0)
    {
        return vector<double>();
    }
    const Index L = A.size() / rows;
    if (L == 0)
    {
        return vector<double>();
    }
    const Index N = B.size() / L;

    vector<double> C(rows * N, 0.0);
    for (Index i = 0; i < rows; ++i)
    {
        for (Index j = 0; j < N; ++j)
        {
            double sum = 0.0;
            for (Index k = 0; k < L; ++k)
            {
                sum += A[k * rows + i] * B[k * N + j];
            }
            C[i * N + j] = sum;
        }
    }
    return C;
}

29
sheet3/345/benchmark.h Normal file
View file

@ -0,0 +1,29 @@
#ifndef BENCHMARK_H
#define BENCHMARK_H
#include <vector>

// All names are qualified with std:: so that including this header does not
// inject a using-directive into every translation unit.

/** Benchmark A: inner product.
 @param[in] x vector
 @param[in] y vector, read at the same positions as x
 @return sum of x[i]*y[i]
*/
double benchmark_A(const std::vector<double> &x,
                   const std::vector<double> &y);

/** Benchmark B: matrix-vector product b = A*x.
 @param[in] A dense matrix stored row by row; x.size() columns
 @param[in] x vector
 @return resulting vector b with A.size()/x.size() entries
*/
std::vector<double> benchmark_B(const std::vector<double> &A,
                                const std::vector<double> &x);

/** Benchmark C: matrix-matrix product C = A*B, all matrices stored row by row.
 @param[in] A left factor with M rows
 @param[in] B right factor
 @param[in] M number of rows of A
 @return resulting matrix C with M rows
*/
std::vector<double> benchmark_C(const std::vector<double> &A,
                                const std::vector<double> &B,
                                unsigned int M);

/** Benchmark D: polynomial evaluation (Horner scheme) at many points.
 @param[in] coefficients polynomial coefficients, lowest order first
 @param[in] x evaluation points
 @return vector with the polynomial value for every entry of x
*/
std::vector<double> benchmark_D(const std::vector<double> &coefficients,
                                const std::vector<double> &x);

/** Euclidean (L2) norm of a vector.
 @param[in] x vector
 @return square root of the sum of the squared entries
*/
double norm2(const std::vector<double>& x);

/** Inner product with compensated (Kahan) summation.
 @param[in] x vector
 @param[in] y vector, read at the same positions as x
 @return compensated sum of x[i]*y[i]
*/
double scalar_kahan(const std::vector<double>& x, const std::vector<double>& y);

/** Matrix-matrix product in which A is traversed column-wise.
 @param[in] A left factor; see the implementation for the exact index mapping
 @param[in] B right factor, stored row by row
 @param[in] M number of rows of the result
 @return resulting matrix with M rows, stored row by row
*/
std::vector<double> matrixMultColumnWise(const std::vector<double> &A,
                                         const std::vector<double> &B,
                                         unsigned int M);
#endif

BIN
sheet3/345/main Executable file

Binary file not shown.

286
sheet3/345/main.cpp Normal file
View file

@ -0,0 +1,286 @@
#include "mylib.h"
#include <cassert>
#include <chrono> // timing
#include <cmath> // sqrt()
#include <cstdlib> // atoi()
#include <cstring> // strncmp()
#include <ctime>
#include <iostream>
#include <sstream>
#include "benchmark.h"
using namespace std;
using namespace std::chrono; // timing
/**
 * Benchmark driver.
 *
 * Runs each kernel from benchmark.h repeatedly on synthetic data, measures the
 * average wall-clock time per call and prints derived throughput figures.
 * Intervals are measured with steady_clock, which is monotonic; system_clock
 * may be adjusted while a measurement is running.
 *
 * NOTE(review): the figures labelled "GFLOPS" are divided by 1024^3 (not 10^9),
 * i.e. they use the same binary prefix as the GiByte/s figures. This is kept
 * unchanged so that results stay comparable with earlier runs.
 *
 * @param argc unused
 * @param argv unused
 * @return 0
 */
int main(int argc, char **argv)
{
    // The command line is not evaluated.
    (void)argc;
    (void)argv;

    const unsigned int NA = 1400000;   // vector length for benchmarks A, 5A and 5B
    const unsigned int NLOOPSA = 2000; // repetitions for benchmarks A, 5A and 5B
    //const unsigned int NLOOPS = 10;
    const unsigned int MC = 1000;      // matrix dimension for benchmarks C and 5C
    const unsigned int NLOOPSC = 5;    // repetitions for C and 5C; unsigned like the loop counter

    // ---------- Benchmark A ----------
    {
        vector<double> xA(NA), yA(NA);
        for (unsigned int i = 0; i < NA; ++i)
        {
            double xi = (i % 219) + 1;
            xA[i] = xi;
            yA[i] = 1.0 / xi; // x[i]*y[i] is 1 up to rounding, so <x,y> is about NA
        }
        auto tA1 = steady_clock::now();
        double sA = 0.0, sumA = 0.0;
        for (unsigned int loop = 0; loop < NLOOPSA; ++loop)
        {
            sA = benchmark_A(xA, yA);
            sumA += sA; // accumulate so that every call has a consumer
        }
        auto tA2 = steady_clock::now();
        auto durA = duration_cast<microseconds>(tA2 - tA1);
        double tA = static_cast<double>(durA.count()) / 1e6 / NLOOPSA; //duration per loop seconds
        cout << "\n===== Benchmark A =====\n";
        cout << "<xA,yA> = " << sA << endl;
        cout << "Timing in sec. : " << tA << endl;
        cout << "GFLOPS : " << 2.0 * NA / tA / 1024 / 1024 / 1024 << endl;
        cout << "GiByte/s : "
             << 2.0 * NA * sizeof(xA[0]) / tA / 1024 / 1024 / 1024 << endl;
    }

    // ---------- Benchmark B----------
    {
        const unsigned int MB = 1700;
        const unsigned int NB = MB;
        const unsigned int NLOOPSB = 200;//50;
        vector<double> AB(MB * NB);
        vector<double> xB(NB);
        for (unsigned int i = 0; i < MB; ++i)
            for (unsigned int j = 0; j < NB; ++j)
                AB[i * NB + j] = (i + j) % 219 + 1;
        for (unsigned int j = 0; j < NB; ++j)
        {
            // reciprocal of row 17 of the matrix
            xB[j] = 1.0 / AB[17 * NB + j];
        }
        vector<double> bB;
        auto tB1 = steady_clock::now();
        double guardB = 0.0;
        for (unsigned int loop = 0; loop < NLOOPSB; ++loop)
        {
            bB = benchmark_B(AB, xB);
            guardB += bB[17];
        }
        auto tB2 = steady_clock::now();
        auto durB = duration_cast<microseconds>(tB2 - tB1);
        double tB = static_cast<double>(durB.count()) / 1e6 / NLOOPSB;
        double flopsB = 2.0 * MB * NB;
        double bytesB = (MB * NB + NB + MB) * sizeof(double);
        cout << "\n===== Benchmark B =====\n";
        cout << guardB << endl;
        cout << "bytes: " << bytesB << endl;
        cout << "Timing in sec. : " << tB << endl;
        cout << "GFLOPS : " << flopsB / tB / 1024 / 1024 / 1024 << endl;
        cout << "GiByte/s : " << bytesB / tB / 1024 / 1024 / 1024 << endl;
    }

    // ---------- Benchmark C ----------
    {
        const unsigned int LC = MC;
        const unsigned int NC = MC;
        vector<double> AC(MC * LC), BC(LC * NC);
        for (unsigned int i = 0; i < MC; ++i)
            for (unsigned int j = 0; j < LC; ++j)
                AC[i * LC + j] = (i + j) % 219 + 1;
        for (unsigned int i = 0; i < LC; ++i)
            for (unsigned int j = 0; j < NC; ++j)
                BC[i * NC + j] = (i + j) % 219 + 1;
        vector<double> CC;
        auto tC1 = steady_clock::now();
        double guardC = 0.0;
        for (unsigned int loop = 0; loop < NLOOPSC; ++loop)
        {
            CC = benchmark_C(AC, BC, MC);
            guardC += CC[0];
        }
        auto tC2 = steady_clock::now();
        auto durC = duration_cast<microseconds>(tC2 - tC1);
        double tC = static_cast<double>(durC.count()) / 1e6 / NLOOPSC;
        double flopsC = 2.0 * MC * LC * NC;
        double bytesC = (MC * LC + LC * NC + MC * NC) * sizeof(double);
        cout << "\n===== Benchmark C =====\n";
        cout << "bytes: " << bytesC << endl;
        cout << "Timing in sec. : " << tC << endl;
        cout << "GFLOPS : " << flopsC / tC / 1024 / 1024 / 1024 << endl;
        cout << "GiByte/s : " << bytesC / tC / 1024 / 1024 / 1024 << endl;
    }

    // ---------- Benchmark D----------
    {
        const unsigned int ND = 2000000;
        const unsigned int p = 14; // 14 coefficients, i.e. polynomial degree p-1 = 13
        const unsigned int NLOOPSD = 100;
        vector<double> coeff(p, 0.0);
        vector<double> xD(ND);
        for (unsigned int k = 0; k < p; ++k)
            coeff[k] = k % 219 + 1;
        for (unsigned int i = 0; i < ND; ++i)
            xD[i] = i % 219 + 1;
        vector<double> yD;
        auto tD1 = steady_clock::now();
        double guardD = 0.0;
        for (unsigned int loop = 0; loop < NLOOPSD; ++loop)
        {
            yD = benchmark_D(coeff, xD);
            guardD += yD[0];
        }
        auto tD2 = steady_clock::now();
        auto durD = duration_cast<microseconds>(tD2 - tD1);
        double tD = static_cast<double>(durD.count()) / 1e6 / NLOOPSD;
        // NOTE(review): counts 2*p operations per point; a Horner scheme with p
        // coefficients performs p-1 multiply-add steps - confirm the intended count.
        double flopsD = ND * 2 * p;
        double bytesD = (p + 2 * ND) * sizeof(double);
        cout << "\n===== Benchmark D =====\n";
        cout << "bytes: " << bytesD << endl;
        cout << "Timing in sec. : " << tD << endl;
        cout << "GFLOPS : " << flopsD / tD / 1024 / 1024 / 1024 << endl;
        cout << "GiByte/s : " << bytesD / tD / 1024 / 1024 / 1024 << endl;
    }

    //-------------TASK 5
    // ---------- 5A: Euclidean norm ----------
    {
        vector<double> xA(NA);
        for (unsigned int i = 0; i < NA; ++i)
        {
            double xi = (i % 219) + 1;
            xA[i] = xi;
        }
        auto tA1 = steady_clock::now();
        double sA = 0.0, sumA = 0.0;
        for (unsigned int loop = 0; loop < NLOOPSA; ++loop)
        {
            sA = norm2(xA);
            sumA += sA;
        }
        auto tA2 = steady_clock::now();
        auto durA = duration_cast<microseconds>(tA2 - tA1);
        double tA = static_cast<double>(durA.count()) / 1e6 / NLOOPSA; //duration per loop seconds
        cout << "\n===== Benchmark 5A =====\n";
        cout << "NORM = " << sA << endl;
        cout << "Timing in sec. : " << tA << endl;
        cout << "GFLOPS : " << 2.0 * NA / tA / 1024 / 1024 / 1024 << endl;
        cout << "GiByte/s : "
             << NA * sizeof(xA[0]) / tA / 1024 / 1024 / 1024 << endl;
    }

    // ---------- 5B: inner product with Kahan summation ----------
    {
        vector<double> xA(NA), yA(NA);
        for (unsigned int i = 0; i < NA; ++i)
        {
            double xi = (i % 219) + 1;
            xA[i] = xi;
            yA[i] = 1.0 / xi;
        }
        auto tA1 = steady_clock::now();
        double sA = 0.0, sumA = 0.0;
        for (unsigned int loop = 0; loop < NLOOPSA; ++loop)
        {
            sA = scalar_kahan(xA, yA);
            sumA += sA;
        }
        auto tA2 = steady_clock::now();
        auto durA = duration_cast<microseconds>(tA2 - tA1);
        double tA = static_cast<double>(durA.count()) / 1e6 / NLOOPSA; //duration per loop seconds
        cout << "\n===== Benchmark 5B =====\n";
        cout << "<xA,yA> = " << sA << endl;
        cout << "Timing in sec. : " << tA << endl;
        // five floating-point operations per element: one product and four additions/subtractions
        cout << "GFLOPS : " << 5.0 * NA / tA / 1024 / 1024 / 1024 << endl;
        cout << "GiByte/s : "
             << 2.0 * NA * sizeof(xA[0]) / tA / 1024 / 1024 / 1024 << endl;
    }

    // ---------- 5C: matrix-matrix product, A traversed column-wise ----------
    {
        const unsigned int LC = MC;
        const unsigned int NC = MC;
        vector<double> AC(MC * LC), BC(LC * NC);
        for (unsigned int i = 0; i < MC; ++i)
            for (unsigned int j = 0; j < LC; ++j)
                AC[i * LC + j] = (i + j) % 219 + 1;
        for (unsigned int i = 0; i < LC; ++i)
            for (unsigned int j = 0; j < NC; ++j)
                BC[i * NC + j] = (i + j) % 219 + 1;
        vector<double> CC;
        auto tC1 = steady_clock::now();
        double guardC = 0.0;
        for (unsigned int loop = 0; loop < NLOOPSC; ++loop)
        {
            CC = matrixMultColumnWise(AC, BC, MC);
            guardC += CC[0];
        }
        auto tC2 = steady_clock::now();
        auto durC = duration_cast<microseconds>(tC2 - tC1);
        double tC = static_cast<double>(durC.count()) / 1e6 / NLOOPSC;
        double flopsC = 2.0 * MC * LC * NC;
        double bytesC = (MC * LC + LC * NC + MC * NC) * sizeof(double);
        cout << "\n===== Benchmark 5C =====\n";
        cout << "bytes: " << bytesC << endl;
        cout << "Timing in sec. : " << tC << endl;
        cout << "GFLOPS : " << flopsC / tC / 1024 / 1024 / 1024 << endl;
        cout << "GiByte/s : " << bytesC / tC / 1024 / 1024 / 1024 << endl;
    }
    return 0;
} // the vectors of each benchmark are released at the end of their enclosing block

View file

65
sheet3/345/mylib.cpp Normal file
View file

@ -0,0 +1,65 @@
#include "mylib.h"
#include <cassert> // assert()
#include <cmath>
#include <vector>
#ifdef __INTEL_CLANG_COMPILER
#pragma message(" ########## Use of MKL ###############")
#include <mkl.h>
#else
#pragma message(" ########## Use of CBLAS ###############")
//extern "C"
//{
#include <cblas.h> // cBLAS Library
#include <lapacke.h> // Lapack
//}
#endif
using namespace std;
/**
 * Inner product of two vectors of equal length.
 *
 * The length check is an assert and therefore disappears when the file is
 * compiled with -DNDEBUG.
 *
 * @param[in] x vector
 * @param[in] y vector
 * @return sum of x[i]*y[i], accumulated from the first to the last element
 */
double scalar(vector<double> const &x, vector<double> const &y)
{
    assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
    double acc = 0.0;
    auto rhs = y.begin();
    for (auto lhs = x.begin(); lhs != x.end(); ++lhs, ++rhs)
    {
        acc += (*lhs) * (*rhs);
    }
    return acc;
}
/**
 * Inner product of two double-precision vectors via the BLAS routine cblas_ddot.
 *
 * The element counts are converted to int because that is the count type the
 * call receives; the equality check on them is an assert and disappears with
 * -DNDEBUG.
 *
 * @param[in] x vector
 * @param[in] y vector
 * @return value returned by cblas_ddot for both vectors with unit stride
 */
double scalar_cblas(vector<double> const &x, vector<double> const &y)
{
    auto const count_x = static_cast<int>(x.size());
    auto const count_y = static_cast<int>(y.size());
    assert(count_x == count_y); // switch off via compile flag: -DNDEBUG
    int const unit_stride = 1;
    return cblas_ddot(count_x, x.data(), unit_stride, y.data(), unit_stride);
}
/**
 * Inner product of two single-precision vectors via the BLAS routine cblas_sdot.
 *
 * The element counts are converted to int because that is the count type the
 * call receives; the equality check on them is an assert and disappears with
 * -DNDEBUG.
 *
 * @param[in] x vector
 * @param[in] y vector
 * @return value returned by cblas_sdot for both vectors with unit stride
 */
float scalar_cblas(vector<float> const &x, vector<float> const &y)
{
    auto const count_x = static_cast<int>(x.size());
    auto const count_y = static_cast<int>(y.size());
    assert(count_x == count_y); // switch off via compile flag: -DNDEBUG
    int const unit_stride = 1;
    return cblas_sdot(count_x, x.data(), unit_stride, y.data(), unit_stride);
}
/**
 * Euclidean (L2) norm of a vector.
 *
 * @param[in] x vector
 * @return square root of the sum of the squared entries, summed from the
 *         first to the last element
 */
double norm(vector<double> const &x)
{
    double squares = 0.0;
    for (double const entry : x)
    {
        squares += entry * entry;
    }
    return std::sqrt(squares);
}

View file

30
sheet3/345/mylib.h Normal file
View file

@ -0,0 +1,30 @@
#ifndef FILE_MYLIB
#define FILE_MYLIB
#include <vector>
/** Inner product
 @param[in] x vector
 @param[in] y vector
 @return resulting Euclidean inner product <x,y>
*/
double scalar(std::vector<double> const &x, std::vector<double> const &y);
/** Inner product using BLAS routines (double precision)
 @param[in] x vector
 @param[in] y vector
 @return resulting Euclidean inner product <x,y>
*/
double scalar_cblas(std::vector<double> const &x, std::vector<double> const &y);
/** Inner product using BLAS routines (single precision overload)
 @param[in] x vector
 @param[in] y vector
 @return resulting Euclidean inner product <x,y>
*/
float scalar_cblas(std::vector<float> const &x, std::vector<float> const &y);
/** L_2 Norm of a vector
 @param[in] x vector
 @return resulting Euclidean norm ||x||_2, i.e. the square root of <x,x>
*/
double norm(std::vector<double> const &x);
#endif

View file

1826
sheet3/345/small_Doxyfile Normal file

File diff suppressed because it is too large Load diff