This commit is contained in:
dino.celebic 2025-11-11 13:36:28 +01:00
commit 7a02dff345
29 changed files with 3943 additions and 1 deletions

30
ex3/code/Makefile Normal file
View file

@ -0,0 +1,30 @@
PROGRAM = main
SOURCES = $(wildcard *.cpp)
OBJECTS = ${SOURCES:.cpp=.o}
CXX = g++
LINKER = g++
WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
-Wredundant-decls -fmax-errors=1
CXXFLAGS = -g -flto -O3 -ffast-math -march=native ${WARNINGS}
LINKFLAGS = -g -flto -O3 -lopenblas -llapacke
all: ${PROGRAM}
# %.o: %.cpp
# ${CXX} ${CXXFLAGS} -c $< -o $@
${PROGRAM}: ${OBJECTS}
$(LINKER) ${OBJECTS} ${LINKFLAGS} -o ${PROGRAM}
clean:
rm -f ${OBJECTS} ${PROGRAM}
run: ${PROGRAM}
# run: clean ${PROGRAM}
./${PROGRAM}

267
ex3/code/main.cpp Normal file
View file

@ -0,0 +1,267 @@
#include "task_3.h"
#include "task_4+6.h"
#include "task_5.h"
#include "task_7.h"
#include "timing.h"
#include <iomanip>
#include <iostream>
#include <cblas.h> // cBLAS Library
#include <lapacke.h>
void task_1() {
printf("\n\n-------------- Task 1 --------------\n\n");
printf("See comment in main.cpp");
// -------------------------------------------------------------
// STREAM version $Revision: 5.10 $
// -------------------------------------------------------------
// This system uses 8 bytes per array element.
// -------------------------------------------------------------
// Array size = 80000000 (elements), Offset = 0 (elements)
// Memory per array = 610.4 MiB (= 0.6 GiB).
// Total memory required = 1831.1 MiB (= 1.8 GiB).
// Each kernel will be executed 20 times.
// The *best* time for each kernel (excluding the first iteration)
// will be used to compute the reported bandwidth.
// -------------------------------------------------------------
// Your clock granularity/precision appears to be 1 microseconds.
// Each test below will take on the order of 116886 microseconds.
// (= 116886 clock ticks)
// Increase the size of the arrays if this shows that
// you are not getting at least 20 clock ticks per test.
// -------------------------------------------------------------
// WARNING -- The above is only a rough guideline.
// For best results, please be sure you know the
// precision of your system timer.
// -------------------------------------------------------------
// Function Best Rate MB/s Avg time Min time Max time
// Copy: 29569.4 0.048585 0.043288 0.059164
// Scale: 17644.0 0.082248 0.072546 0.102548
// Add: 21030.1 0.100620 0.091298 0.124700
// Triad: 21230.7 0.100758 0.090435 0.120631
// -------------------------------------------------------------
// Solution Validates: avg error less than 1.000000e-13 on all three arrays
// -------------------------------------------------------------
// ./flops.exe
// FLOPS C Program (Double Precision), V2.0 18 Dec 1992
// Module Error RunTime MFLOPS
// (usec)
// 1 4.0146e-13 0.0024 5827.9076
// 2 -1.4166e-13 0.0007 10037.8942
// 3 4.7184e-14 0.0039 4371.9185
// 4 -1.2557e-13 0.0034 4355.5711
// 5 -1.3800e-13 0.0066 4415.6439
// 6 3.2380e-13 0.0065 4441.6299
// 7 -8.4583e-11 0.0053 2277.1707
// 8 3.4867e-13 0.0069 4367.6094
// Iterations = 512000000
// NullTime (usec) = 0.0000
// MFLOPS(1) = 7050.6178
// MFLOPS(2) = 3461.6233
// MFLOPS(3) = 4175.0442
// MFLOPS(4) = 4389.7311
}
void task_2() {
printf("\n\n-------------- Task 2 --------------\n\n");
printf("See comment in main.cpp");
// Memory needed (double 64-bit, 8 bytes):
// (A) (2N + 1) * 8 bytes
// (B) (M*N + M + N) * 8 bytes
// (C) (M*L + L*N + M*N) * 8 bytes
// (D) (N + N + p) * 8 bytes
// Floating point operations:
// (A) 2N
// (B) M * 2N
// (C) M * 2L * N
// (D) 2 * N * p (Horner Schema)
// Read/Write operations:
// (A) Read: 2N Write: 1
// (B) Read: M*2N Write: M*N
// (C) Read: M*2L*N Write: M*L*N
// (D) Read: 2*N*p Write: N*P
}
void task_3() {
printf("\n\n-------------- Task 3 --------------\n\n");
printf("Functions implemented in task_3.cpp");
}
void task_4(bool cblas = false) {
if (cblas == false) {printf("\n\n-------------- Task 4 --------------\n\n");}
size_t M, N, L, p, NLOOPS;
{ // Scalar product
printf("----- Benchmark (A) -----\n");
// Initialization
N = 50'000'000;
NLOOPS = 50;
auto [x,y] = init_A(N);
// Benchmark
tic();
benchmark_A(x, y, NLOOPS, cblas);
double sec = toc() / NLOOPS;
// Timings and Performance
size_t memory = 2 * N;
size_t flops = 2 * N;
print_performance(sec, memory, flops, sizeof(x[0]));
printf("-------------------------\n");
}
{ // Matrix-Vector product
printf("----- Benchmark (B) -----\n");
// Initialization
M = 8'000;
N = 12'000;
NLOOPS = 30;
auto [A,x] = init_B(M,N);
// Benchmark
tic();
benchmark_B(A, x, NLOOPS, cblas);
double sec = toc() / NLOOPS;
// Timings and Performance
size_t memory = M*N + M + N;
size_t flops = 2 * M * N;
print_performance(sec, memory, flops, sizeof(A[0]));
printf("-------------------------\n");
}
{ // Matrix-Matrix product
printf("----- Benchmark (C) -----\n");
// Initialization
M = 1'000;
N = 2'000;
L = 500;
NLOOPS = 20;
auto [A,B] = init_C(M,N,L);
// Benchmark
tic();
benchmark_C(A, B, L, NLOOPS, cblas);
double sec = toc() / NLOOPS;
// Timings and Performance
size_t memory = M*L + L*N + M*N;
size_t flops = M * 2*L * N;
print_performance(sec, memory, flops, sizeof(A[0]));
printf("-------------------------\n");
}
if (cblas == false)
{ // Polynomial evaluation
printf("----- Benchmark (D) -----\n");
// Initialization
N = 1'000'000;
p = 200;
NLOOPS = 20;
auto [x,a] = init_D(N,p);
// Benchmark
tic();
benchmark_D(x, a, NLOOPS);
double sec = toc() / NLOOPS;
// Timings and Performance
size_t memory = 2.0 * N;
size_t flops = 2.0 * N * p;
print_performance(sec, memory, flops, sizeof(x[0]));
printf("-------------------------\n");
}
}
void task_5() {
printf("\n\n-------------- Task 5 --------------\n\n");
printf("----- Benchmark norm -----\n");
// Initialization
size_t N =50'000'000;
size_t NLOOPS = 50;
vector<double> x = init_norm(N);
// Benchmark
tic();
benchmark_norm(x, NLOOPS);
double sec = toc() / NLOOPS;
// Timings and Performance
size_t memory = N;
size_t flops = 2 * N;
print_performance(sec, memory, flops, sizeof(x[0]));
printf("-------------------------\n");
printf("What do you observe? Why?\n");
printf("-> Faster per loop than scalar product, only loads elements of 1 vector, instead of 2.");
}
void task_6() {
printf("\n\n-------------- Task 6 --------------\n\n");
printf("Benchmarks using cBLAS\n");
task_4(true);
}
void task_7() {
printf("\n\n-------------- Task 7 --------------\n\n");
{ // Check Ax=b
size_t N=5, Nrhs=2;
auto [A,b] = init_M(N,Nrhs);
vector<double> A_og = A;
printf("A =");
print_matrix(A,N,N);
printf("b =");
print_matrix(b,N,Nrhs);
int lda=N, ldb=Nrhs;
vector<int> ipiv(N);
LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N, N, A.data(), lda, ipiv.data());
LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', N, Nrhs, A.data(), lda, ipiv.data(), b.data(), ldb);
printf("L + U =");
print_matrix(A,N,N);
printf("x =");
print_matrix(b,N,Nrhs);
int ldc=Nrhs;
vector<double> C(N*Nrhs);
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, N, Nrhs, N, 1.0, A_og.data(), lda, b.data(), ldb, 0.0, C.data(), ldc);
printf("Check solution:\nA * x = ");
print_matrix(C,N,Nrhs);
}
// #################################
// Benchmark
cout << fixed << setprecision(4); // 4 digits after decimal
size_t NLOOPS = 1000;
cout << "N = " << " | 1 | 2 | 4 | 8 | 16 | 32 " << endl;
cout << "---------|--------|--------|--------|--------|--------|-------" << endl;
for (int exp = 1; exp < 10; ++exp) {
cout << "Nrhs = " << static_cast<size_t>(pow(2,exp));
for (size_t N : {1, 2, 4, 8, 16, 32}) {
tic();
for (size_t i = 0; i < NLOOPS; ++i) {
benchmark_lapacke(N, static_cast<size_t>(pow(2,exp)));
}
double sec = toc();
cout << " | " << sec;
}
cout << endl;
}
printf("\nFor fixed n, the solution time per rhs does not slow down consistently and scales very well.\nIts faster than expected.");
}
int main() {
task_1();
task_2();
task_3();
task_4();
task_5();
task_6();
task_7();
printf("\n\n");
return 0;
}

61
ex3/code/task_3.cpp Normal file
View file

@ -0,0 +1,61 @@
#include "task_3.h"
#include <vector>
#include <cassert>
#include <iostream>
#include <cmath>
using namespace std;
double scalar(vector<double> const &x, vector<double> const &y) {
assert(x.size() == y.size());
size_t const N = x.size();
double sum = 0.0;
for (size_t i = 0; i < N; ++i) {
sum += x[i] * y[i];
}
return sum;
}
vector<double> matrix_vec(vector<double> const &A, vector<double> const &x) {
size_t const N = x.size();
size_t const M = A.size() / N;
vector<double> b(M);
for (size_t i = 0; i < M; ++i) {
for (size_t j = 0; j < N; ++j) {
b[i] += A[i*N + j] * x[j];
}
}
return b;
}
vector<double> matrix_matrix(vector<double> const &A, vector<double> const &B, size_t const &M) {
size_t const L = A.size() / M;
size_t const N = B.size() / L;
vector<double> C(M*N,0);
for (size_t i = 0; i < M; ++i) {
for (size_t k = 0; k < L; ++k) {
for (size_t j = 0; j < N; ++j) {
C[i*N + j] += A[i*L + k] * B[k*N + j];
}
}
}
return C;
}
vector<double> poly(vector<double> const &x, vector<double> const &a) {
size_t N = x.size();
size_t p = a.size();
vector<double> y(N);
for (size_t i = 0; i < N; ++i) {
y[i] = a[p];
for (size_t k = 1; k < p; ++k) {
y[i] = y[i]*x[i] + a[p-k];
}
}
return y;
}

8
ex3/code/task_3.h Normal file
View file

@ -0,0 +1,8 @@
#pragma once
#include <vector>
using namespace std;
double scalar(vector<double> const &x, vector<double> const &y);
vector<double> matrix_vec(vector<double> const &A, vector<double> const &x);
vector<double> matrix_matrix(vector<double> const &A, vector<double> const &B, size_t const &M);
vector<double> poly(vector<double> const &x, vector<double> const &a);

148
ex3/code/task_4+6.cpp Normal file
View file

@ -0,0 +1,148 @@
#include "task_3.h"
#include "task_4+6.h"
#include "timing.h"
#include <vector>
#include <iostream>
#include <cblas.h> // cBLAS Library
using namespace std;
void print_performance(double sec, size_t memory, size_t flops, unsigned int size) {
printf("Memory allocated : %.3f GByte\n", 1.0 * memory / 1024 / 1024 / 1024 * size);
printf("Duration per loop : %.3f sec\n", sec);
printf("GFLOPS : %.3f\n", 1.0 * flops / sec / 1024 / 1024 / 1024);
printf("GiByte/s : %.3f\n", 1.0 * memory / sec / 1024 / 1024 / 1024 * size);
}
tuple<vector<double>, vector<double>> init_A(size_t N) {
vector<double> x(N), y(N);
for (size_t i = 0; i < N; ++i) {
x[i] = i%219 + 1.0;
y[i] = 1.0 / x[i];
}
return make_tuple(x, y);
}
void benchmark_A(vector<double> const &x, vector<double> const &y, size_t NLOOPS, bool cblas) {
size_t N = x.size();
double s(0.0), sum(0.0);
if (cblas == false) {
for (size_t i = 0; i < NLOOPS; ++i) {
s = scalar(x, y);
sum += s;
}
} else if (cblas == true) {
for (size_t i = 0; i < NLOOPS; ++i) {
s = cblas_ddot(N, x.data(), 1, y.data(), 1);
sum += s;
}
}
// Check correctness
if (static_cast<size_t>(sum) != N*NLOOPS) {printf(" !! W R O N G result !!\n");}
}
tuple<vector<double>, vector<double>> init_B(size_t M, size_t N) {
vector<double> A(M*N), x(N);
for (size_t i = 0; i < M; ++i) {
for (size_t j = 0; j < N; ++j) {
A[i*N + j] = (i+j)%219 + 1.0;
}
}
for (size_t j = 0; j < N; ++j) {
x[j] = 1.0/A[17*N + j];
}
return make_tuple(A, x);
}
void benchmark_B(vector<double> const &A, vector<double> const &x, size_t NLOOPS, bool cblas) {
size_t N = x.size();
size_t M = A.size() / N;
vector<double> b(M);
double sum(0.0);
if (cblas == false) {
for (size_t i = 0; i < NLOOPS; ++i) {
b = matrix_vec(A,x);
sum += b[17];
}
} else if (cblas == true) {
for (size_t i = 0; i < NLOOPS; ++i) {
cblas_dgemv(CblasRowMajor, CblasNoTrans, M, N, 1.0, A.data(), N, x.data(), 1, 0, b.data(), 1);
sum += b[17];
}
}
// Check correctness
if (static_cast<size_t>(sum) != N*NLOOPS) {printf(" !! W R O N G result !!\n");}
}
tuple<vector<double>, vector<double>> init_C(size_t M, size_t N, size_t L) {
vector<double> A(M*L), B(L*N);
for (size_t i = 0; i < M; ++i) {
for (size_t j = 0; j < L; ++j) {
A[i*L + j] = (i+j)%219 + 1.0;
}
}
// B chosen such that C[0,17]=L
// so B[i,17] = 1/A[0,i]
for (size_t i = 0; i < L; ++i) {
for (size_t j = 0; j < N; ++j) {
if (j==17) {
B[i*N + 17] = 1.0/A[i];
} else {
B[i*N + j] = (i+j)%219 + 1.0;
}
}
}
return make_tuple(A, B);
}
void benchmark_C(vector<double> const &A, vector<double> const &B, size_t L, size_t NLOOPS, bool cblas) {
size_t M = A.size() / L;
size_t N = B.size() / L;
vector<double> C(M*N);
double sum(0.0);
if (cblas == false) {
for (size_t i = 0; i < NLOOPS; ++i) {
C = matrix_matrix(A,B,M);
sum += C[17];
}
} else if (cblas == true) {
for (size_t i = 0; i < NLOOPS; ++i) {
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, L, 1.0, A.data(), L, B.data(), N, 0.0, C.data(), N);
sum += C[17];
}
}
// Check correctness
if (static_cast<size_t>(sum) != L*NLOOPS) {printf(" !! W R O N G result !!\n");}
}
tuple<vector<double>, vector<double>> init_D(size_t N, size_t p) {
// x_i = i/N for i=0,...,N-1
// a_j = 1 for j=0,...,p-1
vector<double> x(N), a(p);
for (size_t i = 0; i < N; ++i) {
x[i] = static_cast<double>(i) / N;
}
for (size_t j = 0; j < p; ++j) {
a[j] = 1.0;
}
return make_tuple(x, a);
}
void benchmark_D(vector<double> const &x, vector<double> const &a, size_t NLOOPS) {
size_t N = x.size();
vector<double> y(N);
double sum(0.0);
for (size_t i = 0; i < NLOOPS; ++i) {
y = poly(x,a);
sum += y[0];
}
// Check correctness
if (static_cast<size_t>(sum) != NLOOPS) {printf(" !! W R O N G result sum = %f !!\n", sum);}
}

14
ex3/code/task_4+6.h Normal file
View file

@ -0,0 +1,14 @@
#pragma once
#include <vector>
using namespace std;
void print_performance(double sec, size_t memory, size_t flops, unsigned int size);
tuple<vector<double>, vector<double>> init_A(size_t N);
tuple<vector<double>, vector<double>> init_B(size_t M, size_t N);
tuple<vector<double>, vector<double>> init_C(size_t M, size_t N, size_t L);
tuple<vector<double>, vector<double>> init_D(size_t N, size_t p);
void benchmark_A(vector<double> const &x, vector<double> const &y, size_t NLOOPS, bool cblas);
void benchmark_B(vector<double> const &A, vector<double> const &x, size_t NLOOPS, bool cblas);
void benchmark_C(vector<double> const &A, vector<double> const &B, size_t L, size_t NLOOPS, bool cblas);
void benchmark_D(vector<double> const &x, vector<double> const &a, size_t NLOOPS);

33
ex3/code/task_5.cpp Normal file
View file

@ -0,0 +1,33 @@
#include "task_4+6.h"
#include "task_5.h"
#include "timing.h"
#include <vector>
#include <iostream>
#include <cmath>
using namespace std;
double norm(vector<double> const &x) {
size_t N = x.size();
double sum = 0.0;
for (size_t i = 0; i < N; ++i) {
sum += x[i] * x[i];
}
return sqrt(sum);
}
vector<double> init_norm(size_t N) {
vector<double> x(N);
for (size_t i = 0; i < N; ++i) {
x[i] = i%219 + 1.0;
}
return x;
}
void benchmark_norm(vector<double> const &x, size_t NLOOPS) {
double s(0.0), sum(0.0);
for (size_t i = 0; i < NLOOPS; ++i) {
s = norm(x);
sum += s;
}
printf("||x|| = %f\n", sum/NLOOPS);
}

8
ex3/code/task_5.h Normal file
View file

@ -0,0 +1,8 @@
#pragma once
#include <vector>
using namespace std;
double norm(vector<double> const &x);
vector<double> init_norm(size_t N);
void benchmark_norm(vector<double> const &x, size_t NLOOPS);

43
ex3/code/task_7.cpp Normal file
View file

@ -0,0 +1,43 @@
#include "task_7.h"
#include <iostream>
#include <vector>
#include <cmath>
#include <lapacke.h>
using namespace std;
tuple<vector<double>, vector<double>> init_M(size_t N, size_t Nrhs) {
vector<double> A(N*N), b(N*Nrhs);
for (size_t i = 0; i < N; ++i) {
for (size_t j = 0; j < N; ++j) {
if (i != j) {
A[i*N + j] = 1.0 / pow(abs(1.0*i-1.0*j),2);
} else {
A[i*N + j] = 4;
}
}
for (size_t j=0; j < Nrhs; ++j) {
b[i*Nrhs + j] = 1.0*j;
}
}
return make_tuple(A, b);
}
void print_matrix(vector<double> &A, size_t M, size_t N) {
printf("\n");
for (size_t i = 0; i < M; ++i){
for (size_t j = 0; j < N; ++j) {
printf("%f ", A[i*N + j]);
}
printf("\n");
}
printf("\n\n");
}
void benchmark_lapacke(int N, int Nrhs) {
auto [A,b] = init_M(N,Nrhs);
int lda=N, ldb=Nrhs;
vector<int> ipiv(N);
LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N, N, A.data(), lda, ipiv.data());
LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', N, Nrhs, A.data(), lda, ipiv.data(), b.data(), ldb);
}

7
ex3/code/task_7.h Normal file
View file

@ -0,0 +1,7 @@
#pragma once
#include <vector>
using namespace std;
tuple<vector<double>, vector<double>> init_M(size_t N, size_t Nrhs);
void print_matrix(vector<double> &A, size_t M, size_t N);
void benchmark_lapacke(int N, int Nrhs);

51
ex3/code/timing.h Normal file
View file

@ -0,0 +1,51 @@
//
// Gundolf Haase, Oct 18 2024
//
#pragma once
#include <chrono> // timing
#include <stack>
//using Clock = std::chrono::system_clock; //!< The wall clock timer chosen
using Clock = std::chrono::high_resolution_clock;
using TPoint= std::chrono::time_point<Clock>;
// [Galowicz, C++17 STL Cookbook, p. 29]
inline
std::stack<TPoint> MyStopWatch; //!< starting time of stopwatch
/** Starts stopwatch timer.
* Use as @code tic(); myfunction(...) ; double tsec = toc(); @endcode
*
* The timining can be nested and the recent time point is stored on top of the stack.
*
* @return recent time point
* @see toc
*/
inline auto tic()
{
MyStopWatch.push(Clock::now());
return MyStopWatch.top();
}
/** Returns the elapsed time from stopwatch.
*
* The time point from top of the stack is used
* if time point @p t_b is not passed as input parameter.
* Use as @code tic(); myfunction(...) ; double tsec = toc(); @endcode
* or as @code auto t_b = tic(); myfunction(...) ; double tsec = toc(t_b); @endcode
* The last option is to be used in the case of
* non-nested but overlapping time measurements.
*
* @param[in] t_b start time of some stop watch
* @return elapsed time in seconds.
*
*/
inline double toc(TPoint const &t_b = MyStopWatch.top())
{
// https://en.cppreference.com/w/cpp/chrono/treat_as_floating_point
using Unit = std::chrono::seconds;
using FpSeconds = std::chrono::duration<double, Unit::period>;
auto t_e = Clock::now();
MyStopWatch.pop();
return FpSeconds(t_e-t_b).count();
}