From 44e8b9d13b625c098ebd4f9be08c90e3d56af6c3 Mon Sep 17 00:00:00 2001 From: "lisa.pizzo" Date: Wed, 5 Nov 2025 15:57:45 +0100 Subject: [PATCH] Task 2/3/4/5 of Exercise sheet 3 --- Sheet3/bench_funcs.cpp | 113 ++++++++++++++++++++ Sheet3/bench_funcs.h | 27 +++++ Sheet3/main.cpp | 230 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 370 insertions(+) create mode 100644 Sheet3/bench_funcs.cpp create mode 100644 Sheet3/bench_funcs.h create mode 100644 Sheet3/main.cpp diff --git a/Sheet3/bench_funcs.cpp b/Sheet3/bench_funcs.cpp new file mode 100644 index 0000000..974d204 --- /dev/null +++ b/Sheet3/bench_funcs.cpp @@ -0,0 +1,113 @@ +#include "bench_funcs.h" +#include +#include +#include + +double dot_basic(const std::vector& x, const std::vector& y) { + assert(x.size() == y.size()); + double s = 0.0; + std::size_t N = x.size(); + for (std::size_t i = 0; i < N; ++i) { + s += x[i] * y[i]; + } + return s; +} + +double dot_kahan(const std::vector& x, const std::vector& y) { + assert(x.size() == y.size()); + double sum = 0.0; + double c = 0.0; + std::size_t N = x.size(); + for (std::size_t i = 0; i < N; ++i) { + double prod = x[i] * y[i]; + double yk = prod - c; + double t = sum + yk; + c = (t - sum) - yk; + sum = t; + } + return sum; +} + +double norm_basic(const std::vector& x) { + double s = 0.0; + for (std::size_t i = 0; i < x.size(); ++i) { + double v = x[i]; + s += v * v; + } + return std::sqrt(s); +} + +void matvec_rowmajor(const std::vector& A, std::size_t M, std::size_t N, + const std::vector& x, std::vector& b) { + assert(A.size() == M * N); + assert(x.size() == N); + b.assign(M, 0.0); + for (std::size_t i = 0; i < M; ++i) { //over rows + double sum = 0.0; + std::size_t base = i * N; //the start index of row 1 + for (std::size_t j = 0; j < N; ++j) { //dot product of that row with x + sum += A[base + j] * x[j]; + } + b[i] = sum; + } +} + +void matmul_rowmajor(const std::vector& A, std::size_t M, std::size_t L, + const std::vector& B, std::size_t N, + std::vector& C) { + assert(A.size() == M * L); + assert(B.size() == L * N); + C.assign(M * N, 0.0); + for (std::size_t i = 0; i < M; ++i) { + for (std::size_t k = 0; k < L; ++k) { + double a = A[i * L + k]; //a=A[i,k] + std::size_t baseB = k * N; + std::size_t baseC = i * N; + for (std::size_t j = 0; j < N; ++j) { + C[baseC + j] += a * B[baseB + j]; + } + } + } +} + +void polyp_horner(const std::vector& a, const std::vector& x, + std::vector& y) { + std::size_t p = (a.size() == 0) ? 0 : a.size() - 1; //if a.size=0, then p=0, otherwise p=a.soze-1 + std::size_t N = x.size(); + y.assign(N, 0.0); + for (std::size_t i = 0; i < N; ++i) { //loops over all evaluation points x[i] + double xi = x[i]; + double acc = a[p]; + for (std::size_t k = p; k-- > 0;) { // from p-1 down to 0 + acc = acc * xi + a[k]; + } + y[i] = acc; + } +} + +void jacobi_csr(const CSR& K, const std::vector& f, std::vector& u, + std::size_t max_iter, double omega, double tol) { + std::size_t n = K.n; + assert(f.size() == n); + u.assign(n, 0.0); + std::vector u_new(n, 0.0); + for (std::size_t iter = 0; iter < max_iter; ++iter) { //Jacobi iteration + double max_err = 0.0; + for (std::size_t i = 0; i < n; ++i) { + double diag = 0.0; + double sum = 0.0; + for (std::size_t idx = K.row_ptr[i]; idx < K.row_ptr[i+1]; ++idx) { //iterates over the non zero entries of row i + std::size_t j = K.col[idx]; + double v = K.val[idx]; + if (j == i) diag = v; + else sum += v * u[j]; //accomulates sum = Σ_{j≠i} K_ij * u[j] + } + double uk1 = u[i] + omega * (f[i] - sum - diag * u[i]) / diag; //stoers the diagonal separately + u_new[i] = uk1; + double err = std::fabs(uk1 - u[i]); + if (err > max_err) max_err = err; + } + u.swap(u_new); + if (max_err < tol) break; + } +} diff --git a/Sheet3/bench_funcs.h b/Sheet3/bench_funcs.h new file mode 100644 index 0000000..72f4df5 --- /dev/null +++ b/Sheet3/bench_funcs.h @@ -0,0 +1,27 @@ +#pragma once +#include +#include + +// now we declare functions that we define in benh_funcs.cpp +double dot_basic(const std::vector& x, const std::vector& y); +double dot_kahan(const std::vector& x, const std::vector& y); +double norm_basic(const std::vector& x); + +void matvec_rowmajor(const std::vector& A, std::size_t M, std::size_t N, + const std::vector& x, std::vector& b); + +void matmul_rowmajor(const std::vector& A, std::size_t M, std::size_t L, + const std::vector& B, std::size_t N, + std::vector& C); + +void polyp_horner(const std::vector& a, const std::vector& x, + std::vector& y); + +struct CSR { //compressed Sparse Row + std::size_t n; // number of rows + std::vector val; // non-zero values + std::vector col; // column indices + std::vector row_ptr; // row pointers (size n+1) +}; +void jacobi_csr(const CSR& K, const std::vector& f, std::vector& u, + std::size_t max_iter, double omega, double tol); diff --git a/Sheet3/main.cpp b/Sheet3/main.cpp new file mode 100644 index 0000000..943e775 --- /dev/null +++ b/Sheet3/main.cpp @@ -0,0 +1,230 @@ +#include +#include +#include +#include +#include +#include "bench_funcs.h" +#include "bench_funcs.cpp" + +using namespace std; +using namespace std::chrono; + +void gen_vector_x_y(std::size_t N, std::vector& x, std::vector& y) { + x.resize(N); + y.resize(N); + for (std::size_t i = 0; i < N; ++i) { + x[i] = static_cast((i % 219) + 1); // xi := (i mod 219) + 1 + y[i] = 1.0 / x[i]; // yi := 1/xi + } +} + +void gen_matrix_A(std::size_t M, std::size_t N, std::vector& A) { + A.resize(M * N); + for (std::size_t i = 0; i < M; ++i) { + for (std::size_t j = 0; j < N; ++j) { + A[i * N + j] = static_cast(((i + j) % 219) + 1); + } + } +} + +high_resolution_clock::time_point tic_timer; +void tic() { tic_timer = high_resolution_clock::now(); } +double toc() { + auto t1 = high_resolution_clock::now(); + duration elapsed = t1 - tic_timer; + return elapsed.count(); +} + +//CHANGE FLAG BASED ON WHAT YOU WANT TO DO +int main() { + int flag = 8; // 1=A, 2=B, 3=C, 4=D, 5=Jacobi 6=Norm in 5a, 7= Kahan in 5b and 8=colums access in 5c + size_t N = 5000000; // default vector length + size_t M = 3000, L = 3000; + if (flag == 1) { + // A) Inner product + vector x, y; + gen_vector_x_y(N, x, y); + cout << "Running dot_basic" << endl; + tic(); + volatile double s = dot_basic(x, y); (void)s; + double dt = toc(); + + double flops = 2.0 * N; //x_i*y_i=1 FLOPS, the sum = 1 FLOPS, for N times = 2N + double gflops = (flops / dt) / 1e9; // computes how many flops per second does my computer and then converts to giga FLOPS + double traffic_bytes = 2.0 * N * sizeof(double); //Memory usage in bytes: 2 vectors of length N times the size of double + double gib_s = (traffic_bytes / dt) / (1024.0*1024.0*1024.0); //computes how many bytes are moved by my computer per second and converts to Gibibytes per second + + cout << "A: N=" << N << " time=" << dt << " s GFLOPS=" << gflops + << " GiB/s=" << gib_s << "\n"; + } + + else if (flag == 2) { + //B) Matrix*vector + size_t m = M, n = 5000; + vector A, x, b; + gen_matrix_A(m, n, A); + x.resize(n); + for (size_t j = 0; j < n; ++j) + x[j] = 1.0 / (((17 + j) % 219) + 1); + + cout << "Running matvec\n"; + tic(); + matvec_rowmajor(A, m, n, x, b); + double dt = toc(); + + double flops = 2.0 * m * n; //each y_i does N multiplications and N additions = Mx2N + double gflops = (flops / dt) / 1e9; + double traffic_bytes = (m*n + n + m) * sizeof(double); //Memory usage in bytes: Matrix size mxn, 2 vectors n and m + double gib_s = (traffic_bytes / dt) / (1024.0*1024.0*1024.0); + + cout << "B: M=" << m << " N=" << n << " time=" << dt + << " s GFLOPS=" << gflops << " GiB/s=" << gib_s << "\n"; + } + + else if (flag == 3) { + // C) Matrix*matrix + size_t m = M, l = L, n = 500; + vector A, B, C; + gen_matrix_A(m, l, A); + gen_matrix_A(l, n, B); + + cout << "Running matmul\n"; + tic(); + matmul_rowmajor(A, m, l, B, n, C); + double dt = toc(); + + double flops = 2.0 * m * l * n; //each element of C does N moltiplications and N additions: dim(C)=MxL hence MxLx2N + double gflops = (flops / dt) / 1e9; + double traffic_bytes = (m*l + l*n + m*n) * sizeof(double); //Memory usage in bytes: 3 matices MxN, NxL, MxL + double gib_s = (traffic_bytes / dt) / (1024.0*1024.0*1024.0); + + cout << "C: M=" << m << " L=" << l << " N=" << n << " time=" << dt + << " s GFLOPS=" << gflops << " GiB/s=" << gib_s << "\n"; + } + + else if (flag == 4) { + // D) Polynomial + size_t p = 100; // degree + vector a(p+1), x(N), y; + for (size_t k = 0; k <= p; ++k) + a[k] = 1.0 / (k+1); + for (size_t i = 0; i < N; ++i) + x[i] = (i % 219) * 0.001 + 1.0; + + cout << "Running POLYNOMIAL test (D)\n"; + tic(); + polyp_horner(a, x, y); + double dt = toc(); + + double flops = 2.0 * p * N; //each evaluation p moltiplications + p additions = 2pxN + double gflops = (flops / dt) / 1e9; + double traffic_bytes = (p+1 + N + N) * sizeof(double); //Memory usage in bytes: p+1 coefficients and N evaluation points + double gib_s = (traffic_bytes / dt) / (1024.0*1024.0*1024.0); + + cout << "D: p=" << p << " N=" << N << " time=" << dt + << " s GFLOPS=" << gflops << " GiB/s=" << gib_s << "\n"; + } + + else if (flag == 5) { + // E) Jacobi + size_t n = 10000; + CSR K; K.n = n; + vector f(n, 1.0), u; + K.row_ptr.resize(n+1); + K.val.reserve(3*n); + K.col.reserve(3*n); + + for (size_t i = 0; i < n; ++i) { + K.row_ptr[i] = K.val.size(); + if (i > 0) { K.val.push_back(-1.0); K.col.push_back(i-1); } + K.val.push_back(2.0); K.col.push_back(i); + if (i+1 < n) { K.val.push_back(-1.0); K.col.push_back(i+1); } + } + K.row_ptr[n] = K.val.size(); + + size_t maxit = 5000; + double omega = 1.0, tol = 1e-8; + + cout << "Running JACOBI solver test...\n"; + tic(); + jacobi_csr(K, f, u, maxit, omega, tol); + double dt = toc(); + cout << "Jacobi: n=" << n << " time=" << dt << " s\n"; + } + + else if (flag == 6) {//5(a) + size_t N = 5000000; // large enough for ~10 s runtime + vector x(N); + for (size_t i = 0; i < N; ++i) + x[i] = static_cast((i % 219) + 1); + + cout << "Running norm test (A2)\n"; + tic(); + volatile double s = 0.0; + for (size_t i = 0; i < N; ++i) + s += x[i] * x[i]; + double dt = toc(); + + double flops = 2.0 * N; // 1 mult + 1 add per element + double gflops = (flops / dt) / 1e9; + double traffic_bytes = N * sizeof(double); // only one vector is read, memory is halved + double gib_s = (traffic_bytes / dt) / (1024.0*1024.0*1024.0); + + cout << "A2 (norm): N=" << N << " time=" << dt + << " s GFLOPS=" << gflops << " GiB/s=" << gib_s << "\n"; + } + + else if (flag == 7) { //5(b) Kahan + size_t N = 50'000'000; + vector x(N), y(N); + for (size_t i = 0; i < N; ++i) { + x[i] = static_cast((i % 219) + 1); + y[i] = 1.0 / x[i]; + } + + cout << "Running Kahan dot product (A3)...\n"; + tic(); + volatile double s = dot_kahan(x, y); (void)s; + double dt = toc(); + + double flops = 6.0 * N; // more operations per element than standard dot + double gflops = (flops / dt) / 1e9; + double traffic_bytes = 2.0 * N * sizeof(double); + double gib_s = (traffic_bytes / dt) / (1024.0*1024.0*1024.0); + + cout << "A3 (Kahan): N=" << N << " time=" << dt + << " s GFLOPS=" << gflops << " GiB/s=" << gib_s << "\n"; + } + + else if (flag == 8) { //5(c): compare row-wise vs column-wise matrix access + size_t M = 1500, L = 1500, N = 1500; + vector A, B, C; + gen_matrix_A(M, L, A); + gen_matrix_A(L, N, B); + + cout << "Running matrix-matrix (row-wise)\n"; + tic(); + matmul_rowmajor(A, M, L, B, N, C); + double dt_row = toc(); + + cout << "Row-wise time: " << dt_row << " s\n"; + + // Column-wise version + C.assign(M*N, 0.0); + cout << "Running column-wise version\n"; + tic(); + for (size_t j = 0; j < N; ++j) + for (size_t k = 0; k < L; ++k) + for (size_t i = 0; i < M; ++i) + C[i*N + j] += A[i*L + k] * B[k*N + j]; + double dt_col = toc(); + cout << "Column-wise time: " << dt_col << " s\n"; //usually slower because it causes cache misses and lower memory bandwidth + } + + else { + cout << "Invalid flag. Choose 1–8.\n"; + } + + cout << "\nDone\n"; + return 0; +} \ No newline at end of file