From ce1d715eeb701f46872f547776ee3f71eb4430ae Mon Sep 17 00:00:00 2001
From: "lisa.pizzo"
Date: Thu, 4 Dec 2025 12:27:23 +0100
Subject: [PATCH] Supporting files for Ex4, Ex5 and Ex6

---
 Sheet5/bench_funcs.cpp | 285 +++++++++++++++++++++++++++++++++++++++++
 Sheet5/bench_funcs.h   |  50 ++++++++
 2 files changed, 335 insertions(+)
 create mode 100644 Sheet5/bench_funcs.cpp
 create mode 100644 Sheet5/bench_funcs.h

diff --git a/Sheet5/bench_funcs.cpp b/Sheet5/bench_funcs.cpp
new file mode 100644
index 0000000..9657e4f
--- /dev/null
+++ b/Sheet5/bench_funcs.cpp
@@ -0,0 +1,285 @@
+// bench_funcs.cpp
+#include "bench_funcs.h"
+#include <cstddef>
+#include <cmath>
+#include <vector>
+#include <omp.h>
+
+//EXERCISE 4
+// A: parallel sum
+double sum_basic(const std::vector<double>& x)
+{
+    double sum = 0.0;
+    #pragma omp parallel for reduction(+:sum)
+    for (std::size_t i = 0; i < x.size(); ++i) sum += x[i];
+    return sum;
+}
+
+// A: parallel dot product
+double dot_basic(const std::vector<double>& x, const std::vector<double>& y)
+{
+    std::size_t N = x.size();
+    double sum = 0.0;
+    #pragma omp parallel for reduction(+:sum)
+    for (std::size_t i = 0; i < N; ++i) sum += x[i] * y[i];
+    return sum;
+}
+
+// Kahan-compensated dot product (kept serial: the compensation step is
+// order-dependent and does not fit a plain OpenMP reduction)
+double dot_kahan(const std::vector<double>& x, const std::vector<double>& y)
+{
+    double sum = 0.0;
+    double c = 0.0;
+    for (std::size_t i = 0; i < x.size(); ++i)
+    {
+        double prod = x[i] * y[i];
+        double yk = prod - c;
+        double t = sum + yk;
+        c = (t - sum) - yk;
+        sum = t;
+    }
+    return sum;
+}
+
+// Squared 2-norm (returns the sum of squares, not its square root)
+double norm_basic(const std::vector<double>& x)
+{
+    double sumsq = 0.0;
+    #pragma omp parallel for reduction(+:sumsq)
+    for (std::size_t i = 0; i < x.size(); ++i) sumsq += x[i]*x[i];
+    return sumsq;
+}
+
+// B: matvec (row-major)
+void matvec_rowmajor(const std::vector<double>& A, std::size_t M, std::size_t N,
+                     const std::vector<double>& x, std::vector<double>& b)
+{
+    b.assign(M, 0.0);
+    #pragma omp parallel for
+    for (std::size_t i = 0; i < M; ++i) {
+        double tmp = 0.0;
+        const double* Ai = &A[i*N];
+        for (std::size_t j = 0; j < N; ++j) tmp += Ai[j] * x[j];
+        b[i] = tmp;
+    }
+}
+
+// C: matmul (row-major), C = A (MxL) * B (LxN)
+void matmul_rowmajor(const std::vector<double>& A, std::size_t M, std::size_t L,
+                     const std::vector<double>& B, std::size_t N,
+                     std::vector<double>& C)
+{
+    C.assign(M * N, 0.0);
+    // Parallelize over output rows (i)
+    #pragma omp parallel for
+    for (std::size_t i = 0; i < M; ++i) {
+        for (std::size_t k = 0; k < L; ++k) {
+            double Aik = A[i*L + k];
+            const double* Bk = &B[k*N];
+            double* Ci = &C[i*N];
+            for (std::size_t j = 0; j < N; ++j) {
+                Ci[j] += Aik * Bk[j];
+            }
+        }
+    }
+}
+
+// D: polynomial evaluation (Horner), parallel over x points
+void polyp_horner(const std::vector<double>& a, const std::vector<double>& x, std::vector<double>& y)
+{
+    std::size_t N = x.size();
+    y.assign(N, 0.0);
+    if (a.empty()) return;          // guard: no coefficients, y stays zero
+    std::size_t p = a.size() - 1;   // polynomial degree
+
+    #pragma omp parallel for
+    for (std::size_t i = 0; i < N; ++i) {
+        double xi = x[i];
+        double val = a[p];
+        for (std::size_t k = p; k-- > 0; ) {
+            val = val * xi + a[k];
+        }
+        y[i] = val;
+    }
+}
+
+//EXERCISE 5
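+// CSR layout used in Exercises 5 and 6 (struct CSR in bench_funcs.h): the
+// non-zeros of row i are stored in val[row_ptr[i] .. row_ptr[i+1]-1], with the
+// matching column indices in col. Every row is padded to exactly 3 slots
+// (left neighbour, diagonal, right neighbour); unused boundary slots hold an
+// explicit 0.0 with a dummy column index.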
+// Parallel assembly of tridiagonal FEM matrix (CSR)
+void build_fem_system(std::size_t n, CSR& K, std::vector<double>& f)
+{
+    K.n = n;
+    K.row_ptr.resize(n+1);
+    f.assign(n, 1.0);
+    K.val.resize(3*n);   // Each row has at most 3 non-zero entries: [-1, 2, -1]
+    K.col.resize(3*n);
+
+    for (std::size_t i = 0; i < n; ++i)
+        K.row_ptr[i] = 3*i;
+    K.row_ptr[n] = 3*n;
+
+    #pragma omp parallel for   // Fill matrix in parallel
+    for (std::size_t i = 0; i < n; ++i) {
+        std::size_t base = 3*i;
+
+        if (i > 0) {
+            K.val[base] = -1.0;
+            K.col[base] = i - 1;
+        } else {
+            K.val[base] = 0.0;   // padded slot in the first row
+            K.col[base] = 0;
+        }
+
+        K.val[base + 1] = 2.0;
+        K.col[base + 1] = i;
+
+        if (i + 1 < n) {
+            K.val[base + 2] = -1.0;
+            K.col[base + 2] = i + 1;
+        } else {
+            K.val[base + 2] = 0.0;   // padded slot in the last row
+            K.col[base + 2] = i;
+        }
+    }
+}
+
+// Parallel (weighted) Jacobi iteration:
+//   u_i^{new} = u_i + omega * (f_i - (K u)_i) / K_ii
+void jacobi_csr_parallel(const CSR& K, const std::vector<double>& f,
+                         std::vector<double>& u,
+                         std::size_t max_iter, double omega, double tol)
+{
+    std::size_t n = K.n;
+    u.assign(n, 0.0);
+    std::vector<double> u_new(n, 0.0);
+
+    for (std::size_t iter = 0; iter < max_iter; ++iter) {
+        double max_err = 0.0;
+
+        #pragma omp parallel for reduction(max:max_err)
+        for (std::size_t i = 0; i < n; ++i) {
+            double diag = 0.0;
+            double sum = 0.0;
+
+            for (std::size_t idx = K.row_ptr[i]; idx < K.row_ptr[i+1]; ++idx) {
+                std::size_t j = K.col[idx];
+                double v = K.val[idx];
+                if (j == i) diag += v;   // accumulate: padded rows carry an extra 0.0 on the diagonal
+                else sum += v * u[j];
+            }
+
+            double ui_new = u[i] + omega * (f[i] - sum - diag*u[i]) / diag;
+            u_new[i] = ui_new;
+
+            double err = std::fabs(ui_new - u[i]);
+            if (err > max_err) max_err = err;
+        }
+
+        u.swap(u_new);
+        if (max_err < tol) break;
+    }
+}
+
+//EXERCISE 6
+// Helper: build empty CSR structure (3 entries per row)
+static void prepare_tridiag_csr(std::size_t n, CSR &K)
+{
+    K.n = n;
+    K.row_ptr.resize(n + 1);
+    // Each row has 3 slots (left, diag, right)
+    for (std::size_t i = 0; i < n; ++i) K.row_ptr[i] = 3 * i;
+    K.row_ptr[n] = 3 * n;
+
+    K.val.assign(3 * n, 0.0);
+    K.col.resize(3 * n);
+
+    // fill column indices
+    for (std::size_t i = 0; i < n; ++i) {
+        std::size_t base = 3 * i;
+        // left column index
+        if (i > 0) K.col[base + 0] = i - 1;
+        else K.col[base + 0] = i;   // dummy (no entry)
+        // diagonal
+        K.col[base + 1] = i;
+        // right
+        if (i + 1 < n) K.col[base + 2] = i + 1;
+        else K.col[base + 2] = i;   // dummy
+    }
+}
+
+// Element stiffness matrix (1D linear element on a uniform mesh)
+// local ordering: node 0 = left, node 1 = right
+// element matrix (unscaled) = [ 1 -1; -1 1 ]
+// For a uniform mesh with h = 1/(n-1) the entries would be scaled by 1/h;
+// for simplicity a scaling of 1.0 is used here.
+inline void element_matrix_vals(double out[4])
+{
+    // ordering out = [K00, K01, K10, K11]
+    out[0] = 1.0; out[1] = -1.0; out[2] = -1.0; out[3] = 1.0;
+}
+
+// Atomic assembly: parallel over elements, atomic adds to K.val
+void build_fem_system_atomic(std::size_t n, CSR &K, std::vector<double> &f)
+{
+    prepare_tridiag_csr(n, K);
+    f.assign(n, 1.0);   // RHS = 1
+
+    std::size_t nel = (n >= 2) ? (n - 1) : 0;
+    // Parallel over elements
+    #pragma omp parallel for
+    for (std::size_t e = 0; e < nel; ++e) {
+        // global node indices for element e: i = e, j = e+1
+        std::size_t i = e;
+        std::size_t j = e + 1;
+        double Em[4];
+        element_matrix_vals(Em);
+
+        // contributions:
+        //   K(i,i) += Em[0]
+        //   K(i,j) += Em[1]
+        //   K(j,i) += Em[2]
+        //   K(j,j) += Em[3]
+        std::size_t idx_ii = 3 * i + 1;   // diag of row i
+        std::size_t idx_ij = 3 * i + 2;   // right of row i
+        std::size_t idx_ji = 3 * j + 0;   // left of row j
+        std::size_t idx_jj = 3 * j + 1;   // diag of row j
+
+        #pragma omp atomic
+        K.val[idx_ii] += Em[0];
+        #pragma omp atomic
+        K.val[idx_ij] += Em[1];
+        #pragma omp atomic
+        K.val[idx_ji] += Em[2];
+        #pragma omp atomic
+        K.val[idx_jj] += Em[3];
+    }
+}
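+
+// Why the atomics above are needed: each interior node i receives a diagonal
+// contribution from both element i-1 and element i, so two threads working on
+// neighbouring elements may add to the same diagonal slot K.val[3*i + 1] at
+// the same time. The variant below avoids this race by parallelising over
+// rows instead of elements, so every CSR slot is written by exactly one thread.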
+
+// No-atomic grouped assembly (method (a)):
+// each row i's three entries are computed by summing the (<= 2) contributing
+// elements, so the loop is parallel over rows and needs no atomics.
+void build_fem_system_no_atomic(std::size_t n, CSR &K, std::vector<double> &f)
+{
+    prepare_tridiag_csr(n, K);
+    f.assign(n, 1.0);
+
+    // element matrix values (the same for every element here)
+    double Em00 = 1.0, Em01 = -1.0, Em10 = -1.0, Em11 = 1.0;
+
+    #pragma omp parallel for   // parallel over rows (global entries)
+    for (std::size_t i = 0; i < n; ++i) {
+        std::size_t base = 3 * i;
+
+        // left entry K[i, i-1]: contribution only from element e = i-1
+        if (i > 0) K.val[base + 0] = Em10;   // element e = i-1, local (1,0)
+        else       K.val[base + 0] = 0.0;
+
+        // diagonal K[i,i]: contributions from element e = i-1 (local (1,1)) and e = i (local (0,0))
+        double diag = 0.0;
+        if (i > 0)     diag += Em11;   // e = i-1, local (1,1)
+        if (i + 1 < n) diag += Em00;   // e = i,   local (0,0)
+        K.val[base + 1] = diag;
+
+        // right entry K[i, i+1]: contribution only from element e = i
+        if (i + 1 < n) K.val[base + 2] = Em01;   // e = i, local (0,1)
+        else           K.val[base + 2] = 0.0;
+    }
+}
diff --git a/Sheet5/bench_funcs.h b/Sheet5/bench_funcs.h
new file mode 100644
index 0000000..048cf6e
--- /dev/null
+++ b/Sheet5/bench_funcs.h
@@ -0,0 +1,50 @@
+#pragma once
+#include <cstddef>
+#include <vector>
+
+//EXERCISE 4
+// scalar and vector operations
+double sum_basic(const std::vector<double>& x);
+double dot_basic(const std::vector<double>& x, const std::vector<double>& y);
+double dot_kahan(const std::vector<double>& x, const std::vector<double>& y);
+double norm_basic(const std::vector<double>& x);
+
+// matrix-vector, matrix-matrix, polynomial
+void matvec_rowmajor(const std::vector<double>& A, std::size_t M, std::size_t N,
+                     const std::vector<double>& x, std::vector<double>& b);
+
+void matmul_rowmajor(const std::vector<double>& A, std::size_t M, std::size_t L,
+                     const std::vector<double>& B, std::size_t N,
+                     std::vector<double>& C);
+
+void polyp_horner(const std::vector<double>& a, const std::vector<double>& x,
+                  std::vector<double>& y);
+
+
+// EXERCISE 5
+struct CSR {
+    std::size_t n;
+    std::vector<double> val;
+    std::vector<std::size_t> col;
+    std::vector<std::size_t> row_ptr;
+};
+
+// jacobi_csr is declared here but not defined in this patch
+void jacobi_csr(const CSR& K, const std::vector<double>& f, std::vector<double>& u,
+                std::size_t max_iter, double omega, double tol);
+
+// build tridiagonal FEM matrix K and rhs f (parallel)
+void build_fem_system(std::size_t n, CSR& K, std::vector<double>& f);
+
+// Jacobi iteration (parallel)
+void jacobi_csr_parallel(const CSR& K, const std::vector<double>& f,
+                         std::vector<double>& u,
+                         std::size_t max_iter, double omega, double tol);
+
+// EXERCISE 6
+
+// parallel element assembly with atomics
+void build_fem_system_atomic(std::size_t n, CSR &K, std::vector<double> &f);
+
+// No-atomic grouped assembly (method (a))
+void build_fem_system_no_atomic(std::size_t n, CSR &K, std::vector<double> &f);
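
Usage note (not part of the patch): a minimal driver sketch showing how the routines above might be wired together. The file name driver_sketch.cpp, the problem size n, the iteration limit, the relaxation factor omega and the tolerance are illustrative assumptions, not values taken from the exercise sheet.

    // driver_sketch.cpp -- illustrative sketch only (hypothetical file, not in this patch)
    // build with e.g.:  g++ -O2 -fopenmp driver_sketch.cpp bench_funcs.cpp
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>
    #include "bench_funcs.h"

    int main()
    {
        const std::size_t n = 1000;   // illustrative problem size

        // Exercise 5: assemble the tridiagonal system and run weighted Jacobi.
        CSR K;
        std::vector<double> f, u;
        build_fem_system(n, K, f);
        jacobi_csr_parallel(K, f, u, /*max_iter=*/10000, /*omega=*/1.0, /*tol=*/1e-8);
        std::printf("u[n/2] = %g\n", u[n/2]);

        // Exercise 6: both element-assembly variants should produce the same matrix.
        CSR Ka, Kb;
        std::vector<double> fa, fb;
        build_fem_system_atomic(n, Ka, fa);
        build_fem_system_no_atomic(n, Kb, fb);
        double max_diff = 0.0;
        for (std::size_t k = 0; k < Ka.val.size(); ++k)
            max_diff = std::max(max_diff, std::fabs(Ka.val[k] - Kb.val[k]));
        std::printf("max |K_atomic - K_no_atomic| = %g\n", max_diff);
        return 0;
    }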