Task 2/3/4/5 of Exercise sheet 3
This commit is contained in:
parent
c771a5cb37
commit
44e8b9d13b
3 changed files with 370 additions and 0 deletions
230
Sheet3/main.cpp
Normal file
230
Sheet3/main.cpp
Normal file
|
|
@ -0,0 +1,230 @@
|
|||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include <iomanip>
|
||||
#include <chrono>
|
||||
#include "bench_funcs.h"
|
||||
#include "bench_funcs.cpp"
|
||||
|
||||
using namespace std;
|
||||
using namespace std::chrono;
|
||||
|
||||
/// Fills two benchmark vectors of length N.
///
/// @param N  number of elements to generate; N == 0 leaves both vectors empty
/// @param x  output, resized to N with x[i] = (i mod 219) + 1
/// @param y  output, resized to N with y[i] = 1 / x[i]
void gen_vector_x_y(std::size_t N, std::vector<double>& x, std::vector<double>& y) {
    // Values of x cycle through 1..219, so x[i] is never zero and 1/x[i] is finite.
    constexpr std::size_t kPeriod = 219;

    x.resize(N);
    y.resize(N);
    for (std::size_t i = 0; i < N; ++i) {
        x[i] = static_cast<double>((i % kPeriod) + 1);
        y[i] = 1.0 / x[i];
    }
}
|
||||
|
||||
/// Fills a dense M x N benchmark matrix stored row-major in a flat vector.
///
/// @param M  number of rows
/// @param N  number of columns
/// @param A  output, resized to M*N; entry (i, j) is stored at A[i*N + j]
///           and set to ((i + j) mod 219) + 1
void gen_matrix_A(std::size_t M, std::size_t N, std::vector<double>& A) {
    // Entries cycle through 1..219 along every row and every column.
    constexpr std::size_t kPeriod = 219;

    A.resize(M * N);
    for (std::size_t i = 0; i < M; ++i) {
        for (std::size_t j = 0; j < N; ++j) {
            A[i * N + j] = static_cast<double>(((i + j) % kPeriod) + 1);
        }
    }
}
|
||||
|
||||
// Start time recorded by the most recent tic() call.
// steady_clock is monotonic, so a wall-clock adjustment during a run cannot
// distort (or make negative) the interval reported by toc().
std::chrono::steady_clock::time_point tic_timer;

/// Starts (or restarts) the stopwatch.
void tic() { tic_timer = std::chrono::steady_clock::now(); }

/// @return seconds elapsed since the most recent tic() call, as a double.
double toc() {
    const auto now = std::chrono::steady_clock::now();
    const std::chrono::duration<double> elapsed = now - tic_timer;
    return elapsed.count();
}
|
||||
|
||||
//CHANGE FLAG BASED ON WHAT YOU WANT TO DO
|
||||
int main() {
|
||||
int flag = 8; // 1=A, 2=B, 3=C, 4=D, 5=Jacobi 6=Norm in 5a, 7= Kahan in 5b and 8=colums access in 5c
|
||||
size_t N = 5000000; // default vector length
|
||||
size_t M = 3000, L = 3000;
|
||||
if (flag == 1) {
|
||||
// A) Inner product
|
||||
vector<double> x, y;
|
||||
gen_vector_x_y(N, x, y);
|
||||
cout << "Running dot_basic" << endl;
|
||||
tic();
|
||||
volatile double s = dot_basic(x, y); (void)s;
|
||||
double dt = toc();
|
||||
|
||||
double flops = 2.0 * N; //x_i*y_i=1 FLOPS, the sum = 1 FLOPS, for N times = 2N
|
||||
double gflops = (flops / dt) / 1e9; // computes how many flops per second does my computer and then converts to giga FLOPS
|
||||
double traffic_bytes = 2.0 * N * sizeof(double); //Memory usage in bytes: 2 vectors of length N times the size of double
|
||||
double gib_s = (traffic_bytes / dt) / (1024.0*1024.0*1024.0); //computes how many bytes are moved by my computer per second and converts to Gibibytes per second
|
||||
|
||||
cout << "A: N=" << N << " time=" << dt << " s GFLOPS=" << gflops
|
||||
<< " GiB/s=" << gib_s << "\n";
|
||||
}
|
||||
|
||||
else if (flag == 2) {
|
||||
//B) Matrix*vector
|
||||
size_t m = M, n = 5000;
|
||||
vector<double> A, x, b;
|
||||
gen_matrix_A(m, n, A);
|
||||
x.resize(n);
|
||||
for (size_t j = 0; j < n; ++j)
|
||||
x[j] = 1.0 / (((17 + j) % 219) + 1);
|
||||
|
||||
cout << "Running matvec\n";
|
||||
tic();
|
||||
matvec_rowmajor(A, m, n, x, b);
|
||||
double dt = toc();
|
||||
|
||||
double flops = 2.0 * m * n; //each y_i does N multiplications and N additions = Mx2N
|
||||
double gflops = (flops / dt) / 1e9;
|
||||
double traffic_bytes = (m*n + n + m) * sizeof(double); //Memory usage in bytes: Matrix size mxn, 2 vectors n and m
|
||||
double gib_s = (traffic_bytes / dt) / (1024.0*1024.0*1024.0);
|
||||
|
||||
cout << "B: M=" << m << " N=" << n << " time=" << dt
|
||||
<< " s GFLOPS=" << gflops << " GiB/s=" << gib_s << "\n";
|
||||
}
|
||||
|
||||
else if (flag == 3) {
|
||||
// C) Matrix*matrix
|
||||
size_t m = M, l = L, n = 500;
|
||||
vector<double> A, B, C;
|
||||
gen_matrix_A(m, l, A);
|
||||
gen_matrix_A(l, n, B);
|
||||
|
||||
cout << "Running matmul\n";
|
||||
tic();
|
||||
matmul_rowmajor(A, m, l, B, n, C);
|
||||
double dt = toc();
|
||||
|
||||
double flops = 2.0 * m * l * n; //each element of C does N moltiplications and N additions: dim(C)=MxL hence MxLx2N
|
||||
double gflops = (flops / dt) / 1e9;
|
||||
double traffic_bytes = (m*l + l*n + m*n) * sizeof(double); //Memory usage in bytes: 3 matices MxN, NxL, MxL
|
||||
double gib_s = (traffic_bytes / dt) / (1024.0*1024.0*1024.0);
|
||||
|
||||
cout << "C: M=" << m << " L=" << l << " N=" << n << " time=" << dt
|
||||
<< " s GFLOPS=" << gflops << " GiB/s=" << gib_s << "\n";
|
||||
}
|
||||
|
||||
else if (flag == 4) {
|
||||
// D) Polynomial
|
||||
size_t p = 100; // degree
|
||||
vector<double> a(p+1), x(N), y;
|
||||
for (size_t k = 0; k <= p; ++k)
|
||||
a[k] = 1.0 / (k+1);
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
x[i] = (i % 219) * 0.001 + 1.0;
|
||||
|
||||
cout << "Running POLYNOMIAL test (D)\n";
|
||||
tic();
|
||||
polyp_horner(a, x, y);
|
||||
double dt = toc();
|
||||
|
||||
double flops = 2.0 * p * N; //each evaluation p moltiplications + p additions = 2pxN
|
||||
double gflops = (flops / dt) / 1e9;
|
||||
double traffic_bytes = (p+1 + N + N) * sizeof(double); //Memory usage in bytes: p+1 coefficients and N evaluation points
|
||||
double gib_s = (traffic_bytes / dt) / (1024.0*1024.0*1024.0);
|
||||
|
||||
cout << "D: p=" << p << " N=" << N << " time=" << dt
|
||||
<< " s GFLOPS=" << gflops << " GiB/s=" << gib_s << "\n";
|
||||
}
|
||||
|
||||
else if (flag == 5) {
|
||||
// E) Jacobi
|
||||
size_t n = 10000;
|
||||
CSR K; K.n = n;
|
||||
vector<double> f(n, 1.0), u;
|
||||
K.row_ptr.resize(n+1);
|
||||
K.val.reserve(3*n);
|
||||
K.col.reserve(3*n);
|
||||
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
K.row_ptr[i] = K.val.size();
|
||||
if (i > 0) { K.val.push_back(-1.0); K.col.push_back(i-1); }
|
||||
K.val.push_back(2.0); K.col.push_back(i);
|
||||
if (i+1 < n) { K.val.push_back(-1.0); K.col.push_back(i+1); }
|
||||
}
|
||||
K.row_ptr[n] = K.val.size();
|
||||
|
||||
size_t maxit = 5000;
|
||||
double omega = 1.0, tol = 1e-8;
|
||||
|
||||
cout << "Running JACOBI solver test...\n";
|
||||
tic();
|
||||
jacobi_csr(K, f, u, maxit, omega, tol);
|
||||
double dt = toc();
|
||||
cout << "Jacobi: n=" << n << " time=" << dt << " s\n";
|
||||
}
|
||||
|
||||
else if (flag == 6) {//5(a)
|
||||
size_t N = 5000000; // large enough for ~10 s runtime
|
||||
vector<double> x(N);
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
x[i] = static_cast<double>((i % 219) + 1);
|
||||
|
||||
cout << "Running norm test (A2)\n";
|
||||
tic();
|
||||
volatile double s = 0.0;
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
s += x[i] * x[i];
|
||||
double dt = toc();
|
||||
|
||||
double flops = 2.0 * N; // 1 mult + 1 add per element
|
||||
double gflops = (flops / dt) / 1e9;
|
||||
double traffic_bytes = N * sizeof(double); // only one vector is read, memory is halved
|
||||
double gib_s = (traffic_bytes / dt) / (1024.0*1024.0*1024.0);
|
||||
|
||||
cout << "A2 (norm): N=" << N << " time=" << dt
|
||||
<< " s GFLOPS=" << gflops << " GiB/s=" << gib_s << "\n";
|
||||
}
|
||||
|
||||
else if (flag == 7) { //5(b) Kahan
|
||||
size_t N = 50'000'000;
|
||||
vector<double> x(N), y(N);
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
x[i] = static_cast<double>((i % 219) + 1);
|
||||
y[i] = 1.0 / x[i];
|
||||
}
|
||||
|
||||
cout << "Running Kahan dot product (A3)...\n";
|
||||
tic();
|
||||
volatile double s = dot_kahan(x, y); (void)s;
|
||||
double dt = toc();
|
||||
|
||||
double flops = 6.0 * N; // more operations per element than standard dot
|
||||
double gflops = (flops / dt) / 1e9;
|
||||
double traffic_bytes = 2.0 * N * sizeof(double);
|
||||
double gib_s = (traffic_bytes / dt) / (1024.0*1024.0*1024.0);
|
||||
|
||||
cout << "A3 (Kahan): N=" << N << " time=" << dt
|
||||
<< " s GFLOPS=" << gflops << " GiB/s=" << gib_s << "\n";
|
||||
}
|
||||
|
||||
else if (flag == 8) { //5(c): compare row-wise vs column-wise matrix access
|
||||
size_t M = 1500, L = 1500, N = 1500;
|
||||
vector<double> A, B, C;
|
||||
gen_matrix_A(M, L, A);
|
||||
gen_matrix_A(L, N, B);
|
||||
|
||||
cout << "Running matrix-matrix (row-wise)\n";
|
||||
tic();
|
||||
matmul_rowmajor(A, M, L, B, N, C);
|
||||
double dt_row = toc();
|
||||
|
||||
cout << "Row-wise time: " << dt_row << " s\n";
|
||||
|
||||
// Column-wise version
|
||||
C.assign(M*N, 0.0);
|
||||
cout << "Running column-wise version\n";
|
||||
tic();
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
for (size_t k = 0; k < L; ++k)
|
||||
for (size_t i = 0; i < M; ++i)
|
||||
C[i*N + j] += A[i*L + k] * B[k*N + j];
|
||||
double dt_col = toc();
|
||||
cout << "Column-wise time: " << dt_col << " s\n"; //usually slower because it causes cache misses and lower memory bandwidth
|
||||
}
|
||||
|
||||
else {
|
||||
cout << "Invalid flag. Choose 1–8.\n";
|
||||
}
|
||||
|
||||
cout << "\nDone\n";
|
||||
return 0;
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue