LisaPizzoExercises/Sheet3/main.cpp

230 lines
No EOL
7.7 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <iostream>
#include <vector>
#include <cmath>
#include <iomanip>
#include <chrono>
#include "bench_funcs.h"
#include "bench_funcs.cpp"
using namespace std;
using namespace std::chrono;
/// Fills two test vectors of length N for the inner-product benchmarks.
///
/// @param N  number of entries to generate
/// @param x  resized to N; x[i] = (i mod 219) + 1, i.e. the values 1..219 repeating
/// @param y  resized to N; y[i] = 1 / x[i], so every product x[i]*y[i] is ~1
void gen_vector_x_y(std::size_t N, std::vector<double>& x, std::vector<double>& y) {
    constexpr std::size_t kPeriod = 219;

    x.resize(N);
    y.resize(N);

    // `phase` always equals idx % kPeriod; it wraps instead of being recomputed.
    std::size_t phase = 0;
    for (std::size_t idx = 0; idx != N; ++idx) {
        const double value = static_cast<double>(phase + 1);
        x[idx] = value;
        y[idx] = 1.0 / value;
        if (++phase == kPeriod) {
            phase = 0;
        }
    }
}
/// Fills a dense M x N test matrix stored row by row in a flat vector.
///
/// @param M  number of rows
/// @param N  number of columns
/// @param A  resized to M*N; entry (i, j) lives at A[i*N + j] and is set to
///           ((i + j) mod 219) + 1
void gen_matrix_A(std::size_t M, std::size_t N, std::vector<double>& A) {
    constexpr std::size_t kPeriod = 219;

    A.resize(M * N);

    // Rows are contiguous, so one running iterator visits A[row*N + col] in order.
    auto out = A.begin();
    for (std::size_t row = 0; row != M; ++row) {
        for (std::size_t col = 0; col != N; ++col, ++out) {
            *out = static_cast<double>((row + col) % kPeriod + 1);
        }
    }
}
/// Start of the measurement in progress; written by tic(), read by toc().
/// steady_clock is used because it is monotonic: unlike high_resolution_clock
/// (which may alias the adjustable system clock) it cannot jump when the wall
/// clock is corrected, so a measured interval is never negative or inflated.
std::chrono::steady_clock::time_point tic_timer;

/// Starts (or restarts) the stopwatch.
void tic() { tic_timer = std::chrono::steady_clock::now(); }

/// Returns the time elapsed since the most recent tic(), in seconds.
/// The stopwatch keeps running, so toc() may be called repeatedly.
double toc() {
    const auto now = std::chrono::steady_clock::now();
    const std::chrono::duration<double> elapsed = now - tic_timer;
    return elapsed.count();
}
//CHANGE FLAG BASED ON WHAT YOU WANT TO DO
int main() {
int flag = 8; // 1=A, 2=B, 3=C, 4=D, 5=Jacobi 6=Norm in 5a, 7= Kahan in 5b and 8=colums access in 5c
size_t N = 5000000; // default vector length
size_t M = 3000, L = 3000;
if (flag == 1) {
// A) Inner product
vector<double> x, y;
gen_vector_x_y(N, x, y);
cout << "Running dot_basic" << endl;
tic();
volatile double s = dot_basic(x, y); (void)s;
double dt = toc();
double flops = 2.0 * N; //x_i*y_i=1 FLOPS, the sum = 1 FLOPS, for N times = 2N
double gflops = (flops / dt) / 1e9; // computes how many flops per second does my computer and then converts to giga FLOPS
double traffic_bytes = 2.0 * N * sizeof(double); //Memory usage in bytes: 2 vectors of length N times the size of double
double gib_s = (traffic_bytes / dt) / (1024.0*1024.0*1024.0); //computes how many bytes are moved by my computer per second and converts to Gibibytes per second
cout << "A: N=" << N << " time=" << dt << " s GFLOPS=" << gflops
<< " GiB/s=" << gib_s << "\n";
}
else if (flag == 2) {
//B) Matrix*vector
size_t m = M, n = 5000;
vector<double> A, x, b;
gen_matrix_A(m, n, A);
x.resize(n);
for (size_t j = 0; j < n; ++j)
x[j] = 1.0 / (((17 + j) % 219) + 1);
cout << "Running matvec\n";
tic();
matvec_rowmajor(A, m, n, x, b);
double dt = toc();
double flops = 2.0 * m * n; //each y_i does N multiplications and N additions = Mx2N
double gflops = (flops / dt) / 1e9;
double traffic_bytes = (m*n + n + m) * sizeof(double); //Memory usage in bytes: Matrix size mxn, 2 vectors n and m
double gib_s = (traffic_bytes / dt) / (1024.0*1024.0*1024.0);
cout << "B: M=" << m << " N=" << n << " time=" << dt
<< " s GFLOPS=" << gflops << " GiB/s=" << gib_s << "\n";
}
else if (flag == 3) {
// C) Matrix*matrix
size_t m = M, l = L, n = 500;
vector<double> A, B, C;
gen_matrix_A(m, l, A);
gen_matrix_A(l, n, B);
cout << "Running matmul\n";
tic();
matmul_rowmajor(A, m, l, B, n, C);
double dt = toc();
double flops = 2.0 * m * l * n; //each element of C does N moltiplications and N additions: dim(C)=MxL hence MxLx2N
double gflops = (flops / dt) / 1e9;
double traffic_bytes = (m*l + l*n + m*n) * sizeof(double); //Memory usage in bytes: 3 matices MxN, NxL, MxL
double gib_s = (traffic_bytes / dt) / (1024.0*1024.0*1024.0);
cout << "C: M=" << m << " L=" << l << " N=" << n << " time=" << dt
<< " s GFLOPS=" << gflops << " GiB/s=" << gib_s << "\n";
}
else if (flag == 4) {
// D) Polynomial
size_t p = 100; // degree
vector<double> a(p+1), x(N), y;
for (size_t k = 0; k <= p; ++k)
a[k] = 1.0 / (k+1);
for (size_t i = 0; i < N; ++i)
x[i] = (i % 219) * 0.001 + 1.0;
cout << "Running POLYNOMIAL test (D)\n";
tic();
polyp_horner(a, x, y);
double dt = toc();
double flops = 2.0 * p * N; //each evaluation p moltiplications + p additions = 2pxN
double gflops = (flops / dt) / 1e9;
double traffic_bytes = (p+1 + N + N) * sizeof(double); //Memory usage in bytes: p+1 coefficients and N evaluation points
double gib_s = (traffic_bytes / dt) / (1024.0*1024.0*1024.0);
cout << "D: p=" << p << " N=" << N << " time=" << dt
<< " s GFLOPS=" << gflops << " GiB/s=" << gib_s << "\n";
}
else if (flag == 5) {
// E) Jacobi
size_t n = 10000;
CSR K; K.n = n;
vector<double> f(n, 1.0), u;
K.row_ptr.resize(n+1);
K.val.reserve(3*n);
K.col.reserve(3*n);
for (size_t i = 0; i < n; ++i) {
K.row_ptr[i] = K.val.size();
if (i > 0) { K.val.push_back(-1.0); K.col.push_back(i-1); }
K.val.push_back(2.0); K.col.push_back(i);
if (i+1 < n) { K.val.push_back(-1.0); K.col.push_back(i+1); }
}
K.row_ptr[n] = K.val.size();
size_t maxit = 5000;
double omega = 1.0, tol = 1e-8;
cout << "Running JACOBI solver test...\n";
tic();
jacobi_csr(K, f, u, maxit, omega, tol);
double dt = toc();
cout << "Jacobi: n=" << n << " time=" << dt << " s\n";
}
else if (flag == 6) {//5(a)
size_t N = 5000000; // large enough for ~10 s runtime
vector<double> x(N);
for (size_t i = 0; i < N; ++i)
x[i] = static_cast<double>((i % 219) + 1);
cout << "Running norm test (A2)\n";
tic();
volatile double s = 0.0;
for (size_t i = 0; i < N; ++i)
s += x[i] * x[i];
double dt = toc();
double flops = 2.0 * N; // 1 mult + 1 add per element
double gflops = (flops / dt) / 1e9;
double traffic_bytes = N * sizeof(double); // only one vector is read, memory is halved
double gib_s = (traffic_bytes / dt) / (1024.0*1024.0*1024.0);
cout << "A2 (norm): N=" << N << " time=" << dt
<< " s GFLOPS=" << gflops << " GiB/s=" << gib_s << "\n";
}
else if (flag == 7) { //5(b) Kahan
size_t N = 50'000'000;
vector<double> x(N), y(N);
for (size_t i = 0; i < N; ++i) {
x[i] = static_cast<double>((i % 219) + 1);
y[i] = 1.0 / x[i];
}
cout << "Running Kahan dot product (A3)...\n";
tic();
volatile double s = dot_kahan(x, y); (void)s;
double dt = toc();
double flops = 6.0 * N; // more operations per element than standard dot
double gflops = (flops / dt) / 1e9;
double traffic_bytes = 2.0 * N * sizeof(double);
double gib_s = (traffic_bytes / dt) / (1024.0*1024.0*1024.0);
cout << "A3 (Kahan): N=" << N << " time=" << dt
<< " s GFLOPS=" << gflops << " GiB/s=" << gib_s << "\n";
}
else if (flag == 8) { //5(c): compare row-wise vs column-wise matrix access
size_t M = 1500, L = 1500, N = 1500;
vector<double> A, B, C;
gen_matrix_A(M, L, A);
gen_matrix_A(L, N, B);
cout << "Running matrix-matrix (row-wise)\n";
tic();
matmul_rowmajor(A, M, L, B, N, C);
double dt_row = toc();
cout << "Row-wise time: " << dt_row << " s\n";
// Column-wise version
C.assign(M*N, 0.0);
cout << "Running column-wise version\n";
tic();
for (size_t j = 0; j < N; ++j)
for (size_t k = 0; k < L; ++k)
for (size_t i = 0; i < M; ++i)
C[i*N + j] += A[i*L + k] * B[k*N + j];
double dt_col = toc();
cout << "Column-wise time: " << dt_col << " s\n"; //usually slower because it causes cache misses and lower memory bandwidth
}
else {
cout << "Invalid flag. Choose 18.\n";
}
cout << "\nDone\n";
return 0;
}