scf_celebic/ex3/code/main.cpp
dino.celebic 36a12731ef ex4
2025-11-25 19:21:38 +01:00

268 lines
No EOL
8.7 KiB
C++

// Compile with "make" / "make run" or
// g++ *.cpp -O3 -ffast-math -lopenblas -llapacke -o main
#include "task_3.h"
#include "task_4+6.h"
#include "task_5.h"
#include "task_7.h"
#include "timing.h"
#include <cblas.h> // cBLAS Library
#include <iomanip>
#include <iostream>
#include <lapacke.h>
void task_1() {
printf("\n\n-------------- Task 1 --------------\n\n");
printf("See comment in main.cpp");
// -------------------------------------------------------------
// STREAM version $Revision: 5.10 $
// -------------------------------------------------------------
// This system uses 8 bytes per array element.
// -------------------------------------------------------------
// Array size = 80000000 (elements), Offset = 0 (elements)
// Memory per array = 610.4 MiB (= 0.6 GiB).
// Total memory required = 1831.1 MiB (= 1.8 GiB).
// Each kernel will be executed 20 times.
// The *best* time for each kernel (excluding the first iteration)
// will be used to compute the reported bandwidth.
// -------------------------------------------------------------
// Your clock granularity/precision appears to be 1 microseconds.
// Each test below will take on the order of 116886 microseconds.
// (= 116886 clock ticks)
// Increase the size of the arrays if this shows that
// you are not getting at least 20 clock ticks per test.
// -------------------------------------------------------------
// WARNING -- The above is only a rough guideline.
// For best results, please be sure you know the
// precision of your system timer.
// -------------------------------------------------------------
// Function Best Rate MB/s Avg time Min time Max time
// Copy: 29569.4 0.048585 0.043288 0.059164
// Scale: 17644.0 0.082248 0.072546 0.102548
// Add: 21030.1 0.100620 0.091298 0.124700
// Triad: 21230.7 0.100758 0.090435 0.120631
// -------------------------------------------------------------
// Solution Validates: avg error less than 1.000000e-13 on all three arrays
// -------------------------------------------------------------
// ./flops.exe
// FLOPS C Program (Double Precision), V2.0 18 Dec 1992
// Module Error RunTime MFLOPS
// (usec)
// 1 4.0146e-13 0.0024 5827.9076
// 2 -1.4166e-13 0.0007 10037.8942
// 3 4.7184e-14 0.0039 4371.9185
// 4 -1.2557e-13 0.0034 4355.5711
// 5 -1.3800e-13 0.0066 4415.6439
// 6 3.2380e-13 0.0065 4441.6299
// 7 -8.4583e-11 0.0053 2277.1707
// 8 3.4867e-13 0.0069 4367.6094
// Iterations = 512000000
// NullTime (usec) = 0.0000
// MFLOPS(1) = 7050.6178
// MFLOPS(2) = 3461.6233
// MFLOPS(3) = 4175.0442
// MFLOPS(4) = 4389.7311
}
void task_2() {
printf("\n\n-------------- Task 2 --------------\n\n");
printf("See comment in main.cpp");
// Memory needed (double 64-bit, 8 bytes):
// (A) (2N + 1) * 8 bytes
// (B) (M*N + M + N) * 8 bytes
// (C) (M*L + L*N + M*N) * 8 bytes
// (D) (N + N + p) * 8 bytes
// Floating point operations:
// (A) 2N
// (B) M * 2N
// (C) M * 2L * N
// (D) 2 * N * p (Horner Schema)
// Read/Write operations:
// (A) Read: 2N Write: 1
// (B) Read: M*2N Write: M*N
// (C) Read: M*2L*N Write: M*L*N
// (D) Read: 2*N*p Write: N*P
}
void task_3() {
printf("\n\n-------------- Task 3 --------------\n\n");
printf("Functions implemented in task_3.cpp");
}
void task_4(bool cblas = false) {
if (cblas == false) {printf("\n\n-------------- Task 4 --------------\n\n");}
size_t M, N, L, p, NLOOPS;
{ // Scalar product
printf("----- Benchmark (A) -----\n");
// Initialization
N = 50'000'000;
NLOOPS = 50;
auto [x,y] = init_A(N);
// Benchmark
tic();
benchmark_A(x, y, NLOOPS, cblas);
double sec = toc() / NLOOPS;
// Timings and Performance
size_t memory = 2 * N;
size_t flops = 2 * N;
print_performance(sec, memory, flops, sizeof(x[0]));
printf("-------------------------\n");
}
{ // Matrix-Vector product
printf("----- Benchmark (B) -----\n");
// Initialization
M = 8'000;
N = 12'000;
NLOOPS = 30;
auto [A,x] = init_B(M,N);
// Benchmark
tic();
benchmark_B(A, x, NLOOPS, cblas);
double sec = toc() / NLOOPS;
// Timings and Performance
size_t memory = M*N + M + N;
size_t flops = 2 * M * N;
print_performance(sec, memory, flops, sizeof(A[0]));
printf("-------------------------\n");
}
{ // Matrix-Matrix product
printf("----- Benchmark (C) -----\n");
// Initialization
M = 1'000;
N = 2'000;
L = 500;
NLOOPS = 20;
auto [A,B] = init_C(M,N,L);
// Benchmark
tic();
benchmark_C(A, B, L, NLOOPS, cblas);
double sec = toc() / NLOOPS;
// Timings and Performance
size_t memory = M*L + L*N + M*N;
size_t flops = M * 2*L * N;
print_performance(sec, memory, flops, sizeof(A[0]));
printf("-------------------------\n");
}
if (cblas == false)
{ // Polynomial evaluation
printf("----- Benchmark (D) -----\n");
// Initialization
N = 1'000'000;
p = 200;
NLOOPS = 20;
auto [x,a] = init_D(N,p);
// Benchmark
tic();
benchmark_D(x, a, NLOOPS);
double sec = toc() / NLOOPS;
// Timings and Performance
size_t memory = 2.0 * N;
size_t flops = 2.0 * N * p;
print_performance(sec, memory, flops, sizeof(x[0]));
printf("-------------------------\n");
}
}
void task_5() {
printf("\n\n-------------- Task 5 --------------\n\n");
printf("----- Benchmark norm -----\n");
// Initialization
size_t N =50'000'000;
size_t NLOOPS = 50;
vector<double> x = init_norm(N);
// Benchmark
tic();
benchmark_norm(x, NLOOPS);
double sec = toc() / NLOOPS;
// Timings and Performance
size_t memory = N;
size_t flops = 2 * N;
print_performance(sec, memory, flops, sizeof(x[0]));
printf("-------------------------\n");
printf("What do you observe? Why?\n");
printf("-> Faster per loop than scalar product, only loads elements of 1 vector, instead of 2.");
}
void task_6() {
printf("\n\n-------------- Task 6 --------------\n\n");
printf("Benchmarks using cBLAS\n");
task_4(true);
}
void task_7() {
printf("\n\n-------------- Task 7 --------------\n\n");
{ // Check Ax=b
size_t N=5, Nrhs=2;
auto [A,b] = init_M(N,Nrhs);
vector<double> A_og = A;
printf("A =");
print_matrix(A,N,N);
printf("b =");
print_matrix(b,N,Nrhs);
int lda=N, ldb=Nrhs;
vector<int> ipiv(N);
LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N, N, A.data(), lda, ipiv.data());
LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', N, Nrhs, A.data(), lda, ipiv.data(), b.data(), ldb);
printf("L + U =");
print_matrix(A,N,N);
printf("x =");
print_matrix(b,N,Nrhs);
int ldc=Nrhs;
vector<double> C(N*Nrhs);
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, N, Nrhs, N, 1.0, A_og.data(), lda, b.data(), ldb, 0.0, C.data(), ldc);
printf("Check solution:\nA * x = ");
print_matrix(C,N,Nrhs);
}
// #################################
// Benchmark
cout << fixed << setprecision(4);
size_t NLOOPS = 200;
size_t Nrhs = 2000;
cout << "Solution time per right hand side in milliseconds: sec*1000/Nrhs" << endl;
cout << "N = " << " | 1 | 2 | 4 | 8 | 16 | 32 " << endl;
cout << "------------|--------|--------|--------|--------|--------|-------" << endl;
for (int k = 1; k < 10; ++k) {
cout << "Nrhs = " << Nrhs*k;
for (size_t N : {1, 2, 4, 8, 16, 32}) {
tic();
for (size_t i = 0; i < NLOOPS; ++i) {
benchmark_lapacke(N, Nrhs*k);
}
double sec = toc()*1000 / (static_cast<double>(Nrhs)*k);
cout << " | " << sec;
}
cout << endl;
}
printf("\nFor fixed n, the solution time per rhs stays roughly constant.");
}
int main() {
task_1();
task_2();
task_3();
task_4();
task_5();
task_6();
task_7();
printf("\n\n");
return 0;
}