268 lines
No EOL
8.7 KiB
C++
268 lines
No EOL
8.7 KiB
C++
// Compile with "make" / "make run" or
|
|
// g++ *.cpp -O3 -ffast-math -lopenblas -llapacke -o main
|
|
|
|
|
|
#include "task_3.h"
|
|
#include "task_4+6.h"
|
|
#include "task_5.h"
|
|
#include "task_7.h"
|
|
#include "timing.h"
|
|
#include <cblas.h> // cBLAS Library
|
|
#include <iomanip>
|
|
#include <iostream>
|
|
#include <lapacke.h>
|
|
|
|
void task_1() {
|
|
printf("\n\n-------------- Task 1 --------------\n\n");
|
|
printf("See comment in main.cpp");
|
|
|
|
// -------------------------------------------------------------
|
|
// STREAM version $Revision: 5.10 $
|
|
// -------------------------------------------------------------
|
|
// This system uses 8 bytes per array element.
|
|
// -------------------------------------------------------------
|
|
// Array size = 80000000 (elements), Offset = 0 (elements)
|
|
// Memory per array = 610.4 MiB (= 0.6 GiB).
|
|
// Total memory required = 1831.1 MiB (= 1.8 GiB).
|
|
// Each kernel will be executed 20 times.
|
|
// The *best* time for each kernel (excluding the first iteration)
|
|
// will be used to compute the reported bandwidth.
|
|
// -------------------------------------------------------------
|
|
// Your clock granularity/precision appears to be 1 microseconds.
|
|
// Each test below will take on the order of 116886 microseconds.
|
|
// (= 116886 clock ticks)
|
|
// Increase the size of the arrays if this shows that
|
|
// you are not getting at least 20 clock ticks per test.
|
|
// -------------------------------------------------------------
|
|
// WARNING -- The above is only a rough guideline.
|
|
// For best results, please be sure you know the
|
|
// precision of your system timer.
|
|
// -------------------------------------------------------------
|
|
// Function Best Rate MB/s Avg time Min time Max time
|
|
// Copy: 29569.4 0.048585 0.043288 0.059164
|
|
// Scale: 17644.0 0.082248 0.072546 0.102548
|
|
// Add: 21030.1 0.100620 0.091298 0.124700
|
|
// Triad: 21230.7 0.100758 0.090435 0.120631
|
|
// -------------------------------------------------------------
|
|
// Solution Validates: avg error less than 1.000000e-13 on all three arrays
|
|
// -------------------------------------------------------------
|
|
// ./flops.exe
|
|
|
|
// FLOPS C Program (Double Precision), V2.0 18 Dec 1992
|
|
|
|
// Module Error RunTime MFLOPS
|
|
// (usec)
|
|
// 1 4.0146e-13 0.0024 5827.9076
|
|
// 2 -1.4166e-13 0.0007 10037.8942
|
|
// 3 4.7184e-14 0.0039 4371.9185
|
|
// 4 -1.2557e-13 0.0034 4355.5711
|
|
// 5 -1.3800e-13 0.0066 4415.6439
|
|
// 6 3.2380e-13 0.0065 4441.6299
|
|
// 7 -8.4583e-11 0.0053 2277.1707
|
|
// 8 3.4867e-13 0.0069 4367.6094
|
|
|
|
// Iterations = 512000000
|
|
// NullTime (usec) = 0.0000
|
|
// MFLOPS(1) = 7050.6178
|
|
// MFLOPS(2) = 3461.6233
|
|
// MFLOPS(3) = 4175.0442
|
|
// MFLOPS(4) = 4389.7311
|
|
}
|
|
|
|
void task_2() {
|
|
printf("\n\n-------------- Task 2 --------------\n\n");
|
|
printf("See comment in main.cpp");
|
|
|
|
// Memory needed (double 64-bit, 8 bytes):
|
|
// (A) (2N + 1) * 8 bytes
|
|
// (B) (M*N + M + N) * 8 bytes
|
|
// (C) (M*L + L*N + M*N) * 8 bytes
|
|
// (D) (N + N + p) * 8 bytes
|
|
|
|
// Floating point operations:
|
|
// (A) 2N
|
|
// (B) M * 2N
|
|
// (C) M * 2L * N
|
|
// (D) 2 * N * p (Horner Schema)
|
|
|
|
// Read/Write operations:
|
|
// (A) Read: 2N Write: 1
|
|
// (B) Read: M*2N Write: M*N
|
|
// (C) Read: M*2L*N Write: M*L*N
|
|
// (D) Read: 2*N*p Write: N*P
|
|
}
|
|
|
|
void task_3() {
|
|
printf("\n\n-------------- Task 3 --------------\n\n");
|
|
printf("Functions implemented in task_3.cpp");
|
|
}
|
|
|
|
void task_4(bool cblas = false) {
|
|
if (cblas == false) {printf("\n\n-------------- Task 4 --------------\n\n");}
|
|
size_t M, N, L, p, NLOOPS;
|
|
|
|
{ // Scalar product
|
|
printf("----- Benchmark (A) -----\n");
|
|
// Initialization
|
|
N = 50'000'000;
|
|
NLOOPS = 50;
|
|
auto [x,y] = init_A(N);
|
|
// Benchmark
|
|
tic();
|
|
benchmark_A(x, y, NLOOPS, cblas);
|
|
double sec = toc() / NLOOPS;
|
|
// Timings and Performance
|
|
size_t memory = 2 * N;
|
|
size_t flops = 2 * N;
|
|
print_performance(sec, memory, flops, sizeof(x[0]));
|
|
printf("-------------------------\n");
|
|
}
|
|
|
|
{ // Matrix-Vector product
|
|
printf("----- Benchmark (B) -----\n");
|
|
// Initialization
|
|
M = 8'000;
|
|
N = 12'000;
|
|
NLOOPS = 30;
|
|
auto [A,x] = init_B(M,N);
|
|
// Benchmark
|
|
tic();
|
|
benchmark_B(A, x, NLOOPS, cblas);
|
|
double sec = toc() / NLOOPS;
|
|
// Timings and Performance
|
|
size_t memory = M*N + M + N;
|
|
size_t flops = 2 * M * N;
|
|
print_performance(sec, memory, flops, sizeof(A[0]));
|
|
printf("-------------------------\n");
|
|
}
|
|
|
|
{ // Matrix-Matrix product
|
|
printf("----- Benchmark (C) -----\n");
|
|
// Initialization
|
|
M = 1'000;
|
|
N = 2'000;
|
|
L = 500;
|
|
NLOOPS = 20;
|
|
auto [A,B] = init_C(M,N,L);
|
|
// Benchmark
|
|
tic();
|
|
benchmark_C(A, B, L, NLOOPS, cblas);
|
|
double sec = toc() / NLOOPS;
|
|
// Timings and Performance
|
|
size_t memory = M*L + L*N + M*N;
|
|
size_t flops = M * 2*L * N;
|
|
print_performance(sec, memory, flops, sizeof(A[0]));
|
|
printf("-------------------------\n");
|
|
}
|
|
|
|
if (cblas == false)
|
|
{ // Polynomial evaluation
|
|
printf("----- Benchmark (D) -----\n");
|
|
// Initialization
|
|
N = 1'000'000;
|
|
p = 200;
|
|
NLOOPS = 20;
|
|
auto [x,a] = init_D(N,p);
|
|
// Benchmark
|
|
tic();
|
|
benchmark_D(x, a, NLOOPS);
|
|
double sec = toc() / NLOOPS;
|
|
// Timings and Performance
|
|
size_t memory = 2.0 * N;
|
|
size_t flops = 2.0 * N * p;
|
|
print_performance(sec, memory, flops, sizeof(x[0]));
|
|
printf("-------------------------\n");
|
|
}
|
|
}
|
|
|
|
void task_5() {
|
|
printf("\n\n-------------- Task 5 --------------\n\n");
|
|
|
|
printf("----- Benchmark norm -----\n");
|
|
// Initialization
|
|
size_t N =50'000'000;
|
|
size_t NLOOPS = 50;
|
|
vector<double> x = init_norm(N);
|
|
// Benchmark
|
|
tic();
|
|
benchmark_norm(x, NLOOPS);
|
|
double sec = toc() / NLOOPS;
|
|
// Timings and Performance
|
|
size_t memory = N;
|
|
size_t flops = 2 * N;
|
|
print_performance(sec, memory, flops, sizeof(x[0]));
|
|
printf("-------------------------\n");
|
|
printf("What do you observe? Why?\n");
|
|
printf("-> Faster per loop than scalar product, only loads elements of 1 vector, instead of 2.");
|
|
}
|
|
|
|
void task_6() {
|
|
printf("\n\n-------------- Task 6 --------------\n\n");
|
|
printf("Benchmarks using cBLAS\n");
|
|
task_4(true);
|
|
}
|
|
|
|
void task_7() {
|
|
printf("\n\n-------------- Task 7 --------------\n\n");
|
|
{ // Check Ax=b
|
|
size_t N=5, Nrhs=2;
|
|
auto [A,b] = init_M(N,Nrhs);
|
|
vector<double> A_og = A;
|
|
|
|
printf("A =");
|
|
print_matrix(A,N,N);
|
|
printf("b =");
|
|
print_matrix(b,N,Nrhs);
|
|
|
|
int lda=N, ldb=Nrhs;
|
|
vector<int> ipiv(N);
|
|
LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N, N, A.data(), lda, ipiv.data());
|
|
LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', N, Nrhs, A.data(), lda, ipiv.data(), b.data(), ldb);
|
|
|
|
printf("L + U =");
|
|
print_matrix(A,N,N);
|
|
printf("x =");
|
|
print_matrix(b,N,Nrhs);
|
|
|
|
int ldc=Nrhs;
|
|
vector<double> C(N*Nrhs);
|
|
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, N, Nrhs, N, 1.0, A_og.data(), lda, b.data(), ldb, 0.0, C.data(), ldc);
|
|
printf("Check solution:\nA * x = ");
|
|
print_matrix(C,N,Nrhs);
|
|
}
|
|
// #################################
|
|
// Benchmark
|
|
|
|
cout << fixed << setprecision(4);
|
|
size_t NLOOPS = 200;
|
|
size_t Nrhs = 2000;
|
|
cout << "Solution time per right hand side in milliseconds: sec*1000/Nrhs" << endl;
|
|
cout << "N = " << " | 1 | 2 | 4 | 8 | 16 | 32 " << endl;
|
|
cout << "------------|--------|--------|--------|--------|--------|-------" << endl;
|
|
for (int k = 1; k < 10; ++k) {
|
|
cout << "Nrhs = " << Nrhs*k;
|
|
for (size_t N : {1, 2, 4, 8, 16, 32}) {
|
|
tic();
|
|
for (size_t i = 0; i < NLOOPS; ++i) {
|
|
benchmark_lapacke(N, Nrhs*k);
|
|
}
|
|
double sec = toc()*1000 / (static_cast<double>(Nrhs)*k);
|
|
cout << " | " << sec;
|
|
}
|
|
cout << endl;
|
|
}
|
|
printf("\nFor fixed n, the solution time per rhs stays roughly constant.");
|
|
}
|
|
|
|
int main() {
|
|
task_1();
|
|
task_2();
|
|
task_3();
|
|
task_4();
|
|
task_5();
|
|
task_6();
|
|
task_7();
|
|
printf("\n\n");
|
|
|
|
return 0;
|
|
} |