ex3
This commit is contained in:
parent
f0cd37fddc
commit
7a02dff345
29 changed files with 3943 additions and 1 deletions
267
ex3/code/main.cpp
Normal file
267
ex3/code/main.cpp
Normal file
|
|
@ -0,0 +1,267 @@
|
|||
|
||||
#include "task_3.h"
|
||||
#include "task_4+6.h"
|
||||
#include "task_5.h"
|
||||
#include "task_7.h"
|
||||
#include "timing.h"
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <cblas.h> // cBLAS Library
|
||||
#include <lapacke.h>
|
||||
|
||||
void task_1() {
|
||||
printf("\n\n-------------- Task 1 --------------\n\n");
|
||||
printf("See comment in main.cpp");
|
||||
|
||||
// -------------------------------------------------------------
|
||||
// STREAM version $Revision: 5.10 $
|
||||
// -------------------------------------------------------------
|
||||
// This system uses 8 bytes per array element.
|
||||
// -------------------------------------------------------------
|
||||
// Array size = 80000000 (elements), Offset = 0 (elements)
|
||||
// Memory per array = 610.4 MiB (= 0.6 GiB).
|
||||
// Total memory required = 1831.1 MiB (= 1.8 GiB).
|
||||
// Each kernel will be executed 20 times.
|
||||
// The *best* time for each kernel (excluding the first iteration)
|
||||
// will be used to compute the reported bandwidth.
|
||||
// -------------------------------------------------------------
|
||||
// Your clock granularity/precision appears to be 1 microseconds.
|
||||
// Each test below will take on the order of 116886 microseconds.
|
||||
// (= 116886 clock ticks)
|
||||
// Increase the size of the arrays if this shows that
|
||||
// you are not getting at least 20 clock ticks per test.
|
||||
// -------------------------------------------------------------
|
||||
// WARNING -- The above is only a rough guideline.
|
||||
// For best results, please be sure you know the
|
||||
// precision of your system timer.
|
||||
// -------------------------------------------------------------
|
||||
// Function Best Rate MB/s Avg time Min time Max time
|
||||
// Copy: 29569.4 0.048585 0.043288 0.059164
|
||||
// Scale: 17644.0 0.082248 0.072546 0.102548
|
||||
// Add: 21030.1 0.100620 0.091298 0.124700
|
||||
// Triad: 21230.7 0.100758 0.090435 0.120631
|
||||
// -------------------------------------------------------------
|
||||
// Solution Validates: avg error less than 1.000000e-13 on all three arrays
|
||||
// -------------------------------------------------------------
|
||||
// ./flops.exe
|
||||
|
||||
// FLOPS C Program (Double Precision), V2.0 18 Dec 1992
|
||||
|
||||
// Module Error RunTime MFLOPS
|
||||
// (usec)
|
||||
// 1 4.0146e-13 0.0024 5827.9076
|
||||
// 2 -1.4166e-13 0.0007 10037.8942
|
||||
// 3 4.7184e-14 0.0039 4371.9185
|
||||
// 4 -1.2557e-13 0.0034 4355.5711
|
||||
// 5 -1.3800e-13 0.0066 4415.6439
|
||||
// 6 3.2380e-13 0.0065 4441.6299
|
||||
// 7 -8.4583e-11 0.0053 2277.1707
|
||||
// 8 3.4867e-13 0.0069 4367.6094
|
||||
|
||||
// Iterations = 512000000
|
||||
// NullTime (usec) = 0.0000
|
||||
// MFLOPS(1) = 7050.6178
|
||||
// MFLOPS(2) = 3461.6233
|
||||
// MFLOPS(3) = 4175.0442
|
||||
// MFLOPS(4) = 4389.7311
|
||||
}
|
||||
|
||||
void task_2() {
|
||||
printf("\n\n-------------- Task 2 --------------\n\n");
|
||||
printf("See comment in main.cpp");
|
||||
|
||||
// Memory needed (double 64-bit, 8 bytes):
|
||||
// (A) (2N + 1) * 8 bytes
|
||||
// (B) (M*N + M + N) * 8 bytes
|
||||
// (C) (M*L + L*N + M*N) * 8 bytes
|
||||
// (D) (N + N + p) * 8 bytes
|
||||
|
||||
// Floating point operations:
|
||||
// (A) 2N
|
||||
// (B) M * 2N
|
||||
// (C) M * 2L * N
|
||||
// (D) 2 * N * p (Horner Schema)
|
||||
|
||||
// Read/Write operations:
|
||||
// (A) Read: 2N Write: 1
|
||||
// (B) Read: M*2N Write: M*N
|
||||
// (C) Read: M*2L*N Write: M*L*N
|
||||
// (D) Read: 2*N*p Write: N*P
|
||||
}
|
||||
|
||||
void task_3() {
|
||||
printf("\n\n-------------- Task 3 --------------\n\n");
|
||||
printf("Functions implemented in task_3.cpp");
|
||||
}
|
||||
|
||||
void task_4(bool cblas = false) {
|
||||
if (cblas == false) {printf("\n\n-------------- Task 4 --------------\n\n");}
|
||||
size_t M, N, L, p, NLOOPS;
|
||||
|
||||
{ // Scalar product
|
||||
printf("----- Benchmark (A) -----\n");
|
||||
// Initialization
|
||||
N = 50'000'000;
|
||||
NLOOPS = 50;
|
||||
auto [x,y] = init_A(N);
|
||||
// Benchmark
|
||||
tic();
|
||||
benchmark_A(x, y, NLOOPS, cblas);
|
||||
double sec = toc() / NLOOPS;
|
||||
// Timings and Performance
|
||||
size_t memory = 2 * N;
|
||||
size_t flops = 2 * N;
|
||||
print_performance(sec, memory, flops, sizeof(x[0]));
|
||||
printf("-------------------------\n");
|
||||
}
|
||||
|
||||
{ // Matrix-Vector product
|
||||
printf("----- Benchmark (B) -----\n");
|
||||
// Initialization
|
||||
M = 8'000;
|
||||
N = 12'000;
|
||||
NLOOPS = 30;
|
||||
auto [A,x] = init_B(M,N);
|
||||
// Benchmark
|
||||
tic();
|
||||
benchmark_B(A, x, NLOOPS, cblas);
|
||||
double sec = toc() / NLOOPS;
|
||||
// Timings and Performance
|
||||
size_t memory = M*N + M + N;
|
||||
size_t flops = 2 * M * N;
|
||||
print_performance(sec, memory, flops, sizeof(A[0]));
|
||||
printf("-------------------------\n");
|
||||
}
|
||||
|
||||
{ // Matrix-Matrix product
|
||||
printf("----- Benchmark (C) -----\n");
|
||||
// Initialization
|
||||
M = 1'000;
|
||||
N = 2'000;
|
||||
L = 500;
|
||||
NLOOPS = 20;
|
||||
auto [A,B] = init_C(M,N,L);
|
||||
// Benchmark
|
||||
tic();
|
||||
benchmark_C(A, B, L, NLOOPS, cblas);
|
||||
double sec = toc() / NLOOPS;
|
||||
// Timings and Performance
|
||||
size_t memory = M*L + L*N + M*N;
|
||||
size_t flops = M * 2*L * N;
|
||||
print_performance(sec, memory, flops, sizeof(A[0]));
|
||||
printf("-------------------------\n");
|
||||
}
|
||||
|
||||
if (cblas == false)
|
||||
{ // Polynomial evaluation
|
||||
printf("----- Benchmark (D) -----\n");
|
||||
// Initialization
|
||||
N = 1'000'000;
|
||||
p = 200;
|
||||
NLOOPS = 20;
|
||||
auto [x,a] = init_D(N,p);
|
||||
// Benchmark
|
||||
tic();
|
||||
benchmark_D(x, a, NLOOPS);
|
||||
double sec = toc() / NLOOPS;
|
||||
// Timings and Performance
|
||||
size_t memory = 2.0 * N;
|
||||
size_t flops = 2.0 * N * p;
|
||||
print_performance(sec, memory, flops, sizeof(x[0]));
|
||||
printf("-------------------------\n");
|
||||
}
|
||||
}
|
||||
|
||||
void task_5() {
|
||||
printf("\n\n-------------- Task 5 --------------\n\n");
|
||||
|
||||
printf("----- Benchmark norm -----\n");
|
||||
// Initialization
|
||||
size_t N =50'000'000;
|
||||
size_t NLOOPS = 50;
|
||||
vector<double> x = init_norm(N);
|
||||
// Benchmark
|
||||
tic();
|
||||
benchmark_norm(x, NLOOPS);
|
||||
double sec = toc() / NLOOPS;
|
||||
// Timings and Performance
|
||||
size_t memory = N;
|
||||
size_t flops = 2 * N;
|
||||
print_performance(sec, memory, flops, sizeof(x[0]));
|
||||
printf("-------------------------\n");
|
||||
printf("What do you observe? Why?\n");
|
||||
printf("-> Faster per loop than scalar product, only loads elements of 1 vector, instead of 2.");
|
||||
}
|
||||
|
||||
void task_6() {
|
||||
printf("\n\n-------------- Task 6 --------------\n\n");
|
||||
printf("Benchmarks using cBLAS\n");
|
||||
task_4(true);
|
||||
}
|
||||
|
||||
void task_7() {
|
||||
printf("\n\n-------------- Task 7 --------------\n\n");
|
||||
{ // Check Ax=b
|
||||
size_t N=5, Nrhs=2;
|
||||
auto [A,b] = init_M(N,Nrhs);
|
||||
vector<double> A_og = A;
|
||||
|
||||
printf("A =");
|
||||
print_matrix(A,N,N);
|
||||
printf("b =");
|
||||
print_matrix(b,N,Nrhs);
|
||||
|
||||
int lda=N, ldb=Nrhs;
|
||||
vector<int> ipiv(N);
|
||||
LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N, N, A.data(), lda, ipiv.data());
|
||||
LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', N, Nrhs, A.data(), lda, ipiv.data(), b.data(), ldb);
|
||||
|
||||
printf("L + U =");
|
||||
print_matrix(A,N,N);
|
||||
printf("x =");
|
||||
print_matrix(b,N,Nrhs);
|
||||
|
||||
int ldc=Nrhs;
|
||||
vector<double> C(N*Nrhs);
|
||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, N, Nrhs, N, 1.0, A_og.data(), lda, b.data(), ldb, 0.0, C.data(), ldc);
|
||||
printf("Check solution:\nA * x = ");
|
||||
print_matrix(C,N,Nrhs);
|
||||
}
|
||||
// #################################
|
||||
// Benchmark
|
||||
|
||||
cout << fixed << setprecision(4); // 4 digits after decimal
|
||||
size_t NLOOPS = 1000;
|
||||
cout << "N = " << " | 1 | 2 | 4 | 8 | 16 | 32 " << endl;
|
||||
cout << "---------|--------|--------|--------|--------|--------|-------" << endl;
|
||||
for (int exp = 1; exp < 10; ++exp) {
|
||||
cout << "Nrhs = " << static_cast<size_t>(pow(2,exp));
|
||||
for (size_t N : {1, 2, 4, 8, 16, 32}) {
|
||||
tic();
|
||||
for (size_t i = 0; i < NLOOPS; ++i) {
|
||||
benchmark_lapacke(N, static_cast<size_t>(pow(2,exp)));
|
||||
}
|
||||
double sec = toc();
|
||||
cout << " | " << sec;
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
printf("\nFor fixed n, the solution time per rhs does not slow down consistently and scales very well.\nIts faster than expected.");
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
int main() {
|
||||
task_1();
|
||||
task_2();
|
||||
task_3();
|
||||
task_4();
|
||||
task_5();
|
||||
task_6();
|
||||
task_7();
|
||||
printf("\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue