ex3
This commit is contained in:
parent
f0cd37fddc
commit
7a02dff345
29 changed files with 3943 additions and 1 deletions
30
ex3/code/Makefile
Normal file
30
ex3/code/Makefile
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
PROGRAM = main
|
||||
|
||||
SOURCES = $(wildcard *.cpp)
|
||||
OBJECTS = ${SOURCES:.cpp=.o}
|
||||
|
||||
CXX = g++
|
||||
LINKER = g++
|
||||
|
||||
WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
|
||||
-Wredundant-decls -fmax-errors=1
|
||||
|
||||
CXXFLAGS = -g -flto -O3 -ffast-math -march=native ${WARNINGS}
|
||||
LINKFLAGS = -g -flto -O3 -lopenblas -llapacke
|
||||
|
||||
|
||||
all: ${PROGRAM}
|
||||
|
||||
# %.o: %.cpp
|
||||
# ${CXX} ${CXXFLAGS} -c $< -o $@
|
||||
|
||||
${PROGRAM}: ${OBJECTS}
|
||||
$(LINKER) ${OBJECTS} ${LINKFLAGS} -o ${PROGRAM}
|
||||
|
||||
clean:
|
||||
rm -f ${OBJECTS} ${PROGRAM}
|
||||
|
||||
|
||||
run: ${PROGRAM}
|
||||
# run: clean ${PROGRAM}
|
||||
./${PROGRAM}
|
||||
267
ex3/code/main.cpp
Normal file
267
ex3/code/main.cpp
Normal file
|
|
@ -0,0 +1,267 @@
|
|||
|
||||
#include "task_3.h"
|
||||
#include "task_4+6.h"
|
||||
#include "task_5.h"
|
||||
#include "task_7.h"
|
||||
#include "timing.h"
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <cblas.h> // cBLAS Library
|
||||
#include <lapacke.h>
|
||||
|
||||
void task_1() {
|
||||
printf("\n\n-------------- Task 1 --------------\n\n");
|
||||
printf("See comment in main.cpp");
|
||||
|
||||
// -------------------------------------------------------------
|
||||
// STREAM version $Revision: 5.10 $
|
||||
// -------------------------------------------------------------
|
||||
// This system uses 8 bytes per array element.
|
||||
// -------------------------------------------------------------
|
||||
// Array size = 80000000 (elements), Offset = 0 (elements)
|
||||
// Memory per array = 610.4 MiB (= 0.6 GiB).
|
||||
// Total memory required = 1831.1 MiB (= 1.8 GiB).
|
||||
// Each kernel will be executed 20 times.
|
||||
// The *best* time for each kernel (excluding the first iteration)
|
||||
// will be used to compute the reported bandwidth.
|
||||
// -------------------------------------------------------------
|
||||
// Your clock granularity/precision appears to be 1 microseconds.
|
||||
// Each test below will take on the order of 116886 microseconds.
|
||||
// (= 116886 clock ticks)
|
||||
// Increase the size of the arrays if this shows that
|
||||
// you are not getting at least 20 clock ticks per test.
|
||||
// -------------------------------------------------------------
|
||||
// WARNING -- The above is only a rough guideline.
|
||||
// For best results, please be sure you know the
|
||||
// precision of your system timer.
|
||||
// -------------------------------------------------------------
|
||||
// Function Best Rate MB/s Avg time Min time Max time
|
||||
// Copy: 29569.4 0.048585 0.043288 0.059164
|
||||
// Scale: 17644.0 0.082248 0.072546 0.102548
|
||||
// Add: 21030.1 0.100620 0.091298 0.124700
|
||||
// Triad: 21230.7 0.100758 0.090435 0.120631
|
||||
// -------------------------------------------------------------
|
||||
// Solution Validates: avg error less than 1.000000e-13 on all three arrays
|
||||
// -------------------------------------------------------------
|
||||
// ./flops.exe
|
||||
|
||||
// FLOPS C Program (Double Precision), V2.0 18 Dec 1992
|
||||
|
||||
// Module Error RunTime MFLOPS
|
||||
// (usec)
|
||||
// 1 4.0146e-13 0.0024 5827.9076
|
||||
// 2 -1.4166e-13 0.0007 10037.8942
|
||||
// 3 4.7184e-14 0.0039 4371.9185
|
||||
// 4 -1.2557e-13 0.0034 4355.5711
|
||||
// 5 -1.3800e-13 0.0066 4415.6439
|
||||
// 6 3.2380e-13 0.0065 4441.6299
|
||||
// 7 -8.4583e-11 0.0053 2277.1707
|
||||
// 8 3.4867e-13 0.0069 4367.6094
|
||||
|
||||
// Iterations = 512000000
|
||||
// NullTime (usec) = 0.0000
|
||||
// MFLOPS(1) = 7050.6178
|
||||
// MFLOPS(2) = 3461.6233
|
||||
// MFLOPS(3) = 4175.0442
|
||||
// MFLOPS(4) = 4389.7311
|
||||
}
|
||||
|
||||
void task_2() {
|
||||
printf("\n\n-------------- Task 2 --------------\n\n");
|
||||
printf("See comment in main.cpp");
|
||||
|
||||
// Memory needed (double 64-bit, 8 bytes):
|
||||
// (A) (2N + 1) * 8 bytes
|
||||
// (B) (M*N + M + N) * 8 bytes
|
||||
// (C) (M*L + L*N + M*N) * 8 bytes
|
||||
// (D) (N + N + p) * 8 bytes
|
||||
|
||||
// Floating point operations:
|
||||
// (A) 2N
|
||||
// (B) M * 2N
|
||||
// (C) M * 2L * N
|
||||
// (D) 2 * N * p (Horner Schema)
|
||||
|
||||
// Read/Write operations:
|
||||
// (A) Read: 2N Write: 1
|
||||
// (B) Read: M*2N Write: M*N
|
||||
// (C) Read: M*2L*N Write: M*L*N
|
||||
// (D) Read: 2*N*p Write: N*P
|
||||
}
|
||||
|
||||
void task_3() {
|
||||
printf("\n\n-------------- Task 3 --------------\n\n");
|
||||
printf("Functions implemented in task_3.cpp");
|
||||
}
|
||||
|
||||
void task_4(bool cblas = false) {
|
||||
if (cblas == false) {printf("\n\n-------------- Task 4 --------------\n\n");}
|
||||
size_t M, N, L, p, NLOOPS;
|
||||
|
||||
{ // Scalar product
|
||||
printf("----- Benchmark (A) -----\n");
|
||||
// Initialization
|
||||
N = 50'000'000;
|
||||
NLOOPS = 50;
|
||||
auto [x,y] = init_A(N);
|
||||
// Benchmark
|
||||
tic();
|
||||
benchmark_A(x, y, NLOOPS, cblas);
|
||||
double sec = toc() / NLOOPS;
|
||||
// Timings and Performance
|
||||
size_t memory = 2 * N;
|
||||
size_t flops = 2 * N;
|
||||
print_performance(sec, memory, flops, sizeof(x[0]));
|
||||
printf("-------------------------\n");
|
||||
}
|
||||
|
||||
{ // Matrix-Vector product
|
||||
printf("----- Benchmark (B) -----\n");
|
||||
// Initialization
|
||||
M = 8'000;
|
||||
N = 12'000;
|
||||
NLOOPS = 30;
|
||||
auto [A,x] = init_B(M,N);
|
||||
// Benchmark
|
||||
tic();
|
||||
benchmark_B(A, x, NLOOPS, cblas);
|
||||
double sec = toc() / NLOOPS;
|
||||
// Timings and Performance
|
||||
size_t memory = M*N + M + N;
|
||||
size_t flops = 2 * M * N;
|
||||
print_performance(sec, memory, flops, sizeof(A[0]));
|
||||
printf("-------------------------\n");
|
||||
}
|
||||
|
||||
{ // Matrix-Matrix product
|
||||
printf("----- Benchmark (C) -----\n");
|
||||
// Initialization
|
||||
M = 1'000;
|
||||
N = 2'000;
|
||||
L = 500;
|
||||
NLOOPS = 20;
|
||||
auto [A,B] = init_C(M,N,L);
|
||||
// Benchmark
|
||||
tic();
|
||||
benchmark_C(A, B, L, NLOOPS, cblas);
|
||||
double sec = toc() / NLOOPS;
|
||||
// Timings and Performance
|
||||
size_t memory = M*L + L*N + M*N;
|
||||
size_t flops = M * 2*L * N;
|
||||
print_performance(sec, memory, flops, sizeof(A[0]));
|
||||
printf("-------------------------\n");
|
||||
}
|
||||
|
||||
if (cblas == false)
|
||||
{ // Polynomial evaluation
|
||||
printf("----- Benchmark (D) -----\n");
|
||||
// Initialization
|
||||
N = 1'000'000;
|
||||
p = 200;
|
||||
NLOOPS = 20;
|
||||
auto [x,a] = init_D(N,p);
|
||||
// Benchmark
|
||||
tic();
|
||||
benchmark_D(x, a, NLOOPS);
|
||||
double sec = toc() / NLOOPS;
|
||||
// Timings and Performance
|
||||
size_t memory = 2.0 * N;
|
||||
size_t flops = 2.0 * N * p;
|
||||
print_performance(sec, memory, flops, sizeof(x[0]));
|
||||
printf("-------------------------\n");
|
||||
}
|
||||
}
|
||||
|
||||
void task_5() {
|
||||
printf("\n\n-------------- Task 5 --------------\n\n");
|
||||
|
||||
printf("----- Benchmark norm -----\n");
|
||||
// Initialization
|
||||
size_t N =50'000'000;
|
||||
size_t NLOOPS = 50;
|
||||
vector<double> x = init_norm(N);
|
||||
// Benchmark
|
||||
tic();
|
||||
benchmark_norm(x, NLOOPS);
|
||||
double sec = toc() / NLOOPS;
|
||||
// Timings and Performance
|
||||
size_t memory = N;
|
||||
size_t flops = 2 * N;
|
||||
print_performance(sec, memory, flops, sizeof(x[0]));
|
||||
printf("-------------------------\n");
|
||||
printf("What do you observe? Why?\n");
|
||||
printf("-> Faster per loop than scalar product, only loads elements of 1 vector, instead of 2.");
|
||||
}
|
||||
|
||||
void task_6() {
|
||||
printf("\n\n-------------- Task 6 --------------\n\n");
|
||||
printf("Benchmarks using cBLAS\n");
|
||||
task_4(true);
|
||||
}
|
||||
|
||||
void task_7() {
|
||||
printf("\n\n-------------- Task 7 --------------\n\n");
|
||||
{ // Check Ax=b
|
||||
size_t N=5, Nrhs=2;
|
||||
auto [A,b] = init_M(N,Nrhs);
|
||||
vector<double> A_og = A;
|
||||
|
||||
printf("A =");
|
||||
print_matrix(A,N,N);
|
||||
printf("b =");
|
||||
print_matrix(b,N,Nrhs);
|
||||
|
||||
int lda=N, ldb=Nrhs;
|
||||
vector<int> ipiv(N);
|
||||
LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N, N, A.data(), lda, ipiv.data());
|
||||
LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', N, Nrhs, A.data(), lda, ipiv.data(), b.data(), ldb);
|
||||
|
||||
printf("L + U =");
|
||||
print_matrix(A,N,N);
|
||||
printf("x =");
|
||||
print_matrix(b,N,Nrhs);
|
||||
|
||||
int ldc=Nrhs;
|
||||
vector<double> C(N*Nrhs);
|
||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, N, Nrhs, N, 1.0, A_og.data(), lda, b.data(), ldb, 0.0, C.data(), ldc);
|
||||
printf("Check solution:\nA * x = ");
|
||||
print_matrix(C,N,Nrhs);
|
||||
}
|
||||
// #################################
|
||||
// Benchmark
|
||||
|
||||
cout << fixed << setprecision(4); // 4 digits after decimal
|
||||
size_t NLOOPS = 1000;
|
||||
cout << "N = " << " | 1 | 2 | 4 | 8 | 16 | 32 " << endl;
|
||||
cout << "---------|--------|--------|--------|--------|--------|-------" << endl;
|
||||
for (int exp = 1; exp < 10; ++exp) {
|
||||
cout << "Nrhs = " << static_cast<size_t>(pow(2,exp));
|
||||
for (size_t N : {1, 2, 4, 8, 16, 32}) {
|
||||
tic();
|
||||
for (size_t i = 0; i < NLOOPS; ++i) {
|
||||
benchmark_lapacke(N, static_cast<size_t>(pow(2,exp)));
|
||||
}
|
||||
double sec = toc();
|
||||
cout << " | " << sec;
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
printf("\nFor fixed n, the solution time per rhs does not slow down consistently and scales very well.\nIts faster than expected.");
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
int main() {
|
||||
task_1();
|
||||
task_2();
|
||||
task_3();
|
||||
task_4();
|
||||
task_5();
|
||||
task_6();
|
||||
task_7();
|
||||
printf("\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
61
ex3/code/task_3.cpp
Normal file
61
ex3/code/task_3.cpp
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
#include "task_3.h"
|
||||
#include <vector>
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include <cmath>
|
||||
using namespace std;
|
||||
|
||||
|
||||
double scalar(vector<double> const &x, vector<double> const &y) {
|
||||
assert(x.size() == y.size());
|
||||
size_t const N = x.size();
|
||||
double sum = 0.0;
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
sum += x[i] * y[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
vector<double> matrix_vec(vector<double> const &A, vector<double> const &x) {
|
||||
size_t const N = x.size();
|
||||
size_t const M = A.size() / N;
|
||||
vector<double> b(M);
|
||||
|
||||
for (size_t i = 0; i < M; ++i) {
|
||||
for (size_t j = 0; j < N; ++j) {
|
||||
b[i] += A[i*N + j] * x[j];
|
||||
}
|
||||
}
|
||||
return b;
|
||||
}
|
||||
|
||||
|
||||
vector<double> matrix_matrix(vector<double> const &A, vector<double> const &B, size_t const &M) {
|
||||
size_t const L = A.size() / M;
|
||||
size_t const N = B.size() / L;
|
||||
vector<double> C(M*N,0);
|
||||
|
||||
for (size_t i = 0; i < M; ++i) {
|
||||
for (size_t k = 0; k < L; ++k) {
|
||||
for (size_t j = 0; j < N; ++j) {
|
||||
C[i*N + j] += A[i*L + k] * B[k*N + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
return C;
|
||||
}
|
||||
|
||||
|
||||
vector<double> poly(vector<double> const &x, vector<double> const &a) {
|
||||
size_t N = x.size();
|
||||
size_t p = a.size();
|
||||
vector<double> y(N);
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
y[i] = a[p];
|
||||
for (size_t k = 1; k < p; ++k) {
|
||||
y[i] = y[i]*x[i] + a[p-k];
|
||||
}
|
||||
}
|
||||
return y;
|
||||
}
|
||||
8
ex3/code/task_3.h
Normal file
8
ex3/code/task_3.h
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
#pragma once
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
double scalar(vector<double> const &x, vector<double> const &y);
|
||||
vector<double> matrix_vec(vector<double> const &A, vector<double> const &x);
|
||||
vector<double> matrix_matrix(vector<double> const &A, vector<double> const &B, size_t const &M);
|
||||
vector<double> poly(vector<double> const &x, vector<double> const &a);
|
||||
148
ex3/code/task_4+6.cpp
Normal file
148
ex3/code/task_4+6.cpp
Normal file
|
|
@ -0,0 +1,148 @@
|
|||
#include "task_3.h"
|
||||
#include "task_4+6.h"
|
||||
#include "timing.h"
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <cblas.h> // cBLAS Library
|
||||
using namespace std;
|
||||
|
||||
void print_performance(double sec, size_t memory, size_t flops, unsigned int size) {
|
||||
printf("Memory allocated : %.3f GByte\n", 1.0 * memory / 1024 / 1024 / 1024 * size);
|
||||
printf("Duration per loop : %.3f sec\n", sec);
|
||||
printf("GFLOPS : %.3f\n", 1.0 * flops / sec / 1024 / 1024 / 1024);
|
||||
printf("GiByte/s : %.3f\n", 1.0 * memory / sec / 1024 / 1024 / 1024 * size);
|
||||
}
|
||||
|
||||
tuple<vector<double>, vector<double>> init_A(size_t N) {
|
||||
vector<double> x(N), y(N);
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
x[i] = i%219 + 1.0;
|
||||
y[i] = 1.0 / x[i];
|
||||
}
|
||||
return make_tuple(x, y);
|
||||
}
|
||||
|
||||
void benchmark_A(vector<double> const &x, vector<double> const &y, size_t NLOOPS, bool cblas) {
|
||||
size_t N = x.size();
|
||||
|
||||
double s(0.0), sum(0.0);
|
||||
if (cblas == false) {
|
||||
for (size_t i = 0; i < NLOOPS; ++i) {
|
||||
s = scalar(x, y);
|
||||
sum += s;
|
||||
}
|
||||
} else if (cblas == true) {
|
||||
for (size_t i = 0; i < NLOOPS; ++i) {
|
||||
s = cblas_ddot(N, x.data(), 1, y.data(), 1);
|
||||
sum += s;
|
||||
}
|
||||
}
|
||||
|
||||
// Check correctness
|
||||
if (static_cast<size_t>(sum) != N*NLOOPS) {printf(" !! W R O N G result !!\n");}
|
||||
}
|
||||
|
||||
tuple<vector<double>, vector<double>> init_B(size_t M, size_t N) {
|
||||
vector<double> A(M*N), x(N);
|
||||
for (size_t i = 0; i < M; ++i) {
|
||||
for (size_t j = 0; j < N; ++j) {
|
||||
A[i*N + j] = (i+j)%219 + 1.0;
|
||||
}
|
||||
}
|
||||
for (size_t j = 0; j < N; ++j) {
|
||||
x[j] = 1.0/A[17*N + j];
|
||||
}
|
||||
return make_tuple(A, x);
|
||||
}
|
||||
|
||||
void benchmark_B(vector<double> const &A, vector<double> const &x, size_t NLOOPS, bool cblas) {
|
||||
size_t N = x.size();
|
||||
size_t M = A.size() / N;
|
||||
vector<double> b(M);
|
||||
double sum(0.0);
|
||||
|
||||
if (cblas == false) {
|
||||
for (size_t i = 0; i < NLOOPS; ++i) {
|
||||
b = matrix_vec(A,x);
|
||||
sum += b[17];
|
||||
}
|
||||
} else if (cblas == true) {
|
||||
for (size_t i = 0; i < NLOOPS; ++i) {
|
||||
cblas_dgemv(CblasRowMajor, CblasNoTrans, M, N, 1.0, A.data(), N, x.data(), 1, 0, b.data(), 1);
|
||||
sum += b[17];
|
||||
}
|
||||
}
|
||||
|
||||
// Check correctness
|
||||
if (static_cast<size_t>(sum) != N*NLOOPS) {printf(" !! W R O N G result !!\n");}
|
||||
}
|
||||
|
||||
tuple<vector<double>, vector<double>> init_C(size_t M, size_t N, size_t L) {
|
||||
vector<double> A(M*L), B(L*N);
|
||||
for (size_t i = 0; i < M; ++i) {
|
||||
for (size_t j = 0; j < L; ++j) {
|
||||
A[i*L + j] = (i+j)%219 + 1.0;
|
||||
}
|
||||
}
|
||||
// B chosen such that C[0,17]=L
|
||||
// so B[i,17] = 1/A[0,i]
|
||||
for (size_t i = 0; i < L; ++i) {
|
||||
for (size_t j = 0; j < N; ++j) {
|
||||
if (j==17) {
|
||||
B[i*N + 17] = 1.0/A[i];
|
||||
} else {
|
||||
B[i*N + j] = (i+j)%219 + 1.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
return make_tuple(A, B);
|
||||
}
|
||||
|
||||
void benchmark_C(vector<double> const &A, vector<double> const &B, size_t L, size_t NLOOPS, bool cblas) {
|
||||
size_t M = A.size() / L;
|
||||
size_t N = B.size() / L;
|
||||
vector<double> C(M*N);
|
||||
double sum(0.0);
|
||||
|
||||
if (cblas == false) {
|
||||
for (size_t i = 0; i < NLOOPS; ++i) {
|
||||
C = matrix_matrix(A,B,M);
|
||||
sum += C[17];
|
||||
}
|
||||
} else if (cblas == true) {
|
||||
for (size_t i = 0; i < NLOOPS; ++i) {
|
||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, L, 1.0, A.data(), L, B.data(), N, 0.0, C.data(), N);
|
||||
sum += C[17];
|
||||
}
|
||||
}
|
||||
|
||||
// Check correctness
|
||||
if (static_cast<size_t>(sum) != L*NLOOPS) {printf(" !! W R O N G result !!\n");}
|
||||
}
|
||||
|
||||
tuple<vector<double>, vector<double>> init_D(size_t N, size_t p) {
|
||||
// x_i = i/N for i=0,...,N-1
|
||||
// a_j = 1 for j=0,...,p-1
|
||||
vector<double> x(N), a(p);
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
x[i] = static_cast<double>(i) / N;
|
||||
}
|
||||
for (size_t j = 0; j < p; ++j) {
|
||||
a[j] = 1.0;
|
||||
}
|
||||
return make_tuple(x, a);
|
||||
}
|
||||
|
||||
void benchmark_D(vector<double> const &x, vector<double> const &a, size_t NLOOPS) {
|
||||
size_t N = x.size();
|
||||
vector<double> y(N);
|
||||
double sum(0.0);
|
||||
|
||||
for (size_t i = 0; i < NLOOPS; ++i) {
|
||||
y = poly(x,a);
|
||||
sum += y[0];
|
||||
}
|
||||
|
||||
// Check correctness
|
||||
if (static_cast<size_t>(sum) != NLOOPS) {printf(" !! W R O N G result sum = %f !!\n", sum);}
|
||||
}
|
||||
14
ex3/code/task_4+6.h
Normal file
14
ex3/code/task_4+6.h
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
#pragma once
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
void print_performance(double sec, size_t memory, size_t flops, unsigned int size);
|
||||
tuple<vector<double>, vector<double>> init_A(size_t N);
|
||||
tuple<vector<double>, vector<double>> init_B(size_t M, size_t N);
|
||||
tuple<vector<double>, vector<double>> init_C(size_t M, size_t N, size_t L);
|
||||
tuple<vector<double>, vector<double>> init_D(size_t N, size_t p);
|
||||
|
||||
void benchmark_A(vector<double> const &x, vector<double> const &y, size_t NLOOPS, bool cblas);
|
||||
void benchmark_B(vector<double> const &A, vector<double> const &x, size_t NLOOPS, bool cblas);
|
||||
void benchmark_C(vector<double> const &A, vector<double> const &B, size_t L, size_t NLOOPS, bool cblas);
|
||||
void benchmark_D(vector<double> const &x, vector<double> const &a, size_t NLOOPS);
|
||||
33
ex3/code/task_5.cpp
Normal file
33
ex3/code/task_5.cpp
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
#include "task_4+6.h"
|
||||
#include "task_5.h"
|
||||
#include "timing.h"
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <cmath>
|
||||
using namespace std;
|
||||
|
||||
double norm(vector<double> const &x) {
|
||||
size_t N = x.size();
|
||||
double sum = 0.0;
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
sum += x[i] * x[i];
|
||||
}
|
||||
return sqrt(sum);
|
||||
}
|
||||
|
||||
vector<double> init_norm(size_t N) {
|
||||
vector<double> x(N);
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
x[i] = i%219 + 1.0;
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
void benchmark_norm(vector<double> const &x, size_t NLOOPS) {
|
||||
double s(0.0), sum(0.0);
|
||||
for (size_t i = 0; i < NLOOPS; ++i) {
|
||||
s = norm(x);
|
||||
sum += s;
|
||||
}
|
||||
printf("||x|| = %f\n", sum/NLOOPS);
|
||||
}
|
||||
8
ex3/code/task_5.h
Normal file
8
ex3/code/task_5.h
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
double norm(vector<double> const &x);
|
||||
vector<double> init_norm(size_t N);
|
||||
void benchmark_norm(vector<double> const &x, size_t NLOOPS);
|
||||
43
ex3/code/task_7.cpp
Normal file
43
ex3/code/task_7.cpp
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
#include "task_7.h"
|
||||
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include <lapacke.h>
|
||||
using namespace std;
|
||||
|
||||
tuple<vector<double>, vector<double>> init_M(size_t N, size_t Nrhs) {
|
||||
vector<double> A(N*N), b(N*Nrhs);
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
for (size_t j = 0; j < N; ++j) {
|
||||
if (i != j) {
|
||||
A[i*N + j] = 1.0 / pow(abs(1.0*i-1.0*j),2);
|
||||
} else {
|
||||
A[i*N + j] = 4;
|
||||
}
|
||||
}
|
||||
for (size_t j=0; j < Nrhs; ++j) {
|
||||
b[i*Nrhs + j] = 1.0*j;
|
||||
}
|
||||
}
|
||||
return make_tuple(A, b);
|
||||
}
|
||||
|
||||
void print_matrix(vector<double> &A, size_t M, size_t N) {
|
||||
printf("\n");
|
||||
for (size_t i = 0; i < M; ++i){
|
||||
for (size_t j = 0; j < N; ++j) {
|
||||
printf("%f ", A[i*N + j]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n\n");
|
||||
}
|
||||
|
||||
void benchmark_lapacke(int N, int Nrhs) {
|
||||
auto [A,b] = init_M(N,Nrhs);
|
||||
int lda=N, ldb=Nrhs;
|
||||
vector<int> ipiv(N);
|
||||
LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N, N, A.data(), lda, ipiv.data());
|
||||
LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', N, Nrhs, A.data(), lda, ipiv.data(), b.data(), ldb);
|
||||
}
|
||||
7
ex3/code/task_7.h
Normal file
7
ex3/code/task_7.h
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
#pragma once
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
tuple<vector<double>, vector<double>> init_M(size_t N, size_t Nrhs);
|
||||
void print_matrix(vector<double> &A, size_t M, size_t N);
|
||||
void benchmark_lapacke(int N, int Nrhs);
|
||||
51
ex3/code/timing.h
Normal file
51
ex3/code/timing.h
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
//
|
||||
// Gundolf Haase, Oct 18 2024
|
||||
//
|
||||
#pragma once
|
||||
#include <chrono> // timing
|
||||
#include <stack>
|
||||
|
||||
//using Clock = std::chrono::system_clock; //!< The wall clock timer chosen
|
||||
using Clock = std::chrono::high_resolution_clock;
|
||||
using TPoint= std::chrono::time_point<Clock>;
|
||||
|
||||
// [Galowicz, C++17 STL Cookbook, p. 29]
|
||||
inline
|
||||
std::stack<TPoint> MyStopWatch; //!< starting time of stopwatch
|
||||
|
||||
/** Starts stopwatch timer.
|
||||
* Use as @code tic(); myfunction(...) ; double tsec = toc(); @endcode
|
||||
*
|
||||
* The timining can be nested and the recent time point is stored on top of the stack.
|
||||
*
|
||||
* @return recent time point
|
||||
* @see toc
|
||||
*/
|
||||
inline auto tic()
|
||||
{
|
||||
MyStopWatch.push(Clock::now());
|
||||
return MyStopWatch.top();
|
||||
}
|
||||
|
||||
/** Returns the elapsed time from stopwatch.
|
||||
*
|
||||
* The time point from top of the stack is used
|
||||
* if time point @p t_b is not passed as input parameter.
|
||||
* Use as @code tic(); myfunction(...) ; double tsec = toc(); @endcode
|
||||
* or as @code auto t_b = tic(); myfunction(...) ; double tsec = toc(t_b); @endcode
|
||||
* The last option is to be used in the case of
|
||||
* non-nested but overlapping time measurements.
|
||||
*
|
||||
* @param[in] t_b start time of some stop watch
|
||||
* @return elapsed time in seconds.
|
||||
*
|
||||
*/
|
||||
inline double toc(TPoint const &t_b = MyStopWatch.top())
|
||||
{
|
||||
// https://en.cppreference.com/w/cpp/chrono/treat_as_floating_point
|
||||
using Unit = std::chrono::seconds;
|
||||
using FpSeconds = std::chrono::duration<double, Unit::period>;
|
||||
auto t_e = Clock::now();
|
||||
MyStopWatch.pop();
|
||||
return FpSeconds(t_e-t_b).count();
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue