now with 6 and 7

This commit is contained in:
Georg Thomas Mandl 2025-12-04 19:04:13 +01:00
commit 7836d85c80
5 changed files with 1194 additions and 0 deletions

30
BSP_3_2to7/Makefile Normal file
View file

@ -0,0 +1,30 @@
#
# use GNU-Compiler tools
COMPILER=GCC_
# alternatively from the shell
# export COMPILER=GCC_
# or, alternatively from the shell
# make COMPILER=GCC_
# use Intel compilers
#COMPILER=ICC_
# use PGI compilers
# COMPILER=PGI_
SOURCES = main.cpp bsp_3_lib_bench.cpp
OBJECTS = $(SOURCES:.cpp=.o)
PROGRAM = main.${COMPILER}
# uncomment the next to lines for debugging and detailed performance analysis
CXXFLAGS += -g
LINKFLAGS += -g
# do not use -pg with PGI compilers
ifndef COMPILER
COMPILER=GCC_
endif
include ../${COMPILER}default.mk

View file

@ -0,0 +1,796 @@
#include "bsp_3_lib_bench.h"
#include <cassert>
#include <chrono>
#include <cmath>
#include <iostream>
#include <ctime>
#include <cblas.h>
#include <lapacke.h>
using namespace std;
using namespace std::chrono; // timing
double scalar(vector<double> const &x, vector<double> const &y)
{
assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
size_t const N = x.size();
double sum = 0.0;
for (size_t i = 0; i < N; ++i)
{
sum += x[i] * y[i];
//sum += exp(x[i])*log(y[i]);
}
return sum;
}
double scalar_cblas(vector<double> const &x, vector<double> const &y)
{
assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
size_t const N = x.size();
double sum = cblas_ddot(N, x.data(), 1, y.data(), 1);
return sum;
}
double scalar_kahan(vector<double> const &x, vector<double> const &y)
{
assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
size_t const N = x.size();
double sum = 0.0;
double c = 0.0;
for (size_t i = 0; i < N; ++i)
{
double yk = x[i] * y[i] - c;
double t = sum + yk;
c = t - sum - yk;
sum = t;
//sum += exp(x[i])*log(y[i]);
}
return sum;
}
double norm_eucl(std::vector<double> const &x)
{
size_t const N = x.size();
double sum = 0.0;
for (size_t i = 0; i < N; ++i)
{
sum += x[i]*x[i];
//sum += exp(x[i])*log(y[i]);
}
sum = sqrt(sum);
return sum;
}
vector<double> MatVec(vector<double> const & a, vector<double> const & x) // row wise access
{
int const nelem = static_cast<int>(a.size()); // #elements in matrix
int const mcols = static_cast<int>(x.size()); // #elements in vector <==> #columns in matrix
assert(nelem % mcols == 0); // nelem has to be a multiple of mcols (==> #rows)
int const nrows = nelem/mcols; // integer division!
vector<double> b(nrows); // allocate resulting vector
for(size_t i = 0; i < nrows; ++i)
{
double tmp = 0.0;
for(size_t j = 0; j < mcols; ++j)
{
tmp = tmp + a[i*mcols+j] * x[j];
}
b[i] = tmp;
}
return b;
}
vector<double> MatVec_cblas(vector<double> const & a, vector<double> const & x) // row wise access
{
int const nelem = static_cast<int>(a.size()); // #elements in matrix
int const mcols = static_cast<int>(x.size()); // #elements in vector <==> #columns in matrix
assert(nelem % mcols == 0); // nelem has to be a multiple of mcols (==> #rows)
int const nrows = nelem/mcols; // integer division!
vector<double> b(nrows); // allocate resulting vector
double const alpha = 1.0;
double const beta = 0.0;
cblas_dgemv(CblasRowMajor, CblasTrans,
nrows, mcols,
alpha, a.data(), nrows,
x.data(), 1,
beta, b.data(), 1);
return b;
}
vector<double> MatVec_column(vector<double> const & a, vector<double> const & x) // column wise access
{
int const nelem = static_cast<int>(a.size()); // #elements in matrix
int const mcols = static_cast<int>(x.size()); // #elements in vector <==> #columns in matrix
assert(nelem % mcols == 0); // nelem has to be a multiple of mcols (==> #rows)
int const nrows = nelem/mcols; // integer division!
vector<double> b(nrows); // allocate resulting vector
// if we do it directly we have cache issues - not optimal
// to make the code more efficient we change the two loops and put the b[i] inside the inner loop
// b is not so large compared to a, so higher amount of writing operations to not matter that much
for(size_t j = 0; j < mcols; ++j)
{
double xj = x[j];
for(size_t i = 0; i < nrows; ++i)
{
b[i] += a[j*nrows+i] * xj;
}
}
return b;
}
vector<double> MatMatProd(vector<double> const & a, vector<double> const & b, int const & L)
{
size_t const a_nelem = a.size();
size_t const b_nelem = b.size();
assert(static_cast<int>(a_nelem) % L == 0 && static_cast<int>(b_nelem) % L == 0);
size_t M = a_nelem/L;
size_t N = b_nelem/L;
vector<double> c(N*M,0);
for(size_t i = 0; i < M; ++i)
{
for(size_t k = 0; k < L; ++k)
{
for(size_t j = 0; j < N; ++j)
{
c[i*M+j] = c[i*M+j] + a[i*L+k]*b[k*N+j];
}
}
}
return c;
}
vector<double> MatMatProd_cblas(vector<double> const & a, vector<double> const & b, int const & L)
{
size_t const a_nelem = a.size();
size_t const b_nelem = b.size();
assert(static_cast<int>(a_nelem) % L == 0 && static_cast<int>(b_nelem) % L == 0);
size_t M = a_nelem/L;
size_t N = b_nelem/L;
vector<double> c(N*M,0);
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
M, N, L,
1.0, a.data(), L, b.data(), N,
0.0, c.data(), N);
return c;
}
vector<double> PolynomEval(vector<double> const & a, vector<double> const & x)
{
// we want to use the Horner-scheme
vector<double> sol(x.size(),0);
for(size_t i = 0; i < x.size(); ++i)
{
double tmp = a[a.size()-1];
for(int k = static_cast<int>(a.size())-2; k >= 0; --k)
{
tmp = tmp*x[i] + a[k];
}
sol[i] = tmp;
}
return sol;
}
void benchmark_A(int const & N, int const & Nloops)
{
//##########################################################################
cout << "\nStart Benchmarking A: scalar product\n";
vector<double> x(N), y(N);
for(size_t k = 0; k < x.size(); ++k)
{
x[k] = (k % 219) + 1;
y[k] = 1.0/x[k];
}
auto t1 = system_clock::now(); // start timer
// Do calculation
double sk(0.0), ss(0.0);
for (int i = 0; i < Nloops; ++i)
{
sk = scalar(x, y);
ss += sk; // prevents the optimizer from removing unused calculation results.
}
auto t2 = system_clock::now(); // stop timer
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
t_diff = t_diff/Nloops; // duration per loop seconds
//assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "=="
//##########################################################################
// Check the correct result
cout << "\n <x,y> = " << sk << endl;
if (static_cast<unsigned int>(sk) != N)
{
cout << " !! W R O N G result !!\n";
}
cout << endl;
//##########################################################################
// Timings and Performance
cout << endl;
cout.precision(2);
cout << "N = " << N << endl;
cout << "Time for Nloops: " << t_diff*Nloops << endl;
cout << "Timing in sec. : " << t_diff << endl;
cout << "GFLOPS : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 << endl;
cout << "GiByte/s : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
cout << endl << endl;
return;
}
void benchmark_A_cblas(int const & N, int const & Nloops)
{
//##########################################################################
cout << "\nStart Benchmarking A: scalar product with cblas\n";
vector<double> x(N), y(N);
for(size_t k = 0; k < x.size(); ++k)
{
x[k] = (k % 219) + 1;
y[k] = 1.0/x[k];
}
auto t1 = system_clock::now(); // start timer
// Do calculation
double sk(0.0), ss(0.0);
for (int i = 0; i < Nloops; ++i)
{
sk = scalar_cblas(x, y);
ss += sk; // prevents the optimizer from removing unused calculation results.
}
auto t2 = system_clock::now(); // stop timer
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
t_diff = t_diff/Nloops; // duration per loop seconds
//assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "=="
//##########################################################################
// Check the correct result
cout << "\n <x,y> = " << sk << endl;
if (static_cast<unsigned int>(sk) != N)
{
cout << " !! W R O N G result !!\n";
}
cout << endl;
//##########################################################################
// Timings and Performance
cout << endl;
cout.precision(2);
cout << "N = " << N << endl;
cout << "Time for Nloops: " << t_diff*Nloops << endl;
cout << "Timing in sec. : " << t_diff << endl;
cout << "GFLOPS : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 << endl;
cout << "GiByte/s : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
cout << endl << endl;
return;
}
void benchmark_A_kahan(int const & N, int const & Nloops)
{
//##########################################################################
cout << "\nStart Benchmarking A: scalar product with Kahan summation\n";
vector<double> x(N), y(N);
for(size_t k = 0; k < x.size(); ++k)
{
x[k] = (k % 219) + 1;
y[k] = 1.0/x[k];
}
auto t1 = system_clock::now(); // start timer
// Do calculation
double sk(0.0), ss(0.0);
for (int i = 0; i < Nloops; ++i)
{
sk = scalar(x, y);
ss += sk; // prevents the optimizer from removing unused calculation results.
}
auto t2 = system_clock::now(); // stop timer
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
t_diff = t_diff/Nloops; // duration per loop seconds
//assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "=="
//##########################################################################
// Check the correct result
cout << "\n <x,y> = " << sk << endl;
if (static_cast<unsigned int>(sk) != N)
{
cout << " !! W R O N G result !!\n";
}
cout << endl;
//##########################################################################
// Timings and Performance
cout << endl;
cout.precision(2);
cout << "N = " << N << endl;
cout << "Time for Nloops: " << t_diff*Nloops << endl;
cout << "Timing in sec. : " << t_diff << endl;
//cout << "GFLOPS : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 << endl;
//cout << "GiByte/s : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
cout << endl << endl;
return;
}
void benchmark_A_norm(int const & N, int const & Nloops)
{
//##########################################################################
cout << "\nStart Benchmarking A_norm: euclidean norm\n";
vector<double> x(N,1.0);
auto t1 = system_clock::now(); // start timer
// Do calculation
double sk(0.0), ss(0.0);
for (int i = 0; i < Nloops; ++i)
{
sk = norm_eucl(x);
ss += sk; // prevents the optimizer from removing unused calculation results.
}
auto t2 = system_clock::now(); // stop timer
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
t_diff = t_diff/Nloops; // duration per loop seconds
//assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "=="
//##########################################################################
// Check the correct result
cout << "\n ||x|| = " << sk << endl;
if (sk - sqrt(N) > 1e-7)
{
cout << " !! W R O N G result !!\n";
}
cout << endl;
//##########################################################################
// Timings and Performance
cout << endl;
cout.precision(2);
cout << "N = " << N << endl;
cout << "Time for Nloops: " << t_diff*Nloops << endl;
cout << "Timing in sec. : " << t_diff << endl;
cout << "GFLOPS : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 << endl;
cout << "GiByte/s : " << N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
cout << endl << endl;
return;
}
void benchmark_B(int const & N, int const & M, int const & Nloops)
{
//##########################################################################
cout << "\nStart Benchmarking B: Matrix-Vector Product (row wise access)\n";
vector<double> x(N), b(M), a(N*M);
// initialize data
for(size_t i = 0; i < M; ++i)
{
for(size_t j = 0; j < N; ++j)
{
a[i*N+j] = (i+j) % 219 + 1;
}
}
for(size_t i = 0; i < N; ++i)
{
x[i] = 1.0/a[17*N+i];
}
auto t1 = system_clock::now(); // start timer
// Do calculation
double ss(0.0);
for (int i = 0; i < Nloops; ++i)
{
b = MatVec(a,x);
ss += b[0]; // prevents the optimizer from removing unused calculation results.
}
auto t2 = system_clock::now(); // stop timer
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
t_diff = t_diff/Nloops; // duration per loop seconds
//assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "=="
//##########################################################################
// Check the correct result
cout << "\n <A[17,.],x> = " << b[17] << endl;
if (static_cast<unsigned int>(b[17]) != N)
{
cout << " !! W R O N G result !!\n";
}
cout << endl;
//##########################################################################
// Timings and Performance
cout << endl;
cout.precision(2);
cout << "N = " << N << "\t M = " << M << endl;
cout << "Time for Nloops: " << t_diff*Nloops << endl;
cout << "Timing in sec. : " << t_diff << endl;
cout << "GFLOPS : " << 2.0 * N * M / t_diff / 1024 / 1024 / 1024 << endl;
cout << "GiByte/s : " << (2.0 * N * M + M) / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
cout << endl << endl;
return;
}
void benchmark_B_cblas(int const & N, int const & M, int const & Nloops)
{
//##########################################################################
cout << "\nStart Benchmarking B: Matrix-Vector Product with cblas (row wise access)\n";
vector<double> x(N), b(M), a(N*M);
// initialize data
for(size_t i = 0; i < M; ++i)
{
for(size_t j = 0; j < N; ++j)
{
a[i*N+j] = (i+j) % 219 + 1;
}
}
for(size_t i = 0; i < N; ++i)
{
x[i] = 1.0/a[17*N+i];
}
auto t1 = system_clock::now(); // start timer
// Do calculation
double ss(0.0);
for (int i = 0; i < Nloops; ++i)
{
b = MatVec_cblas(a,x);
ss += b[0]; // prevents the optimizer from removing unused calculation results.
}
auto t2 = system_clock::now(); // stop timer
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
t_diff = t_diff/Nloops; // duration per loop seconds
//assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "=="
//##########################################################################
// Check the correct result
cout << "\n <A[17,.],x> = " << b[17] << endl;
if (static_cast<unsigned int>(b[17]) != N)
{
cout << " !! W R O N G result !!\n";
}
cout << endl;
//##########################################################################
// Timings and Performance
cout << endl;
cout.precision(2);
cout << "N = " << N << "\t M = " << M << endl;
cout << "Time for Nloops: " << t_diff*Nloops << endl;
cout << "Timing in sec. : " << t_diff << endl;
cout << "GFLOPS : " << 2.0 * N * M / t_diff / 1024 / 1024 / 1024 << endl;
cout << "GiByte/s : " << (2.0 * N * M + M) / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
cout << endl << endl;
return;
}
void benchmark_B_column(int const & N, int const & M, int const & Nloops)
{
//##########################################################################
cout << "\nStart Benchmarking B: Matrix-Vector Product (column wise access)\n";
vector<double> x(N), b(M), a(N*M);
// initialize data
for(size_t i = 0; i < M; ++i)
{
for(size_t j = 0; j < N; ++j)
{
a[i*N+j] = (i+j) % 219 + 1;
}
}
for(size_t i = 0; i < N; ++i)
{
x[i] = 1.0/a[17*N+i];
}
auto t1 = system_clock::now(); // start timer
// Do calculation
double ss(0.0);
for (int i = 0; i < Nloops; ++i)
{
b = MatVec_column(a,x);
ss += b[0]; // prevents the optimizer from removing unused calculation results.
}
auto t2 = system_clock::now(); // stop timer
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
t_diff = t_diff/Nloops; // duration per loop seconds
//assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "=="
//##########################################################################
// Check the correct result
cout << "\n <A[17,.],x> = " << b[17] << endl;
if (static_cast<unsigned int>(b[17]) != N)
{
cout << " !! W R O N G result !!\n";
}
cout << endl;
//##########################################################################
// Timings and Performance
cout << endl;
cout.precision(2);
cout << "N = " << N << "\t M = " << M << endl;
cout << "Time for Nloops: " << t_diff*Nloops << endl;
cout << "Timing in sec. : " << t_diff << endl;
cout << "GFLOPS : " << 2.0 * N * M / t_diff / 1024 / 1024 / 1024 << endl;
cout << "GiByte/s : " << (2.0 * N * M + M) / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
cout << endl << endl;
return;
}
void benchmark_C(int const & N, int const & M, int const & L, int const & Nloops)
{
//##########################################################################
cout << "\nStart Benchmarking C: Matrix-Matrix Product\n";
vector<double> a(M*L,1.0), b(L*N,1.0), c(N*M);
// with this data we get C[i,j] = L for all i and j
auto t1 = system_clock::now(); // start timer
// Do calculation
double ss(0.0);
for (int i = 0; i < Nloops; ++i)
{
c = MatMatProd(a,b,L);
ss += c[0]; // prevents the optimizer from removing unused calculation results.
}
auto t2 = system_clock::now(); // stop timer
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
t_diff = t_diff/Nloops; // duration per loop seconds
//assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "=="
//##########################################################################
// Check the correct result
cout << "\n C[10,15] = " << c[10*N+15] << endl;
if (static_cast<unsigned int>(c[10*N+15]) != L)
{
cout << " !! W R O N G result !!\n";
}
cout << endl;
//##########################################################################
// Timings and Performance
cout << endl;
cout.precision(2);
cout << "N = " << N << "\t M = " << M << "\t L = " << L << endl;
cout << "Time for Nloops: " << t_diff*Nloops << endl;
cout << "Timing in sec. : " << t_diff << endl;
cout << "GFLOPS : " << 2.0 * N * M * L / t_diff / 1024 / 1024 / 1024 << endl;
cout << "GiByte/s : " << (L*(N+M) + M*N) / t_diff / 1024 / 1024 / 1024 * sizeof(a[0]) << endl;
cout << endl << endl;
return;
}
void benchmark_C_cblas(int const & N, int const & M, int const & L, int const & Nloops)
{
//##########################################################################
cout << "\nStart Benchmarking C: Matrix-Matrix Product mit cblas\n";
vector<double> a(M*L,1.0), b(L*N,1.0), c(N*M);
// with this data we get C[i,j] = L for all i and j
auto t1 = system_clock::now(); // start timer
// Do calculation
double ss(0.0);
for (int i = 0; i < Nloops; ++i)
{
c = MatMatProd_cblas(a,b,L);
ss += c[0]; // prevents the optimizer from removing unused calculation results.
}
auto t2 = system_clock::now(); // stop timer
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
t_diff = t_diff/Nloops; // duration per loop seconds
//assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "=="
//##########################################################################
// Check the correct result
cout << "\n C[10,15] = " << c[10*N+15] << endl;
if (static_cast<unsigned int>(c[10*N+15]) != L)
{
cout << " !! W R O N G result !!\n";
}
cout << endl;
//##########################################################################
// Timings and Performance
cout << endl;
cout.precision(2);
cout << "N = " << N << "\t M = " << M << "\t L = " << L << endl;
cout << "Time for Nloops: " << t_diff*Nloops << endl;
cout << "Timing in sec. : " << t_diff << endl;
cout << "GFLOPS : " << 2.0 * N * M * L / t_diff / 1024 / 1024 / 1024 << endl;
cout << "GiByte/s : " << (L*(N+M) + M*N) / t_diff / 1024 / 1024 / 1024 * sizeof(a[0]) << endl;
cout << endl << endl;
return;
}
void benchmark_D(int const & p, int const & N, int const & Nloops)
{
//##########################################################################
cout << "\nStart Benchmarking D: polynomial evaluation\n";
vector<double> x(N,1), sol(N), a(p+1);
for(size_t i = 0; i < a.size(); ++i)
{
a[i] = pow(-1.0,i); // 1-x+x^2-x^3+x^4...
}
a[0] = 1;
auto t1 = system_clock::now(); // start timer
// Do calculation
double ss(0.0);
for (int i = 0; i < Nloops; ++i)
{
sol = PolynomEval(a,x);
ss += sol[0]; // prevents the optimizer from removing unused calculation results.
}
auto t2 = system_clock::now(); // stop timer
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
t_diff = t_diff/Nloops; // duration per loop seconds
//assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "=="
//##########################################################################
// Check the correct result
cout << "\n p(x[0]) = " << sol[0] << endl;
if (static_cast<unsigned int>(sol[0]) != (static_cast<int>(a.size()) % 2))
{
cout << " !! W R O N G result !!\n";
}
cout << endl;
//##########################################################################
// Timings and Performance
cout << endl;
cout.precision(2);
cout << "p = " << p << "\t N = " << N << endl;
cout << "Time for Nloops: " << t_diff*Nloops << endl;
cout << "Timing in sec. : " << t_diff << endl;
cout << "GFLOPS : " << 2.0*(p+1)*N / t_diff / 1024 / 1024 / 1024 << endl;
cout << "GiByte/s : " << N*(3+2*p) / t_diff / 1024 / 1024 / 1024 * sizeof(a[0]) << endl;
cout << endl << endl;
return;
}
void solver(int const & N, int const & nrhs)
{
// generating the system matrix a
vector<double> a(N*N);
for(int i=0; i<N; ++i)
{
for(int j=0; j<N; ++j)
{
if(i==j)
{
a[i*N+j] = 4;
}
else
{
a[i*N+j] = 1.0/(i-j)/(i-j);
}
}
}
vector<double> a_old = a;
// generating rhs
vector<double> rhs(N*nrhs);
for(int i=0; i<nrhs; ++i)
{
for(int j=0; j<N; ++j)
{
rhs[i*nrhs + j] = i+j-1;
}
}
vector<double> b_old = rhs;
vector<int> ipiv(N);
// factorization
LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N, N, a.data(), N, ipiv.data());
auto t1 = system_clock::now(); // start timer
// Calculation of rhs -> b gets overwritten and contains the solution
LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', N, nrhs, a.data(), N, ipiv.data(), rhs.data(), nrhs);
auto t2 = system_clock::now(); // stop timer
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
cout.precision(2);
cout << "Timing in sec. : " << t_diff << " for nrhs= " << nrhs << endl;
// Check for correct result
vector<double> ax = MatMatProd_cblas(a_old, rhs, N);
vector<double> diff(N*nrhs);
for(int i=0; i<N*nrhs; ++i)
{
diff[i] = ax[i] - b_old[i];
}
double diffnorm = norm_eucl(diff);
if(diffnorm > 1e-4)
{
cout << " !! W R O N G result !!\n";
}
return;
}

View file

@ -0,0 +1,205 @@
#ifndef BSP_3_LIB_BENCH_H_INCLUDED
#define BSP_3_LIB_BENCH_H_INCLUDED
#include <vector>
/** Inner product
@param[in] x vector
@param[in] y vector
@return resulting Euclidean inner product <x,y>
*/
double scalar(std::vector<double> const &x, std::vector<double> const &y);
/** Inner product with cblas
@param[in] x vector
@param[in] y vector
@return resulting Euclidean inner product <x,y>
*/
double scalar_cblas(std::vector<double> const &x, std::vector<double> const &y);
/** Inner product with Kahan summation
@param[in] x vector
@param[in] y vector
@return resulting Euclidean inner product <x,y>
*/
double scalar_kahan(std::vector<double> const &x, std::vector<double> const &y);
/** euclidean norm
@param[in] x vector
@return resulting Euclidean norm
*/
double norm_eucl(std::vector<double> const &x);
/** \brief Matrix-Vektor-Multiplikation (row-wise access)
*
* \param[in] a Matrix with row wise access
* \param[in] x vector which gets multiplied
* \return resulting product a*x (vector)
*
*/
std::vector<double> MatVec(std::vector<double> const & a, std::vector<double> const & x);
/** \brief Matrix-Vektor-Multiplikation mit cblas (row-wise access)
*
* \param[in] a Matrix with row wise access
* \param[in] x vector which gets multiplied
* \return resulting product a*x (vector)
*
*/
std::vector<double> MatVec_cblas(std::vector<double> const & a, std::vector<double> const & x);
/** \brief Matrix-Vektor-Multiplikation (column-wise access)
*
* \param[in] a Matrix with row wise access
* \param[in] x vector which gets multiplied
* \return resulting product a*x (vector)
*
*/
std::vector<double> MatVec_column(std::vector<double> const & a, std::vector<double> const & x);
/** \brief Matrix-Matrix-Multiplikation (row-wise access)
*
* \param[in] a matrix with row wise access (M*L)
* \param[in] b matrix with row wise access (L*N)
* \param[in] L inner dimension of the matrix product
* \return resulting product a*b
*
*/
std::vector<double> MatMatProd(std::vector<double> const & a, std::vector<double> const & b, int const & L);
/** \brief Matrix-Matrix-Multiplikation mit cblas (row-wise access)
*
* \param[in] a matrix with row wise access (M*L)
* \param[in] b matrix with row wise access (L*N)
* \param[in] L inner dimension of the matrix product
* \return resulting product a*b
*
*/
std::vector<double> MatMatProd_cblas(std::vector<double> const & a, std::vector<double> const & b, int const & L);
/** \brief Polynomauswertung an Stelle x
*
* \param[in] a Vekor mit den Koeffizienten des Polynoms a=[a0,a1,a2,...]
* \param[in] x Vektor, für welchen das Polynom ausgewertet werden soll
* \return resulting vector p(x)
*
*/
std::vector<double> PolynomEval(std::vector<double> const & a, std::vector<double> const & x);
/** \brief Benchmarking A - the scalar product
*
* \param N size of the vector
* \param Nloops number of iterations we want to do for the measuring
*
*/
void benchmark_A(int const & N, int const & Nloops);
/** \brief Benchmarking A - the scalar product with cblas
*
* \param N size of the vector
* \param Nloops number of iterations we want to do for the measuring
*
*/
void benchmark_A_cblas(int const & N, int const & Nloops);
/** \brief Benchmarking A - the scalar product with Kahan summation
*
* \param N size of the vector
* \param Nloops number of iterations we want to do for the measuring
*
*/
void benchmark_A_kahan(int const & N, int const & Nloops);
/** \brief Benchmarking A - norm
*
* \param N size of the vector
* \param Nloops number of iterations we want to do for the measuring
*
*/
void benchmark_A_norm(int const & N, int const & Nloops);
/** \brief Benchmarking B - matrix-vector product Ax=b (row wise access)
*
* \param N size of vector x
* \param M size of vector b (=> A: M*N)
* \param Nloops number of iterations we want to do for the measuring
*
*/
void benchmark_B(int const & N, int const & M, int const & Nloops);
/** \brief Benchmarking B - matrix-vector product Ax=b with cblas (row wise access)
*
* \param N size of vector x
* \param M size of vector b (=> A: M*N)
* \param Nloops number of iterations we want to do for the measuring
*
*/
void benchmark_B_cblas(int const & N, int const & M, int const & Nloops);
/** \brief Benchmarking B - matrix-vector product Ax=b (column wise access)
*
* \param N size of vector x
* \param M size of vector b (=> A: M*N)
* \param Nloops number of iterations we want to do for the measuring
*
*/
void benchmark_B_column(int const & N, int const & M, int const & Nloops);
/** \brief Benchmarking C - Matrix-Matrix product C=A*B A_M*L, B_L*N
*
* \param N
* \param M
* \param L
* \param Nloops number of iterations we want to do for the measuring
*
*/
void benchmark_C(int const & N, int const & M, int const & L, int const & Nloops);
/** \brief Benchmarking C - Matrix-Matrix product with cblas; C=A*B A_M*L, B_L*N
*
* \param N
* \param M
* \param L
* \param Nloops number of iterations we want to do for the measuring
*
*/
void benchmark_C_cblas(int const & N, int const & M, int const & L, int const & Nloops);
/** \brief Benchmarking D - polynomial evaluation
*
* \param p the degree of the polynomial
* \param N size of the input vector x where p(x)
* \param Nloops number of iterations we want to do for the measuring
*
*/
void benchmark_D(int const & p, int const & N, int const & Nloops);
/** \brief solving system of linear equations with time measurement
*
* \param N size of the system Ax=b with A(nxn)
* \param nrhs number of right hand sides b(nxnrhs)
*
*/
void solver(int const & N, int const & nrhs);
#endif // BSP_3_LIB_BENCH_H_INCLUDED

29
BSP_3_2to7/main.cpp Normal file
View file

@ -0,0 +1,29 @@
#include "bsp_3_lib_bench.h"
#include <iostream>
using namespace std;
int main()
{
benchmark_A(25*1e7,50);
benchmark_A_cblas(25*1e7,50);
benchmark_A_kahan(25*1e7,50);
benchmark_A_norm(25*1e7,50);
benchmark_B(8000,8000,400);
benchmark_B_column(8000,8000,400);
benchmark_B_cblas(8000,8000,400);
benchmark_C(4000,4000,4000,1);
benchmark_C_cblas(4000,4000,4000,10);
benchmark_D(1e4,1e5,15);
cout << "Solving system of linear equations\n";
vector<int> nrhsvec{1,2,4,8,16,32};
for(size_t i=0; i<nrhsvec.size(); ++i)
{
int nrhs = nrhsvec[i];
solver(5000,nrhs);
}
// the time for solving the system does not really depend on the number of the right hand sides
return 0;
}

View file

@ -0,0 +1,134 @@
Start Benchmarking A: scalar product
<x,y> = 2.5e+08
N = 250000000
Time for Nloops: 11
Timing in sec. : 0.21
GFLOPS : 2.2
GiByte/s : 17
Start Benchmarking A: scalar product with cblas
<x,y> = 2.5e+08
N = 250000000
Time for Nloops: 5.7
Timing in sec. : 0.11
GFLOPS : 4.1
GiByte/s : 33
Start Benchmarking A: scalar product with Kahan summation
<x,y> = 2.5e+08
N = 250000000
Time for Nloops: 11
Timing in sec. : 0.22
Start Benchmarking A_norm: euclidean norm
||x|| = 1.6e+04
N = 250000000
Time for Nloops: 6.1
Timing in sec. : 0.12
GFLOPS : 3.8
GiByte/s : 15
Start Benchmarking B: Matrix-Vector Product (row wise access)
<A[17,.],x> = 8e+03
N = 8000 M = 8000
Time for Nloops: 10
Timing in sec. : 0.026
GFLOPS : 4.7
GiByte/s : 37
Start Benchmarking B: Matrix-Vector Product (column wise access)
<A[17,.],x> = 8e+03
N = 8000 M = 8000
Time for Nloops: 13
Timing in sec. : 0.032
GFLOPS : 3.7
GiByte/s : 30
Start Benchmarking B: Matrix-Vector Product with cblas (row wise access)
<A[17,.],x> = 8e+03
N = 8000 M = 8000
Time for Nloops: 13
Timing in sec. : 0.032
GFLOPS : 3.8
GiByte/s : 30
Start Benchmarking C: Matrix-Matrix Product
C[10,15] = 4e+03
N = 4000 M = 4000 L = 4000
Time for Nloops: 29
Timing in sec. : 29
GFLOPS : 4
GiByte/s : 0.012
Start Benchmarking C: Matrix-Matrix Product mit cblas
C[10,15] = 4e+03
N = 4000 M = 4000 L = 4000
Time for Nloops: 6.2
Timing in sec. : 0.62
GFLOPS : 1.9e+02
GiByte/s : 0.58
Start Benchmarking D: polynomial evaluation
p(x[0]) = 1
p = 10000 N = 100000
Time for Nloops: 17
Timing in sec. : 1.2
GFLOPS : 1.6
GiByte/s : 13
Solving system of linear equations
Timing in sec. : 0.4 for nrhs= 1
Timing in sec. : 0.4 for nrhs= 2
Timing in sec. : 0.43 for nrhs= 4
Timing in sec. : 0.47 for nrhs= 8
Timing in sec. : 0.54 for nrhs= 16
Timing in sec. : 0.48 for nrhs= 32