exercises 2 to 5
This commit is contained in:
parent
14b51cb405
commit
ce217b844f
5 changed files with 820 additions and 0 deletions
519
BSP_3_2to5/bsp_3_lib_bench.cpp
Normal file
519
BSP_3_2to5/bsp_3_lib_bench.cpp
Normal file
|
|
@ -0,0 +1,519 @@
|
|||
#include "bsp_3_lib_bench.h"
|
||||
#include <cassert>
|
||||
#include <chrono>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <ctime>
|
||||
|
||||
using namespace std;
|
||||
using namespace std::chrono; // timing
|
||||
|
||||
// Euclidean inner product <x,y> of two equally long vectors.
// The terms are accumulated strictly from index 0 upwards in a plain double.
double scalar(std::vector<double> const &x, std::vector<double> const &y)
{
    assert(x.size() == y.size());   // compiled out with -DNDEBUG

    double acc = 0.0;
    std::size_t const len = x.size();
    std::size_t idx = 0;
    while (idx < len)
    {
        acc += x[idx] * y[idx];
        ++idx;
    }
    return acc;
}
|
||||
|
||||
|
||||
// Euclidean inner product <x,y> accumulated with Kahan (compensated) summation.
// `comp` carries the low-order part lost by the previous addition and is
// subtracted from the next term before it is added to the running total.
double scalar_kahan(std::vector<double> const &x, std::vector<double> const &y)
{
    assert(x.size() == y.size());   // compiled out with -DNDEBUG

    double total = 0.0;
    double comp  = 0.0;
    for (std::size_t idx = 0, len = x.size(); idx != len; ++idx)
    {
        double const term = x[idx] * y[idx] - comp;   // corrected summand
        double const next = total + term;             // low-order bits of term may be lost here
        comp  = next - total - term;                  // recovers (the negative of) what was lost
        total = next;
    }
    return total;
}
|
||||
|
||||
|
||||
// Euclidean norm ||x||: square root of the sum of squared entries,
// accumulated front to back.
double norm_eucl(std::vector<double> const &x)
{
    double sq_sum = 0.0;
    for (double const entry : x)
    {
        sq_sum += entry * entry;
    }
    return std::sqrt(sq_sum);
}
|
||||
|
||||
|
||||
// Matrix-vector product b = A*x for a dense matrix stored row by row
// (element (i,j) lives at a[i*mcols + j]).
//
// The number of columns is x.size(); the number of rows is deduced as
// a.size() / x.size(). All sizes are kept as std::size_t so large matrices are
// not truncated to int and the loop bounds do not mix signed and unsigned.
// An empty x (with an empty a) yields an empty result instead of dividing by zero.
std::vector<double> MatVec(std::vector<double> const & a, std::vector<double> const & x)    // row wise access
{
    std::size_t const nelem = a.size();              // #elements in matrix
    std::size_t const mcols = x.size();              // #elements in vector <==> #columns in matrix

    if (mcols == 0)
    {
        assert(nelem == 0);                          // a matrix without columns cannot hold elements
        return std::vector<double>();
    }

    assert(nelem % mcols == 0);                      // nelem has to be a multiple of mcols (==> #rows)
    std::size_t const nrows = nelem / mcols;         // integer division!

    std::vector<double> b(nrows);                    // allocate resulting vector

    for (std::size_t i = 0; i < nrows; ++i)
    {
        std::size_t const row_start = i * mcols;     // offset of row i in the flat storage
        double tmp = 0.0;
        for (std::size_t j = 0; j < mcols; ++j)
        {
            tmp = tmp + a[row_start + j] * x[j];
        }
        b[i] = tmp;
    }

    return b;
}
|
||||
|
||||
|
||||
// Matrix-vector product b = A*x for a dense matrix stored column by column
// (element (i,j) lives at a[j*nrows + i]).
//
// The number of columns is x.size(); the number of rows is deduced as
// a.size() / x.size(). Sizes are kept as std::size_t (no truncation to int, no
// signed/unsigned comparisons). An empty x (with an empty a) yields an empty
// result instead of dividing by zero.
std::vector<double> MatVec_column(std::vector<double> const & a, std::vector<double> const & x)    // column wise access
{
    std::size_t const nelem = a.size();              // #elements in matrix
    std::size_t const mcols = x.size();              // #elements in vector <==> #columns in matrix

    if (mcols == 0)
    {
        assert(nelem == 0);                          // a matrix without columns cannot hold elements
        return std::vector<double>();
    }

    assert(nelem % mcols == 0);                      // nelem has to be a multiple of mcols (==> #rows)
    std::size_t const nrows = nelem / mcols;         // integer division!

    std::vector<double> b(nrows);                    // allocate resulting vector (value-initialized to 0)

    // Looping over the rows in the outer loop would stride through `a` with
    // step nrows. Instead the column loop is outermost, so `a` is traversed
    // contiguously and b[i] is updated inside the inner loop. b is small
    // compared to a, so the additional writes to b matter little.
    for (std::size_t j = 0; j < mcols; ++j)
    {
        double const xj = x[j];
        std::size_t const col_start = j * nrows;     // offset of column j in the flat storage
        for (std::size_t i = 0; i < nrows; ++i)
        {
            b[i] += a[col_start + i] * xj;
        }
    }

    return b;
}
|
||||
|
||||
|
||||
// Matrix-matrix product C = A*B with all matrices stored row by row.
//   a : M x L matrix, element (i,k) at a[i*L + k]
//   b : L x N matrix, element (k,j) at b[k*N + j]
//   L : inner dimension, must be positive
// Returns the M x N matrix C with element (i,j) at c[i*N + j].
//
// M and N are deduced from the vector sizes. The result is indexed with the
// row length N; indexing it with M is only correct for square results and
// makes rows overlap when M != N.
std::vector<double> MatMatProd(std::vector<double> const & a, std::vector<double> const & b, int const & L)
{
    assert(L > 0);                                   // inner dimension must be positive
    std::size_t const inner = static_cast<std::size_t>(L);

    std::size_t const a_nelem = a.size();
    std::size_t const b_nelem = b.size();
    assert(a_nelem % inner == 0 && b_nelem % inner == 0);

    std::size_t const M = a_nelem / inner;           // #rows of A and C
    std::size_t const N = b_nelem / inner;           // #columns of B and C

    std::vector<double> c(M * N, 0.0);

    // i-k-j loop order: the innermost loop walks contiguously through one row
    // of b and one row of c.
    for (std::size_t i = 0; i < M; ++i)
    {
        for (std::size_t k = 0; k < inner; ++k)
        {
            double const aik = a[i * inner + k];
            for (std::size_t j = 0; j < N; ++j)
            {
                c[i * N + j] = c[i * N + j] + aik * b[k * N + j];
            }
        }
    }

    return c;
}
|
||||
|
||||
|
||||
// Evaluates the polynomial p(t) = a[0] + a[1]*t + a[2]*t^2 + ... at every
// entry of x using the Horner scheme and returns the values p(x[i]).
//
// An empty coefficient vector is treated as the zero polynomial (all results
// are 0) instead of reading a[a.size()-1] out of bounds.
std::vector<double> PolynomEval(std::vector<double> const & a, std::vector<double> const & x)
{
    std::vector<double> sol(x.size(), 0.0);
    if (a.empty())
    {
        return sol;                                  // zero polynomial
    }

    std::size_t const top = a.size() - 1;            // index of the leading coefficient
    for (std::size_t i = 0; i < x.size(); ++i)
    {
        double const xi = x[i];
        double tmp = a[top];
        // Horner: fold in the remaining coefficients from high to low degree.
        for (std::size_t k = top; k-- > 0; )
        {
            tmp = tmp * xi + a[k];
        }
        sol[i] = tmp;
    }

    return sol;
}
|
||||
|
||||
|
||||
void benchmark_A(int const & N, int const & Nloops)
|
||||
{
|
||||
//##########################################################################
|
||||
cout << "\nStart Benchmarking A: scalar product\n";
|
||||
|
||||
vector<double> x(N), y(N);
|
||||
for(size_t k = 0; k < x.size(); ++k)
|
||||
{
|
||||
x[k] = (k % 219) + 1;
|
||||
y[k] = 1.0/x[k];
|
||||
}
|
||||
|
||||
auto t1 = system_clock::now(); // start timer
|
||||
// Do calculation
|
||||
double sk(0.0), ss(0.0);
|
||||
for (int i = 0; i < Nloops; ++i)
|
||||
{
|
||||
sk = scalar(x, y);
|
||||
ss += sk; // prevents the optimizer from removing unused calculation results.
|
||||
}
|
||||
|
||||
auto t2 = system_clock::now(); // stop timer
|
||||
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
t_diff = t_diff/Nloops; // duration per loop seconds
|
||||
|
||||
//assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "=="
|
||||
|
||||
//##########################################################################
|
||||
// Check the correct result
|
||||
cout << "\n <x,y> = " << sk << endl;
|
||||
if (static_cast<unsigned int>(sk) != N)
|
||||
{
|
||||
cout << " !! W R O N G result !!\n";
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
//##########################################################################
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
cout << "N = " << N << endl;
|
||||
cout << "Time for Nloops: " << t_diff*Nloops << endl;
|
||||
cout << "Timing in sec. : " << t_diff << endl;
|
||||
cout << "GFLOPS : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 << endl;
|
||||
cout << "GiByte/s : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
|
||||
cout << endl << endl;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void benchmark_A_kahan(int const & N, int const & Nloops)
|
||||
{
|
||||
//##########################################################################
|
||||
cout << "\nStart Benchmarking A: scalar product with Kahan summation\n";
|
||||
|
||||
vector<double> x(N), y(N);
|
||||
for(size_t k = 0; k < x.size(); ++k)
|
||||
{
|
||||
x[k] = (k % 219) + 1;
|
||||
y[k] = 1.0/x[k];
|
||||
}
|
||||
|
||||
auto t1 = system_clock::now(); // start timer
|
||||
// Do calculation
|
||||
double sk(0.0), ss(0.0);
|
||||
for (int i = 0; i < Nloops; ++i)
|
||||
{
|
||||
sk = scalar(x, y);
|
||||
ss += sk; // prevents the optimizer from removing unused calculation results.
|
||||
}
|
||||
|
||||
auto t2 = system_clock::now(); // stop timer
|
||||
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
t_diff = t_diff/Nloops; // duration per loop seconds
|
||||
|
||||
//assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "=="
|
||||
|
||||
//##########################################################################
|
||||
// Check the correct result
|
||||
cout << "\n <x,y> = " << sk << endl;
|
||||
if (static_cast<unsigned int>(sk) != N)
|
||||
{
|
||||
cout << " !! W R O N G result !!\n";
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
//##########################################################################
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
cout << "N = " << N << endl;
|
||||
cout << "Time for Nloops: " << t_diff*Nloops << endl;
|
||||
cout << "Timing in sec. : " << t_diff << endl;
|
||||
//cout << "GFLOPS : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 << endl;
|
||||
//cout << "GiByte/s : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
|
||||
cout << endl << endl;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void benchmark_A_norm(int const & N, int const & Nloops)
|
||||
{
|
||||
//##########################################################################
|
||||
cout << "\nStart Benchmarking A_norm: euclidean norm\n";
|
||||
|
||||
vector<double> x(N,1.0);
|
||||
|
||||
auto t1 = system_clock::now(); // start timer
|
||||
// Do calculation
|
||||
double sk(0.0), ss(0.0);
|
||||
for (int i = 0; i < Nloops; ++i)
|
||||
{
|
||||
sk = norm_eucl(x);
|
||||
ss += sk; // prevents the optimizer from removing unused calculation results.
|
||||
}
|
||||
|
||||
auto t2 = system_clock::now(); // stop timer
|
||||
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
t_diff = t_diff/Nloops; // duration per loop seconds
|
||||
|
||||
//assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "=="
|
||||
|
||||
//##########################################################################
|
||||
// Check the correct result
|
||||
cout << "\n ||x|| = " << sk << endl;
|
||||
if (sk - sqrt(N) > 1e-7)
|
||||
{
|
||||
cout << " !! W R O N G result !!\n";
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
//##########################################################################
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
cout << "N = " << N << endl;
|
||||
cout << "Time for Nloops: " << t_diff*Nloops << endl;
|
||||
cout << "Timing in sec. : " << t_diff << endl;
|
||||
cout << "GFLOPS : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 << endl;
|
||||
cout << "GiByte/s : " << N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
|
||||
cout << endl << endl;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void benchmark_B(int const & N, int const & M, int const & Nloops)
|
||||
{
|
||||
//##########################################################################
|
||||
cout << "\nStart Benchmarking B: Matrix-Vector Product (row wise access)\n";
|
||||
|
||||
vector<double> x(N), b(M), a(N*M);
|
||||
// initialize data
|
||||
for(size_t i = 0; i < M; ++i)
|
||||
{
|
||||
for(size_t j = 0; j < N; ++j)
|
||||
{
|
||||
a[i*N+j] = (i+j) % 219 + 1;
|
||||
}
|
||||
}
|
||||
for(size_t i = 0; i < N; ++i)
|
||||
{
|
||||
x[i] = 1.0/a[17*N+i];
|
||||
}
|
||||
|
||||
auto t1 = system_clock::now(); // start timer
|
||||
// Do calculation
|
||||
double ss(0.0);
|
||||
for (int i = 0; i < Nloops; ++i)
|
||||
{
|
||||
b = MatVec(a,x);
|
||||
ss += b[0]; // prevents the optimizer from removing unused calculation results.
|
||||
}
|
||||
|
||||
auto t2 = system_clock::now(); // stop timer
|
||||
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
t_diff = t_diff/Nloops; // duration per loop seconds
|
||||
|
||||
//assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "=="
|
||||
|
||||
//##########################################################################
|
||||
// Check the correct result
|
||||
cout << "\n <A[17,.],x> = " << b[17] << endl;
|
||||
if (static_cast<unsigned int>(b[17]) != N)
|
||||
{
|
||||
cout << " !! W R O N G result !!\n";
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
//##########################################################################
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
cout << "N = " << N << "\t M = " << M << endl;
|
||||
cout << "Time for Nloops: " << t_diff*Nloops << endl;
|
||||
cout << "Timing in sec. : " << t_diff << endl;
|
||||
cout << "GFLOPS : " << 2.0 * N * M / t_diff / 1024 / 1024 / 1024 << endl;
|
||||
cout << "GiByte/s : " << (2.0 * N * M + M) / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
|
||||
cout << endl << endl;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void benchmark_B_column(int const & N, int const & M, int const & Nloops)
|
||||
{
|
||||
//##########################################################################
|
||||
cout << "\nStart Benchmarking B: Matrix-Vector Product (column wise access)\n";
|
||||
|
||||
vector<double> x(N), b(M), a(N*M);
|
||||
// initialize data
|
||||
for(size_t i = 0; i < M; ++i)
|
||||
{
|
||||
for(size_t j = 0; j < N; ++j)
|
||||
{
|
||||
a[i*N+j] = (i+j) % 219 + 1;
|
||||
}
|
||||
}
|
||||
for(size_t i = 0; i < N; ++i)
|
||||
{
|
||||
x[i] = 1.0/a[17*N+i];
|
||||
}
|
||||
|
||||
auto t1 = system_clock::now(); // start timer
|
||||
// Do calculation
|
||||
double ss(0.0);
|
||||
for (int i = 0; i < Nloops; ++i)
|
||||
{
|
||||
b = MatVec_column(a,x);
|
||||
ss += b[0]; // prevents the optimizer from removing unused calculation results.
|
||||
}
|
||||
|
||||
auto t2 = system_clock::now(); // stop timer
|
||||
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
t_diff = t_diff/Nloops; // duration per loop seconds
|
||||
|
||||
//assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "=="
|
||||
|
||||
//##########################################################################
|
||||
// Check the correct result
|
||||
cout << "\n <A[17,.],x> = " << b[17] << endl;
|
||||
if (static_cast<unsigned int>(b[17]) != N)
|
||||
{
|
||||
cout << " !! W R O N G result !!\n";
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
//##########################################################################
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
cout << "N = " << N << "\t M = " << M << endl;
|
||||
cout << "Time for Nloops: " << t_diff*Nloops << endl;
|
||||
cout << "Timing in sec. : " << t_diff << endl;
|
||||
cout << "GFLOPS : " << 2.0 * N * M / t_diff / 1024 / 1024 / 1024 << endl;
|
||||
cout << "GiByte/s : " << (2.0 * N * M + M) / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
|
||||
cout << endl << endl;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void benchmark_C(int const & N, int const & M, int const & L, int const & Nloops)
|
||||
{
|
||||
//##########################################################################
|
||||
cout << "\nStart Benchmarking C: Matrix-Matrix Product\n";
|
||||
|
||||
vector<double> a(M*L,1.0), b(L*N,1.0), c(N*M);
|
||||
// with this data we get C[i,j] = L for all i and j
|
||||
|
||||
auto t1 = system_clock::now(); // start timer
|
||||
// Do calculation
|
||||
double ss(0.0);
|
||||
for (int i = 0; i < Nloops; ++i)
|
||||
{
|
||||
c = MatMatProd(a,b,L);
|
||||
ss += c[0]; // prevents the optimizer from removing unused calculation results.
|
||||
}
|
||||
|
||||
auto t2 = system_clock::now(); // stop timer
|
||||
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
t_diff = t_diff/Nloops; // duration per loop seconds
|
||||
|
||||
//assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "=="
|
||||
|
||||
//##########################################################################
|
||||
// Check the correct result
|
||||
cout << "\n C[10,15] = " << c[10*N+15] << endl;
|
||||
if (static_cast<unsigned int>(c[10*N+15]) != L)
|
||||
{
|
||||
cout << " !! W R O N G result !!\n";
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
//##########################################################################
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
cout << "N = " << N << "\t M = " << M << "\t L = " << L << endl;
|
||||
cout << "Time for Nloops: " << t_diff*Nloops << endl;
|
||||
cout << "Timing in sec. : " << t_diff << endl;
|
||||
cout << "GFLOPS : " << 2.0 * N * M * L / t_diff / 1024 / 1024 / 1024 << endl;
|
||||
cout << "GiByte/s : " << (L*(N+M) + M*N) / t_diff / 1024 / 1024 / 1024 * sizeof(a[0]) << endl;
|
||||
cout << endl << endl;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void benchmark_D(int const & p, int const & N, int const & Nloops)
|
||||
{
|
||||
//##########################################################################
|
||||
cout << "\nStart Benchmarking D: polynomial evaluation\n";
|
||||
|
||||
vector<double> x(N,1), sol(N), a(p+1);
|
||||
for(size_t i = 0; i < a.size(); ++i)
|
||||
{
|
||||
a[i] = pow(-1.0,i); // 1-x+x^2-x^3+x^4...
|
||||
}
|
||||
a[0] = 1;
|
||||
|
||||
auto t1 = system_clock::now(); // start timer
|
||||
// Do calculation
|
||||
double ss(0.0);
|
||||
for (int i = 0; i < Nloops; ++i)
|
||||
{
|
||||
sol = PolynomEval(a,x);
|
||||
ss += sol[0]; // prevents the optimizer from removing unused calculation results.
|
||||
}
|
||||
|
||||
auto t2 = system_clock::now(); // stop timer
|
||||
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
t_diff = t_diff/Nloops; // duration per loop seconds
|
||||
|
||||
//assert(std::abs(ss/NLOOPS-sk)<1e-5); // avoids unsafe floating point comparison "=="
|
||||
|
||||
//##########################################################################
|
||||
// Check the correct result
|
||||
cout << "\n p(x[0]) = " << sol[0] << endl;
|
||||
if (static_cast<unsigned int>(sol[0]) != (static_cast<int>(a.size()) % 2))
|
||||
{
|
||||
cout << " !! W R O N G result !!\n";
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
//##########################################################################
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
cout << "p = " << p << "\t N = " << N << endl;
|
||||
cout << "Time for Nloops: " << t_diff*Nloops << endl;
|
||||
cout << "Timing in sec. : " << t_diff << endl;
|
||||
cout << "GFLOPS : " << 2.0*(p+1)*N / t_diff / 1024 / 1024 / 1024 << endl;
|
||||
cout << "GiByte/s : " << N*(3+2*p) / t_diff / 1024 / 1024 / 1024 * sizeof(a[0]) << endl;
|
||||
cout << endl << endl;
|
||||
|
||||
return;
|
||||
}
|
||||
136
BSP_3_2to5/bsp_3_lib_bench.h
Normal file
136
BSP_3_2to5/bsp_3_lib_bench.h
Normal file
|
|
@ -0,0 +1,136 @@
|
|||
#ifndef BSP_3_LIB_BENCH_H_INCLUDED
|
||||
#define BSP_3_LIB_BENCH_H_INCLUDED
|
||||
|
||||
#include <vector>
|
||||
|
||||
/** Inner product
|
||||
@param[in] x vector
|
||||
@param[in] y vector
|
||||
@return resulting Euclidean inner product <x,y>
|
||||
*/
|
||||
double scalar(std::vector<double> const &x, std::vector<double> const &y);
|
||||
|
||||
|
||||
/** Inner product with Kahan summation
|
||||
@param[in] x vector
|
||||
@param[in] y vector
|
||||
@return resulting Euclidean inner product <x,y>
|
||||
*/
|
||||
double scalar_kahan(std::vector<double> const &x, std::vector<double> const &y);
|
||||
|
||||
|
||||
/** euclidean norm
|
||||
@param[in] x vector
|
||||
@return resulting Euclidean norm
|
||||
*/
|
||||
double norm_eucl(std::vector<double> const &x);
|
||||
|
||||
|
||||
/** \brief Matrix-Vektor-Multiplikation (row-wise access)
|
||||
*
|
||||
* \param[in] a Matrix with row wise access
|
||||
* \param[in] x vector which gets multiplied
|
||||
* \return resulting product a*x (vector)
|
||||
*
|
||||
*/
|
||||
std::vector<double> MatVec(std::vector<double> const & a, std::vector<double> const & x);
|
||||
|
||||
|
||||
/** \brief Matrix-Vektor-Multiplikation (column-wise access)
|
||||
*
|
||||
* \param[in] a Matrix stored column by column (element (i,j) at a[j*nrows+i])
|
||||
* \param[in] x vector which gets multiplied
|
||||
* \return resulting product a*x (vector)
|
||||
*
|
||||
*/
|
||||
std::vector<double> MatVec_column(std::vector<double> const & a, std::vector<double> const & x);
|
||||
|
||||
|
||||
/** \brief Matrix-Matrix-Multiplikation (row-wise access)
|
||||
*
|
||||
* \param[in] a matrix with row wise access (M*L)
|
||||
* \param[in] b matrix with row wise access (L*N)
|
||||
* \param[in] L inner dimension of the matrix product
|
||||
* \return resulting product a*b
|
||||
*
|
||||
*/
|
||||
std::vector<double> MatMatProd(std::vector<double> const & a, std::vector<double> const & b, int const & L);
|
||||
|
||||
|
||||
/** \brief Polynomial evaluation at the points x (Horner scheme)
|
||||
*
|
||||
* \param[in] a vector with the coefficients of the polynomial a=[a0,a1,a2,...]
|
||||
* \param[in] x vector of points at which the polynomial is evaluated
|
||||
* \return resulting vector p(x)
|
||||
*
|
||||
*/
|
||||
std::vector<double> PolynomEval(std::vector<double> const & a, std::vector<double> const & x);
|
||||
|
||||
|
||||
/** \brief Benchmarking A - the scalar product
|
||||
*
|
||||
* \param N size of the vector
|
||||
* \param Nloops number of iterations we want to do for the measuring
|
||||
*
|
||||
*/
|
||||
void benchmark_A(int const & N, int const & Nloops);
|
||||
|
||||
|
||||
/** \brief Benchmarking A - the scalar product with Kahan summation
|
||||
*
|
||||
* \param N size of the vector
|
||||
* \param Nloops number of iterations we want to do for the measuring
|
||||
*
|
||||
*/
|
||||
void benchmark_A_kahan(int const & N, int const & Nloops);
|
||||
|
||||
|
||||
/** \brief Benchmarking A - norm
|
||||
*
|
||||
* \param N size of the vector
|
||||
* \param Nloops number of iterations we want to do for the measuring
|
||||
*
|
||||
*/
|
||||
void benchmark_A_norm(int const & N, int const & Nloops);
|
||||
|
||||
|
||||
/** \brief Benchmarking B - matrix-vector product Ax=b (row wise access)
|
||||
*
|
||||
* \param N size of vector x
|
||||
* \param M size of vector b (=> A: M*N)
|
||||
* \param Nloops number of iterations we want to do for the measuring
|
||||
*
|
||||
*/
|
||||
void benchmark_B(int const & N, int const & M, int const & Nloops);
|
||||
|
||||
|
||||
/** \brief Benchmarking B - matrix-vector product Ax=b (column wise access)
|
||||
*
|
||||
* \param N size of vector x
|
||||
* \param M size of vector b (=> A: M*N)
|
||||
* \param Nloops number of iterations we want to do for the measuring
|
||||
*
|
||||
*/
|
||||
void benchmark_B_column(int const & N, int const & M, int const & Nloops);
|
||||
|
||||
|
||||
/** \brief Benchmarking C - Matrix-Matrix product C=A*B A_M*L, B_L*N
|
||||
*
|
||||
* \param N number of columns of B and C
|
||||
* \param M number of rows of A and C
|
||||
* \param L inner dimension (columns of A, rows of B)
|
||||
* \param Nloops number of iterations we want to do for the measuring
|
||||
*
|
||||
*/
|
||||
void benchmark_C(int const & N, int const & M, int const & L, int const & Nloops);
|
||||
|
||||
|
||||
/** \brief Benchmarking D - polynomial evaluation
|
||||
*
|
||||
* \param p the degree of the polynomial
|
||||
* \param N size of the input vector x where p(x)
|
||||
* \param Nloops number of iterations we want to do for the measuring
|
||||
*
|
||||
*/
|
||||
void benchmark_D(int const & p, int const & N, int const & Nloops);
|
||||
#endif // BSP_3_LIB_BENCH_H_INCLUDED
|
||||
106
BSP_3_2to5/bsp_3_results_2-5.txt
Normal file
106
BSP_3_2to5/bsp_3_results_2-5.txt
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
Aufgabe 2: Übersicht über memory, number of floating point operations and writing/reading operations
|
||||
|
||||
memory (*8, in Bytes) FLOP writing/reading
|
||||
A - scalar 2N 2N 2N
|
||||
B - MatVec N*M+N 2*N*M 2*N*M+M
|
||||
C - MatMatProd L*(M+N) 2*L*M*N L*N+L*M+M*N
|
||||
D - PolyEval p+1+N 2*(p+1)*N 2*(p+1)*N + N
|
||||
|
||||
Aufgabe 3-5: Ergebnisse sämtlicher Benchmark-Tests
|
||||
|
||||
Start Benchmarking A: scalar product
|
||||
|
||||
<x,y> = 2.5e+08
|
||||
|
||||
|
||||
N = 250000000
|
||||
Time for Nloops: 20
|
||||
Timing in sec. : 0.79
|
||||
GFLOPS : 0.59
|
||||
GiByte/s : 4.7
|
||||
|
||||
|
||||
|
||||
Start Benchmarking A: scalar product with Kahan summation
|
||||
|
||||
<x,y> = 2.5e+08
|
||||
|
||||
|
||||
N = 250000000
|
||||
Time for Nloops: 23
|
||||
Timing in sec. : 0.92
|
||||
|
||||
|
||||
|
||||
Start Benchmarking A_norm: euclidean norm
|
||||
|
||||
||x|| = 1.6e+04
|
||||
|
||||
|
||||
N = 250000000
|
||||
Time for Nloops: 22
|
||||
Timing in sec. : 0.88
|
||||
GFLOPS : 0.53
|
||||
GiByte/s : 2.1
|
||||
|
||||
|
||||
|
||||
Start Benchmarking B: Matrix-Vector Product (row wise access)
|
||||
|
||||
<A[17,.],x> = 8e+03
|
||||
|
||||
|
||||
N = 8000 M = 8000
|
||||
Time for Nloops: 17
|
||||
Timing in sec. : 0.23
|
||||
GFLOPS : 0.52
|
||||
GiByte/s : 4.1
|
||||
|
||||
|
||||
|
||||
Start Benchmarking B: Matrix-Vector Product (column wise access)
|
||||
|
||||
<A[17,.],x> = 8e+03
|
||||
|
||||
|
||||
N = 8000 M = 8000
|
||||
Time for Nloops: 18
|
||||
Timing in sec. : 0.24
|
||||
GFLOPS : 0.51
|
||||
GiByte/s : 4
|
||||
|
||||
|
||||
|
||||
Start Benchmarking C: Matrix-Matrix Product
|
||||
|
||||
C[10,15] = 1e+03
|
||||
|
||||
|
||||
N = 1000 M = 1000 L = 1000
|
||||
Time for Nloops: 19
|
||||
Timing in sec. : 6.5
|
||||
GFLOPS : 0.29
|
||||
GiByte/s : 0.0034
|
||||
|
||||
|
||||
|
||||
Start Benchmarking D: polynomial evaluation
|
||||
|
||||
p(x[0]) = 1
|
||||
|
||||
|
||||
p = 10000 N = 100000
|
||||
Time for Nloops: 19
|
||||
Timing in sec. : 3.8
|
||||
GFLOPS : 0.49
|
||||
GiByte/s : 4
|
||||
|
||||
|
||||
Anmerkungen zu Bsp 5:
|
||||
A: die Berechnung der Norm benötigt länger, obwohl nur ein Vektor beteiligt ist; sqrt ist rechenintensiv im Vergleich zu +,-,/,*
|
||||
B: kahan summation benötigt etwas länger
|
||||
C: keine gravierenden Unterschiede feststellbar (column wise aber bereits optimiert)
|
||||
|
||||
Anmerkungen zu Bsp 4:
|
||||
bei C) Matrix*Matrix wurde nicht das 10-100 fache des L3 Caches verwendet, da die Berechnung dann irrsinnig lange gebraucht hätte;
|
||||
dadurch etwas kleinere Dimensionen
|
||||
42
BSP_3_2to5/bsp_3_x.cbp
Normal file
42
BSP_3_2to5/bsp_3_x.cbp
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
|
||||
<CodeBlocks_project_file>
|
||||
<FileVersion major="1" minor="6" />
|
||||
<Project>
|
||||
<Option title="bsp_3_x" />
|
||||
<Option pch_mode="2" />
|
||||
<Option compiler="gcc" />
|
||||
<Build>
|
||||
<Target title="Debug">
|
||||
<Option output="bin/Debug/bsp_3_x" prefix_auto="1" extension_auto="1" />
|
||||
<Option object_output="obj/Debug/" />
|
||||
<Option type="1" />
|
||||
<Option compiler="gcc" />
|
||||
<Compiler>
|
||||
<Add option="-g" />
|
||||
</Compiler>
|
||||
</Target>
|
||||
<Target title="Release">
|
||||
<Option output="bin/Release/bsp_3_x" prefix_auto="1" extension_auto="1" />
|
||||
<Option object_output="obj/Release/" />
|
||||
<Option type="1" />
|
||||
<Option compiler="gcc" />
|
||||
<Compiler>
|
||||
<Add option="-O2" />
|
||||
</Compiler>
|
||||
<Linker>
|
||||
<Add option="-s" />
|
||||
</Linker>
|
||||
</Target>
|
||||
</Build>
|
||||
<Compiler>
|
||||
<Add option="-Wall" />
|
||||
<Add option="-fexceptions" />
|
||||
</Compiler>
|
||||
<Unit filename="bsp_3_lib_bench.cpp" />
|
||||
<Unit filename="bsp_3_lib_bench.h" />
|
||||
<Unit filename="main.cpp" />
|
||||
<Extensions>
|
||||
<lib_finder disable_auto="1" />
|
||||
</Extensions>
|
||||
</Project>
|
||||
</CodeBlocks_project_file>
|
||||
17
BSP_3_2to5/main.cpp
Normal file
17
BSP_3_2to5/main.cpp
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
#include "bsp_3_lib_bench.h"
|
||||
#include <iostream>
|
||||
|
||||
using namespace std;
|
||||
|
||||
int main()
|
||||
{
|
||||
benchmark_A(25*1e7,25);
|
||||
benchmark_A_kahan(25*1e7,25);
|
||||
benchmark_A_norm(25*1e7,25);
|
||||
benchmark_B(8000,8000,75);
|
||||
benchmark_B_column(8000,8000,75);
|
||||
benchmark_C(1000,1000,1000,3);
|
||||
benchmark_D(1e4,1e5,5);
|
||||
|
||||
return 0;
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue