Upload files to "ex3_benchmarks"
This commit is contained in:
parent
50845d49be
commit
b8030d4ce0
5 changed files with 2043 additions and 0 deletions
30
ex3_benchmarks/Makefile
Normal file
30
ex3_benchmarks/Makefile
Normal file
|
|
@ -0,0 +1,30 @@
|
||||||
|
#
# Select the compiler tool chain; the GNU tools are the default.
# `?=` only assigns when COMPILER is not already defined, so a value coming
# from the environment or from the command line is respected:
#   export COMPILER=GCC_
# or, alternatively from the shell
#   make COMPILER=GCC_
COMPILER ?= GCC_

# use Intel compilers
#COMPILER=ICC_

# use PGI compilers
# COMPILER=PGI_


SOURCES = main.cpp benchmarks.cpp benchmark_tests.cpp factorization_solve.cpp factorization_solve_tests.cpp
OBJECTS = $(SOURCES:.cpp=.o)

PROGRAM = main.${COMPILER}

# debugging and detailed performance analysis
# (comment out the next two lines to disable it)
CXXFLAGS += -g
LINKFLAGS += -g
# do not use -pg with PGI compilers

# Fallback: `ifndef` also treats a defined-but-empty COMPILER as unset,
# a case that `?=` above does not cover.
ifndef COMPILER
COMPILER=GCC_
endif

include ../${COMPILER}default.mk
|
||||||
101
ex3_benchmarks/main.cpp
Normal file
101
ex3_benchmarks/main.cpp
Normal file
|
|
@ -0,0 +1,101 @@
|
||||||
|
#include "benchmarks.h"
|
||||||
|
#include "benchmark_tests.h"
|
||||||
|
#include "factorization_solve_tests.h"
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
int main()
|
||||||
|
{
|
||||||
|
// ---------------------------------- 1. ----------------------------------
|
||||||
|
// results in file "test_system.txt"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// ---------------------------------- 2. ----------------------------------
|
||||||
|
// Memory FLOPS Read-write operations
|
||||||
|
// (A) 2N 2N 2N
|
||||||
|
// (B) MN + N 2NM 2NM + M
|
||||||
|
// (C) ML + LM 2LNM 2LNM + MN
|
||||||
|
// (D) (p+1) + N 3N(p+1) N(2 + 3(p+1))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// ---------------------------------- 3. ----------------------------------
|
||||||
|
// implementation in file "benchmarks.cpp"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// ---------------------------------- 4.& 6. ----------------------------------
|
||||||
|
vector<vector<double>> results;
|
||||||
|
|
||||||
|
results.push_back(test_A(250, 50000000, scalar));
|
||||||
|
results.push_back(test_A(250, 50000000, Kahan_skalar));
|
||||||
|
results.push_back(test_A(250, 50000000, scalar_cBLAS));
|
||||||
|
// Timing GFLOPS GiByte/s
|
||||||
|
// ------------------------------------------
|
||||||
|
// scalar 0.039 2.4 19
|
||||||
|
// Kahan_skalar 0.037 2.5 20
|
||||||
|
// scalar_cBLAS 0.032 2.9 23
|
||||||
|
|
||||||
|
|
||||||
|
results.push_back(test_B(100, 20000, 10000, MatVec));
|
||||||
|
results.push_back(test_B(100, 20000, 10000, MatVec_cBLAS));
|
||||||
|
// Timing GFLOPS GiByte/s
|
||||||
|
// ------------------------------------------
|
||||||
|
// MatVec 0.1 3.6 29
|
||||||
|
// MatVec_cBLAS 0.074 5 40
|
||||||
|
|
||||||
|
|
||||||
|
results.push_back(test_C(25, 500, 1000, 1500, MatMat));
|
||||||
|
results.push_back(test_C(25, 500, 1000, 1500, MatMat_cBLAS));
|
||||||
|
// Timing GFLOPS GiByte/s
|
||||||
|
// ------------------------------------------
|
||||||
|
// MatMat 0.57 2.5 20
|
||||||
|
// MatMat_cBLAS 0.019 75 6e+02 // unrealistic
|
||||||
|
|
||||||
|
|
||||||
|
results.push_back(test_D(100, 100, 1000000));
|
||||||
|
// Timing GFLOPS GiByte/s
|
||||||
|
// ------------------------------------------
|
||||||
|
// 0.11 2.5 20
|
||||||
|
|
||||||
|
|
||||||
|
cout << endl << "Timing\tGFLOPS\tGiByte/s" << endl;
|
||||||
|
cout << "------------------------------" << endl;
|
||||||
|
for (size_t i = 0; i < results.size(); ++i)
|
||||||
|
cout << results[i][0] << "\t" << results[i][1] << "\t" << results[i][2] << endl;
|
||||||
|
cout << endl;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// ---------------------------------- 5. ----------------------------------
|
||||||
|
// 5.(a) Observation: time to calculate norm is approximately half the time as for the scalar product.
|
||||||
|
// Reason: only have to access entries of x, so less memory that has to be accessed
|
||||||
|
//
|
||||||
|
// 5.(b) Runtime for Kahan_scalar is roughly the same as for the normal scalar product
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// ---------------------------------- 6. ----------------------------------
|
||||||
|
// see 4.
|
||||||
|
|
||||||
|
|
||||||
|
// ---------------------------------- 7. ----------------------------------
|
||||||
|
CheckCorrectness();
|
||||||
|
// Checked correctness by computing the inverse of A
|
||||||
|
|
||||||
|
CheckDuration(5000);
|
||||||
|
// The solving time per RHS scales roughly with factor 1/n_rhs
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// ---------------------------------- 8. ----------------------------------
|
||||||
|
// done seperately
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
}
|
||||||
1826
ex3_benchmarks/small_Doxyfile
Normal file
1826
ex3_benchmarks/small_Doxyfile
Normal file
File diff suppressed because it is too large
Load diff
70
ex3_benchmarks/test_system.txt
Normal file
70
ex3_benchmarks/test_system.txt
Normal file
|
|
@ -0,0 +1,70 @@
|
||||||
|
---------------------------------- 1. ----------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
rm -f *.exe *.o
|
||||||
|
gcc -O3 -c -o mysecond.o mysecond.c
|
||||||
|
gcc -O3 -c mysecond.c
|
||||||
|
gfortran -O3 -DSTREAM_ARRAY_SIZE=80000000 -DNTIMES=20 -c stream.f
|
||||||
|
gfortran -O3 stream.o mysecond.o -o stream_f.exe
|
||||||
|
gcc -O3 -DSTREAM_ARRAY_SIZE=80000000 -DNTIMES=20 stream.c -o stream_c.exe
|
||||||
|
gcc -O3 -DUNIX flops.c -o flops.exe
|
||||||
|
flops.c: In function ‘main’:
|
||||||
|
flops.c:231:4: warning: implicit declaration of function ‘dtime’ [-Wimplicit-function-declaration]
|
||||||
|
231 | dtime(TimeArray);
|
||||||
|
| ^~~~~
|
||||||
|
flops.c: At top level:
|
||||||
|
flops.c:723:1: warning: return type defaults to ‘int’ [-Wimplicit-int]
|
||||||
|
723 | dtime(p)
|
||||||
|
| ^~~~~
|
||||||
|
./stream_c.exe
|
||||||
|
-------------------------------------------------------------
|
||||||
|
STREAM version $Revision: 5.10 $
|
||||||
|
-------------------------------------------------------------
|
||||||
|
This system uses 8 bytes per array element.
|
||||||
|
-------------------------------------------------------------
|
||||||
|
Array size = 80000000 (elements), Offset = 0 (elements)
|
||||||
|
Memory per array = 610.4 MiB (= 0.6 GiB).
|
||||||
|
Total memory required = 1831.1 MiB (= 1.8 GiB).
|
||||||
|
Each kernel will be executed 20 times.
|
||||||
|
The *best* time for each kernel (excluding the first iteration)
|
||||||
|
will be used to compute the reported bandwidth.
|
||||||
|
-------------------------------------------------------------
|
||||||
|
Your clock granularity/precision appears to be 1 microseconds.
|
||||||
|
Each test below will take on the order of 79294 microseconds.
|
||||||
|
(= 79294 clock ticks)
|
||||||
|
Increase the size of the arrays if this shows that
|
||||||
|
you are not getting at least 20 clock ticks per test.
|
||||||
|
-------------------------------------------------------------
|
||||||
|
WARNING -- The above is only a rough guideline.
|
||||||
|
For best results, please be sure you know the
|
||||||
|
precision of your system timer.
|
||||||
|
-------------------------------------------------------------
|
||||||
|
Function Best Rate MB/s Avg time Min time Max time
|
||||||
|
Copy: 26720.6 0.057416 0.047903 0.098979
|
||||||
|
Scale: 17008.6 0.087616 0.075256 0.133899
|
||||||
|
Add: 19169.9 0.113818 0.100157 0.177676
|
||||||
|
Triad: 19144.9 0.111248 0.100288 0.170877
|
||||||
|
-------------------------------------------------------------
|
||||||
|
Solution Validates: avg error less than 1.000000e-13 on all three arrays
|
||||||
|
-------------------------------------------------------------
|
||||||
|
./flops.exe
|
||||||
|
|
||||||
|
FLOPS C Program (Double Precision), V2.0 18 Dec 1992
|
||||||
|
|
||||||
|
Module Error RunTime MFLOPS
|
||||||
|
(usec)
|
||||||
|
1 4.0146e-13 0.0023 6183.8896
|
||||||
|
2 -1.4166e-13 0.0006 11377.1999
|
||||||
|
3 4.7184e-14 0.0034 5031.5222
|
||||||
|
4 -1.2557e-13 0.0031 4841.2566
|
||||||
|
5 -1.3800e-13 0.0056 5208.6586
|
||||||
|
6 3.2380e-13 0.0053 5484.2426
|
||||||
|
7 -8.4583e-11 0.0031 3832.8326
|
||||||
|
8 3.4867e-13 0.0055 5410.0375
|
||||||
|
|
||||||
|
Iterations = 512000000
|
||||||
|
NullTime (usec) = 0.0000
|
||||||
|
MFLOPS(1) = 8055.7366
|
||||||
|
MFLOPS(2) = 4732.2658
|
||||||
|
MFLOPS(3) = 5164.0037
|
||||||
|
MFLOPS(4) = 5257.0181
|
||||||
16
ex3_benchmarks/userset.cpp
Normal file
16
ex3_benchmarks/userset.cpp
Normal file
|
|
@ -0,0 +1,16 @@
|
||||||
|
#include "userset.h"
|
||||||
|
#include <cmath>
|
||||||
|
|
||||||
|
|
||||||
|
/// Evaluates f(x, y) = x^2 * sin(2.5 * pi * y), with pi approximated by 3.14159.
///
/// @param x  first coordinate
/// @param y  second coordinate
/// @return   value of f at (x, y)
double FunctF(double const x, double const y)
{
    // Earlier variants, kept for reference:
    //   return std::sin(3.14159*1*x)*std::sin(3.14159*1*y);
    //   return 16.0*1024. ;
    //   return (double)1.0 ;
    double const x_squared = x * x;
    double const phase = 2.5 * 3.14159 * y;
    return x_squared * std::sin(phase);
}
|
||||||
|
|
||||||
|
/// Constant function: returns 1.0 for every argument pair.
/// Both parameters are intentionally unnamed because they are not used.
double FunctU(double const /* x */, double const /* y */)
{
    double const unit_value = 1.0;
    return unit_value;
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue