Upload files to "ex3_benchmarks"

2025-11-11 16:17:31 +01:00 · 2025-11-11 16:17:31 +01:00 · b8030d4ce0
commit b8030d4ce0
parent 50845d49be
5 changed files with 2043 additions and 0 deletions
--- a/ex3_benchmarks/Makefile
+++ b/ex3_benchmarks/Makefile
@ -0,0 +1,30 @@
 #
 # Compiler selection: GNU-Compiler tools are the default.
 # '?=' assigns only when COMPILER is not defined yet, so a value coming
 # from the environment is respected instead of being overwritten here.
 COMPILER ?= GCC_
 # alternatively from the shell
 # export COMPILER=GCC_
 # or, alternatively from the shell
 # make COMPILER=GCC_
 # use Intel compilers
 #COMPILER=ICC_
 # use PGI compilers
 # COMPILER=PGI_
 SOURCES = main.cpp benchmarks.cpp benchmark_tests.cpp factorization_solve.cpp factorization_solve_tests.cpp
 OBJECTS = $(SOURCES:.cpp=.o)
 PROGRAM	= main.${COMPILER}
 # debug symbols for debugging and detailed performance analysis
 # (comment out the next two lines to build without -g)
 CXXFLAGS += -g
 LINKFLAGS += -g
 # do not use -pg with PGI compilers
 # Fallback: COMPILER can still be empty here (e.g. exported as an empty
 # string); 'ifndef' is also true for an empty value.
 ifndef COMPILER
  COMPILER=GCC_
 endif
 # Pull in the compiler-specific default rules from the parent directory.
 include ../${COMPILER}default.mk
--- a/ex3_benchmarks/main.cpp
+++ b/ex3_benchmarks/main.cpp
@ -0,0 +1,101 @@
 #include "benchmarks.h"
 #include "benchmark_tests.h"
 #include "factorization_solve_tests.h"
 #include <iostream>
 int main()
 {   
    // ---------------------------------- 1. ----------------------------------
    // results in file "test_system.txt"
    // ---------------------------------- 2. ----------------------------------
    //      Memory      FLOPS       Read-write operations
    // (A)  2N          2N          2N
    // (B)  MN + N      2NM         2NM + M
    // (C)  ML + LM     2LNM        2LNM + MN
    // (D)  (p+1) + N   3N(p+1)     N(2 + 3(p+1))
    // ---------------------------------- 3. ----------------------------------
    // implementation in file "benchmarks.cpp"
    // ---------------------------------- 4.& 6. ----------------------------------
    vector<vector<double>> results;
    results.push_back(test_A(250, 50000000, scalar));       
    results.push_back(test_A(250, 50000000, Kahan_skalar));
    results.push_back(test_A(250, 50000000, scalar_cBLAS));
    //                  Timing  GFLOPS  GiByte/s
    // ------------------------------------------
    // scalar           0.039   2.4     19
    // Kahan_skalar     0.037   2.5     20
    // scalar_cBLAS     0.032   2.9     23
    results.push_back(test_B(100, 20000, 10000, MatVec));
    results.push_back(test_B(100, 20000, 10000, MatVec_cBLAS));
    //                  Timing  GFLOPS  GiByte/s
    // ------------------------------------------
    // MatVec           0.1     3.6     29
    // MatVec_cBLAS     0.074   5       40
    results.push_back(test_C(25, 500, 1000, 1500, MatMat));
    results.push_back(test_C(25, 500, 1000, 1500, MatMat_cBLAS));
    //                  Timing  GFLOPS  GiByte/s
    // ------------------------------------------
    // MatMat           0.57    2.5     20
    // MatMat_cBLAS     0.019   75      6e+02   // unrealistic
    results.push_back(test_D(100, 100, 1000000));
    //                  Timing  GFLOPS  GiByte/s
    // ------------------------------------------
    //                  0.11    2.5     20
    cout << endl << "Timing\tGFLOPS\tGiByte/s" << endl;
    cout << "------------------------------" << endl;
    for (size_t i = 0; i < results.size(); ++i)
        cout << results[i][0] << "\t" << results[i][1] << "\t" << results[i][2] << endl;
    cout << endl;
    // ---------------------------------- 5. ----------------------------------
    // 5.(a) Observation: time to calculate norm is approximately half the time as for the scalar product.
    // Reason: only have to access entries of x, so less memory that has to be accessed
    //
    // 5.(b) Runtime for Kahan_scalar is roughly the same as for the normal scalar product
    // ---------------------------------- 6. ----------------------------------
    // see 4.
    // ---------------------------------- 7. ----------------------------------
    CheckCorrectness();
    // Checked correctness by computing the inverse of A
    CheckDuration(5000);
    // The solving time per RHS scales roughly with factor 1/n_rhs
    // ---------------------------------- 8. ----------------------------------
    // done seperately
    return 0;
 }
--- a/ex3_benchmarks/small_Doxyfile
+++ b/ex3_benchmarks/small_Doxyfile
--- a/ex3_benchmarks/test_system.txt
+++ b/ex3_benchmarks/test_system.txt
@ -0,0 +1,70 @@
 ---------------------------------- 1. ----------------------------------
 rm -f *.exe *.o
 gcc -O3    -c -o mysecond.o mysecond.c
 gcc -O3  -c mysecond.c
 gfortran -O3 -DSTREAM_ARRAY_SIZE=80000000 -DNTIMES=20   -c stream.f
 gfortran -O3 stream.o mysecond.o -o stream_f.exe
 gcc -O3  -DSTREAM_ARRAY_SIZE=80000000 -DNTIMES=20  stream.c -o stream_c.exe
 gcc -O3  -DUNIX flops.c -o flops.exe
 flops.c: In function ‘main’:
 flops.c:231:4: warning: implicit declaration of function ‘dtime’ [-Wimplicit-function-declaration]
  231 |    dtime(TimeArray);
      |    ^~~~~
 flops.c: At top level:
 flops.c:723:1: warning: return type defaults to ‘int’ [-Wimplicit-int]
  723 | dtime(p)
      | ^~~~~
 ./stream_c.exe
 -------------------------------------------------------------
 STREAM version $Revision: 5.10 $
 -------------------------------------------------------------
 This system uses 8 bytes per array element.
 -------------------------------------------------------------
 Array size = 80000000 (elements), Offset = 0 (elements)
 Memory per array = 610.4 MiB (= 0.6 GiB).
 Total memory required = 1831.1 MiB (= 1.8 GiB).
 Each kernel will be executed 20 times.
 The *best* time for each kernel (excluding the first iteration)
 will be used to compute the reported bandwidth.
 -------------------------------------------------------------
 Your clock granularity/precision appears to be 1 microseconds.
 Each test below will take on the order of 79294 microseconds.
   (= 79294 clock ticks)
 Increase the size of the arrays if this shows that
 you are not getting at least 20 clock ticks per test.
 -------------------------------------------------------------
 WARNING -- The above is only a rough guideline.
 For best results, please be sure you know the
 precision of your system timer.
 -------------------------------------------------------------
 Function    Best Rate MB/s  Avg time     Min time     Max time
 Copy:           26720.6     0.057416     0.047903     0.098979
 Scale:          17008.6     0.087616     0.075256     0.133899
 Add:            19169.9     0.113818     0.100157     0.177676
 Triad:          19144.9     0.111248     0.100288     0.170877
 -------------------------------------------------------------
 Solution Validates: avg error less than 1.000000e-13 on all three arrays
 -------------------------------------------------------------
 ./flops.exe
   FLOPS C Program (Double Precision), V2.0 18 Dec 1992
   Module     Error        RunTime      MFLOPS
                            (usec)
     1      4.0146e-13      0.0023   6183.8896
     2     -1.4166e-13      0.0006  11377.1999
     3      4.7184e-14      0.0034   5031.5222
     4     -1.2557e-13      0.0031   4841.2566
     5     -1.3800e-13      0.0056   5208.6586
     6      3.2380e-13      0.0053   5484.2426
     7     -8.4583e-11      0.0031   3832.8326
     8      3.4867e-13      0.0055   5410.0375
   Iterations      =  512000000
   NullTime (usec) =     0.0000
   MFLOPS(1)       =  8055.7366
   MFLOPS(2)       =  4732.2658
   MFLOPS(3)       =  5164.0037
   MFLOPS(4)       =  5257.0181
--- a/ex3_benchmarks/userset.cpp
+++ b/ex3_benchmarks/userset.cpp
@ -0,0 +1,16 @@
 #include "userset.h"
 #include <cmath>
 /**
  * Evaluates  x^2 * sin(2.5 * 3.14159 * y).
  *
  * @param x  first argument, enters squared
  * @param y  second argument, enters through the sine term
  * @return   x * x * sin(2.5 * 3.14159 * y)
  */
 double FunctF(double const x, double const y)
 {
     // Alternative definitions that are currently disabled:
     //   std::sin(3.14159*1*x)*std::sin(3.14159*1*y)
     //   16.0*1024.
     //   (double)1.0
     double const sine_factor = 2.5 * 3.14159;   // 3.14159: truncated value of pi as used in this file
     double const x_squared   = x * x;
     return x_squared * std::sin(sine_factor * y);
 }
 /**
  * Constant function: yields 1.0 for every argument pair.
  *
  * Both parameters are left unnamed because the body does not use them.
  *
  * @return always 1.0
  */
 double FunctU(const double /* x */, double const /* y */)
 {
     double const constant_value = 1.0;
     return constant_value;
 }