diff --git a/ex1/code/Makefile b/ex1/code/Makefile index 06ea13a..d7f2674 100644 --- a/ex1/code/Makefile +++ b/ex1/code/Makefile @@ -22,7 +22,7 @@ ${PROGRAM}: ${OBJECTS} $(LINKER) ${OBJECTS} ${LINKFLAGS} -o ${PROGRAM} clean: - rm -f ${OBJECTS} ${PROGRAM} + rm -f ${OBJECTS} ${PROGRAM} out_1.txt run: ${PROGRAM} diff --git a/ex3/code/Makefile b/ex3/code/Makefile new file mode 100644 index 0000000..ce63d9a --- /dev/null +++ b/ex3/code/Makefile @@ -0,0 +1,30 @@ +PROGRAM = main + +SOURCES = $(wildcard *.cpp) +OBJECTS = ${SOURCES:.cpp=.o} + +CXX = g++ +LINKER = g++ + +WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \ + -Wredundant-decls -fmax-errors=1 + +CXXFLAGS = -g -flto -O3 -ffast-math -march=native ${WARNINGS} +LINKFLAGS = -g -flto -O3 -lopenblas -llapacke + + +all: ${PROGRAM} + +# %.o: %.cpp +# ${CXX} ${CXXFLAGS} -c $< -o $@ + +${PROGRAM}: ${OBJECTS} + $(LINKER) ${OBJECTS} ${LINKFLAGS} -o ${PROGRAM} + +clean: + rm -f ${OBJECTS} ${PROGRAM} + + +run: ${PROGRAM} +# run: clean ${PROGRAM} + ./${PROGRAM} \ No newline at end of file diff --git a/ex3/code/main.cpp b/ex3/code/main.cpp new file mode 100644 index 0000000..9f91412 --- /dev/null +++ b/ex3/code/main.cpp @@ -0,0 +1,267 @@ + +#include "task_3.h" +#include "task_4+6.h" +#include "task_5.h" +#include "task_7.h" +#include "timing.h" + +#include +#include +#include // cBLAS Library +#include + +void task_1() { + printf("\n\n-------------- Task 1 --------------\n\n"); + printf("See comment in main.cpp"); + + // ------------------------------------------------------------- + // STREAM version $Revision: 5.10 $ + // ------------------------------------------------------------- + // This system uses 8 bytes per array element. + // ------------------------------------------------------------- + // Array size = 80000000 (elements), Offset = 0 (elements) + // Memory per array = 610.4 MiB (= 0.6 GiB). + // Total memory required = 1831.1 MiB (= 1.8 GiB). + // Each kernel will be executed 20 times. + // The *best* time for each kernel (excluding the first iteration) + // will be used to compute the reported bandwidth. + // ------------------------------------------------------------- + // Your clock granularity/precision appears to be 1 microseconds. + // Each test below will take on the order of 116886 microseconds. + // (= 116886 clock ticks) + // Increase the size of the arrays if this shows that + // you are not getting at least 20 clock ticks per test. + // ------------------------------------------------------------- + // WARNING -- The above is only a rough guideline. + // For best results, please be sure you know the + // precision of your system timer. 
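+    //  For reference, the four STREAM kernels are  Copy: c=a,  Scale: b=q*c,
+    //  Add: c=a+b  and  Triad: a=b+q*c.  The reported "Best Rate" is the
+    //  bytes moved per kernel divided by the minimal time, counting 2 arrays
+    //  for Copy/Scale and 3 arrays for Add/Triad,
+    //  e.g. Copy: 2 * 80'000'000 * 8 B / 0.043288 s ≈ 29569 MB/s.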
+ // ------------------------------------------------------------- + // Function Best Rate MB/s Avg time Min time Max time + // Copy: 29569.4 0.048585 0.043288 0.059164 + // Scale: 17644.0 0.082248 0.072546 0.102548 + // Add: 21030.1 0.100620 0.091298 0.124700 + // Triad: 21230.7 0.100758 0.090435 0.120631 + // ------------------------------------------------------------- + // Solution Validates: avg error less than 1.000000e-13 on all three arrays + // ------------------------------------------------------------- + // ./flops.exe + + // FLOPS C Program (Double Precision), V2.0 18 Dec 1992 + + // Module Error RunTime MFLOPS + // (usec) + // 1 4.0146e-13 0.0024 5827.9076 + // 2 -1.4166e-13 0.0007 10037.8942 + // 3 4.7184e-14 0.0039 4371.9185 + // 4 -1.2557e-13 0.0034 4355.5711 + // 5 -1.3800e-13 0.0066 4415.6439 + // 6 3.2380e-13 0.0065 4441.6299 + // 7 -8.4583e-11 0.0053 2277.1707 + // 8 3.4867e-13 0.0069 4367.6094 + + // Iterations = 512000000 + // NullTime (usec) = 0.0000 + // MFLOPS(1) = 7050.6178 + // MFLOPS(2) = 3461.6233 + // MFLOPS(3) = 4175.0442 + // MFLOPS(4) = 4389.7311 +} + +void task_2() { + printf("\n\n-------------- Task 2 --------------\n\n"); + printf("See comment in main.cpp"); + + // Memory needed (double 64-bit, 8 bytes): + // (A) (2N + 1) * 8 bytes + // (B) (M*N + M + N) * 8 bytes + // (C) (M*L + L*N + M*N) * 8 bytes + // (D) (N + N + p) * 8 bytes + + // Floating point operations: + // (A) 2N + // (B) M * 2N + // (C) M * 2L * N + // (D) 2 * N * p (Horner Schema) + + // Read/Write operations: + // (A) Read: 2N Write: 1 + // (B) Read: M*2N Write: M*N + // (C) Read: M*2L*N Write: M*L*N + // (D) Read: 2*N*p Write: N*P +} + +void task_3() { + printf("\n\n-------------- Task 3 --------------\n\n"); + printf("Functions implemented in task_3.cpp"); +} + +void task_4(bool cblas = false) { + if (cblas == false) {printf("\n\n-------------- Task 4 --------------\n\n");} + size_t M, N, L, p, NLOOPS; + + { // Scalar product + printf("----- Benchmark (A) -----\n"); + // Initialization + N = 50'000'000; + NLOOPS = 50; + auto [x,y] = init_A(N); + // Benchmark + tic(); + benchmark_A(x, y, NLOOPS, cblas); + double sec = toc() / NLOOPS; + // Timings and Performance + size_t memory = 2 * N; + size_t flops = 2 * N; + print_performance(sec, memory, flops, sizeof(x[0])); + printf("-------------------------\n"); + } + + { // Matrix-Vector product + printf("----- Benchmark (B) -----\n"); + // Initialization + M = 8'000; + N = 12'000; + NLOOPS = 30; + auto [A,x] = init_B(M,N); + // Benchmark + tic(); + benchmark_B(A, x, NLOOPS, cblas); + double sec = toc() / NLOOPS; + // Timings and Performance + size_t memory = M*N + M + N; + size_t flops = 2 * M * N; + print_performance(sec, memory, flops, sizeof(A[0])); + printf("-------------------------\n"); + } + + { // Matrix-Matrix product + printf("----- Benchmark (C) -----\n"); + // Initialization + M = 1'000; + N = 2'000; + L = 500; + NLOOPS = 20; + auto [A,B] = init_C(M,N,L); + // Benchmark + tic(); + benchmark_C(A, B, L, NLOOPS, cblas); + double sec = toc() / NLOOPS; + // Timings and Performance + size_t memory = M*L + L*N + M*N; + size_t flops = M * 2*L * N; + print_performance(sec, memory, flops, sizeof(A[0])); + printf("-------------------------\n"); + } + + if (cblas == false) + { // Polynomial evaluation + printf("----- Benchmark (D) -----\n"); + // Initialization + N = 1'000'000; + p = 200; + NLOOPS = 20; + auto [x,a] = init_D(N,p); + // Benchmark + tic(); + benchmark_D(x, a, NLOOPS); + double sec = toc() / NLOOPS; + // Timings and Performance 
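+            // Horner's scheme evaluates y = ((a_{p-1}*x + a_{p-2})*x + ...)*x + a_0
+            // with one multiply and one add per coefficient, i.e. ~2*p flops per
+            // point and 2*N*p flops in total; per point only x[i] is read and y[i]
+            // written (the p coefficients stay in cache), hence ~2*N doubles moved.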
+ size_t memory = 2.0 * N; + size_t flops = 2.0 * N * p; + print_performance(sec, memory, flops, sizeof(x[0])); + printf("-------------------------\n"); + } +} + +void task_5() { + printf("\n\n-------------- Task 5 --------------\n\n"); + + printf("----- Benchmark norm -----\n"); + // Initialization + size_t N =50'000'000; + size_t NLOOPS = 50; + vector x = init_norm(N); + // Benchmark + tic(); + benchmark_norm(x, NLOOPS); + double sec = toc() / NLOOPS; + // Timings and Performance + size_t memory = N; + size_t flops = 2 * N; + print_performance(sec, memory, flops, sizeof(x[0])); + printf("-------------------------\n"); + printf("What do you observe? Why?\n"); + printf("-> Faster per loop than scalar product, only loads elements of 1 vector, instead of 2."); +} + +void task_6() { + printf("\n\n-------------- Task 6 --------------\n\n"); + printf("Benchmarks using cBLAS\n"); + task_4(true); +} + +void task_7() { + printf("\n\n-------------- Task 7 --------------\n\n"); + { // Check Ax=b + size_t N=5, Nrhs=2; + auto [A,b] = init_M(N,Nrhs); + vector A_og = A; + + printf("A ="); + print_matrix(A,N,N); + printf("b ="); + print_matrix(b,N,Nrhs); + + int lda=N, ldb=Nrhs; + vector ipiv(N); + LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N, N, A.data(), lda, ipiv.data()); + LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', N, Nrhs, A.data(), lda, ipiv.data(), b.data(), ldb); + + printf("L + U ="); + print_matrix(A,N,N); + printf("x ="); + print_matrix(b,N,Nrhs); + + int ldc=Nrhs; + vector C(N*Nrhs); + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, N, Nrhs, N, 1.0, A_og.data(), lda, b.data(), ldb, 0.0, C.data(), ldc); + printf("Check solution:\nA * x = "); + print_matrix(C,N,Nrhs); + } + // ################################# + // Benchmark + + cout << fixed << setprecision(4); // 4 digits after decimal + size_t NLOOPS = 1000; + cout << "N = " << " | 1 | 2 | 4 | 8 | 16 | 32 " << endl; + cout << "---------|--------|--------|--------|--------|--------|-------" << endl; + for (int exp = 1; exp < 10; ++exp) { + cout << "Nrhs = " << static_cast(pow(2,exp)); + for (size_t N : {1, 2, 4, 8, 16, 32}) { + tic(); + for (size_t i = 0; i < NLOOPS; ++i) { + benchmark_lapacke(N, static_cast(pow(2,exp))); + } + double sec = toc(); + cout << " | " << sec; + } + cout << endl; + } + printf("\nFor fixed n, the solution time per rhs does not slow down consistently and scales very well.\nIts faster than expected."); + + + +} + +int main() { + task_1(); + task_2(); + task_3(); + task_4(); + task_5(); + task_6(); + task_7(); + printf("\n\n"); + + return 0; +} \ No newline at end of file diff --git a/ex3/code/task_3.cpp b/ex3/code/task_3.cpp new file mode 100644 index 0000000..22a0482 --- /dev/null +++ b/ex3/code/task_3.cpp @@ -0,0 +1,61 @@ +#include "task_3.h" +#include +#include +#include +#include +using namespace std; + + +double scalar(vector const &x, vector const &y) { + assert(x.size() == y.size()); + size_t const N = x.size(); + double sum = 0.0; + for (size_t i = 0; i < N; ++i) { + sum += x[i] * y[i]; + } + return sum; +} + + +vector matrix_vec(vector const &A, vector const &x) { + size_t const N = x.size(); + size_t const M = A.size() / N; + vector b(M); + + for (size_t i = 0; i < M; ++i) { + for (size_t j = 0; j < N; ++j) { + b[i] += A[i*N + j] * x[j]; + } + } + return b; +} + + +vector matrix_matrix(vector const &A, vector const &B, size_t const &M) { + size_t const L = A.size() / M; + size_t const N = B.size() / L; + vector C(M*N,0); + + for (size_t i = 0; i < M; ++i) { + for (size_t k = 0; k < L; ++k) { + for (size_t j = 0; j < 
N; ++j) { + C[i*N + j] += A[i*L + k] * B[k*N + j]; + } + } + } + return C; +} + + +vector poly(vector const &x, vector const &a) { + size_t N = x.size(); + size_t p = a.size(); + vector y(N); + for (size_t i = 0; i < N; ++i) { + y[i] = a[p]; + for (size_t k = 1; k < p; ++k) { + y[i] = y[i]*x[i] + a[p-k]; + } + } + return y; +} \ No newline at end of file diff --git a/ex3/code/task_3.h b/ex3/code/task_3.h new file mode 100644 index 0000000..9477462 --- /dev/null +++ b/ex3/code/task_3.h @@ -0,0 +1,8 @@ +#pragma once +#include +using namespace std; + +double scalar(vector const &x, vector const &y); +vector matrix_vec(vector const &A, vector const &x); +vector matrix_matrix(vector const &A, vector const &B, size_t const &M); +vector poly(vector const &x, vector const &a); \ No newline at end of file diff --git a/ex3/code/task_4+6.cpp b/ex3/code/task_4+6.cpp new file mode 100644 index 0000000..bcde52c --- /dev/null +++ b/ex3/code/task_4+6.cpp @@ -0,0 +1,148 @@ +#include "task_3.h" +#include "task_4+6.h" +#include "timing.h" +#include +#include +#include // cBLAS Library +using namespace std; + +void print_performance(double sec, size_t memory, size_t flops, unsigned int size) { + printf("Memory allocated : %.3f GByte\n", 1.0 * memory / 1024 / 1024 / 1024 * size); + printf("Duration per loop : %.3f sec\n", sec); + printf("GFLOPS : %.3f\n", 1.0 * flops / sec / 1024 / 1024 / 1024); + printf("GiByte/s : %.3f\n", 1.0 * memory / sec / 1024 / 1024 / 1024 * size); +} + +tuple, vector> init_A(size_t N) { + vector x(N), y(N); + for (size_t i = 0; i < N; ++i) { + x[i] = i%219 + 1.0; + y[i] = 1.0 / x[i]; + } + return make_tuple(x, y); +} + +void benchmark_A(vector const &x, vector const &y, size_t NLOOPS, bool cblas) { + size_t N = x.size(); + + double s(0.0), sum(0.0); + if (cblas == false) { + for (size_t i = 0; i < NLOOPS; ++i) { + s = scalar(x, y); + sum += s; + } + } else if (cblas == true) { + for (size_t i = 0; i < NLOOPS; ++i) { + s = cblas_ddot(N, x.data(), 1, y.data(), 1); + sum += s; + } + } + + // Check correctness + if (static_cast(sum) != N*NLOOPS) {printf(" !! W R O N G result !!\n");} +} + +tuple, vector> init_B(size_t M, size_t N) { + vector A(M*N), x(N); + for (size_t i = 0; i < M; ++i) { + for (size_t j = 0; j < N; ++j) { + A[i*N + j] = (i+j)%219 + 1.0; + } + } + for (size_t j = 0; j < N; ++j) { + x[j] = 1.0/A[17*N + j]; + } + return make_tuple(A, x); +} + +void benchmark_B(vector const &A, vector const &x, size_t NLOOPS, bool cblas) { + size_t N = x.size(); + size_t M = A.size() / N; + vector b(M); + double sum(0.0); + + if (cblas == false) { + for (size_t i = 0; i < NLOOPS; ++i) { + b = matrix_vec(A,x); + sum += b[17]; + } + } else if (cblas == true) { + for (size_t i = 0; i < NLOOPS; ++i) { + cblas_dgemv(CblasRowMajor, CblasNoTrans, M, N, 1.0, A.data(), N, x.data(), 1, 0, b.data(), 1); + sum += b[17]; + } + } + + // Check correctness + if (static_cast(sum) != N*NLOOPS) {printf(" !! 
W R O N G result !!\n");} +} + +tuple, vector> init_C(size_t M, size_t N, size_t L) { + vector A(M*L), B(L*N); + for (size_t i = 0; i < M; ++i) { + for (size_t j = 0; j < L; ++j) { + A[i*L + j] = (i+j)%219 + 1.0; + } + } + // B chosen such that C[0,17]=L + // so B[i,17] = 1/A[0,i] + for (size_t i = 0; i < L; ++i) { + for (size_t j = 0; j < N; ++j) { + if (j==17) { + B[i*N + 17] = 1.0/A[i]; + } else { + B[i*N + j] = (i+j)%219 + 1.0; + } + } + } + return make_tuple(A, B); +} + +void benchmark_C(vector const &A, vector const &B, size_t L, size_t NLOOPS, bool cblas) { + size_t M = A.size() / L; + size_t N = B.size() / L; + vector C(M*N); + double sum(0.0); + + if (cblas == false) { + for (size_t i = 0; i < NLOOPS; ++i) { + C = matrix_matrix(A,B,M); + sum += C[17]; + } + } else if (cblas == true) { + for (size_t i = 0; i < NLOOPS; ++i) { + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, L, 1.0, A.data(), L, B.data(), N, 0.0, C.data(), N); + sum += C[17]; + } + } + + // Check correctness + if (static_cast(sum) != L*NLOOPS) {printf(" !! W R O N G result !!\n");} +} + +tuple, vector> init_D(size_t N, size_t p) { + // x_i = i/N for i=0,...,N-1 + // a_j = 1 for j=0,...,p-1 + vector x(N), a(p); + for (size_t i = 0; i < N; ++i) { + x[i] = static_cast(i) / N; + } + for (size_t j = 0; j < p; ++j) { + a[j] = 1.0; + } + return make_tuple(x, a); +} + +void benchmark_D(vector const &x, vector const &a, size_t NLOOPS) { + size_t N = x.size(); + vector y(N); + double sum(0.0); + + for (size_t i = 0; i < NLOOPS; ++i) { + y = poly(x,a); + sum += y[0]; + } + + // Check correctness + if (static_cast(sum) != NLOOPS) {printf(" !! W R O N G result sum = %f !!\n", sum);} +} \ No newline at end of file diff --git a/ex3/code/task_4+6.h b/ex3/code/task_4+6.h new file mode 100644 index 0000000..1bf54ef --- /dev/null +++ b/ex3/code/task_4+6.h @@ -0,0 +1,14 @@ +#pragma once +#include +using namespace std; + +void print_performance(double sec, size_t memory, size_t flops, unsigned int size); +tuple, vector> init_A(size_t N); +tuple, vector> init_B(size_t M, size_t N); +tuple, vector> init_C(size_t M, size_t N, size_t L); +tuple, vector> init_D(size_t N, size_t p); + +void benchmark_A(vector const &x, vector const &y, size_t NLOOPS, bool cblas); +void benchmark_B(vector const &A, vector const &x, size_t NLOOPS, bool cblas); +void benchmark_C(vector const &A, vector const &B, size_t L, size_t NLOOPS, bool cblas); +void benchmark_D(vector const &x, vector const &a, size_t NLOOPS); \ No newline at end of file diff --git a/ex3/code/task_5.cpp b/ex3/code/task_5.cpp new file mode 100644 index 0000000..7c1812f --- /dev/null +++ b/ex3/code/task_5.cpp @@ -0,0 +1,33 @@ +#include "task_4+6.h" +#include "task_5.h" +#include "timing.h" +#include +#include +#include +using namespace std; + +double norm(vector const &x) { + size_t N = x.size(); + double sum = 0.0; + for (size_t i = 0; i < N; ++i) { + sum += x[i] * x[i]; + } + return sqrt(sum); +} + +vector init_norm(size_t N) { + vector x(N); + for (size_t i = 0; i < N; ++i) { + x[i] = i%219 + 1.0; + } + return x; +} + +void benchmark_norm(vector const &x, size_t NLOOPS) { + double s(0.0), sum(0.0); + for (size_t i = 0; i < NLOOPS; ++i) { + s = norm(x); + sum += s; + } + printf("||x|| = %f\n", sum/NLOOPS); +} \ No newline at end of file diff --git a/ex3/code/task_5.h b/ex3/code/task_5.h new file mode 100644 index 0000000..24609ba --- /dev/null +++ b/ex3/code/task_5.h @@ -0,0 +1,8 @@ +#pragma once + +#include +using namespace std; + +double norm(vector const &x); +vector 
init_norm(size_t N); +void benchmark_norm(vector const &x, size_t NLOOPS); \ No newline at end of file diff --git a/ex3/code/task_7.cpp b/ex3/code/task_7.cpp new file mode 100644 index 0000000..1121ec7 --- /dev/null +++ b/ex3/code/task_7.cpp @@ -0,0 +1,43 @@ +#include "task_7.h" + +#include +#include +#include +#include +using namespace std; + +tuple, vector> init_M(size_t N, size_t Nrhs) { + vector A(N*N), b(N*Nrhs); + for (size_t i = 0; i < N; ++i) { + for (size_t j = 0; j < N; ++j) { + if (i != j) { + A[i*N + j] = 1.0 / pow(abs(1.0*i-1.0*j),2); + } else { + A[i*N + j] = 4; + } + } + for (size_t j=0; j < Nrhs; ++j) { + b[i*Nrhs + j] = 1.0*j; + } + } + return make_tuple(A, b); +} + +void print_matrix(vector &A, size_t M, size_t N) { + printf("\n"); + for (size_t i = 0; i < M; ++i){ + for (size_t j = 0; j < N; ++j) { + printf("%f ", A[i*N + j]); + } + printf("\n"); + } + printf("\n\n"); +} + +void benchmark_lapacke(int N, int Nrhs) { + auto [A,b] = init_M(N,Nrhs); + int lda=N, ldb=Nrhs; + vector ipiv(N); + LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N, N, A.data(), lda, ipiv.data()); + LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', N, Nrhs, A.data(), lda, ipiv.data(), b.data(), ldb); +} \ No newline at end of file diff --git a/ex3/code/task_7.h b/ex3/code/task_7.h new file mode 100644 index 0000000..6965da3 --- /dev/null +++ b/ex3/code/task_7.h @@ -0,0 +1,7 @@ +#pragma once +#include +using namespace std; + +tuple, vector> init_M(size_t N, size_t Nrhs); +void print_matrix(vector &A, size_t M, size_t N); +void benchmark_lapacke(int N, int Nrhs); \ No newline at end of file diff --git a/ex3/code/timing.h b/ex3/code/timing.h new file mode 100644 index 0000000..7e52921 --- /dev/null +++ b/ex3/code/timing.h @@ -0,0 +1,51 @@ +// +// Gundolf Haase, Oct 18 2024 +// +#pragma once +#include // timing +#include + +//using Clock = std::chrono::system_clock; //!< The wall clock timer chosen +using Clock = std::chrono::high_resolution_clock; +using TPoint= std::chrono::time_point; + +// [Galowicz, C++17 STL Cookbook, p. 29] +inline +std::stack MyStopWatch; //!< starting time of stopwatch + +/** Starts stopwatch timer. + * Use as @code tic(); myfunction(...) ; double tsec = toc(); @endcode + * + * The timining can be nested and the recent time point is stored on top of the stack. + * + * @return recent time point + * @see toc + */ +inline auto tic() +{ + MyStopWatch.push(Clock::now()); + return MyStopWatch.top(); +} + +/** Returns the elapsed time from stopwatch. + * + * The time point from top of the stack is used + * if time point @p t_b is not passed as input parameter. + * Use as @code tic(); myfunction(...) ; double tsec = toc(); @endcode + * or as @code auto t_b = tic(); myfunction(...) ; double tsec = toc(t_b); @endcode + * The last option is to be used in the case of + * non-nested but overlapping time measurements. + * + * @param[in] t_b start time of some stop watch + * @return elapsed time in seconds. 
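+ * @note Each call pops one time point off the internal stopwatch stack
+ *       (also when @p t_b is passed explicitly), so tic() and toc() have
+ *       to be called in matching pairs.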
+ * +*/ +inline double toc(TPoint const &t_b = MyStopWatch.top()) +{ + // https://en.cppreference.com/w/cpp/chrono/treat_as_floating_point + using Unit = std::chrono::seconds; + using FpSeconds = std::chrono::duration; + auto t_e = Clock::now(); + MyStopWatch.pop(); + return FpSeconds(t_e-t_b).count(); +} diff --git a/ex3/ex_3.pdf b/ex3/ex_3.pdf new file mode 100644 index 0000000..fefc770 Binary files /dev/null and b/ex3/ex_3.pdf differ diff --git a/ex3/stream/CLANG_default.mk b/ex3/stream/CLANG_default.mk new file mode 100644 index 0000000..1a2fde4 --- /dev/null +++ b/ex3/stream/CLANG_default.mk @@ -0,0 +1,124 @@ +# Basic Defintions for using GNU-compiler suite sequentially +# requires setting of COMPILER=CLANG_ + +#CLANGPATH=//usr/lib/llvm-10/bin/ +CC = ${CLANGPATH}clang +CXX = ${CLANGPATH}clang++ +#CXX = ${CLANGPATH}clang++ -lomptarget -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=/opt/pgi/linux86-64/2017/cuda/8.0 +#F77 = gfortran +LINKER = ${CXX} + +#http://clang.llvm.org/docs/UsersManual.html#options-to-control-error-and-warning-messages +WARNINGS += -Weverything +WARNINGS += -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-sign-conversion -Wno-date-time -Wno-shorten-64-to-32 -Wno-padded +WARNINGS += -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic -ferror-limit=1 +#-fsyntax-only -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic + +CXXFLAGS += -O3 -std=c++17 -ferror-limit=1 ${WARNINGS} +# don't use -Ofast +# -ftrapv +LINKFLAGS += -O3 + +# different libraries in Ubuntu or manajaró +ifndef UBUNTU +UBUNTU=1 +endif + +# BLAS, LAPACK +LINKFLAGS += -llapack -lblas +# -lopenblas +ifeq ($(UBUNTU),1) +# ubuntu +else +# on archlinux +LINKFLAGS += -lcblas +endif + +# interprocedural optimization +CXXFLAGS += -flto +LINKFLAGS += -flto + +# very good check +# http://clang.llvm.org/extra/clang-tidy/ +# good check, see: http://llvm.org/docs/CodingStandards.html#include-style +SWITCH_OFF=,-readability-magic-numbers,-readability-redundant-control-flow,-readability-redundant-member-init +SWITCH_OFF+=,-readability-redundant-member-init,-readability-isolate-declaration +#READABILITY=,readability*${SWITCH_OFF} +#TIDYFLAGS = -checks=llvm-*,-llvm-header-guard -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp" +TIDYFLAGS = -checks=llvm-*,-llvm-header-guard${READABILITY} -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp" +#TIDYFLAGS += -checks='modernize* +# ??? +#TIDYFLAGS = -checks='cert*' -header-filter=.* +# MPI checks ?? +#TIDYFLAGS = -checks='mpi*' +# ?? +#TIDYFLAGS = -checks='performance*' -header-filter=.* +#TIDYFLAGS = -checks='portability-*' -header-filter=.* +#TIDYFLAGS = -checks='readability-*' -header-filter=.* + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + @rm -f ${PROGRAM} ${OBJECTS} + +clean_all:: clean + @rm -f *_ *~ *.bak *.log *.out *.tar + +codecheck: tidy_check +tidy_check: + clang-tidy ${SOURCES} ${TIDYFLAGS} -- ${SOURCES} +# see also http://clang-developers.42468.n3.nabble.com/Error-while-trying-to-load-a-compilation-database-td4049722.html + +run: clean ${PROGRAM} +# time ./${PROGRAM} ${PARAMS} + ./${PROGRAM} ${PARAMS} + +# tar the current directory +MY_DIR = `basename ${PWD}` +tar: clean_all + @echo "Tar the directory: " ${MY_DIR} + @cd .. 
;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * + +doc: + doxygen Doxyfile + +######################################################################### + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# some tools +# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags) +cache: ${PROGRAM} + valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS} +# kcachegrind callgrind.out. & + kcachegrind `ls -1tr callgrind.out.* |tail -1` + +# Check for wrong memory accesses, memory leaks, ... +# use smaller data sets +mem: ${PROGRAM} + valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS} + +# Simple run time profiling of your code +# CXXFLAGS += -g -pg +# LINKFLAGS += -pg +prof: ${PROGRAM} + perf record ./$^ ${PARAMS} + perf report +# gprof -b ./$^ > gp.out +# kprof -f gp.out -p gprof & + +codecheck: tidy_check diff --git a/ex3/stream/GCC_default.mk b/ex3/stream/GCC_default.mk new file mode 100644 index 0000000..32cec75 --- /dev/null +++ b/ex3/stream/GCC_default.mk @@ -0,0 +1,183 @@ +# Basic Defintions for using GNU-compiler suite sequentially +# requires setting of COMPILER=GCC_ + +CC = gcc +CXX = g++ +F77 = gfortran +LINKER = ${CXX} + +#LINKFLAGS += -lblas +# The header requires extern "C". + +WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \ + -Wredundant-decls -Winline -fmax-errors=1 +# -Wunreachable-code +#CXXFLAGS += -ffast-math -march=native ${WARNINGS} +CXXFLAGS += -O3 -funroll-all-loops -std=c++17 ${WARNINGS} +#-msse3 +# -ftree-vectorizer-verbose=2 -DNDEBUG +# -ftree-vectorizer-verbose=5 +# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump -fdump-tree-pre=stderr + +# CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details +# CFLAGS = -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2 +# #CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp +# FFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp +# LFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp +LINKFLAGS += -O3 + +#architecture +#CPU = -march=znver2 +CXXFLAGS += ${CPU} +LINKFLAGS += ${CPU} + +# different libraries in Ubuntu or manajaró +ifndef UBUNTU +UBUNTU=1 +endif + +# BLAS, LAPACK +ifeq ($(UBUNTU),1) +LINKFLAGS += -llapack -lblas +# -lopenblas +else +# on archlinux +LINKFLAGS += -llapack -lopenblas -lcblas +endif + +# interprocedural optimization +CXXFLAGS += -flto +LINKFLAGS += -flto + +# for debugging purpose (save code) +# -fsanitize=leak # only one out the three can be used +# -fsanitize=address +# -fsanitize=thread +SANITARY = -fsanitize=address -fsanitize=undefined -fsanitize=null -fsanitize=return \ + -fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \ + -fsanitize=bool -fsanitize=enum -fsanitize=vptr +#CXXFLAGS += ${SANITARY} +#LINKFLAGS += ${SANITARY} + +# profiling tools +#CXXFLAGS += -pg +#LINKFLAGS += -pg + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + @rm -f ${PROGRAM} ${OBJECTS} + +clean_all:: clean + -@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt + -@rm -rf html + +run: clean ${PROGRAM} +#run: ${PROGRAM} +# time ./${PROGRAM} ${PARAMS} + ./${PROGRAM} ${PARAMS} + +# tar the current 
directory +MY_DIR = `basename ${PWD}` +tar: clean_all + @echo "Tar the directory: " ${MY_DIR} + @cd .. ;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * + +zip: clean + @echo "Zip the directory: " ${MY_DIR} + @cd .. ;\ + zip -r ${MY_DIR}.zip ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} + +doc: + doxygen Doxyfile + +######################################################################### +.SUFFIXES: .f90 + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< +# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $<.log +# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $(<:.cpp=.log) + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +.f90.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# some tools +# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags) +cache: ${PROGRAM} + valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS} +# kcachegrind callgrind.out. & + kcachegrind `ls -1tr callgrind.out.* |tail -1` + +# Check for wrong memory accesses, memory leaks, ... +# use smaller data sets +# no "-pg" in compile/link options +mem: ${PROGRAM} + valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS} +# Graphical interface +# valkyrie + +# Simple run time profiling of your code +# CXXFLAGS += -g -pg +# LINKFLAGS += -pg +prof: ${PROGRAM} + perf record ./$^ ${PARAMS} + perf report +# gprof -b ./$^ > gp.out +# kprof -f gp.out -p gprof & + +# perf in Ubuntu 20.04: https://www.howtoforge.com/how-to-install-perf-performance-analysis-tool-on-ubuntu-20-04/ +# * install +# * sudo vi /etc/sysctl.conf +# add kernel.perf_event_paranoid = 0 + +#Trace your heap: +#> heaptrack ./main.GCC_ +#> heaptrack_gui heaptrack.main.GCC_..gz +heap: ${PROGRAM} + heaptrack ./$^ ${PARAMS} 11 + heaptrack_gui `ls -1tr heaptrack.$^.* |tail -1` & + +codecheck: $(SOURCES) + cppcheck --enable=all --inconclusive --std=c++17 --suppress=missingIncludeSystem $^ + + +######################################################################## +# get the detailed status of all optimization flags +info: + echo "detailed status of all optimization flags" + $(CXX) --version + $(CXX) -Q $(CXXFLAGS) --help=optimizers + lscpu + inxi -C + lstopo + +# Excellent hardware info +# hardinfo +# Life monitoring of CPU frequency etc. 
+# sudo i7z + +# Memory consumption +# vmstat -at -SM 3 +# xfce4-taskmanager + + +# https://www.tecmint.com/check-linux-cpu-information/ +#https://www.tecmint.com/monitor-cpu-and-gpu-temperature-in-ubuntu/ + +# Debugging: +# https://wiki.archlinux.org/index.php/Debugging diff --git a/ex3/stream/ICC_default.mk b/ex3/stream/ICC_default.mk new file mode 100644 index 0000000..40a765f --- /dev/null +++ b/ex3/stream/ICC_default.mk @@ -0,0 +1,125 @@ +# Basic Defintions for using INTEL compiler suite sequentially +# requires setting of COMPILER=ICC_ + +#BINDIR = /opt/intel/bin/ + +CC = ${BINDIR}icc +CXX = ${BINDIR}icpc +F77 = ${BINDIR}ifort +LINKER = ${CXX} + + +WARNINGS = -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -wd2015,2012 -wn3 +# -Winline -Wredundant-decls -Wunreachable-code +CXXFLAGS += -O3 -fargument-noalias -std=c++17 -DNDEBUG ${WARNINGS} -mkl +# profiling tools +#CXXFLAGS += -pg +#LINKFLAGS += -pg +# -vec-report=3 +# -qopt-report=5 -qopt-report-phase=vec +# -guide -parallel +# -guide-opts=string -guide-par[=n] -guide-vec[=n] +# -auto-p32 -simd -msse3 + +LINKFLAGS += -O3 + +# LAPACK, BLAS: use MKL by INTEL +# LINKFLAGS += -L${BINDIR}../composer_xe_2013.1.117/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread +CXXFLAGS += -mkl +LINKFLAGS += -mkl + +# interprocedural optimization +#CXXFLAGS += -ipo +#LINKFLAGS += -ipo + +# annotated assembler file +ANNOTED = -fsource-asm -S + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + rm -f ${PROGRAM} ${OBJECTS} + +clean_all:: clean + @rm -f *_ *~ *.bak *.log *.out *.tar + +run: clean ${PROGRAM} + ./${PROGRAM} + +# tar the current directory +MY_DIR = `basename ${PWD}` +tar: clean_all + @echo "Tar the directory: " ${MY_DIR} + @cd .. ;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * + +doc: + doxygen Doxyfile + +######################################################################### + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# # some tools +# # Cache behaviour (CXXFLAGS += -g tracks down to source lines) +# cache: ${PROGRAM} +# valgrind --tool=callgrind --simulate-cache=yes ./$^ +# # kcachegrind callgrind.out. & +# +# # Check for wrong memory accesses, memory leaks, ... +# # use smaller data sets +# mem: ${PROGRAM} +# valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ +# +# # Simple run time profiling of your code +# # CXXFLAGS += -g -pg +# # LINKFLAGS += -pg +# prof: ${PROGRAM} +# ./$^ +# gprof -b ./$^ > gp.out +# # kprof -f gp.out -p gprof & +# + + +mem: inspector +prof: amplifier +cache: amplifier + +gap_par_report: + ${CXX} -c -guide -parallel $(SOURCES) 2> gap.txt + +# GUI for performance report +amplifier: ${PROGRAM} + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope +# alternatively to the solution abouve: + #edit file /etc/sysctl.d/10-ptrace.conf and set variable kernel.yama.ptrace_scope variable to 0 . 
+ vtune-gui & + +# GUI for Memory and Thread analyzer (race condition) +inspector: ${PROGRAM} + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope + inspxe-gui & + +advisor: + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope + vtune-gui & + +icc-info: + icpc -# main.cpp + + + + diff --git a/ex3/stream/ONEAPI_default.mk b/ex3/stream/ONEAPI_default.mk new file mode 100644 index 0000000..fe7b3fe --- /dev/null +++ b/ex3/stream/ONEAPI_default.mk @@ -0,0 +1,176 @@ +# Basic Defintions for using INTEL compiler suite sequentially +# requires setting of COMPILER=ONEAPI_ + +# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html +# requires +# source /opt/intel/oneapi/setvars.sh +# on AMD: export MKL_DEBUG_CPU_TYPE=5 + +#BINDIR = /opt/intel/oneapi/compiler/latest/linux/bin/ +#MKL_ROOT = /opt/intel/oneapi/mkl/latest/ +#export KMP_AFFINITY=verbose,compact + +CC = ${BINDIR}icc +CXX = ${BINDIR}dpcpp +F77 = ${BINDIR}ifort +LINKER = ${CXX} + +## Compiler flags +WARNINGS = -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -pedantic +WARNINGS += -Wpessimizing-move -Wredundant-move +#-wd2015,2012,2014 -wn3 +# -Winline -Wredundant-decls -Wunreachable-code +# -qopt-subscript-in-range +# -vec-threshold0 + +CXXFLAGS += -O3 -std=c++17 ${WARNINGS} +#CXXFLAGS += -DMKL_ILP64 -I"${MKLROOT}/include" +#CXXFLAGS += -DMKL_ILP32 -I"${MKLROOT}/include" +LINKFLAGS += -O3 + +# interprocedural optimization +CXXFLAGS += -ipo +LINKFLAGS += -ipo +LINKFLAGS += -flto + +# annotated Assembler file +ANNOTED = -fsource-asm -S + +#architecture +CPU = -march=core-avx2 +#CPU += -mtp=zen +# -xCORE-AVX2 +# -axcode COMMON-AVX512 -axcode MIC-AVX512 -axcode CORE-AVX512 -axcode CORE-AVX2 +CXXFLAGS += ${CPU} +LINKFLAGS += ${CPU} + +# use MKL by INTEL +# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html +# sequential MKL +# use the 32 bit interface (LP64) instead of 64 bit interface (ILP64) +CXXFLAGS += -qmkl=sequential -UMKL_ILP64 +LINKFLAGS += -O3 -qmkl=sequential -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread +#LINKFLAGS += -O3 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread + +# shared libs: https://aur.archlinux.org/packages/intel-oneapi-compiler-static +# install intel-oneapi-compiler-static +# or +LINKFLAGS += -shared-intel + + +OPENMP = -qopenmp +CXXFLAGS += ${OPENMP} +LINKFLAGS += ${OPENMP} + + +# profiling tools +#CXXFLAGS += -pg +#LINKFLAGS += -pg +# -vec-report=3 +# -qopt-report=5 -qopt-report-phase=vec -qopt-report-phase=openmp +# -guide -parallel +# -guide-opts=string -guide-par[=n] -guide-vec[=n] +# -auto-p32 -simd + +# Reports: https://software.intel.com/en-us/articles/getting-the-most-out-of-your-intel-compiler-with-the-new-optimization-reports +#CXXFLAGS += -qopt-report=5 -qopt-report-phase=vec,par +#CXXFLAGS += -qopt-report=5 -qopt-report-phase=cg +# Redirect report from *.optrpt to stderr +# -qopt-report-file=stderr +# Guided paralellization +# -guide -parallel +# -guide-opts=string -guide-par[=n] -guide-vec[=n] +# -auto-p32 -simd + +## run time checks +# https://www.intel.com/content/www/us/en/develop/documentation/fortran-compiler-oneapi-dev-guide-and-reference/top/compiler-reference/compiler-options/offload-openmp-and-parallel-processing-options/par-runtime-control-qpar-runtime-control.html + + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + rm -f ${PROGRAM} ${OBJECTS} *.optrpt + +clean_all:: clean + @rm -f *_ *~ *.bak *.log *.out *.tar + 
+run: clean ${PROGRAM} + ./${PROGRAM} + +# tar the current directory +MY_DIR = `basename ${PWD}` +tar: clean_all + @echo "Tar the directory: " ${MY_DIR} + @cd .. ;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * + +doc: + doxygen Doxyfile + +######################################################################### + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# some tools +# Cache behaviour (CXXFLAGS += -g tracks down to source lines) +# https://software.intel.com/content/www/us/en/develop/documentation/vtune-help/top/analyze-performance/microarchitecture-analysis-group/memory-access-analysis.html + +mem: inspector +prof: vtune +cache: inspector + +gap_par_report: + ${CXX} -c -guide -parallel $(SOURCES) 2> gap.txt + +# GUI for performance report +amplifier: ${PROGRAM} + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope + echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid + amplxe-gui & + +# GUI for Memory and Thread analyzer (race condition) +inspector: ${PROGRAM} + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope +# inspxe-gui & + vtune-gui ./${PROGRAM} & + +advisor: + source /opt/intel/oneapi/advisor/2021.2.0/advixe-vars.sh +# /opt/intel/oneapi/advisor/latest/bin64/advixe-gui & + advisor --collect=survey ./${PROGRAM} +# advisor --collect=roofline ./${PROGRAM} + advisor --report=survey --project-dir=./ src:r=./ --format=csv --report-output=./out/survey.csv + +vtune: + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope +# https://software.intel.com/en-us/articles/intel-advisor-2017-update-1-what-s-new + export ADVIXE_EXPERIMENTAL=roofline + vtune -collect hotspots ./${PROGRAM} + vtune -report hotspots -r r000hs > vtune.out +# vtune-gui ./${PROGRAM} & + +icc-info: + icpc -# main.cpp + +# MKL on AMD +# https://www.computerbase.de/2019-11/mkl-workaround-erhoeht-leistung-auf-amd-ryzen/ +# +# https://sites.google.com/a/uci.edu/mingru-yang/programming/mkl-has-bad-performance-on-an-amd-cpu +# export MKL_DEBUG_CPU_TYPE=5 +# export MKL_NUM_THRAEDS=1 +# export MKL_DYNAMIC=false +# on Intel compiler +# http://publicclu2.blogspot.com/2013/05/intel-complier-suite-reference-card.html diff --git a/ex3/stream/PGI_default.mk b/ex3/stream/PGI_default.mk new file mode 100644 index 0000000..63dcb53 --- /dev/null +++ b/ex3/stream/PGI_default.mk @@ -0,0 +1,94 @@ +# Basic Defintions for using PGI-compiler suite sequentially +# requires setting of COMPILER=PGI_ +# OPTIRUN = optirun + + +CC = pgcc +CXX = pgc++ +F77 = pgfortran +LINKER = ${CXX} + +#LINKFLAGS += -llapack -lblas +# on mephisto: +#CXXFLAGS += -I/share/apps/atlas/include +#LINKFLAGS += -L/share/apps/atlas/lib +#LINKFLAGS += -lcblas -latlas + +#LINKFLAGS += -lblas +# Der Header muss mit extern "C" versehen werden, damit g++ alles findet. 
+ +WARNINGS = -Minform=warn +# -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -W -Wfloat-equal -Wshadow -Wredundant-decls +# -pedantic -Wunreachable-code -Wextra -Winline +# -Wunreachable-code + +#PGI_PROFILING = -Minfo=ccff,loop,vect,opt,intensity,mp,accel +PGI_PROFILING = -Minfo=ccff,accel,ipa,loop,lre,mp,opt,par,unified,vect,intensity +# -Minfo +# -Mprof=time +# -Mprof=lines +# take care with option -Msafeptr +CXXFLAGS += -O3 -std=c++17 ${WARNINGS} +#CXXFLAGS += -O3 -std=c++11 -DNDEBUG ${PGI_PROFILING} ${WARNINGS} +# -fastsse -fargument-noalias ${WARNINGS} -msse3 -vec-report=3 + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + @rm -f ${PROGRAM} ${OBJECTS} + +clean_all:: clean + @rm -f *_ *~ *.bak *.log *.out *.tar + +run: clean ${PROGRAM} + ./${PROGRAM} + +# tar the current directory +MY_DIR = `basename ${PWD}` +tar: clean_all + @echo "Tar the directory: " ${MY_DIR} + @cd .. ;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * + +doc: + doxygen Doxyfile + +######################################################################### + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# # some tools +# # Simple run time profiling of your code +# # CXXFLAGS += -g -pg +# # LINKFLAGS += -pg + + +# Profiling options PGI, see: pgcollect -help +# CPU_PROF = -allcache +CPU_PROF = -time +# GPU_PROF = -cuda=gmem,branch,cc13 -cudainit +#GPU_PROF = -cuda=branch:cc20 +# +PROF_FILE = pgprof.out + +cache: prof + +prof: ${PROGRAM} + ${OPTIRUN} ${BINDIR}pgcollect $(CPU_PROF) ./$^ + ${OPTIRUN} ${BINDIR}pgprof -exe ./$^ $(PROF_FILE) & + +info: + pgaccelinfo -v diff --git a/ex3/stream/stream/HISTORY.txt b/ex3/stream/stream/HISTORY.txt new file mode 100644 index 0000000..496fca6 --- /dev/null +++ b/ex3/stream/stream/HISTORY.txt @@ -0,0 +1,152 @@ +------------------------------------------------------------------------- + +Revisions as of Thu, Jan 17, 2013 3:50:01 PM + +Version 5.10 of stream.c has been released. +This version includes improved validation code and will automatically +use 64-bit array indices on 64-bit systems to allow very large arrays. + +------------------------------------------------------------------------- + +Revisions as of Thu Feb 19 08:16:57 CST 2009 + +Note that the codes in the "Versions" subdirectory should be +considered obsolete -- the versions of stream.c and stream.f +in this main directory include the OpenMP directives and structure +for creating "TUNED" versions. + +Only the MPI version in the "Versions" subdirectory should be +of any interest, and I have not recently checked that version for +errors or compliance with the current versions of stream.c and +stream.f. + +I added a simple Makefile to this directory. It works under Cygwin +on my Windows XP box (using gcc and g77). + +A user suggested a sneaky trick for "mysecond.c" -- instead of using +the #ifdef UNDERSCORE to generate the function name that the Fortran +compiler expects, the new version simply defines both "mysecond()" +and "mysecond_()", so it should automagically link with most Fortran +compilers. 
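+In code, the trick amounts to something like the following sketch (assuming
+the usual gettimeofday()-based wall-clock timer; the shipped mysecond.c may
+differ in detail):
+
+    #include <sys/time.h>
+
+    double mysecond(void)          /* wall-clock time in seconds */
+    {
+        struct timeval  tp;
+        struct timezone tzp;
+        gettimeofday(&tp, &tzp);
+        return (double)tp.tv_sec + 1.0e-6 * (double)tp.tv_usec;
+    }
+
+    /* Same entry point with a trailing underscore, as most Fortran
+       compilers expect for external C routines. */
+    double mysecond_(void) { return mysecond(); }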
+ +------------------------------------------------------------------------- + +Revisions as of Wed Nov 17 09:15:37 CST 2004 + +The most recent "official" versions have been renamed "stream.f" and +"stream.c" -- all other versions have been moved to the "Versions" +subdirectory. + +The "official" timer (was "second_wall.c") has been renamed "mysecond.c". +This is embedded in the C version ("stream.c"), but still needs to be +externally linked to the FORTRAN version ("stream.f"). + +------------------------------------------------------------------------- + +Revisions as of Tue May 27 11:51:23 CDT 2003 + +Copyright and License info added to stream_d.f, stream_mpi.f, and +stream_tuned.f + + +------------------------------------------------------------------------- + +Revisions as of Tue Apr 8 10:26:48 CDT 2003 + +I changed the name of the timer interface from "second" to "mysecond" +and removed the dummy argument in all versions of the source code (but +not the "Contrib" versions). + + +------------------------------------------------------------------------- + +Revisions as of Mon Feb 25 06:48:14 CST 2002 + +Added an OpenMP version of stream_d.c, called stream_d_omp.c. This is +still not up to date with the Fortran version, which includes error +checking and advanced data flow to prevent overoptimization, but it is +a good start.... + + +------------------------------------------------------------------------- + +Revisions as of Tue Jun 4 16:31:31 EDT 1996 + +I have fixed an "off-by-one" error in the RMS time calculation in +stream_d.f. This was already corrected in stream_d.c. No results are +invalidated, since I use minimum time instead of RMS time anyway.... + +------------------------------------------------------------------------- + +Revisions as of Fri Dec 8 14:49:56 EST 1995 + +I have renamed the timer routines to: + second_cpu.c + second_wall.c + second_cpu.f + +All have a function interface named 'second' which returns a double +precision floating point number. It should be possible to link +second_wall.c with stream_d.f without too much trouble, though the +details will depend on your environment. + +If anyone builds versions of these timers for machines running the +Macintosh O/S or DOS/Windows, I would appreciate getting a copy. + +To clarify: + * For single-user machines, the wallclock timer is preferred. + * For parallel machines, the wallclock timer is required. + * For time-shared systems, the cpu timer is more reliable, + though less accurate. + + +------------------------------------------------------------------------- + +Revisions as of Wed Oct 25 09:40:32 EDT 1995 + +(1) NOTICE to C users: + + stream_d.c has been updated to version 4.0 (beta), and + should be functionally identical to stream_d.f + + Two timers are provided --- second_cpu.c and second_wall.c + second_cpu.c measures cpu time, while second_wall.c measures + elapsed (real) time. + + For single-user machines, the wallclock timer is preferred. + For parallel machines, the wallclock timer is required. + For time-shared systems, the cpu timer is more reliable, + though less accurate. + +(2) cstream.c has been removed -- use stream_d.c + +(3) stream_wall.f has been removed --- to do parallel aggregate + bandwidth runs, comment out the definition of FUNCTION SECOND + in stream_d.f and compile/link with second_wall.c + +(4) stream_offset has been deprecated. It is still here + and usable, but stream_d.f is the "standard" version. + There are easy hooks in stream_d.f to change the + array offsets if you want to. 
+ +(5) The rules of the game are clarified as follows: + + The reference case uses array sizes of 2,000,000 elements + and no additional offsets. I would like to see results + for this case. + + But, you are free to use any array size and any offset + you want, provided that the arrays are each bigger than + the last-level of cache. The output will show me what + parameters you chose. + + I expect that I will report just the best number, but + if there is a serious discrepancy between the reference + case and the "best" case, I reserve the right to report + both. + + Of course, I also reserve the right to reject any results + that I do not trust.... +-- +John D. McCalpin, Ph.D. +john@mccalpin.com diff --git a/ex3/stream/stream/LICENSE.txt b/ex3/stream/stream/LICENSE.txt new file mode 100644 index 0000000..cf1c8e0 --- /dev/null +++ b/ex3/stream/stream/LICENSE.txt @@ -0,0 +1,34 @@ +*======================================================================= +*----------------------------------------------------------------------- +* Copyright 1991-2003: John D. McCalpin +*----------------------------------------------------------------------- +* License: +* 1. You are free to use this program and/or to redistribute +* this program. +* 2. You are free to modify this program for your own use, +* including commercial use, subject to the publication +* restrictions in item 3. +* 3. You are free to publish results obtained from running this +* program, or from works that you derive from this program, +* with the following limitations: +* 3a. In order to be referred to as "STREAM benchmark results", +* published results must be in conformance to the STREAM +* Run Rules, (briefly reviewed below) published at +* http://www.cs.virginia.edu/stream/ref.html +* and incorporated herein by reference. +* As the copyright holder, John McCalpin retains the +* right to determine conformity with the Run Rules. +* 3b. Results based on modified source code or on runs not in +* accordance with the STREAM Run Rules must be clearly +* labelled whenever they are published. Examples of +* proper labelling include: +* "tuned STREAM benchmark results" +* "based on a variant of the STREAM benchmark code" +* Other comparable, clear and reasonable labelling is +* acceptable. +* 3c. Submission of results to the STREAM benchmark web site +* is encouraged, but not required. +* 4. Use of this program or creation of derived works based on this +* program constitutes acceptance of these licensing restrictions. +* 5. Absolutely no warranty is expressed or implied. 
+*----------------------------------------------------------------------- diff --git a/ex3/stream/stream/Makefile b/ex3/stream/stream/Makefile new file mode 100644 index 0000000..d6fa443 --- /dev/null +++ b/ex3/stream/stream/Makefile @@ -0,0 +1,44 @@ +CC = gcc +CFLAGS = -O3 +DIMENSIONS = -DSTREAM_ARRAY_SIZE=80000000 -DNTIMES=20 + +FF = gfortran +FFLAGS = -O3 + +all: stream_f.exe stream_c.exe flops.exe + +stream_f.exe: stream.f mysecond.o + $(CC) $(CFLAGS) -c mysecond.c + $(FF) $(FFLAGS) $(DIMENSIONS) -c stream.f + $(FF) $(FFLAGS) stream.o mysecond.o -o stream_f.exe + +stream_c.exe: stream.c + $(CC) $(CFLAGS) $(DIMENSIONS) stream.c -o stream_c.exe + +clean: + rm -f *.exe *.o + +# an example of a more complex build line for the Intel icc compiler +stream.icc: stream.c + icc -O3 -xCORE-AVX2 -ffreestanding -qopenmp -DSTREAM_ARRAY_SIZE=80000000 -DNTIMES=20 stream.c -o stream.omp.AVX2.80M.20x.icc + +# GH +flops.exe: + $(CC) $(CFLAGS) -DUNIX flops.c -o flops.exe + +run: clean all + ./stream_c.exe + ./flops.exe + +MY_DIR = `basename ${PWD}` +tar: clean + @cd .. ;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} + + +zip: clean + @cd .. ;\ + zip -r ${MY_DIR}.zip ${MY_DIR} *default.mk;\ + cd ${MY_DIR} +# HG diff --git a/ex3/stream/stream/READ.ME b/ex3/stream/stream/READ.ME new file mode 100644 index 0000000..175a3f0 --- /dev/null +++ b/ex3/stream/stream/READ.ME @@ -0,0 +1,110 @@ +=============================================== + +STREAM is the de facto industry standard benchmark +for measuring sustained memory bandwidth. + +Documentation for STREAM is on the web at: + http://www.cs.virginia.edu/stream/ref.html + +=============================================== +NEWS +=============================================== +UPDATE: October 28 2014: + +"stream_mpi.c" released in the Versions directory. + +Based on Version 5.10 of stream.c, stream_mpi.c +brings the following new features: +* MPI implementation that *distributes* the arrays + across all MPI ranks. (The older Fortran version + of STREAM in MPI *replicates* the arrays across + all MPI ranks.) +* Data is allocated using "posix_memalign" + rather than using static arrays. Different + compiler flags may be needed for both portability + and optimization. + See the READ.ME file in the Versions directory + for more details. +* Error checking and timing done by all ranks and + gathered by rank 0 for processing and output. +* Timing code uses barriers to ensure correct + operation even when multiple MPI ranks run on + shared memory systems. + +NOTE: MPI is not a preferred implementation for + STREAM, which is intended to measure memory + bandwidth in shared-memory systems. In stream_mpi, + the MPI calls are only used to properly synchronize + the timers (using MPI_Barrier) and to gather + timing and error data, so the performance should + scale linearly with the size of the cluster. + But it may be useful, and was an interesting + exercise to develop and debug. + +=============================================== +UPDATE: January 17 2013: + +Version 5.10 of stream.c is finally available! + +There are no changes to what is being measured, but +a number of long-awaited improvements have been made: + +* Updated validation code does not suffer from + accumulated roundoff error for large arrays. 
+* Defining the preprocessor variable "VERBOSE" + when compiling will (1) cause the code to print the + measured average relative absolute error (rather than + simply printing "Solution Validates", and (2) print + the first 10 array entries with relative error exceeding + the error tolerance. +* Array index variables have been upgraded from + "int" to "ssize_t" to allow arrays with more + than 2 billion elements on 64-bit systems. +* Substantial improvements to the comments in + the source on how to configure/compile/run the + benchmark. +* The proprocessor variable controlling the array + size has been changed from "N" to "STREAM_ARRAY_SIZE". +* A new preprocessor variable "STREAM_TYPE" can be + used to override the data type from the default + "double" to "float". + This mechanism could also be used to change to + non-floating-point types, but several "printf" + statements would need to have their formats changed + to accomodate the modified data type. +* Some small changes in output, including printing + array sizes is GiB as well as MiB. +* Change to the default output format to print fewer + decimals for the bandwidth and more decimals for + the min/max/avg execution times. + + +=============================================== +UPDATE: February 19 2009: + +The most recent "official" versions have been renamed +"stream.f" and "stream.c" -- all other versions have +been moved to the "Versions" subdirectory and should be +considered obsolete. + +The "official" timer (was "second_wall.c") has been +renamed "mysecond.c". This is embedded in the C version +("stream.c"), but still needs to be externally linked to +the FORTRAN version ("stream.f"). The new version defines +entry points both with and without trailing underscores, +so it *should* link automagically with any Fortran compiler. + +=============================================== + +STREAM is a project of "Dr. Bandwidth": + John D. McCalpin, Ph.D. + john@mccalpin.com + +=============================================== + +The STREAM web and ftp sites are currently hosted at +the Department of Computer Science at the University of +Virginia under the generous sponsorship of Professor Bill +Wulf and Professor Alan Batson. + +=============================================== diff --git a/ex3/stream/stream/flops.c b/ex3/stream/stream/flops.c new file mode 100644 index 0000000..10325c1 --- /dev/null +++ b/ex3/stream/stream/flops.c @@ -0,0 +1,1156 @@ +/* gcc -O3 -DUNIX -funroll-all-loops -o flops flops.c */ +/* ./flops */ + +/*--------------------- Start flops.c source code ----------------------*/ + +/*****************************/ +/* flops.c */ +/* Version 2.0, 18 Dec 1992 */ +/* Al Aburto */ +/* aburto@nosc.mil */ +/*****************************/ + +/* + Flops.c is a 'c' program which attempts to estimate your systems + floating-point 'MFLOPS' rating for the FADD, FSUB, FMUL, and FDIV + operations based on specific 'instruction mixes' (discussed below). + The program provides an estimate of PEAK MFLOPS performance by making + maximal use of register variables with minimal interaction with main + memory. The execution loops are all small so that they will fit in + any cache. Flops.c can be used along with Linpack and the Livermore + kernels (which exersize memory much more extensively) to gain further + insight into the limits of system performance. The flops.c execution + modules also include various percent weightings of FDIV's (from 0% to + 25% FDIV's) so that the range of performance can be obtained when + using FDIV's. 
FDIV's, being computationally more intensive than + FADD's or FMUL's, can impact performance considerably on some systems. + + Flops.c consists of 8 independent modules (routines) which, except for + module 2, conduct numerical integration of various functions. Module + 2, estimates the value of pi based upon the Maclaurin series expansion + of atan(1). MFLOPS ratings are provided for each module, but the + programs overall results are summerized by the MFLOPS(1), MFLOPS(2), + MFLOPS(3), and MFLOPS(4) outputs. + + The MFLOPS(1) result is identical to the result provided by all + previous versions of flops.c. It is based only upon the results from + modules 2 and 3. Two problems surfaced in using MFLOPS(1). First, it + was difficult to completely 'vectorize' the result due to the + recurrence of the 's' variable in module 2. This problem is addressed + in the MFLOPS(2) result which does not use module 2, but maintains + nearly the same weighting of FDIV's (9.2%) as in MFLOPS(1) (9.6%). + The second problem with MFLOPS(1) centers around the percentage of + FDIV's (9.6%) which was viewed as too high for an important class of + problems. This concern is addressed in the MFLOPS(3) result where NO + FDIV's are conducted at all. + + The number of floating-point instructions per iteration (loop) is + given below for each module executed: + + MODULE FADD FSUB FMUL FDIV TOTAL Comment + 1 7 0 6 1 14 7.1% FDIV's + 2 3 2 1 1 7 difficult to vectorize. + 3 6 2 9 0 17 0.0% FDIV's + 4 7 0 8 0 15 0.0% FDIV's + 5 13 0 15 1 29 3.4% FDIV's + 6 13 0 16 0 29 0.0% FDIV's + 7 3 3 3 3 12 25.0% FDIV's + 8 13 0 17 0 30 0.0% FDIV's + + A*2+3 21 12 14 5 52 A=5, MFLOPS(1), Same as + 40.4% 23.1% 26.9% 9.6% previous versions of the + flops.c program. Includes + only Modules 2 and 3, does + 9.6% FDIV's, and is not + easily vectorizable. + + 1+3+4 58 14 66 14 152 A=4, MFLOPS(2), New output + +5+6+ 38.2% 9.2% 43.4% 9.2% does not include Module 2, + A*7 but does 9.2% FDIV's. + + 1+3+4 62 5 74 5 146 A=0, MFLOPS(3), New output + +5+6+ 42.9% 3.4% 50.7% 3.4% does not include Module 2, + 7+8 but does 3.4% FDIV's. + + 3+4+6 39 2 50 0 91 A=0, MFLOPS(4), New output + +8 42.9% 2.2% 54.9% 0.0% does not include Module 2, + and does NO FDIV's. + + NOTE: Various timer routines are included as indicated below. The + timer routines, with some comments, are attached at the end + of the main program. + + NOTE: Please do not remove any of the printouts. + + EXAMPLE COMPILATION: + UNIX based systems + cc -DUNIX -O flops.c -o flops + cc -DUNIX -DROPT flops.c -o flops + cc -DUNIX -fast -O4 flops.c -o flops + . + . + . + etc. + + Al Aburto + aburto@nosc.mil +*/ + +/***************************************************************/ +/* Timer options. You MUST uncomment one of the options below */ +/* or compile, for example, with the '-DUNIX' option. */ +/***************************************************************/ +/* #define Amiga */ +/* #define UNIX */ +/* #define UNIX_Old */ +/* #define VMS */ +/* #define BORLAND_C */ +/* #define MSC */ +/* #define MAC */ +/* #define IPSC */ +/* #define FORTRAN_SEC */ +/* #define GTODay */ +/* #define CTimer */ +/* #define UXPM */ +/* #define MAC_TMgr */ +/* #define PARIX */ +/* #define POSIX */ +/* #define WIN32 */ +/* #define POSIX1 */ +/***********************/ + +#include +#include + /* 'Uncomment' the line below to run */ + /* with 'register double' variables */ + /* defined, or compile with the */ + /* '-DROPT' option. 
Don't need this if */ + /* registers used automatically, but */ + /* you might want to try it anyway. */ +/* #define ROPT */ + +double nulltime, TimeArray[3]; /* Variables needed for 'dtime()'. */ +double TLimit; /* Threshold to determine Number of */ + /* Loops to run. Fixed at 15.0 seconds.*/ + +double T[36]; /* Global Array used to hold timing */ + /* results and other information. */ + +double sa,sb,sc,sd,one,two,three; +double four,five,piref,piprg; +double scale,pierr; + +double A0 = 1.0; +double A1 = -0.1666666666671334; +double A2 = 0.833333333809067E-2; +double A3 = 0.198412715551283E-3; +double A4 = 0.27557589750762E-5; +double A5 = 0.2507059876207E-7; +double A6 = 0.164105986683E-9; + +double B0 = 1.0; +double B1 = -0.4999999999982; +double B2 = 0.4166666664651E-1; +double B3 = -0.1388888805755E-2; +double B4 = 0.24801428034E-4; +double B5 = -0.2754213324E-6; +double B6 = 0.20189405E-8; + +double C0 = 1.0; +double C1 = 0.99999999668; +double C2 = 0.49999995173; +double C3 = 0.16666704243; +double C4 = 0.4166685027E-1; +double C5 = 0.832672635E-2; +double C6 = 0.140836136E-2; +double C7 = 0.17358267E-3; +double C8 = 0.3931683E-4; + +double D1 = 0.3999999946405E-1; +double D2 = 0.96E-3; +double D3 = 0.1233153E-5; + +double E2 = 0.48E-3; +double E3 = 0.411051E-6; + +// void dtime(double p[]); + +int main() +{ + +#ifdef ROPT + register double s,u,v,w,x; +#else + double s,u,v,w,x; +#endif + + long loops, NLimit; + register long i, m, n; + + printf("\n"); + printf(" FLOPS C Program (Double Precision), V2.0 18 Dec 1992\n\n"); + + /****************************/ + loops = 15625; /* Initial number of loops. */ + /* DO NOT CHANGE! */ + /****************************/ + +/****************************************************/ +/* Set Variable Values. */ +/* T[1] references all timing results relative to */ +/* one million loops. */ +/* */ +/* The program will execute from 31250 to 512000000 */ +/* loops based on a runtime of Module 1 of at least */ +/* TLimit = 15.0 seconds. That is, a runtime of 15 */ +/* seconds for Module 1 is used to determine the */ +/* number of loops to execute. */ +/* */ +/* No more than NLimit = 512000000 loops are allowed*/ +/****************************************************/ + + T[1] = 1.0E+06/(double)loops; + + TLimit = 15.0; + NLimit = 512000000; + + piref = 3.14159265358979324; + one = 1.0; + two = 2.0; + three = 3.0; + four = 4.0; + five = 5.0; + scale = one; + + printf(" Module Error RunTime MFLOPS\n"); + printf(" (usec)\n"); +/*************************/ +/* Initialize the timer. */ +/*************************/ + + dtime(TimeArray); + dtime(TimeArray); + +/*******************************************************/ +/* Module 1. Calculate integral of df(x)/f(x) defined */ +/* below. Result is ln(f(1)). There are 14 */ +/* double precision operations per loop */ +/* ( 7 +, 0 -, 6 *, 1 / ) that are included */ +/* in the timing. */ +/* 50.0% +, 00.0% -, 42.9% *, and 07.1% / */ +/*******************************************************/ + n = loops; + sa = 0.0; + + while ( sa < TLimit ) + { + n = 2 * n; + x = one / (double)n; /*********************/ + s = 0.0; /* Loop 1. 
*/ + v = 0.0; /*********************/ + w = one; + + dtime(TimeArray); + for( i = 1 ; i <= n-1 ; i++ ) + { + v = v + w; + u = v * x; + s = s + (D1+u*(D2+u*D3))/(w+u*(D1+u*(E2+u*E3))); + } + dtime(TimeArray); + sa = TimeArray[1]; + + if ( n == NLimit ) break; + /* printf(" %10ld %12.5lf\n",n,sa); */ + } + + scale = 1.0E+06 / (double)n; + T[1] = scale; + +/****************************************/ +/* Estimate nulltime ('for' loop time). */ +/****************************************/ + dtime(TimeArray); + for( i = 1 ; i <= n-1 ; i++ ) + { + } + dtime(TimeArray); + nulltime = T[1] * TimeArray[1]; + if ( nulltime < 0.0 ) nulltime = 0.0; + + T[2] = T[1] * sa - nulltime; + + sa = (D1+D2+D3)/(one+D1+E2+E3); + sb = D1; + + T[3] = T[2] / 14.0; /*********************/ + sa = x * ( sa + sb + two * s ) / two; /* Module 1 Results. */ + sb = one / sa; /*********************/ + n = (long)( (double)( 40000 * (long)sb ) / scale ); + sc = sb - 25.2; + T[4] = one / T[3]; + /********************/ + /* DO NOT REMOVE */ + /* THIS PRINTOUT! */ + /********************/ + printf(" 1 %13.4le %10.4lf %10.4lf\n",sc,T[2],T[4]); + + m = n; + +/*******************************************************/ +/* Module 2. Calculate value of PI from Taylor Series */ +/* expansion of atan(1.0). There are 7 */ +/* double precision operations per loop */ +/* ( 3 +, 2 -, 1 *, 1 / ) that are included */ +/* in the timing. */ +/* 42.9% +, 28.6% -, 14.3% *, and 14.3% / */ +/*******************************************************/ + + s = -five; /********************/ + sa = -one; /* Loop 2. */ + /********************/ + dtime(TimeArray); + for ( i = 1 ; i <= m ; i++ ) + { + s = -s; + sa = sa + s; + } + dtime(TimeArray); + T[5] = T[1] * TimeArray[1]; + if ( T[5] < 0.0 ) T[5] = 0.0; + + sc = (double)m; + + u = sa; /*********************/ + v = 0.0; /* Loop 3. */ + w = 0.0; /*********************/ + x = 0.0; + + dtime(TimeArray); + for ( i = 1 ; i <= m ; i++) + { + s = -s; + sa = sa + s; + u = u + two; + x = x +(s - u); + v = v - s * u; + w = w + s / u; + } + dtime(TimeArray); + T[6] = T[1] * TimeArray[1]; + + T[7] = ( T[6] - T[5] ) / 7.0; /*********************/ + m = (long)( sa * x / sc ); /* PI Results */ + sa = four * w / five; /*********************/ + sb = sa + five / v; + sc = 31.25; + piprg = sb - sc / (v * v * v); + pierr = piprg - piref; + T[8] = one / T[7]; + /*********************/ + /* DO NOT REMOVE */ + /* THIS PRINTOUT! */ + /*********************/ + printf(" 2 %13.4le %10.4lf %10.4lf\n",pierr,T[6]-T[5],T[8]); + +/*******************************************************/ +/* Module 3. Calculate integral of sin(x) from 0.0 to */ +/* PI/3.0 using Trapazoidal Method. Result */ +/* is 0.5. There are 17 double precision */ +/* operations per loop (6 +, 2 -, 9 *, 0 /) */ +/* included in the timing. */ +/* 35.3% +, 11.8% -, 52.9% *, and 00.0% / */ +/*******************************************************/ + + x = piref / ( three * (double)m ); /*********************/ + s = 0.0; /* Loop 4. */ + v = 0.0; /*********************/ + + dtime(TimeArray); + for( i = 1 ; i <= m-1 ; i++ ) + { + v = v + one; + u = v * x; + w = u * u; + s = s + u * ((((((A6*w-A5)*w+A4)*w-A3)*w+A2)*w+A1)*w+one); + } + dtime(TimeArray); + T[9] = T[1] * TimeArray[1] - nulltime; + + u = piref / three; + w = u * u; + sa = u * ((((((A6*w-A5)*w+A4)*w-A3)*w+A2)*w+A1)*w+one); + + T[10] = T[9] / 17.0; /*********************/ + sa = x * ( sa + two * s ) / two; /* sin(x) Results. 
*/ + sb = 0.5; /*********************/ + sc = sa - sb; + T[11] = one / T[10]; + /*********************/ + /* DO NOT REMOVE */ + /* THIS PRINTOUT! */ + /*********************/ + printf(" 3 %13.4le %10.4lf %10.4lf\n",sc,T[9],T[11]); + +/************************************************************/ +/* Module 4. Calculate Integral of cos(x) from 0.0 to PI/3 */ +/* using the Trapazoidal Method. Result is */ +/* sin(PI/3). There are 15 double precision */ +/* operations per loop (7 +, 0 -, 8 *, and 0 / ) */ +/* included in the timing. */ +/* 50.0% +, 00.0% -, 50.0% *, 00.0% / */ +/************************************************************/ + A3 = -A3; + A5 = -A5; + x = piref / ( three * (double)m ); /*********************/ + s = 0.0; /* Loop 5. */ + v = 0.0; /*********************/ + + dtime(TimeArray); + for( i = 1 ; i <= m-1 ; i++ ) + { + u = (double)i * x; + w = u * u; + s = s + w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one; + } + dtime(TimeArray); + T[12] = T[1] * TimeArray[1] - nulltime; + + u = piref / three; + w = u * u; + sa = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one; + + T[13] = T[12] / 15.0; /*******************/ + sa = x * ( sa + one + two * s ) / two; /* Module 4 Result */ + u = piref / three; /*******************/ + w = u * u; + sb = u * ((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+A0); + sc = sa - sb; + T[14] = one / T[13]; + /*********************/ + /* DO NOT REMOVE */ + /* THIS PRINTOUT! */ + /*********************/ + printf(" 4 %13.4le %10.4lf %10.4lf\n",sc,T[12],T[14]); + +/************************************************************/ +/* Module 5. Calculate Integral of tan(x) from 0.0 to PI/3 */ +/* using the Trapazoidal Method. Result is */ +/* ln(cos(PI/3)). There are 29 double precision */ +/* operations per loop (13 +, 0 -, 15 *, and 1 /)*/ +/* included in the timing. */ +/* 46.7% +, 00.0% -, 50.0% *, and 03.3% / */ +/************************************************************/ + + x = piref / ( three * (double)m ); /*********************/ + s = 0.0; /* Loop 6. */ + v = 0.0; /*********************/ + + dtime(TimeArray); + for( i = 1 ; i <= m-1 ; i++ ) + { + u = (double)i * x; + w = u * u; + v = u * ((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one); + s = s + v / (w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one); + } + dtime(TimeArray); + T[15] = T[1] * TimeArray[1] - nulltime; + + u = piref / three; + w = u * u; + sa = u*((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one); + sb = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one; + sa = sa / sb; + + T[16] = T[15] / 29.0; /*******************/ + sa = x * ( sa + two * s ) / two; /* Module 5 Result */ + sb = 0.6931471805599453; /*******************/ + sc = sa - sb; + T[17] = one / T[16]; + /*********************/ + /* DO NOT REMOVE */ + /* THIS PRINTOUT! */ + /*********************/ + printf(" 5 %13.4le %10.4lf %10.4lf\n",sc,T[15],T[17]); + +/************************************************************/ +/* Module 6. Calculate Integral of sin(x)*cos(x) from 0.0 */ +/* to PI/4 using the Trapazoidal Method. Result */ +/* is sin(PI/4)^2. There are 29 double precision */ +/* operations per loop (13 +, 0 -, 16 *, and 0 /)*/ +/* included in the timing. */ +/* 46.7% +, 00.0% -, 53.3% *, and 00.0% / */ +/************************************************************/ + + x = piref / ( four * (double)m ); /*********************/ + s = 0.0; /* Loop 7. 
*/ + v = 0.0; /*********************/ + + dtime(TimeArray); + for( i = 1 ; i <= m-1 ; i++ ) + { + u = (double)i * x; + w = u * u; + v = u * ((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one); + s = s + v*(w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one); + } + dtime(TimeArray); + T[18] = T[1] * TimeArray[1] - nulltime; + + u = piref / four; + w = u * u; + sa = u*((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one); + sb = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one; + sa = sa * sb; + + T[19] = T[18] / 29.0; /*******************/ + sa = x * ( sa + two * s ) / two; /* Module 6 Result */ + sb = 0.25; /*******************/ + sc = sa - sb; + T[20] = one / T[19]; + /*********************/ + /* DO NOT REMOVE */ + /* THIS PRINTOUT! */ + /*********************/ + printf(" 6 %13.4le %10.4lf %10.4lf\n",sc,T[18],T[20]); + + +/*******************************************************/ +/* Module 7. Calculate value of the definite integral */ +/* from 0 to sa of 1/(x+1), x/(x*x+1), and */ +/* x*x/(x*x*x+1) using the Trapizoidal Rule.*/ +/* There are 12 double precision operations */ +/* per loop ( 3 +, 3 -, 3 *, and 3 / ) that */ +/* are included in the timing. */ +/* 25.0% +, 25.0% -, 25.0% *, and 25.0% / */ +/*******************************************************/ + + /*********************/ + s = 0.0; /* Loop 8. */ + w = one; /*********************/ + sa = 102.3321513995275; + v = sa / (double)m; + + dtime(TimeArray); + for ( i = 1 ; i <= m-1 ; i++) + { + x = (double)i * v; + u = x * x; + s = s - w / ( x + w ) - x / ( u + w ) - u / ( x * u + w ); + } + dtime(TimeArray); + T[21] = T[1] * TimeArray[1] - nulltime; + /*********************/ + /* Module 7 Results */ + /*********************/ + T[22] = T[21] / 12.0; + x = sa; + u = x * x; + sa = -w - w / ( x + w ) - x / ( u + w ) - u / ( x * u + w ); + sa = 18.0 * v * (sa + two * s ); + + m = -2000 * (long)sa; + m = (long)( (double)m / scale ); + + sc = sa + 500.2; + T[23] = one / T[22]; + /********************/ + /* DO NOT REMOVE */ + /* THIS PRINTOUT! */ + /********************/ + printf(" 7 %13.4le %10.4lf %10.4lf\n",sc,T[21],T[23]); + +/************************************************************/ +/* Module 8. Calculate Integral of sin(x)*cos(x)*cos(x) */ +/* from 0 to PI/3 using the Trapazoidal Method. */ +/* Result is (1-cos(PI/3)^3)/3. There are 30 */ +/* double precision operations per loop included */ +/* in the timing: */ +/* 13 +, 0 -, 17 * 0 / */ +/* 46.7% +, 00.0% -, 53.3% *, and 00.0% / */ +/************************************************************/ + + x = piref / ( three * (double)m ); /*********************/ + s = 0.0; /* Loop 9. */ + v = 0.0; /*********************/ + + dtime(TimeArray); + for( i = 1 ; i <= m-1 ; i++ ) + { + u = (double)i * x; + w = u * u; + v = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one; + s = s + v*v*u*((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one); + } + dtime(TimeArray); + T[24] = T[1] * TimeArray[1] - nulltime; + + u = piref / three; + w = u * u; + sa = u*((((((A6*w+A5)*w+A4)*w+A3)*w+A2)*w+A1)*w+one); + sb = w*(w*(w*(w*(w*(B6*w+B5)+B4)+B3)+B2)+B1)+one; + sa = sa * sb * sb; + + T[25] = T[24] / 30.0; /*******************/ + sa = x * ( sa + two * s ) / two; /* Module 8 Result */ + sb = 0.29166666666666667; /*******************/ + sc = sa - sb; + T[26] = one / T[25]; + /*********************/ + /* DO NOT REMOVE */ + /* THIS PRINTOUT! */ + /*********************/ + printf(" 8 %13.4le %10.4lf %10.4lf\n",sc,T[24],T[26]); + +/**************************************************/ +/* MFLOPS(1) output. 
This is the same weighting */ +/* used for all previous versions of the flops.c */ +/* program. Includes Modules 2 and 3 only. */ +/**************************************************/ + T[27] = ( five * (T[6] - T[5]) + T[9] ) / 52.0; + T[28] = one / T[27]; + +/**************************************************/ +/* MFLOPS(2) output. This output does not include */ +/* Module 2, but it still does 9.2% FDIV's. */ +/**************************************************/ + T[29] = T[2] + T[9] + T[12] + T[15] + T[18]; + T[29] = (T[29] + four * T[21]) / 152.0; + T[30] = one / T[29]; + +/**************************************************/ +/* MFLOPS(3) output. This output does not include */ +/* Module 2, but it still does 3.4% FDIV's. */ +/**************************************************/ + T[31] = T[2] + T[9] + T[12] + T[15] + T[18]; + T[31] = (T[31] + T[21] + T[24]) / 146.0; + T[32] = one / T[31]; + +/**************************************************/ +/* MFLOPS(4) output. This output does not include */ +/* Module 2, and it does NO FDIV's. */ +/**************************************************/ + T[33] = (T[9] + T[12] + T[18] + T[24]) / 91.0; + T[34] = one / T[33]; + + + printf("\n"); + printf(" Iterations = %10ld\n",m); + printf(" NullTime (usec) = %10.4lf\n",nulltime); + printf(" MFLOPS(1) = %10.4lf\n",T[28]); + printf(" MFLOPS(2) = %10.4lf\n",T[30]); + printf(" MFLOPS(3) = %10.4lf\n",T[32]); + printf(" MFLOPS(4) = %10.4lf\n\n",T[34]); + return 0; +} + +/*****************************************************/ +/* Various timer routines. */ +/* Al Aburto, aburto@nosc.mil, 18 Feb 1997 */ +/* */ +/* dtime(p) outputs the elapsed time seconds in p[1] */ +/* from a call of dtime(p) to the next call of */ +/* dtime(p). Use CAUTION as some of these routines */ +/* will mess up when timing across the hour mark!!! */ +/* */ +/* For timing I use the 'user' time whenever */ +/* possible. Using 'user+sys' time is a separate */ +/* issue. */ +/* */ +/* Example Usage: */ +/* [Timer options added here] */ +/* double RunTime, TimeArray[3]; */ +/* main() */ +/* { */ +/* dtime(TimeArray); */ +/* [routine to time] */ +/* dtime(TimeArray); */ +/* RunTime = TimeArray[1]; */ +/* } */ +/* [Timer code added here] */ +/*****************************************************/ + +/******************************/ +/* Timer code. */ +/******************************/ + +/*******************/ +/* Amiga dtime() */ +/*******************/ +#ifdef Amiga +#include +#define HZ 50 + +dtime(p) +double p[]; +{ + double q; + + struct tt { + long days; + long minutes; + long ticks; + } tt; + + q = p[2]; + + DateStamp(&tt); + + p[2] = ( (double)(tt.ticks + (tt.minutes * 60L * 50L)) ) / (double)HZ; + p[1] = p[2] - q; + + return 0; +} +#endif + +/*****************************************************/ +/* UNIX dtime(). This is the preferred UNIX timer. */ +/* Provided by: Markku Kolkka, mk59200@cc.tut.fi */ +/* HP-UX Addition by: Bo Thide', bt@irfu.se */ +/*****************************************************/ +#ifdef UNIX +#include +#include + +#ifdef hpux +#include +#define getrusage(a,b) syscall(SYS_getrusage,a,b) +#endif + +struct rusage rusage; + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + + getrusage(RUSAGE_SELF,&rusage); + + p[2] = (double)(rusage.ru_utime.tv_sec); + p[2] = p[2] + (double)(rusage.ru_utime.tv_usec) * 1.0e-06; + p[1] = p[2] - q; + + return 0; +} +#endif + +/***************************************************/ +/* UNIX_Old dtime(). This is the old UNIX timer. 
*/ +/* Use only if absolutely necessary as HZ may be */ +/* ill defined on your system. */ +/***************************************************/ +#ifdef UNIX_Old +#include +#include +#include + +#ifndef HZ +#define HZ 60 +#endif + +struct tms tms; + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + + times(&tms); + + p[2] = (double)(tms.tms_utime) / (double)HZ; + p[1] = p[2] - q; + + return 0; +} +#endif + +/*********************************************************/ +/* VMS dtime() for VMS systems. */ +/* Provided by: RAMO@uvphys.phys.UVic.CA */ +/* Some people have run into problems with this timer. */ +/*********************************************************/ +#ifdef VMS +#include time + +#ifndef HZ +#define HZ 100 +#endif + +struct tbuffer_t + { + int proc_user_time; + int proc_system_time; + int child_user_time; + int child_system_time; + }; + +struct tbuffer_t tms; + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + + times(&tms); + + p[2] = (double)(tms.proc_user_time) / (double)HZ; + p[1] = p[2] - q; + + return 0; +} +#endif + +/******************************/ +/* BORLAND C dtime() for DOS */ +/******************************/ +#ifdef BORLAND_C +#include +#include +#include + +#define HZ 100 +struct time tnow; + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + + gettime(&tnow); + + p[2] = 60.0 * (double)(tnow.ti_min); + p[2] = p[2] + (double)(tnow.ti_sec); + p[2] = p[2] + (double)(tnow.ti_hund)/(double)HZ; + p[1] = p[2] - q; + + return 0; +} +#endif + +/**************************************/ +/* Microsoft C (MSC) dtime() for DOS */ +/**************************************/ +#ifdef MSC +#include +#include + +#define HZ CLOCKS_PER_SEC +clock_t tnow; + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + + tnow = clock(); + + p[2] = (double)tnow / (double)HZ; + p[1] = p[2] - q; + + return 0; +} +#endif + +/*************************************/ +/* Macintosh (MAC) Think C dtime() */ +/*************************************/ +#ifdef MAC +#include + +#define HZ 60 + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + + p[2] = (double)clock() / (double)HZ; + p[1] = p[2] - q; + + return 0; +} +#endif + +/************************************************************/ +/* iPSC/860 (IPSC) dtime() for i860. */ +/* Provided by: Dan Yergeau, yergeau@gloworm.Stanford.EDU */ +/************************************************************/ +#ifdef IPSC +extern double dclock(); + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + + p[2] = dclock(); + p[1] = p[2] - q; + + return 0; +} +#endif + +/**************************************************/ +/* FORTRAN dtime() for Cray type systems. */ +/* This is the preferred timer for Cray systems. */ +/**************************************************/ +#ifdef FORTRAN_SEC + +fortran double second(); + +dtime(p) +double p[]; +{ + double q,v; + + q = p[2]; + + second(&v); + p[2] = v; + p[1] = p[2] - q; + + return 0; +} +#endif + +/***********************************************************/ +/* UNICOS C dtime() for Cray UNICOS systems. Don't use */ +/* unless absolutely necessary as returned time includes */ +/* 'user+system' time. Provided by: R. 
Mike Dority, */ +/* dority@craysea.cray.com */ +/***********************************************************/ +#ifdef CTimer +#include + +dtime(p) +double p[]; +{ + double q; + clock_t clock(void); + + q = p[2]; + + p[2] = (double)clock() / (double)CLOCKS_PER_SEC; + p[1] = p[2] - q; + + return 0; +} +#endif + +/********************************************/ +/* Another UNIX timer using gettimeofday(). */ +/* However, getrusage() is preferred. */ +/********************************************/ +#ifdef GTODay +#include + +struct timeval tnow; + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + + gettimeofday(&tnow,NULL); + p[2] = (double)tnow.tv_sec + (double)tnow.tv_usec * 1.0e-6; + p[1] = p[2] - q; + + return 0; +} +#endif + +/*****************************************************/ +/* Fujitsu UXP/M timer. */ +/* Provided by: Mathew Lim, ANUSF, M.Lim@anu.edu.au */ +/*****************************************************/ +#ifdef UXPM +#include +#include +struct tmsu rusage; + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + + timesu(&rusage); + + p[2] = (double)(rusage.tms_utime) * 1.0e-06; + p[1] = p[2] - q; + + return 0; +} +#endif + +/**********************************************/ +/* Macintosh (MAC_TMgr) Think C dtime() */ +/* requires Think C Language Extensions or */ +/* #include in the prefix */ +/* provided by Francis H Schiffer 3rd (fhs) */ +/* skipschiffer@genie.geis.com */ +/**********************************************/ +#ifdef MAC_TMgr +#include +#include + +static TMTask mgrTimer; +static Boolean mgrInited = FALSE; +static double mgrClock; + +#define RMV_TIMER RmvTime( (QElemPtr)&mgrTimer ) +#define MAX_TIME 1800000000L +/* MAX_TIME limits time between calls to */ +/* dtime( ) to no more than 30 minutes */ +/* this limitation could be removed by */ +/* creating a completion routine to sum */ +/* 30 minute segments (fhs 1994 feb 9) */ + +static void Remove_timer( ) +{ + RMV_TIMER; + mgrInited = FALSE; +} + +int dtime( p ) +double p[]; +{ + if ( mgrInited ) { + RMV_TIMER; + mgrClock += (MAX_TIME + mgrTimer.tmCount)*1.0e-6; + } else { + if ( _atexit( &Remove_timer ) == 0 ) mgrInited = TRUE; + mgrClock = 0.0; + } + + p[1] = mgrClock - p[2]; + p[2] = mgrClock; + if ( mgrInited ) { + mgrTimer.tmAddr = NULL; + mgrTimer.tmCount = 0; + mgrTimer.tmWakeUp = 0; + mgrTimer.tmReserved = 0; + InsTime( (QElemPtr)&mgrTimer ); + PrimeTime( (QElemPtr)&mgrTimer, -MAX_TIME ); + } + return( 0 ); +} +#endif + +/***********************************************************/ +/* Parsytec GCel timer. 
*/ +/* Provided by: Georg Wambach, gw@informatik.uni-koeln.de */ +/***********************************************************/ +#ifdef PARIX +#include + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + p[2] = (double) (TimeNowHigh()) / (double) CLK_TCK_HIGH; + p[1] = p[2] - q; + + return 0; +} +#endif + +/************************************************/ +/* Sun Solaris POSIX dtime() routine */ +/* Provided by: Case Larsen, CTLarsen@lbl.gov */ +/************************************************/ +#ifdef POSIX +#include +#include +#include + +#ifdef __hpux +#include +#define getrusage(a,b) syscall(SYS_getrusage,a,b) +#endif + +struct rusage rusage; + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + + getrusage(RUSAGE_SELF,&rusage); + + p[2] = (double)(rusage.ru_utime.tv_sec); + p[2] = p[2] + (double)(rusage.ru_utime.tv_nsec) * 1.0e-09; + p[1] = p[2] - q; + + return 0; +} +#endif + +/****************************************************/ +/* Windows NT (32 bit) dtime() routine */ +/* Provided by: Piers Haken, piersh@microsoft.com */ +/****************************************************/ +#ifdef WIN32 +#include + +dtime(p) +double p[]; +{ + double q; + + q = p[2]; + + p[2] = (double)GetTickCount() * 1.0e-03; + p[1] = p[2] - q; + + return 0; +} +#endif + +/*****************************************************/ +/* Time according to POSIX.1 - */ +/* Ref: "POSIX Programmer's Guide" O'Reilly & Assoc.*/ +/*****************************************************/ +#ifdef POSIX1 +#define _POSIX_SOURCE 1 +#include +#include +#include + +struct tms tms; + +dtime(p) +double p[]; +{ + double q; + times(&tms); + q = p[2]; + p[2] = (double)tms.tms_utime / (double)CLK_TCK; + p[1] = p[2] - q; + return 0; +} +#endif + +/*------ End flops.c code, say good night Jan! (Sep 1992) ------*/ diff --git a/ex3/stream/stream/flops.exe b/ex3/stream/stream/flops.exe new file mode 100755 index 0000000..b6b2dd8 Binary files /dev/null and b/ex3/stream/stream/flops.exe differ diff --git a/ex3/stream/stream/mysecond.c b/ex3/stream/stream/mysecond.c new file mode 100644 index 0000000..d206a4a --- /dev/null +++ b/ex3/stream/stream/mysecond.c @@ -0,0 +1,27 @@ +/* A gettimeofday routine to give access to the wall + clock timer on most UNIX-like systems. + + This version defines two entry points -- with + and without appended underscores, so it *should* + automagically link with FORTRAN */ + +#include + +double mysecond() +{ +/* struct timeval { long tv_sec; + long tv_usec; }; + +struct timezone { int tz_minuteswest; + int tz_dsttime; }; */ + + struct timeval tp; + struct timezone tzp; + int i; + + i = gettimeofday(&tp,&tzp); + return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); +} + +double mysecond_() {return mysecond();} + diff --git a/ex3/stream/stream/stream.c b/ex3/stream/stream/stream.c new file mode 100644 index 0000000..b9a2cee --- /dev/null +++ b/ex3/stream/stream/stream.c @@ -0,0 +1,585 @@ +/*-----------------------------------------------------------------------*/ +/* Program: STREAM */ +/* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */ +/* Original code developed by John D. McCalpin */ +/* Programmers: John D. McCalpin */ +/* Joe R. Zagar */ +/* */ +/* This program measures memory transfer rates in MB/s for simple */ +/* computational kernels coded in C. */ +/*-----------------------------------------------------------------------*/ +/* Copyright 1991-2013: John D. 
McCalpin */
+/*-----------------------------------------------------------------------*/
+/* License:                                                               */
+/*  1. You are free to use this program and/or to redistribute            */
+/*     this program.                                                      */
+/*  2. You are free to modify this program for your own use,              */
+/*     including commercial use, subject to the publication               */
+/*     restrictions in item 3.                                            */
+/*  3. You are free to publish results obtained from running this         */
+/*     program, or from works that you derive from this program,          */
+/*     with the following limitations:                                    */
+/*     3a. In order to be referred to as "STREAM benchmark results",      */
+/*         published results must be in conformance to the STREAM         */
+/*         Run Rules, (briefly reviewed below) published at               */
+/*         http://www.cs.virginia.edu/stream/ref.html                     */
+/*         and incorporated herein by reference.                          */
+/*         As the copyright holder, John McCalpin retains the             */
+/*         right to determine conformity with the Run Rules.              */
+/*     3b. Results based on modified source code or on runs not in        */
+/*         accordance with the STREAM Run Rules must be clearly           */
+/*         labelled whenever they are published.  Examples of             */
+/*         proper labelling include:                                      */
+/*           "tuned STREAM benchmark results"                             */
+/*           "based on a variant of the STREAM benchmark code"            */
+/*         Other comparable, clear, and reasonable labelling is           */
+/*         acceptable.                                                    */
+/*     3c. Submission of results to the STREAM benchmark web site         */
+/*         is encouraged, but not required.                               */
+/*  4. Use of this program or creation of derived works based on this     */
+/*     program constitutes acceptance of these licensing restrictions.    */
+/*  5. Absolutely no warranty is expressed or implied.                    */
+/*-----------------------------------------------------------------------*/
+# include <stdio.h>
+# include <unistd.h>
+# include <math.h>
+# include <float.h>
+# include <limits.h>
+# include <sys/time.h>
+
+/*-----------------------------------------------------------------------
+ * INSTRUCTIONS:
+ *
+ *	1) STREAM requires different amounts of memory to run on different
+ *           systems, depending on both the system cache size(s) and the
+ *           granularity of the system timer.
+ *     You should adjust the value of 'STREAM_ARRAY_SIZE' (below)
+ *           to meet *both* of the following criteria:
+ *       (a) Each array must be at least 4 times the size of the
+ *           available cache memory. I don't worry about the difference
+ *           between 10^6 and 2^20, so in practice the minimum array size
+ *           is about 3.8 times the cache size.
+ *           Example 1: One Xeon E3 with 8 MB L3 cache
+ *               STREAM_ARRAY_SIZE should be >= 4 million, giving
+ *               an array size of 30.5 MB and a total memory requirement
+ *               of 91.5 MB.
+ *           Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP)
+ *               STREAM_ARRAY_SIZE should be >= 20 million, giving
+ *               an array size of 153 MB and a total memory requirement
+ *               of 458 MB.
+ *       (b) The size should be large enough so that the 'timing calibration'
+ *           output by the program is at least 20 clock-ticks.
+ *           Example: most versions of Windows have a 10 millisecond timer
+ *               granularity.  20 "ticks" at 10 ms/tic is 200 milliseconds.
+ *               If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec.
+ *               This means the each array must be at least 1 GB, or 128M elements.
+ *
+ *      Version 5.10 increases the default array size from 2 million
+ *         elements to 10 million elements in response to the increasing
+ *         size of L3 caches.  The new default size is large enough for caches
+ *         up to 20 MB.
+ *      Version 5.10 changes the loop index variables from "register int"
+ *         to "ssize_t", which allows array indices >2^32 (4 billion)
+ *         on properly configured 64-bit systems. 
Additional compiler options + * (such as "-mcmodel=medium") may be required for large memory runs. + * + * Array size can be set at compile time without modifying the source + * code for the (many) compilers that support preprocessor definitions + * on the compile line. E.g., + * gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M + * will override the default size of 10M with a new size of 100M elements + * per array. + */ +#ifndef STREAM_ARRAY_SIZE +# define STREAM_ARRAY_SIZE 10000000 +#endif + +/* 2) STREAM runs each kernel "NTIMES" times and reports the *best* result + * for any iteration after the first, therefore the minimum value + * for NTIMES is 2. + * There are no rules on maximum allowable values for NTIMES, but + * values larger than the default are unlikely to noticeably + * increase the reported performance. + * NTIMES can also be set on the compile line without changing the source + * code using, for example, "-DNTIMES=7". + */ +#ifdef NTIMES +#if NTIMES<=1 +# define NTIMES 10 +#endif +#endif +#ifndef NTIMES +# define NTIMES 10 +#endif + +/* Users are allowed to modify the "OFFSET" variable, which *may* change the + * relative alignment of the arrays (though compilers may change the + * effective offset by making the arrays non-contiguous on some systems). + * Use of non-zero values for OFFSET can be especially helpful if the + * STREAM_ARRAY_SIZE is set to a value close to a large power of 2. + * OFFSET can also be set on the compile line without changing the source + * code using, for example, "-DOFFSET=56". + */ +#ifndef OFFSET +# define OFFSET 0 +#endif + +/* + * 3) Compile the code with optimization. Many compilers generate + * unreasonably bad code before the optimizer tightens things up. + * If the results are unreasonably good, on the other hand, the + * optimizer might be too smart for me! + * + * For a simple single-core version, try compiling with: + * cc -O stream.c -o stream + * This is known to work on many, many systems.... + * + * To use multiple cores, you need to tell the compiler to obey the OpenMP + * directives in the code. This varies by compiler, but a common example is + * gcc -O -fopenmp stream.c -o stream_omp + * The environment variable OMP_NUM_THREADS allows runtime control of the + * number of threads/cores used when the resulting "stream_omp" program + * is executed. + * + * To run with single-precision variables and arithmetic, simply add + * -DSTREAM_TYPE=float + * to the compile line. + * Note that this changes the minimum array sizes required --- see (1) above. + * + * The preprocessor directive "TUNED" does not do much -- it simply causes the + * code to call separate functions to execute each kernel. Trivial versions + * of these functions are provided, but they are *not* tuned -- they just + * provide predefined interfaces to be replaced with tuned code. + * + * + * 4) Optional: Mail the results to mccalpin@cs.virginia.edu + * Be sure to include info that will help me understand: + * a) the computer hardware configuration (e.g., processor model, memory type) + * b) the compiler name/version and compilation flags + * c) any run-time information (such as OMP_NUM_THREADS) + * d) all of the output from the test case. + * + * Thanks! 
+ * + *-----------------------------------------------------------------------*/ + +# define HLINE "-------------------------------------------------------------\n" + +# ifndef MIN +# define MIN(x,y) ((x)<(y)?(x):(y)) +# endif +# ifndef MAX +# define MAX(x,y) ((x)>(y)?(x):(y)) +# endif + +#ifndef STREAM_TYPE +#define STREAM_TYPE double +#endif + +static STREAM_TYPE a[STREAM_ARRAY_SIZE+OFFSET], + b[STREAM_ARRAY_SIZE+OFFSET], + c[STREAM_ARRAY_SIZE+OFFSET]; + +static double avgtime[4] = {0}, maxtime[4] = {0}, + mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + +static char *label[4] = {"Copy: ", "Scale: ", + "Add: ", "Triad: "}; + +static double bytes[4] = { + 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE + }; + +extern double mysecond(); +extern void checkSTREAMresults(); +#ifdef TUNED +extern void tuned_STREAM_Copy(); +extern void tuned_STREAM_Scale(STREAM_TYPE scalar); +extern void tuned_STREAM_Add(); +extern void tuned_STREAM_Triad(STREAM_TYPE scalar); +#endif +#ifdef _OPENMP +extern int omp_get_num_threads(); +#endif +int +main() + { + int quantum, checktick(); + int BytesPerWord; + int k; + ssize_t j; + STREAM_TYPE scalar; + double t, times[4][NTIMES]; + + /* --- SETUP --- determine precision and check timing --- */ + + printf(HLINE); + printf("STREAM version $Revision: 5.10 $\n"); + printf(HLINE); + BytesPerWord = sizeof(STREAM_TYPE); + printf("This system uses %d bytes per array element.\n", + BytesPerWord); + + printf(HLINE); +#ifdef N + printf("***** WARNING: ******\n"); + printf(" It appears that you set the preprocessor variable N when compiling this code.\n"); + printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); + printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE); + printf("***** WARNING: ******\n"); +#endif + + printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); + printf("Memory per array = %.1f MiB (= %.1f GiB).\n", + BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0), + BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0)); + printf("Total memory required = %.1f MiB (= %.1f GiB).\n", + (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.), + (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.)); + printf("Each kernel will be executed %d times.\n", NTIMES); + printf(" The *best* time for each kernel (excluding the first iteration)\n"); + printf(" will be used to compute the reported bandwidth.\n"); + +#ifdef _OPENMP + printf(HLINE); +#pragma omp parallel + { +#pragma omp master + { + k = omp_get_num_threads(); + printf ("Number of Threads requested = %i\n",k); + } + } +#endif + +#ifdef _OPENMP + k = 0; +#pragma omp parallel +#pragma omp atomic + k++; + printf ("Number of Threads counted = %i\n",k); +#endif + + /* Get initial value for system clock. 
*/ +#pragma omp parallel for + for (j=0; j= 1) + printf("Your clock granularity/precision appears to be " + "%d microseconds.\n", quantum); + else { + printf("Your clock granularity appears to be " + "less than one microsecond.\n"); + quantum = 1; + } + + t = mysecond(); +#pragma omp parallel for + for (j = 0; j < STREAM_ARRAY_SIZE; j++) + a[j] = 2.0E0 * a[j]; + t = 1.0E6 * (mysecond() - t); + + printf("Each test below will take on the order" + " of %d microseconds.\n", (int) t ); + printf(" (= %d clock ticks)\n", (int) (t/quantum) ); + printf("Increase the size of the arrays if this shows that\n"); + printf("you are not getting at least 20 clock ticks per test.\n"); + + printf(HLINE); + + printf("WARNING -- The above is only a rough guideline.\n"); + printf("For best results, please be sure you know the\n"); + printf("precision of your system timer.\n"); + printf(HLINE); + + /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ + + scalar = 3.0; + for (k=0; k + +double mysecond() +{ + struct timeval tp; + struct timezone tzp; + int i; + + i = gettimeofday(&tp,&tzp); + return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); +} + +#ifndef abs +#define abs(a) ((a) >= 0 ? (a) : -(a)) +#endif +void checkSTREAMresults () +{ + STREAM_TYPE aj,bj,cj,scalar; + STREAM_TYPE aSumErr,bSumErr,cSumErr; + STREAM_TYPE aAvgErr,bAvgErr,cAvgErr; + double epsilon; + ssize_t j; + int k,ierr,err; + + /* reproduce initialization */ + aj = 1.0; + bj = 2.0; + cj = 0.0; + /* a[] is modified during timing check */ + aj = 2.0E0 * aj; + /* now execute timing loop */ + scalar = 3.0; + for (k=0; k epsilon) { + err++; + printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array a: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,aj,a[j],abs((aj-a[j])/aAvgErr)); + } +#endif + } + } + printf(" For array a[], %d errors were found.\n",ierr); + } + if (abs(bAvgErr/bj) > epsilon) { + err++; + printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj); + printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array b: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,bj,b[j],abs((bj-b[j])/bAvgErr)); + } +#endif + } + } + printf(" For array b[], %d errors were found.\n",ierr); + } + if (abs(cAvgErr/cj) > epsilon) { + err++; + printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj); + printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array c: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,cj,c[j],abs((cj-c[j])/cAvgErr)); + } +#endif + } + } + printf(" For array c[], %d errors were found.\n",ierr); + } + if (err == 0) { + printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon); + } +#ifdef VERBOSE + printf ("Results Validation Verbose Results: \n"); + printf (" Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj); + printf (" Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]); + printf (" Rel Errors on a, b, c: 
%e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj)); +#endif +} + +#ifdef TUNED +/* stubs for "tuned" versions of the kernels */ +void tuned_STREAM_Copy() +{ + ssize_t j; +#pragma omp parallel for + for (j=0; j