// Compile with "make" / "make run" or // g++ *.cpp -O3 -ffast-math -lopenblas -llapacke -o main #include "task_3.h" #include "task_4+6.h" #include "task_5.h" #include "task_7.h" #include "timing.h" #include // cBLAS Library #include #include #include void task_1() { printf("\n\n-------------- Task 1 --------------\n\n"); printf("See comment in main.cpp"); // ------------------------------------------------------------- // STREAM version $Revision: 5.10 $ // ------------------------------------------------------------- // This system uses 8 bytes per array element. // ------------------------------------------------------------- // Array size = 80000000 (elements), Offset = 0 (elements) // Memory per array = 610.4 MiB (= 0.6 GiB). // Total memory required = 1831.1 MiB (= 1.8 GiB). // Each kernel will be executed 20 times. // The *best* time for each kernel (excluding the first iteration) // will be used to compute the reported bandwidth. // ------------------------------------------------------------- // Your clock granularity/precision appears to be 1 microseconds. // Each test below will take on the order of 116886 microseconds. // (= 116886 clock ticks) // Increase the size of the arrays if this shows that // you are not getting at least 20 clock ticks per test. // ------------------------------------------------------------- // WARNING -- The above is only a rough guideline. // For best results, please be sure you know the // precision of your system timer. // ------------------------------------------------------------- // Function Best Rate MB/s Avg time Min time Max time // Copy: 29569.4 0.048585 0.043288 0.059164 // Scale: 17644.0 0.082248 0.072546 0.102548 // Add: 21030.1 0.100620 0.091298 0.124700 // Triad: 21230.7 0.100758 0.090435 0.120631 // ------------------------------------------------------------- // Solution Validates: avg error less than 1.000000e-13 on all three arrays // ------------------------------------------------------------- // ./flops.exe // FLOPS C Program (Double Precision), V2.0 18 Dec 1992 // Module Error RunTime MFLOPS // (usec) // 1 4.0146e-13 0.0024 5827.9076 // 2 -1.4166e-13 0.0007 10037.8942 // 3 4.7184e-14 0.0039 4371.9185 // 4 -1.2557e-13 0.0034 4355.5711 // 5 -1.3800e-13 0.0066 4415.6439 // 6 3.2380e-13 0.0065 4441.6299 // 7 -8.4583e-11 0.0053 2277.1707 // 8 3.4867e-13 0.0069 4367.6094 // Iterations = 512000000 // NullTime (usec) = 0.0000 // MFLOPS(1) = 7050.6178 // MFLOPS(2) = 3461.6233 // MFLOPS(3) = 4175.0442 // MFLOPS(4) = 4389.7311 } void task_2() { printf("\n\n-------------- Task 2 --------------\n\n"); printf("See comment in main.cpp"); // Memory needed (double 64-bit, 8 bytes): // (A) (2N + 1) * 8 bytes // (B) (M*N + M + N) * 8 bytes // (C) (M*L + L*N + M*N) * 8 bytes // (D) (N + N + p) * 8 bytes // Floating point operations: // (A) 2N // (B) M * 2N // (C) M * 2L * N // (D) 2 * N * p (Horner Schema) // Read/Write operations: // (A) Read: 2N Write: 1 // (B) Read: M*2N Write: M*N // (C) Read: M*2L*N Write: M*L*N // (D) Read: 2*N*p Write: N*P } void task_3() { printf("\n\n-------------- Task 3 --------------\n\n"); printf("Functions implemented in task_3.cpp"); } void task_4(bool cblas = false) { if (cblas == false) {printf("\n\n-------------- Task 4 --------------\n\n");} size_t M, N, L, p, NLOOPS; { // Scalar product printf("----- Benchmark (A) -----\n"); // Initialization N = 50'000'000; NLOOPS = 50; auto [x,y] = init_A(N); // Benchmark tic(); benchmark_A(x, y, NLOOPS, cblas); double sec = toc() / NLOOPS; // Timings and Performance size_t memory = 2 * N; size_t flops = 2 * N; print_performance(sec, memory, flops, sizeof(x[0])); printf("-------------------------\n"); } { // Matrix-Vector product printf("----- Benchmark (B) -----\n"); // Initialization M = 8'000; N = 12'000; NLOOPS = 30; auto [A,x] = init_B(M,N); // Benchmark tic(); benchmark_B(A, x, NLOOPS, cblas); double sec = toc() / NLOOPS; // Timings and Performance size_t memory = M*N + M + N; size_t flops = 2 * M * N; print_performance(sec, memory, flops, sizeof(A[0])); printf("-------------------------\n"); } { // Matrix-Matrix product printf("----- Benchmark (C) -----\n"); // Initialization M = 1'000; N = 2'000; L = 500; NLOOPS = 20; auto [A,B] = init_C(M,N,L); // Benchmark tic(); benchmark_C(A, B, L, NLOOPS, cblas); double sec = toc() / NLOOPS; // Timings and Performance size_t memory = M*L + L*N + M*N; size_t flops = M * 2*L * N; print_performance(sec, memory, flops, sizeof(A[0])); printf("-------------------------\n"); } if (cblas == false) { // Polynomial evaluation printf("----- Benchmark (D) -----\n"); // Initialization N = 1'000'000; p = 200; NLOOPS = 20; auto [x,a] = init_D(N,p); // Benchmark tic(); benchmark_D(x, a, NLOOPS); double sec = toc() / NLOOPS; // Timings and Performance size_t memory = 2.0 * N; size_t flops = 2.0 * N * p; print_performance(sec, memory, flops, sizeof(x[0])); printf("-------------------------\n"); } } void task_5() { printf("\n\n-------------- Task 5 --------------\n\n"); printf("----- Benchmark norm -----\n"); // Initialization size_t N =50'000'000; size_t NLOOPS = 50; vector x = init_norm(N); // Benchmark tic(); benchmark_norm(x, NLOOPS); double sec = toc() / NLOOPS; // Timings and Performance size_t memory = N; size_t flops = 2 * N; print_performance(sec, memory, flops, sizeof(x[0])); printf("-------------------------\n"); printf("What do you observe? Why?\n"); printf("-> Faster per loop than scalar product, only loads elements of 1 vector, instead of 2."); } void task_6() { printf("\n\n-------------- Task 6 --------------\n\n"); printf("Benchmarks using cBLAS\n"); task_4(true); } void task_7() { printf("\n\n-------------- Task 7 --------------\n\n"); { // Check Ax=b size_t N=5, Nrhs=2; auto [A,b] = init_M(N,Nrhs); vector A_og = A; printf("A ="); print_matrix(A,N,N); printf("b ="); print_matrix(b,N,Nrhs); int lda=N, ldb=Nrhs; vector ipiv(N); LAPACKE_dgetrf(LAPACK_ROW_MAJOR, N, N, A.data(), lda, ipiv.data()); LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', N, Nrhs, A.data(), lda, ipiv.data(), b.data(), ldb); printf("L + U ="); print_matrix(A,N,N); printf("x ="); print_matrix(b,N,Nrhs); int ldc=Nrhs; vector C(N*Nrhs); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, N, Nrhs, N, 1.0, A_og.data(), lda, b.data(), ldb, 0.0, C.data(), ldc); printf("Check solution:\nA * x = "); print_matrix(C,N,Nrhs); } // ################################# // Benchmark cout << fixed << setprecision(4); size_t NLOOPS = 200; size_t Nrhs = 2000; cout << "Solution time per right hand side in milliseconds: sec*1000/Nrhs" << endl; cout << "N = " << " | 1 | 2 | 4 | 8 | 16 | 32 " << endl; cout << "------------|--------|--------|--------|--------|--------|-------" << endl; for (int k = 1; k < 10; ++k) { cout << "Nrhs = " << Nrhs*k; for (size_t N : {1, 2, 4, 8, 16, 32}) { tic(); for (size_t i = 0; i < NLOOPS; ++i) { benchmark_lapacke(N, Nrhs*k); } double sec = toc()*1000 / (static_cast(Nrhs)*k); cout << " | " << sec; } cout << endl; } printf("\nFor fixed n, the solution time per rhs stays roughly constant."); } int main() { // task_1(); // task_2(); // task_3(); // task_4(); // task_5(); // task_6(); task_7(); printf("\n\n"); return 0; }