#include "mylib.h"
#include <cassert>
#include <chrono>           // timing
#include <cmath>            // sqrt()
#include <cstdlib>          // atoi()
#include <cstring>          // strncmp()
#include <ctime>
#include <iostream>
#include <sstream>
#include <vector>
#include "benchmark.h"

namespace {

// Decimal giga (10^9): the unit behind "GFLOPS".
constexpr double kGiga = 1e9;
// Binary gibi (2^30): the unit behind "GiByte/s".
constexpr double kGibi = 1024.0 * 1024.0 * 1024.0;
// All test data cycles through the values 1..219.
constexpr unsigned int kPeriod = 219;

// Sink for guard values that are not printed; the volatile store forces the
// compiler to actually produce the value handed to consume().
volatile double g_sink = 0.0;

/// Stores @p value into a volatile sink so the computation feeding it is kept.
inline void consume(double value)
{
    g_sink = value;
}

/// Returns a vector of length @p n with element i equal to (i % 219) + 1.
std::vector<double> make_cyclic_vector(std::size_t n)
{
    std::vector<double> v(n);
    for (std::size_t i = 0; i < n; ++i) {
        v[i] = static_cast<double>(i % kPeriod + 1);
    }
    return v;
}

/// Returns a vector of length @p n holding the element-wise reciprocals of @p v.
std::vector<double> make_reciprocals(std::vector<double> const &v, std::size_t n)
{
    std::vector<double> r(n);
    for (std::size_t i = 0; i < n; ++i) {
        r[i] = 1.0 / v[i];
    }
    return r;
}

/// Returns a rows x cols matrix stored row by row, entry (i,j) = (i+j) % 219 + 1.
std::vector<double> make_cyclic_matrix(std::size_t rows, std::size_t cols)
{
    std::vector<double> a(rows * cols);
    for (std::size_t i = 0; i < rows; ++i) {
        for (std::size_t j = 0; j < cols; ++j) {
            a[i * cols + j] = static_cast<double>((i + j) % kPeriod + 1);
        }
    }
    return a;
}

/// Calls @p body @p nloops times and returns the mean elapsed seconds per call.
/// Uses steady_clock because it is monotonic; system_clock may be adjusted
/// while the measurement is running.
template <class Body>
double seconds_per_loop(unsigned int nloops, Body body)
{
    auto const start = std::chrono::steady_clock::now();
    for (unsigned int loop = 0; loop < nloops; ++loop) {
        body();
    }
    auto const stop = std::chrono::steady_clock::now();
    return std::chrono::duration<double>(stop - start).count() / nloops;
}

/// Prints the timing line followed by the derived GFLOPS and GiByte/s rates.
/// @param sec_per_loop  mean seconds for one benchmark call
/// @param flops         floating point operations attributed to one call
/// @param bytes         bytes of memory traffic attributed to one call
void print_rates(double sec_per_loop, double flops, double bytes)
{
    std::cout << "Timing in sec. : " << sec_per_loop << std::endl;
    std::cout << "GFLOPS : " << flops / sec_per_loop / kGiga << std::endl;
    std::cout << "GiByte/s : " << bytes / sec_per_loop / kGibi << std::endl;
}

} // namespace

/// Benchmark driver: times benchmark_A..D plus norm2, scalar_kahan and
/// matrixMultColumnWise on synthetic data and prints, for each of them, the
/// mean time per call and the resulting GFLOPS and GiByte/s figures.
int main(int argc, char **argv)
{
    (void)argc;   // command line is not evaluated
    (void)argv;

    const unsigned int NA      = 1400000;
    const unsigned int NLOOPSA = 2000;
    const unsigned int MC      = 1000;
    const unsigned int NLOOPSC = 5;      // unsigned, like the loop counter it bounds

    // ---------- Benchmark A ----------
    {
        std::vector<double> xA = make_cyclic_vector(NA);
        std::vector<double> yA = make_reciprocals(xA, NA);

        double sA = 0.0, sumA = 0.0;
        const double tA = seconds_per_loop(NLOOPSA, [&] {
            sA = benchmark_A(xA, yA);
            sumA += sA;
        });
        consume(sumA);

        std::cout << "\n===== Benchmark A =====\n";
        // NOTE(review): the text before '=' is empty in the source - confirm the intended label.
        std::cout << " = " << sA << std::endl;
        // counted as 2 flops per element and two vectors of memory traffic
        print_rates(tA, 2.0 * NA, 2.0 * NA * sizeof(xA[0]));
    }

    // ---------- Benchmark B ----------
    {
        const unsigned int MB      = 1700;
        const unsigned int NB      = MB;
        const unsigned int NLOOPSB = 200; //50;

        std::vector<double> AB = make_cyclic_matrix(MB, NB);
        std::vector<double> xB(NB);
        for (unsigned int j = 0; j < NB; ++j) {
            xB[j] = 1.0 / AB[17 * NB + j];   // reciprocals of matrix row 17
        }

        std::vector<double> bB;
        double guardB = 0.0;
        const double tB = seconds_per_loop(NLOOPSB, [&] {
            bB = benchmark_B(AB, xB);
            guardB += bB[17];
        });

        const double flopsB = 2.0 * MB * NB;
        const double bytesB = static_cast<double>((MB * NB + NB + MB) * sizeof(double));

        std::cout << "\n===== Benchmark B =====\n";
        std::cout << guardB << std::endl;
        std::cout << "bytes: " << bytesB << std::endl;
        print_rates(tB, flopsB, bytesB);
    }

    // ---------- Benchmark C ----------
    {
        const unsigned int LC = MC;
        const unsigned int NC = MC;

        std::vector<double> AC = make_cyclic_matrix(MC, LC);
        std::vector<double> BC = make_cyclic_matrix(LC, NC);

        std::vector<double> CC;
        double guardC = 0.0;
        const double tC = seconds_per_loop(NLOOPSC, [&] {
            CC = benchmark_C(AC, BC, MC);
            guardC += CC[0];
        });
        consume(guardC);

        const double flopsC = 2.0 * MC * LC * NC;
        const double bytesC = static_cast<double>((MC * LC + LC * NC + MC * NC) * sizeof(double));

        std::cout << "\n===== Benchmark C =====\n";
        std::cout << "bytes: " << bytesC << std::endl;
        print_rates(tC, flopsC, bytesC);
    }

    // ---------- Benchmark D ----------
    {
        const unsigned int ND      = 2000000;
        const unsigned int p       = 14;     // p coefficients, i.e. degree p-1 = 13
        const unsigned int NLOOPSD = 100;

        std::vector<double> coeff = make_cyclic_vector(p);
        std::vector<double> xD    = make_cyclic_vector(ND);

        std::vector<double> yD;
        double guardD = 0.0;
        const double tD = seconds_per_loop(NLOOPSD, [&] {
            yD = benchmark_D(coeff, xD);
            guardD += yD[0];
        });
        consume(guardD);

        const double flopsD = 2.0 * ND * p;
        const double bytesD = static_cast<double>((p + 2 * ND) * sizeof(double));

        std::cout << "\n===== Benchmark D =====\n";
        std::cout << "bytes: " << bytesD << std::endl;
        print_rates(tD, flopsD, bytesD);
    }

    //------------- TASK 5 -------------
    // ---------- Benchmark 5A: norm2 ----------
    {
        std::vector<double> xA = make_cyclic_vector(NA);

        double sA = 0.0, sumA = 0.0;
        const double tA = seconds_per_loop(NLOOPSA, [&] {
            sA = norm2(xA);
            sumA += sA;
        });
        consume(sumA);

        std::cout << "\n===== Benchmark 5A =====\n";
        std::cout << "NORM = " << sA << std::endl;
        // counted as 2 flops per element and a single vector of memory traffic
        print_rates(tA, 2.0 * NA, static_cast<double>(NA * sizeof(xA[0])));
    }

    // ---------- Benchmark 5B: scalar_kahan ----------
    {
        std::vector<double> xA = make_cyclic_vector(NA);
        std::vector<double> yA = make_reciprocals(xA, NA);

        double sA = 0.0, sumA = 0.0;
        const double tA = seconds_per_loop(NLOOPSA, [&] {
            sA = scalar_kahan(xA, yA);
            sumA += sA;
        });
        consume(sumA);

        std::cout << "\n===== Benchmark 5B =====\n";
        // NOTE(review): the text before '=' is empty in the source - confirm the intended label.
        std::cout << " = " << sA << std::endl;
        // counted as 5 flops per element and two vectors of memory traffic
        print_rates(tA, 5.0 * NA, 2.0 * NA * sizeof(xA[0]));
    }

    // ---------- Benchmark 5C: matrixMultColumnWise ----------
    {
        const unsigned int LC = MC;
        const unsigned int NC = MC;

        std::vector<double> AC = make_cyclic_matrix(MC, LC);
        std::vector<double> BC = make_cyclic_matrix(LC, NC);

        std::vector<double> CC;
        double guardC = 0.0;
        const double tC = seconds_per_loop(NLOOPSC, [&] {
            CC = matrixMultColumnWise(AC, BC, MC);
            guardC += CC[0];
        });
        consume(guardC);

        const double flopsC = 2.0 * MC * LC * NC;
        const double bytesC = static_cast<double>((MC * LC + LC * NC + MC * NC) * sizeof(double));

        std::cout << "\n===== Benchmark 5C =====\n";
        std::cout << "bytes: " << bytesC << std::endl;
        print_rates(tC, flopsC, bytesC);
    }

    return 0;
}