scf_celebic/ex5/code/task_4.cpp

#include "task_4.h"
#include "timing.h"
#include <cassert>
#include <cblas.h> // cBLAS Library
#include <iostream>
#include <vector>
using namespace std;
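// Hand-written OpenMP kernels (dot product, matrix-vector and matrix-matrix
// products, Horner polynomial evaluation, summation) together with their
// cBLAS counterparts, plus init_*/benchmark_* helpers for benchmark cases A-D.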
// Dense matrix-vector product b = A*x for a row-major M x N matrix A.
vector<double> matrix_vec(vector<double> const &A, vector<double> const &x) {
    size_t const N = x.size();
    size_t const M = A.size() / N;
    vector<double> b(M);            // value-initialised to 0.0
#pragma omp parallel for shared(A,x,N,M,b)
    for (size_t i = 0; i < M; ++i) {
        for (size_t j = 0; j < N; ++j) {
            b[i] += A[i*N + j] * x[j];
        }
    }
    return b;
}
// Dense matrix-matrix product C = A*B for row-major A (M x L) and B (L x N);
// the i-k-j loop order keeps the innermost accesses to B and C contiguous.
vector<double> matrix_matrix(vector<double> const &A, vector<double> const &B, size_t const &M) {
    size_t const L = A.size() / M;
    size_t const N = B.size() / L;
    vector<double> C(M*N, 0);
#pragma omp parallel for shared(A,B,M,L,N,C)
    for (size_t i = 0; i < M; ++i) {
        for (size_t k = 0; k < L; ++k) {
            for (size_t j = 0; j < N; ++j) {
                C[i*N + j] += A[i*L + k] * B[k*N + j];
            }
        }
    }
    return C;
}
// Horner's scheme: y_i = a[0] + a[1]*x_i + ... + a[p-1]*x_i^(p-1),
// starting from the leading coefficient a[p-1] and folding in one
// coefficient per step down to a[0].
vector<double> poly(vector<double> const &x, vector<double> const &a) {
    size_t N = x.size();
    size_t p = a.size();
    vector<double> y(N);
#pragma omp parallel for shared(x,a,N,p,y)
    for (size_t i = 0; i < N; ++i) {
        y[i] = a[p-1];
        for (size_t k = 1; k < p; ++k) {
            y[i] = y[i]*x[i] + a[p-1-k];
        }
    }
    return y;
}
// Dot product <x, y> with an OpenMP reduction.
double scalar(vector<double> const &x, vector<double> const &y) {
    assert(x.size() == y.size());
    size_t const N = x.size();
    double sum = 0.0;
#pragma omp parallel for shared(x,y,N) reduction(+:sum)
    for (size_t i = 0; i < N; ++i) {
        sum += x[i] * y[i];
    }
    return sum;
}
double summation(vector<double> const &x) {
    size_t N = x.size();
    double sum = 0.0;
#pragma omp parallel for shared(x,N) reduction(+:sum)
    for (size_t i = 0; i < N; ++i) {
        sum += x[i];
    }
    return sum;
}
// ##########################################################################
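// print_performance reports memory traffic, run time and throughput. A worked
// example of the intended arguments (an assumption read off the formulas
// below: `memory` counts the array elements touched per loop and `size` is
// the element size in bytes, so memory*size is the traffic in bytes), for one
// dot product of length N:
//   flops  = 2*N             one multiply and one add per element
//   memory = 2*N             x and y are each read once
//   size   = sizeof(double)  i.e. 8 bytes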
void print_performance(double sec, size_t memory, size_t flops, unsigned int size) {
    printf("Memory allocated : %.3f GByte\n", 1.0 * memory / 1024 / 1024 / 1024 * size);
    printf("Duration per loop : %.3f sec\n", sec);
    printf("GFLOPS : %.3f\n", 1.0 * flops / sec / 1024 / 1024 / 1024);
    printf("GiByte/s : %.3f\n", 1.0 * memory / sec / 1024 / 1024 / 1024 * size);
}
// Benchmark A data: y[i] = 1/x[i], so that the dot product <x, y> equals N.
tuple<vector<double>, vector<double>> init_A(size_t N) {
    vector<double> x(N), y(N);
    for (size_t i = 0; i < N; ++i) {
        x[i] = i%219 + 1.0;
        y[i] = 1.0 / x[i];
    }
    return make_tuple(x, y);
}
// Benchmark A: dot product, either the hand-written OpenMP kernel or cBLAS ddot.
void benchmark_A(vector<double> const &x, vector<double> const &y, size_t NLOOPS, bool cblas) {
    size_t N = x.size();
    double s(0.0), sum(0.0);
    if (!cblas) {
        for (size_t i = 0; i < NLOOPS; ++i) {
            s = scalar(x, y);
            sum += s;
        }
    } else {
        for (size_t i = 0; i < NLOOPS; ++i) {
            s = cblas_ddot(N, x.data(), 1, y.data(), 1);
            sum += s;
        }
    }
    // Check correctness: each dot product equals N (see init_A), hence sum = N*NLOOPS.
    if (static_cast<size_t>(sum) != N*NLOOPS) { printf(" !! W R O N G result !!\n"); }
}
// Benchmark B data: x[j] = 1/A[17][j], so that (A*x)[17] equals N.
tuple<vector<double>, vector<double>> init_B(size_t M, size_t N) {
    vector<double> A(M*N), x(N);
    for (size_t i = 0; i < M; ++i) {
        for (size_t j = 0; j < N; ++j) {
            A[i*N + j] = (i+j)%219 + 1.0;
        }
    }
    for (size_t j = 0; j < N; ++j) {
        x[j] = 1.0/A[17*N + j];
    }
    return make_tuple(A, x);
}
// Benchmark B: matrix-vector product, either the hand-written kernel or cBLAS dgemv.
void benchmark_B(vector<double> const &A, vector<double> const &x, size_t NLOOPS, bool cblas) {
    size_t N = x.size();
    size_t M = A.size() / N;
    vector<double> b(M);
    double sum(0.0);
    if (!cblas) {
        for (size_t i = 0; i < NLOOPS; ++i) {
            b = matrix_vec(A, x);
            sum += b[17];
        }
    } else {
        for (size_t i = 0; i < NLOOPS; ++i) {
            cblas_dgemv(CblasRowMajor, CblasNoTrans, M, N, 1.0, A.data(), N, x.data(), 1, 0.0, b.data(), 1);
            sum += b[17];
        }
    }
    // Check correctness: b[17] equals N (see init_B), hence sum = N*NLOOPS.
    if (static_cast<size_t>(sum) != N*NLOOPS) { printf(" !! W R O N G result !!\n"); }
}
// Benchmark C data: column 17 of B holds 1/A[0][i], so that
// C[0][17] = sum_k A[0][k]*B[k][17] = L.
tuple<vector<double>, vector<double>> init_C(size_t M, size_t N, size_t L) {
    vector<double> A(M*L), B(L*N);
    for (size_t i = 0; i < M; ++i) {
        for (size_t j = 0; j < L; ++j) {
            A[i*L + j] = (i+j)%219 + 1.0;
        }
    }
    for (size_t i = 0; i < L; ++i) {
        for (size_t j = 0; j < N; ++j) {
            if (j == 17) {
                B[i*N + 17] = 1.0/A[i];   // A[i] is A[0][i] in row-major storage
            } else {
                B[i*N + j] = (i+j)%219 + 1.0;
            }
        }
    }
    return make_tuple(A, B);
}
// Benchmark C: matrix-matrix product, either the hand-written kernel or cBLAS dgemm.
void benchmark_C(vector<double> const &A, vector<double> const &B, size_t L, size_t NLOOPS, bool cblas) {
    size_t M = A.size() / L;
    size_t N = B.size() / L;
    vector<double> C(M*N);
    double sum(0.0);
    if (!cblas) {
        for (size_t i = 0; i < NLOOPS; ++i) {
            C = matrix_matrix(A, B, M);
            sum += C[17];                 // C[17] is the entry C[0][17]
        }
    } else {
        for (size_t i = 0; i < NLOOPS; ++i) {
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, L, 1.0, A.data(), L, B.data(), N, 0.0, C.data(), N);
            sum += C[17];
        }
    }
    // Check correctness: C[0][17] equals L (see init_C), hence sum = L*NLOOPS.
    if (static_cast<size_t>(sum) != L*NLOOPS) { printf(" !! W R O N G result !!\n"); }
}
// Benchmark D data: x_i = i/N for i = 0,...,N-1 and a_j = 1 for j = 0,...,p-1.
tuple<vector<double>, vector<double>> init_D(size_t N, size_t p) {
    vector<double> x(N), a(p);
    for (size_t i = 0; i < N; ++i) {
        x[i] = static_cast<double>(i) / N;
    }
    for (size_t j = 0; j < p; ++j) {
        a[j] = 1.0;
    }
    return make_tuple(x, a);
}
// Benchmark D: Horner polynomial evaluation.
void benchmark_D(vector<double> const &x, vector<double> const &a, size_t NLOOPS) {
    size_t N = x.size();
    vector<double> y(N);
    double sum(0.0);
    for (size_t i = 0; i < NLOOPS; ++i) {
        y = poly(x, a);
        sum += y[0];
    }
    // Check correctness: x[0] = 0, so y[0] = a[0] = 1 and sum = NLOOPS.
    if (static_cast<size_t>(sum) != NLOOPS) { printf(" !! W R O N G result sum = %f !!\n", sum); }
}
// Benchmark for the plain OpenMP summation; the accumulated result is not checked here.
void benchmark_summation(vector<double> const &x, size_t NLOOPS) {
    double s(0.0), sum(0.0);
    for (size_t i = 0; i < NLOOPS; ++i) {
        s = summation(x);
        sum += s;
    }
}
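// ##########################################################################
// The driver that runs these benchmarks (via timing.h and print_performance)
// is not part of this excerpt. The disabled block below is only a minimal
// usage sketch for case A: it assumes std::chrono instead of the project's
// timing helpers, and the problem size N, the loop count NLOOPS and the
// interpretation of the print_performance arguments are placeholders.
#if 0
#include <chrono>

int main() {
    size_t const N = 1u << 24;       // vector length (placeholder)
    size_t const NLOOPS = 50;        // repetitions (placeholder)

    vector<double> x, y;
    tie(x, y) = init_A(N);

    auto const t0 = std::chrono::steady_clock::now();
    benchmark_A(x, y, NLOOPS, /*cblas=*/false);
    auto const t1 = std::chrono::steady_clock::now();

    double const sec = std::chrono::duration<double>(t1 - t0).count() / NLOOPS;
    size_t const flops  = 2 * N;     // one multiply and one add per element
    size_t const memory = 2 * N;     // x and y are each read once per dot product
    print_performance(sec, memory, flops, sizeof(double));
    return 0;
}
#endif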