Compare commits
No commits in common. "582fc9b47af474834eb1a3b6afe8dedac5a4ed30" and "f5310c6391e5c3bc186c8106d09a49238f251c92" have entirely different histories.
582fc9b47a
...
f5310c6391
113 changed files with 12407 additions and 4620 deletions
123
CLANG_default.mk
Normal file
123
CLANG_default.mk
Normal file
|
|
@ -0,0 +1,123 @@
|
|||
# Basic Defintions for using GNU-compiler suite sequentially
|
||||
# requires setting of COMPILER=CLANG_
|
||||
|
||||
#CLANGPATH=//usr/lib/llvm-10/bin/
|
||||
CC = ${CLANGPATH}clang
|
||||
CXX = ${CLANGPATH}clang++
|
||||
#CXX = ${CLANGPATH}clang++ -lomptarget -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=/opt/pgi/linux86-64/2017/cuda/8.0
|
||||
#F77 = gfortran
|
||||
LINKER = ${CXX}
|
||||
|
||||
#http://clang.llvm.org/docs/UsersManual.html#options-to-control-error-and-warning-messages
|
||||
WARNINGS += -Weverything -Wno-c++98-compat -Wno-sign-conversion -Wno-date-time -Wno-shorten-64-to-32 -Wno-padded -ferror-limit=1
|
||||
WARNINGS += -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic
|
||||
#-fsyntax-only -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic
|
||||
|
||||
CXXFLAGS += -O3 -std=c++17 -ferror-limit=1 ${WARNINGS}
|
||||
# don't use -Ofast
|
||||
# -ftrapv
|
||||
LINKFLAGS += -O3
|
||||
|
||||
# different libraries in Ubuntu or manajaró
|
||||
ifndef UBUNTU
|
||||
UBUNTU=1
|
||||
endif
|
||||
|
||||
# BLAS, LAPACK
|
||||
LINKFLAGS += -llapack -lblas
|
||||
# -lopenblas
|
||||
ifeq ($(UBUNTU),1)
|
||||
# ubuntu
|
||||
else
|
||||
# on archlinux
|
||||
LINKFLAGS += -lcblas
|
||||
endif
|
||||
|
||||
# interprocedural optimization
|
||||
CXXFLAGS += -flto
|
||||
LINKFLAGS += -flto
|
||||
|
||||
# very good check
|
||||
# http://clang.llvm.org/extra/clang-tidy/
|
||||
# good check, see: http://llvm.org/docs/CodingStandards.html#include-style
|
||||
SWITCH_OFF=,-readability-magic-numbers,-readability-redundant-control-flow,-readability-redundant-member-init
|
||||
SWITCH_OFF+=,-readability-redundant-member-init,-readability-isolate-declaration
|
||||
#READABILITY=,readability*${SWITCH_OFF}
|
||||
#TIDYFLAGS = -checks=llvm-*,-llvm-header-guard -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
|
||||
TIDYFLAGS = -checks=llvm-*,-llvm-header-guard${READABILITY} -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
|
||||
#TIDYFLAGS += -checks='modernize*
|
||||
# ???
|
||||
#TIDYFLAGS = -checks='cert*' -header-filter=.*
|
||||
# MPI checks ??
|
||||
#TIDYFLAGS = -checks='mpi*'
|
||||
# ??
|
||||
#TIDYFLAGS = -checks='performance*' -header-filter=.*
|
||||
#TIDYFLAGS = -checks='portability-*' -header-filter=.*
|
||||
#TIDYFLAGS = -checks='readability-*' -header-filter=.*
|
||||
|
||||
default: ${PROGRAM}
|
||||
|
||||
${PROGRAM}: ${OBJECTS}
|
||||
$(LINKER) $^ ${LINKFLAGS} -o $@
|
||||
|
||||
clean:
|
||||
@rm -f ${PROGRAM} ${OBJECTS}
|
||||
|
||||
clean_all:: clean
|
||||
@rm -f *_ *~ *.bak *.log *.out *.tar
|
||||
|
||||
codecheck: tidy_check
|
||||
tidy_check:
|
||||
clang-tidy ${SOURCES} ${TIDYFLAGS} -- ${SOURCES}
|
||||
# see also http://clang-developers.42468.n3.nabble.com/Error-while-trying-to-load-a-compilation-database-td4049722.html
|
||||
|
||||
run: clean ${PROGRAM}
|
||||
# time ./${PROGRAM} ${PARAMS}
|
||||
./${PROGRAM} ${PARAMS}
|
||||
|
||||
# tar the current directory
|
||||
MY_DIR = `basename ${PWD}`
|
||||
tar: clean_all
|
||||
@echo "Tar the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
# tar cf `basename ${PWD}`.tar *
|
||||
|
||||
doc:
|
||||
doxygen Doxyfile
|
||||
|
||||
#########################################################################
|
||||
|
||||
.cpp.o:
|
||||
$(CXX) -c $(CXXFLAGS) -o $@ $<
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) -o $@ $<
|
||||
|
||||
.f.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
##################################################################################################
|
||||
# some tools
|
||||
# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags)
|
||||
cache: ${PROGRAM}
|
||||
valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS}
|
||||
# kcachegrind callgrind.out.<pid> &
|
||||
kcachegrind `ls -1tr callgrind.out.* |tail -1`
|
||||
|
||||
# Check for wrong memory accesses, memory leaks, ...
|
||||
# use smaller data sets
|
||||
mem: ${PROGRAM}
|
||||
valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS}
|
||||
|
||||
# Simple run time profiling of your code
|
||||
# CXXFLAGS += -g -pg
|
||||
# LINKFLAGS += -pg
|
||||
prof: ${PROGRAM}
|
||||
perf record ./$^ ${PARAMS}
|
||||
perf report
|
||||
# gprof -b ./$^ > gp.out
|
||||
# kprof -f gp.out -p gprof &
|
||||
|
||||
codecheck: tidy_check
|
||||
196
GCC_default.mk
Normal file
196
GCC_default.mk
Normal file
|
|
@ -0,0 +1,196 @@
|
|||
# Basic Defintions for using GNU-compiler suite sequentially
|
||||
# requires setting of COMPILER=GCC_
|
||||
|
||||
CC = gcc
|
||||
CXX = g++
|
||||
F77 = gfortran
|
||||
LINKER = ${CXX}
|
||||
|
||||
WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
|
||||
-Wredundant-decls -fmax-errors=1
|
||||
# -Wunreachable-code -Winline
|
||||
CXXFLAGS += -ffast-math -O3 -march=native -std=c++17 ${WARNINGS}
|
||||
#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS}
|
||||
#-msse3
|
||||
# -ftree-vectorizer-verbose=2 -DNDEBUG
|
||||
# -ftree-vectorizer-verbose=5
|
||||
# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump -fdump-tree-pre=stderr
|
||||
|
||||
# CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details
|
||||
# CFLAGS = -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2
|
||||
# #CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
|
||||
# FFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
|
||||
# LFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
|
||||
LINKFLAGS += -O3
|
||||
|
||||
#architecture
|
||||
#CPU = -march=znver2
|
||||
CXXFLAGS += ${CPU}
|
||||
LINKFLAGS += ${CPU}
|
||||
|
||||
# different libraries in Ubuntu or manajaró
|
||||
ifndef UBUNTU
|
||||
UBUNTU=1
|
||||
endif
|
||||
|
||||
# BLAS, LAPACK
|
||||
ifeq ($(UBUNTU),1)
|
||||
LINKFLAGS += -llapack -lblas
|
||||
# -lopenblas
|
||||
else
|
||||
# on archlinux
|
||||
LINKFLAGS += -llapack -lopenblas -lcblas
|
||||
endif
|
||||
|
||||
# interprocedural optimization
|
||||
#CXXFLAGS += -flto
|
||||
#LINKFLAGS += -flto
|
||||
|
||||
# for debugging purpose (save code)
|
||||
# -fsanitize=leak # only one out the three can be used
|
||||
# -fsanitize=address
|
||||
# -fsanitize=thread
|
||||
SANITARY = -fsanitize=address -fsanitize=undefined -fsanitize=null -fsanitize=return \
|
||||
-fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \
|
||||
-fsanitize=bool -fsanitize=enum -fsanitize=vptr
|
||||
#CXXFLAGS += ${SANITARY}
|
||||
#LINKFLAGS += ${SANITARY}
|
||||
|
||||
# profiling tools
|
||||
#CXXFLAGS += -pg
|
||||
#LINKFLAGS += -pg
|
||||
|
||||
|
||||
default: ${PROGRAM}
|
||||
|
||||
${PROGRAM}: ${OBJECTS}
|
||||
$(LINKER) $^ ${LINKFLAGS} -o $@
|
||||
|
||||
clean:
|
||||
@rm -f ${PROGRAM} ${OBJECTS}
|
||||
|
||||
clean_all:: clean
|
||||
-@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt
|
||||
-@rm -rf html
|
||||
|
||||
run: clean ${PROGRAM}
|
||||
#run: ${PROGRAM}
|
||||
# time ./${PROGRAM} ${PARAMS}
|
||||
./${PROGRAM} ${PARAMS}
|
||||
|
||||
# tar the current directory
|
||||
MY_DIR = `basename ${PWD}`
|
||||
tar: clean_all
|
||||
@echo "Tar the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
# tar cf `basename ${PWD}`.tar *
|
||||
#find . -size +10M > large_files
|
||||
#--exclude-from ${MY_DIR}/large_files
|
||||
|
||||
zip: clean
|
||||
@echo "Zip the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
zip -r ${MY_DIR}.zip ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
|
||||
doc:
|
||||
doxygen Doxyfile
|
||||
|
||||
#########################################################################
|
||||
.SUFFIXES: .f90
|
||||
|
||||
.cpp.o:
|
||||
$(CXX) -c $(CXXFLAGS) -o $@ $<
|
||||
# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $<.log
|
||||
# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $(<:.cpp=.log)
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) -o $@ $<
|
||||
|
||||
.f.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
.f90.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
##################################################################################################
|
||||
# some tools
|
||||
# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags)
|
||||
cache: ${PROGRAM}
|
||||
valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS}
|
||||
# kcachegrind callgrind.out.<pid> &
|
||||
kcachegrind `ls -1tr callgrind.out.* |tail -1`
|
||||
|
||||
# Check for wrong memory accesses, memory leaks, ...
|
||||
# use smaller data sets
|
||||
# no "-pg" in compile/link options
|
||||
mem: ${PROGRAM}
|
||||
valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS}
|
||||
# Graphical interface
|
||||
# valkyrie
|
||||
|
||||
# Simple run time profiling of your code
|
||||
CXXFLAGS += -g -pg
|
||||
LINKFLAGS += -pg
|
||||
prof: ${PROGRAM}
|
||||
./$^ ${PARAMS}
|
||||
gprof -b ./$^ > gp.out
|
||||
# kprof -f gp.out -p gprof &
|
||||
|
||||
# sudo apt install gprofng-gui
|
||||
# https://parallel.computer/presentations/PPoPP2023/2023-Ruud-Slides.pdf
|
||||
# read §3 in https://sourceware.org/binutils/docs/gprofng.html
|
||||
# /usr/bin/gp-collect-app -o test.1.er -p on -S on /home/ghaase/Lectures/Math2CPP/Codes/seq/jacobi_oo_stl/main.GCC_
|
||||
prof2: ${PROGRAM}
|
||||
gprofng collect app -h auto ./$^ ${PARAMS}
|
||||
# gprofng display text -functions `ls -1tdr test.*.er |tail -1`
|
||||
gprofng display text -script gprofng_script2 `ls -1tdr test.*.er |tail -1`
|
||||
# gprofng display text -script gprofng_script2 test.*.er
|
||||
# gprofng display gui &
|
||||
|
||||
prof3: ${PROGRAM}
|
||||
perf record ./$^ ${PARAMS}
|
||||
perf report
|
||||
# perf in Ubuntu 20.04: https://www.howtoforge.com/how-to-install-perf-performance-analysis-tool-on-ubuntu-20-04/
|
||||
# * install
|
||||
# * sudo vi /etc/sysctl.conf
|
||||
# add kernel.perf_event_paranoid = 0
|
||||
|
||||
#Trace your heap:
|
||||
#> heaptrack ./main.GCC_
|
||||
#> heaptrack_gui heaptrack.main.GCC_.<pid>.gz
|
||||
heap: ${PROGRAM}
|
||||
heaptrack ./$^ ${PARAMS}
|
||||
heaptrack_gui `ls -1tr heaptrack.$^.* |tail -1` &
|
||||
|
||||
codecheck: $(SOURCES)
|
||||
cppcheck --enable=all --inconclusive --std=c++17 --suppress=missingIncludeSystem $^
|
||||
|
||||
|
||||
########################################################################
|
||||
# get the detailed status of all optimization flags
|
||||
info:
|
||||
echo "detailed status of all optimization flags"
|
||||
$(CXX) --version
|
||||
$(CXX) -Q $(CXXFLAGS) --help=optimizers
|
||||
lscpu
|
||||
inxi -C
|
||||
lstopo
|
||||
|
||||
# Excellent hardware info
|
||||
# hardinfo
|
||||
# Life monitoring of CPU frequency etc.
|
||||
# sudo i7z
|
||||
|
||||
# Memory consumption
|
||||
# vmstat -at -SM 3
|
||||
# xfce4-taskmanager
|
||||
|
||||
|
||||
# https://www.tecmint.com/check-linux-cpu-information/
|
||||
#https://www.tecmint.com/monitor-cpu-and-gpu-temperature-in-ubuntu/
|
||||
|
||||
# Debugging:
|
||||
# https://wiki.archlinux.org/index.php/Debugging
|
||||
|
|
@ -1,99 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#ifdef _OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
#include <unordered_map>
|
||||
|
||||
//#####################################
|
||||
// G.Haase
|
||||
// See https://sourceforge.net/p/predef/wiki/Compilers/
|
||||
// http://www.cplusplus.com/doc/tutorial/preprocessor/
|
||||
// also: export OMP_DISPLAY_ENV=VERBOSE
|
||||
//#####################################
|
||||
/** Checks for compilers, its versions, threads etc.
|
||||
*
|
||||
@param[in] argc number of command line arguemnts
|
||||
@param[in] argv command line arguments as array of C-strings
|
||||
*/
|
||||
template <class T>
|
||||
void check_env(T argc, char const *argv[])
|
||||
{
|
||||
std::cout << "\n#######################################################################\n";
|
||||
std::cout << "Code :";
|
||||
for (T k = 0; k < argc; ++k) std::cout << " " << argv[k];
|
||||
std::cout << std::endl;
|
||||
|
||||
// compiler: https://sourceforge.net/p/predef/wiki/Compilers/
|
||||
std::cout << "Compiler: ";
|
||||
#if defined __INTEL_COMPILER
|
||||
#pragma message(" ########## INTEL ###############")
|
||||
std::cout << "INTEL " << __INTEL_COMPILER;
|
||||
// Ignore warnings for #pragma acc unrecognice
|
||||
#pragma warning disable 161
|
||||
// Ignore warnings for #pragma omp unrecognice
|
||||
#pragma warning disable 3180
|
||||
|
||||
#elif defined __PGI
|
||||
#pragma message(" ########## PGI ###############")
|
||||
std::cout << "PGI " << __PGIC__ << "." << __PGIC_MINOR__ << "." << __PGIC_PATCHLEVEL__;
|
||||
#elif defined __clang__
|
||||
#pragma message(" ########## CLANG ###############")
|
||||
std::cout << "CLANG " << __clang_major__ << "." << __clang_minor__ << "."; // << __clang_patchlevel__;
|
||||
#elif defined __GNUC__
|
||||
#pragma message(" ########## Gnu ###############")
|
||||
std::cout << "Gnu " << __GNUC__ << "." << __GNUC_MINOR__ << "." << __GNUC_PATCHLEVEL__;
|
||||
#else
|
||||
#pragma message(" ########## unknown Compiler ###############")
|
||||
std::cout << "unknown";
|
||||
#endif
|
||||
std::cout << " C++ standard: " << __cplusplus << std::endl;
|
||||
|
||||
// Parallel environments
|
||||
std::cout << "Parallel: ";
|
||||
#if defined MPI_VERSION
|
||||
#pragma message(" ########## MPI ###############")
|
||||
#ifdef OPEN_MPI
|
||||
std::cout << "OpenMPI ";
|
||||
#else
|
||||
std::cout << "MPI ";
|
||||
#endif
|
||||
std::cout << MPI_VERSION << "." << MPI_SUBVERSION << " ";
|
||||
#endif
|
||||
|
||||
#ifdef _OPENMP
|
||||
//https://www.openmp.org/specifications/
|
||||
//https://stackoverflow.com/questions/1304363/how-to-check-the-version-of-openmp-on-linux
|
||||
std::unordered_map<unsigned, std::string> const map{
|
||||
{200505, "2.5"}, {200805, "3.0"}, {201107, "3.1"}, {201307, "4.0"}, {201511, "4.5"}, {201611, "5.0"}, {201811, "5.0"}};
|
||||
#pragma message(" ########## OPENMP ###############")
|
||||
//std::cout << _OPENMP;
|
||||
std::cout << "OpenMP ";
|
||||
try {
|
||||
std::cout << map.at(_OPENMP);
|
||||
}
|
||||
catch (...) {
|
||||
std::cout << _OPENMP;
|
||||
}
|
||||
#pragma omp parallel
|
||||
{
|
||||
#pragma omp master
|
||||
{
|
||||
const int nn = omp_get_num_threads(); // OpenMP
|
||||
std::cout << " ---> " << nn << " Threads ";
|
||||
}
|
||||
#pragma omp barrier
|
||||
}
|
||||
|
||||
#endif
|
||||
#ifdef _OPENACC
|
||||
#pragma message(" ########## OPENACC ###############")
|
||||
std::cout << "OpenACC ";
|
||||
#endif
|
||||
std::cout << std::endl;
|
||||
std::cout << "Date : " << __DATE__ << " " << __TIME__;
|
||||
std::cout << "\n#######################################################################\n";
|
||||
}
|
||||
// HG
|
||||
|
||||
|
|
@ -1,142 +0,0 @@
|
|||
#include "check_env.h"
|
||||
#include "mylib.h"
|
||||
#include <cstdlib> // atoi()
|
||||
#include <cstring> // strncmp()
|
||||
#include <ctime>
|
||||
#include <iostream>
|
||||
#include <omp.h> // OpenMP
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char const *argv[])
|
||||
{
|
||||
omp_set_schedule(omp_sched_static, 2000000);
|
||||
//omp_set_schedule(omp_sched_dynamic, 1000000);
|
||||
//omp_set_schedule(omp_sched_guided, 1000000);
|
||||
//omp_set_schedule(omp_sched_auto, 1); // chunk size does not matter for auto
|
||||
|
||||
|
||||
// Speedup for different number of cores (incl. hyperthreading)
|
||||
omp_set_num_threads(8);
|
||||
|
||||
// Print number of available processors
|
||||
cout << "Number of available processors: " << omp_get_num_procs() << endl;
|
||||
|
||||
// Currently executing parallel code? -> no
|
||||
cout << "Currently in parallel? " << omp_in_parallel() << endl;
|
||||
|
||||
|
||||
int const NLOOPS = 10; // chose a value such that the benchmark runs at least 10 sec.
|
||||
unsigned int N = 500000001;
|
||||
|
||||
|
||||
//##########################################################################
|
||||
// Read Parameter from command line (C++ style)
|
||||
cout << "Checking command line parameters for: -n <number> " << endl;
|
||||
for (int i = 1; i < argc; i++)
|
||||
{
|
||||
cout << " arg[" << i << "] = " << argv[i] << endl;
|
||||
string ss(argv[i]);
|
||||
if ("-n"==ss && i + 1 < argc) // found "-n" followed by another parameter
|
||||
{
|
||||
N = static_cast<unsigned int>(atoi(argv[i + 1]));
|
||||
}
|
||||
else
|
||||
{
|
||||
cout << "Corect call: " << argv[0] << " -n <number>\n";
|
||||
}
|
||||
}
|
||||
|
||||
cout << "\nN = " << N << endl;
|
||||
|
||||
check_env(argc, argv);
|
||||
//########################################################################
|
||||
int nthreads; // OpenMP
|
||||
#pragma omp parallel default(none) shared(cout,nthreads)
|
||||
{
|
||||
stringstream inparallel;
|
||||
inparallel << "Currently in parallel? " << omp_in_parallel() << endl;
|
||||
|
||||
|
||||
int const th_id = omp_get_thread_num(); // OpenMP
|
||||
int const nthrds = omp_get_num_threads(); // OpenMP
|
||||
stringstream ss;
|
||||
ss << "C++: Hello World from thread " << th_id << " / " << nthrds << endl;
|
||||
#pragma omp critical
|
||||
{
|
||||
cout << ss.str(); // output to a shared ressource
|
||||
cout << inparallel.str() << endl;
|
||||
}
|
||||
#pragma omp master
|
||||
nthreads = nthrds; // transfer nn to to master thread
|
||||
}
|
||||
cout << " " << nthreads << " threads have been started." << endl;
|
||||
|
||||
//##########################################################################
|
||||
// Memory allocation
|
||||
cout << "Memory allocation\n";
|
||||
|
||||
vector<double> x(N), y(N);
|
||||
|
||||
cout.precision(2);
|
||||
cout << 2.0 * N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
|
||||
cout.precision(6);
|
||||
|
||||
//##########################################################################
|
||||
// Data initialization
|
||||
// Special: x_i = i+1; y_i = 1/x_i ==> <x,y> == N
|
||||
for (unsigned int i = 0; i < N; ++i)
|
||||
{
|
||||
x[i] = i + 1;
|
||||
y[i] = 1.0 / x[i];
|
||||
}
|
||||
|
||||
//##########################################################################
|
||||
cout << "\nStart Benchmarking\n";
|
||||
|
||||
// Do calculation
|
||||
double tstart = omp_get_wtime(); // OpenMP
|
||||
|
||||
double sk(0.0);
|
||||
for (int i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
//sk = scalar(x, y);
|
||||
sk = scalar_parallel(x, y);
|
||||
//sk = scalar_trans(x, y);
|
||||
//sk = norm(x);
|
||||
}
|
||||
|
||||
double t1 = omp_get_wtime() - tstart; // OpenMP
|
||||
|
||||
t1 /= NLOOPS; // divide by number of function calls
|
||||
|
||||
//##########################################################################
|
||||
// Check the correct result
|
||||
cout << "\n <x,y> = " << sk << endl;
|
||||
if (static_cast<unsigned int>(sk) != N)
|
||||
{
|
||||
cout << " !! W R O N G result !!\n";
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
//##########################################################################
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
cout << "Total benchmarking time: " << t1*NLOOPS << endl;
|
||||
cout << "Timing in sec. : " << t1 << endl;
|
||||
cout << "GFLOPS : " << 2.0 * N / t1 / 1024 / 1024 / 1024 << endl;
|
||||
cout << "GiByte/s : " << 2.0 * N / t1 / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
|
||||
|
||||
//#########################################################################
|
||||
|
||||
cout << "\n Try the reduction with an STL-vektor \n";
|
||||
|
||||
auto vr = reduction_vec_append(5);
|
||||
cout << "done\n";
|
||||
cout << vr << endl;
|
||||
|
||||
|
||||
return 0;
|
||||
} // memory for x and y will be deallocated their destructors
|
||||
|
|
@ -1,137 +0,0 @@
|
|||
#include "mylib.h"
|
||||
#include <cassert> // assert()
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <functional> // multiplies<>{}
|
||||
#include <list>
|
||||
#include <numeric> // iota()
|
||||
#ifdef _OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
|
||||
double scalar_parallel(vector<double> const &x, vector<double> const &y)
|
||||
{
|
||||
assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
|
||||
size_t const N = x.size();
|
||||
double sum = 0.0;
|
||||
#pragma omp parallel default(none) shared(x,y,N, cout) reduction(+:sum)
|
||||
{
|
||||
const size_t nthreads = omp_get_num_threads();
|
||||
const size_t threadnum = omp_get_thread_num();
|
||||
const size_t chunksize = N/nthreads;
|
||||
|
||||
|
||||
size_t start = threadnum*chunksize;
|
||||
size_t end = start + chunksize;
|
||||
if (threadnum == nthreads - 1)
|
||||
end = N;
|
||||
|
||||
|
||||
for (size_t i = start; i < end; ++i)
|
||||
{
|
||||
sum += x[i] * y[i];
|
||||
}
|
||||
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
vector<int> reduction_vec_append(int n)
|
||||
{
|
||||
vector<int> vec(n);
|
||||
#pragma omp parallel default(none) shared(cout) reduction(VecAppend:vec)
|
||||
{
|
||||
#pragma omp barrier
|
||||
#pragma omp critical
|
||||
cout << omp_get_thread_num() << " : " << vec.size() << endl;
|
||||
#pragma omp barrier
|
||||
iota( vec.begin(),vec.end(), omp_get_thread_num() );
|
||||
#pragma omp barrier
|
||||
|
||||
}
|
||||
return vec;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
double scalar(vector<double> const &x, vector<double> const &y)
|
||||
{
|
||||
assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
|
||||
size_t const N = x.size();
|
||||
double sum = 0.0;
|
||||
#pragma omp parallel for default(none) shared(x,y,N) reduction(+:sum) schedule(runtime) // added schedule(runtime)
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
sum += x[i] * y[i];
|
||||
//sum += exp(x[i])*log(y[i]);
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
double norm(vector<double> const &x)
|
||||
{
|
||||
size_t const N = x.size();
|
||||
double sum = 0.0;
|
||||
#pragma omp parallel for default(none) shared(x,N) reduction(+:sum) schedule(runtime) // added schedule(runtime)
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
sum += x[i]*x[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
vector<int> reduction_vec(int n)
|
||||
{
|
||||
vector<int> vec(n);
|
||||
#pragma omp parallel default(none) shared(cout) reduction(VecAdd:vec)
|
||||
{
|
||||
#pragma omp barrier
|
||||
#pragma omp critical
|
||||
cout << omp_get_thread_num() << " : " << vec.size() << endl;
|
||||
#pragma omp barrier
|
||||
iota( vec.begin(),vec.end(), omp_get_thread_num() );
|
||||
#pragma omp barrier
|
||||
|
||||
}
|
||||
return vec;
|
||||
}
|
||||
|
||||
|
||||
double scalar_trans(vector<double> const &x, vector<double> const &y)
|
||||
{
|
||||
assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
|
||||
vector<double> z(x.size());
|
||||
//list<double> z(x.size()); // parallel for-loop on iterators not possible (missing 'operator-')
|
||||
// c++-20 CLANG_, ONEAPI_:condition of OpenMP for loop must be a relational comparison
|
||||
|
||||
transform(cbegin(x),cend(x),cbegin(y),begin(z),std::multiplies<>{});
|
||||
|
||||
double sum = 0.0;
|
||||
#pragma omp parallel for default(none) shared(z) reduction(+:sum) schedule(runtime) // added schedule(runtime)
|
||||
for (auto pi = cbegin(z); pi!=cend(z); ++pi)
|
||||
{
|
||||
sum += *pi;
|
||||
}
|
||||
//for (auto val: z)
|
||||
//{
|
||||
//sum += val;
|
||||
//}
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,88 +0,0 @@
|
|||
#pragma once
|
||||
#include <cassert>
|
||||
#include <iomanip> // setw()
|
||||
#include <iostream>
|
||||
#include <omp.h>
|
||||
#include <vector>
|
||||
|
||||
/** Inner product
|
||||
@param[in] x vector
|
||||
@param[in] y vector
|
||||
@return resulting Euclidian inner product <x,y>
|
||||
*/
|
||||
double scalar_parallel(std::vector<double> const &x, std::vector<double> const &y);
|
||||
double scalar(std::vector<double> const &x, std::vector<double> const &y);
|
||||
double scalar_trans(std::vector<double> const &x, std::vector<double> const &y);
|
||||
|
||||
|
||||
|
||||
// Declare additional reduction operation in OpenMP for STL-vector
|
||||
#pragma omp declare reduction(VecAppend : std::vector<int> : omp_out.insert(omp_out.end(), omp_in.begin(), omp_in.end())) \
|
||||
initializer (omp_priv=omp_orig)
|
||||
|
||||
std::vector<int> reduction_vec_append(int n);
|
||||
|
||||
|
||||
/** l2-norm
|
||||
@param[in] x vector
|
||||
@return resulting Euclidian norm
|
||||
*/
|
||||
double norm(std::vector<double> const &x);
|
||||
|
||||
/** Vector @p b adds its elements to vector @p a .
|
||||
@param[in] a vector
|
||||
@param[in] b vector
|
||||
@return a+=b componentwise
|
||||
*/
|
||||
template<class T>
|
||||
std::vector<T> &operator+=(std::vector<T> &a, std::vector<T> const &b)
|
||||
{
|
||||
assert(a.size()==b.size());
|
||||
for (size_t k = 0; k < a.size(); ++k) {
|
||||
a[k] += b[k];
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
// Declare the reduction operation in OpenMP for an STL-vector
|
||||
// omp_out += omp_in requires operator+=(vector<int> &, vector<int> const &) from above
|
||||
// ------------------------------------------------------------
|
||||
// https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm
|
||||
// https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c
|
||||
// https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size
|
||||
// see also p.74ff in https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf
|
||||
#pragma omp declare reduction(VecAdd : std::vector<int> : omp_out += omp_in) \
|
||||
initializer (omp_priv=omp_orig)
|
||||
|
||||
// Templates are n o t possible, i.e. the reduction has to be declared fore a specified type.
|
||||
//template <class T>
|
||||
//#pragma omp declare reduction(VecAdd : std::vector<T> : omp_out += omp_in) initializer (omp_priv(omp_orig))
|
||||
// MS: template nach #pragma !?
|
||||
|
||||
// ------------------------------------------------------------
|
||||
|
||||
/** Test for vector reduction.
|
||||
*
|
||||
* The thread-private vectors of size @p n are initialized via @f$v_k^{tID}=tID+k@f$.
|
||||
* Afterwards these vectors are accumulated, i.e.,
|
||||
* @f$v_k= \sum_{tID=0}^{numThreads} v_k^{tID}@f$.
|
||||
*
|
||||
* @param[in] n size of global/private vector
|
||||
* @return resulting global vector.
|
||||
*/
|
||||
std::vector<int> reduction_vec(int n);
|
||||
|
||||
|
||||
|
||||
/** Output of a vector.
|
||||
@param[in,out] s output stream
|
||||
@param[in] x vector
|
||||
@return modified output stream
|
||||
*/
|
||||
template <class T>
|
||||
std::ostream &operator<<(std::ostream &s, std::vector<T> const &x)
|
||||
{
|
||||
for (auto const &v : x) s << std::setw(4) << v << " ";
|
||||
return s;
|
||||
}
|
||||
|
||||
|
|
@ -1,70 +0,0 @@
|
|||
#pragma once
|
||||
#include <chrono> // timing
|
||||
#include <stack>
|
||||
|
||||
using Clock = std::chrono::system_clock; //!< The wall clock timer chosen
|
||||
//using Clock = std::chrono::high_resolution_clock;
|
||||
using TPoint= std::chrono::time_point<Clock>;
|
||||
|
||||
// [Galowicz, C++17 STL Cookbook, p. 29]
|
||||
inline
|
||||
std::stack<TPoint> MyStopWatch; //!< starting time of stopwatch
|
||||
|
||||
/** Starts stopwatch timer.
|
||||
* Use as @code tic(); myfunction(...) ; double tsec = toc(); @endcode
|
||||
*
|
||||
* The timining is allowed to be nested and the recent time is stored on top of the stack.
|
||||
*
|
||||
* @return recent time
|
||||
* @see toc
|
||||
*/
|
||||
inline auto tic()
|
||||
{
|
||||
MyStopWatch.push(Clock::now());
|
||||
return MyStopWatch.top();
|
||||
}
|
||||
|
||||
/** Returns the elapsed time from stopwatch.
|
||||
*
|
||||
* The time from top of the stack is used
|
||||
* if time point @p t_b is not passed as input parameter.
|
||||
* Use as @code tic(); myfunction(...) ; double tsec = toc(); @endcode
|
||||
* or as @code auto t_b = tic(); myfunction(...) ; double tsec = toc(t_b); @endcode
|
||||
* The last option is to be used in the case of
|
||||
* non-nested but overlapping time measurements.
|
||||
*
|
||||
* @param[in] t_b start time of some stop watch
|
||||
* @return elapsed time in seconds.
|
||||
*
|
||||
*/
|
||||
inline double toc(TPoint const &t_b = MyStopWatch.top())
|
||||
{
|
||||
// https://en.cppreference.com/w/cpp/chrono/treat_as_floating_point
|
||||
using Unit = std::chrono::seconds;
|
||||
using FpSeconds = std::chrono::duration<double, Unit::period>;
|
||||
auto t_e = Clock::now();
|
||||
MyStopWatch.pop();
|
||||
return FpSeconds(t_e-t_b).count();
|
||||
}
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
/** Executes function @p f and measures/prints elapsed wall clock time in seconds
|
||||
*
|
||||
* Call as
|
||||
* @code measure("Time for (b = b + 1)", [&]() {
|
||||
thrust::transform(b.begin(), b.end(), b.begin(), increment());
|
||||
}); @endcode
|
||||
*
|
||||
* @param[in] label additional string to be printed with the measurement.
|
||||
* @param[in] f function to execute.
|
||||
* @author Therese Bösmüller, 2025
|
||||
*
|
||||
*/
|
||||
auto measure = [](const std::string& label, auto&& f) {
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
f();
|
||||
auto stop = std::chrono::high_resolution_clock::now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
|
||||
std::cout << label << ": " << duration << " microseconds" << std::endl;
|
||||
}; // ';' is needed for a visible documentation of this lambda-function
|
||||
Binary file not shown.
|
|
@ -1,130 +0,0 @@
|
|||
#include "mylib.h"
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <omp.h>
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
// read vector from file
|
||||
vector<size_t> data_vector = {};
|
||||
|
||||
ifstream input_stream("data_1.txt");
|
||||
|
||||
size_t line;
|
||||
while(input_stream >> line)
|
||||
{
|
||||
data_vector.push_back(line);
|
||||
}
|
||||
data_vector.shrink_to_fit();
|
||||
|
||||
|
||||
|
||||
|
||||
// specify loops
|
||||
size_t NLOOPS = 10000;
|
||||
|
||||
|
||||
|
||||
// ############# Parallelization with openMP #############
|
||||
// calculate arithmetic mean, geometric mean and harmonic mean
|
||||
double am_omp, gm_omp, hm_omp;
|
||||
|
||||
double tstart = omp_get_wtime();
|
||||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
means_omp(data_vector, am_omp, gm_omp, hm_omp);
|
||||
|
||||
double t_means_omp = (omp_get_wtime() - tstart)/NLOOPS;
|
||||
|
||||
// calculate minimum and maximum
|
||||
size_t min, max;
|
||||
|
||||
tstart = omp_get_wtime();
|
||||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
minmax_omp(data_vector, min, max);
|
||||
|
||||
double t_minmax_omp = (omp_get_wtime() - tstart)/NLOOPS;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// ############# Parallelization with C++ algorithms #############
|
||||
// calculate arithmetic mean, geometric mean and harmonic mean
|
||||
double am_cpp, gm_cpp, hm_cpp;
|
||||
|
||||
tstart = omp_get_wtime();
|
||||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
means_cpp(data_vector, am_cpp, gm_cpp, hm_cpp);
|
||||
|
||||
double t_means_cpp = (omp_get_wtime() - tstart)/NLOOPS;
|
||||
|
||||
// calculate minimum and maximum
|
||||
size_t min_cpp, max_cpp;
|
||||
|
||||
tstart = omp_get_wtime();
|
||||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
minmax_cpp(data_vector, min_cpp, max_cpp);
|
||||
|
||||
double t_minmax_cpp = (omp_get_wtime() - tstart)/NLOOPS;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// print results
|
||||
cout << "####### OpenMP #######" << endl;
|
||||
cout << "minimum: " << min << endl;
|
||||
cout << "maximum: " << max << endl;
|
||||
cout << "duration: " << t_minmax_omp << endl << endl;
|
||||
|
||||
cout << "arithmetic mean: " << am_omp << endl;
|
||||
cout << "geometric mean: " << gm_omp << endl;
|
||||
cout << "harmonic mean: " << hm_omp << endl;
|
||||
cout << "duration: " << t_means_omp << endl << endl;
|
||||
|
||||
|
||||
cout << "####### C++ #######" << endl;
|
||||
cout << "minimum: " << min_cpp << endl;
|
||||
cout << "maximum: " << max_cpp << endl;
|
||||
cout << "duration: " << t_minmax_cpp << endl << endl;
|
||||
|
||||
cout << "arithmetic mean: " << am_cpp << endl;
|
||||
cout << "geometric mean: " << gm_cpp << endl;
|
||||
cout << "harmonic mean: " << hm_cpp << endl;
|
||||
cout << "duration: " << t_means_cpp << endl << endl;
|
||||
|
||||
|
||||
|
||||
// ####### OpenMP #######
|
||||
// minimum: 1
|
||||
// maximum: 1000
|
||||
// duration: 3.52086e-06
|
||||
|
||||
// arithmetic mean: 498.184
|
||||
// geometric mean: 364.412
|
||||
// harmonic mean: 95.6857
|
||||
// duration: 5.90171e-06
|
||||
|
||||
// ####### C++ #######
|
||||
// minimum: 1
|
||||
// maximum: 1000
|
||||
// duration: 1.76816e-05
|
||||
|
||||
// arithmetic mean: 498.184
|
||||
// geometric mean: 364.412
|
||||
// harmonic mean: 95.6857
|
||||
// duration: 2.35728e-05
|
||||
|
||||
// --> the openMP variant is faster in both cases
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
Binary file not shown.
|
|
@ -1,103 +0,0 @@
|
|||
#include "mylib.h"
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <execution>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <omp.h>
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
void means_omp(const std::vector<size_t> numbers, double &am, double &gm, double &hm)
|
||||
{
|
||||
size_t const n = numbers.size();
|
||||
|
||||
am = 0.;
|
||||
gm = 0.;
|
||||
hm = 0.;
|
||||
|
||||
#pragma omp parallel for shared(numbers, n, cout) reduction(+:am, gm, hm)
|
||||
for (size_t i = 0; i < n; ++i)
|
||||
{
|
||||
am += numbers[i];
|
||||
gm += log(numbers[i]);
|
||||
hm += 1.0/numbers[i];
|
||||
|
||||
// #pragma omp critical
|
||||
// {
|
||||
// cout << "Thread number " << omp_get_thread_num() << " processes value " << numbers[i] << endl;
|
||||
// }
|
||||
}
|
||||
|
||||
am /= n;
|
||||
gm = exp(gm/n);
|
||||
hm = n/hm;
|
||||
}
|
||||
|
||||
|
||||
void minmax_omp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max)
|
||||
{
|
||||
size_t const n = numbers.size();
|
||||
|
||||
global_min = -1; // gives the maximum size_t value
|
||||
global_max = 0;
|
||||
|
||||
#pragma omp parallel shared(numbers, n, global_min, global_max)
|
||||
{
|
||||
const size_t nthreads = omp_get_num_threads();
|
||||
const size_t threadnum = omp_get_thread_num();
|
||||
const size_t chunksize = n/nthreads;
|
||||
|
||||
|
||||
size_t start = threadnum*chunksize;
|
||||
size_t end = start + chunksize;
|
||||
if (threadnum == nthreads - 1)
|
||||
end = n;
|
||||
|
||||
|
||||
size_t local_min = -1;
|
||||
size_t local_max = 0;
|
||||
for (size_t i = start; i < end ; ++i)
|
||||
{
|
||||
if (numbers[i] < local_min)
|
||||
local_min = numbers[i];
|
||||
|
||||
if (numbers[i] > local_max)
|
||||
local_max = numbers[i];
|
||||
}
|
||||
|
||||
#pragma omp critical
|
||||
{
|
||||
if (local_min < global_min)
|
||||
global_min = local_min;
|
||||
|
||||
if (local_max > global_max)
|
||||
global_max = local_max;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void means_cpp(const std::vector<size_t> numbers, double &am, double &gm, double &hm)
|
||||
{
|
||||
size_t const n = numbers.size();
|
||||
|
||||
am = reduce(std::execution::par, numbers.begin(), numbers.end());
|
||||
gm = transform_reduce(std::execution::par, numbers.begin(), numbers.end(), 0.0, plus{}, [] (size_t x) -> double { return log(x); } );
|
||||
hm = transform_reduce(std::execution::par, numbers.begin(), numbers.end(), 0.0, plus{}, [] (size_t x) -> double { return 1.0/x; });
|
||||
|
||||
am /= n;
|
||||
gm = exp(gm/n);
|
||||
hm = n/hm;
|
||||
}
|
||||
|
||||
|
||||
void minmax_cpp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max)
|
||||
{
|
||||
auto min_it = min_element(std::execution::par, numbers.begin(), numbers.end());
|
||||
auto max_it = max_element(std::execution::par, numbers.begin(), numbers.end());
|
||||
|
||||
global_min = *min_it;
|
||||
global_max = *max_it;
|
||||
}
|
||||
|
|
@ -1,42 +0,0 @@
|
|||
#include <vector>
|
||||
|
||||
/**
|
||||
This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector.
|
||||
Uses openMP parallelization.
|
||||
@param[in] numbers vector containing integers
|
||||
@param[out] am arithmetic mean
|
||||
@param[out] gm geometric mean
|
||||
@param[out] hm harmonic mean
|
||||
*/
|
||||
void means_omp(const std::vector<size_t> numbers, double &am, double &gm, double &hm);
|
||||
|
||||
|
||||
/**
|
||||
This function calculates the minimum and maximum of a vector.
|
||||
Uses openMP parallelization.
|
||||
@param[in] numbers vector containing integers
|
||||
@param[out] global_min minimum
|
||||
@param[out] global_max maximum
|
||||
*/
|
||||
void minmax_omp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max);
|
||||
|
||||
|
||||
/**
|
||||
This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector.
|
||||
Uses C++ parallelization.
|
||||
@param[in] numbers vector containing integers
|
||||
@param[out] am arithmetic mean
|
||||
@param[out] gm geometric mean
|
||||
@param[out] hm harmonic mean
|
||||
*/
|
||||
void means_cpp(const std::vector<size_t> numbers, double &am, double &gm, double &hm);
|
||||
|
||||
|
||||
/**
|
||||
This function calculates the minimum and maximum of a vector.
|
||||
Uses C++ parallelization.
|
||||
@param[in] numbers vector containing integers
|
||||
@param[out] global_min minimum
|
||||
@param[out] global_max maximum
|
||||
*/
|
||||
void minmax_cpp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max);
|
||||
Binary file not shown.
|
|
@ -1,30 +0,0 @@
|
|||
#
|
||||
# use GNU-Compiler tools
|
||||
COMPILER=GCC_
|
||||
# alternatively from the shell
|
||||
# export COMPILER=GCC_
|
||||
# or, alternatively from the shell
|
||||
# make COMPILER=GCC_
|
||||
|
||||
# use Intel compilers
|
||||
#COMPILER=ICC_
|
||||
|
||||
# use PGI compilers
|
||||
# COMPILER=PGI_
|
||||
|
||||
|
||||
SOURCES = main.cpp goldbach.cpp
|
||||
OBJECTS = $(SOURCES:.cpp=.o)
|
||||
|
||||
PROGRAM = main.${COMPILER}
|
||||
|
||||
# uncomment the next to lines for debugging and detailed performance analysis
|
||||
CXXFLAGS += -g
|
||||
LINKFLAGS += -g
|
||||
# do not use -pg with PGI compilers
|
||||
|
||||
ifndef COMPILER
|
||||
COMPILER=GCC_
|
||||
endif
|
||||
|
||||
include ../${COMPILER}default.mk
|
||||
|
|
@ -1,46 +0,0 @@
|
|||
#include "goldbach.h"
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <omp.h>
|
||||
|
||||
size_t single_goldbach(size_t k)
|
||||
{
|
||||
const std::vector<size_t> relevant_primes = get_primes(k);
|
||||
size_t m = relevant_primes.size();
|
||||
|
||||
size_t counter = 0;
|
||||
|
||||
#pragma omp parallel for shared(relevant_primes, m, k) reduction(+:counter)
|
||||
for(size_t i = 0; i < m; ++i)
|
||||
{
|
||||
for(size_t j = i; j < m; ++j)
|
||||
{
|
||||
if(relevant_primes[i] + relevant_primes[j] == k)
|
||||
++counter;
|
||||
}
|
||||
}
|
||||
|
||||
return counter;
|
||||
}
|
||||
|
||||
|
||||
std::vector<size_t> count_goldbach(size_t n)
|
||||
{
|
||||
const std::vector<size_t> relevant_primes = get_primes(n);
|
||||
size_t m = relevant_primes.size();
|
||||
|
||||
std::vector<size_t> counter_vector(n + 1, 0);
|
||||
|
||||
#pragma omp parallel for shared(relevant_primes, m, n) reduction(VecAdd:counter_vector)
|
||||
for(size_t i = 0; i < m; ++i)
|
||||
{
|
||||
for(size_t j = i; j < m; ++j)
|
||||
{
|
||||
size_t sum = relevant_primes[i] + relevant_primes[j];
|
||||
if(sum <= n)
|
||||
++counter_vector[relevant_primes[i] + relevant_primes[j]];
|
||||
}
|
||||
}
|
||||
|
||||
return counter_vector;
|
||||
}
|
||||
|
|
@ -1,45 +0,0 @@
|
|||
#pragma once
|
||||
#include "mayer_primes.h"
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
|
||||
|
||||
/**
|
||||
This function returns the number of possible decompositions of an integer into a sum of two prime numbers.
|
||||
@param[in] k first integer
|
||||
@param[out] count number of decompositions
|
||||
*/
|
||||
size_t single_goldbach(size_t k);
|
||||
|
||||
|
||||
/**
|
||||
This function returns the number of possible decompositions into a sum of two prime numbers of all even integers in the interval [4,n].
|
||||
@param[in] n upper integer bound
|
||||
@param[out] count_vector vector containing the number of decompositions for a natural number the corresponding index
|
||||
*/
|
||||
std::vector<size_t> count_goldbach(size_t n);
|
||||
|
||||
|
||||
/** Vector @p b adds its elements to vector @p a .
|
||||
@param[in] a vector
|
||||
@param[in] b vector
|
||||
@return a+=b componentwise
|
||||
*/
|
||||
template<class T>
|
||||
std::vector<T> &operator+=(std::vector<T> &a, std::vector<T> const &b)
|
||||
{
|
||||
assert(a.size()==b.size());
|
||||
for (size_t k = 0; k < a.size(); ++k) {
|
||||
a[k] += b[k];
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
// Declare the reduction operation in OpenMP for an STL-vector
|
||||
// omp_out += omp_in requires operator+=(vector<int> &, vector<int> const &) from above
|
||||
// ------------------------------------------------------------
|
||||
// https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm
|
||||
// https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c
|
||||
// https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size
|
||||
// see also p.74ff in https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf
|
||||
#pragma omp declare reduction(VecAdd : std::vector<size_t> : omp_out += omp_in) initializer (omp_priv=omp_orig)
|
||||
|
|
@ -1,45 +0,0 @@
|
|||
#include "goldbach.h"
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <omp.h>
|
||||
using namespace std;
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
cout << "Check: 694 has "<< single_goldbach(694) << " decompositions." << endl << "----------------------------------------" << endl;
|
||||
|
||||
for(size_t n : {10000, 100000, 400000, 1000000, 2000000})
|
||||
{
|
||||
double t_start = omp_get_wtime();
|
||||
|
||||
auto goldbach_vector = count_goldbach(n);
|
||||
|
||||
auto max_it = max_element(goldbach_vector.begin(), goldbach_vector.end());
|
||||
size_t max_number = distance(goldbach_vector.begin(), max_it);
|
||||
|
||||
double t_end = omp_get_wtime() - t_start;
|
||||
|
||||
cout << "The number " << max_number << " has " << *max_it << " decompositions. Duration: " << t_end << endl;
|
||||
}
|
||||
|
||||
/*
|
||||
###### WITHOUT PARALLELIZATION ######
|
||||
The number 9240 has 329 decompositions. Duration: 0.00307696
|
||||
The number 99330 has 2168 decompositions. Duration: 0.189839
|
||||
The number 390390 has 7094 decompositions. Duration: 1.3042
|
||||
The number 990990 has 15594 decompositions. Duration: 5.45034
|
||||
The number 1981980 has 27988 decompositions. Duration: 47.1807
|
||||
|
||||
|
||||
###### WITH PARALLELIZATION ######
|
||||
The number 9240 has 329 decompositions. Duration: 0.000734854
|
||||
The number 99330 has 2168 decompositions. Duration: 0.0251322
|
||||
The number 390390 has 7094 decompositions. Duration: 0.487375
|
||||
The number 990990 has 15594 decompositions. Duration: 6.16972
|
||||
The number 1981980 has 27988 decompositions. Duration: 31.5699
|
||||
*/
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -1,30 +0,0 @@
|
|||
#
|
||||
# use GNU-Compiler tools
|
||||
COMPILER=GCC_
|
||||
# alternatively from the shell
|
||||
# export COMPILER=GCC_
|
||||
# or, alternatively from the shell
|
||||
# make COMPILER=GCC_
|
||||
|
||||
# use Intel compilers
|
||||
#COMPILER=ICC_
|
||||
|
||||
# use PGI compilers
|
||||
# COMPILER=PGI_
|
||||
|
||||
|
||||
SOURCES = main.cpp benchmarks.cpp benchmark_tests.cpp
|
||||
OBJECTS = $(SOURCES:.cpp=.o)
|
||||
|
||||
PROGRAM = main.${COMPILER}
|
||||
|
||||
# uncomment the next to lines for debugging and detailed performance analysis
|
||||
CXXFLAGS += -g
|
||||
LINKFLAGS += -g
|
||||
# do not use -pg with PGI compilers
|
||||
|
||||
ifndef COMPILER
|
||||
COMPILER=GCC_
|
||||
endif
|
||||
|
||||
include ../${COMPILER}default.mk
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
#pragma once
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
vector<double> test_A(const size_t &NLOOPS, const size_t &N);
|
||||
|
||||
vector<double> test_A_sum(const size_t &NLOOPS, const size_t &N);
|
||||
|
||||
vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M);
|
||||
|
||||
vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N);
|
||||
|
||||
vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p);
|
||||
|
|
@ -1,141 +0,0 @@
|
|||
#include "benchmarks.h"
|
||||
#include <cassert> // assert()
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <omp.h>
|
||||
|
||||
// (A) Inner product of two vectors (from skalar_stl)
|
||||
double scalar_parallel(vector<double> const &x, vector<double> const &y)
|
||||
{
|
||||
assert(x.size() == y.size());
|
||||
size_t const N = x.size();
|
||||
double sum = 0.0;
|
||||
//#pragma omp parallel for default(none) shared(x, y, N) reduction(+:sum) schedule(runtime)
|
||||
#pragma omp parallel for shared(x, y, N) reduction(+:sum)
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
sum += x[i] * y[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
// (A) Vector entry sum
|
||||
double sum(vector<double> const &x)
|
||||
{
|
||||
double sum = 0.0;
|
||||
#pragma omp parallel for shared(x) reduction(+:sum)
|
||||
for (size_t i = 0; i < x.size(); ++i)
|
||||
{
|
||||
sum += x[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
// (B) Matrix-vector product (from intro_vector_densematrix)
|
||||
vector<double> MatVec_parallel(vector<double> const &A, vector<double> const &x)
|
||||
{
|
||||
size_t const nelem = A.size();
|
||||
size_t const N = x.size();
|
||||
assert(nelem % N == 0); // make sure multiplication is possible
|
||||
size_t const M = nelem/N;
|
||||
|
||||
vector<double> b(M);
|
||||
|
||||
#pragma omp parallel for shared(A, x, N, M, b)
|
||||
for (size_t i = 0; i < M; ++i)
|
||||
{
|
||||
double tmp = 0.0;
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
tmp += A[N*i + j] * x[j];
|
||||
b[i] = tmp;
|
||||
}
|
||||
|
||||
return b;
|
||||
}
|
||||
|
||||
|
||||
// (C) Matrix-matrix product
|
||||
vector<double> MatMat_parallel(vector<double> const &A, vector<double> const &B, size_t const &L)
|
||||
{
|
||||
size_t const nelem_A = A.size();
|
||||
size_t const nelem_B = B.size();
|
||||
|
||||
assert(nelem_A % L == 0 && nelem_B % L == 0);
|
||||
|
||||
size_t const M = nelem_A/L;
|
||||
size_t const N = nelem_B/L;
|
||||
|
||||
|
||||
vector<double> C(M*N);
|
||||
|
||||
|
||||
#pragma omp parallel for shared(A, B, M, N, L, C)
|
||||
for (size_t i = 0; i < M; ++i)
|
||||
{
|
||||
for (size_t k = 0; k < L; ++k)
|
||||
{
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
{
|
||||
C[N*i + j] += A[L*i + k]*B[N*k + j];
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return C;
|
||||
}
|
||||
|
||||
|
||||
// (D) Evaluation of a polynomial function
|
||||
vector<double> poly_parallel(vector<double> const &a, vector<double> const &x)
|
||||
{
|
||||
size_t const N = x.size();
|
||||
size_t const p = a.size() - 1;
|
||||
vector<double> y(N, 0);
|
||||
|
||||
#pragma omp parallel for shared(a, x, N, p, y)
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
double x_temp = x[i];
|
||||
double y_temp = 0;
|
||||
for (size_t k = 0; k < p + 1; ++k)
|
||||
{
|
||||
y_temp += x_temp*y_temp + a[p - k];
|
||||
}
|
||||
y[i] = y_temp;
|
||||
}
|
||||
|
||||
return y;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,55 +0,0 @@
|
|||
#pragma once
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
/** (A) Inner product of two vectors (from skalar_stl)
|
||||
@param[in] x vector
|
||||
@param[in] y vector
|
||||
@return resulting Euclidian inner product <x,y>
|
||||
*/
|
||||
double scalar_parallel(vector<double> const &x, vector<double> const &y);
|
||||
|
||||
|
||||
/** (A) Sum entries of vector
|
||||
@param[in] x vector
|
||||
@return sum
|
||||
*/
|
||||
double sum(vector<double> const &x);
|
||||
|
||||
|
||||
/** (B) Matrix-vector product (from intro_vector_densematrix)
|
||||
* @param[in] A dense matrix (1D access)
|
||||
* @param[in] u vector
|
||||
*
|
||||
* @return resulting vector
|
||||
*/
|
||||
vector<double> MatVec_parallel(vector<double> const &A, vector<double> const &x);
|
||||
|
||||
|
||||
/** (C) Matrix-matrix product
|
||||
* @param[in] A MxL dense matrix (1D access)
|
||||
* @param[in] B LxN dense matrix (1D access)
|
||||
* @param[in] shared_dim shared dimension L
|
||||
*
|
||||
* @return resulting MxN matrix
|
||||
*/
|
||||
vector<double> MatMat_parallel(vector<double> const &A, vector<double> const &B, size_t const &shared_dim);
|
||||
|
||||
|
||||
/** (D) Evaluation of a polynomial function using Horner's scheme
|
||||
* @param[in] a coefficient vector
|
||||
* @param[in] x vector with input values
|
||||
*
|
||||
* @return vector with output values
|
||||
*/
|
||||
vector<double> poly_parallel(vector<double> const &a, vector<double> const &x);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,84 +0,0 @@
|
|||
#include "benchmark_tests.h"
|
||||
#include <iostream>
|
||||
#include <cmath>
|
||||
|
||||
int main()
|
||||
{
|
||||
vector<vector<double>> results_scalar;
|
||||
results_scalar.push_back(test_A(2000000, pow(10,3)));
|
||||
results_scalar.push_back(test_A(1000000, pow(10,4)));
|
||||
results_scalar.push_back(test_A(100000, pow(10,5)));
|
||||
results_scalar.push_back(test_A(10000, pow(10,6)));
|
||||
results_scalar.push_back(test_A(750, pow(10,7)));
|
||||
results_scalar.push_back(test_A(125, pow(10,8)));
|
||||
|
||||
|
||||
vector<vector<double>> results_sum;
|
||||
results_sum.push_back(test_A_sum(3000000, pow(10,3)));
|
||||
results_sum.push_back(test_A_sum(2000000, pow(10,4)));
|
||||
results_sum.push_back(test_A_sum(1000000, pow(10,5)));
|
||||
results_sum.push_back(test_A_sum(50000, pow(10,6)));
|
||||
results_sum.push_back(test_A_sum(2000, pow(10,7)));
|
||||
results_sum.push_back(test_A_sum(250, pow(10,8)));
|
||||
|
||||
|
||||
test_B(100, 20000, 10000);
|
||||
|
||||
test_C(25, 500, 1000, 1500);
|
||||
|
||||
test_D(100, 100, 1000000);
|
||||
|
||||
|
||||
|
||||
cout << endl << "###### Scalar ######" << endl;
|
||||
cout << "Timing\tGFLOPS\tGiByte/s" << endl;
|
||||
cout << "------------------------------" << endl;
|
||||
for (size_t i = 0; i < results_scalar.size(); ++i)
|
||||
cout << results_scalar[i][0] << "\t" << results_scalar[i][1] << "\t" << results_scalar[i][2] << endl;
|
||||
|
||||
cout << endl << "###### Sum ######" << endl;
|
||||
cout << "Timing\tGFLOPS\tGiByte/s" << endl;
|
||||
cout << "------------------------------" << endl;
|
||||
for (size_t i = 0; i < results_sum.size(); ++i)
|
||||
cout << results_sum[i][0] << "\t" << results_sum[i][1] << "\t" << results_sum[i][2] << endl;
|
||||
|
||||
|
||||
|
||||
|
||||
// ###### Scalar ######
|
||||
// Timing GFLOPS GiByte/s
|
||||
// ------------------------------
|
||||
// 3.4e-06 0.54 4.3
|
||||
// 4.6e-06 4 32
|
||||
// 1.6e-05 12 95
|
||||
// 0.0011 1.7 13
|
||||
// 0.0097 1.9 15
|
||||
// 0.075 2.5 20
|
||||
|
||||
|
||||
// ###### Sum ######
|
||||
// Timing GFLOPS GiByte/s
|
||||
// ------------------------------
|
||||
// 5.5e-06 0.17 1.3
|
||||
// 5.4e-06 1.7 14
|
||||
// 1.5e-05 6.1 49
|
||||
// 0.00013 7.2 57
|
||||
// 0.0033 2.8 23
|
||||
// 0.032 2.9 23
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// ######### NOT PARALLEL (from exercise sheet 2) #########
|
||||
// Timing GFLOPS GiByte/s
|
||||
// ----------------------------------
|
||||
// (A) 0.038 2.5 20
|
||||
// (B) 0.13 2.9 23
|
||||
// (C) 0.44 3.2 25
|
||||
// (D) 0.19 1.5 12
|
||||
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
35
ex1A_mean_values/main.cpp
Normal file
35
ex1A_mean_values/main.cpp
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
#include "means.h"
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
double arithmetic_mean, geometric_mean, harmonic_mean;
|
||||
|
||||
// Fixed version
|
||||
calculate_means(1, 4, 16, arithmetic_mean, geometric_mean, harmonic_mean);
|
||||
cout << arithmetic_mean << ", " << geometric_mean << ", " << harmonic_mean << endl;
|
||||
|
||||
calculate_means(2, 3, 5, arithmetic_mean, geometric_mean, harmonic_mean);
|
||||
cout << arithmetic_mean << ", " << geometric_mean << ", " << harmonic_mean << endl;
|
||||
|
||||
calculate_means(1000, 4000, 16000, arithmetic_mean, geometric_mean, harmonic_mean);
|
||||
cout << arithmetic_mean << ", " << geometric_mean << ", " << harmonic_mean << endl;
|
||||
cout << "--------------------------------" << endl;
|
||||
|
||||
|
||||
|
||||
// Scalable version
|
||||
calculate_means(vector<int> {1, 4, 16}, arithmetic_mean, geometric_mean, harmonic_mean);
|
||||
cout << arithmetic_mean << ", " << geometric_mean << ", " << harmonic_mean << endl;
|
||||
|
||||
calculate_means(vector<int> {2, 3, 5}, arithmetic_mean, geometric_mean, harmonic_mean);
|
||||
cout << arithmetic_mean << ", " << geometric_mean << ", " << harmonic_mean << endl;
|
||||
|
||||
calculate_means(vector<int> {1000, 4000, 16000}, arithmetic_mean, geometric_mean, harmonic_mean);
|
||||
cout << arithmetic_mean << ", " << geometric_mean << ", " << harmonic_mean << endl;
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
30
ex1A_mean_values/means.cpp
Normal file
30
ex1A_mean_values/means.cpp
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
#include "../ex1A_mean_values/means.h"
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
|
||||
void calculate_means(int a, int b, int c, double &am, double &gm, double &hm)
|
||||
{
|
||||
am = (a + b + c)/3.0;
|
||||
gm = exp((log(a)+log(b)+log(c))/3);
|
||||
hm = 3.0/(1.0/a + 1.0/b + 1.0/c);
|
||||
}
|
||||
|
||||
void calculate_means(std::vector<int> numbers, double &am, double &gm, double &hm)
|
||||
{
|
||||
int n = numbers.size();
|
||||
|
||||
am = 0.;
|
||||
gm = 0.;
|
||||
hm = 0.;
|
||||
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
am += numbers[i];
|
||||
gm += log(numbers[i]);
|
||||
hm += 1.0/numbers[i];
|
||||
}
|
||||
|
||||
am /= n;
|
||||
gm = exp(gm/n);
|
||||
hm = n/hm;
|
||||
}
|
||||
23
ex1A_mean_values/means.h
Normal file
23
ex1A_mean_values/means.h
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
#pragma once
|
||||
#include <vector>
|
||||
|
||||
/**
|
||||
This function calculates arithmetic mean, geometric mean and harmonic mean of three integers.
|
||||
@param[in] a first integer
|
||||
@param[in] b second integer
|
||||
@param[in] c third integer
|
||||
@param[out] am arithmetic mean
|
||||
@param[out] gm geometric mean
|
||||
@param[out] hm harmonic mean
|
||||
*/
|
||||
void calculate_means(int a, int b, int c, double &am, double &gm, double &hm);
|
||||
|
||||
/**
|
||||
This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector.
|
||||
@param[in] numbers vector containing integers
|
||||
@param[out] am arithmetic mean
|
||||
@param[out] gm geometric mean
|
||||
@param[out] hm harmonic mean
|
||||
*/
|
||||
void calculate_means(std::vector<int> numbers, double &am, double &gm, double &hm);
|
||||
|
||||
|
|
@ -13,7 +13,7 @@ COMPILER=GCC_
|
|||
# COMPILER=PGI_
|
||||
|
||||
|
||||
SOURCES = main.cpp mylib.cpp
|
||||
SOURCES = main.cpp ../ex1A_mean_values/means.cpp
|
||||
OBJECTS = $(SOURCES:.cpp=.o)
|
||||
|
||||
PROGRAM = main.${COMPILER}
|
||||
53
ex1B_data-IO_and_vectors/main.cpp
Normal file
53
ex1B_data-IO_and_vectors/main.cpp
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
#include "../ex1A_mean_values/means.h"
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
using namespace std;
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
// read vector from file
|
||||
vector<int> data_vector = {};
|
||||
|
||||
ifstream input_stream("data_1.txt");
|
||||
|
||||
int line;
|
||||
while(input_stream >> line)
|
||||
{
|
||||
data_vector.push_back(line);
|
||||
}
|
||||
data_vector.shrink_to_fit();
|
||||
|
||||
|
||||
// calculate minimum and maximum
|
||||
vector<int>::iterator min_it = min_element(data_vector.begin(), data_vector.end());
|
||||
vector<int>::iterator max_it = max_element(data_vector.begin(), data_vector.end());
|
||||
|
||||
// calculate arithmetic mean, geometric mean and harmonic mean
|
||||
double am, gm, hm;
|
||||
calculate_means(data_vector, am, gm, hm);
|
||||
|
||||
|
||||
// calculate standard deviation
|
||||
double sd = 0.;
|
||||
int n = data_vector.size();
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
sd += pow(data_vector[i] - am, 2);
|
||||
}
|
||||
sd = sqrt(sd/n);
|
||||
|
||||
|
||||
// print results
|
||||
cout << "minimum: " << *min_it << endl;
|
||||
cout << "maximum: " << *max_it << endl;
|
||||
cout << "arithmetic mean: " << am << endl;
|
||||
cout << "geometric mean: " << gm << endl;
|
||||
cout << "harmonic mean: " << hm << endl;
|
||||
cout << "standard deviation: " << sd << endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
31
ex1C_summation_of_specified_numbers/main.cpp
Normal file
31
ex1C_summation_of_specified_numbers/main.cpp
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
#include "special_sum.h"
|
||||
#include "../utils/timing.h"
|
||||
#include <iostream>
|
||||
#include <chrono>
|
||||
#include <stack>
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
// check results and compare speeds
|
||||
for(size_t n : {15, 1001, 1432987})
|
||||
{
|
||||
cout << "n = " << n << endl;
|
||||
size_t sum_1, sum_2;
|
||||
|
||||
tic();
|
||||
for(size_t i = 0; i < 1000; ++i)
|
||||
sum_1 = special_sum_loop(n);
|
||||
double time_1 = toc();
|
||||
|
||||
tic();
|
||||
for(size_t i = 0; i < 1000; ++i)
|
||||
sum_2 = special_sum_noloop(n);
|
||||
double time_2 = toc();
|
||||
|
||||
cout << "loop: " << sum_1 << "\t\tDuration: " << time_1 << endl;
|
||||
cout << "no loop: " << sum_2 << "\t\tDuration: " << time_2 << endl << "---------------------------------------------------" << endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
28
ex1C_summation_of_specified_numbers/special_sum.cpp
Normal file
28
ex1C_summation_of_specified_numbers/special_sum.cpp
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
#include "special_sum.h"
|
||||
|
||||
size_t gauss_sum(size_t n)
|
||||
{
|
||||
return (n*(n+1))/2;
|
||||
}
|
||||
|
||||
size_t special_sum_loop(size_t n)
|
||||
{
|
||||
size_t sum = 0;
|
||||
for (size_t i = 1; i < n+1; ++i)
|
||||
{
|
||||
if (i % 3 == 0 || i % 5 == 0)
|
||||
{
|
||||
sum += i;
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
size_t special_sum_noloop(size_t n)
|
||||
{
|
||||
size_t factor_3 = gauss_sum(n/3); // dividing int by int automatically gets rounded off
|
||||
size_t factor_5 = gauss_sum(n/5);
|
||||
size_t factor_15 = gauss_sum(n/15);
|
||||
|
||||
return factor_3*3 + factor_5*5 - factor_15*15;
|
||||
}
|
||||
18
ex1C_summation_of_specified_numbers/special_sum.h
Normal file
18
ex1C_summation_of_specified_numbers/special_sum.h
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
#include <cstddef>
|
||||
|
||||
/**
|
||||
This function returns the sum of all positive integers less or equal n which are a multiples of 3 or of 5, WITH using a loop.
|
||||
@param[in] n
|
||||
@param[out] M
|
||||
*/
|
||||
size_t special_sum_loop(size_t n);
|
||||
|
||||
|
||||
/**
|
||||
This function returns the sum of all positive integers less or equal n which are a multiples of 3 or of 5, WITHOUT using a loop.
|
||||
Example: For n=15, we have 60 = 3+5+6+9+10+12+15 = (1+2+3+4+5)*3 + (1+2+3)*5 - 1*15
|
||||
Formula: M = (\sum_{i=1}^{k_3} i)*3 + (\sum_{i=1}^{k_5} i)*5 - (\sum_{i=1}^{k_15} i)*15
|
||||
@param[in] n
|
||||
@param[out] M
|
||||
*/
|
||||
size_t special_sum_noloop(size_t n);
|
||||
33
ex1D_kahan_summation/main.cpp
Normal file
33
ex1D_kahan_summation/main.cpp
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
#include "mylib.h"
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
for(size_t i = 1; i < 8; ++i)
|
||||
{
|
||||
size_t n = pow(10,i);
|
||||
vector<double> x(n);
|
||||
for (size_t k = 0; k < n; ++k)
|
||||
x[k] = 1.0/(k + 1);
|
||||
|
||||
|
||||
// compute scalar products
|
||||
double sum_1 = scalar(x, x);
|
||||
double sum_2 = Kahan_skalar(x, x);
|
||||
|
||||
// compute error
|
||||
double err_1 = abs(sum_1 - pow(M_PI,2)/6);
|
||||
double err_2 = abs(sum_2 - pow(M_PI,2)/6);
|
||||
|
||||
cout << "n = " << n << endl;
|
||||
cout << "Normal scalar product: " << sum_1 << "\terror: " << err_1 << endl;
|
||||
cout << "Kahan scalar product: " << sum_2 << "\terror: " << err_2 << endl;
|
||||
cout << endl;
|
||||
}
|
||||
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
34
ex1D_kahan_summation/mylib.cpp
Normal file
34
ex1D_kahan_summation/mylib.cpp
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
#include "mylib.h"
|
||||
#include <cassert> // assert()
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
double scalar(vector<double> const &x, vector<double> const &y)
|
||||
{
|
||||
assert(x.size() == y.size());
|
||||
size_t const N = x.size();
|
||||
double sum = 0.0;
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
sum += x[i] * y[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
|
||||
double Kahan_skalar(vector<double> const &x, vector<double> const &y)
|
||||
{
|
||||
double sum = 0;
|
||||
double c = 0;
|
||||
size_t n = x.size();
|
||||
for (size_t i = 0; i < n; ++i)
|
||||
{
|
||||
double z = x[i]*y[i] - c; // c is the part that got lost in the last iteration
|
||||
double t = sum + z; // when adding sum + z, the lower digits are lost if sum is large
|
||||
c = (t - sum) - z; // now we recover the lower digits to add in the next iteration
|
||||
sum = t;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
30
ex1D_kahan_summation/mylib.h
Normal file
30
ex1D_kahan_summation/mylib.h
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
#ifndef FILE_MYLIB
|
||||
#define FILE_MYLIB
|
||||
#include <vector>
|
||||
|
||||
/** Inner product
|
||||
@param[in] x vector
|
||||
@param[in] y vector
|
||||
@return resulting Euclidian inner product <x,y>
|
||||
*/
|
||||
double scalar(std::vector<double> const &x, std::vector<double> const &y);
|
||||
|
||||
/** Inner product using BLAS routines
|
||||
@param[in] x vector
|
||||
@param[in] y vector
|
||||
@return resulting Euclidian inner product <x,y>
|
||||
*/
|
||||
double scalar_cblas(std::vector<double> const &x, std::vector<double> const &y);
|
||||
float scalar_cblas(std::vector<float> const &x, std::vector<float> const &y);
|
||||
|
||||
|
||||
/** L_2 Norm of a vector
|
||||
@param[in] x vector
|
||||
@return resulting Euclidian norm <x,y>
|
||||
*/
|
||||
double norm(std::vector<double> const &x);
|
||||
|
||||
double Kahan_skalar(std::vector<double> const &x, std::vector<double> const &y);
|
||||
|
||||
|
||||
#endif
|
||||
59
ex1E_vector_vs_list/main.cpp
Normal file
59
ex1E_vector_vs_list/main.cpp
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
#include "../utils/timing.h"
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
#include <chrono>
|
||||
#include <vector>
|
||||
#include <list>
|
||||
#include <algorithm>
|
||||
using namespace std;
|
||||
|
||||
|
||||
size_t random_integer(int lower_bound, int upper_bound)
|
||||
{
|
||||
unsigned seed = chrono::system_clock::now().time_since_epoch().count();
|
||||
|
||||
minstd_rand0 generator (seed);
|
||||
|
||||
return lower_bound + generator() % (upper_bound - lower_bound + 1);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
// start with generating a sorted vector/list
|
||||
size_t n = 10000;
|
||||
vector<int> x_vec = {};
|
||||
list<int> x_list = {};
|
||||
for(size_t k = 0; k < n; ++k)
|
||||
{
|
||||
x_vec.push_back(k + 1);
|
||||
x_list.push_back(k + 1);
|
||||
}
|
||||
|
||||
|
||||
// insert new random entries such that the container stays sorted
|
||||
tic();
|
||||
for(size_t i = 0; i < n; ++i)
|
||||
{
|
||||
size_t new_entry = random_integer(1,n);
|
||||
auto it = lower_bound(x_vec.begin(), x_vec.end(), new_entry);
|
||||
x_vec.insert(it, new_entry);
|
||||
}
|
||||
double time_1 = toc();
|
||||
|
||||
tic();
|
||||
for(size_t i = 0; i < n; ++i)
|
||||
{
|
||||
size_t new_entry = random_integer(1,n);
|
||||
auto it = lower_bound(x_list.begin(), x_list.end(), new_entry);
|
||||
x_list.insert(it, new_entry);
|
||||
}
|
||||
double time_2 = toc();
|
||||
|
||||
|
||||
// check results
|
||||
cout << "New vector is sorted: " << std::boolalpha << is_sorted(x_vec.begin(), x_vec.end()) << "\tsize: " << x_vec.size() << "\tduration: " << time_1 << endl;
|
||||
cout << "New list is sorted: " << std::boolalpha << is_sorted(x_list.begin(), x_list.end()) << "\tsize: " << x_list.size() << "\tduration: " << time_2 << endl;
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -1,7 +1,4 @@
|
|||
#include "goldbach.h"
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <omp.h>
|
||||
|
||||
size_t single_goldbach(size_t k)
|
||||
{
|
||||
|
|
@ -10,13 +7,12 @@ size_t single_goldbach(size_t k)
|
|||
|
||||
size_t counter = 0;
|
||||
|
||||
#pragma omp parallel for shared(relevant_primes, m, k) reduction(+:counter)
|
||||
for(size_t i = 0; i < m; ++i)
|
||||
{
|
||||
for(size_t j = i; j < m; ++j)
|
||||
{
|
||||
if(relevant_primes[i] + relevant_primes[j] == k)
|
||||
++counter;
|
||||
counter += 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -31,7 +27,7 @@ std::vector<size_t> count_goldbach(size_t n)
|
|||
|
||||
std::vector<size_t> counter_vector(n + 1, 0);
|
||||
|
||||
#pragma omp parallel for shared(relevant_primes, m, n) reduction(VecAdd:counter_vector)
|
||||
|
||||
for(size_t i = 0; i < m; ++i)
|
||||
{
|
||||
for(size_t j = i; j < m; ++j)
|
||||
21
ex1F_goldbachs_conjecture/goldbach.h
Normal file
21
ex1F_goldbachs_conjecture/goldbach.h
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
#pragma once
|
||||
#include "mayer_primes.h"
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <iterator>
|
||||
#include <cassert>
|
||||
|
||||
|
||||
/**
|
||||
This function returns the number of possible decompositions of an integer into a sum of two prime numbers.
|
||||
@param[in] k first integer
|
||||
@param[out] count number of decompositions
|
||||
*/
|
||||
size_t single_goldbach(size_t k);
|
||||
|
||||
/**
|
||||
This function returns the number of possible decompositions into a sum of two prime numbers of all even integers in the interval [4,n].
|
||||
@param[in] n upper integer bound
|
||||
@param[out] count_vector vector containing the number of decompositions for a natural number the corresponding index
|
||||
*/
|
||||
std::vector<size_t> count_goldbach(size_t n);
|
||||
37
ex1F_goldbachs_conjecture/main.cpp
Normal file
37
ex1F_goldbachs_conjecture/main.cpp
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
#include "../utils/timing.h"
|
||||
#include "goldbach.h"
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
using namespace std;
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
cout << "Check: 694 has "<< single_goldbach(694) << " decompositions." << endl << "----------------------------------------" << endl;
|
||||
|
||||
|
||||
for(size_t n : {10000, 100000, 400000, 1000000, 2000000})
|
||||
{
|
||||
tic();
|
||||
|
||||
auto goldbach_vector = count_goldbach(n);
|
||||
|
||||
auto max_it = max_element(goldbach_vector.begin(), goldbach_vector.end());
|
||||
size_t max_number = distance(goldbach_vector.begin(), max_it);
|
||||
|
||||
double time = toc();
|
||||
|
||||
cout << "The number " << max_number << " has " << *max_it << " decompositions. Duration: " << time << endl;
|
||||
}
|
||||
|
||||
/*
|
||||
The number 9240 has 329 decompositions. Duration: 0.00572876
|
||||
The number 99330 has 2168 decompositions. Duration: 0.3342
|
||||
The number 390390 has 7094 decompositions. Duration: 4.23734
|
||||
The number 990990 has 15594 decompositions. Duration: 29.5817
|
||||
The number 1981980 has 27988 decompositions. Duration: 135.985
|
||||
*/
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
71
ex1G_dense_matrices_access/DenseMatrix.h
Normal file
71
ex1G_dense_matrices_access/DenseMatrix.h
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
#pragma once
|
||||
#include "sigmoid.h"
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
class DenseMatrix
|
||||
{
|
||||
private:
|
||||
vector<double> M;
|
||||
size_t n;
|
||||
size_t m;
|
||||
|
||||
|
||||
public:
|
||||
vector<double> Mult(const vector<double> &x) const
|
||||
{
|
||||
vector<double> y(n,0);
|
||||
for(size_t i = 0; i < n; ++i) // iterate row
|
||||
{
|
||||
for(size_t j = 0; j < m; ++j) // iterate column
|
||||
{
|
||||
y[i] += M[i*m + j]*x[j];
|
||||
}
|
||||
}
|
||||
return y;
|
||||
}
|
||||
|
||||
vector<double> MultT(const vector<double> &y) const
|
||||
{
|
||||
vector<double> x(m,0);
|
||||
for(size_t j = 0; j < m; ++j) // iterate column
|
||||
{
|
||||
for(size_t i = 0; i < n; ++i) // iterate row
|
||||
{
|
||||
x[j] += M[i*m + j]*y[i];
|
||||
}
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
void Print() const
|
||||
{
|
||||
for(size_t i = 0; i < n; ++i) // iterate row
|
||||
{
|
||||
for(size_t j = 0; j < m; ++j) // iterate column
|
||||
{
|
||||
cout << M[i*m + j] << " ";
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
|
||||
|
||||
DenseMatrix(size_t n, size_t m)
|
||||
{
|
||||
this->n = n;
|
||||
this->m = m;
|
||||
M = vector<double>(n*m);
|
||||
size_t nm = max(n,m);
|
||||
|
||||
for(size_t i = 0; i < n; ++i) // iterate row
|
||||
{
|
||||
for(size_t j = 0; j < m; ++j) // iterate column
|
||||
{
|
||||
M[i*m + j] = sigmoid(x_entry(i,nm))*sigmoid(x_entry(j,nm));
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
52
ex1G_dense_matrices_access/ProductMatrix.h
Normal file
52
ex1G_dense_matrices_access/ProductMatrix.h
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
#pragma once
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
class ProductMatrix
|
||||
{
|
||||
private:
|
||||
vector<double> u;
|
||||
vector<double> v;
|
||||
size_t n;
|
||||
size_t m;
|
||||
|
||||
public:
|
||||
vector<double> Mult(const vector<double> &x) const
|
||||
{
|
||||
vector<double> y(n,0);
|
||||
for(int i = 0; i < n; ++i)
|
||||
{
|
||||
for(int j = 0; j < m; ++j)
|
||||
{
|
||||
y[i] += v[j]*x[j];
|
||||
}
|
||||
y[i] *= u[i];
|
||||
}
|
||||
return y;
|
||||
}
|
||||
|
||||
vector<double> MultT(const vector<double> &y) const
|
||||
{
|
||||
vector<double> x(m,0);
|
||||
for(int j = 0; j < m; ++j)
|
||||
{
|
||||
for(int i = 0; i < n; ++i)
|
||||
{
|
||||
x[j] += y[i]*u[i];
|
||||
}
|
||||
x[j] *= v[j];
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
ProductMatrix(const vector<double> &u, const vector<double> &v)
|
||||
{
|
||||
n = u.size();
|
||||
m = v.size();
|
||||
this->u = u;
|
||||
this->v = v;
|
||||
|
||||
}
|
||||
};
|
||||
93
ex1G_dense_matrices_access/main.cpp
Normal file
93
ex1G_dense_matrices_access/main.cpp
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
#include "../utils/timing.h"
|
||||
#include "DenseMatrix.h"
|
||||
#include "ProductMatrix.h"
|
||||
#include <algorithm>
|
||||
|
||||
int main()
|
||||
{
|
||||
// b) ------------------------------------------------------------------------------------------------------
|
||||
DenseMatrix const M(5,3);
|
||||
vector<double> const u{{1, 2, 3}};
|
||||
vector<double> f1 = M.Mult(u);
|
||||
vector<double> const v{{-1, 2, -3, 4, -5}};
|
||||
vector<double> f2 = M.MultT(v);
|
||||
|
||||
M.Print();
|
||||
|
||||
for(size_t i = 0; i < f1.size(); ++i)
|
||||
cout << f1[i] << endl;
|
||||
cout << endl;
|
||||
|
||||
|
||||
for(size_t j = 0; j < f2.size(); ++j)
|
||||
cout << f2[j] << " ";
|
||||
cout << endl << "-------------------------------------------------" << endl;
|
||||
|
||||
// c) ------------------------------------------------------------------------------------------------------
|
||||
size_t n = pow(10,3);
|
||||
DenseMatrix const M_1(n,n);
|
||||
vector<double> x(n, 1.0);
|
||||
|
||||
size_t n_loops = 100;
|
||||
vector<double> y_1;
|
||||
vector<double> y_2;
|
||||
|
||||
double time_1 = 0;
|
||||
double time_2 = 0;
|
||||
|
||||
tic();
|
||||
for(int l = 0; l < n_loops; ++l)
|
||||
y_1 = M_1.Mult(x);
|
||||
time_1 += toc();
|
||||
|
||||
tic();
|
||||
for(int l = 0; l < n_loops; ++l)
|
||||
y_2 = M_1.MultT(x);
|
||||
time_2 += toc();
|
||||
|
||||
vector<double> error_vec(n,0);
|
||||
for(int i = 0; i < n; ++i)
|
||||
error_vec[i] = abs(y_1[i] - y_2[i]);
|
||||
double sup_error = *max_element(error_vec.begin(), error_vec.end());
|
||||
|
||||
|
||||
cout << "n = " << n << endl;
|
||||
cout << "Average duration for Mult: " << time_1/n_loops << endl;
|
||||
cout << "Average duration for MultT: " << time_2/n_loops << endl;
|
||||
cout << "sup-error: " << sup_error << endl;
|
||||
cout << "-------------------------------------------------" << endl;
|
||||
|
||||
// d) ------------------------------------------------------------------------------------------------------
|
||||
vector<double> u_M(n,0);
|
||||
for(int i = 0; i < n; ++i)
|
||||
u_M[i] = sigmoid(x_entry(i, n));
|
||||
|
||||
ProductMatrix const M_2(u_M, u_M);
|
||||
|
||||
time_1 = 0;
|
||||
time_2 = 0;
|
||||
|
||||
tic();
|
||||
for(int l = 0; l < n_loops; ++l)
|
||||
y_1 = M_2.Mult(x);
|
||||
time_1 += toc();
|
||||
|
||||
tic();
|
||||
for(int l = 0; l < n_loops; ++l)
|
||||
y_2 = M_2.MultT(x);
|
||||
time_2 += toc();
|
||||
|
||||
for(int i = 0; i < n; ++i)
|
||||
error_vec[i] = abs(y_1[i] - y_2[i]);
|
||||
sup_error = *max_element(error_vec.begin(), error_vec.end());
|
||||
|
||||
|
||||
cout << "n = " << n << endl;
|
||||
cout << "Average duration for Mult: " << time_1/n_loops << endl;
|
||||
cout << "Average duration for MultT: " << time_2/n_loops << endl;
|
||||
cout << "sup-error: " << sup_error << endl;
|
||||
cout << "-------------------------------------------------" << endl;
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
12
ex1G_dense_matrices_access/sigmoid.h
Normal file
12
ex1G_dense_matrices_access/sigmoid.h
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
#pragma once
|
||||
#include <cmath>
|
||||
|
||||
double sigmoid(double x)
|
||||
{
|
||||
return 1./(1. + exp(-x));
|
||||
}
|
||||
|
||||
double x_entry(size_t k, size_t nm)
|
||||
{
|
||||
return (10.*k)/(nm - 1) - 5.;
|
||||
}
|
||||
BIN
ex2A_stationary_heat_equation_fixed_lambda/ex2A.pdf
Normal file
BIN
ex2A_stationary_heat_equation_fixed_lambda/ex2A.pdf
Normal file
Binary file not shown.
BIN
ex2B_stationary_heat_equation_variable_lambda/ex2B.pdf
Normal file
BIN
ex2B_stationary_heat_equation_variable_lambda/ex2B.pdf
Normal file
Binary file not shown.
BIN
ex2C_peclet_problem/ex2C.pdf
Normal file
BIN
ex2C_peclet_problem/ex2C.pdf
Normal file
Binary file not shown.
18
ex2C_peclet_problem/main.py
Normal file
18
ex2C_peclet_problem/main.py
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
def u(x,p):
|
||||
return (np.exp(p*x) - 1)/(np.exp(p) - 1)
|
||||
|
||||
x = np.linspace(0, 1, 200)
|
||||
|
||||
|
||||
plt.figure(figsize=(8, 5))
|
||||
for p in [ -3, -1, -0.5, 0.5, 2, 4]:
|
||||
plt.plot(x, u(x, p), label="p ="+ str(p))
|
||||
|
||||
plt.xlabel('x')
|
||||
plt.ylabel('u(x)')
|
||||
plt.legend()
|
||||
plt.grid(True)
|
||||
plt.show()
|
||||
2563
ex3_benchmarks/Doxyfile
Normal file
2563
ex3_benchmarks/Doxyfile
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -13,7 +13,9 @@ COMPILER=GCC_
|
|||
# COMPILER=PGI_
|
||||
|
||||
|
||||
SOURCES = main.cpp mylib.cpp
|
||||
#SOURCES = main.cpp benchmarks.cpp benchmark_tests.cpp factorization_solve.cpp factorization_solve_tests.cpp
|
||||
# GH
|
||||
SOURCES = main.cpp benchmarks.cpp benchmark_tests.cpp factorization_solve.cpp factorization_solve_tests.cpp vdop.cpp getmatrix.cpp
|
||||
OBJECTS = $(SOURCES:.cpp=.o)
|
||||
|
||||
PROGRAM = main.${COMPILER}
|
||||
|
|
@ -27,5 +29,4 @@ ifndef COMPILER
|
|||
COMPILER=GCC_
|
||||
endif
|
||||
|
||||
|
||||
include ../${COMPILER}default.mk
|
||||
|
|
@ -5,7 +5,7 @@
|
|||
#include <math.h>
|
||||
using namespace std::chrono;
|
||||
|
||||
vector<double> test_A(const size_t &NLOOPS, const size_t &N)
|
||||
vector<double> test_A(const size_t &NLOOPS, const size_t &N, const function<double(const vector<double>&, const vector<double>&)>& scalar_function)
|
||||
{
|
||||
cout << "#################### (A) ####################" << endl;
|
||||
cout << "\nLOOPS = " << NLOOPS << endl;
|
||||
|
|
@ -39,7 +39,7 @@ vector<double> test_A(const size_t &NLOOPS, const size_t &N)
|
|||
double check(0.0),ss(0.0);
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
check = scalar_parallel(x, y);
|
||||
check = scalar_function(x, y);
|
||||
ss += check; // prevents the optimizer from removing unused calculation results.
|
||||
}
|
||||
|
||||
|
|
@ -70,79 +70,37 @@ vector<double> test_A(const size_t &NLOOPS, const size_t &N)
|
|||
cout << "GFLOPS : " << Gflops << endl;
|
||||
cout << "GiByte/s : " << MemBandwidth << endl;
|
||||
|
||||
//##########################################################################
|
||||
cout << "\nStart Benchmarking norm\n";
|
||||
|
||||
return vector<double>{t_diff, Gflops, MemBandwidth};
|
||||
}
|
||||
|
||||
vector<double> test_A_sum(const size_t &NLOOPS, const size_t &N)
|
||||
{
|
||||
cout << "#################### (A) sum ####################" << endl;
|
||||
cout << "\nLOOPS = " << NLOOPS << endl;
|
||||
cout << "\nN = " << N << endl;
|
||||
|
||||
|
||||
// Memory allocation
|
||||
cout << "Memory allocation\n";
|
||||
|
||||
vector<double> x(N);
|
||||
|
||||
cout.precision(2);
|
||||
cout << 1.0*N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
|
||||
cout.precision(6);
|
||||
|
||||
|
||||
// Data initialization
|
||||
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
x[i] = 1;
|
||||
}
|
||||
|
||||
|
||||
cout << "\nStart Benchmarking sum\n";
|
||||
|
||||
auto t1 = system_clock::now(); // start timer
|
||||
auto t3 = system_clock::now(); // start timer
|
||||
// Do calculation
|
||||
double check(0.0),ss(0.0);
|
||||
double ss2(0.0);
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
check = sum(x);
|
||||
ss += check; // prevents the optimizer from removing unused calculation results.
|
||||
auto sk1 = sqrt(scalar(x, x));
|
||||
ss2 += sk1; // prevents the optimizer from removing unused calculation results.
|
||||
}
|
||||
|
||||
auto t2 = system_clock::now(); // stop timer
|
||||
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
t_diff = t_diff/NLOOPS; // duration per loop seconds
|
||||
auto t4 = system_clock::now(); // stop timer
|
||||
auto duration2 = duration_cast<microseconds>(t4 - t3); // duration in microseconds
|
||||
double t_diff2 = static_cast<double>(duration2.count()) / 1e6; // overall duration in seconds
|
||||
t_diff2 = t_diff2/NLOOPS; // duration per loop seconds
|
||||
|
||||
|
||||
cout << "ss(norm): " << ss2 << endl;
|
||||
cout << "Timing in sec. : " << t_diff2 << endl;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// Check the correct result
|
||||
cout << "\n <x,y> = " << check << endl;
|
||||
if (static_cast<unsigned int>(check) != N)
|
||||
cout << " !! W R O N G result !!\n";
|
||||
cout << endl;
|
||||
|
||||
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
|
||||
|
||||
double Gflops = 1.0*N / t_diff / 1024 / 1024 / 1024;
|
||||
double MemBandwidth = 1.0*N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
|
||||
|
||||
cout << "Total duration : " << t_diff*NLOOPS << endl;
|
||||
cout << "Timing in sec. : " << t_diff << endl;
|
||||
cout << "GFLOPS : " << Gflops << endl;
|
||||
cout << "GiByte/s : " << MemBandwidth << endl;
|
||||
|
||||
|
||||
return vector<double>{t_diff, Gflops, MemBandwidth};
|
||||
}
|
||||
|
||||
|
||||
vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M)
|
||||
vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M, const function<vector<double>(const vector<double>&, const vector<double>&)>& MatVec_function)
|
||||
{
|
||||
cout << "#################### (B) ####################" << endl;
|
||||
|
||||
|
|
@ -180,7 +138,7 @@ vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M)
|
|||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
b = MatVec_parallel(A, x);
|
||||
b = MatVec_function(A, x);
|
||||
}
|
||||
|
||||
auto t2 = system_clock::now(); // stop timer
|
||||
|
|
@ -216,7 +174,7 @@ vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M)
|
|||
}
|
||||
|
||||
|
||||
vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N)
|
||||
vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N, const function<vector<double>(const vector<double>&, const vector<double>&, size_t const &shared_dim)>& MatMat_function)
|
||||
{
|
||||
cout << "#################### (C) ####################" << endl;
|
||||
cout << "\nLOOPS = " << NLOOPS << endl;
|
||||
|
|
@ -253,11 +211,11 @@ vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, co
|
|||
// Do calculation
|
||||
vector<double> C(M*N);
|
||||
double check;
|
||||
double check_sum = 0;
|
||||
double check_sum; // GH: initialize
|
||||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
C = MatMat_parallel(A, B, L);
|
||||
C = MatMat_function(A, B, L);
|
||||
|
||||
check = C[N*17];
|
||||
check_sum += check; // prevents the optimizer from removing unused calculation results.
|
||||
|
|
@ -334,7 +292,7 @@ vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p)
|
|||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
y = poly_parallel(a, x);
|
||||
y = poly(a, x);
|
||||
check = y[0];
|
||||
|
||||
check_sum += check; // prevents the optimizer from removing unused calculation results.
|
||||
|
|
@ -372,4 +330,4 @@ vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p)
|
|||
|
||||
|
||||
return vector<double>{t_diff, Gflops, MemBandwidth};
|
||||
}
|
||||
}
|
||||
15
ex3_benchmarks/benchmark_tests.h
Normal file
15
ex3_benchmarks/benchmark_tests.h
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
#pragma once
|
||||
#include <vector>
|
||||
#include <functional>
|
||||
using namespace std;
|
||||
|
||||
|
||||
|
||||
|
||||
vector<double> test_A(const size_t &NLOOPS, const size_t &N, const function<double(const vector<double>&, const vector<double>&)>& scalar_function);
|
||||
|
||||
vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M, const function<vector<double>(const vector<double>&, const vector<double>&)>& MatVec_function);
|
||||
|
||||
vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N, const function<vector<double>(const vector<double>&, const vector<double>&, size_t const &shared_dim)>& MatMat_function);
|
||||
|
||||
vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p);
|
||||
246
ex3_benchmarks/benchmarks.cpp
Normal file
246
ex3_benchmarks/benchmarks.cpp
Normal file
|
|
@ -0,0 +1,246 @@
|
|||
#include "benchmarks.h"
|
||||
#include "vdop.h"
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include <cassert> // assert()
|
||||
|
||||
|
||||
#ifdef __INTEL_CLANG_COMPILER
|
||||
#pragma message(" ########## Use of MKL ###############")
|
||||
#include <mkl.h>
|
||||
#else
|
||||
#pragma message(" ########## Use of CBLAS ###############")
|
||||
|
||||
#include <cblas.h> // cBLAS Library
|
||||
#include <lapacke.h> // Lapack
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
// (A) Inner product of two vectors (from skalar_stl)
|
||||
double scalar(vector<double> const &x, vector<double> const &y)
|
||||
{
|
||||
assert(x.size() == y.size());
|
||||
size_t const N = x.size();
|
||||
double sum = 0.0;
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
sum += x[i] * y[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
// (A) 5.(b) Kahan scalar product
|
||||
double Kahan_skalar(vector<double> const &x, vector<double> const &y)
|
||||
{
|
||||
double sum = 0.0;
|
||||
double c = 0.0;
|
||||
size_t n = x.size();
|
||||
for (size_t i = 0; i < n; ++i)
|
||||
{
|
||||
double z = x[i]*y[i] - c; // c is the part that got lost in the last iteration
|
||||
double t = sum + z; // when adding sum + z, the lower digits are lost if sum is large
|
||||
c = (t - sum) - z; // now we recover the lower digits to add in the next iteration
|
||||
sum = t;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
// (A) 6. cBLAS scalar product
|
||||
double scalar_cBLAS(vector<double> const &x, vector<double> const &y)
|
||||
{
|
||||
return cblas_ddot(x.size(), x.data(), 1, y.data(), 1); // x.data() = &x[0]
|
||||
}
|
||||
|
||||
|
||||
// (B) Matrix-vector product (from intro_vector_densematrix)
|
||||
vector<double> MatVec(vector<double> const &A, vector<double> const &x)
|
||||
{
|
||||
size_t const nelem = A.size();
|
||||
size_t const N = x.size();
|
||||
assert(nelem % N == 0); // make sure multiplication is possible
|
||||
size_t const M = nelem/N;
|
||||
|
||||
|
||||
vector<double> b(M);
|
||||
|
||||
for (size_t i = 0; i < M; ++i)
|
||||
{
|
||||
double tmp = 0.0;
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
tmp += A[N*i + j] * x[j];
|
||||
b[i] = tmp;
|
||||
}
|
||||
|
||||
return b;
|
||||
}
|
||||
|
||||
// (B) cBLAS Matrix-vector product
|
||||
vector<double> MatVec_cBLAS(vector<double> const &A, vector<double> const &x)
|
||||
{
|
||||
size_t const nelem = A.size();
|
||||
size_t const N = x.size();
|
||||
assert(nelem % N == 0); // make sure multiplication is possible
|
||||
size_t const M = nelem/N;
|
||||
|
||||
|
||||
vector<double> b(M);
|
||||
|
||||
cblas_dgemv(CblasRowMajor, CblasNoTrans, M, N, 1.0, A.data(), N, x.data(), 1, 0.0, b.data(), 1);
|
||||
|
||||
return b;
|
||||
}
|
||||
|
||||
|
||||
// (C) Matrix-matrix product
|
||||
vector<double> MatMat(vector<double> const &A, vector<double> const &B, size_t const &L)
|
||||
{
|
||||
size_t const nelem_A = A.size();
|
||||
size_t const nelem_B = B.size();
|
||||
|
||||
assert(nelem_A % L == 0 && nelem_B % L == 0);
|
||||
|
||||
size_t const M = nelem_A/L;
|
||||
size_t const N = nelem_B/L;
|
||||
|
||||
|
||||
vector<double> C(M*N);
|
||||
|
||||
|
||||
|
||||
for (size_t i = 0; i < M; ++i)
|
||||
{
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
{
|
||||
double C_temp = 0;
|
||||
for (size_t k = 0; k < L; ++k)
|
||||
{
|
||||
C_temp += A[L*i + k]*B[N*k + j];
|
||||
}
|
||||
C[N*i + j] = C_temp;
|
||||
}
|
||||
}
|
||||
|
||||
return C;
|
||||
}
|
||||
|
||||
// (C) cBLAS matrix-matrix product
|
||||
vector<double> MatMat_cBLAS(vector<double> const &A, vector<double> const &B, size_t const &L)
|
||||
{
|
||||
size_t const nelem_A = A.size();
|
||||
size_t const nelem_B = B.size();
|
||||
|
||||
assert(nelem_A % L == 0 && nelem_B % L == 0);
|
||||
|
||||
size_t const M = nelem_A/L;
|
||||
size_t const N = nelem_B/L;
|
||||
|
||||
|
||||
vector<double> C(M*N);
|
||||
|
||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, L, 1.0, A.data(), L, B.data(), N, 0.0, C.data(), N);
|
||||
|
||||
return C;
|
||||
}
|
||||
|
||||
|
||||
// (D) Evaluation of a polynomial function
|
||||
vector<double> poly(vector<double> const &a, vector<double> const &x)
|
||||
{
|
||||
size_t const N = x.size();
|
||||
size_t const p = a.size() - 1;
|
||||
vector<double> y(N, 0);
|
||||
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
double x_temp = x[i];
|
||||
double y_temp = 0;
|
||||
for (size_t k = 0; k < p + 1; ++k)
|
||||
{
|
||||
y_temp += x_temp*y_temp + a[p - k];
|
||||
}
|
||||
y[i] = y_temp;
|
||||
}
|
||||
|
||||
return y;
|
||||
}
|
||||
|
||||
|
||||
// (E) Solves linear system of equations
|
||||
void JacobiSolve(CRS_Matrix const &SK, vector<double> const &f, vector<double> &u)
|
||||
{
|
||||
const double omega = 1.0;
|
||||
const int maxiter = 1000;
|
||||
const double tol = 1e-5, // tolerance
|
||||
tol2 = tol * tol; // tolerance^2
|
||||
|
||||
int nrows = SK.Nrows(); // number of rows == number of columns
|
||||
assert( nrows == static_cast<int>(f.size()) && f.size() == u.size() );
|
||||
|
||||
cout << endl << " Start Jacobi solver for " << nrows << " d.o.f.s" << endl;
|
||||
// Choose initial guess
|
||||
for (int k = 0; k < nrows; ++k)
|
||||
{
|
||||
u[k] = 0.0; // u := 0
|
||||
}
|
||||
|
||||
vector<double> dd(nrows); // matrix diagonal
|
||||
vector<double> r(nrows); // residual
|
||||
vector<double> w(nrows); // correction
|
||||
|
||||
SK.GetDiag(dd); // dd := diag(K)
|
||||
////DebugVector(dd);{int ijk; cin >> ijk;}
|
||||
|
||||
// Initial sweep
|
||||
SK.Defect(r, f, u); // r := f - K*u
|
||||
|
||||
vddiv(w, r, dd); // w := D^{-1}*r
|
||||
double sigma0 = dscapr(w, r); // s0 := <w,r>
|
||||
|
||||
// Iteration sweeps
|
||||
int iter = 0;
|
||||
double sigma = sigma0;
|
||||
while ( sigma > tol2 * sigma0 && maxiter > iter)
|
||||
{
|
||||
++iter;
|
||||
vdaxpy(u, u, omega, w ); // u := u + om*w
|
||||
SK.Defect(r, f, u); // r := f - K*u
|
||||
vddiv(w, r, dd); // w := D^{-1}*r
|
||||
sigma = dscapr(w, r); // s0 := <w,r>
|
||||
// cout << "Iteration " << iter << " : " << sqrt(sigma/sigma0) << endl;
|
||||
}
|
||||
cout << "aver. Jacobi rate : " << exp(log(sqrt(sigma / sigma0)) / iter) << " (" << iter << " iter)" << endl;
|
||||
cout << "final error: " << sqrt(sigma / sigma0) << " (rel) " << sqrt(sigma) << " (abs)\n";
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
89
ex3_benchmarks/benchmarks.h
Normal file
89
ex3_benchmarks/benchmarks.h
Normal file
|
|
@ -0,0 +1,89 @@
|
|||
#pragma once
|
||||
#include "getmatrix.h"
|
||||
#include <vector>
|
||||
|
||||
using namespace std;
|
||||
|
||||
/** (A) Inner product of two vectors (from skalar_stl)
|
||||
@param[in] x vector
|
||||
@param[in] y vector
|
||||
@return resulting Euclidian inner product <x,y>
|
||||
*/
|
||||
double scalar(vector<double> const &x, vector<double> const &y);
|
||||
|
||||
/** (A) 5.(b) Inner product of two vectors using the Kahan scalar product
|
||||
@param[in] x vector
|
||||
@param[in] y vector
|
||||
@return resulting Euclidian inner product <x,y>
|
||||
*/
|
||||
double Kahan_skalar(vector<double> const &x, vector<double> const &y);
|
||||
|
||||
/** (A) 6. cBLAS scalar product of two vectors
|
||||
@param[in] x vector
|
||||
@param[in] y vector
|
||||
@return resulting Euclidian inner product <x,y>
|
||||
*/
|
||||
double scalar_cBLAS(vector<double> const &x, vector<double> const &y);
|
||||
|
||||
|
||||
/** (B) Matrix-vector product (from intro_vector_densematrix)
|
||||
* @param[in] A dense matrix (1D access)
|
||||
* @param[in] u vector
|
||||
*
|
||||
* @return resulting vector
|
||||
*/
|
||||
vector<double> MatVec(vector<double> const &A, vector<double> const &x);
|
||||
|
||||
/** (B) 6. cBLAS Matrix-vector product
|
||||
* @param[in] A dense matrix (1D access)
|
||||
* @param[in] u vector
|
||||
*
|
||||
* @return resulting vector
|
||||
*/
|
||||
vector<double> MatVec_cBLAS(vector<double> const &A, vector<double> const &x);
|
||||
|
||||
|
||||
/** (C) Matrix-matrix product
|
||||
* @param[in] A MxL dense matrix (1D access)
|
||||
* @param[in] B LxN dense matrix (1D access)
|
||||
* @param[in] shared_dim shared dimension L
|
||||
*
|
||||
* @return resulting MxN matrix
|
||||
*/
|
||||
vector<double> MatMat(vector<double> const &A, vector<double> const &B, size_t const &shared_dim);
|
||||
|
||||
/** (C) 6. cBLAS Matrix-matrix product
|
||||
* @param[in] A MxL dense matrix (1D access)
|
||||
* @param[in] B LxN dense matrix (1D access)
|
||||
* @param[in] shared_dim shared dimension L
|
||||
*
|
||||
* @return resulting MxN matrix
|
||||
*/
|
||||
vector<double> MatMat_cBLAS(vector<double> const &A, vector<double> const &B, size_t const &shared_dim);
|
||||
|
||||
|
||||
/** (D) Evaluation of a polynomial function using Horner's scheme
|
||||
* @param[in] a coefficient vector
|
||||
* @param[in] x vector with input values
|
||||
*
|
||||
* @return vector with output values
|
||||
*/
|
||||
vector<double> poly(vector<double> const &a, vector<double> const &x);
|
||||
|
||||
|
||||
/** (E) Solves linear system of equations K @p u = @p f via the Jacobi iteration (from jaboci_oo_stl)
|
||||
* We use a distributed symmetric CSR matrix @p SK and initial guess of the
|
||||
* solution is set to 0.
|
||||
* @param[in] SK CSR matrix
|
||||
* @param[in] f distributed local vector storing the right hand side
|
||||
* @param[out] u accumulated local vector storing the solution.
|
||||
*/
|
||||
void JacobiSolve(CRS_Matrix const &SK, vector<double> const &f, vector<double> &u);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
39
ex3_benchmarks/factorization_solve.cpp
Normal file
39
ex3_benchmarks/factorization_solve.cpp
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
#include "factorization_solve.h"
|
||||
#include <vector>
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
#include <lapack.h>
|
||||
#include <iostream>
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
void factorization_solve(vector<double> &A, vector<double> &b, const int32_t &n_rhs)
|
||||
{
|
||||
int32_t nelem = A.size();
|
||||
int32_t N = sqrt(nelem);
|
||||
assert(N == static_cast<int32_t>(b.size())/n_rhs);
|
||||
|
||||
vector<int> IPIV(N); // pivot indices
|
||||
int info_factorization;
|
||||
|
||||
dgetrf_(&N, &N, A.data(), &N, IPIV.data(), &info_factorization);
|
||||
|
||||
const char transp = 'N';
|
||||
int info_solve;
|
||||
dgetrs_(&transp, &N, &n_rhs, A.data(), &N, IPIV.data(), b.data(), &N, &info_solve, 1); // 1 is length of parameter 'N'
|
||||
|
||||
}
|
||||
|
||||
void print_matrix(vector<double> &A, size_t M, size_t N)
|
||||
{
|
||||
for (size_t i = 0; i < M; ++i)
|
||||
{
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
{
|
||||
cout << A[N*i + j] << "\t";
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
21
ex3_benchmarks/factorization_solve.h
Normal file
21
ex3_benchmarks/factorization_solve.h
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
#pragma once
|
||||
#include <vector>
|
||||
#include <cstdint>
|
||||
using namespace std;
|
||||
|
||||
|
||||
/** Solve linear system of equations with multiple right hand sides using LU factorization
|
||||
* @param[inout] A NxN Matrix (1D access), gets modified to contain the LU decomposition of A
|
||||
* @param[inout] B N x n_rhs Matrix of right-hand-sides (1D access), gets modified to contain the solution vectors x
|
||||
* @param[in] n_rhs number of right-hand-sides b
|
||||
*
|
||||
*/
|
||||
void factorization_solve(vector<double> &A, vector<double> &b, const int32_t &n_rhs);
|
||||
|
||||
/** Print a matrix to console
|
||||
* @param[in] A MxN Matrix (1D access)
|
||||
* @param[in] M rows
|
||||
* @param[in] N columns
|
||||
*
|
||||
*/
|
||||
void print_matrix(vector<double> &A, size_t M, size_t N);
|
||||
89
ex3_benchmarks/factorization_solve_tests.cpp
Normal file
89
ex3_benchmarks/factorization_solve_tests.cpp
Normal file
|
|
@ -0,0 +1,89 @@
|
|||
#include "factorization_solve_tests.h"
|
||||
#include "factorization_solve.h"
|
||||
#include "benchmarks.h"
|
||||
#include <chrono>
|
||||
#include <iostream>
|
||||
|
||||
using namespace std::chrono;
|
||||
|
||||
void CheckCorrectness()
|
||||
{
|
||||
cout.precision(4);
|
||||
|
||||
size_t N = 5;
|
||||
size_t n_rhs = 5;
|
||||
vector<double> A(N*N);
|
||||
vector<double> b(N*n_rhs);
|
||||
|
||||
// initialize A
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
if (i == j)
|
||||
A[N*i + j] = 4.0;
|
||||
else
|
||||
A[N*i + j] = 1.0/((i - j)*(i - j));
|
||||
}
|
||||
const vector<double> A_copy = A;
|
||||
|
||||
// initialize b as identity matrix
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
{
|
||||
for (size_t l = 0; l < n_rhs; ++l)
|
||||
if (l == j)
|
||||
b[N*l + j] = 1.0;
|
||||
else
|
||||
b[N*l + j] = 0.0;
|
||||
}
|
||||
|
||||
cout << "A = " << endl;
|
||||
print_matrix(A, N, N);
|
||||
cout << "b = " << endl;
|
||||
print_matrix(b, N, n_rhs);
|
||||
|
||||
// solve system
|
||||
factorization_solve(A, b, n_rhs);
|
||||
|
||||
vector<double> check_matrix_identity = MatMat_cBLAS(A_copy, b, N);
|
||||
cout << "A*A^{-1} = " << endl;
|
||||
print_matrix(check_matrix_identity, N, n_rhs);
|
||||
}
|
||||
|
||||
|
||||
void CheckDuration(const size_t &N)
|
||||
{
|
||||
for (size_t n_rhs : {1, 2, 4, 8, 16, 32})
|
||||
{
|
||||
auto time_start = system_clock::now();
|
||||
|
||||
vector<double> A(N*N);
|
||||
vector<double> b(N*n_rhs);
|
||||
|
||||
// initialize A
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
if (i == j)
|
||||
A[N*i + j] = 4.0;
|
||||
else
|
||||
A[N*i + j] = 1.0/((i - j)*(i - j));
|
||||
}
|
||||
|
||||
// initialize b
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
for (size_t l = 0; l < n_rhs; ++l)
|
||||
b[N*l + j] = 1.0;
|
||||
|
||||
|
||||
// solve system
|
||||
factorization_solve(A, b, n_rhs);
|
||||
|
||||
|
||||
|
||||
auto time_end = system_clock::now();
|
||||
auto duration = duration_cast<microseconds>(time_end - time_start); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
|
||||
cout << "n_rhs = " << n_rhs << "\tTime per rhs: " << t_diff/n_rhs << endl;
|
||||
}
|
||||
}
|
||||
7
ex3_benchmarks/factorization_solve_tests.h
Normal file
7
ex3_benchmarks/factorization_solve_tests.h
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
#pragma once
|
||||
#include <cstddef>
|
||||
|
||||
|
||||
void CheckCorrectness();
|
||||
|
||||
void CheckDuration(const size_t &N);
|
||||
522
ex3_benchmarks/geom.cpp
Normal file
522
ex3_benchmarks/geom.cpp
Normal file
|
|
@ -0,0 +1,522 @@
|
|||
// see: http://llvm.org/docs/CodingStandards.html#include-style
|
||||
#include "geom.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
Mesh::Mesh(int ndim, int nvert_e, int ndof_e)
|
||||
: _nelem(0), _nvert_e(nvert_e), _ndof_e(ndof_e), _nnode(0), _ndim(ndim), _ia(0), _xc(0)
|
||||
{
|
||||
}
|
||||
|
||||
Mesh::~Mesh()
|
||||
{}
|
||||
|
||||
void Mesh::SetValues(std::vector<double> &v, const std::function<double(double, double)> &func) const
|
||||
{
|
||||
int const nnode = Nnodes(); // number of vertices in mesh
|
||||
assert( nnode == static_cast<int>(v.size()) );
|
||||
for (int k = 0; k < nnode; ++k)
|
||||
{
|
||||
v[k] = func( _xc[2 * k], _xc[2 * k + 1] );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void Mesh::Debug() const
|
||||
{
|
||||
cout << "\n ############### Debug M E S H ###################\n";
|
||||
cout << "\n ............... Coordinates ...................\n";
|
||||
for (int k = 0; k < _nnode; ++k)
|
||||
{
|
||||
cout << k << " : " ;
|
||||
for (int i = 0; i < _ndof_e; ++i )
|
||||
{
|
||||
cout << _xc[2*k+i] << " ";
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
cout << "\n ............... Elements ...................\n";
|
||||
for (int k = 0; k < _nelem; ++k)
|
||||
{
|
||||
cout << k << " : ";
|
||||
for (int i = 0; i < _ndof_e; ++i )
|
||||
cout << _ia[_ndof_e * k + i] << " ";
|
||||
cout << endl;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void Mesh::Write_ascii_matlab(std::string const &fname, std::vector<double> const &v) const
|
||||
{
|
||||
assert(Nnodes() == static_cast<int>(v.size())); // fits vector length to mesh information?
|
||||
|
||||
ofstream fout(fname); // open file ASCII mode
|
||||
if ( !fout.is_open() )
|
||||
{
|
||||
cout << "\nFile " << fname << " has not been opened.\n\n" ;
|
||||
assert( fout.is_open() && "File not opened." );
|
||||
}
|
||||
|
||||
string const DELIMETER(" "); // define the same delimeter as in matlab/ascii_read*.m
|
||||
int const OFFSET(1); // convert C-indexing to matlab
|
||||
|
||||
// Write data: #nodes, #space dimensions, #elements, #vertices per element
|
||||
fout << Nnodes() << DELIMETER << Ndims() << DELIMETER << Nelems() << DELIMETER << NverticesElements() << endl;
|
||||
|
||||
// Write cordinates: x_k, y_k in seperate lines
|
||||
assert( Nnodes()*Ndims() == static_cast<int>(_xc.size()));
|
||||
for (int k = 0, kj = 0; k < Nnodes(); ++k)
|
||||
{
|
||||
for (int j = 0; j < Ndims(); ++j, ++kj)
|
||||
{
|
||||
fout << _xc[kj] << DELIMETER;
|
||||
}
|
||||
fout << endl;
|
||||
}
|
||||
|
||||
// Write connectivity: ia_k,0, ia_k,1 etc in seperate lines
|
||||
assert( Nelems()*NverticesElements() == static_cast<int>(_ia.size()));
|
||||
for (int k = 0, kj = 0; k < Nelems(); ++k)
|
||||
{
|
||||
for (int j = 0; j < NverticesElements(); ++j, ++kj)
|
||||
{
|
||||
fout << _ia[kj] + OFFSET << DELIMETER; // C to matlab
|
||||
}
|
||||
fout << endl;
|
||||
}
|
||||
|
||||
// Write vector
|
||||
for (int k = 0; k < Nnodes(); ++k)
|
||||
{
|
||||
fout << v[k] << endl;
|
||||
}
|
||||
|
||||
fout.close();
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void Mesh::Visualize(std::vector<double> const &v) const
|
||||
{
|
||||
// define external command
|
||||
const string exec_m("matlab -nosplash < visualize_results.m"); // Matlab
|
||||
//const string exec_m("octave --no-window-system --no-gui visualize_results.m"); // Octave
|
||||
//const string exec_m("flatpak run org.octave.Octave visualize_results.m"); // Octave (flatpak): desktop GH
|
||||
|
||||
const string fname("uv.txt");
|
||||
Write_ascii_matlab(fname, v);
|
||||
|
||||
int ierror = system(exec_m.c_str()); // call external command
|
||||
|
||||
if (ierror != 0)
|
||||
{
|
||||
cout << endl << "Check path to Matlab/octave on your system" << endl;
|
||||
}
|
||||
cout << endl;
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// #####################################################################
|
||||
Mesh_2d_3_square::Mesh_2d_3_square(int nx, int ny, int myid, int procx, int procy)
|
||||
: Mesh(2, 3, 3), // two dimensions, 3 vertices, 3 dofs
|
||||
_myid(myid), _procx(procx), _procy(procy), _neigh{{-1, -1, -1, -1}}, _color(0),
|
||||
_xl(0.0), _xr(1.0), _yb(0.0), _yt(1.0), _nx(nx), _ny(ny)
|
||||
{
|
||||
//void IniGeom(int const myid, int const procx, int const procy, int neigh[], int &color)
|
||||
int const ky = _myid / _procx;
|
||||
int const kx = _myid % _procy; // MOD(myid,procx)
|
||||
// Determine the neighbors of domain/rank myid
|
||||
_neigh[0] = (ky == 0) ? -1 : _myid - _procx; // South
|
||||
_neigh[1] = (kx == _procx - 1) ? -1 : _myid + 1; // East
|
||||
_neigh[2] = (ky == _procy - 1) ? -1 : _myid + _procx; // North
|
||||
_neigh[3] = (kx == 0) ? -1 : _myid - 1; // West
|
||||
|
||||
_color = (kx + ky) & 1 ;
|
||||
|
||||
// subdomain is part of unit square
|
||||
double const hx = 1. / _procx;
|
||||
double const hy = 1. / _procy;
|
||||
_xl = kx * hx; // left
|
||||
_xr = (kx + 1) * hx; // right
|
||||
_yb = ky * hy; // bottom
|
||||
_yt = (ky + 1) * hy; // top
|
||||
|
||||
// Calculate coordinates
|
||||
int const nnode = (_nx + 1) * (_ny + 1); // number of nodes
|
||||
Resize_Coords(nnode, 2); // coordinates in 2D [nnode][ndim]
|
||||
GetCoordsInRectangle(_nx, _ny, _xl, _xr, _yb, _yt, GetCoords().data());
|
||||
|
||||
// Calculate element connectivity (linear triangles)
|
||||
int const nelem = 2 * _nx * _ny; // number of elements
|
||||
Resize_Connectivity(nelem, 3); // connectivity matrix [nelem][3]
|
||||
GetConnectivityInRectangle(_nx, _ny, GetConnectivity().data());
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void Mesh_2d_3_square::SetU(std::vector<double> &u) const
|
||||
{
|
||||
int dx = _nx + 1;
|
||||
for (int j = 0; j <= _ny; ++j)
|
||||
{
|
||||
int k = j * dx;
|
||||
for (int i = 0; i <= _nx; ++i, ++k)
|
||||
{
|
||||
u[k] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void Mesh_2d_3_square::SetF(std::vector<double> &f) const
|
||||
{
|
||||
int dx = _nx + 1;
|
||||
for (int j = 0; j <= _ny; ++j)
|
||||
{
|
||||
int k = j * dx;
|
||||
for (int i = 0; i <= _nx; ++i, ++k)
|
||||
{
|
||||
f[k] = 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
std::vector<int> Mesh_2d_3_square::Index_DirichletNodes() const
|
||||
{
|
||||
int const dx = 1,
|
||||
dy = _nx + 1,
|
||||
bl = 0,
|
||||
br = _nx,
|
||||
tl = _ny * (_nx + 1),
|
||||
tr = (_ny + 1) * (_nx + 1) - 1;
|
||||
int const start[4] = { bl, br, tl, bl},
|
||||
end[4] = { br, tr, tr, tl},
|
||||
step[4] = { dx, dy, dx, dy};
|
||||
|
||||
vector<int> idx(0);
|
||||
for (int j = 0; j < 4; j++)
|
||||
{
|
||||
if (_neigh[j] < 0)
|
||||
{
|
||||
for (int i = start[j]; i <= end[j]; i += step[j])
|
||||
{
|
||||
idx.push_back(i); // node i is Dirichlet node
|
||||
}
|
||||
}
|
||||
}
|
||||
// remove multiple elements
|
||||
sort(idx.begin(), idx.end()); // sort
|
||||
idx.erase( unique(idx.begin(), idx.end()), idx.end() ); // remove duplicate data
|
||||
|
||||
return idx;
|
||||
}
|
||||
|
||||
void Mesh_2d_3_square::SaveVectorP(std::string const &name, vector<double> const &u) const
|
||||
{
|
||||
// construct the file name for subdomain myid
|
||||
const string tmp( std::to_string(_myid / 100) + to_string((_myid % 100) / 10) + to_string(_myid % 10) );
|
||||
|
||||
const string namep = name + "." + tmp;
|
||||
ofstream ff(namep.c_str());
|
||||
ff.precision(6);
|
||||
ff.setf(ios::fixed, ios::floatfield);
|
||||
|
||||
// assumes tensor product grid in unit square; rowise numbered (as generated in class constructor)
|
||||
// output is provided for tensor product grid visualization ( similar to Matlab-surf() )
|
||||
auto const &xc = GetCoords();
|
||||
int k = 0;
|
||||
for (int j = 0; j <= _ny; ++j)
|
||||
{
|
||||
for (int i = 0; i <= _nx; ++i, ++k)
|
||||
ff << xc[2 * k + 0] << " " << xc[2 * k + 1] << " " << u[k] << endl;
|
||||
ff << endl;
|
||||
}
|
||||
|
||||
ff.close();
|
||||
return;
|
||||
}
|
||||
|
||||
void Mesh_2d_3_square::GetCoordsInRectangle(int const nx, int const ny,
|
||||
double const xl, double const xr, double const yb, double const yt,
|
||||
double xc[])
|
||||
{
|
||||
const double hx = (xr - xl) / nx,
|
||||
hy = (yt - yb) / ny;
|
||||
|
||||
int k = 0;
|
||||
for (int j = 0; j <= ny; ++j)
|
||||
{
|
||||
const double y0 = yb + j * hy;
|
||||
for (int i = 0; i <= nx; ++i, k += 2)
|
||||
{
|
||||
xc[k ] = xl + i * hx;
|
||||
xc[k + 1] = y0;
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void Mesh_2d_3_square::GetConnectivityInRectangle(int const nx, int const ny, int ia[])
|
||||
{
|
||||
const int dx = nx + 1;
|
||||
int k = 0;
|
||||
int l = 0;
|
||||
for (int j = 0; j < ny; ++j, ++k)
|
||||
{
|
||||
for (int i = 0; i < nx; ++i, ++k)
|
||||
{
|
||||
ia[l ] = k;
|
||||
ia[l + 1] = k + 1;
|
||||
ia[l + 2] = k + dx + 1;
|
||||
l += 3;
|
||||
ia[l ] = k;
|
||||
ia[l + 1] = k + dx;
|
||||
ia[l + 2] = k + dx + 1;
|
||||
l += 3;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// #################### still some old code (--> MPI) ############################
|
||||
|
||||
|
||||
// Copies the values of w corresponding to the boundary
|
||||
// South (ib==1), East (ib==2), North (ib==3), West (ib==4)
|
||||
|
||||
void GetBound(int const ib, int const nx, int const ny, double const w[], double s[])
|
||||
{
|
||||
const int //dx = 1,
|
||||
dy = nx + 1,
|
||||
bl = 0,
|
||||
br = nx,
|
||||
tl = ny * (nx + 1),
|
||||
tr = (ny + 1) * (nx + 1) - 1;
|
||||
switch (ib)
|
||||
{
|
||||
case 1:
|
||||
{
|
||||
for (int i = bl, j = 0; i <= br; ++i, ++j)
|
||||
s[j] = w[i];
|
||||
break;
|
||||
}
|
||||
case 3:
|
||||
{
|
||||
for (int i = tl, j = 0; i <= tr; ++i, ++j)
|
||||
s[j] = w[i];
|
||||
break;
|
||||
}
|
||||
case 4:
|
||||
{
|
||||
for (int i = bl, j = 0; i <= tl; i += dy, ++j)
|
||||
s[j] = w[i];
|
||||
break;
|
||||
}
|
||||
case 2:
|
||||
{
|
||||
for (int i = br, j = 0; i <= tr; i += dy, ++j)
|
||||
s[j] = w[i];
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
cout << endl << "Wrong parameter ib in " << __FILE__ << ":" << __LINE__ << endl;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------------------------------------
|
||||
// Computes w: = w + s at nodes on the boundary
|
||||
// South (ib == 1), East (ib == 2), North (ib == 3), West (ib == 4)
|
||||
|
||||
void AddBound(int const ib, int const nx, int const ny, double w[], double const s[])
|
||||
{
|
||||
int const dy = nx + 1,
|
||||
bl = 0,
|
||||
br = nx,
|
||||
tl = ny * (nx + 1),
|
||||
tr = (ny + 1) * (nx + 1) - 1;
|
||||
switch (ib)
|
||||
{
|
||||
case 1:
|
||||
{
|
||||
for (int i = bl, j = 0; i <= br; ++i, ++j)
|
||||
w[i] += s[j];
|
||||
break;
|
||||
}
|
||||
case 3:
|
||||
{
|
||||
for (int i = tl, j = 0; i <= tr; ++i, ++j)
|
||||
w[i] += s[j];
|
||||
break;
|
||||
}
|
||||
case 4:
|
||||
{
|
||||
for (int i = bl, j = 0; i <= tl; i += dy, ++j)
|
||||
w[i] += s[j];
|
||||
break;
|
||||
}
|
||||
case 2:
|
||||
{
|
||||
for (int i = br, j = 0; i <= tr; i += dy, ++j)
|
||||
w[i] += s[j];
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
cout << endl << "Wrong parameter ib in " << __FILE__ << ":" << __LINE__ << endl;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// ####################################################################
|
||||
|
||||
Mesh_2d_3_matlab::Mesh_2d_3_matlab(string const &fname)
|
||||
: Mesh(2, 3, 3), // two dimensions, 3 vertices, 3 dofs
|
||||
bedges(0)
|
||||
{
|
||||
ifstream ifs(fname);
|
||||
if (!(ifs.is_open() && ifs.good()))
|
||||
{
|
||||
cerr << "Mesh_2d_3_matlab: Error cannot open file " << fname << endl;
|
||||
assert(ifs.is_open());
|
||||
}
|
||||
|
||||
int const OFFSET(1); // Matlab to C indexing
|
||||
cout << "ASCI file " << fname << " opened" << endl;
|
||||
|
||||
// Read some mesh constants
|
||||
int nnode, ndim, nelem, nvert_e;
|
||||
ifs >> nnode >> ndim >> nelem >> nvert_e;
|
||||
cout << nnode << " " << ndim << " " << nelem << " " << nvert_e << endl;
|
||||
assert(ndim == 2 && nvert_e == 3);
|
||||
|
||||
// Allocate memory
|
||||
Resize_Coords(nnode, ndim); // coordinates in 2D [nnode][ndim]
|
||||
Resize_Connectivity(nelem, nvert_e); // connectivity matrix [nelem][nvert]
|
||||
|
||||
// Read ccordinates
|
||||
auto &xc = GetCoords();
|
||||
for (int k = 0; k < nnode * ndim; ++k)
|
||||
{
|
||||
ifs >> xc[k];
|
||||
}
|
||||
|
||||
// Read connectivity
|
||||
auto &ia = GetConnectivity();
|
||||
for (int k = 0; k < nelem * nvert_e; ++k)
|
||||
{
|
||||
ifs >> ia[k];
|
||||
ia[k] -= OFFSET; // Matlab to C indexing
|
||||
}
|
||||
|
||||
// additional read of boundary information (only start/end point)
|
||||
int nbedges;
|
||||
ifs >> nbedges;
|
||||
|
||||
bedges.resize(nbedges * 2);
|
||||
for (int k = 0; k < nbedges * 2; ++k)
|
||||
{
|
||||
ifs >> bedges[k];
|
||||
bedges[k] -= OFFSET; // Matlab to C indexing
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// binary
|
||||
//{
|
||||
//ifstream ifs(fname, ios_base::in | ios_base::binary);
|
||||
//if(!(ifs.is_open() && ifs.good()))
|
||||
//{
|
||||
//cerr << "ReadBinMatrix: Error cannot open file " << file << endl;
|
||||
//assert(ifs.is_open());
|
||||
//}
|
||||
//cout << "ReadBinMatrix: file opened" << file << endl;
|
||||
|
||||
|
||||
//}
|
||||
|
||||
// binaryIO.cpp
|
||||
//void read_binMatrix(const string& file, vector<int> &cnt, vector<int> &col, vector<double> &ele)
|
||||
//{
|
||||
|
||||
//ifstream ifs(file, ios_base::in | ios_base::binary);
|
||||
|
||||
//if(!(ifs.is_open() && ifs.good()))
|
||||
//{
|
||||
//cerr << "ReadBinMatrix: Error cannot open file " << file << endl;
|
||||
//assert(ifs.is_open());
|
||||
//}
|
||||
//cout << "ReadBinMatrix: Opened file " << file << endl;
|
||||
|
||||
//int _size;
|
||||
|
||||
//ifs.read(reinterpret_cast<char*>(&_size), sizeof(int)); // old: ifs.read((char*)&_size, sizeof(int));
|
||||
//cnt.resize(_size);
|
||||
//cout << "ReadBinMatrix: cnt size: " << _size << endl;
|
||||
|
||||
//ifs.read((char*)&_size, sizeof(int));
|
||||
//col.resize(_size);
|
||||
//cout << "ReadBinMatrix: col size: " << _size << endl;
|
||||
|
||||
//ifs.read((char*)&_size, sizeof(int));
|
||||
//ele.resize(_size);
|
||||
//cout << "ReadBinMatrix: ele size: " << _size << endl;
|
||||
|
||||
|
||||
//ifs.read((char*)cnt.data(), cnt.size() * sizeof(int));
|
||||
//ifs.read((char*)col.data(), col.size() * sizeof(int));
|
||||
//ifs.read((char*)ele.data(), ele.size() * sizeof(double));
|
||||
|
||||
//ifs.close();
|
||||
//cout << "ReadBinMatrix: Finished reading matrix.." << endl;
|
||||
|
||||
//}
|
||||
|
||||
|
||||
std::vector<int> Mesh_2d_3_matlab::Index_DirichletNodes() const
|
||||
{
|
||||
vector<int> idx(bedges); // copy
|
||||
|
||||
sort(idx.begin(), idx.end()); // sort
|
||||
idx.erase( unique(idx.begin(), idx.end()), idx.end() ); // remove duplicate data
|
||||
|
||||
return idx;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
381
ex3_benchmarks/geom.h
Normal file
381
ex3_benchmarks/geom.h
Normal file
|
|
@ -0,0 +1,381 @@
|
|||
#ifndef GEOM_FILE
|
||||
#define GEOM_FILE
|
||||
#include <array>
|
||||
#include <functional> // function; C++11
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
/**
|
||||
* Basis class for finite element meshes.
|
||||
*/
|
||||
class Mesh
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Constructor initializing the members with default values.
|
||||
*
|
||||
* @param[in] ndim space dimensions (dimension for coordinates)
|
||||
* @param[in] nvert_e number of vertices per element (dimension for connectivity)
|
||||
* @param[in] ndof_e degrees of freedom per element (= @p nvert_e for linear elements)
|
||||
*/
|
||||
explicit Mesh(int ndim, int nvert_e = 0, int ndof_e = 0);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*
|
||||
* See clang warning on
|
||||
* <a href="https://stackoverflow.com/questions/28786473/clang-no-out-of-line-virtual-method-definitions-pure-abstract-c-class/40550578">weak-vtables</a>.
|
||||
*/
|
||||
virtual ~Mesh();
|
||||
|
||||
/**
|
||||
* Number of finite elements in (sub)domain.
|
||||
* @return number of elements.
|
||||
*/
|
||||
int Nelems() const
|
||||
{
|
||||
return _nelem;
|
||||
}
|
||||
|
||||
/**
|
||||
* Global number of vertices for each finite element.
|
||||
* @return number of vertices per element.
|
||||
*/
|
||||
int NverticesElements() const
|
||||
{
|
||||
return _nvert_e;
|
||||
}
|
||||
|
||||
/**
|
||||
* Global number of degrees of freedom (dof) for each finite element.
|
||||
* @return degrees of freedom per element.
|
||||
*/
|
||||
int NdofsElement() const
|
||||
{
|
||||
return _ndof_e;
|
||||
}
|
||||
|
||||
/**
|
||||
* Number of vertices in mesh.
|
||||
* @return number of vertices.
|
||||
*/
|
||||
int Nnodes() const
|
||||
{
|
||||
return _nnode;
|
||||
}
|
||||
|
||||
/**
|
||||
* Space dimension.
|
||||
* @return number of dimensions.
|
||||
*/
|
||||
int Ndims() const
|
||||
{
|
||||
return _ndim;
|
||||
}
|
||||
|
||||
/**
|
||||
* (Re-)Allocates memory for the element connectivity and redefines the appropriate dimensions.
|
||||
*
|
||||
* @param[in] nelem number of elements
|
||||
* @param[in] nvert_e number of vertices per element
|
||||
*/
|
||||
void Resize_Connectivity(int nelem, int nvert_e)
|
||||
{
|
||||
SetNelem(nelem); // number of elements
|
||||
SetNverticesElement(nvert_e); // vertices per element
|
||||
_ia.resize(nelem * nvert_e);
|
||||
}
|
||||
|
||||
/**
|
||||
* Read connectivity information (g1,g2,g3)_i.
|
||||
* @return convectivity vector [nelems*ndofs].
|
||||
*/
|
||||
const std::vector<int> &GetConnectivity() const
|
||||
{
|
||||
return _ia;
|
||||
}
|
||||
|
||||
/**
|
||||
* Access/Change connectivity information (g1,g2,g3)_i.
|
||||
* @return convectivity vector [nelems*ndofs].
|
||||
*/
|
||||
std::vector<int> &GetConnectivity()
|
||||
{
|
||||
return _ia;
|
||||
}
|
||||
|
||||
/**
|
||||
* (Re-)Allocates memory for the element connectivity and redefines the appropriate dimensions.
|
||||
*
|
||||
* @param[in] nnodes number of nodes
|
||||
* @param[in] ndim space dimension
|
||||
*/
|
||||
void Resize_Coords(int nnodes, int ndim)
|
||||
{
|
||||
SetNnode(nnodes); // number of nodes
|
||||
SetNdim(ndim); // space dimension
|
||||
_xc.resize(nnodes * ndim);
|
||||
}
|
||||
|
||||
/**
|
||||
* Read coordinates of vertices (x,y)_i.
|
||||
* @return coordinates vector [nnodes*2].
|
||||
*/
|
||||
const std::vector<double> &GetCoords() const
|
||||
{
|
||||
return _xc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Access/Change coordinates of vertices (x,y)_i.
|
||||
* @return coordinates vector [nnodes*2].
|
||||
*/
|
||||
std::vector<double> &GetCoords()
|
||||
{
|
||||
return _xc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate values in vector @p v via function @p func(x,y)
|
||||
* @param[in] v vector
|
||||
* @param[in] func function of (x,y) returning a double value.
|
||||
*/
|
||||
void SetValues(std::vector<double> &v, const std::function<double(double, double)> &func) const;
|
||||
|
||||
/**
|
||||
* Prints the information for a finite element mesh
|
||||
*/
|
||||
void Debug() const;
|
||||
|
||||
/**
|
||||
* Determines the indices of those vertices with Dirichlet boundary conditions
|
||||
* @return index vector.
|
||||
*/
|
||||
virtual std::vector<int> Index_DirichletNodes() const = 0;
|
||||
|
||||
/**
|
||||
* Write vector @p v toghether with its mesh information to an ASCii file @p fname.
|
||||
*
|
||||
* The data are written in C-style.
|
||||
*
|
||||
* @param[in] fname file name
|
||||
* @param[in] v vector
|
||||
*/
|
||||
void Write_ascii_matlab(std::string const &fname, std::vector<double> const &v) const;
|
||||
|
||||
/**
|
||||
* Visualize @p v together with its mesh information via matlab or octave.
|
||||
*
|
||||
* Comment/uncomment those code lines in method Mesh:Visualize (geom.cpp)
|
||||
* that are supported on your system.
|
||||
*
|
||||
* @param[in] v vector
|
||||
*
|
||||
* @warning matlab files ascii_read_meshvector.m visualize_results.m
|
||||
* must be in the executing directory.
|
||||
*/
|
||||
void Visualize(std::vector<double> const &v) const;
|
||||
|
||||
|
||||
protected:
|
||||
void SetNelem(int nelem)
|
||||
{
|
||||
_nelem = nelem;
|
||||
}
|
||||
|
||||
void SetNverticesElement(int nvert)
|
||||
{
|
||||
_nvert_e = nvert;
|
||||
}
|
||||
|
||||
void SetNdofsElement(int ndof)
|
||||
{
|
||||
_ndof_e = ndof;
|
||||
}
|
||||
|
||||
void SetNnode(int nnode)
|
||||
{
|
||||
_nnode = nnode;
|
||||
}
|
||||
|
||||
void SetNdim(int ndim)
|
||||
{
|
||||
_ndim = ndim;
|
||||
}
|
||||
|
||||
private:
|
||||
int _nelem; //!< number elements
|
||||
int _nvert_e; //!< number of vertices per element
|
||||
int _ndof_e; //!< degrees of freedom (d.o.f.) per element
|
||||
int _nnode; //!< number nodes/vertices
|
||||
int _ndim; //!< space dimension of the problem (1, 2, or 3)
|
||||
std::vector<int> _ia; //!< element connectivity
|
||||
std::vector<double> _xc; //!< coordinates
|
||||
};
|
||||
|
||||
/**
|
||||
* 2D finite element mesh of the square consiting of linear triangular elements.
|
||||
*/
|
||||
class Mesh_2d_3_square: public Mesh
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Generates the f.e. mesh for the unit square.
|
||||
*
|
||||
* @param[in] nx number of discretization intervals in x-direction
|
||||
* @param[in] ny number of discretization intervals in y-direction
|
||||
* @param[in] myid my MPI-rank / subdomain
|
||||
* @param[in] procx number of ranks/subdomains in x-direction
|
||||
* @param[in] procy number of processes in y-direction
|
||||
*/
|
||||
Mesh_2d_3_square(int nx, int ny, int myid = 0, int procx = 1, int procy = 1);
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
~Mesh_2d_3_square() override
|
||||
{}
|
||||
|
||||
/**
|
||||
* Set solution vector based on a tensor product grid in the rectangle.
|
||||
* @param[in] u solution vector
|
||||
*/
|
||||
void SetU(std::vector<double> &u) const;
|
||||
|
||||
/**
|
||||
* Set right hand side (rhs) vector on a tensor product grid in the rectangle.
|
||||
* @param[in] f rhs vector
|
||||
*/
|
||||
void SetF(std::vector<double> &f) const;
|
||||
|
||||
/**
|
||||
* Determines the indices of those vertices with Dirichlet boundary conditions
|
||||
* @return index vector.
|
||||
*/
|
||||
std::vector<int> Index_DirichletNodes() const override;
|
||||
|
||||
/**
|
||||
* Stores the values of vector @p u of (sub)domain into a file @p name for further processing in gnuplot.
|
||||
* The file stores rowise the x- and y- coordinates together with the value from @p u .
|
||||
* The domain [@p xl, @p xr] x [@p yb, @p yt] is discretized into @p nx x @p ny intervals.
|
||||
*
|
||||
* @param[in] name basename of file name (file name will be extended by the rank number)
|
||||
* @param[in] u local vector
|
||||
*
|
||||
* @warning Assumes tensor product grid in unit square; rowise numbered
|
||||
* (as generated in class constructor).
|
||||
* The output is provided for tensor product grid visualization
|
||||
* ( similar to Matlab-surf() ).
|
||||
*
|
||||
* @see Mesh_2d_3_square
|
||||
*/
|
||||
void SaveVectorP(std::string const &name, std::vector<double> const &u) const;
|
||||
|
||||
// here will still need to implement in the class
|
||||
// GetBound(), AddBound()
|
||||
// or better a generalized way with indices and their appropriate ranks for MPI communication
|
||||
|
||||
private:
|
||||
/**
|
||||
* Determines the coordinates of the dicretization nodes of the domain [@p xl, @p xr] x [@p yb, @p yt]
|
||||
* which is discretized into @p nx x @p ny intervals.
|
||||
*
|
||||
* @param[in] ny number of discretization intervals in y-direction
|
||||
* @param[in] xl x-coordinate of left boundary
|
||||
* @param[in] xr x-coordinate of right boundary
|
||||
* @param[in] yb y-coordinate of lower boundary
|
||||
* @param[in] yt y-coordinate of upper boundary
|
||||
* @param[out] xc coordinate vector of length 2n with x(2*k,2*k+1) as coodinates of node k
|
||||
*/
|
||||
|
||||
void GetCoordsInRectangle(int nx, int ny, double xl, double xr, double yb, double yt,
|
||||
double xc[]);
|
||||
/**
|
||||
* Determines the element connectivity of linear triangular elements of a FEM discretization
|
||||
* of a rectangle using @p nx x @p ny equidistant intervals for discretization.
|
||||
* @param[in] nx number of discretization intervals in x-direction
|
||||
* @param[in] ny number of discretization intervals in y-direction
|
||||
* @param[out] ia element connectivity matrix with ia(3*s,3*s+1,3*s+2) as node numbers od element s
|
||||
*/
|
||||
void GetConnectivityInRectangle(int nx, int ny, int ia[]);
|
||||
|
||||
private:
|
||||
int _myid; //!< my MPI rank
|
||||
int _procx; //!< number of MPI ranks in x-direction
|
||||
int _procy; //!< number of MPI ranks in y-direction
|
||||
std::array<int, 4> _neigh; //!< MPI ranks of neighbors (negative: no neighbor but b.c.)
|
||||
int _color; //!< red/black coloring (checker board) of subdomains
|
||||
|
||||
double _xl; //!< x coordinate of lower left corner of square
|
||||
double _xr; //!< x coordinate of lower right corner of square
|
||||
double _yb; //!< y coordinate or lower left corner of square
|
||||
double _yt; //!< y coordinate of upper right corner of square
|
||||
int _nx; //!< number of intervals in x-direction
|
||||
int _ny; //!< number of intervals in y-direction
|
||||
};
|
||||
|
||||
// #################### still some old code (--> MPI) ############################
|
||||
/**
|
||||
* Copies the values of @p w corresponding to boundary @p ib
|
||||
* onto vector s. South (ib==1), East (ib==2), North (ib==3), West (ib==4).
|
||||
* The vector @p s has to be long enough!!
|
||||
* @param[in] ib my local boundary
|
||||
* @param[in] nx number of discretization intervals in x-direction
|
||||
* @param[in] ny number of discretization intervals in y-direction
|
||||
* @param[in] w vector for all nodes of local discretization
|
||||
* @param[out] s short vector with values on boundary @p ib
|
||||
*/
|
||||
// GH_NOTE: Absicherung bei s !!
|
||||
void GetBound(int ib, int nx, int ny, double const w[], double s[]);
|
||||
|
||||
|
||||
/**
|
||||
* Computes @p w := @p w + @p s at the interface/boundary nodes on the
|
||||
* boundary @p ib . South (ib==1), East (ib==2), North (ib==3), West (ib==4)
|
||||
* @param[in] ib my local boundary
|
||||
* @param[in] nx number of discretization intervals in x-direction
|
||||
* @param[in] ny number of discretization intervals in y-direction
|
||||
* @param[in,out] w vector for all nodes of local discretization
|
||||
* @param[in] s short vector with values on boundary @p ib
|
||||
*/
|
||||
void AddBound(int ib, int nx, int ny, double w[], double const s[]);
|
||||
|
||||
// #################### Mesh from Matlab ############################
|
||||
/**
|
||||
* 2D finite element mesh of the square consiting of linear triangular elements.
|
||||
*/
|
||||
class Mesh_2d_3_matlab: public Mesh
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Reads mesh data from a binary file.
|
||||
*
|
||||
* File format, see ascii_write_mesh.m
|
||||
*
|
||||
* @param[in] fname file name
|
||||
*/
|
||||
explicit Mesh_2d_3_matlab(std::string const &fname);
|
||||
|
||||
/**
|
||||
* Determines the indices of those vertices with Dirichlet boundary conditions.
|
||||
* @return index vector.
|
||||
*
|
||||
* @warning All boundary nodes are considered as Dirchlet nodes.
|
||||
*/
|
||||
std::vector<int> Index_DirichletNodes() const override;
|
||||
|
||||
private:
|
||||
/**
|
||||
* Determines the indices of those vertices with Dirichlet boundary conditions
|
||||
* @return index vector.
|
||||
*/
|
||||
int Nnbedges() const
|
||||
{
|
||||
return static_cast<int>(bedges.size());
|
||||
}
|
||||
|
||||
std::vector<int> bedges; //!< boundary edges [nbedges][2] storing start/end vertex
|
||||
|
||||
};
|
||||
|
||||
#endif
|
||||
347
ex3_benchmarks/getmatrix.cpp
Normal file
347
ex3_benchmarks/getmatrix.cpp
Normal file
|
|
@ -0,0 +1,347 @@
|
|||
#include "getmatrix.h"
|
||||
#include "userset.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
|
||||
// general routine for lin. triangular elements
|
||||
|
||||
void CalcElem(int const ial[3], double const xc[], double ske[3][3], double fe[3])
|
||||
//void CalcElem(const int* __restrict__ ial, const double* __restrict__ xc, double* __restrict__ ske[3], double* __restrict__ fe)
|
||||
{
|
||||
const int i1 = 2 * ial[0], i2 = 2 * ial[1], i3 = 2 * ial[2];
|
||||
const double x13 = xc[i3 + 0] - xc[i1 + 0], y13 = xc[i3 + 1] - xc[i1 + 1],
|
||||
x21 = xc[i1 + 0] - xc[i2 + 0], y21 = xc[i1 + 1] - xc[i2 + 1],
|
||||
x32 = xc[i2 + 0] - xc[i3 + 0], y32 = xc[i2 + 1] - xc[i3 + 1];
|
||||
const double jac = fabs(x21 * y13 - x13 * y21);
|
||||
|
||||
ske[0][0] = 0.5 / jac * (y32 * y32 + x32 * x32);
|
||||
ske[0][1] = 0.5 / jac * (y13 * y32 + x13 * x32);
|
||||
ske[0][2] = 0.5 / jac * (y21 * y32 + x21 * x32);
|
||||
ske[1][0] = ske[0][1];
|
||||
ske[1][1] = 0.5 / jac * (y13 * y13 + x13 * x13);
|
||||
ske[1][2] = 0.5 / jac * (y21 * y13 + x21 * x13);
|
||||
ske[2][0] = ske[0][2];
|
||||
ske[2][1] = ske[1][2];
|
||||
ske[2][2] = 0.5 / jac * (y21 * y21 + x21 * x21);
|
||||
|
||||
const double xm = (xc[i1 + 0] + xc[i2 + 0] + xc[i3 + 0]) / 3.0,
|
||||
ym = (xc[i1 + 1] + xc[i2 + 1] + xc[i3 + 1]) / 3.0;
|
||||
//fe[0] = fe[1] = fe[2] = 0.5 * jac * FunctF(xm, ym) / 3.0;
|
||||
fe[0] = fe[1] = fe[2] = 0.5 * jac * fNice(xm, ym) / 3.0;
|
||||
}
|
||||
|
||||
|
||||
// general routine for lin. triangular elements,
|
||||
// non-symm. matrix
|
||||
// node numbering in element: a s c e n d i n g indices !!
|
||||
// GH: deprecated
|
||||
void AddElem(int const ial[3], double const ske[3][3], double const fe[3],
|
||||
int const id[], int const ik[], double sk[], double f[])
|
||||
{
|
||||
for (int i = 0; i < 3; ++i)
|
||||
{
|
||||
const int ii = ial[i], // row ii (global index)
|
||||
id1 = id[ii], // start and
|
||||
id2 = id[ii + 1]; // end of row ii in matrix
|
||||
int ip = id1;
|
||||
for (int j = 0; j < 3; ++j) // no symmetry assumed
|
||||
{
|
||||
const int jj = ial[j];
|
||||
bool not_found = true;
|
||||
do // find entry jj (global index) in row ii
|
||||
{
|
||||
not_found = (ik[ip] != jj);
|
||||
++ip;
|
||||
}
|
||||
while (not_found && ip < id2);
|
||||
|
||||
#ifndef NDEBUG // compiler option -DNDEBUG switches off the check
|
||||
if (not_found) // no entry found !!
|
||||
{
|
||||
cout << "Error in AddElem: (" << ii << "," << jj << ") ["
|
||||
<< ial[0] << "," << ial[1] << "," << ial[2] << "]\n";
|
||||
assert(!not_found);
|
||||
}
|
||||
#endif
|
||||
sk[ip - 1] += ske[i][j];
|
||||
}
|
||||
f[ii] += fe[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
// ####################################################################
|
||||
|
||||
CRS_Matrix::CRS_Matrix(Mesh const &mesh)
|
||||
: _mesh(mesh), _nrows(0), _nnz(0), _id(0), _ik(0), _sk(0)
|
||||
{
|
||||
Derive_Matrix_Pattern();
|
||||
return;
|
||||
}
|
||||
|
||||
void CRS_Matrix::Derive_Matrix_Pattern()
|
||||
{
|
||||
int const nelem(_mesh.Nelems());
|
||||
int const ndof_e(_mesh.NdofsElement());
|
||||
auto const &ia(_mesh.GetConnectivity());
|
||||
// Determine the number of matrix rows
|
||||
_nrows = *max_element(ia.cbegin(), ia.cbegin() + ndof_e * nelem);
|
||||
++_nrows; // node numberng: 0 ... nnode-1
|
||||
assert(*min_element(ia.cbegin(), ia.cbegin() + ndof_e * nelem) == 0); // numbering starts with 0 ?
|
||||
|
||||
// Collect for each node those nodes it is connected to (multiple entries)
|
||||
// Detect the neighboring nodes
|
||||
vector< list<int> > cc(_nrows); // cc[i] is the list of nodes a node i is connected to
|
||||
for (int i = 0; i < nelem; ++i)
|
||||
{
|
||||
int const idx = ndof_e * i;
|
||||
for (int k = 0; k < ndof_e; ++k)
|
||||
{
|
||||
list<int> &cck = cc.at(ia[idx + k]);
|
||||
cck.insert( cck.end(), ia.cbegin() + idx, ia.cbegin() + idx + ndof_e );
|
||||
}
|
||||
}
|
||||
// Delete the multiple entries
|
||||
_nnz = 0;
|
||||
for (auto &it : cc)
|
||||
{
|
||||
it.sort();
|
||||
it.unique();
|
||||
_nnz += static_cast<int>(it.size());
|
||||
// cout << it.size() << " :: "; copy(it->begin(),it->end(), ostream_iterator<int,char>(cout," ")); cout << endl;
|
||||
}
|
||||
|
||||
// CSR data allocation
|
||||
_id.resize(_nrows + 1); // Allocate memory for CSR row pointer
|
||||
_ik.resize(_nnz); // Allocate memory for CSR column index vector
|
||||
|
||||
// copy CSR data
|
||||
_id[0] = 0; // begin of first row
|
||||
for (size_t i = 0; i < cc.size(); ++i)
|
||||
{
|
||||
//cout << i << " " << nid.at(i) << endl;;
|
||||
const list<int> &ci = cc.at(i);
|
||||
const auto nci = static_cast<int>(ci.size());
|
||||
_id[i + 1] = _id[i] + nci; // begin of next line
|
||||
copy(ci.begin(), ci.end(), _ik.begin() + _id[i] );
|
||||
}
|
||||
|
||||
assert(_nnz == _id[_nrows]);
|
||||
_sk.resize(_nnz); // Allocate memory for CSR column index vector
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void CRS_Matrix::Debug() const
|
||||
{
|
||||
// ID points to first entry of row
|
||||
// no symmetry assumed
|
||||
cout << "\nMatrix (nnz = " << _id[_nrows] << ")\n";
|
||||
|
||||
for (int row = 0; row < _nrows; ++row)
|
||||
{
|
||||
cout << "Row " << row << " : ";
|
||||
int const id1 = _id[row];
|
||||
int const id2 = _id[row + 1];
|
||||
for (int j = id1; j < id2; ++j)
|
||||
{
|
||||
cout.setf(ios::right, ios::adjustfield);
|
||||
cout << "[" << setw(2) << _ik[j] << "] " << setw(4) << _sk[j] << " ";
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void CRS_Matrix::CalculateLaplace(vector<double> &f)
|
||||
{
|
||||
assert(_mesh.NdofsElement() == 3); // only for triangular, linear elements
|
||||
//cout << _nnz << " vs. " << _id[_nrows] << " " << _nrows<< endl;
|
||||
assert(_nnz == _id[_nrows]);
|
||||
|
||||
for (int k = 0; k < _nrows; ++k)
|
||||
{
|
||||
_sk[k] = 0.0;
|
||||
}
|
||||
for (int k = 0; k < _nrows; ++k)
|
||||
{
|
||||
f[k] = 0.0;
|
||||
}
|
||||
|
||||
double ske[3][3], fe[3];
|
||||
// Loop over all elements
|
||||
auto const nelem = _mesh.Nelems();
|
||||
auto const &ia = _mesh.GetConnectivity();
|
||||
auto const &xc = _mesh.GetCoords();
|
||||
|
||||
for (int i = 0; i < nelem; ++i)
|
||||
{
|
||||
CalcElem(ia.data() + 3 * i, xc.data(), ske, fe);
|
||||
//AddElem(ia.data()+3 * i, ske, fe, _id.data(), _ik.data(), _sk.data(), f.data()); // GH: deprecated
|
||||
AddElem_3(ia.data() + 3 * i, ske, fe, f);
|
||||
}
|
||||
|
||||
//Debug();
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void CRS_Matrix::ApplyDirichletBC(std::vector<double> const &u, std::vector<double> &f)
|
||||
{
|
||||
double const PENALTY = 1e6;
|
||||
auto const idx = _mesh.Index_DirichletNodes();
|
||||
int const nidx = static_cast<int>(idx.size());
|
||||
|
||||
for (int row = 0; row < nidx; ++row)
|
||||
{
|
||||
int const k = idx[row];
|
||||
int const id1 = fetch(k, k); // Find diagonal entry of row
|
||||
assert(id1 >= 0);
|
||||
_sk[id1] += PENALTY; // matrix weighted scaling feasible
|
||||
f[k] += PENALTY * u[k];
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void CRS_Matrix::GetDiag(vector<double> &d) const
|
||||
{
|
||||
assert( _nrows == static_cast<int>(d.size()) );
|
||||
|
||||
for (int row = 0; row < _nrows; ++row)
|
||||
{
|
||||
const int ia = fetch(row, row); // Find diagonal entry of row
|
||||
assert(ia >= 0);
|
||||
d[row] = _sk[ia];
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
bool CRS_Matrix::Compare2Old(int nnode, int const id[], int const ik[], double const sk[]) const
|
||||
{
|
||||
bool bn = (nnode == _nrows); // number of rows
|
||||
if (!bn)
|
||||
{
|
||||
cout << "######### Error: " << "number of rows" << endl;
|
||||
}
|
||||
|
||||
bool bz = (id[nnode] == _nnz); // number of non zero elements
|
||||
if (!bz)
|
||||
{
|
||||
cout << "######### Error: " << "number of non zero elements" << endl;
|
||||
}
|
||||
|
||||
bool bd = equal(id, id + nnode + 1, _id.cbegin()); // row starts
|
||||
if (!bd)
|
||||
{
|
||||
cout << "######### Error: " << "row starts" << endl;
|
||||
}
|
||||
|
||||
bool bk = equal(ik, ik + id[nnode], _ik.cbegin()); // column indices
|
||||
if (!bk)
|
||||
{
|
||||
cout << "######### Error: " << "column indices" << endl;
|
||||
}
|
||||
|
||||
bool bv = equal(sk, sk + id[nnode], _sk.cbegin()); // values
|
||||
if (!bv)
|
||||
{
|
||||
cout << "######### Error: " << "values" << endl;
|
||||
}
|
||||
|
||||
return bn && bz && bd && bk && bv;
|
||||
}
|
||||
|
||||
|
||||
void CRS_Matrix::Mult(vector<double> &w, vector<double> const &u) const
|
||||
{
|
||||
assert( _nrows == static_cast<int>(w.size()) );
|
||||
assert( w.size() == u.size() );
|
||||
|
||||
for (int row = 0; row < _nrows; ++row)
|
||||
{
|
||||
double wi = 0.0;
|
||||
for (int ij = _id[row]; ij < _id[row + 1]; ++ij)
|
||||
{
|
||||
wi += _sk[ij] * u[ _ik[ij] ];
|
||||
}
|
||||
w[row] = wi;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void CRS_Matrix::Defect(vector<double> &w,
|
||||
vector<double> const &f, vector<double> const &u) const
|
||||
{
|
||||
assert( _nrows == static_cast<int>(w.size()) );
|
||||
assert( w.size() == u.size() && u.size() == f.size() );
|
||||
|
||||
for (int row = 0; row < _nrows; ++row)
|
||||
{
|
||||
double wi = f[row];
|
||||
for (int ij = _id[row]; ij < _id[row + 1]; ++ij)
|
||||
{
|
||||
wi -= _sk[ij] * u[ _ik[ij] ];
|
||||
}
|
||||
w[row] = wi;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
int CRS_Matrix::fetch(int const row, int const col) const
|
||||
{
|
||||
int const id2 = _id[row + 1]; // end and
|
||||
int ip = _id[row]; // start of recent row (global index)
|
||||
|
||||
while (ip < id2 && _ik[ip] != col) // find index col (global index)
|
||||
{
|
||||
++ip;
|
||||
}
|
||||
if (ip >= id2)
|
||||
{
|
||||
ip = -1;
|
||||
#ifndef NDEBUG // compiler option -DNDEBUG switches off the check
|
||||
cout << "No column " << col << " in row " << row << endl;
|
||||
assert(ip >= id2);
|
||||
#endif
|
||||
}
|
||||
return ip;
|
||||
}
|
||||
|
||||
|
||||
void CRS_Matrix::AddElem_3(int const ial[3], double const ske[3][3], double const fe[3], vector<double> &f)
|
||||
{
|
||||
for (int i = 0; i < 3; ++i)
|
||||
{
|
||||
const int ii = ial[i]; // row ii (global index)
|
||||
for (int j = 0; j < 3; ++j) // no symmetry assumed
|
||||
{
|
||||
const int jj = ial[j]; // column jj (global index)
|
||||
int ip = fetch(ii, jj); // find column entry jj in row ii
|
||||
#ifndef NDEBUG // compiler option -DNDEBUG switches off the check
|
||||
if (ip < 0) // no entry found !!
|
||||
{
|
||||
cout << "Error in AddElem: (" << ii << "," << jj << ") ["
|
||||
<< ial[0] << "," << ial[1] << "," << ial[2] << "]\n";
|
||||
assert(ip >= 0);
|
||||
}
|
||||
#endif
|
||||
_sk[ip] += ske[i][j];
|
||||
}
|
||||
f[ii] += fe[i];
|
||||
}
|
||||
}
|
||||
|
||||
178
ex3_benchmarks/getmatrix.h
Normal file
178
ex3_benchmarks/getmatrix.h
Normal file
|
|
@ -0,0 +1,178 @@
|
|||
#ifndef GETMATRIX_FILE
|
||||
#define GETMATRIX_FILE
|
||||
|
||||
#include "geom.h"
|
||||
#include <vector>
|
||||
|
||||
/**
|
||||
* Calculates the element stiffness matrix @p ske and the element load vector @p fe
|
||||
* of one triangular element with linear shape functions.
|
||||
* @param[in] ial node indices of the three element vertices
|
||||
* @param[in] xc vector of node coordinates with x(2*k,2*k+1) as coodinates of node k
|
||||
* @param[out] ske element stiffness matrix
|
||||
* @param[out] fe element load vector
|
||||
*/
|
||||
void CalcElem(int const ial[3], double const xc[], double ske[3][3], double fe[3]);
|
||||
|
||||
/**
|
||||
* Adds the element stiffness matrix @p ske and the element load vector @p fe
|
||||
* of one triangular element with linear shape functions to the appropriate positions in
|
||||
* the symmetric stiffness matrix, stored as CSR matrix K(@p sk,@p id, @p ik)
|
||||
*
|
||||
* @param[in] ial node indices of the three element vertices
|
||||
* @param[in] ske element stiffness matrix
|
||||
* @param[in] fe element load vector
|
||||
* @param[out] sk vector non-zero entries of CSR matrix
|
||||
* @param[in] id index vector containing the first entry in a CSR row
|
||||
* @param[in] ik column index vector of CSR matrix
|
||||
* @param[out] f distributed local vector storing the right hand side
|
||||
*
|
||||
* @warning Algorithm requires indices in connectivity @p ial in ascending order.
|
||||
* Currently deprecated.
|
||||
*/
|
||||
void AddElem(int const ial[3], double const ske[3][3], double const fe[3],
|
||||
int const id[], int const ik[], double sk[], double f[]);
|
||||
|
||||
|
||||
// #####################################################################
|
||||
/**
|
||||
* Square matrix in CRS format (compressed row storage; also named CSR),
|
||||
* see an <a href="https://en.wikipedia.org/wiki/Sparse_matrix">introduction</a>.
|
||||
*/
|
||||
class CRS_Matrix
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Intializes the CRS matrix structure from the given discetization in @p mesh.
|
||||
*
|
||||
* The sparse matrix pattern is generated but the values are 0.
|
||||
*
|
||||
* @param[in] mesh given discretization
|
||||
*
|
||||
* @warning A reference to the discretization @p mesh is stored inside this class.
|
||||
* Therefore, changing @p mesh outside requires also
|
||||
* to call method @p Derive_Matrix_Pattern explicitely.
|
||||
*
|
||||
* @see Derive_Matrix_Pattern
|
||||
*/
|
||||
explicit CRS_Matrix(Mesh const & mesh);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
~CRS_Matrix()
|
||||
{}
|
||||
|
||||
/**
|
||||
* Generates the sparse matrix pattern and overwrites the existing pattern.
|
||||
*
|
||||
* The sparse matrix pattern is generated but the values are 0.
|
||||
*/
|
||||
void Derive_Matrix_Pattern();
|
||||
|
||||
/**
|
||||
* Calculates the entries of f.e. stiffness matrix and load/rhs vector @p f for the Laplace operator in 2D.
|
||||
* No memory is allocated.
|
||||
*
|
||||
* @param[in,out] f (preallocated) rhs/load vector
|
||||
*/
|
||||
void CalculateLaplace(std::vector<double> &f);
|
||||
|
||||
/**
|
||||
* Applies Dirichlet boundary conditions to stiffness matrix and to load vector @p f.
|
||||
* The <a href="https://www.jstor.org/stable/2005611?seq=1#metadata_info_tab_contents">penalty method</a>
|
||||
* is used for incorporating the given values @p u.
|
||||
*
|
||||
* @param[in] u (global) vector with Dirichlet data
|
||||
* @param[in,out] f load vector
|
||||
*/
|
||||
void ApplyDirichletBC(std::vector<double> const &u, std::vector<double> &f);
|
||||
|
||||
/**
|
||||
* Extracts the diagonal elemenst of the sparse matrix.
|
||||
*
|
||||
* @param[in,out] d (prellocated) vector of diagonal elements
|
||||
*/
|
||||
void GetDiag(std::vector<double> &d) const;
|
||||
|
||||
/**
|
||||
* Performs the matrix-vector product w := K*u.
|
||||
*
|
||||
* @param[in,out] w resulting vector (preallocated)
|
||||
* @param[in] u vector
|
||||
*/
|
||||
void Mult(std::vector<double> &w, std::vector<double> const &u) const;
|
||||
|
||||
/**
|
||||
* Calculates the defect/residuum w := f - K*u.
|
||||
*
|
||||
* @param[in,out] w resulting vector (preallocated)
|
||||
* @param[in] f load vector
|
||||
* @param[in] u vector
|
||||
*/
|
||||
void Defect(std::vector<double> &w,
|
||||
std::vector<double> const &f, std::vector<double> const &u) const;
|
||||
|
||||
/**
|
||||
* Number rows in matrix.
|
||||
* @return number of rows.
|
||||
*/
|
||||
int Nrows() const
|
||||
{return _nrows;}
|
||||
|
||||
/**
|
||||
* Show the matrix entries.
|
||||
*/
|
||||
void Debug() const;
|
||||
|
||||
/**
|
||||
* Finds in a CRS matrix the access index for an entry at row @p row and column @p col.
|
||||
*
|
||||
* @param[in] row row index
|
||||
* @param[in] col column index
|
||||
* @return index for element (@p row, @p col). If no appropriate entry exists then -1 will be returned.
|
||||
*
|
||||
* @warning assert() stops the function in case that matrix element (@p row, @p col) doesn't exist.
|
||||
*/
|
||||
int fetch(int row, int col) const;
|
||||
|
||||
/**
|
||||
* Adds the element stiffness matrix @p ske and the element load vector @p fe
|
||||
* of one triangular element with linear shape functions to the appropriate positions in
|
||||
* the stiffness matrix, stored as CSR matrix K(@p sk,@p id, @p ik).
|
||||
*
|
||||
* @param[in] ial node indices of the three element vertices
|
||||
* @param[in] ske element stiffness matrix
|
||||
* @param[in] fe element load vector
|
||||
* @param[in,out] f distributed local vector storing the right hand side
|
||||
*
|
||||
* @warning Algorithm assumes linear triangular elements (ndof_e==3).
|
||||
*/
|
||||
void AddElem_3(int const ial[3], double const ske[3][3], double const fe[3], std::vector<double> &f);
|
||||
|
||||
/**
|
||||
* Compare @p this CRS matrix with an external CRS matrix stored in C-Style.
|
||||
*
|
||||
* The method prints statements on differences found.
|
||||
*
|
||||
* @param[in] nnode row number of external matrix
|
||||
* @param[in] id start indices of matrix rows of external matrix
|
||||
* @param[in] ik column indices of external matrix
|
||||
* @param[in] sk non-zero values of external matrix
|
||||
*
|
||||
* @return true iff all data are identical.
|
||||
*/
|
||||
bool Compare2Old(int nnode, int const id[], int const ik[], double const sk[]) const;
|
||||
|
||||
private:
|
||||
Mesh const & _mesh; //!< reference to discretization
|
||||
int _nrows; //!< number of rows in matrix
|
||||
int _nnz; //!< number of non-zero entries
|
||||
std::vector<int> _id; //!< start indices of matrix rows
|
||||
std::vector<int> _ik; //!< column indices
|
||||
std::vector<double> _sk; //!< non-zero values
|
||||
|
||||
};
|
||||
|
||||
|
||||
#endif
|
||||
BIN
ex3_benchmarks/gh_code_Schratter.pdf
Normal file
BIN
ex3_benchmarks/gh_code_Schratter.pdf
Normal file
Binary file not shown.
13
ex3_benchmarks/gh_response.txt
Normal file
13
ex3_benchmarks/gh_response.txt
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
* added ../GCC_default.mk and ../CLANG_default.mk into your repository
|
||||
* missing objects in Makefile, see Makefile:18 [-1]
|
||||
* benchmark_tests.cpp:214
|
||||
double check_sum // GH: initialize!
|
||||
|
||||
* Some comprehensive documents on results would be good.
|
||||
|
||||
* benchmarks.cpp
|
||||
MatMat(): swao j and k loop // should be faster
|
||||
minor remarks in pdf.
|
||||
|
||||
|
||||
* make codecheck COMPILER=CLANG_ > codecheck_gh.out
|
||||
61
ex3_benchmarks/jacsolve.cpp
Normal file
61
ex3_benchmarks/jacsolve.cpp
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
#include "vdop.h"
|
||||
#include "getmatrix.h"
|
||||
#include "jacsolve.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
// #####################################################################
|
||||
// const int neigh[], const int color,
|
||||
// const MPI::Intracomm& icomm,
|
||||
void JacobiSolve(CRS_Matrix const &SK, vector<double> const &f, vector<double> &u)
|
||||
{
|
||||
const double omega = 1.0;
|
||||
const int maxiter = 1000;
|
||||
const double tol = 1e-5, // tolerance
|
||||
tol2 = tol * tol; // tolerance^2
|
||||
|
||||
int nrows = SK.Nrows(); // number of rows == number of columns
|
||||
assert( nrows == static_cast<int>(f.size()) && f.size() == u.size() );
|
||||
|
||||
cout << endl << " Start Jacobi solver for " << nrows << " d.o.f.s" << endl;
|
||||
// Choose initial guess
|
||||
for (int k = 0; k < nrows; ++k)
|
||||
{
|
||||
u[k] = 0.0; // u := 0
|
||||
}
|
||||
|
||||
vector<double> dd(nrows); // matrix diagonal
|
||||
vector<double> r(nrows); // residual
|
||||
vector<double> w(nrows); // correction
|
||||
|
||||
SK.GetDiag(dd); // dd := diag(K)
|
||||
////DebugVector(dd);{int ijk; cin >> ijk;}
|
||||
|
||||
// Initial sweep
|
||||
SK.Defect(r, f, u); // r := f - K*u
|
||||
|
||||
vddiv(w, r, dd); // w := D^{-1}*r
|
||||
double sigma0 = dscapr(w, r); // s0 := <w,r>
|
||||
|
||||
// Iteration sweeps
|
||||
int iter = 0;
|
||||
double sigma = sigma0;
|
||||
while ( sigma > tol2 * sigma0 && maxiter > iter)
|
||||
{
|
||||
++iter;
|
||||
vdaxpy(u, u, omega, w ); // u := u + om*w
|
||||
SK.Defect(r, f, u); // r := f - K*u
|
||||
vddiv(w, r, dd); // w := D^{-1}*r
|
||||
sigma = dscapr(w, r); // s0 := <w,r>
|
||||
// cout << "Iteration " << iter << " : " << sqrt(sigma/sigma0) << endl;
|
||||
}
|
||||
cout << "aver. Jacobi rate : " << exp(log(sqrt(sigma / sigma0)) / iter) << " (" << iter << " iter)" << endl;
|
||||
cout << "final error: " << sqrt(sigma / sigma0) << " (rel) " << sqrt(sigma) << " (abs)\n";
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
18
ex3_benchmarks/jacsolve.h
Normal file
18
ex3_benchmarks/jacsolve.h
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
#ifndef JACSOLVE_FILE
|
||||
#define JACSOLVE_FILE
|
||||
#include "getmatrix.h"
|
||||
#include <vector>
|
||||
|
||||
|
||||
/**
|
||||
* Solves linear system of equations K @p u = @p f via the Jacobi iteration.
|
||||
* We use a distributed symmetric CSR matrix @p SK and initial guess of the
|
||||
* solution is set to 0.
|
||||
* @param[in] SK CSR matrix
|
||||
* @param[in] f distributed local vector storing the right hand side
|
||||
* @param[out] u accumulated local vector storing the solution.
|
||||
*/
|
||||
void JacobiSolve(CRS_Matrix const &SK, std::vector<double> const &f, std::vector<double> &u);
|
||||
|
||||
|
||||
#endif
|
||||
101
ex3_benchmarks/main.cpp
Normal file
101
ex3_benchmarks/main.cpp
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
#include "benchmarks.h"
|
||||
#include "benchmark_tests.h"
|
||||
#include "factorization_solve_tests.h"
|
||||
#include <iostream>
|
||||
|
||||
int main()
|
||||
{
|
||||
// ---------------------------------- 1. ----------------------------------
|
||||
// results in file "test_system.txt"
|
||||
|
||||
|
||||
|
||||
// ---------------------------------- 2. ----------------------------------
|
||||
// Memory FLOPS Read-write operations
|
||||
// (A) 2N 2N 2N
|
||||
// (B) MN + N 2NM 2NM + M
|
||||
// (C) ML + LM 2LNM 2LNM + MN
|
||||
// (D) (p+1) + N 3N(p+1) N(2 + 3(p+1))
|
||||
|
||||
|
||||
|
||||
// ---------------------------------- 3. ----------------------------------
|
||||
// implementation in file "benchmarks.cpp"
|
||||
|
||||
|
||||
|
||||
// ---------------------------------- 4.& 6. ----------------------------------
|
||||
vector<vector<double>> results;
|
||||
|
||||
results.push_back(test_A(250, 50000000, scalar));
|
||||
results.push_back(test_A(250, 50000000, Kahan_skalar));
|
||||
results.push_back(test_A(250, 50000000, scalar_cBLAS));
|
||||
// Timing GFLOPS GiByte/s
|
||||
// ------------------------------------------
|
||||
// scalar 0.039 2.4 19
|
||||
// Kahan_skalar 0.037 2.5 20
|
||||
// scalar_cBLAS 0.032 2.9 23
|
||||
|
||||
|
||||
results.push_back(test_B(100, 20000, 10000, MatVec));
|
||||
results.push_back(test_B(100, 20000, 10000, MatVec_cBLAS));
|
||||
// Timing GFLOPS GiByte/s
|
||||
// ------------------------------------------
|
||||
// MatVec 0.1 3.6 29
|
||||
// MatVec_cBLAS 0.074 5 40
|
||||
|
||||
|
||||
results.push_back(test_C(25, 500, 1000, 1500, MatMat));
|
||||
results.push_back(test_C(25, 500, 1000, 1500, MatMat_cBLAS));
|
||||
// Timing GFLOPS GiByte/s
|
||||
// ------------------------------------------
|
||||
// MatMat 0.57 2.5 20
|
||||
// MatMat_cBLAS 0.019 75 6e+02 // unrealistic
|
||||
|
||||
|
||||
results.push_back(test_D(100, 100, 1000000));
|
||||
// Timing GFLOPS GiByte/s
|
||||
// ------------------------------------------
|
||||
// 0.11 2.5 20
|
||||
|
||||
|
||||
cout << endl << "Timing\tGFLOPS\tGiByte/s" << endl;
|
||||
cout << "------------------------------" << endl;
|
||||
for (size_t i = 0; i < results.size(); ++i)
|
||||
cout << results[i][0] << "\t" << results[i][1] << "\t" << results[i][2] << endl;
|
||||
cout << endl;
|
||||
|
||||
|
||||
|
||||
// ---------------------------------- 5. ----------------------------------
|
||||
// 5.(a) Observation: time to calculate norm is approximately half the time as for the scalar product.
|
||||
// Reason: only have to access entries of x, so less memory that has to be accessed
|
||||
//
|
||||
// 5.(b) Runtime for Kahan_scalar is roughly the same as for the normal scalar product
|
||||
|
||||
|
||||
|
||||
// ---------------------------------- 6. ----------------------------------
|
||||
// see 4.
|
||||
|
||||
|
||||
// ---------------------------------- 7. ----------------------------------
|
||||
CheckCorrectness();
|
||||
// Checked correctness by computing the inverse of A
|
||||
|
||||
CheckDuration(5000);
|
||||
// The solving time per RHS scales roughly with factor 1/n_rhs
|
||||
|
||||
|
||||
|
||||
// ---------------------------------- 8. ----------------------------------
|
||||
// done seperately
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
return 0;
|
||||
|
||||
}
|
||||
1826
ex3_benchmarks/small_Doxyfile
Normal file
1826
ex3_benchmarks/small_Doxyfile
Normal file
File diff suppressed because it is too large
Load diff
70
ex3_benchmarks/test_system.txt
Normal file
70
ex3_benchmarks/test_system.txt
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
---------------------------------- 1. ----------------------------------
|
||||
|
||||
|
||||
rm -f *.exe *.o
|
||||
gcc -O3 -c -o mysecond.o mysecond.c
|
||||
gcc -O3 -c mysecond.c
|
||||
gfortran -O3 -DSTREAM_ARRAY_SIZE=80000000 -DNTIMES=20 -c stream.f
|
||||
gfortran -O3 stream.o mysecond.o -o stream_f.exe
|
||||
gcc -O3 -DSTREAM_ARRAY_SIZE=80000000 -DNTIMES=20 stream.c -o stream_c.exe
|
||||
gcc -O3 -DUNIX flops.c -o flops.exe
|
||||
flops.c: In function ‘main’:
|
||||
flops.c:231:4: warning: implicit declaration of function ‘dtime’ [-Wimplicit-function-declaration]
|
||||
231 | dtime(TimeArray);
|
||||
| ^~~~~
|
||||
flops.c: At top level:
|
||||
flops.c:723:1: warning: return type defaults to ‘int’ [-Wimplicit-int]
|
||||
723 | dtime(p)
|
||||
| ^~~~~
|
||||
./stream_c.exe
|
||||
-------------------------------------------------------------
|
||||
STREAM version $Revision: 5.10 $
|
||||
-------------------------------------------------------------
|
||||
This system uses 8 bytes per array element.
|
||||
-------------------------------------------------------------
|
||||
Array size = 80000000 (elements), Offset = 0 (elements)
|
||||
Memory per array = 610.4 MiB (= 0.6 GiB).
|
||||
Total memory required = 1831.1 MiB (= 1.8 GiB).
|
||||
Each kernel will be executed 20 times.
|
||||
The *best* time for each kernel (excluding the first iteration)
|
||||
will be used to compute the reported bandwidth.
|
||||
-------------------------------------------------------------
|
||||
Your clock granularity/precision appears to be 1 microseconds.
|
||||
Each test below will take on the order of 79294 microseconds.
|
||||
(= 79294 clock ticks)
|
||||
Increase the size of the arrays if this shows that
|
||||
you are not getting at least 20 clock ticks per test.
|
||||
-------------------------------------------------------------
|
||||
WARNING -- The above is only a rough guideline.
|
||||
For best results, please be sure you know the
|
||||
precision of your system timer.
|
||||
-------------------------------------------------------------
|
||||
Function Best Rate MB/s Avg time Min time Max time
|
||||
Copy: 26720.6 0.057416 0.047903 0.098979
|
||||
Scale: 17008.6 0.087616 0.075256 0.133899
|
||||
Add: 19169.9 0.113818 0.100157 0.177676
|
||||
Triad: 19144.9 0.111248 0.100288 0.170877
|
||||
-------------------------------------------------------------
|
||||
Solution Validates: avg error less than 1.000000e-13 on all three arrays
|
||||
-------------------------------------------------------------
|
||||
./flops.exe
|
||||
|
||||
FLOPS C Program (Double Precision), V2.0 18 Dec 1992
|
||||
|
||||
Module Error RunTime MFLOPS
|
||||
(usec)
|
||||
1 4.0146e-13 0.0023 6183.8896
|
||||
2 -1.4166e-13 0.0006 11377.1999
|
||||
3 4.7184e-14 0.0034 5031.5222
|
||||
4 -1.2557e-13 0.0031 4841.2566
|
||||
5 -1.3800e-13 0.0056 5208.6586
|
||||
6 3.2380e-13 0.0053 5484.2426
|
||||
7 -8.4583e-11 0.0031 3832.8326
|
||||
8 3.4867e-13 0.0055 5410.0375
|
||||
|
||||
Iterations = 512000000
|
||||
NullTime (usec) = 0.0000
|
||||
MFLOPS(1) = 8055.7366
|
||||
MFLOPS(2) = 4732.2658
|
||||
MFLOPS(3) = 5164.0037
|
||||
MFLOPS(4) = 5257.0181
|
||||
16
ex3_benchmarks/userset.cpp
Normal file
16
ex3_benchmarks/userset.cpp
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
#include "userset.h"
|
||||
#include <cmath>
|
||||
|
||||
|
||||
double FunctF(double const x, double const y)
|
||||
{
|
||||
// return std::sin(3.14159*1*x)*std::sin(3.14159*1*y);
|
||||
// return 16.0*1024. ;
|
||||
// return (double)1.0 ;
|
||||
return x * x * std::sin(2.5 * 3.14159 * y);
|
||||
}
|
||||
|
||||
double FunctU(const double /* x */, double const /* y */)
|
||||
{
|
||||
return 1.0 ;
|
||||
}
|
||||
44
ex3_benchmarks/userset.h
Normal file
44
ex3_benchmarks/userset.h
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
#ifndef USERSET_FILE
|
||||
#define USERSET_FILE
|
||||
#include <cmath>
|
||||
/**
|
||||
* User function: f(@p x,@p y)
|
||||
* @param[in] x x-coordinate of discretization point
|
||||
* @param[in] y y-coordinate of discretization point
|
||||
* @return value for right hand side f(@p x,@p y)
|
||||
*/
|
||||
double FunctF(double const x, double const y);
|
||||
|
||||
/**
|
||||
* User function: u(@p x,@p y)
|
||||
* @param[in] x x-coordinate of discretization point
|
||||
* @param[in] y y-coordinate of discretization point
|
||||
* @return value for solution vector u(@p x,@p y)
|
||||
*/
|
||||
double FunctU(double const x, double const y);
|
||||
|
||||
|
||||
/**
|
||||
* User function: f(@p x,@p y) = @f$ x^2 \sin(2.5\pi y)@f$.
|
||||
* @param[in] x x-coordinate of discretization point
|
||||
* @param[in] y y-coordinate of discretization point
|
||||
* @return value f(@p x,@p y)
|
||||
*/
|
||||
inline double fNice(double const x, double const y)
|
||||
{
|
||||
return x * x * std::sin(2.5 * 3.14159 * y);
|
||||
}
|
||||
|
||||
/**
|
||||
* User function: f(@p x,@p y) = 0$.
|
||||
* @param[in] x x-coordinate of discretization point
|
||||
* @param[in] y y-coordinate of discretization point
|
||||
* @return value 0
|
||||
*/
|
||||
inline double f_zero(double const x, double const y)
|
||||
//double f_zero(double const /*x*/, double const /*y*/)
|
||||
{
|
||||
return 0.0 + 0.0*(x+y);
|
||||
}
|
||||
|
||||
#endif
|
||||
84
ex3_benchmarks/vdop.cpp
Normal file
84
ex3_benchmarks/vdop.cpp
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
#include "vdop.h"
|
||||
#include <cassert> // assert()
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
|
||||
void vddiv(vector<double> &x, vector<double> const &y,
|
||||
vector<double> const &z)
|
||||
{
|
||||
assert( x.size() == y.size() && y.size() == z.size() );
|
||||
size_t n = x.size();
|
||||
|
||||
for (size_t k = 0; k < n; ++k)
|
||||
{
|
||||
x[k] = y[k] / z[k];
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
//******************************************************************************
|
||||
|
||||
void vdaxpy(std::vector<double> &x, std::vector<double> const &y,
|
||||
double alpha, std::vector<double> const &z )
|
||||
{
|
||||
assert( x.size() == y.size() && y.size() == z.size() );
|
||||
size_t n = x.size();
|
||||
|
||||
for (size_t k = 0; k < n; ++k)
|
||||
{
|
||||
x[k] = y[k] + alpha * z[k];
|
||||
}
|
||||
return;
|
||||
}
|
||||
//******************************************************************************
|
||||
|
||||
double dscapr(std::vector<double> const &x, std::vector<double> const &y)
|
||||
{
|
||||
assert( x.size() == y.size());
|
||||
size_t n = x.size();
|
||||
|
||||
double s = 0.0;
|
||||
for (size_t k = 0; k < n; ++k)
|
||||
{
|
||||
s += x[k] * y[k];
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
//******************************************************************************
|
||||
void DebugVector(vector<double> const &v)
|
||||
{
|
||||
cout << "\nVector (nnode = " << v.size() << ")\n";
|
||||
for (size_t j = 0; j < v.size(); ++j)
|
||||
{
|
||||
cout.setf(ios::right, ios::adjustfield);
|
||||
cout << v[j] << " ";
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
return;
|
||||
}
|
||||
//******************************************************************************
|
||||
bool CompareVectors(std::vector<double> const &x, int const n, double const y[], double const eps)
|
||||
{
|
||||
bool bn = (static_cast<int>(x.size()) == n);
|
||||
if (!bn)
|
||||
{
|
||||
cout << "######### Error: " << "number of elements" << endl;
|
||||
}
|
||||
//bool bv = equal(x.cbegin(),x.cend(),y);
|
||||
bool bv = equal(x.cbegin(), x.cend(), y,
|
||||
[eps](double a, double b) -> bool
|
||||
{ return std::abs(a - b) < eps * (1.0 + 0.5 * (std::abs(a) + std::abs(a))); }
|
||||
);
|
||||
if (!bv)
|
||||
{
|
||||
assert(static_cast<int>(x.size()) == n);
|
||||
cout << "######### Error: " << "values" << endl;
|
||||
}
|
||||
return bn && bv;
|
||||
}
|
||||
58
ex3_benchmarks/vdop.h
Normal file
58
ex3_benchmarks/vdop.h
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
#ifndef VDOP_FILE
|
||||
#define VDOP_FILE
|
||||
#include <vector>
|
||||
|
||||
/** @brief Element-wise vector divison x_k = y_k/z_k.
|
||||
*
|
||||
* @param[out] x target vector
|
||||
* @param[in] y source vector
|
||||
* @param[in] z source vector
|
||||
*
|
||||
*/
|
||||
void vddiv(std::vector<double> & x, std::vector<double> const& y,
|
||||
std::vector<double> const& z);
|
||||
|
||||
/** @brief Element-wise daxpy operation x(k) = y(k) + alpha*z(k).
|
||||
*
|
||||
* @param[out] x target vector
|
||||
* @param[in] y source vector
|
||||
* @param[in] alpha scalar
|
||||
* @param[in] z source vector
|
||||
*
|
||||
*/
|
||||
void vdaxpy(std::vector<double> & x, std::vector<double> const& y,
|
||||
double alpha, std::vector<double> const& z );
|
||||
|
||||
|
||||
/** @brief Calculates the Euclidian inner product of two vectors.
|
||||
*
|
||||
* @param[in] x vector
|
||||
* @param[in] y vector
|
||||
* @return Euclidian inner product @f$\langle x,y \rangle@f$
|
||||
*
|
||||
*/
|
||||
double dscapr(std::vector<double> const& x, std::vector<double> const& y);
|
||||
|
||||
|
||||
/**
|
||||
* Print entries of a vector.
|
||||
* @param[in] v vector values
|
||||
*/
|
||||
void DebugVector(std::vector<double> const &v);
|
||||
|
||||
/** @brief Compares an STL vector with POD vector.
|
||||
*
|
||||
* The accuracy criteria @f$ |x_k-y_k| < \varepsilon \left({1+0.5(|x_k|+|y_k|)}\right) @f$
|
||||
* follows the book by
|
||||
* <a href="https://www.springer.com/la/book/9783319446592">Stoyan/Baran</a>, p.8.
|
||||
*
|
||||
* @param[in] x STL vector
|
||||
* @param[in] n length of POD vector
|
||||
* @param[in] y POD vector
|
||||
* @param[in] eps relative accuracy criteria (default := 0.0).
|
||||
* @return true iff pairwise vector elements are relatively close to each other.
|
||||
*
|
||||
*/
|
||||
bool CompareVectors(std::vector<double> const& x, int n, double const y[], double const eps=0.0);
|
||||
|
||||
#endif
|
||||
BIN
ex4/Code.pdf
Normal file
BIN
ex4/Code.pdf
Normal file
Binary file not shown.
76
ex4/ex_4A.py
Normal file
76
ex4/ex_4A.py
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
np.set_printoptions(precision=2)
|
||||
|
||||
|
||||
N = 10 # number of elements
|
||||
a = 1
|
||||
alpha = 1
|
||||
g_b = 1
|
||||
f_const = 1
|
||||
|
||||
x = np.linspace(0, 1, N + 1)
|
||||
|
||||
|
||||
A = np.zeros((N + 1, N + 1))
|
||||
f_vec = np.zeros(N + 1)
|
||||
|
||||
for i in range(1, N + 1):
|
||||
h = x[i] - x[i - 1]
|
||||
|
||||
|
||||
a_11 = 1./h + a*h/3.
|
||||
a_12 = -1./h + a*h/6.
|
||||
a_21 = -1./h + a*h/6.
|
||||
a_22 = 1./h + a*h/3
|
||||
|
||||
A[i - 1, i - 1] += a_11
|
||||
A[i - 1, i] += a_12
|
||||
A[i, i - 1] += a_21
|
||||
A[i, i] += a_22
|
||||
|
||||
|
||||
f_vec[i] = f_const*h
|
||||
|
||||
print("A =\n", A)
|
||||
|
||||
# take Neumann data into account
|
||||
A[N, N] += alpha
|
||||
f_vec[N] += alpha*g_b
|
||||
|
||||
|
||||
# take Dirichlet data into account
|
||||
u_g = np.zeros(N + 1)
|
||||
u_g[0] = 0
|
||||
print("u_g =\n", u_g)
|
||||
|
||||
# remove first row of A
|
||||
A_g = A[1:N+1, :]
|
||||
#print("A_g =\n", A_g)
|
||||
|
||||
# remove first row of f_vec
|
||||
f_g = f_vec[1:N+1]
|
||||
|
||||
# assemble RHS with dirichlet data
|
||||
f_g -= A_g.dot(u_g)
|
||||
#print(f_g)
|
||||
|
||||
# matrix for the inner nodes (excluding nodes with dirichlet bcs)
|
||||
A_0 = A[1:N+1, 1:N+1]
|
||||
#print(A_0)
|
||||
|
||||
# solve for u_0 (free dofs)
|
||||
u_0 = np.linalg.solve(A_0, f_g)
|
||||
|
||||
# assemble "u = u_0 + u_g"
|
||||
u = np.concatenate([[0], u_0])
|
||||
|
||||
|
||||
print("u =\n", u)
|
||||
|
||||
plt.plot(x, u, '-')
|
||||
plt.xlabel('x')
|
||||
plt.ylabel('u_h(x)')
|
||||
plt.grid()
|
||||
plt.show()
|
||||
83
ex4/ex_4B.py
Normal file
83
ex4/ex_4B.py
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
np.set_printoptions(precision=2)
|
||||
|
||||
|
||||
|
||||
N = 2 # number of elements
|
||||
split = np.sqrt(2)/2
|
||||
#split = 0.5
|
||||
|
||||
|
||||
N_lower = round(N*split) # number of elements in (0, split)
|
||||
print(N_lower, "elements below sqrt(2)/2")
|
||||
N_upper = N - N_lower # number of elements in (split, 1)
|
||||
print(N_upper, "elements above sqrt(2)/2")
|
||||
|
||||
assert(N == N_lower + N_upper)
|
||||
|
||||
|
||||
|
||||
x_lower = np.linspace(0, split, N_lower + 1)
|
||||
x_upper = np.linspace(split, 1, N_upper + 1)
|
||||
x = np.concatenate([x_lower, x_upper[1:]])
|
||||
print(x)
|
||||
|
||||
|
||||
A = np.zeros((N + 1, N + 1))
|
||||
|
||||
for i in range(1, N + 1):
|
||||
h = x[i] - x[i - 1]
|
||||
|
||||
lam = 1
|
||||
if(x[i] > split):
|
||||
lam = 10
|
||||
|
||||
|
||||
a_11 = lam/h
|
||||
a_12 = -lam/h
|
||||
a_21 = -lam/h
|
||||
a_22 = lam/h
|
||||
|
||||
A[i - 1, i - 1] += a_11
|
||||
A[i - 1, i] += a_12
|
||||
A[i, i - 1] += a_21
|
||||
A[i, i] += a_22
|
||||
|
||||
print("A =\n", A)
|
||||
|
||||
|
||||
|
||||
# take dirichlet data into account
|
||||
u_g = np.zeros(N + 1)
|
||||
u_g[0] = 0
|
||||
u_g[N] = 1
|
||||
print("u_g =\n", u_g)
|
||||
|
||||
# remove first and last row of A
|
||||
A_g = A[1:N, :]
|
||||
#print("A_g =\n", A_g)
|
||||
|
||||
# assemble RHS with dirichlet data
|
||||
f = -A_g.dot(u_g)
|
||||
#print(f)
|
||||
|
||||
# matrix for the inner nodes (excluding nodes with dirichlet bcs)
|
||||
A_0 = A[1:N, 1:N]
|
||||
#print(A_0)
|
||||
|
||||
# solve for u_0 (free dofs)
|
||||
u_0 = np.linalg.solve(A_0, f)
|
||||
|
||||
# assemble "u = u_0 + u_g"
|
||||
u = np.concatenate([[0], u_0, [1]])
|
||||
|
||||
|
||||
print("u =\n", u)
|
||||
|
||||
plt.plot(x, u, '-')
|
||||
plt.xlabel('x')
|
||||
plt.ylabel('u_h(x)')
|
||||
plt.grid()
|
||||
plt.show()
|
||||
76
ex4/ex_4C.py
Normal file
76
ex4/ex_4C.py
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
np.set_printoptions(precision=2)
|
||||
|
||||
|
||||
p = 70
|
||||
N_vec = [10, 20, 30, 40, 70]
|
||||
for N in N_vec:
|
||||
|
||||
x = np.linspace(0, 1, N + 1)
|
||||
|
||||
A = np.zeros((N + 1, N + 1))
|
||||
|
||||
for i in range(1, N + 1):
|
||||
h = x[i] - x[i - 1]
|
||||
|
||||
|
||||
a_11 = 1./h - p/2.
|
||||
a_12 = -1./h + p/2.
|
||||
a_21 = -1./h - p/2.
|
||||
a_22 = 1./h + p/2.
|
||||
|
||||
A[i - 1, i - 1] += a_11
|
||||
A[i - 1, i] += a_12
|
||||
A[i, i - 1] += a_21
|
||||
A[i, i] += a_22
|
||||
|
||||
print("A =\n", A)
|
||||
|
||||
|
||||
|
||||
# take dirichlet data into account
|
||||
u_g = np.zeros(N + 1)
|
||||
u_g[0] = 0
|
||||
u_g[N] = 1
|
||||
print("u_g =\n", u_g)
|
||||
|
||||
# remove first and last row of A
|
||||
A_g = A[1:N, :]
|
||||
#print("A_g =\n", A_g)
|
||||
|
||||
# assemble RHS with dirichlet data
|
||||
f = -A_g.dot(u_g)
|
||||
#print(f)
|
||||
|
||||
# matrix for the inner nodes (excluding nodes with dirichlet bcs)
|
||||
A_0 = A[1:N, 1:N]
|
||||
#print(A_0)
|
||||
|
||||
# solve for u_0 (free dofs)
|
||||
u_0 = np.linalg.solve(A_0, f)
|
||||
|
||||
# assemble "u = u_0 + u_g"
|
||||
u = np.concatenate([[0], u_0, [1]])
|
||||
|
||||
|
||||
print("u =\n", u)
|
||||
|
||||
plt.plot(x, u, '-')
|
||||
|
||||
|
||||
# plotting the exact solution
|
||||
plt.plot(x, (np.exp(p*x) - 1.)/(np.exp(p) - 1.))
|
||||
|
||||
plt.xlabel('x')
|
||||
plt.ylabel('u_h(x)')
|
||||
plt.legend(N_vec + ['exact'])
|
||||
plt.title("Comparing discrete solution for increasing number of elements")
|
||||
plt.grid()
|
||||
plt.show()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
BIN
ex4/ex_4_results.pdf
Normal file
BIN
ex4/ex_4_results.pdf
Normal file
Binary file not shown.
|
|
@ -1,30 +0,0 @@
|
|||
#
|
||||
# use GNU-Compiler tools
|
||||
COMPILER=GCC_
|
||||
# alternatively from the shell
|
||||
# export COMPILER=GCC_
|
||||
# or, alternatively from the shell
|
||||
# make COMPILER=GCC_
|
||||
|
||||
# use Intel compilers
|
||||
#COMPILER=ICC_
|
||||
|
||||
# use PGI compilers
|
||||
# COMPILER=PGI_
|
||||
|
||||
|
||||
SOURCES = main.cpp mylib.cpp
|
||||
OBJECTS = $(SOURCES:.cpp=.o)
|
||||
|
||||
PROGRAM = main.${COMPILER}
|
||||
|
||||
# uncomment the next to lines for debugging and detailed performance analysis
|
||||
CXXFLAGS += -g
|
||||
LINKFLAGS += -g
|
||||
# do not use -pg with PGI compilers
|
||||
|
||||
ifndef COMPILER
|
||||
COMPILER=GCC_
|
||||
endif
|
||||
|
||||
include ../${COMPILER}default.mk
|
||||
|
|
@ -1,142 +0,0 @@
|
|||
#include "check_env.h"
|
||||
#include "mylib.h"
|
||||
#include <cstdlib> // atoi()
|
||||
#include <cstring> // strncmp()
|
||||
#include <ctime>
|
||||
#include <iostream>
|
||||
#include <omp.h> // OpenMP
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char const *argv[])
|
||||
{
|
||||
omp_set_schedule(omp_sched_static, 2000000);
|
||||
//omp_set_schedule(omp_sched_dynamic, 1000000);
|
||||
//omp_set_schedule(omp_sched_guided, 1000000);
|
||||
//omp_set_schedule(omp_sched_auto, 1); // chunk size does not matter for auto
|
||||
|
||||
|
||||
// Speedup for different number of cores (incl. hyperthreading)
|
||||
omp_set_num_threads(8);
|
||||
|
||||
// Print number of available processors
|
||||
cout << "Number of available processors: " << omp_get_num_procs() << endl;
|
||||
|
||||
// Currently executing parallel code? -> no
|
||||
cout << "Currently in parallel? " << omp_in_parallel() << endl;
|
||||
|
||||
|
||||
int const NLOOPS = 10; // chose a value such that the benchmark runs at least 10 sec.
|
||||
unsigned int N = 500000001;
|
||||
|
||||
|
||||
//##########################################################################
|
||||
// Read Parameter from command line (C++ style)
|
||||
cout << "Checking command line parameters for: -n <number> " << endl;
|
||||
for (int i = 1; i < argc; i++)
|
||||
{
|
||||
cout << " arg[" << i << "] = " << argv[i] << endl;
|
||||
string ss(argv[i]);
|
||||
if ("-n"==ss && i + 1 < argc) // found "-n" followed by another parameter
|
||||
{
|
||||
N = static_cast<unsigned int>(atoi(argv[i + 1]));
|
||||
}
|
||||
else
|
||||
{
|
||||
cout << "Corect call: " << argv[0] << " -n <number>\n";
|
||||
}
|
||||
}
|
||||
|
||||
cout << "\nN = " << N << endl;
|
||||
|
||||
check_env(argc, argv);
|
||||
//########################################################################
|
||||
int nthreads; // OpenMP
|
||||
#pragma omp parallel default(none) shared(cout,nthreads)
|
||||
{
|
||||
stringstream inparallel;
|
||||
inparallel << "Currently in parallel? " << omp_in_parallel() << endl;
|
||||
|
||||
|
||||
int const th_id = omp_get_thread_num(); // OpenMP
|
||||
int const nthrds = omp_get_num_threads(); // OpenMP
|
||||
stringstream ss;
|
||||
ss << "C++: Hello World from thread " << th_id << " / " << nthrds << endl;
|
||||
#pragma omp critical
|
||||
{
|
||||
cout << ss.str(); // output to a shared ressource
|
||||
cout << inparallel.str() << endl;
|
||||
}
|
||||
#pragma omp master
|
||||
nthreads = nthrds; // transfer nn to to master thread
|
||||
}
|
||||
cout << " " << nthreads << " threads have been started." << endl;
|
||||
|
||||
//##########################################################################
|
||||
// Memory allocation
|
||||
cout << "Memory allocation\n";
|
||||
|
||||
vector<double> x(N), y(N);
|
||||
|
||||
cout.precision(2);
|
||||
cout << 2.0 * N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
|
||||
cout.precision(6);
|
||||
|
||||
//##########################################################################
|
||||
// Data initialization
|
||||
// Special: x_i = i+1; y_i = 1/x_i ==> <x,y> == N
|
||||
for (unsigned int i = 0; i < N; ++i)
|
||||
{
|
||||
x[i] = i + 1;
|
||||
y[i] = 1.0 / x[i];
|
||||
}
|
||||
|
||||
//##########################################################################
|
||||
cout << "\nStart Benchmarking\n";
|
||||
|
||||
// Do calculation
|
||||
double tstart = omp_get_wtime(); // OpenMP
|
||||
|
||||
double sk(0.0);
|
||||
for (int i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
//sk = scalar(x, y);
|
||||
sk = scalar_parallel(x, y);
|
||||
//sk = scalar_trans(x, y);
|
||||
//sk = norm(x);
|
||||
}
|
||||
|
||||
double t1 = omp_get_wtime() - tstart; // OpenMP
|
||||
|
||||
t1 /= NLOOPS; // divide by number of function calls
|
||||
|
||||
//##########################################################################
|
||||
// Check the correct result
|
||||
cout << "\n <x,y> = " << sk << endl;
|
||||
if (static_cast<unsigned int>(sk) != N)
|
||||
{
|
||||
cout << " !! W R O N G result !!\n";
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
//##########################################################################
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
cout << "Total benchmarking time: " << t1*NLOOPS << endl;
|
||||
cout << "Timing in sec. : " << t1 << endl;
|
||||
cout << "GFLOPS : " << 2.0 * N / t1 / 1024 / 1024 / 1024 << endl;
|
||||
cout << "GiByte/s : " << 2.0 * N / t1 / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
|
||||
|
||||
//#########################################################################
|
||||
|
||||
cout << "\n Try the reduction with an STL-vektor \n";
|
||||
|
||||
auto vr = reduction_vec_append(5);
|
||||
cout << "done\n";
|
||||
cout << vr << endl;
|
||||
|
||||
|
||||
return 0;
|
||||
} // memory for x and y will be deallocated their destructors
|
||||
|
|
@ -1,137 +0,0 @@
|
|||
#include "mylib.h"
|
||||
#include <cassert> // assert()
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <functional> // multiplies<>{}
|
||||
#include <list>
|
||||
#include <numeric> // iota()
|
||||
#ifdef _OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
|
||||
double scalar_parallel(vector<double> const &x, vector<double> const &y)
|
||||
{
|
||||
assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
|
||||
size_t const N = x.size();
|
||||
double sum = 0.0;
|
||||
#pragma omp parallel default(none) shared(x,y,N, cout) reduction(+:sum)
|
||||
{
|
||||
const size_t nthreads = omp_get_num_threads();
|
||||
const size_t threadnum = omp_get_thread_num();
|
||||
const size_t chunksize = N/nthreads;
|
||||
|
||||
|
||||
size_t start = threadnum*chunksize;
|
||||
size_t end = start + chunksize;
|
||||
if (threadnum == nthreads - 1)
|
||||
end = N;
|
||||
|
||||
|
||||
for (size_t i = start; i < end; ++i)
|
||||
{
|
||||
sum += x[i] * y[i];
|
||||
}
|
||||
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
vector<int> reduction_vec_append(int n)
|
||||
{
|
||||
vector<int> vec(n);
|
||||
#pragma omp parallel default(none) shared(cout) reduction(VecAppend:vec)
|
||||
{
|
||||
#pragma omp barrier
|
||||
#pragma omp critical
|
||||
cout << omp_get_thread_num() << " : " << vec.size() << endl;
|
||||
#pragma omp barrier
|
||||
iota( vec.begin(),vec.end(), omp_get_thread_num() );
|
||||
#pragma omp barrier
|
||||
|
||||
}
|
||||
return vec;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
double scalar(vector<double> const &x, vector<double> const &y)
|
||||
{
|
||||
assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
|
||||
size_t const N = x.size();
|
||||
double sum = 0.0;
|
||||
#pragma omp parallel for default(none) shared(x,y,N) reduction(+:sum) schedule(runtime) // added schedule(runtime)
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
sum += x[i] * y[i];
|
||||
//sum += exp(x[i])*log(y[i]);
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
double norm(vector<double> const &x)
|
||||
{
|
||||
size_t const N = x.size();
|
||||
double sum = 0.0;
|
||||
#pragma omp parallel for default(none) shared(x,N) reduction(+:sum) schedule(runtime) // added schedule(runtime)
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
sum += x[i]*x[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
vector<int> reduction_vec(int n)
|
||||
{
|
||||
vector<int> vec(n);
|
||||
#pragma omp parallel default(none) shared(cout) reduction(VecAdd:vec)
|
||||
{
|
||||
#pragma omp barrier
|
||||
#pragma omp critical
|
||||
cout << omp_get_thread_num() << " : " << vec.size() << endl;
|
||||
#pragma omp barrier
|
||||
iota( vec.begin(),vec.end(), omp_get_thread_num() );
|
||||
#pragma omp barrier
|
||||
|
||||
}
|
||||
return vec;
|
||||
}
|
||||
|
||||
|
||||
double scalar_trans(vector<double> const &x, vector<double> const &y)
|
||||
{
|
||||
assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
|
||||
vector<double> z(x.size());
|
||||
//list<double> z(x.size()); // parallel for-loop on iterators not possible (missing 'operator-')
|
||||
// c++-20 CLANG_, ONEAPI_:condition of OpenMP for loop must be a relational comparison
|
||||
|
||||
transform(cbegin(x),cend(x),cbegin(y),begin(z),std::multiplies<>{});
|
||||
|
||||
double sum = 0.0;
|
||||
#pragma omp parallel for default(none) shared(z) reduction(+:sum) schedule(runtime) // added schedule(runtime)
|
||||
for (auto pi = cbegin(z); pi!=cend(z); ++pi)
|
||||
{
|
||||
sum += *pi;
|
||||
}
|
||||
//for (auto val: z)
|
||||
//{
|
||||
//sum += val;
|
||||
//}
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,88 +0,0 @@
|
|||
#pragma once
|
||||
#include <cassert>
|
||||
#include <iomanip> // setw()
|
||||
#include <iostream>
|
||||
#include <omp.h>
|
||||
#include <vector>
|
||||
|
||||
/** Inner product
|
||||
@param[in] x vector
|
||||
@param[in] y vector
|
||||
@return resulting Euclidian inner product <x,y>
|
||||
*/
|
||||
double scalar_parallel(std::vector<double> const &x, std::vector<double> const &y);
|
||||
double scalar(std::vector<double> const &x, std::vector<double> const &y);
|
||||
double scalar_trans(std::vector<double> const &x, std::vector<double> const &y);
|
||||
|
||||
|
||||
|
||||
// Declare additional reduction operation in OpenMP for STL-vector
|
||||
#pragma omp declare reduction(VecAppend : std::vector<int> : omp_out.insert(omp_out.end(), omp_in.begin(), omp_in.end())) \
|
||||
initializer (omp_priv=omp_orig)
|
||||
|
||||
std::vector<int> reduction_vec_append(int n);
|
||||
|
||||
|
||||
/** l2-norm
|
||||
@param[in] x vector
|
||||
@return resulting Euclidian norm
|
||||
*/
|
||||
double norm(std::vector<double> const &x);
|
||||
|
||||
/** Vector @p b adds its elements to vector @p a .
|
||||
@param[in] a vector
|
||||
@param[in] b vector
|
||||
@return a+=b componentwise
|
||||
*/
|
||||
template<class T>
|
||||
std::vector<T> &operator+=(std::vector<T> &a, std::vector<T> const &b)
|
||||
{
|
||||
assert(a.size()==b.size());
|
||||
for (size_t k = 0; k < a.size(); ++k) {
|
||||
a[k] += b[k];
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
// Declare the reduction operation in OpenMP for an STL-vector
|
||||
// omp_out += omp_in requires operator+=(vector<int> &, vector<int> const &) from above
|
||||
// ------------------------------------------------------------
|
||||
// https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm
|
||||
// https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c
|
||||
// https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size
|
||||
// see also p.74ff in https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf
|
||||
#pragma omp declare reduction(VecAdd : std::vector<int> : omp_out += omp_in) \
|
||||
initializer (omp_priv=omp_orig)
|
||||
|
||||
// Templates are n o t possible, i.e. the reduction has to be declared fore a specified type.
|
||||
//template <class T>
|
||||
//#pragma omp declare reduction(VecAdd : std::vector<T> : omp_out += omp_in) initializer (omp_priv(omp_orig))
|
||||
// MS: template nach #pragma !?
|
||||
|
||||
// ------------------------------------------------------------
|
||||
|
||||
/** Test for vector reduction.
|
||||
*
|
||||
* The thread-private vectors of size @p n are initialized via @f$v_k^{tID}=tID+k@f$.
|
||||
* Afterwards these vectors are accumulated, i.e.,
|
||||
* @f$v_k= \sum_{tID=0}^{numThreads} v_k^{tID}@f$.
|
||||
*
|
||||
* @param[in] n size of global/private vector
|
||||
* @return resulting global vector.
|
||||
*/
|
||||
std::vector<int> reduction_vec(int n);
|
||||
|
||||
|
||||
|
||||
/** Output of a vector.
|
||||
@param[in,out] s output stream
|
||||
@param[in] x vector
|
||||
@return modified output stream
|
||||
*/
|
||||
template <class T>
|
||||
std::ostream &operator<<(std::ostream &s, std::vector<T> const &x)
|
||||
{
|
||||
for (auto const &v : x) s << std::setw(4) << v << " ";
|
||||
return s;
|
||||
}
|
||||
|
||||
|
|
@ -1,31 +0,0 @@
|
|||
#
|
||||
# use GNU-Compiler tools
|
||||
COMPILER=GCC_
|
||||
# alternatively from the shell
|
||||
# export COMPILER=GCC_
|
||||
# or, alternatively from the shell
|
||||
# make COMPILER=GCC_
|
||||
|
||||
# use Intel compilers
|
||||
#COMPILER=ICC_
|
||||
|
||||
# use PGI compilers
|
||||
# COMPILER=PGI_
|
||||
|
||||
|
||||
SOURCES = main.cpp mylib.cpp
|
||||
OBJECTS = $(SOURCES:.cpp=.o)
|
||||
|
||||
PROGRAM = main.${COMPILER}
|
||||
|
||||
# uncomment the next to lines for debugging and detailed performance analysis
|
||||
CXXFLAGS += -g
|
||||
LINKFLAGS += -g
|
||||
# do not use -pg with PGI compilers
|
||||
|
||||
ifndef COMPILER
|
||||
COMPILER=GCC_
|
||||
endif
|
||||
|
||||
|
||||
include ../${COMPILER}default.mk
|
||||
|
|
@ -1,501 +0,0 @@
|
|||
141
|
||||
261
|
||||
87
|
||||
430
|
||||
258
|
||||
298
|
||||
425
|
||||
120
|
||||
496
|
||||
707
|
||||
244
|
||||
786
|
||||
75
|
||||
394
|
||||
4
|
||||
221
|
||||
2
|
||||
190
|
||||
143
|
||||
269
|
||||
175
|
||||
139
|
||||
599
|
||||
902
|
||||
940
|
||||
222
|
||||
483
|
||||
377
|
||||
524
|
||||
265
|
||||
69
|
||||
437
|
||||
174
|
||||
27
|
||||
955
|
||||
431
|
||||
962
|
||||
763
|
||||
8
|
||||
681
|
||||
706
|
||||
646
|
||||
553
|
||||
219
|
||||
773
|
||||
229
|
||||
371
|
||||
891
|
||||
857
|
||||
403
|
||||
319
|
||||
609
|
||||
911
|
||||
910
|
||||
592
|
||||
333
|
||||
854
|
||||
443
|
||||
905
|
||||
34
|
||||
533
|
||||
717
|
||||
180
|
||||
337
|
||||
188
|
||||
322
|
||||
404
|
||||
549
|
||||
49
|
||||
553
|
||||
275
|
||||
242
|
||||
244
|
||||
155
|
||||
957
|
||||
936
|
||||
819
|
||||
729
|
||||
176
|
||||
361
|
||||
189
|
||||
2
|
||||
317
|
||||
700
|
||||
626
|
||||
544
|
||||
440
|
||||
288
|
||||
502
|
||||
762
|
||||
763
|
||||
577
|
||||
748
|
||||
646
|
||||
124
|
||||
505
|
||||
348
|
||||
93
|
||||
148
|
||||
199
|
||||
673
|
||||
432
|
||||
695
|
||||
257
|
||||
10
|
||||
533
|
||||
280
|
||||
947
|
||||
907
|
||||
393
|
||||
25
|
||||
672
|
||||
838
|
||||
972
|
||||
57
|
||||
451
|
||||
583
|
||||
687
|
||||
720
|
||||
651
|
||||
727
|
||||
374
|
||||
582
|
||||
117
|
||||
58
|
||||
980
|
||||
285
|
||||
595
|
||||
963
|
||||
186
|
||||
194
|
||||
342
|
||||
933
|
||||
391
|
||||
274
|
||||
152
|
||||
398
|
||||
375
|
||||
132
|
||||
436
|
||||
92
|
||||
615
|
||||
11
|
||||
574
|
||||
790
|
||||
236
|
||||
449
|
||||
570
|
||||
62
|
||||
497
|
||||
643
|
||||
222
|
||||
838
|
||||
972
|
||||
847
|
||||
506
|
||||
279
|
||||
747
|
||||
237
|
||||
958
|
||||
621
|
||||
601
|
||||
173
|
||||
91
|
||||
256
|
||||
859
|
||||
912
|
||||
700
|
||||
726
|
||||
230
|
||||
577
|
||||
811
|
||||
404
|
||||
989
|
||||
90
|
||||
321
|
||||
512
|
||||
61
|
||||
726
|
||||
557
|
||||
530
|
||||
830
|
||||
859
|
||||
790
|
||||
318
|
||||
453
|
||||
753
|
||||
110
|
||||
110
|
||||
270
|
||||
525
|
||||
973
|
||||
711
|
||||
312
|
||||
292
|
||||
851
|
||||
912
|
||||
640
|
||||
256
|
||||
89
|
||||
839
|
||||
585
|
||||
949
|
||||
62
|
||||
585
|
||||
286
|
||||
828
|
||||
191
|
||||
443
|
||||
394
|
||||
827
|
||||
677
|
||||
208
|
||||
319
|
||||
134
|
||||
672
|
||||
571
|
||||
170
|
||||
148
|
||||
477
|
||||
909
|
||||
553
|
||||
33
|
||||
54
|
||||
806
|
||||
452
|
||||
383
|
||||
790
|
||||
365
|
||||
533
|
||||
712
|
||||
872
|
||||
329
|
||||
651
|
||||
975
|
||||
76
|
||||
588
|
||||
414
|
||||
310
|
||||
264
|
||||
759
|
||||
996
|
||||
187
|
||||
782
|
||||
196
|
||||
993
|
||||
803
|
||||
425
|
||||
729
|
||||
499
|
||||
809
|
||||
357
|
||||
74
|
||||
591
|
||||
911
|
||||
194
|
||||
433
|
||||
750
|
||||
40
|
||||
947
|
||||
764
|
||||
559
|
||||
184
|
||||
498
|
||||
518
|
||||
995
|
||||
855
|
||||
963
|
||||
679
|
||||
404
|
||||
935
|
||||
480
|
||||
232
|
||||
397
|
||||
706
|
||||
559
|
||||
757
|
||||
996
|
||||
963
|
||||
536
|
||||
964
|
||||
116
|
||||
52
|
||||
305
|
||||
581
|
||||
531
|
||||
902
|
||||
541
|
||||
432
|
||||
543
|
||||
713
|
||||
17
|
||||
801
|
||||
143
|
||||
479
|
||||
257
|
||||
370
|
||||
662
|
||||
170
|
||||
279
|
||||
199
|
||||
196
|
||||
327
|
||||
881
|
||||
472
|
||||
404
|
||||
180
|
||||
969
|
||||
408
|
||||
845
|
||||
616
|
||||
377
|
||||
878
|
||||
785
|
||||
465
|
||||
814
|
||||
899
|
||||
430
|
||||
335
|
||||
597
|
||||
902
|
||||
703
|
||||
378
|
||||
735
|
||||
955
|
||||
543
|
||||
541
|
||||
312
|
||||
72
|
||||
182
|
||||
93
|
||||
464
|
||||
10
|
||||
916
|
||||
643
|
||||
2
|
||||
31
|
||||
209
|
||||
455
|
||||
128
|
||||
9
|
||||
728
|
||||
355
|
||||
781
|
||||
437
|
||||
437
|
||||
50
|
||||
50
|
||||
92
|
||||
595
|
||||
242
|
||||
842
|
||||
858
|
||||
964
|
||||
489
|
||||
221
|
||||
227
|
||||
537
|
||||
763
|
||||
348
|
||||
462
|
||||
640
|
||||
918
|
||||
162
|
||||
716
|
||||
578
|
||||
434
|
||||
885
|
||||
394
|
||||
179
|
||||
634
|
||||
625
|
||||
328
|
||||
803
|
||||
1000
|
||||
981
|
||||
128
|
||||
233
|
||||
24
|
||||
608
|
||||
111
|
||||
408
|
||||
885
|
||||
549
|
||||
370
|
||||
209
|
||||
441
|
||||
957
|
||||
125
|
||||
471
|
||||
857
|
||||
44
|
||||
692
|
||||
979
|
||||
284
|
||||
134
|
||||
686
|
||||
910
|
||||
611
|
||||
900
|
||||
194
|
||||
755
|
||||
347
|
||||
419
|
||||
156
|
||||
820
|
||||
625
|
||||
739
|
||||
806
|
||||
68
|
||||
951
|
||||
498
|
||||
756
|
||||
743
|
||||
832
|
||||
157
|
||||
458
|
||||
619
|
||||
933
|
||||
836
|
||||
896
|
||||
583
|
||||
583
|
||||
855
|
||||
35
|
||||
886
|
||||
408
|
||||
37
|
||||
747
|
||||
155
|
||||
144
|
||||
606
|
||||
255
|
||||
325
|
||||
402
|
||||
407
|
||||
387
|
||||
610
|
||||
167
|
||||
189
|
||||
95
|
||||
324
|
||||
770
|
||||
235
|
||||
741
|
||||
693
|
||||
825
|
||||
828
|
||||
294
|
||||
310
|
||||
524
|
||||
326
|
||||
832
|
||||
811
|
||||
557
|
||||
263
|
||||
681
|
||||
234
|
||||
457
|
||||
385
|
||||
539
|
||||
992
|
||||
756
|
||||
981
|
||||
235
|
||||
529
|
||||
52
|
||||
757
|
||||
602
|
||||
858
|
||||
989
|
||||
930
|
||||
410
|
||||
1
|
||||
541
|
||||
208
|
||||
220
|
||||
326
|
||||
96
|
||||
748
|
||||
749
|
||||
544
|
||||
339
|
||||
833
|
||||
553
|
||||
958
|
||||
893
|
||||
357
|
||||
547
|
||||
347
|
||||
623
|
||||
797
|
||||
746
|
||||
126
|
||||
823
|
||||
26
|
||||
415
|
||||
732
|
||||
782
|
||||
368
|
||||
|
||||
Binary file not shown.
|
|
@ -1,130 +0,0 @@
|
|||
#include "mylib.h"
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <omp.h>
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
// read vector from file
|
||||
vector<size_t> data_vector = {};
|
||||
|
||||
ifstream input_stream("data_1.txt");
|
||||
|
||||
size_t line;
|
||||
while(input_stream >> line)
|
||||
{
|
||||
data_vector.push_back(line);
|
||||
}
|
||||
data_vector.shrink_to_fit();
|
||||
|
||||
|
||||
|
||||
|
||||
// specify loops
|
||||
size_t NLOOPS = 10000;
|
||||
|
||||
|
||||
|
||||
// ############# Parallelization with openMP #############
|
||||
// calculate arithmetic mean, geometric mean and harmonic mean
|
||||
double am_omp, gm_omp, hm_omp;
|
||||
|
||||
double tstart = omp_get_wtime();
|
||||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
means_omp(data_vector, am_omp, gm_omp, hm_omp);
|
||||
|
||||
double t_means_omp = (omp_get_wtime() - tstart)/NLOOPS;
|
||||
|
||||
// calculate minimum and maximum
|
||||
size_t min, max;
|
||||
|
||||
tstart = omp_get_wtime();
|
||||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
minmax_omp(data_vector, min, max);
|
||||
|
||||
double t_minmax_omp = (omp_get_wtime() - tstart)/NLOOPS;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// ############# Parallelization with C++ algorithms #############
|
||||
// calculate arithmetic mean, geometric mean and harmonic mean
|
||||
double am_cpp, gm_cpp, hm_cpp;
|
||||
|
||||
tstart = omp_get_wtime();
|
||||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
means_cpp(data_vector, am_cpp, gm_cpp, hm_cpp);
|
||||
|
||||
double t_means_cpp = (omp_get_wtime() - tstart)/NLOOPS;
|
||||
|
||||
// calculate minimum and maximum
|
||||
size_t min_cpp, max_cpp;
|
||||
|
||||
tstart = omp_get_wtime();
|
||||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
minmax_cpp(data_vector, min_cpp, max_cpp);
|
||||
|
||||
double t_minmax_cpp = (omp_get_wtime() - tstart)/NLOOPS;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// print results
|
||||
cout << "####### OpenMP #######" << endl;
|
||||
cout << "minimum: " << min << endl;
|
||||
cout << "maximum: " << max << endl;
|
||||
cout << "duration: " << t_minmax_omp << endl << endl;
|
||||
|
||||
cout << "arithmetic mean: " << am_omp << endl;
|
||||
cout << "geometric mean: " << gm_omp << endl;
|
||||
cout << "harmonic mean: " << hm_omp << endl;
|
||||
cout << "duration: " << t_means_omp << endl << endl;
|
||||
|
||||
|
||||
cout << "####### C++ #######" << endl;
|
||||
cout << "minimum: " << min_cpp << endl;
|
||||
cout << "maximum: " << max_cpp << endl;
|
||||
cout << "duration: " << t_minmax_cpp << endl << endl;
|
||||
|
||||
cout << "arithmetic mean: " << am_cpp << endl;
|
||||
cout << "geometric mean: " << gm_cpp << endl;
|
||||
cout << "harmonic mean: " << hm_cpp << endl;
|
||||
cout << "duration: " << t_means_cpp << endl << endl;
|
||||
|
||||
|
||||
|
||||
// ####### OpenMP #######
|
||||
// minimum: 1
|
||||
// maximum: 1000
|
||||
// duration: 3.52086e-06
|
||||
|
||||
// arithmetic mean: 498.184
|
||||
// geometric mean: 364.412
|
||||
// harmonic mean: 95.6857
|
||||
// duration: 5.90171e-06
|
||||
|
||||
// ####### C++ #######
|
||||
// minimum: 1
|
||||
// maximum: 1000
|
||||
// duration: 1.76816e-05
|
||||
|
||||
// arithmetic mean: 498.184
|
||||
// geometric mean: 364.412
|
||||
// harmonic mean: 95.6857
|
||||
// duration: 2.35728e-05
|
||||
|
||||
// --> the openMP variant is faster in both cases
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
BIN
ex5/ex5_2/main.o
BIN
ex5/ex5_2/main.o
Binary file not shown.
|
|
@ -1,103 +0,0 @@
|
|||
#include "mylib.h"
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <execution>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <omp.h>
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
void means_omp(const std::vector<size_t> numbers, double &am, double &gm, double &hm)
|
||||
{
|
||||
size_t const n = numbers.size();
|
||||
|
||||
am = 0.;
|
||||
gm = 0.;
|
||||
hm = 0.;
|
||||
|
||||
#pragma omp parallel for shared(numbers, n, cout) reduction(+:am, gm, hm)
|
||||
for (size_t i = 0; i < n; ++i)
|
||||
{
|
||||
am += numbers[i];
|
||||
gm += log(numbers[i]);
|
||||
hm += 1.0/numbers[i];
|
||||
|
||||
// #pragma omp critical
|
||||
// {
|
||||
// cout << "Thread number " << omp_get_thread_num() << " processes value " << numbers[i] << endl;
|
||||
// }
|
||||
}
|
||||
|
||||
am /= n;
|
||||
gm = exp(gm/n);
|
||||
hm = n/hm;
|
||||
}
|
||||
|
||||
|
||||
void minmax_omp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max)
|
||||
{
|
||||
size_t const n = numbers.size();
|
||||
|
||||
global_min = -1; // gives the maximum size_t value
|
||||
global_max = 0;
|
||||
|
||||
#pragma omp parallel shared(numbers, n, global_min, global_max)
|
||||
{
|
||||
const size_t nthreads = omp_get_num_threads();
|
||||
const size_t threadnum = omp_get_thread_num();
|
||||
const size_t chunksize = n/nthreads;
|
||||
|
||||
|
||||
size_t start = threadnum*chunksize;
|
||||
size_t end = start + chunksize;
|
||||
if (threadnum == nthreads - 1)
|
||||
end = n;
|
||||
|
||||
|
||||
size_t local_min = -1;
|
||||
size_t local_max = 0;
|
||||
for (size_t i = start; i < end ; ++i)
|
||||
{
|
||||
if (numbers[i] < local_min)
|
||||
local_min = numbers[i];
|
||||
|
||||
if (numbers[i] > local_max)
|
||||
local_max = numbers[i];
|
||||
}
|
||||
|
||||
#pragma omp critical
|
||||
{
|
||||
if (local_min < global_min)
|
||||
global_min = local_min;
|
||||
|
||||
if (local_max > global_max)
|
||||
global_max = local_max;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void means_cpp(const std::vector<size_t> numbers, double &am, double &gm, double &hm)
|
||||
{
|
||||
size_t const n = numbers.size();
|
||||
|
||||
am = reduce(std::execution::par, numbers.begin(), numbers.end());
|
||||
gm = transform_reduce(std::execution::par, numbers.begin(), numbers.end(), 0.0, plus{}, [] (size_t x) -> double { return log(x); } );
|
||||
hm = transform_reduce(std::execution::par, numbers.begin(), numbers.end(), 0.0, plus{}, [] (size_t x) -> double { return 1.0/x; });
|
||||
|
||||
am /= n;
|
||||
gm = exp(gm/n);
|
||||
hm = n/hm;
|
||||
}
|
||||
|
||||
|
||||
void minmax_cpp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max)
|
||||
{
|
||||
auto min_it = min_element(std::execution::par, numbers.begin(), numbers.end());
|
||||
auto max_it = max_element(std::execution::par, numbers.begin(), numbers.end());
|
||||
|
||||
global_min = *min_it;
|
||||
global_max = *max_it;
|
||||
}
|
||||
|
|
@ -1,42 +0,0 @@
|
|||
#include <vector>
|
||||
|
||||
/**
|
||||
This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector.
|
||||
Uses openMP parallelization.
|
||||
@param[in] numbers vector containing integers
|
||||
@param[out] am arithmetic mean
|
||||
@param[out] gm geometric mean
|
||||
@param[out] hm harmonic mean
|
||||
*/
|
||||
void means_omp(const std::vector<size_t> numbers, double &am, double &gm, double &hm);
|
||||
|
||||
|
||||
/**
|
||||
This function calculates the minimum and maximum of a vector.
|
||||
Uses openMP parallelization.
|
||||
@param[in] numbers vector containing integers
|
||||
@param[out] global_min minimum
|
||||
@param[out] global_max maximum
|
||||
*/
|
||||
void minmax_omp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max);
|
||||
|
||||
|
||||
/**
|
||||
This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector.
|
||||
Uses C++ parallelization.
|
||||
@param[in] numbers vector containing integers
|
||||
@param[out] am arithmetic mean
|
||||
@param[out] gm geometric mean
|
||||
@param[out] hm harmonic mean
|
||||
*/
|
||||
void means_cpp(const std::vector<size_t> numbers, double &am, double &gm, double &hm);
|
||||
|
||||
|
||||
/**
|
||||
This function calculates the minimum and maximum of a vector.
|
||||
Uses C++ parallelization.
|
||||
@param[in] numbers vector containing integers
|
||||
@param[out] global_min minimum
|
||||
@param[out] global_max maximum
|
||||
*/
|
||||
void minmax_cpp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max);
|
||||
Binary file not shown.
|
|
@ -1,30 +0,0 @@
|
|||
#
|
||||
# use GNU-Compiler tools
|
||||
COMPILER=GCC_
|
||||
# alternatively from the shell
|
||||
# export COMPILER=GCC_
|
||||
# or, alternatively from the shell
|
||||
# make COMPILER=GCC_
|
||||
|
||||
# use Intel compilers
|
||||
#COMPILER=ICC_
|
||||
|
||||
# use PGI compilers
|
||||
# COMPILER=PGI_
|
||||
|
||||
|
||||
SOURCES = main.cpp goldbach.cpp
|
||||
OBJECTS = $(SOURCES:.cpp=.o)
|
||||
|
||||
PROGRAM = main.${COMPILER}
|
||||
|
||||
# uncomment the next to lines for debugging and detailed performance analysis
|
||||
CXXFLAGS += -g
|
||||
LINKFLAGS += -g
|
||||
# do not use -pg with PGI compilers
|
||||
|
||||
ifndef COMPILER
|
||||
COMPILER=GCC_
|
||||
endif
|
||||
|
||||
include ../${COMPILER}default.mk
|
||||
|
|
@ -1,45 +0,0 @@
|
|||
#pragma once
|
||||
#include "mayer_primes.h"
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
|
||||
|
||||
/**
|
||||
This function returns the number of possible decompositions of an integer into a sum of two prime numbers.
|
||||
@param[in] k first integer
|
||||
@param[out] count number of decompositions
|
||||
*/
|
||||
size_t single_goldbach(size_t k);
|
||||
|
||||
|
||||
/**
|
||||
This function returns the number of possible decompositions into a sum of two prime numbers of all even integers in the interval [4,n].
|
||||
@param[in] n upper integer bound
|
||||
@param[out] count_vector vector containing the number of decompositions for a natural number the corresponding index
|
||||
*/
|
||||
std::vector<size_t> count_goldbach(size_t n);
|
||||
|
||||
|
||||
/** Vector @p b adds its elements to vector @p a .
|
||||
@param[in] a vector
|
||||
@param[in] b vector
|
||||
@return a+=b componentwise
|
||||
*/
|
||||
template<class T>
|
||||
std::vector<T> &operator+=(std::vector<T> &a, std::vector<T> const &b)
|
||||
{
|
||||
assert(a.size()==b.size());
|
||||
for (size_t k = 0; k < a.size(); ++k) {
|
||||
a[k] += b[k];
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
// Declare the reduction operation in OpenMP for an STL-vector
|
||||
// omp_out += omp_in requires operator+=(vector<int> &, vector<int> const &) from above
|
||||
// ------------------------------------------------------------
|
||||
// https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm
|
||||
// https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c
|
||||
// https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size
|
||||
// see also p.74ff in https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf
|
||||
#pragma omp declare reduction(VecAdd : std::vector<size_t> : omp_out += omp_in) initializer (omp_priv=omp_orig)
|
||||
|
|
@ -1,45 +0,0 @@
|
|||
#include "goldbach.h"
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <omp.h>
|
||||
using namespace std;
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
cout << "Check: 694 has "<< single_goldbach(694) << " decompositions." << endl << "----------------------------------------" << endl;
|
||||
|
||||
for(size_t n : {10000, 100000, 400000, 1000000, 2000000})
|
||||
{
|
||||
double t_start = omp_get_wtime();
|
||||
|
||||
auto goldbach_vector = count_goldbach(n);
|
||||
|
||||
auto max_it = max_element(goldbach_vector.begin(), goldbach_vector.end());
|
||||
size_t max_number = distance(goldbach_vector.begin(), max_it);
|
||||
|
||||
double t_end = omp_get_wtime() - t_start;
|
||||
|
||||
cout << "The number " << max_number << " has " << *max_it << " decompositions. Duration: " << t_end << endl;
|
||||
}
|
||||
|
||||
/*
|
||||
###### WITHOUT PARALLELIZATION ######
|
||||
The number 9240 has 329 decompositions. Duration: 0.00307696
|
||||
The number 99330 has 2168 decompositions. Duration: 0.189839
|
||||
The number 390390 has 7094 decompositions. Duration: 1.3042
|
||||
The number 990990 has 15594 decompositions. Duration: 5.45034
|
||||
The number 1981980 has 27988 decompositions. Duration: 47.1807
|
||||
|
||||
|
||||
###### WITH PARALLELIZATION ######
|
||||
The number 9240 has 329 decompositions. Duration: 0.000734854
|
||||
The number 99330 has 2168 decompositions. Duration: 0.0251322
|
||||
The number 390390 has 7094 decompositions. Duration: 0.487375
|
||||
The number 990990 has 15594 decompositions. Duration: 6.16972
|
||||
The number 1981980 has 27988 decompositions. Duration: 31.5699
|
||||
*/
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -1,73 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstring> //memset
|
||||
#include <vector>
|
||||
//using namespace std;
|
||||
|
||||
/** \brief Determines all prime numbers in interval [2, @p max].
|
||||
*
|
||||
* The sieve of Eratosthenes is used.
|
||||
*
|
||||
* The implementation originates from <a href="http://code.activestate.com/recipes/576559-fast-prime-generator/">Florian Mayer</a>.
|
||||
*
|
||||
* \param[in] max end of interval for the prime number search.
|
||||
* \return vector of prime numbers @f$2,3,5, ..., p<=max @f$.
|
||||
*
|
||||
* \copyright
|
||||
* Copyright (c) 2008 Florian Mayer (adapted by Gundolf Haase 2018)
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
template <class T>
|
||||
std::vector<T> get_primes(T max)
|
||||
{
|
||||
std::vector<T> primes;
|
||||
char *sieve;
|
||||
sieve = new char[max / 8 + 1];
|
||||
// Fill sieve with 1
|
||||
memset(sieve, 0xFF, (max / 8 + 1) * sizeof(char));
|
||||
for (T x = 2; x <= max; x++)
|
||||
{
|
||||
if (sieve[x / 8] & (0x01 << (x % 8))) {
|
||||
primes.push_back(x);
|
||||
// Is prime. Mark multiplicates.
|
||||
for (T j = 2 * x; j <= max; j += x)
|
||||
{
|
||||
sieve[j / 8] &= ~(0x01 << (j % 8));
|
||||
}
|
||||
}
|
||||
}
|
||||
delete[] sieve;
|
||||
return primes;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------
|
||||
//int main() // by Florian Mayer
|
||||
//{g++ -O3 -std=c++14 -fopenmp main.cpp && ./a.out
|
||||
// vector<unsigned long> primes;
|
||||
// primes = get_primes(10000000);
|
||||
// // return 0;
|
||||
// // Print out result.
|
||||
// vector<unsigned long>::iterator it;
|
||||
// for(it=primes.begin(); it < primes.end(); it++)
|
||||
// cout << *it << " ";
|
||||
//
|
||||
// cout << endl;
|
||||
// return 0;
|
||||
//}
|
||||
|
|
@ -1,30 +0,0 @@
|
|||
#
|
||||
# use GNU-Compiler tools
|
||||
COMPILER=GCC_
|
||||
# alternatively from the shell
|
||||
# export COMPILER=GCC_
|
||||
# or, alternatively from the shell
|
||||
# make COMPILER=GCC_
|
||||
|
||||
# use Intel compilers
|
||||
#COMPILER=ICC_
|
||||
|
||||
# use PGI compilers
|
||||
# COMPILER=PGI_
|
||||
|
||||
|
||||
SOURCES = main.cpp benchmarks.cpp benchmark_tests.cpp
|
||||
OBJECTS = $(SOURCES:.cpp=.o)
|
||||
|
||||
PROGRAM = main.${COMPILER}
|
||||
|
||||
# uncomment the next to lines for debugging and detailed performance analysis
|
||||
CXXFLAGS += -g
|
||||
LINKFLAGS += -g
|
||||
# do not use -pg with PGI compilers
|
||||
|
||||
ifndef COMPILER
|
||||
COMPILER=GCC_
|
||||
endif
|
||||
|
||||
include ../${COMPILER}default.mk
|
||||
|
|
@ -1,375 +0,0 @@
|
|||
#include "benchmark_tests.h"
|
||||
#include "benchmarks.h"
|
||||
#include <chrono>
|
||||
#include <iostream>
|
||||
#include <math.h>
|
||||
using namespace std::chrono;
|
||||
|
||||
vector<double> test_A(const size_t &NLOOPS, const size_t &N)
|
||||
{
|
||||
cout << "#################### (A) ####################" << endl;
|
||||
cout << "\nLOOPS = " << NLOOPS << endl;
|
||||
cout << "\nN = " << N << endl;
|
||||
|
||||
|
||||
// Memory allocation
|
||||
cout << "Memory allocation\n";
|
||||
|
||||
vector<double> x(N), y(N);
|
||||
|
||||
cout.precision(2);
|
||||
cout << 2.0*N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
|
||||
cout.precision(6);
|
||||
|
||||
|
||||
// Data initialization
|
||||
// Special: x_i = i+1; y_i = 1/x_i ==> <x,y> == N
|
||||
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
x[i] = i % 219 + 1;
|
||||
y[i] = 1.0/x[i];
|
||||
}
|
||||
|
||||
|
||||
cout << "\nStart Benchmarking scalar\n";
|
||||
|
||||
auto t1 = system_clock::now(); // start timer
|
||||
// Do calculation
|
||||
double check(0.0),ss(0.0);
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
check = scalar_parallel(x, y);
|
||||
ss += check; // prevents the optimizer from removing unused calculation results.
|
||||
}
|
||||
|
||||
auto t2 = system_clock::now(); // stop timer
|
||||
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
t_diff = t_diff/NLOOPS; // duration per loop seconds
|
||||
|
||||
|
||||
|
||||
// Check the correct result
|
||||
cout << "\n <x,y> = " << check << endl;
|
||||
if (static_cast<unsigned int>(check) != N)
|
||||
cout << " !! W R O N G result !!\n";
|
||||
cout << endl;
|
||||
|
||||
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
|
||||
|
||||
double Gflops = 2.0*N / t_diff / 1024 / 1024 / 1024;
|
||||
double MemBandwidth = 2.0*N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
|
||||
|
||||
cout << "Total duration : " << t_diff*NLOOPS << endl;
|
||||
cout << "Timing in sec. : " << t_diff << endl;
|
||||
cout << "GFLOPS : " << Gflops << endl;
|
||||
cout << "GiByte/s : " << MemBandwidth << endl;
|
||||
|
||||
|
||||
return vector<double>{t_diff, Gflops, MemBandwidth};
|
||||
}
|
||||
|
||||
vector<double> test_A_sum(const size_t &NLOOPS, const size_t &N)
|
||||
{
|
||||
cout << "#################### (A) sum ####################" << endl;
|
||||
cout << "\nLOOPS = " << NLOOPS << endl;
|
||||
cout << "\nN = " << N << endl;
|
||||
|
||||
|
||||
// Memory allocation
|
||||
cout << "Memory allocation\n";
|
||||
|
||||
vector<double> x(N);
|
||||
|
||||
cout.precision(2);
|
||||
cout << 1.0*N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
|
||||
cout.precision(6);
|
||||
|
||||
|
||||
// Data initialization
|
||||
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
x[i] = 1;
|
||||
}
|
||||
|
||||
|
||||
cout << "\nStart Benchmarking sum\n";
|
||||
|
||||
auto t1 = system_clock::now(); // start timer
|
||||
// Do calculation
|
||||
double check(0.0),ss(0.0);
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
check = sum(x);
|
||||
ss += check; // prevents the optimizer from removing unused calculation results.
|
||||
}
|
||||
|
||||
auto t2 = system_clock::now(); // stop timer
|
||||
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
t_diff = t_diff/NLOOPS; // duration per loop seconds
|
||||
|
||||
|
||||
|
||||
// Check the correct result
|
||||
cout << "\n <x,y> = " << check << endl;
|
||||
if (static_cast<unsigned int>(check) != N)
|
||||
cout << " !! W R O N G result !!\n";
|
||||
cout << endl;
|
||||
|
||||
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
|
||||
|
||||
double Gflops = 1.0*N / t_diff / 1024 / 1024 / 1024;
|
||||
double MemBandwidth = 1.0*N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
|
||||
|
||||
cout << "Total duration : " << t_diff*NLOOPS << endl;
|
||||
cout << "Timing in sec. : " << t_diff << endl;
|
||||
cout << "GFLOPS : " << Gflops << endl;
|
||||
cout << "GiByte/s : " << MemBandwidth << endl;
|
||||
|
||||
|
||||
return vector<double>{t_diff, Gflops, MemBandwidth};
|
||||
}
|
||||
|
||||
|
||||
vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M)
|
||||
{
|
||||
cout << "#################### (B) ####################" << endl;
|
||||
|
||||
cout << "\nLOOPS = " << NLOOPS << endl;
|
||||
cout << "\nN = " << N << endl;
|
||||
cout << "\nM = " << M << endl;
|
||||
|
||||
// Memory allocation
|
||||
cout << "Memory allocation\n";
|
||||
|
||||
vector<double> A(M*N);
|
||||
vector<double> x(N);
|
||||
|
||||
cout.precision(2);
|
||||
cout << (1.0*M*N + N) * sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
|
||||
cout.precision(6);
|
||||
|
||||
// Data initialization
|
||||
|
||||
for (size_t i = 0; i < M; ++i)
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
A[N*i + j] = (i + j) % 219 + 1;
|
||||
|
||||
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
{
|
||||
x[j] = 1.0/A[N*17 + j];
|
||||
}
|
||||
|
||||
cout << "\nStart Benchmarking MatVec\n";
|
||||
|
||||
auto t1 = system_clock::now(); // start timer
|
||||
// Do calculation
|
||||
vector<double> b(M);
|
||||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
b = MatVec_parallel(A, x);
|
||||
}
|
||||
|
||||
auto t2 = system_clock::now(); // stop timer
|
||||
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
t_diff = t_diff/NLOOPS; // duration per loop seconds
|
||||
|
||||
|
||||
// Check the correct result
|
||||
cout << "\n <A[17,*],x> = " << b[17] << endl;
|
||||
if (static_cast<size_t>(b[17]) != N)
|
||||
{
|
||||
cout << " !! W R O N G result !!\n";
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
|
||||
double Gflops = (2.0*N*M) / t_diff / 1024 / 1024 / 1024;
|
||||
double MemBandwidth = (2.0*N*M + M)/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
|
||||
|
||||
cout << "Total duration : " << t_diff*NLOOPS << endl;
|
||||
cout << "Timing in sec. : " << t_diff << endl;
|
||||
cout << "GFLOPS : " << Gflops << endl;
|
||||
cout << "GiByte/s : " << MemBandwidth << endl;
|
||||
|
||||
|
||||
|
||||
return vector<double>{t_diff, Gflops, MemBandwidth};
|
||||
}
|
||||
|
||||
|
||||
vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N)
|
||||
{
|
||||
cout << "#################### (C) ####################" << endl;
|
||||
cout << "\nLOOPS = " << NLOOPS << endl;
|
||||
cout << "\nL = " << L << endl;
|
||||
cout << "\nM = " << M << endl;
|
||||
cout << "\nN = " << N << endl;
|
||||
|
||||
|
||||
// Memory allocation
|
||||
cout << "Memory allocation\n";
|
||||
|
||||
vector<double> A(M*L);
|
||||
vector<double> B(L*N);
|
||||
|
||||
cout.precision(2);
|
||||
cout << (1.0*M*L + L*N) *sizeof(A[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
|
||||
cout.precision(6);
|
||||
|
||||
|
||||
// Data initialization
|
||||
|
||||
for (size_t i = 0; i < M; ++i)
|
||||
for (size_t k = 0; k < L; ++k)
|
||||
A[L*i + k] = (i + k) % 219 + 1;
|
||||
|
||||
for (size_t k = 0; k < L; ++k)
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
B[N*k + j] = 1.0/A[L*17 + k];
|
||||
|
||||
|
||||
cout << "\nStart Benchmarking MatMat\n";
|
||||
|
||||
auto t1 = system_clock::now(); // start timer
|
||||
// Do calculation
|
||||
vector<double> C(M*N);
|
||||
double check;
|
||||
double check_sum = 0;
|
||||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
C = MatMat_parallel(A, B, L);
|
||||
|
||||
check = C[N*17];
|
||||
check_sum += check; // prevents the optimizer from removing unused calculation results.
|
||||
}
|
||||
cout << check_sum;
|
||||
|
||||
auto t2 = system_clock::now(); // stop timer
|
||||
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
t_diff = t_diff/NLOOPS; // duration per loop seconds
|
||||
|
||||
|
||||
// Check the correct result
|
||||
cout << "\n C[17,0] = " << check << endl;
|
||||
if (static_cast<unsigned int>(check) != L)
|
||||
{
|
||||
cout << " !! W R O N G result !!, should be " << L <<"\n";
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
|
||||
|
||||
double Gflops = (2.0*L*N*M) / t_diff / 1024 / 1024 / 1024;
|
||||
double MemBandwidth = (2.0*L*N*M + M*N)/ t_diff / 1024 / 1024 / 1024 * sizeof(A[0]);
|
||||
|
||||
cout << "Total duration : " << t_diff*NLOOPS << endl;
|
||||
cout << "Timing in sec. : " << t_diff << endl;
|
||||
cout << "GFLOPS : " << Gflops << endl;
|
||||
cout << "GiByte/s : " << MemBandwidth << endl;
|
||||
|
||||
|
||||
|
||||
return vector<double>{t_diff, Gflops, MemBandwidth};
|
||||
}
|
||||
|
||||
|
||||
vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p)
|
||||
{
|
||||
cout << "#################### (D) ####################" << endl;
|
||||
cout << "\nLOOPS = " << NLOOPS << endl;
|
||||
cout << "\nN = " << N << endl;
|
||||
cout << "\np = " << p << endl;
|
||||
|
||||
// Memory allocation
|
||||
cout << "Memory allocation\n";
|
||||
|
||||
vector<double> a(p + 1, 0);
|
||||
vector<double> x(N);
|
||||
|
||||
cout.precision(2);
|
||||
cout << (1.0*(p + 1) + N) *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
|
||||
cout.precision(6);
|
||||
|
||||
// Data initialization
|
||||
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
x[j] = 1.0*j;
|
||||
|
||||
for (size_t k = 0; k < p + 1; ++k)
|
||||
a[k] = pow(-1.0, k); // poly(x) = 1 - x + x^2 - x^3 + x^4 - ...
|
||||
|
||||
|
||||
|
||||
cout << "\nStart Benchmarking poly\n";
|
||||
|
||||
auto t1 = system_clock::now(); // start timer
|
||||
// Do calculation
|
||||
vector<double> y(N);
|
||||
double check;
|
||||
double check_sum;
|
||||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
y = poly_parallel(a, x);
|
||||
check = y[0];
|
||||
|
||||
check_sum += check; // prevents the optimizer from removing unused calculation results.
|
||||
}
|
||||
|
||||
auto t2 = system_clock::now(); // stop timer
|
||||
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
t_diff = t_diff/NLOOPS; // duration per loop seconds
|
||||
|
||||
|
||||
|
||||
// Check the correct result
|
||||
cout << "\n poly(" << x[0] << ") = " << check << endl;
|
||||
if (abs(check - 1.0) > 1.0/1e6)
|
||||
{
|
||||
cout << " !! W R O N G result !!\n";
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
|
||||
|
||||
double Gflops = (N*(p + 1)*3.0) / t_diff / 1024 / 1024 / 1024;
|
||||
double MemBandwidth = (N*(2.0 + 3.0*(p + 1)))/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
|
||||
|
||||
cout << "Total duration : " << t_diff*NLOOPS << endl;
|
||||
cout << "Timing in sec. : " << t_diff << endl;
|
||||
cout << "GFLOPS : " << Gflops << endl;
|
||||
cout << "GiByte/s : " << MemBandwidth << endl;
|
||||
|
||||
|
||||
|
||||
return vector<double>{t_diff, Gflops, MemBandwidth};
|
||||
}
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
#pragma once
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
vector<double> test_A(const size_t &NLOOPS, const size_t &N);
|
||||
|
||||
vector<double> test_A_sum(const size_t &NLOOPS, const size_t &N);
|
||||
|
||||
vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M);
|
||||
|
||||
vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N);
|
||||
|
||||
vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p);
|
||||
|
|
@ -1,141 +0,0 @@
|
|||
#include "benchmarks.h"
|
||||
#include <cassert> // assert()
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <omp.h>
|
||||
|
||||
// (A) Inner product of two vectors (from skalar_stl)
|
||||
double scalar_parallel(vector<double> const &x, vector<double> const &y)
|
||||
{
|
||||
assert(x.size() == y.size());
|
||||
size_t const N = x.size();
|
||||
double sum = 0.0;
|
||||
//#pragma omp parallel for default(none) shared(x, y, N) reduction(+:sum) schedule(runtime)
|
||||
#pragma omp parallel for shared(x, y, N) reduction(+:sum)
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
sum += x[i] * y[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
// (A) Vector entry sum
|
||||
double sum(vector<double> const &x)
|
||||
{
|
||||
double sum = 0.0;
|
||||
#pragma omp parallel for shared(x) reduction(+:sum)
|
||||
for (size_t i = 0; i < x.size(); ++i)
|
||||
{
|
||||
sum += x[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
// (B) Matrix-vector product (from intro_vector_densematrix)
|
||||
vector<double> MatVec_parallel(vector<double> const &A, vector<double> const &x)
|
||||
{
|
||||
size_t const nelem = A.size();
|
||||
size_t const N = x.size();
|
||||
assert(nelem % N == 0); // make sure multiplication is possible
|
||||
size_t const M = nelem/N;
|
||||
|
||||
vector<double> b(M);
|
||||
|
||||
#pragma omp parallel for shared(A, x, N, M, b)
|
||||
for (size_t i = 0; i < M; ++i)
|
||||
{
|
||||
double tmp = 0.0;
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
tmp += A[N*i + j] * x[j];
|
||||
b[i] = tmp;
|
||||
}
|
||||
|
||||
return b;
|
||||
}
|
||||
|
||||
|
||||
// (C) Matrix-matrix product
|
||||
vector<double> MatMat_parallel(vector<double> const &A, vector<double> const &B, size_t const &L)
|
||||
{
|
||||
size_t const nelem_A = A.size();
|
||||
size_t const nelem_B = B.size();
|
||||
|
||||
assert(nelem_A % L == 0 && nelem_B % L == 0);
|
||||
|
||||
size_t const M = nelem_A/L;
|
||||
size_t const N = nelem_B/L;
|
||||
|
||||
|
||||
vector<double> C(M*N);
|
||||
|
||||
|
||||
#pragma omp parallel for shared(A, B, M, N, L, C)
|
||||
for (size_t i = 0; i < M; ++i)
|
||||
{
|
||||
for (size_t k = 0; k < L; ++k)
|
||||
{
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
{
|
||||
C[N*i + j] += A[L*i + k]*B[N*k + j];
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return C;
|
||||
}
|
||||
|
||||
|
||||
// (D) Evaluation of a polynomial function
|
||||
vector<double> poly_parallel(vector<double> const &a, vector<double> const &x)
|
||||
{
|
||||
size_t const N = x.size();
|
||||
size_t const p = a.size() - 1;
|
||||
vector<double> y(N, 0);
|
||||
|
||||
#pragma omp parallel for shared(a, x, N, p, y)
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
double x_temp = x[i];
|
||||
double y_temp = 0;
|
||||
for (size_t k = 0; k < p + 1; ++k)
|
||||
{
|
||||
y_temp += x_temp*y_temp + a[p - k];
|
||||
}
|
||||
y[i] = y_temp;
|
||||
}
|
||||
|
||||
return y;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,55 +0,0 @@
|
|||
#pragma once
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
/** (A) Inner product of two vectors (from skalar_stl)
|
||||
@param[in] x vector
|
||||
@param[in] y vector
|
||||
@return resulting Euclidian inner product <x,y>
|
||||
*/
|
||||
double scalar_parallel(vector<double> const &x, vector<double> const &y);
|
||||
|
||||
|
||||
/** (A) Sum entries of vector
|
||||
@param[in] x vector
|
||||
@return sum
|
||||
*/
|
||||
double sum(vector<double> const &x);
|
||||
|
||||
|
||||
/** (B) Matrix-vector product (from intro_vector_densematrix)
|
||||
* @param[in] A dense matrix (1D access)
|
||||
* @param[in] u vector
|
||||
*
|
||||
* @return resulting vector
|
||||
*/
|
||||
vector<double> MatVec_parallel(vector<double> const &A, vector<double> const &x);
|
||||
|
||||
|
||||
/** (C) Matrix-matrix product
|
||||
* @param[in] A MxL dense matrix (1D access)
|
||||
* @param[in] B LxN dense matrix (1D access)
|
||||
* @param[in] shared_dim shared dimension L
|
||||
*
|
||||
* @return resulting MxN matrix
|
||||
*/
|
||||
vector<double> MatMat_parallel(vector<double> const &A, vector<double> const &B, size_t const &shared_dim);
|
||||
|
||||
|
||||
/** (D) Evaluation of a polynomial function using Horner's scheme
|
||||
* @param[in] a coefficient vector
|
||||
* @param[in] x vector with input values
|
||||
*
|
||||
* @return vector with output values
|
||||
*/
|
||||
vector<double> poly_parallel(vector<double> const &a, vector<double> const &x);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,84 +0,0 @@
|
|||
#include "benchmark_tests.h"
|
||||
#include <iostream>
|
||||
#include <cmath>
|
||||
|
||||
int main()
|
||||
{
|
||||
vector<vector<double>> results_scalar;
|
||||
results_scalar.push_back(test_A(2000000, pow(10,3)));
|
||||
results_scalar.push_back(test_A(1000000, pow(10,4)));
|
||||
results_scalar.push_back(test_A(100000, pow(10,5)));
|
||||
results_scalar.push_back(test_A(10000, pow(10,6)));
|
||||
results_scalar.push_back(test_A(750, pow(10,7)));
|
||||
results_scalar.push_back(test_A(125, pow(10,8)));
|
||||
|
||||
|
||||
vector<vector<double>> results_sum;
|
||||
results_sum.push_back(test_A_sum(3000000, pow(10,3)));
|
||||
results_sum.push_back(test_A_sum(2000000, pow(10,4)));
|
||||
results_sum.push_back(test_A_sum(1000000, pow(10,5)));
|
||||
results_sum.push_back(test_A_sum(50000, pow(10,6)));
|
||||
results_sum.push_back(test_A_sum(2000, pow(10,7)));
|
||||
results_sum.push_back(test_A_sum(250, pow(10,8)));
|
||||
|
||||
|
||||
test_B(100, 20000, 10000);
|
||||
|
||||
test_C(25, 500, 1000, 1500);
|
||||
|
||||
test_D(100, 100, 1000000);
|
||||
|
||||
|
||||
|
||||
cout << endl << "###### Scalar ######" << endl;
|
||||
cout << "Timing\tGFLOPS\tGiByte/s" << endl;
|
||||
cout << "------------------------------" << endl;
|
||||
for (size_t i = 0; i < results_scalar.size(); ++i)
|
||||
cout << results_scalar[i][0] << "\t" << results_scalar[i][1] << "\t" << results_scalar[i][2] << endl;
|
||||
|
||||
cout << endl << "###### Sum ######" << endl;
|
||||
cout << "Timing\tGFLOPS\tGiByte/s" << endl;
|
||||
cout << "------------------------------" << endl;
|
||||
for (size_t i = 0; i < results_sum.size(); ++i)
|
||||
cout << results_sum[i][0] << "\t" << results_sum[i][1] << "\t" << results_sum[i][2] << endl;
|
||||
|
||||
|
||||
|
||||
|
||||
// ###### Scalar ######
|
||||
// Timing GFLOPS GiByte/s
|
||||
// ------------------------------
|
||||
// 3.4e-06 0.54 4.3
|
||||
// 4.6e-06 4 32
|
||||
// 1.6e-05 12 95
|
||||
// 0.0011 1.7 13
|
||||
// 0.0097 1.9 15
|
||||
// 0.075 2.5 20
|
||||
|
||||
|
||||
// ###### Sum ######
|
||||
// Timing GFLOPS GiByte/s
|
||||
// ------------------------------
|
||||
// 5.5e-06 0.17 1.3
|
||||
// 5.4e-06 1.7 14
|
||||
// 1.5e-05 6.1 49
|
||||
// 0.00013 7.2 57
|
||||
// 0.0033 2.8 23
|
||||
// 0.032 2.9 23
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// ######### NOT PARALLEL (from exercise sheet 2) #########
|
||||
// Timing GFLOPS GiByte/s
|
||||
// ----------------------------------
|
||||
// (A) 0.038 2.5 20
|
||||
// (B) 0.13 2.9 23
|
||||
// (C) 0.44 3.2 25
|
||||
// (D) 0.19 1.5 12
|
||||
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
2875
utils/Doxyfile
Normal file
2875
utils/Doxyfile
Normal file
File diff suppressed because it is too large
Load diff
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue