commit 1bee3e8e5bddbbe1b2d795f6d82838f95b618cb6 Author: jakob.schratter Date: Tue Dec 9 22:06:13 2025 +0100 Pushing everything again, accidentally deleted my remote repository diff --git a/ex1/CLANG_default.mk b/ex1/CLANG_default.mk new file mode 100644 index 0000000..4bc290d --- /dev/null +++ b/ex1/CLANG_default.mk @@ -0,0 +1,123 @@ +# Basic Defintions for using GNU-compiler suite sequentially +# requires setting of COMPILER=CLANG_ + +#CLANGPATH=//usr/lib/llvm-10/bin/ +CC = ${CLANGPATH}clang +CXX = ${CLANGPATH}clang++ +#CXX = ${CLANGPATH}clang++ -lomptarget -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=/opt/pgi/linux86-64/2017/cuda/8.0 +#F77 = gfortran +LINKER = ${CXX} + +#http://clang.llvm.org/docs/UsersManual.html#options-to-control-error-and-warning-messages +WARNINGS += -Weverything -Wno-c++98-compat -Wno-sign-conversion -Wno-date-time -Wno-shorten-64-to-32 -Wno-padded -ferror-limit=1 +WARNINGS += -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic +#-fsyntax-only -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic + +CXXFLAGS += -O3 -std=c++17 -ferror-limit=1 ${WARNINGS} +# don't use -Ofast +# -ftrapv +LINKFLAGS += -O3 + +# different libraries in Ubuntu or manajaró +ifndef UBUNTU +UBUNTU=1 +endif + +# BLAS, LAPACK +LINKFLAGS += -llapack -lblas +# -lopenblas +ifeq ($(UBUNTU),1) +# ubuntu +else +# on archlinux +LINKFLAGS += -lcblas +endif + +# interprocedural optimization +CXXFLAGS += -flto +LINKFLAGS += -flto + +# very good check +# http://clang.llvm.org/extra/clang-tidy/ +# good check, see: http://llvm.org/docs/CodingStandards.html#include-style +SWITCH_OFF=,-readability-magic-numbers,-readability-redundant-control-flow,-readability-redundant-member-init +SWITCH_OFF+=,-readability-redundant-member-init,-readability-isolate-declaration +#READABILITY=,readability*${SWITCH_OFF} +#TIDYFLAGS = -checks=llvm-*,-llvm-header-guard -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp" +TIDYFLAGS = 
-checks=llvm-*,-llvm-header-guard${READABILITY} -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp" +#TIDYFLAGS += -checks='modernize* +# ??? +#TIDYFLAGS = -checks='cert*' -header-filter=.* +# MPI checks ?? +#TIDYFLAGS = -checks='mpi*' +# ?? +#TIDYFLAGS = -checks='performance*' -header-filter=.* +#TIDYFLAGS = -checks='portability-*' -header-filter=.* +#TIDYFLAGS = -checks='readability-*' -header-filter=.* + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + @rm -f ${PROGRAM} ${OBJECTS} + +clean_all:: clean + @rm -f *_ *~ *.bak *.log *.out *.tar + +codecheck: tidy_check +tidy_check: + clang-tidy ${SOURCES} ${TIDYFLAGS} -- ${SOURCES} +# see also http://clang-developers.42468.n3.nabble.com/Error-while-trying-to-load-a-compilation-database-td4049722.html + +run: clean ${PROGRAM} +# time ./${PROGRAM} ${PARAMS} + ./${PROGRAM} ${PARAMS} + +# tar the current directory +MY_DIR = `basename ${PWD}` +tar: clean_all + @echo "Tar the directory: " ${MY_DIR} + @cd .. ;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * + +doc: + doxygen Doxyfile + +######################################################################### + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# some tools +# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags) +cache: ${PROGRAM} + valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS} +# kcachegrind callgrind.out. & + kcachegrind `ls -1tr callgrind.out.* |tail -1` + +# Check for wrong memory accesses, memory leaks, ... 
+# use smaller data sets +mem: ${PROGRAM} + valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS} + +# Simple run time profiling of your code +# CXXFLAGS += -g -pg +# LINKFLAGS += -pg +prof: ${PROGRAM} + perf record ./$^ ${PARAMS} + perf report +# gprof -b ./$^ > gp.out +# kprof -f gp.out -p gprof & + +codecheck: tidy_check diff --git a/ex1/GCC_AMD32_default.mk b/ex1/GCC_AMD32_default.mk new file mode 100644 index 0000000..a911b6b --- /dev/null +++ b/ex1/GCC_AMD32_default.mk @@ -0,0 +1,130 @@ +# Basic Defintions for using GNU-compiler suite sequentially +# requires setting of COMPILER=GCC_ + +CC = gcc +CXX = g++ +F77 = gfortran +LINKER = ${CXX} + +# on mephisto: +#CXXFLAGS += -I/share/apps/atlas/include +#LINKFLAGS += -L/share/apps/atlas/lib +#LINKFLAGS += -lcblas -latlas + +#LINKFLAGS += -lblas +# Der Header muss mit extern "C" versehen werden, damit g++ alles findet. + + +#WARNINGS = -pedantic -pedantic-errors -Wall -Wextra -Werror -Wconversion -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow +WARNINGS = -pedantic -Wall -Wextra -Wconversion -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \ + -Wredundant-decls -Winline -fmax-errors=1 +# -Wunreachable-code +# -Wunreachable-code +CXXFLAGS += -ffast-math -O3 -march=native -std=c++17 ${WARNINGS} +#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS} +#-msse3 +# -ftree-vectorizer-verbose=2 -DNDEBUG +# -ftree-vectorizer-verbose=5 +# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump -fdump-tree-pre=stderr + +# CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details +# CFLAGS = -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2 +# #CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp +# FFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp +# LFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp +LINKFLAGS += -O3 + +# BLAS, LAPACK +OPENBLAS_DIR = /opt/openblas_GCCseq 
+#OPENBLAS_DIR = /opt/openblas_GCC +OPENBLAS_LIBDIR = ${OPENBLAS_DIR}/lib +OPENBLAS_INCDIR = ${OPENBLAS_DIR}/include +CXXFLAGS += -I${OPENBLAS_INCDIR} +LINKFLAGS += -L${OPENBLAS_LIBDIR} -lopenblas + +# interprocedural optimization +CXXFLAGS += -flto +LINKFLAGS += -flto + +# profiling tools +#CXXFLAGS += -pg +#LINKFLAGS += -pg + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + @rm -f ${PROGRAM} ${OBJECTS} + +clean_all:: clean + -@rm -f *_ *~ *.bak *.log *.out *.tar *.orig + -@rm -r html + +run: clean ${PROGRAM} +# time ./${PROGRAM} +# ./${PROGRAM} + ( export LD_LIBRARY_PATH=${OPENBLAS_LIBDIR}:${LD_LIBRARY_PATH} ; ./${PROGRAM} ) +# or 'export LD_LIBRARY_PATH=/opt/openblas_gcc/lib:${LD_LIBRARY_PATH}' in your ~/.bashrc + +# tar the current directory +MY_DIR = `basename ${PWD}` +tar: + @echo "Tar the directory: " ${MY_DIR} + @cd .. ;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * + +doc: + doxygen Doxyfile + +######################################################################### + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# some tools +# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags) +cache: ${PROGRAM} + valgrind --tool=callgrind --simulate-cache=yes ./$^ +# kcachegrind callgrind.out. & + kcachegrind `ls -1tr callgrind.out.* |tail -1` + +# Check for wrong memory accesses, memory leaks, ... 
+# use smaller data sets +# no "-pg" in compile/link options +mem: ${PROGRAM} + valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ + +# Simple run time profiling of your code +# CXXFLAGS += -g -pg +# LINKFLAGS += -pg +prof: ${PROGRAM} + ./$^ + gprof -b ./$^ > gp.out +# kprof -f gp.out -p gprof & + +#Trace your heap: +#> heaptrack ./main.GCC_ +#> heaptrack_gui heaptrack.main.GCC_..gz +heap: ${PROGRAM} + heaptrack ./$^ 11 + heaptrack_gui `ls -1tr heaptrack.$^.* |tail -1` & + + + +######################################################################## +# get the detailed status of all optimization flags +info: + echo "detailed status of all optimization flags" + $(CXX) --version + $(CXX) -Q $(CXXFLAGS) --help=optimizers diff --git a/ex1/GCC_default.mk b/ex1/GCC_default.mk new file mode 100644 index 0000000..803c060 --- /dev/null +++ b/ex1/GCC_default.mk @@ -0,0 +1,183 @@ +# Basic Defintions for using GNU-compiler suite sequentially +# requires setting of COMPILER=GCC_ + +CC = gcc +CXX = g++ +F77 = gfortran +LINKER = ${CXX} + +WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \ + -Wredundant-decls -Winline -fmax-errors=1 +# -Wunreachable-code +CXXFLAGS += -ffast-math -O3 -march=native -std=c++17 ${WARNINGS} +#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS} +#-msse3 +# -ftree-vectorizer-verbose=2 -DNDEBUG +# -ftree-vectorizer-verbose=5 +# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump -fdump-tree-pre=stderr + +# CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details +# CFLAGS = -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2 +# #CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp +# FFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp +# LFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp +LINKFLAGS += -O3 + +#architecture +#CPU = -march=znver2 +CXXFLAGS += ${CPU} +LINKFLAGS += 
${CPU} + +# different libraries in Ubuntu or manajaró +ifndef UBUNTU +UBUNTU=1 +endif + +# BLAS, LAPACK +ifeq ($(UBUNTU),1) +LINKFLAGS += -llapack -lblas +# -lopenblas +else +# on archlinux +LINKFLAGS += -llapack -lopenblas -lcblas +endif + +# interprocedural optimization +CXXFLAGS += -flto +LINKFLAGS += -flto + +# for debugging purpose (save code) +# -fsanitize=leak # only one out the three can be used +# -fsanitize=address +# -fsanitize=thread +SANITARY = -fsanitize=address -fsanitize=undefined -fsanitize=null -fsanitize=return \ + -fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \ + -fsanitize=bool -fsanitize=enum -fsanitize=vptr +#CXXFLAGS += ${SANITARY} +#LINKFLAGS += ${SANITARY} + +# profiling tools +#CXXFLAGS += -pg +#LINKFLAGS += -pg + + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + @rm -f ${PROGRAM} ${OBJECTS} + +clean_all:: clean + -@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt + -@rm -rf html + +run: clean ${PROGRAM} +#run: ${PROGRAM} +# time ./${PROGRAM} ${PARAMS} + ./${PROGRAM} ${PARAMS} + +# tar the current directory +MY_DIR = `basename ${PWD}` +tar: clean_all + @echo "Tar the directory: " ${MY_DIR} + @cd .. ;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * +#find . -size +10M > large_files +#--exclude-from ${MY_DIR}/large_files + +zip: clean + @echo "Zip the directory: " ${MY_DIR} + @cd .. 
;\ + zip -r ${MY_DIR}.zip ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} + +doc: + doxygen Doxyfile + +######################################################################### +.SUFFIXES: .f90 + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< +# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $<.log +# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $(<:.cpp=.log) + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +.f90.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# some tools +# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags) +cache: ${PROGRAM} + valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS} +# kcachegrind callgrind.out. & + kcachegrind `ls -1tr callgrind.out.* |tail -1` + +# Check for wrong memory accesses, memory leaks, ... +# use smaller data sets +# no "-pg" in compile/link options +mem: ${PROGRAM} + valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS} +# Graphical interface +# valkyrie + +# Simple run time profiling of your code +# CXXFLAGS += -g -pg +# LINKFLAGS += -pg +prof: ${PROGRAM} + perf record ./$^ ${PARAMS} + perf report +# gprof -b ./$^ > gp.out +# kprof -f gp.out -p gprof & + +# perf in Ubuntu 20.04: https://www.howtoforge.com/how-to-install-perf-performance-analysis-tool-on-ubuntu-20-04/ +# * install +# * sudo vi /etc/sysctl.conf +# add kernel.perf_event_paranoid = 0 + +#Trace your heap: +#> heaptrack ./main.GCC_ +#> heaptrack_gui heaptrack.main.GCC_..gz +heap: ${PROGRAM} + heaptrack ./$^ ${PARAMS} + heaptrack_gui `ls -1tr heaptrack.$^.* |tail -1` & + +codecheck: $(SOURCES) + cppcheck --enable=all --inconclusive --std=c++17 --suppress=missingIncludeSystem $^ + + +######################################################################## +# get the detailed status of all optimization flags +info: + echo 
"detailed status of all optimization flags" + $(CXX) --version + $(CXX) -Q $(CXXFLAGS) --help=optimizers + lscpu + inxi -C + lstopo + +# Excellent hardware info +# hardinfo +# Life monitoring of CPU frequency etc. +# sudo i7z + +# Memory consumption +# vmstat -at -SM 3 +# xfce4-taskmanager + + +# https://www.tecmint.com/check-linux-cpu-information/ +#https://www.tecmint.com/monitor-cpu-and-gpu-temperature-in-ubuntu/ + +# Debugging: +# https://wiki.archlinux.org/index.php/Debugging diff --git a/ex1/ICC_default.mk b/ex1/ICC_default.mk new file mode 100644 index 0000000..d4bd4db --- /dev/null +++ b/ex1/ICC_default.mk @@ -0,0 +1,137 @@ +# Basic Defintions for using INTEL compiler suite sequentially +# requires setting of COMPILER=ICC_ + +#BINDIR = /opt/intel/bin/ + +# special on my sony [GH] +#BINDIR = /opt/save.intel/bin/ +# very special on my sony [GH] +# FIND_LIBS = -L /opt/save.intel/composer_xe_2013.1.117/mkl/lib/intel64/libmkl_intel_lp64.so + +# Error with g++-4.8 using icpc14.0,x: +# find directory wherein bits/c++config.h is located +# 'locate bits/c++config.h' +#FOUND_CONFIG = -I/usr/include/x86_64-linux-gnu/c++/4.8 + + +CC = ${BINDIR}icc +CXX = ${BINDIR}icpc +F77 = ${BINDIR}ifort +LINKER = ${CXX} + + +WARNINGS = -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -wd2015,2012 -wn3 +# -Winline -Wredundant-decls -Wunreachable-code +CXXFLAGS += -O3 -fargument-noalias -std=c++17 -DNDEBUG ${WARNINGS} -mkl ${FOUND_CONFIG} +# profiling tools +#CXXFLAGS += -pg +#LINKFLAGS += -pg +# -vec-report=3 +# -qopt-report=5 -qopt-report-phase=vec +# -guide -parallel +# -guide-opts=string -guide-par[=n] -guide-vec[=n] +# -auto-p32 -simd +CXXFLAGS += -align + +# use MKL by INTEL +# https://software.intel.com/content/www/us/en/develop/documentation/mkl-linux-developer-guide/top/linking-your-application-with-the-intel-math-kernel-library/linking-quick-start/using-the-mkl-compiler-option.html +# 
https://software.intel.com/content/www/us/en/develop/articles/intel-mkl-link-line-advisor.html +# LINKFLAGS += -L${BINDIR}../composer_xe_2013.1.117/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread +#LINKFLAGS += -O3 -L/opt/intel/mkl/lib -mkl +LINKFLAGS += -O3 -mkl=sequential + +# interprocedural optimization +CXXFLAGS += -ipo +LINKFLAGS += -ipo + +# annotated assembler file +ANNOTED = -fsource-asm -S + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + rm -f ${PROGRAM} ${OBJECTS} + +clean_all:: clean + @rm -f *_ *~ *.bak *.log *.out *.tar + +run: clean ${PROGRAM} + ./${PROGRAM} + +# tar the current directory +MY_DIR = `basename ${PWD}` +tar: clean_all + @echo "Tar the directory: " ${MY_DIR} + @cd .. ;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * + +doc: + doxygen Doxyfile + +######################################################################### + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# # some tools +# # Cache behaviour (CXXFLAGS += -g tracks down to source lines) +# cache: ${PROGRAM} +# valgrind --tool=callgrind --simulate-cache=yes ./$^ +# # kcachegrind callgrind.out. & +# +# # Check for wrong memory accesses, memory leaks, ... 
+# # use smaller data sets +# mem: ${PROGRAM} +# valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ +# +# # Simple run time profiling of your code +# # CXXFLAGS += -g -pg +# # LINKFLAGS += -pg +# prof: ${PROGRAM} +# ./$^ +# gprof -b ./$^ > gp.out +# # kprof -f gp.out -p gprof & +# + + +mem: inspector +prof: amplifier +cache: amplifier + +gap_par_report: + ${CXX} -c -guide -parallel $(SOURCES) 2> gap.txt + +# GUI for performance report +amplifier: ${PROGRAM} + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope +# alternatively to the solution above: + #edit file /etc/sysctl.d/10-ptrace.conf and set variable kernel.yama.ptrace_scope variable to 0 . + amplxe-gui & + +# GUI for Memory and Thread analyzer (race condition) +inspector: ${PROGRAM} + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope + inspxe-gui & + +advisor: + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope + advixe-gui & + +icc-info: + icpc -# main.cpp + + + + diff --git a/ex1/ONEAPI_default.mk b/ex1/ONEAPI_default.mk new file mode 100644 index 0000000..fe7b3fe --- /dev/null +++ b/ex1/ONEAPI_default.mk @@ -0,0 +1,176 @@ +# Basic Defintions for using INTEL compiler suite sequentially +# requires setting of COMPILER=ONEAPI_ + +# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html +# requires +# source /opt/intel/oneapi/setvars.sh +# on AMD: export MKL_DEBUG_CPU_TYPE=5 + +#BINDIR = /opt/intel/oneapi/compiler/latest/linux/bin/ +#MKL_ROOT = /opt/intel/oneapi/mkl/latest/ +#export KMP_AFFINITY=verbose,compact + +CC = ${BINDIR}icc +CXX = ${BINDIR}dpcpp +F77 = ${BINDIR}ifort +LINKER = ${CXX} + +## Compiler flags +WARNINGS = -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -pedantic +WARNINGS += -Wpessimizing-move -Wredundant-move +#-wd2015,2012,2014 -wn3 +# -Winline -Wredundant-decls -Wunreachable-code +# -qopt-subscript-in-range +# -vec-threshold0 + 
+CXXFLAGS += -O3 -std=c++17 ${WARNINGS} +#CXXFLAGS += -DMKL_ILP64 -I"${MKLROOT}/include" +#CXXFLAGS += -DMKL_ILP32 -I"${MKLROOT}/include" +LINKFLAGS += -O3 + +# interprocedural optimization +CXXFLAGS += -ipo +LINKFLAGS += -ipo +LINKFLAGS += -flto + +# annotated Assembler file +ANNOTED = -fsource-asm -S + +#architecture +CPU = -march=core-avx2 +#CPU += -mtp=zen +# -xCORE-AVX2 +# -axcode COMMON-AVX512 -axcode MIC-AVX512 -axcode CORE-AVX512 -axcode CORE-AVX2 +CXXFLAGS += ${CPU} +LINKFLAGS += ${CPU} + +# use MKL by INTEL +# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html +# sequential MKL +# use the 32 bit interface (LP64) instead of 64 bit interface (ILP64) +CXXFLAGS += -qmkl=sequential -UMKL_ILP64 +LINKFLAGS += -O3 -qmkl=sequential -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread +#LINKFLAGS += -O3 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread + +# shared libs: https://aur.archlinux.org/packages/intel-oneapi-compiler-static +# install intel-oneapi-compiler-static +# or +LINKFLAGS += -shared-intel + + +OPENMP = -qopenmp +CXXFLAGS += ${OPENMP} +LINKFLAGS += ${OPENMP} + + +# profiling tools +#CXXFLAGS += -pg +#LINKFLAGS += -pg +# -vec-report=3 +# -qopt-report=5 -qopt-report-phase=vec -qopt-report-phase=openmp +# -guide -parallel +# -guide-opts=string -guide-par[=n] -guide-vec[=n] +# -auto-p32 -simd + +# Reports: https://software.intel.com/en-us/articles/getting-the-most-out-of-your-intel-compiler-with-the-new-optimization-reports +#CXXFLAGS += -qopt-report=5 -qopt-report-phase=vec,par +#CXXFLAGS += -qopt-report=5 -qopt-report-phase=cg +# Redirect report from *.optrpt to stderr +# -qopt-report-file=stderr +# Guided paralellization +# -guide -parallel +# -guide-opts=string -guide-par[=n] -guide-vec[=n] +# -auto-p32 -simd + +## run time checks +# 
https://www.intel.com/content/www/us/en/develop/documentation/fortran-compiler-oneapi-dev-guide-and-reference/top/compiler-reference/compiler-options/offload-openmp-and-parallel-processing-options/par-runtime-control-qpar-runtime-control.html + + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + rm -f ${PROGRAM} ${OBJECTS} *.optrpt + +clean_all:: clean + @rm -f *_ *~ *.bak *.log *.out *.tar + +run: clean ${PROGRAM} + ./${PROGRAM} + +# tar the current directory +MY_DIR = `basename ${PWD}` +tar: clean_all + @echo "Tar the directory: " ${MY_DIR} + @cd .. ;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * + +doc: + doxygen Doxyfile + +######################################################################### + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# some tools +# Cache behaviour (CXXFLAGS += -g tracks down to source lines) +# https://software.intel.com/content/www/us/en/develop/documentation/vtune-help/top/analyze-performance/microarchitecture-analysis-group/memory-access-analysis.html + +mem: inspector +prof: vtune +cache: inspector + +gap_par_report: + ${CXX} -c -guide -parallel $(SOURCES) 2> gap.txt + +# GUI for performance report +amplifier: ${PROGRAM} + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope + echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid + amplxe-gui & + +# GUI for Memory and Thread analyzer (race condition) +inspector: ${PROGRAM} + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope +# inspxe-gui & + vtune-gui ./${PROGRAM} & + +advisor: + source /opt/intel/oneapi/advisor/2021.2.0/advixe-vars.sh +# /opt/intel/oneapi/advisor/latest/bin64/advixe-gui & + advisor --collect=survey ./${PROGRAM} +# advisor --collect=roofline ./${PROGRAM} + advisor --report=survey 
--project-dir=./ src:r=./ --format=csv --report-output=./out/survey.csv + +vtune: + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope +# https://software.intel.com/en-us/articles/intel-advisor-2017-update-1-what-s-new + export ADVIXE_EXPERIMENTAL=roofline + vtune -collect hotspots ./${PROGRAM} + vtune -report hotspots -r r000hs > vtune.out +# vtune-gui ./${PROGRAM} & + +icc-info: + icpc -# main.cpp + +# MKL on AMD +# https://www.computerbase.de/2019-11/mkl-workaround-erhoeht-leistung-auf-amd-ryzen/ +# +# https://sites.google.com/a/uci.edu/mingru-yang/programming/mkl-has-bad-performance-on-an-amd-cpu +# export MKL_DEBUG_CPU_TYPE=5 +# export MKL_NUM_THRAEDS=1 +# export MKL_DYNAMIC=false +# on Intel compiler +# http://publicclu2.blogspot.com/2013/05/intel-complier-suite-reference-card.html diff --git a/ex1/PGI_default.mk b/ex1/PGI_default.mk new file mode 100644 index 0000000..40760e5 --- /dev/null +++ b/ex1/PGI_default.mk @@ -0,0 +1,93 @@ +# Basic Defintions for using PGI-compiler suite sequentially +# requires setting of COMPILER=PGI_ +# OPTIRUN = optirun + + +CC = pgcc +CXX = pgc++ +F77 = pgfortran +LINKER = ${CXX} + +# on mephisto: +#CXXFLAGS += -I/share/apps/atlas/include +#LINKFLAGS += -L/share/apps/atlas/lib +#LINKFLAGS += -lcblas -latlas + +#LINKFLAGS += -lblas +# Der Header muss mit extern "C" versehen werden, damit g++ alles findet. 
+ +WARNINGS = -Minform=warn +# -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -W -Wfloat-equal -Wshadow -Wredundant-decls +# -pedantic -Wunreachable-code -Wextra -Winline +# -Wunreachable-code + +#PGI_PROFILING = -Minfo=ccff,loop,vect,opt,intensity,mp,accel +PGI_PROFILING = -Minfo=ccff,accel,ipa,loop,lre,mp,opt,par,unified,vect,intensity +# -Minfo +# -Mprof=time +# -Mprof=lines +# take care with option -Msafeptr +CXXFLAGS += -O3 -std=c++17 ${WARNINGS} +#CXXFLAGS += -O3 -std=c++11 -DNDEBUG ${PGI_PROFILING} ${WARNINGS} +# -fastsse -fargument-noalias ${WARNINGS} -msse3 -vec-report=3 + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + @rm -f ${PROGRAM} ${OBJECTS} + +clean_all:: clean + @rm -f *_ *~ *.bak *.log *.out *.tar + +run: clean ${PROGRAM} + ./${PROGRAM} + +# tar the current directory +MY_DIR = `basename ${PWD}` +tar: clean_all + @echo "Tar the directory: " ${MY_DIR} + @cd .. ;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * + +doc: + doxygen Doxyfile + +######################################################################### + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# # some tools +# # Simple run time profiling of your code +# # CXXFLAGS += -g -pg +# # LINKFLAGS += -pg + + +# Profiling options PGI, see: pgcollect -help +# CPU_PROF = -allcache +CPU_PROF = -time +# GPU_PROF = -cuda=gmem,branch,cc13 -cudainit +#GPU_PROF = -cuda=branch:cc20 +# +PROF_FILE = pgprof.out + +cache: prof + +prof: ${PROGRAM} + ${OPTIRUN} ${BINDIR}pgcollect $(CPU_PROF) ./$^ + ${OPTIRUN} ${BINDIR}pgprof -exe ./$^ $(PROF_FILE) & + +info: + pgaccelinfo -v diff --git a/ex1/ex1A_mean_values/.vscode/tasks.json b/ex1/ex1A_mean_values/.vscode/tasks.json new file mode 100644 index 0000000..08d9005 --- /dev/null +++ 
b/ex1/ex1A_mean_values/.vscode/tasks.json @@ -0,0 +1,28 @@ +{ + "tasks": [ + { + "type": "cppbuild", + "label": "C/C++: gcc build active file", + "command": "/usr/bin/gcc", + "args": [ + "-fdiagnostics-color=always", + "-g", + "${file}", + "-o", + "${fileDirname}/${fileBasenameNoExtension}" + ], + "options": { + "cwd": "${fileDirname}" + }, + "problemMatcher": [ + "$gcc" + ], + "group": { + "kind": "build", + "isDefault": true + }, + "detail": "Task generated by Debugger." + } + ], + "version": "2.0.0" +} \ No newline at end of file diff --git a/ex1/ex1A_mean_values/Makefile b/ex1/ex1A_mean_values/Makefile new file mode 100644 index 0000000..b854f57 --- /dev/null +++ b/ex1/ex1A_mean_values/Makefile @@ -0,0 +1,30 @@ +# +# use GNU-Compiler tools +COMPILER=GCC_ +# alternatively from the shell +# export COMPILER=GCC_ +# or, alternatively from the shell +# make COMPILER=GCC_ + +# use Intel compilers +#COMPILER=ICC_ + +# use PGI compilers +# COMPILER=PGI_ + + +SOURCES = main.cpp ../ex1A_mean_values/means.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +PROGRAM = main.${COMPILER} + +# uncomment the next to lines for debugging and detailed performance analysis +CXXFLAGS += -g +LINKFLAGS += -g +# do not use -pg with PGI compilers + +ifndef COMPILER + COMPILER=GCC_ +endif + +include ../${COMPILER}default.mk diff --git a/ex1/ex1A_mean_values/main.GCC_ b/ex1/ex1A_mean_values/main.GCC_ new file mode 100755 index 0000000..bb53847 Binary files /dev/null and b/ex1/ex1A_mean_values/main.GCC_ differ diff --git a/ex1/ex1A_mean_values/main.cpp b/ex1/ex1A_mean_values/main.cpp new file mode 100644 index 0000000..03da30f --- /dev/null +++ b/ex1/ex1A_mean_values/main.cpp @@ -0,0 +1,35 @@ +#include "means.h" +#include +#include +using namespace std; + +int main(int argc, char **argv) +{ + double arithmetic_mean, geometric_mean, harmonic_mean; + + // Fixed version + calculate_means(1, 4, 16, arithmetic_mean, geometric_mean, harmonic_mean); + cout << arithmetic_mean << ", " << geometric_mean << ", " << 
harmonic_mean << endl; + + calculate_means(2, 3, 5, arithmetic_mean, geometric_mean, harmonic_mean); + cout << arithmetic_mean << ", " << geometric_mean << ", " << harmonic_mean << endl; + + calculate_means(1000, 4000, 16000, arithmetic_mean, geometric_mean, harmonic_mean); + cout << arithmetic_mean << ", " << geometric_mean << ", " << harmonic_mean << endl; + cout << "--------------------------------" << endl; + + + + // Scalable version + calculate_means(vector {1, 4, 16}, arithmetic_mean, geometric_mean, harmonic_mean); + cout << arithmetic_mean << ", " << geometric_mean << ", " << harmonic_mean << endl; + + calculate_means(vector {2, 3, 5}, arithmetic_mean, geometric_mean, harmonic_mean); + cout << arithmetic_mean << ", " << geometric_mean << ", " << harmonic_mean << endl; + + calculate_means(vector {1000, 4000, 16000}, arithmetic_mean, geometric_mean, harmonic_mean); + cout << arithmetic_mean << ", " << geometric_mean << ", " << harmonic_mean << endl; + + + return 0; +} diff --git a/ex1/ex1A_mean_values/main.o b/ex1/ex1A_mean_values/main.o new file mode 100644 index 0000000..3e4f266 Binary files /dev/null and b/ex1/ex1A_mean_values/main.o differ diff --git a/ex1/ex1A_mean_values/means.cpp b/ex1/ex1A_mean_values/means.cpp new file mode 100644 index 0000000..903a9e2 --- /dev/null +++ b/ex1/ex1A_mean_values/means.cpp @@ -0,0 +1,30 @@ +#include "../ex1A_mean_values/means.h" +#include +#include + +void calculate_means(int a, int b, int c, double &am, double &gm, double &hm) +{ + am = (a + b + c)/3.0; + gm = exp((log(a)+log(b)+log(c))/3); + hm = 3.0/(1.0/a + 1.0/b + 1.0/c); +} + +void calculate_means(std::vector numbers, double &am, double &gm, double &hm) +{ + int n = numbers.size(); + + am = 0.; + gm = 0.; + hm = 0.; + + for (int i = 0; i < n; ++i) + { + am += numbers[i]; + gm += log(numbers[i]); + hm += 1.0/numbers[i]; + } + + am /= n; + gm = exp(gm/n); + hm = n/hm; +} \ No newline at end of file diff --git a/ex1/ex1A_mean_values/means.h 
b/ex1/ex1A_mean_values/means.h new file mode 100644 index 0000000..4a12964 --- /dev/null +++ b/ex1/ex1A_mean_values/means.h @@ -0,0 +1,23 @@ +#pragma once +#include + +/** + This function calculates arithmetic mean, geometric mean and harmonic mean of three integers. + @param[in] a first integer + @param[in] b second integer + @param[in] c third integer + @param[out] am arithmetic mean + @param[out] gm geometric mean + @param[out] hm harmonic mean +*/ +void calculate_means(int a, int b, int c, double &am, double &gm, double &hm); + +/** + This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector. + @param[in] numbers vector containing integers + @param[out] am arithmetic mean + @param[out] gm geometric mean + @param[out] hm harmonic mean +*/ +void calculate_means(std::vector numbers, double &am, double &gm, double &hm); + diff --git a/ex1/ex1A_mean_values/means.o b/ex1/ex1A_mean_values/means.o new file mode 100644 index 0000000..035493b Binary files /dev/null and b/ex1/ex1A_mean_values/means.o differ diff --git a/ex1/ex1B_data-IO_and_vectors/Makefile b/ex1/ex1B_data-IO_and_vectors/Makefile new file mode 100644 index 0000000..b854f57 --- /dev/null +++ b/ex1/ex1B_data-IO_and_vectors/Makefile @@ -0,0 +1,30 @@ +# +# use GNU-Compiler tools +COMPILER=GCC_ +# alternatively from the shell +# export COMPILER=GCC_ +# or, alternatively from the shell +# make COMPILER=GCC_ + +# use Intel compilers +#COMPILER=ICC_ + +# use PGI compilers +# COMPILER=PGI_ + + +SOURCES = main.cpp ../ex1A_mean_values/means.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +PROGRAM = main.${COMPILER} + +# uncomment the next to lines for debugging and detailed performance analysis +CXXFLAGS += -g +LINKFLAGS += -g +# do not use -pg with PGI compilers + +ifndef COMPILER + COMPILER=GCC_ +endif + +include ../${COMPILER}default.mk diff --git a/ex1/ex1B_data-IO_and_vectors/data_1.txt b/ex1/ex1B_data-IO_and_vectors/data_1.txt new file mode 100644 index 0000000..0cc4bb8 --- /dev/null 
+++ b/ex1/ex1B_data-IO_and_vectors/data_1.txt @@ -0,0 +1,501 @@ +141 +261 +87 +430 +258 +298 +425 +120 +496 +707 +244 +786 +75 +394 +4 +221 +2 +190 +143 +269 +175 +139 +599 +902 +940 +222 +483 +377 +524 +265 +69 +437 +174 +27 +955 +431 +962 +763 +8 +681 +706 +646 +553 +219 +773 +229 +371 +891 +857 +403 +319 +609 +911 +910 +592 +333 +854 +443 +905 +34 +533 +717 +180 +337 +188 +322 +404 +549 +49 +553 +275 +242 +244 +155 +957 +936 +819 +729 +176 +361 +189 +2 +317 +700 +626 +544 +440 +288 +502 +762 +763 +577 +748 +646 +124 +505 +348 +93 +148 +199 +673 +432 +695 +257 +10 +533 +280 +947 +907 +393 +25 +672 +838 +972 +57 +451 +583 +687 +720 +651 +727 +374 +582 +117 +58 +980 +285 +595 +963 +186 +194 +342 +933 +391 +274 +152 +398 +375 +132 +436 +92 +615 +11 +574 +790 +236 +449 +570 +62 +497 +643 +222 +838 +972 +847 +506 +279 +747 +237 +958 +621 +601 +173 +91 +256 +859 +912 +700 +726 +230 +577 +811 +404 +989 +90 +321 +512 +61 +726 +557 +530 +830 +859 +790 +318 +453 +753 +110 +110 +270 +525 +973 +711 +312 +292 +851 +912 +640 +256 +89 +839 +585 +949 +62 +585 +286 +828 +191 +443 +394 +827 +677 +208 +319 +134 +672 +571 +170 +148 +477 +909 +553 +33 +54 +806 +452 +383 +790 +365 +533 +712 +872 +329 +651 +975 +76 +588 +414 +310 +264 +759 +996 +187 +782 +196 +993 +803 +425 +729 +499 +809 +357 +74 +591 +911 +194 +433 +750 +40 +947 +764 +559 +184 +498 +518 +995 +855 +963 +679 +404 +935 +480 +232 +397 +706 +559 +757 +996 +963 +536 +964 +116 +52 +305 +581 +531 +902 +541 +432 +543 +713 +17 +801 +143 +479 +257 +370 +662 +170 +279 +199 +196 +327 +881 +472 +404 +180 +969 +408 +845 +616 +377 +878 +785 +465 +814 +899 +430 +335 +597 +902 +703 +378 +735 +955 +543 +541 +312 +72 +182 +93 +464 +10 +916 +643 +2 +31 +209 +455 +128 +9 +728 +355 +781 +437 +437 +50 +50 +92 +595 +242 +842 +858 +964 +489 +221 +227 +537 +763 +348 +462 +640 +918 +162 +716 +578 +434 +885 +394 +179 +634 +625 +328 +803 +1000 +981 +128 +233 +24 +608 +111 +408 +885 +549 +370 +209 +441 +957 +125 +471 +857 +44 +692 +979 +284 +134 
+686 +910 +611 +900 +194 +755 +347 +419 +156 +820 +625 +739 +806 +68 +951 +498 +756 +743 +832 +157 +458 +619 +933 +836 +896 +583 +583 +855 +35 +886 +408 +37 +747 +155 +144 +606 +255 +325 +402 +407 +387 +610 +167 +189 +95 +324 +770 +235 +741 +693 +825 +828 +294 +310 +524 +326 +832 +811 +557 +263 +681 +234 +457 +385 +539 +992 +756 +981 +235 +529 +52 +757 +602 +858 +989 +930 +410 +1 +541 +208 +220 +326 +96 +748 +749 +544 +339 +833 +553 +958 +893 +357 +547 +347 +623 +797 +746 +126 +823 +26 +415 +732 +782 +368 + diff --git a/ex1/ex1B_data-IO_and_vectors/main.GCC_ b/ex1/ex1B_data-IO_and_vectors/main.GCC_ new file mode 100755 index 0000000..cb311be Binary files /dev/null and b/ex1/ex1B_data-IO_and_vectors/main.GCC_ differ diff --git a/ex1/ex1B_data-IO_and_vectors/main.cpp b/ex1/ex1B_data-IO_and_vectors/main.cpp new file mode 100644 index 0000000..0acb48f --- /dev/null +++ b/ex1/ex1B_data-IO_and_vectors/main.cpp @@ -0,0 +1,53 @@ +#include "../ex1A_mean_values/means.h" +#include +#include +#include +#include +#include +using namespace std; + + +int main(int argc, char **argv) +{ + // read vector from file + vector data_vector = {}; + + ifstream input_stream("data_1.txt"); + + int line; + while(input_stream >> line) + { + data_vector.push_back(line); + } + data_vector.shrink_to_fit(); + + + // calculate minimum and maximum + vector::iterator min_it = min_element(data_vector.begin(), data_vector.end()); + vector::iterator max_it = max_element(data_vector.begin(), data_vector.end()); + + // calculate arithmetic mean, geometric mean and harmonic mean + double am, gm, hm; + calculate_means(data_vector, am, gm, hm); + + + // calculate standard deviation + double sd = 0.; + int n = data_vector.size(); + for (int i = 0; i < n; ++i) + { + sd += pow(data_vector[i] - am, 2); + } + sd = sqrt(sd/n); + + + // print results + cout << "minimum: " << *min_it << endl; + cout << "maximum: " << *max_it << endl; + cout << "arithmetic mean: " << am << endl; + cout << "geometric mean: " << gm << 
/// Sum of the first n positive integers (Gauss formula).
/// Exactly one of n and n+1 is even, so that factor is halved before the
/// multiplication, postponing unsigned overflow for large n.
size_t gauss_sum(size_t n)
{
    return (n % 2 == 0) ? (n / 2) * (n + 1) : ((n + 1) / 2) * n;
}

/// Sum of all positive integers i <= n divisible by 3 or by 5,
/// computed by explicit iteration.
/// @param[in] n inclusive upper bound
/// @return the sum (0 for n == 0)
size_t special_sum_loop(size_t n)
{
    size_t sum = 0;
    // Fixed: the original bound `i < n + 1` wraps to 0 when n == SIZE_MAX,
    // silently skipping the whole loop. `i <= n` states the intent directly;
    // the `i != 0` guard stops the wrap-around at i == SIZE_MAX.
    for (size_t i = 1; i <= n && i != 0; ++i)
    {
        if (i % 3 == 0 || i % 5 == 0)
        {
            sum += i;
        }
    }
    return sum;
}

/// Same sum without a loop, via inclusion-exclusion:
/// (multiples of 3) + (multiples of 5) - (multiples of 15, counted twice).
/// @param[in] n inclusive upper bound
/// @return the sum (0 for n == 0)
size_t special_sum_noloop(size_t n)
{
    size_t factor_3  = gauss_sum(n / 3);  // integer division rounds down
    size_t factor_5  = gauss_sum(n / 5);
    size_t factor_15 = gauss_sum(n / 15);

    return factor_3 * 3 + factor_5 * 5 - factor_15 * 15;
}
/// Euclidean inner product of two equally sized vectors.
/// @param[in] x first vector
/// @param[in] y second vector, must have the same length as x
/// @return sum over x[i]*y[i]
double scalar(std::vector<double> const &x, std::vector<double> const &y)
{
    assert(x.size() == y.size());
    size_t const N = x.size();
    double sum = 0.0;
    for (size_t i = 0; i < N; ++i)
    {
        sum += x[i] * y[i];
    }
    return sum;
}

/// Euclidean inner product using Kahan (compensated) summation, which
/// recovers the low-order bits lost when adding small terms to a large sum.
/// @param[in] x first vector
/// @param[in] y second vector, must have the same length as x
/// @return compensated sum over x[i]*y[i]
double Kahan_skalar(std::vector<double> const &x, std::vector<double> const &y)
{
    assert(x.size() == y.size()); // added: same precondition check as scalar()
    double sum = 0;
    double c = 0;
    size_t n = x.size();
    for (size_t i = 0; i < n; ++i)
    {
        double z = x[i] * y[i] - c; // c is the part that got lost in the last iteration
        double t = sum + z;         // adding sum + z drops low digits if sum is large
        c = (t - sum) - z;          // recover the lost low digits for the next iteration
        sum = t;
    }
    return sum;
}
/// Uniform-ish random integer in [lower_bound, upper_bound].
/// Fixed: the original constructed and clock-seeded a fresh engine on every
/// call, so back-to-back calls within one clock tick returned the same value
/// (this function is called in a tight loop by the benchmark). The engine is
/// now seeded once and advanced across calls.
/// NOTE(review): the modulo mapping keeps the original slight bias toward
/// small values — acceptable for a benchmark, not for statistics.
/// @param[in] lower_bound smallest value returned (assumed >= 0)
/// @param[in] upper_bound largest value returned, >= lower_bound
/// @return pseudo-random value in [lower_bound, upper_bound]
size_t random_integer(int lower_bound, int upper_bound)
{
    static std::minstd_rand0 generator(
        static_cast<unsigned>(std::chrono::system_clock::now().time_since_epoch().count()));

    size_t const span = static_cast<size_t>(upper_bound - lower_bound + 1);
    return static_cast<size_t>(lower_bound) + generator() % span;
}
Binary files /dev/null and b/ex1/ex1F_goldbachs_conjecture/goldbach differ diff --git a/ex1/ex1F_goldbachs_conjecture/goldbach.cpp b/ex1/ex1F_goldbachs_conjecture/goldbach.cpp new file mode 100644 index 0000000..858d40b --- /dev/null +++ b/ex1/ex1F_goldbachs_conjecture/goldbach.cpp @@ -0,0 +1,42 @@ +#include "goldbach.h" + +size_t single_goldbach(size_t k) +{ + const std::vector relevant_primes = get_primes(k); + size_t m = relevant_primes.size(); + + size_t counter = 0; + + for(size_t i = 0; i < m; ++i) + { + for(size_t j = i; j < m; ++j) + { + if(relevant_primes[i] + relevant_primes[j] == k) + ++counter; + } + } + + return counter; +} + + +std::vector count_goldbach(size_t n) +{ + const std::vector relevant_primes = get_primes(n); + size_t m = relevant_primes.size(); + + std::vector counter_vector(n + 1, 0); + + + for(size_t i = 0; i < m; ++i) + { + for(size_t j = i; j < m; ++j) + { + size_t sum = relevant_primes[i] + relevant_primes[j]; + if(sum <= n) + ++counter_vector[relevant_primes[i] + relevant_primes[j]]; + } + } + + return counter_vector; +} diff --git a/ex1/ex1F_goldbachs_conjecture/goldbach.h b/ex1/ex1F_goldbachs_conjecture/goldbach.h new file mode 100644 index 0000000..5794ad4 --- /dev/null +++ b/ex1/ex1F_goldbachs_conjecture/goldbach.h @@ -0,0 +1,21 @@ +#pragma once +#include "mayer_primes.h" +#include +#include +#include +#include + + +/** + This function returns the number of possible decompositions of an integer into a sum of two prime numbers. + @param[in] k first integer + @param[out] count number of decompositions +*/ +size_t single_goldbach(size_t k); + +/** + This function returns the number of possible decompositions into a sum of two prime numbers of all even integers in the interval [4,n]. 
+ @param[in] n upper integer bound + @param[out] count_vector vector containing the number of decompositions for a natural number the corresponding index +*/ +std::vector count_goldbach(size_t n); \ No newline at end of file diff --git a/ex1/ex1F_goldbachs_conjecture/main b/ex1/ex1F_goldbachs_conjecture/main new file mode 100755 index 0000000..da03bf6 Binary files /dev/null and b/ex1/ex1F_goldbachs_conjecture/main differ diff --git a/ex1/ex1F_goldbachs_conjecture/main.cpp b/ex1/ex1F_goldbachs_conjecture/main.cpp new file mode 100644 index 0000000..e85654f --- /dev/null +++ b/ex1/ex1F_goldbachs_conjecture/main.cpp @@ -0,0 +1,37 @@ +#include "../utils/timing.h" +#include "goldbach.h" +#include +#include +using namespace std; + + +int main(int argc, char **argv) +{ + cout << "Check: 694 has "<< single_goldbach(694) << " decompositions." << endl << "----------------------------------------" << endl; + + + for(size_t n : {10000, 100000, 400000, 1000000, 2000000}) + { + tic(); + + auto goldbach_vector = count_goldbach(n); + + auto max_it = max_element(goldbach_vector.begin(), goldbach_vector.end()); + size_t max_number = distance(goldbach_vector.begin(), max_it); + + double time = toc(); + + cout << "The number " << max_number << " has " << *max_it << " decompositions. Duration: " << time << endl; + } + + /* + The number 9240 has 329 decompositions. Duration: 0.00572876 + The number 99330 has 2168 decompositions. Duration: 0.3342 + The number 390390 has 7094 decompositions. Duration: 4.23734 + The number 990990 has 15594 decompositions. Duration: 29.5817 + The number 1981980 has 27988 decompositions. 
Duration: 135.985 + */ + + + return 0; +} \ No newline at end of file diff --git a/ex1/ex1F_goldbachs_conjecture/mayer_primes.h b/ex1/ex1F_goldbachs_conjecture/mayer_primes.h new file mode 100644 index 0000000..6c6dc23 --- /dev/null +++ b/ex1/ex1F_goldbachs_conjecture/mayer_primes.h @@ -0,0 +1,73 @@ +#pragma once + +#include //memset +#include +//using namespace std; + +/** \brief Determines all prime numbers in interval [2, @p max]. + * + * The sieve of Eratosthenes is used. + * + * The implementation originates from Florian Mayer. + * + * \param[in] max end of interval for the prime number search. + * \return vector of prime numbers @f$2,3,5, ..., p<=max @f$. + * + * \copyright + * Copyright (c) 2008 Florian Mayer (adapted by Gundolf Haase 2018) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
/** \brief Determines all prime numbers in interval [2, @p max].
 *
 * The sieve of Eratosthenes is used, bit-packed one bit per candidate.
 *
 * Fixed: the sieve buffer was a raw new[]/delete[] char array, which leaked
 * whenever primes.push_back() threw; a std::vector gives RAII cleanup with
 * the identical memory layout.
 *
 * \param[in] max end of interval for the prime number search.
 * \return ascending vector of prime numbers @f$2,3,5, ..., p<=max @f$.
 */
template <class T>
std::vector<T> get_primes(T max)
{
    std::vector<T> primes;
    // One bit per candidate, all bits initially set ("is prime").
    std::vector<unsigned char> sieve(static_cast<std::size_t>(max / 8 + 1), 0xFF);
    for (T x = 2; x <= max; x++)
    {
        if (sieve[x / 8] & (0x01 << (x % 8)))
        {
            primes.push_back(x);
            // x is prime: mark all its multiples as composite.
            for (T j = 2 * x; j <= max; j += x)
            {
                sieve[j / 8] &= static_cast<unsigned char>(~(0x01 << (j % 8)));
            }
        }
    }
    return primes;
}
/// Rank-1 matrix A = u * v^T stored implicitly by its two factor vectors.
/// A has u.size() rows and v.size() columns with A(i,j) = u[i]*v[j];
/// the full matrix is never materialized.
class ProductMatrix
{
  private:
    std::vector<double> u; ///< column factor, length n
    std::vector<double> v; ///< row factor, length m
    size_t n;              ///< number of rows
    size_t m;              ///< number of columns

  public:
    /// Matrix-vector product y = A*x.
    /// Exploits the rank-1 structure: the dot product (v . x) is computed
    /// once and scaled per row — the original recomputed the identical sum
    /// for every row (O(n*m) instead of O(n+m), same floating-point result).
    /// Loop indices are size_t; the original used int against size_t bounds
    /// (signed/unsigned mismatch, truncation for n > INT_MAX).
    /// @param[in] x vector of length m
    /// @return vector of length n
    std::vector<double> Mult(const std::vector<double> &x) const
    {
        double dot = 0.0;
        for (size_t j = 0; j < m; ++j)
        {
            dot += v[j] * x[j];
        }
        std::vector<double> y(n, 0);
        for (size_t i = 0; i < n; ++i)
        {
            y[i] = u[i] * dot;
        }
        return y;
    }

    /// Transposed product x = A^T*y, again via a single dot product (y . u).
    /// @param[in] y vector of length n
    /// @return vector of length m
    std::vector<double> MultT(const std::vector<double> &y) const
    {
        double dot = 0.0;
        for (size_t i = 0; i < n; ++i)
        {
            dot += y[i] * u[i];
        }
        std::vector<double> x(m, 0);
        for (size_t j = 0; j < m; ++j)
        {
            x[j] = v[j] * dot;
        }
        return x;
    }

    /// Build A = u*v^T from its factors (both vectors are copied).
    /// @param[in] u column factor, defines the row count
    /// @param[in] v row factor, defines the column count
    ProductMatrix(const std::vector<double> &u, const std::vector<double> &v)
        : u(u), v(v), n(u.size()), m(v.size())
    {
    }
};
cout << f1[i] << endl; + cout << endl; + + + for(size_t j = 0; j < f2.size(); ++j) + cout << f2[j] << " "; + cout << endl << "-------------------------------------------------" << endl; + + // c) ------------------------------------------------------------------------------------------------------ + size_t n = pow(10,3); + DenseMatrix const M_1(n,n); + vector x(n, 1.0); + + size_t n_loops = 100; + vector y_1; + vector y_2; + + double time_1 = 0; + double time_2 = 0; + + tic(); + for(int l = 0; l < n_loops; ++l) + y_1 = M_1.Mult(x); + time_1 += toc(); + + tic(); + for(int l = 0; l < n_loops; ++l) + y_2 = M_1.MultT(x); + time_2 += toc(); + + vector error_vec(n,0); + for(int i = 0; i < n; ++i) + error_vec[i] = abs(y_1[i] - y_2[i]); + double sup_error = *max_element(error_vec.begin(), error_vec.end()); + + + cout << "n = " << n << endl; + cout << "Average duration for Mult: " << time_1/n_loops << endl; + cout << "Average duration for MultT: " << time_2/n_loops << endl; + cout << "sup-error: " << sup_error << endl; + cout << "-------------------------------------------------" << endl; + + // d) ------------------------------------------------------------------------------------------------------ + vector u_M(n,0); + for(int i = 0; i < n; ++i) + u_M[i] = sigmoid(x_entry(i, n)); + + ProductMatrix const M_2(u_M, u_M); + + time_1 = 0; + time_2 = 0; + + tic(); + for(int l = 0; l < n_loops; ++l) + y_1 = M_2.Mult(x); + time_1 += toc(); + + tic(); + for(int l = 0; l < n_loops; ++l) + y_2 = M_2.MultT(x); + time_2 += toc(); + + for(int i = 0; i < n; ++i) + error_vec[i] = abs(y_1[i] - y_2[i]); + sup_error = *max_element(error_vec.begin(), error_vec.end()); + + + cout << "n = " << n << endl; + cout << "Average duration for Mult: " << time_1/n_loops << endl; + cout << "Average duration for MultT: " << time_2/n_loops << endl; + cout << "sup-error: " << sup_error << endl; + cout << "-------------------------------------------------" << endl; + + + return 0; +} diff --git 
/// Logistic function 1/(1 + e^{-x}); maps the real line onto (0, 1).
/// Marked `inline`: this header is included from multiple files
/// (DenseMatrix.h and main.cpp), and a non-inline function definition in a
/// header violates the one-definition rule once two translation units
/// include it — a latent multiple-definition link error.
inline double sigmoid(double x)
{
    return 1. / (1. + std::exp(-x));
}

/// k-th node of a uniform grid of nm points spanning [-5, 5].
/// Precondition: nm >= 2 (nm == 1 would divide by zero).
/// @param[in] k  node index, 0 <= k < nm
/// @param[in] nm total number of grid points
/// @return grid coordinate in [-5, 5]
inline double x_entry(size_t k, size_t nm)
{
    return (10. * k) / (nm - 1) - 5.;
}
"ratio": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "format": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "limits": "cpp", + "mutex": "cpp", + "new": "cpp", + "numbers": "cpp", + "ostream": "cpp", + "semaphore": "cpp", + "span": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "stop_token": "cpp", + "streambuf": "cpp", + "thread": "cpp", + "cinttypes": "cpp", + "typeindex": "cpp", + "typeinfo": "cpp", + "variant": "cpp", + "cassert": "cpp", + "list": "cpp", + "fstream": "cpp" + } +} \ No newline at end of file diff --git a/ex3/CLANG_default.mk b/ex3/CLANG_default.mk new file mode 100644 index 0000000..4bc290d --- /dev/null +++ b/ex3/CLANG_default.mk @@ -0,0 +1,123 @@ +# Basic Defintions for using GNU-compiler suite sequentially +# requires setting of COMPILER=CLANG_ + +#CLANGPATH=//usr/lib/llvm-10/bin/ +CC = ${CLANGPATH}clang +CXX = ${CLANGPATH}clang++ +#CXX = ${CLANGPATH}clang++ -lomptarget -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=/opt/pgi/linux86-64/2017/cuda/8.0 +#F77 = gfortran +LINKER = ${CXX} + +#http://clang.llvm.org/docs/UsersManual.html#options-to-control-error-and-warning-messages +WARNINGS += -Weverything -Wno-c++98-compat -Wno-sign-conversion -Wno-date-time -Wno-shorten-64-to-32 -Wno-padded -ferror-limit=1 +WARNINGS += -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic +#-fsyntax-only -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic + +CXXFLAGS += -O3 -std=c++17 -ferror-limit=1 ${WARNINGS} +# don't use -Ofast +# -ftrapv +LINKFLAGS += -O3 + +# different libraries in Ubuntu or manajaró +ifndef UBUNTU +UBUNTU=1 +endif + +# BLAS, LAPACK +LINKFLAGS += -llapack -lblas +# -lopenblas +ifeq ($(UBUNTU),1) +# ubuntu +else +# on archlinux +LINKFLAGS += -lcblas +endif + +# interprocedural optimization +CXXFLAGS += -flto +LINKFLAGS += -flto + +# very good check +# 
http://clang.llvm.org/extra/clang-tidy/ +# good check, see: http://llvm.org/docs/CodingStandards.html#include-style +SWITCH_OFF=,-readability-magic-numbers,-readability-redundant-control-flow,-readability-redundant-member-init +SWITCH_OFF+=,-readability-redundant-member-init,-readability-isolate-declaration +#READABILITY=,readability*${SWITCH_OFF} +#TIDYFLAGS = -checks=llvm-*,-llvm-header-guard -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp" +TIDYFLAGS = -checks=llvm-*,-llvm-header-guard${READABILITY} -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp" +#TIDYFLAGS += -checks='modernize* +# ??? +#TIDYFLAGS = -checks='cert*' -header-filter=.* +# MPI checks ?? +#TIDYFLAGS = -checks='mpi*' +# ?? +#TIDYFLAGS = -checks='performance*' -header-filter=.* +#TIDYFLAGS = -checks='portability-*' -header-filter=.* +#TIDYFLAGS = -checks='readability-*' -header-filter=.* + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + @rm -f ${PROGRAM} ${OBJECTS} + +clean_all:: clean + @rm -f *_ *~ *.bak *.log *.out *.tar + +codecheck: tidy_check +tidy_check: + clang-tidy ${SOURCES} ${TIDYFLAGS} -- ${SOURCES} +# see also http://clang-developers.42468.n3.nabble.com/Error-while-trying-to-load-a-compilation-database-td4049722.html + +run: clean ${PROGRAM} +# time ./${PROGRAM} ${PARAMS} + ./${PROGRAM} ${PARAMS} + +# tar the current directory +MY_DIR = `basename ${PWD}` +tar: clean_all + @echo "Tar the directory: " ${MY_DIR} + @cd .. 
;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * + +doc: + doxygen Doxyfile + +######################################################################### + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# some tools +# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags) +cache: ${PROGRAM} + valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS} +# kcachegrind callgrind.out. & + kcachegrind `ls -1tr callgrind.out.* |tail -1` + +# Check for wrong memory accesses, memory leaks, ... +# use smaller data sets +mem: ${PROGRAM} + valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS} + +# Simple run time profiling of your code +# CXXFLAGS += -g -pg +# LINKFLAGS += -pg +prof: ${PROGRAM} + perf record ./$^ ${PARAMS} + perf report +# gprof -b ./$^ > gp.out +# kprof -f gp.out -p gprof & + +codecheck: tidy_check diff --git a/ex3/GCC_AMD32_default.mk b/ex3/GCC_AMD32_default.mk new file mode 100644 index 0000000..a911b6b --- /dev/null +++ b/ex3/GCC_AMD32_default.mk @@ -0,0 +1,130 @@ +# Basic Defintions for using GNU-compiler suite sequentially +# requires setting of COMPILER=GCC_ + +CC = gcc +CXX = g++ +F77 = gfortran +LINKER = ${CXX} + +# on mephisto: +#CXXFLAGS += -I/share/apps/atlas/include +#LINKFLAGS += -L/share/apps/atlas/lib +#LINKFLAGS += -lcblas -latlas + +#LINKFLAGS += -lblas +# Der Header muss mit extern "C" versehen werden, damit g++ alles findet. 
+ + +#WARNINGS = -pedantic -pedantic-errors -Wall -Wextra -Werror -Wconversion -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow +WARNINGS = -pedantic -Wall -Wextra -Wconversion -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \ + -Wredundant-decls -Winline -fmax-errors=1 +# -Wunreachable-code +# -Wunreachable-code +CXXFLAGS += -ffast-math -O3 -march=native -std=c++17 ${WARNINGS} +#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS} +#-msse3 +# -ftree-vectorizer-verbose=2 -DNDEBUG +# -ftree-vectorizer-verbose=5 +# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump -fdump-tree-pre=stderr + +# CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details +# CFLAGS = -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2 +# #CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp +# FFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp +# LFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp +LINKFLAGS += -O3 + +# BLAS, LAPACK +OPENBLAS_DIR = /opt/openblas_GCCseq +#OPENBLAS_DIR = /opt/openblas_GCC +OPENBLAS_LIBDIR = ${OPENBLAS_DIR}/lib +OPENBLAS_INCDIR = ${OPENBLAS_DIR}/include +CXXFLAGS += -I${OPENBLAS_INCDIR} +LINKFLAGS += -L${OPENBLAS_LIBDIR} -lopenblas + +# interprocedural optimization +CXXFLAGS += -flto +LINKFLAGS += -flto + +# profiling tools +#CXXFLAGS += -pg +#LINKFLAGS += -pg + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + @rm -f ${PROGRAM} ${OBJECTS} + +clean_all:: clean + -@rm -f *_ *~ *.bak *.log *.out *.tar *.orig + -@rm -r html + +run: clean ${PROGRAM} +# time ./${PROGRAM} +# ./${PROGRAM} + ( export LD_LIBRARY_PATH=${OPENBLAS_LIBDIR}:${LD_LIBRARY_PATH} ; ./${PROGRAM} ) +# or 'export LD_LIBRARY_PATH=/opt/openblas_gcc/lib:${LD_LIBRARY_PATH}' in your ~/.bashrc + +# tar the current directory +MY_DIR = `basename ${PWD}` +tar: + @echo "Tar the directory: " ${MY_DIR} + @cd .. 
;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * + +doc: + doxygen Doxyfile + +######################################################################### + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# some tools +# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags) +cache: ${PROGRAM} + valgrind --tool=callgrind --simulate-cache=yes ./$^ +# kcachegrind callgrind.out. & + kcachegrind `ls -1tr callgrind.out.* |tail -1` + +# Check for wrong memory accesses, memory leaks, ... +# use smaller data sets +# no "-pg" in compile/link options +mem: ${PROGRAM} + valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ + +# Simple run time profiling of your code +# CXXFLAGS += -g -pg +# LINKFLAGS += -pg +prof: ${PROGRAM} + ./$^ + gprof -b ./$^ > gp.out +# kprof -f gp.out -p gprof & + +#Trace your heap: +#> heaptrack ./main.GCC_ +#> heaptrack_gui heaptrack.main.GCC_..gz +heap: ${PROGRAM} + heaptrack ./$^ 11 + heaptrack_gui `ls -1tr heaptrack.$^.* |tail -1` & + + + +######################################################################## +# get the detailed status of all optimization flags +info: + echo "detailed status of all optimization flags" + $(CXX) --version + $(CXX) -Q $(CXXFLAGS) --help=optimizers diff --git a/ex3/GCC_default.mk b/ex3/GCC_default.mk new file mode 100644 index 0000000..803c060 --- /dev/null +++ b/ex3/GCC_default.mk @@ -0,0 +1,183 @@ +# Basic Defintions for using GNU-compiler suite sequentially +# requires setting of COMPILER=GCC_ + +CC = gcc +CXX = g++ +F77 = gfortran +LINKER = ${CXX} + +WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \ + -Wredundant-decls -Winline -fmax-errors=1 
+# -Wunreachable-code +CXXFLAGS += -ffast-math -O3 -march=native -std=c++17 ${WARNINGS} +#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS} +#-msse3 +# -ftree-vectorizer-verbose=2 -DNDEBUG +# -ftree-vectorizer-verbose=5 +# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump -fdump-tree-pre=stderr + +# CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details +# CFLAGS = -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2 +# #CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp +# FFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp +# LFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp +LINKFLAGS += -O3 + +#architecture +#CPU = -march=znver2 +CXXFLAGS += ${CPU} +LINKFLAGS += ${CPU} + +# different libraries in Ubuntu or manajaró +ifndef UBUNTU +UBUNTU=1 +endif + +# BLAS, LAPACK +ifeq ($(UBUNTU),1) +LINKFLAGS += -llapack -lblas +# -lopenblas +else +# on archlinux +LINKFLAGS += -llapack -lopenblas -lcblas +endif + +# interprocedural optimization +CXXFLAGS += -flto +LINKFLAGS += -flto + +# for debugging purpose (save code) +# -fsanitize=leak # only one out the three can be used +# -fsanitize=address +# -fsanitize=thread +SANITARY = -fsanitize=address -fsanitize=undefined -fsanitize=null -fsanitize=return \ + -fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \ + -fsanitize=bool -fsanitize=enum -fsanitize=vptr +#CXXFLAGS += ${SANITARY} +#LINKFLAGS += ${SANITARY} + +# profiling tools +#CXXFLAGS += -pg +#LINKFLAGS += -pg + + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + @rm -f ${PROGRAM} ${OBJECTS} + +clean_all:: clean + -@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt + -@rm -rf html + +run: clean ${PROGRAM} +#run: ${PROGRAM} +# time ./${PROGRAM} ${PARAMS} + ./${PROGRAM} ${PARAMS} + +# tar the current directory +MY_DIR = `basename ${PWD}` +tar: clean_all + @echo "Tar the directory: " ${MY_DIR} + @cd .. 
;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * +#find . -size +10M > large_files +#--exclude-from ${MY_DIR}/large_files + +zip: clean + @echo "Zip the directory: " ${MY_DIR} + @cd .. ;\ + zip -r ${MY_DIR}.zip ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} + +doc: + doxygen Doxyfile + +######################################################################### +.SUFFIXES: .f90 + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< +# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $<.log +# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $(<:.cpp=.log) + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +.f90.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# some tools +# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags) +cache: ${PROGRAM} + valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS} +# kcachegrind callgrind.out. & + kcachegrind `ls -1tr callgrind.out.* |tail -1` + +# Check for wrong memory accesses, memory leaks, ... 
+# use smaller data sets +# no "-pg" in compile/link options +mem: ${PROGRAM} + valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS} +# Graphical interface +# valkyrie + +# Simple run time profiling of your code +# CXXFLAGS += -g -pg +# LINKFLAGS += -pg +prof: ${PROGRAM} + perf record ./$^ ${PARAMS} + perf report +# gprof -b ./$^ > gp.out +# kprof -f gp.out -p gprof & + +# perf in Ubuntu 20.04: https://www.howtoforge.com/how-to-install-perf-performance-analysis-tool-on-ubuntu-20-04/ +# * install +# * sudo vi /etc/sysctl.conf +# add kernel.perf_event_paranoid = 0 + +#Trace your heap: +#> heaptrack ./main.GCC_ +#> heaptrack_gui heaptrack.main.GCC_..gz +heap: ${PROGRAM} + heaptrack ./$^ ${PARAMS} + heaptrack_gui `ls -1tr heaptrack.$^.* |tail -1` & + +codecheck: $(SOURCES) + cppcheck --enable=all --inconclusive --std=c++17 --suppress=missingIncludeSystem $^ + + +######################################################################## +# get the detailed status of all optimization flags +info: + echo "detailed status of all optimization flags" + $(CXX) --version + $(CXX) -Q $(CXXFLAGS) --help=optimizers + lscpu + inxi -C + lstopo + +# Excellent hardware info +# hardinfo +# Life monitoring of CPU frequency etc. 
+# sudo i7z + +# Memory consumption +# vmstat -at -SM 3 +# xfce4-taskmanager + + +# https://www.tecmint.com/check-linux-cpu-information/ +#https://www.tecmint.com/monitor-cpu-and-gpu-temperature-in-ubuntu/ + +# Debugging: +# https://wiki.archlinux.org/index.php/Debugging diff --git a/ex3/ICC_default.mk b/ex3/ICC_default.mk new file mode 100644 index 0000000..d4bd4db --- /dev/null +++ b/ex3/ICC_default.mk @@ -0,0 +1,137 @@ +# Basic Defintions for using INTEL compiler suite sequentially +# requires setting of COMPILER=ICC_ + +#BINDIR = /opt/intel/bin/ + +# special on my sony [GH] +#BINDIR = /opt/save.intel/bin/ +# very special on my sony [GH] +# FIND_LIBS = -L /opt/save.intel/composer_xe_2013.1.117/mkl/lib/intel64/libmkl_intel_lp64.so + +# Error with g++-4.8 using icpc14.0,x: +# find directory wherein bits/c++config.h is located +# 'locate bits/c++config.h' +#FOUND_CONFIG = -I/usr/include/x86_64-linux-gnu/c++/4.8 + + +CC = ${BINDIR}icc +CXX = ${BINDIR}icpc +F77 = ${BINDIR}ifort +LINKER = ${CXX} + + +WARNINGS = -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -wd2015,2012 -wn3 +# -Winline -Wredundant-decls -Wunreachable-code +CXXFLAGS += -O3 -fargument-noalias -std=c++17 -DNDEBUG ${WARNINGS} -mkl ${FOUND_CONFIG} +# profiling tools +#CXXFLAGS += -pg +#LINKFLAGS += -pg +# -vec-report=3 +# -qopt-report=5 -qopt-report-phase=vec +# -guide -parallel +# -guide-opts=string -guide-par[=n] -guide-vec[=n] +# -auto-p32 -simd +CXXFLAGS += -align + +# use MKL by INTEL +# https://software.intel.com/content/www/us/en/develop/documentation/mkl-linux-developer-guide/top/linking-your-application-with-the-intel-math-kernel-library/linking-quick-start/using-the-mkl-compiler-option.html +# https://software.intel.com/content/www/us/en/develop/articles/intel-mkl-link-line-advisor.html +# LINKFLAGS += -L${BINDIR}../composer_xe_2013.1.117/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread +#LINKFLAGS += -O3 -L/opt/intel/mkl/lib -mkl +LINKFLAGS += -O3 
-mkl=sequential + +# interprocedural optimization +CXXFLAGS += -ipo +LINKFLAGS += -ipo + +# annotated assembler file +ANNOTED = -fsource-asm -S + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + rm -f ${PROGRAM} ${OBJECTS} + +clean_all:: clean + @rm -f *_ *~ *.bak *.log *.out *.tar + +run: clean ${PROGRAM} + ./${PROGRAM} + +# tar the current directory +MY_DIR = `basename ${PWD}` +tar: clean_all + @echo "Tar the directory: " ${MY_DIR} + @cd .. ;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * + +doc: + doxygen Doxyfile + +######################################################################### + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# # some tools +# # Cache behaviour (CXXFLAGS += -g tracks down to source lines) +# cache: ${PROGRAM} +# valgrind --tool=callgrind --simulate-cache=yes ./$^ +# # kcachegrind callgrind.out. & +# +# # Check for wrong memory accesses, memory leaks, ... +# # use smaller data sets +# mem: ${PROGRAM} +# valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ +# +# # Simple run time profiling of your code +# # CXXFLAGS += -g -pg +# # LINKFLAGS += -pg +# prof: ${PROGRAM} +# ./$^ +# gprof -b ./$^ > gp.out +# # kprof -f gp.out -p gprof & +# + + +mem: inspector +prof: amplifier +cache: amplifier + +gap_par_report: + ${CXX} -c -guide -parallel $(SOURCES) 2> gap.txt + +# GUI for performance report +amplifier: ${PROGRAM} + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope +# alternatively to the solution above: + #edit file /etc/sysctl.d/10-ptrace.conf and set variable kernel.yama.ptrace_scope variable to 0 . 
+ amplxe-gui & + +# GUI for Memory and Thread analyzer (race condition) +inspector: ${PROGRAM} + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope + inspxe-gui & + +advisor: + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope + advixe-gui & + +icc-info: + icpc -# main.cpp + + + + diff --git a/ex3/ONEAPI_default.mk b/ex3/ONEAPI_default.mk new file mode 100644 index 0000000..fe7b3fe --- /dev/null +++ b/ex3/ONEAPI_default.mk @@ -0,0 +1,176 @@ +# Basic Defintions for using INTEL compiler suite sequentially +# requires setting of COMPILER=ONEAPI_ + +# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html +# requires +# source /opt/intel/oneapi/setvars.sh +# on AMD: export MKL_DEBUG_CPU_TYPE=5 + +#BINDIR = /opt/intel/oneapi/compiler/latest/linux/bin/ +#MKL_ROOT = /opt/intel/oneapi/mkl/latest/ +#export KMP_AFFINITY=verbose,compact + +CC = ${BINDIR}icc +CXX = ${BINDIR}dpcpp +F77 = ${BINDIR}ifort +LINKER = ${CXX} + +## Compiler flags +WARNINGS = -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -pedantic +WARNINGS += -Wpessimizing-move -Wredundant-move +#-wd2015,2012,2014 -wn3 +# -Winline -Wredundant-decls -Wunreachable-code +# -qopt-subscript-in-range +# -vec-threshold0 + +CXXFLAGS += -O3 -std=c++17 ${WARNINGS} +#CXXFLAGS += -DMKL_ILP64 -I"${MKLROOT}/include" +#CXXFLAGS += -DMKL_ILP32 -I"${MKLROOT}/include" +LINKFLAGS += -O3 + +# interprocedural optimization +CXXFLAGS += -ipo +LINKFLAGS += -ipo +LINKFLAGS += -flto + +# annotated Assembler file +ANNOTED = -fsource-asm -S + +#architecture +CPU = -march=core-avx2 +#CPU += -mtp=zen +# -xCORE-AVX2 +# -axcode COMMON-AVX512 -axcode MIC-AVX512 -axcode CORE-AVX512 -axcode CORE-AVX2 +CXXFLAGS += ${CPU} +LINKFLAGS += ${CPU} + +# use MKL by INTEL +# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html +# sequential MKL +# use the 32 bit interface (LP64) instead of 64 bit interface (ILP64) +CXXFLAGS += 
-qmkl=sequential -UMKL_ILP64 +LINKFLAGS += -O3 -qmkl=sequential -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread +#LINKFLAGS += -O3 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread + +# shared libs: https://aur.archlinux.org/packages/intel-oneapi-compiler-static +# install intel-oneapi-compiler-static +# or +LINKFLAGS += -shared-intel + + +OPENMP = -qopenmp +CXXFLAGS += ${OPENMP} +LINKFLAGS += ${OPENMP} + + +# profiling tools +#CXXFLAGS += -pg +#LINKFLAGS += -pg +# -vec-report=3 +# -qopt-report=5 -qopt-report-phase=vec -qopt-report-phase=openmp +# -guide -parallel +# -guide-opts=string -guide-par[=n] -guide-vec[=n] +# -auto-p32 -simd + +# Reports: https://software.intel.com/en-us/articles/getting-the-most-out-of-your-intel-compiler-with-the-new-optimization-reports +#CXXFLAGS += -qopt-report=5 -qopt-report-phase=vec,par +#CXXFLAGS += -qopt-report=5 -qopt-report-phase=cg +# Redirect report from *.optrpt to stderr +# -qopt-report-file=stderr +# Guided paralellization +# -guide -parallel +# -guide-opts=string -guide-par[=n] -guide-vec[=n] +# -auto-p32 -simd + +## run time checks +# https://www.intel.com/content/www/us/en/develop/documentation/fortran-compiler-oneapi-dev-guide-and-reference/top/compiler-reference/compiler-options/offload-openmp-and-parallel-processing-options/par-runtime-control-qpar-runtime-control.html + + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + rm -f ${PROGRAM} ${OBJECTS} *.optrpt + +clean_all:: clean + @rm -f *_ *~ *.bak *.log *.out *.tar + +run: clean ${PROGRAM} + ./${PROGRAM} + +# tar the current directory +MY_DIR = `basename ${PWD}` +tar: clean_all + @echo "Tar the directory: " ${MY_DIR} + @cd .. 
;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * + +doc: + doxygen Doxyfile + +######################################################################### + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# some tools +# Cache behaviour (CXXFLAGS += -g tracks down to source lines) +# https://software.intel.com/content/www/us/en/develop/documentation/vtune-help/top/analyze-performance/microarchitecture-analysis-group/memory-access-analysis.html + +mem: inspector +prof: vtune +cache: inspector + +gap_par_report: + ${CXX} -c -guide -parallel $(SOURCES) 2> gap.txt + +# GUI for performance report +amplifier: ${PROGRAM} + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope + echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid + amplxe-gui & + +# GUI for Memory and Thread analyzer (race condition) +inspector: ${PROGRAM} + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope +# inspxe-gui & + vtune-gui ./${PROGRAM} & + +advisor: + source /opt/intel/oneapi/advisor/2021.2.0/advixe-vars.sh +# /opt/intel/oneapi/advisor/latest/bin64/advixe-gui & + advisor --collect=survey ./${PROGRAM} +# advisor --collect=roofline ./${PROGRAM} + advisor --report=survey --project-dir=./ src:r=./ --format=csv --report-output=./out/survey.csv + +vtune: + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope +# https://software.intel.com/en-us/articles/intel-advisor-2017-update-1-what-s-new + export ADVIXE_EXPERIMENTAL=roofline + vtune -collect hotspots ./${PROGRAM} + vtune -report hotspots -r r000hs > vtune.out +# vtune-gui ./${PROGRAM} & + +icc-info: + icpc -# main.cpp + +# MKL on AMD +# https://www.computerbase.de/2019-11/mkl-workaround-erhoeht-leistung-auf-amd-ryzen/ +# +# https://sites.google.com/a/uci.edu/mingru-yang/programming/mkl-has-bad-performance-on-an-amd-cpu +# 
export MKL_DEBUG_CPU_TYPE=5 +# export MKL_NUM_THRAEDS=1 +# export MKL_DYNAMIC=false +# on Intel compiler +# http://publicclu2.blogspot.com/2013/05/intel-complier-suite-reference-card.html diff --git a/ex3/PGI_default.mk b/ex3/PGI_default.mk new file mode 100644 index 0000000..40760e5 --- /dev/null +++ b/ex3/PGI_default.mk @@ -0,0 +1,93 @@ +# Basic Defintions for using PGI-compiler suite sequentially +# requires setting of COMPILER=PGI_ +# OPTIRUN = optirun + + +CC = pgcc +CXX = pgc++ +F77 = pgfortran +LINKER = ${CXX} + +# on mephisto: +#CXXFLAGS += -I/share/apps/atlas/include +#LINKFLAGS += -L/share/apps/atlas/lib +#LINKFLAGS += -lcblas -latlas + +#LINKFLAGS += -lblas +# Der Header muss mit extern "C" versehen werden, damit g++ alles findet. + +WARNINGS = -Minform=warn +# -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -W -Wfloat-equal -Wshadow -Wredundant-decls +# -pedantic -Wunreachable-code -Wextra -Winline +# -Wunreachable-code + +#PGI_PROFILING = -Minfo=ccff,loop,vect,opt,intensity,mp,accel +PGI_PROFILING = -Minfo=ccff,accel,ipa,loop,lre,mp,opt,par,unified,vect,intensity +# -Minfo +# -Mprof=time +# -Mprof=lines +# take care with option -Msafeptr +CXXFLAGS += -O3 -std=c++17 ${WARNINGS} +#CXXFLAGS += -O3 -std=c++11 -DNDEBUG ${PGI_PROFILING} ${WARNINGS} +# -fastsse -fargument-noalias ${WARNINGS} -msse3 -vec-report=3 + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + @rm -f ${PROGRAM} ${OBJECTS} + +clean_all:: clean + @rm -f *_ *~ *.bak *.log *.out *.tar + +run: clean ${PROGRAM} + ./${PROGRAM} + +# tar the current directory +MY_DIR = `basename ${PWD}` +tar: clean_all + @echo "Tar the directory: " ${MY_DIR} + @cd .. 
;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * + +doc: + doxygen Doxyfile + +######################################################################### + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# # some tools +# # Simple run time profiling of your code +# # CXXFLAGS += -g -pg +# # LINKFLAGS += -pg + + +# Profiling options PGI, see: pgcollect -help +# CPU_PROF = -allcache +CPU_PROF = -time +# GPU_PROF = -cuda=gmem,branch,cc13 -cudainit +#GPU_PROF = -cuda=branch:cc20 +# +PROF_FILE = pgprof.out + +cache: prof + +prof: ${PROGRAM} + ${OPTIRUN} ${BINDIR}pgcollect $(CPU_PROF) ./$^ + ${OPTIRUN} ${BINDIR}pgprof -exe ./$^ $(PROF_FILE) & + +info: + pgaccelinfo -v diff --git a/ex3/ex3/Makefile b/ex3/ex3/Makefile new file mode 100644 index 0000000..4b0446d --- /dev/null +++ b/ex3/ex3/Makefile @@ -0,0 +1,30 @@ +# +# use GNU-Compiler tools +COMPILER=GCC_ +# alternatively from the shell +# export COMPILER=GCC_ +# or, alternatively from the shell +# make COMPILER=GCC_ + +# use Intel compilers +#COMPILER=ICC_ + +# use PGI compilers +# COMPILER=PGI_ + + +SOURCES = main.cpp benchmarks.cpp benchmark_tests.cpp factorization_solve.cpp factorization_solve_tests.cpp vdop.cpp getmatrix.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +PROGRAM = main.${COMPILER} + +# uncomment the next to lines for debugging and detailed performance analysis +CXXFLAGS += -g +LINKFLAGS += -g +# do not use -pg with PGI compilers + +ifndef COMPILER + COMPILER=GCC_ +endif + +include ../${COMPILER}default.mk diff --git a/ex3/ex3/benchmark_tests.cpp b/ex3/ex3/benchmark_tests.cpp new file mode 100644 index 0000000..1a873d5 --- /dev/null +++ b/ex3/ex3/benchmark_tests.cpp @@ -0,0 +1,333 @@ +#include "benchmark_tests.h" +#include "benchmarks.h" +#include +#include +#include +using namespace 
std::chrono; + +vector test_A(const size_t &NLOOPS, const size_t &N, const function&, const vector&)>& scalar_function) +{ + cout << "#################### (A) ####################" << endl; + cout << "\nLOOPS = " << NLOOPS << endl; + cout << "\nN = " << N << endl; + + +// Memory allocation + cout << "Memory allocation\n"; + + vector x(N), y(N); + + cout.precision(2); + cout << 2.0*N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n"; + cout.precision(6); + + +// Data initialization +// Special: x_i = i+1; y_i = 1/x_i ==> == N + + for (size_t i = 0; i < N; ++i) + { + x[i] = i % 219 + 1; + y[i] = 1.0/x[i]; + } + + + cout << "\nStart Benchmarking scalar\n"; + + auto t1 = system_clock::now(); // start timer +// Do calculation + double check(0.0),ss(0.0); + for (size_t i = 0; i < NLOOPS; ++i) + { + check = scalar_function(x, y); + ss += check; // prevents the optimizer from removing unused calculation results. + } + + auto t2 = system_clock::now(); // stop timer + auto duration = duration_cast(t2 - t1); // duration in microseconds + double t_diff = static_cast(duration.count()) / 1e6; // overall duration in seconds + t_diff = t_diff/NLOOPS; // duration per loop seconds + + + +// Check the correct result + cout << "\n = " << check << endl; + if (static_cast(check) != N) + cout << " !! W R O N G result !!\n"; + cout << endl; + + +// Timings and Performance + cout << endl; + cout.precision(2); + + + double Gflops = 2.0*N / t_diff / 1024 / 1024 / 1024; + double MemBandwidth = 2.0*N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]); + + cout << "Total duration : " << t_diff*NLOOPS << endl; + cout << "Timing in sec. 
: " << t_diff << endl; + cout << "GFLOPS : " << Gflops << endl; + cout << "GiByte/s : " << MemBandwidth << endl; + +//########################################################################## + cout << "\nStart Benchmarking norm\n"; + + auto t3 = system_clock::now(); // start timer +// Do calculation + double ss2(0.0); + for (size_t i = 0; i < NLOOPS; ++i) + { + auto sk1 = sqrt(scalar(x, x)); + ss2 += sk1; // prevents the optimizer from removing unused calculation results. + } + + auto t4 = system_clock::now(); // stop timer + auto duration2 = duration_cast(t4 - t3); // duration in microseconds + double t_diff2 = static_cast(duration2.count()) / 1e6; // overall duration in seconds + t_diff2 = t_diff2/NLOOPS; // duration per loop seconds + + + cout << "ss(norm): " << ss2 << endl; + cout << "Timing in sec. : " << t_diff2 << endl; + + + + + + + return vector{t_diff, Gflops, MemBandwidth}; +} + + +vector test_B(const size_t &NLOOPS, const size_t &N, const size_t &M, const function(const vector&, const vector&)>& MatVec_function) +{ + cout << "#################### (B) ####################" << endl; + + cout << "\nLOOPS = " << NLOOPS << endl; + cout << "\nN = " << N << endl; + cout << "\nM = " << M << endl; + +// Memory allocation + cout << "Memory allocation\n"; + + vector A(M*N); + vector x(N); + + cout.precision(2); + cout << (1.0*M*N + N) * sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n"; + cout.precision(6); + +// Data initialization + + for (size_t i = 0; i < M; ++i) + for (size_t j = 0; j < N; ++j) + A[N*i + j] = (i + j) % 219 + 1; + + + for (size_t j = 0; j < N; ++j) + { + x[j] = 1.0/A[N*17 + j]; + } + + cout << "\nStart Benchmarking MatVec\n"; + + auto t1 = system_clock::now(); // start timer +// Do calculation + vector b(M); + + for (size_t i = 0; i < NLOOPS; ++i) + { + b = MatVec_function(A, x); + } + + auto t2 = system_clock::now(); // stop timer + auto duration = duration_cast(t2 - t1); // duration in microseconds + double t_diff = 
static_cast(duration.count()) / 1e6; // overall duration in seconds + t_diff = t_diff/NLOOPS; // duration per loop seconds + + +// Check the correct result + cout << "\n = " << b[17] << endl; + if (static_cast(b[17]) != N) + { + cout << " !! W R O N G result !!\n"; + } + cout << endl; + + +// Timings and Performance + cout << endl; + cout.precision(2); + + double Gflops = (2.0*N*M) / t_diff / 1024 / 1024 / 1024; + double MemBandwidth = (2.0*N*M + M)/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]); + + cout << "Total duration : " << t_diff*NLOOPS << endl; + cout << "Timing in sec. : " << t_diff << endl; + cout << "GFLOPS : " << Gflops << endl; + cout << "GiByte/s : " << MemBandwidth << endl; + + + + return vector{t_diff, Gflops, MemBandwidth}; +} + + +vector test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N, const function(const vector&, const vector&, size_t const &shared_dim)>& MatMat_function) +{ + cout << "#################### (C) ####################" << endl; + cout << "\nLOOPS = " << NLOOPS << endl; + cout << "\nL = " << L << endl; + cout << "\nM = " << M << endl; + cout << "\nN = " << N << endl; + + +// Memory allocation + cout << "Memory allocation\n"; + + vector A(M*L); + vector B(L*N); + + cout.precision(2); + cout << (1.0*M*L + L*N) *sizeof(A[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n"; + cout.precision(6); + + +// Data initialization + + for (size_t i = 0; i < M; ++i) + for (size_t k = 0; k < L; ++k) + A[L*i + k] = (i + k) % 219 + 1; + + for (size_t k = 0; k < L; ++k) + for (size_t j = 0; j < N; ++j) + B[N*k + j] = 1.0/A[L*17 + k]; + + + cout << "\nStart Benchmarking MatMat\n"; + + auto t1 = system_clock::now(); // start timer +// Do calculation + vector C(M*N); + double check; + double check_sum = 0; + + for (size_t i = 0; i < NLOOPS; ++i) + { + C = MatMat_function(A, B, L); + + check = C[N*17]; + check_sum += check; // prevents the optimizer from removing unused calculation results. 
+ } + cout << check_sum; + + auto t2 = system_clock::now(); // stop timer + auto duration = duration_cast(t2 - t1); // duration in microseconds + double t_diff = static_cast(duration.count()) / 1e6; // overall duration in seconds + t_diff = t_diff/NLOOPS; // duration per loop seconds + + +// Check the correct result + cout << "\n C[17,0] = " << check << endl; + if (static_cast(check) != L) + { + cout << " !! W R O N G result !!, should be " << L <<"\n"; + } + cout << endl; + +// Timings and Performance + cout << endl; + cout.precision(2); + + + double Gflops = (2.0*L*N*M) / t_diff / 1024 / 1024 / 1024; + double MemBandwidth = (2.0*L*N*M + M*N)/ t_diff / 1024 / 1024 / 1024 * sizeof(A[0]); + + cout << "Total duration : " << t_diff*NLOOPS << endl; + cout << "Timing in sec. : " << t_diff << endl; + cout << "GFLOPS : " << Gflops << endl; + cout << "GiByte/s : " << MemBandwidth << endl; + + + + return vector{t_diff, Gflops, MemBandwidth}; +} + + +vector test_D(const size_t &NLOOPS, const size_t &N, const size_t &p) +{ + cout << "#################### (D) ####################" << endl; + cout << "\nLOOPS = " << NLOOPS << endl; + cout << "\nN = " << N << endl; + cout << "\np = " << p << endl; + +// Memory allocation + cout << "Memory allocation\n"; + + vector a(p + 1, 0); + vector x(N); + + cout.precision(2); + cout << (1.0*(p + 1) + N) *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n"; + cout.precision(6); + +// Data initialization + + for (size_t j = 0; j < N; ++j) + x[j] = 1.0*j; + + for (size_t k = 0; k < p + 1; ++k) + a[k] = pow(-1.0, k); // poly(x) = 1 - x + x^2 - x^3 + x^4 - ... + + + + cout << "\nStart Benchmarking poly\n"; + + auto t1 = system_clock::now(); // start timer +// Do calculation + vector y(N); + double check; + double check_sum; + + for (size_t i = 0; i < NLOOPS; ++i) + { + y = poly(a, x); + check = y[0]; + + check_sum += check; // prevents the optimizer from removing unused calculation results. 
+ } + + auto t2 = system_clock::now(); // stop timer + auto duration = duration_cast(t2 - t1); // duration in microseconds + double t_diff = static_cast(duration.count()) / 1e6; // overall duration in seconds + t_diff = t_diff/NLOOPS; // duration per loop seconds + + + +// Check the correct result + cout << "\n poly(" << x[0] << ") = " << check << endl; + if (abs(check - 1.0) > 1.0/1e6) + { + cout << " !! W R O N G result !!\n"; + } + cout << endl; + + +// Timings and Performance + cout << endl; + cout.precision(2); + + + double Gflops = (N*(p + 1)*3.0) / t_diff / 1024 / 1024 / 1024; + double MemBandwidth = (N*(2.0 + 3.0*(p + 1)))/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]); + + cout << "Total duration : " << t_diff*NLOOPS << endl; + cout << "Timing in sec. : " << t_diff << endl; + cout << "GFLOPS : " << Gflops << endl; + cout << "GiByte/s : " << MemBandwidth << endl; + + + + return vector{t_diff, Gflops, MemBandwidth}; +} \ No newline at end of file diff --git a/ex3/ex3/benchmark_tests.h b/ex3/ex3/benchmark_tests.h new file mode 100644 index 0000000..3cb41b0 --- /dev/null +++ b/ex3/ex3/benchmark_tests.h @@ -0,0 +1,15 @@ +#pragma once +#include +#include +using namespace std; + + + + +vector test_A(const size_t &NLOOPS, const size_t &N, const function&, const vector&)>& scalar_function); + +vector test_B(const size_t &NLOOPS, const size_t &N, const size_t &M, const function(const vector&, const vector&)>& MatVec_function); + +vector test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N, const function(const vector&, const vector&, size_t const &shared_dim)>& MatMat_function); + +vector test_D(const size_t &NLOOPS, const size_t &N, const size_t &p); \ No newline at end of file diff --git a/ex3/ex3/benchmarks.cpp b/ex3/ex3/benchmarks.cpp new file mode 100644 index 0000000..358008b --- /dev/null +++ b/ex3/ex3/benchmarks.cpp @@ -0,0 +1,245 @@ +#include "benchmarks.h" +#include "vdop.h" +#include +#include +#include +#include // assert() + + 
+#ifdef __INTEL_CLANG_COMPILER +#pragma message(" ########## Use of MKL ###############") +#include +#else +#pragma message(" ########## Use of CBLAS ###############") + +#include // cBLAS Library +#include // Lapack + +#endif + + + +// (A) Inner product of two vectors (from skalar_stl) +double scalar(vector const &x, vector const &y) +{ + assert(x.size() == y.size()); + size_t const N = x.size(); + double sum = 0.0; + for (size_t i = 0; i < N; ++i) + { + sum += x[i] * y[i]; + } + return sum; +} + +// (A) 5.(b) Kahan scalar product +double Kahan_skalar(vector const &x, vector const &y) +{ + double sum = 0.0; + double c = 0.0; + size_t n = x.size(); + for (size_t i = 0; i < n; ++i) + { + double z = x[i]*y[i] - c; // c is the part that got lost in the last iteration + double t = sum + z; // when adding sum + z, the lower digits are lost if sum is large + c = (t - sum) - z; // now we recover the lower digits to add in the next iteration + sum = t; + } + return sum; +} + +// (A) 6. cBLAS scalar product +double scalar_cBLAS(vector const &x, vector const &y) +{ + return cblas_ddot(x.size(), x.data(), 1, y.data(), 1); // x.data() = &x[0] +} + + +// (B) Matrix-vector product (from intro_vector_densematrix) +vector MatVec(vector const &A, vector const &x) +{ + size_t const nelem = A.size(); + size_t const N = x.size(); + assert(nelem % N == 0); // make sure multiplication is possible + size_t const M = nelem/N; + + + vector b(M); + + for (size_t i = 0; i < M; ++i) + { + double tmp = 0.0; + for (size_t j = 0; j < N; ++j) + tmp += A[N*i + j] * x[j]; + b[i] = tmp; + } + + return b; +} + +// (B) cBLAS Matrix-vector product +vector MatVec_cBLAS(vector const &A, vector const &x) +{ + size_t const nelem = A.size(); + size_t const N = x.size(); + assert(nelem % N == 0); // make sure multiplication is possible + size_t const M = nelem/N; + + + vector b(M); + + cblas_dgemv(CblasRowMajor, CblasNoTrans, M, N, 1.0, A.data(), N, x.data(), 1, 0.0, b.data(), 1); + + return b; +} + + +// 
(C) Matrix-matrix product +vector MatMat(vector const &A, vector const &B, size_t const &L) +{ + size_t const nelem_A = A.size(); + size_t const nelem_B = B.size(); + + assert(nelem_A % L == 0 && nelem_B % L == 0); + + size_t const M = nelem_A/L; + size_t const N = nelem_B/L; + + + vector C(M*N); + + + + for (size_t i = 0; i < M; ++i) + { + for (size_t k = 0; k < L; ++k) + { + for (size_t j = 0; j < N; ++j) + { + C[N*i + j] += A[L*i + k]*B[N*k + j]; + } + + } + } + + return C; +} + +// (C) cBLAS matrix-matrix product +vector MatMat_cBLAS(vector const &A, vector const &B, size_t const &L) +{ + size_t const nelem_A = A.size(); + size_t const nelem_B = B.size(); + + assert(nelem_A % L == 0 && nelem_B % L == 0); + + size_t const M = nelem_A/L; + size_t const N = nelem_B/L; + + + vector C(M*N); + + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, L, 1.0, A.data(), L, B.data(), N, 0.0, C.data(), N); + + return C; +} + + +// (D) Evaluation of a polynomial function +vector poly(vector const &a, vector const &x) +{ + size_t const N = x.size(); + size_t const p = a.size() - 1; + vector y(N, 0); + + for (size_t i = 0; i < N; ++i) + { + double x_temp = x[i]; + double y_temp = 0; + for (size_t k = 0; k < p + 1; ++k) + { + y_temp += x_temp*y_temp + a[p - k]; + } + y[i] = y_temp; + } + + return y; +} + + +// (E) Solves linear system of equations +void JacobiSolve(CRS_Matrix const &SK, vector const &f, vector &u) +{ + const double omega = 1.0; + const int maxiter = 1000; + const double tol = 1e-5, // tolerance + tol2 = tol * tol; // tolerance^2 + + int nrows = SK.Nrows(); // number of rows == number of columns + assert( nrows == static_cast(f.size()) && f.size() == u.size() ); + + cout << endl << " Start Jacobi solver for " << nrows << " d.o.f.s" << endl; + // Choose initial guess + for (int k = 0; k < nrows; ++k) + { + u[k] = 0.0; // u := 0 + } + + vector dd(nrows); // matrix diagonal + vector r(nrows); // residual + vector w(nrows); // correction + + SK.GetDiag(dd); 
// dd := diag(K) + ////DebugVector(dd);{int ijk; cin >> ijk;} + + // Initial sweep + SK.Defect(r, f, u); // r := f - K*u + + vddiv(w, r, dd); // w := D^{-1}*r + double sigma0 = dscapr(w, r); // s0 := + + // Iteration sweeps + int iter = 0; + double sigma = sigma0; + while ( sigma > tol2 * sigma0 && maxiter > iter) + { + ++iter; + vdaxpy(u, u, omega, w ); // u := u + om*w + SK.Defect(r, f, u); // r := f - K*u + vddiv(w, r, dd); // w := D^{-1}*r + sigma = dscapr(w, r); // s0 := +// cout << "Iteration " << iter << " : " << sqrt(sigma/sigma0) << endl; + } + cout << "aver. Jacobi rate : " << exp(log(sqrt(sigma / sigma0)) / iter) << " (" << iter << " iter)" << endl; + cout << "final error: " << sqrt(sigma / sigma0) << " (rel) " << sqrt(sigma) << " (abs)\n"; + + return; +} + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ex3/ex3/benchmarks.h b/ex3/ex3/benchmarks.h new file mode 100644 index 0000000..a4d7900 --- /dev/null +++ b/ex3/ex3/benchmarks.h @@ -0,0 +1,89 @@ +#pragma once +#include "getmatrix.h" +#include + +using namespace std; + +/** (A) Inner product of two vectors (from skalar_stl) + @param[in] x vector + @param[in] y vector + @return resulting Euclidian inner product +*/ +double scalar(vector const &x, vector const &y); + +/** (A) 5.(b) Inner product of two vectors using the Kahan scalar product + @param[in] x vector + @param[in] y vector + @return resulting Euclidian inner product +*/ +double Kahan_skalar(vector const &x, vector const &y); + +/** (A) 6. cBLAS scalar product of two vectors + @param[in] x vector + @param[in] y vector + @return resulting Euclidian inner product +*/ +double scalar_cBLAS(vector const &x, vector const &y); + + +/** (B) Matrix-vector product (from intro_vector_densematrix) + * @param[in] A dense matrix (1D access) + * @param[in] u vector + * + * @return resulting vector +*/ +vector MatVec(vector const &A, vector const &x); + +/** (B) 6. 
cBLAS Matrix-vector product + * @param[in] A dense matrix (1D access) + * @param[in] u vector + * + * @return resulting vector +*/ +vector MatVec_cBLAS(vector const &A, vector const &x); + + +/** (C) Matrix-matrix product + * @param[in] A MxL dense matrix (1D access) + * @param[in] B LxN dense matrix (1D access) + * @param[in] shared_dim shared dimension L + * + * @return resulting MxN matrix +*/ +vector MatMat(vector const &A, vector const &B, size_t const &shared_dim); + +/** (C) 6. cBLAS Matrix-matrix product + * @param[in] A MxL dense matrix (1D access) + * @param[in] B LxN dense matrix (1D access) + * @param[in] shared_dim shared dimension L + * + * @return resulting MxN matrix +*/ +vector MatMat_cBLAS(vector const &A, vector const &B, size_t const &shared_dim); + + +/** (D) Evaluation of a polynomial function using Horner's scheme + * @param[in] a coefficient vector + * @param[in] x vector with input values + * + * @return vector with output values +*/ +vector poly(vector const &a, vector const &x); + + +/** (E) Solves linear system of equations K @p u = @p f via the Jacobi iteration (from jaboci_oo_stl) + * We use a distributed symmetric CSR matrix @p SK and initial guess of the + * solution is set to 0. + * @param[in] SK CSR matrix + * @param[in] f distributed local vector storing the right hand side + * @param[out] u accumulated local vector storing the solution. 
+*/ +void JacobiSolve(CRS_Matrix const &SK, vector const &f, vector &u); + + + + + + + + diff --git a/ex3/ex3/factorization_solve.cpp b/ex3/ex3/factorization_solve.cpp new file mode 100644 index 0000000..adff955 --- /dev/null +++ b/ex3/ex3/factorization_solve.cpp @@ -0,0 +1,39 @@ +#include "factorization_solve.h" +#include +#include +#include +#include +#include + +using namespace std; + + +void factorization_solve(vector &A, vector &b, const int32_t &n_rhs) +{ + int32_t nelem = A.size(); + int32_t N = sqrt(nelem); + assert(N == static_cast(b.size())/n_rhs); + + vector IPIV(N); // pivot indices + int info_factorization; + + dgetrf_(&N, &N, A.data(), &N, IPIV.data(), &info_factorization); + + const char transp = 'N'; + int info_solve; + dgetrs_(&transp, &N, &n_rhs, A.data(), &N, IPIV.data(), b.data(), &N, &info_solve, 1); // 1 is length of parameter 'N' + +} + +void print_matrix(vector &A, size_t M, size_t N) +{ + for (size_t i = 0; i < M; ++i) + { + for (size_t j = 0; j < N; ++j) + { + cout << A[N*i + j] << "\t"; + } + cout << endl; + } + cout << endl; +} \ No newline at end of file diff --git a/ex3/ex3/factorization_solve.h b/ex3/ex3/factorization_solve.h new file mode 100644 index 0000000..20e15db --- /dev/null +++ b/ex3/ex3/factorization_solve.h @@ -0,0 +1,21 @@ +#pragma once +#include +#include +using namespace std; + + +/** Solve linear system of equations with multiple right hand sides using LU factorization + * @param[inout] A NxN Matrix (1D access), gets modified to contain the LU decomposition of A + * @param[inout] B N x n_rhs Matrix of right-hand-sides (1D access), gets modified to contain the solution vectors x + * @param[in] n_rhs number of right-hand-sides b + * +*/ +void factorization_solve(vector &A, vector &b, const int32_t &n_rhs); + +/** Print a matrix to console + * @param[in] A MxN Matrix (1D access) + * @param[in] M rows + * @param[in] N columns + * +*/ +void print_matrix(vector &A, size_t M, size_t N); \ No newline at end of file diff --git 
a/ex3/ex3/factorization_solve_tests.cpp b/ex3/ex3/factorization_solve_tests.cpp new file mode 100644 index 0000000..1a88355 --- /dev/null +++ b/ex3/ex3/factorization_solve_tests.cpp @@ -0,0 +1,89 @@ +#include "factorization_solve_tests.h" +#include "factorization_solve.h" +#include "benchmarks.h" +#include +#include + +using namespace std::chrono; + +void CheckCorrectness() +{ + cout.precision(4); + + size_t N = 5; + size_t n_rhs = 5; + vector A(N*N); + vector b(N*n_rhs); + + // initialize A + for (size_t i = 0; i < N; ++i) + { + for (size_t j = 0; j < N; ++j) + if (i == j) + A[N*i + j] = 4.0; + else + A[N*i + j] = 1.0/((i - j)*(i - j)); + } + const vector A_copy = A; + + // initialize b as identity matrix + for (size_t j = 0; j < N; ++j) + { + for (size_t l = 0; l < n_rhs; ++l) + if (l == j) + b[N*l + j] = 1.0; + else + b[N*l + j] = 0.0; + } + + cout << "A = " << endl; + print_matrix(A, N, N); + cout << "b = " << endl; + print_matrix(b, N, n_rhs); + + // solve system + factorization_solve(A, b, n_rhs); + + vector check_matrix_identity = MatMat_cBLAS(A_copy, b, N); + cout << "A*A^{-1} = " << endl; + print_matrix(check_matrix_identity, N, n_rhs); +} + + +void CheckDuration(const size_t &N) +{ + for (size_t n_rhs : {1, 2, 4, 8, 16, 32}) + { + auto time_start = system_clock::now(); + + vector A(N*N); + vector b(N*n_rhs); + + // initialize A + for (size_t i = 0; i < N; ++i) + { + for (size_t j = 0; j < N; ++j) + if (i == j) + A[N*i + j] = 4.0; + else + A[N*i + j] = 1.0/((i - j)*(i - j)); + } + + // initialize b + for (size_t j = 0; j < N; ++j) + for (size_t l = 0; l < n_rhs; ++l) + b[N*l + j] = 1.0; + + + // solve system + factorization_solve(A, b, n_rhs); + + + + auto time_end = system_clock::now(); + auto duration = duration_cast(time_end - time_start); // duration in microseconds + double t_diff = static_cast(duration.count()) / 1e6; // overall duration in seconds + + cout << "n_rhs = " << n_rhs << "\tTime per rhs: " << t_diff/n_rhs << endl; + } +} diff --git 
a/ex3/ex3/factorization_solve_tests.h b/ex3/ex3/factorization_solve_tests.h new file mode 100644 index 0000000..573f17f --- /dev/null +++ b/ex3/ex3/factorization_solve_tests.h @@ -0,0 +1,7 @@ +#pragma once +#include + + +void CheckCorrectness(); + +void CheckDuration(const size_t &N); \ No newline at end of file diff --git a/ex3/ex3/geom.cpp b/ex3/ex3/geom.cpp new file mode 100644 index 0000000..01c674b --- /dev/null +++ b/ex3/ex3/geom.cpp @@ -0,0 +1,522 @@ +// see: http://llvm.org/docs/CodingStandards.html#include-style +#include "geom.h" + +#include +#include +#include +#include +#include +#include +#include +using namespace std; + +Mesh::Mesh(int ndim, int nvert_e, int ndof_e) + : _nelem(0), _nvert_e(nvert_e), _ndof_e(ndof_e), _nnode(0), _ndim(ndim), _ia(0), _xc(0) +{ +} + +Mesh::~Mesh() +{} + +void Mesh::SetValues(std::vector &v, const std::function &func) const +{ + int const nnode = Nnodes(); // number of vertices in mesh + assert( nnode == static_cast(v.size()) ); + for (int k = 0; k < nnode; ++k) + { + v[k] = func( _xc[2 * k], _xc[2 * k + 1] ); + } +} + + +void Mesh::Debug() const +{ + cout << "\n ############### Debug M E S H ###################\n"; + cout << "\n ............... Coordinates ...................\n"; + for (int k = 0; k < _nnode; ++k) + { + cout << k << " : " ; + for (int i = 0; i < _ndof_e; ++i ) + { + cout << _xc[2*k+i] << " "; + } + cout << endl; + } + cout << "\n ............... Elements ...................\n"; + for (int k = 0; k < _nelem; ++k) + { + cout << k << " : "; + for (int i = 0; i < _ndof_e; ++i ) + cout << _ia[_ndof_e * k + i] << " "; + cout << endl; + } + return; +} + +void Mesh::Write_ascii_matlab(std::string const &fname, std::vector const &v) const +{ + assert(Nnodes() == static_cast(v.size())); // fits vector length to mesh information? + + ofstream fout(fname); // open file ASCII mode + if ( !fout.is_open() ) + { + cout << "\nFile " << fname << " has not been opened.\n\n" ; + assert( fout.is_open() && "File not opened." 
); + } + + string const DELIMETER(" "); // define the same delimeter as in matlab/ascii_read*.m + int const OFFSET(1); // convert C-indexing to matlab + + // Write data: #nodes, #space dimensions, #elements, #vertices per element + fout << Nnodes() << DELIMETER << Ndims() << DELIMETER << Nelems() << DELIMETER << NverticesElements() << endl; + + // Write cordinates: x_k, y_k in seperate lines + assert( Nnodes()*Ndims() == static_cast(_xc.size())); + for (int k = 0, kj = 0; k < Nnodes(); ++k) + { + for (int j = 0; j < Ndims(); ++j, ++kj) + { + fout << _xc[kj] << DELIMETER; + } + fout << endl; + } + + // Write connectivity: ia_k,0, ia_k,1 etc in seperate lines + assert( Nelems()*NverticesElements() == static_cast(_ia.size())); + for (int k = 0, kj = 0; k < Nelems(); ++k) + { + for (int j = 0; j < NverticesElements(); ++j, ++kj) + { + fout << _ia[kj] + OFFSET << DELIMETER; // C to matlab + } + fout << endl; + } + + // Write vector + for (int k = 0; k < Nnodes(); ++k) + { + fout << v[k] << endl; + } + + fout.close(); + return; +} + + +void Mesh::Visualize(std::vector const &v) const +{ + // define external command + const string exec_m("matlab -nosplash < visualize_results.m"); // Matlab + //const string exec_m("octave --no-window-system --no-gui visualize_results.m"); // Octave + //const string exec_m("flatpak run org.octave.Octave visualize_results.m"); // Octave (flatpak): desktop GH + + const string fname("uv.txt"); + Write_ascii_matlab(fname, v); + + int ierror = system(exec_m.c_str()); // call external command + + if (ierror != 0) + { + cout << endl << "Check path to Matlab/octave on your system" << endl; + } + cout << endl; + return; +} + + + + +// ##################################################################### +Mesh_2d_3_square::Mesh_2d_3_square(int nx, int ny, int myid, int procx, int procy) + : Mesh(2, 3, 3), // two dimensions, 3 vertices, 3 dofs + _myid(myid), _procx(procx), _procy(procy), _neigh{{-1, -1, -1, -1}}, _color(0), +_xl(0.0), _xr(1.0), 
_yb(0.0), _yt(1.0), _nx(nx), _ny(ny) +{ + //void IniGeom(int const myid, int const procx, int const procy, int neigh[], int &color) + int const ky = _myid / _procx; + int const kx = _myid % _procy; // MOD(myid,procx) + // Determine the neighbors of domain/rank myid + _neigh[0] = (ky == 0) ? -1 : _myid - _procx; // South + _neigh[1] = (kx == _procx - 1) ? -1 : _myid + 1; // East + _neigh[2] = (ky == _procy - 1) ? -1 : _myid + _procx; // North + _neigh[3] = (kx == 0) ? -1 : _myid - 1; // West + + _color = (kx + ky) & 1 ; + + // subdomain is part of unit square + double const hx = 1. / _procx; + double const hy = 1. / _procy; + _xl = kx * hx; // left + _xr = (kx + 1) * hx; // right + _yb = ky * hy; // bottom + _yt = (ky + 1) * hy; // top + + // Calculate coordinates + int const nnode = (_nx + 1) * (_ny + 1); // number of nodes + Resize_Coords(nnode, 2); // coordinates in 2D [nnode][ndim] + GetCoordsInRectangle(_nx, _ny, _xl, _xr, _yb, _yt, GetCoords().data()); + + // Calculate element connectivity (linear triangles) + int const nelem = 2 * _nx * _ny; // number of elements + Resize_Connectivity(nelem, 3); // connectivity matrix [nelem][3] + GetConnectivityInRectangle(_nx, _ny, GetConnectivity().data()); + + return; +} + + +void Mesh_2d_3_square::SetU(std::vector &u) const +{ + int dx = _nx + 1; + for (int j = 0; j <= _ny; ++j) + { + int k = j * dx; + for (int i = 0; i <= _nx; ++i, ++k) + { + u[k] = 0.0; + } + } + +} + +void Mesh_2d_3_square::SetF(std::vector &f) const +{ + int dx = _nx + 1; + for (int j = 0; j <= _ny; ++j) + { + int k = j * dx; + for (int i = 0; i <= _nx; ++i, ++k) + { + f[k] = 1.0; + } + } + +} + + +std::vector Mesh_2d_3_square::Index_DirichletNodes() const +{ + int const dx = 1, + dy = _nx + 1, + bl = 0, + br = _nx, + tl = _ny * (_nx + 1), + tr = (_ny + 1) * (_nx + 1) - 1; + int const start[4] = { bl, br, tl, bl}, + end[4] = { br, tr, tr, tl}, + step[4] = { dx, dy, dx, dy}; + + vector idx(0); + for (int j = 0; j < 4; j++) + { + if (_neigh[j] < 0) + { 
+ for (int i = start[j]; i <= end[j]; i += step[j]) + { + idx.push_back(i); // node i is Dirichlet node + } + } + } + // remove multiple elements + sort(idx.begin(), idx.end()); // sort + idx.erase( unique(idx.begin(), idx.end()), idx.end() ); // remove duplicate data + + return idx; +} + +void Mesh_2d_3_square::SaveVectorP(std::string const &name, vector const &u) const +{ +// construct the file name for subdomain myid + const string tmp( std::to_string(_myid / 100) + to_string((_myid % 100) / 10) + to_string(_myid % 10) ); + + const string namep = name + "." + tmp; + ofstream ff(namep.c_str()); + ff.precision(6); + ff.setf(ios::fixed, ios::floatfield); + + // assumes tensor product grid in unit square; rowise numbered (as generated in class constructor) + // output is provided for tensor product grid visualization ( similar to Matlab-surf() ) + auto const &xc = GetCoords(); + int k = 0; + for (int j = 0; j <= _ny; ++j) + { + for (int i = 0; i <= _nx; ++i, ++k) + ff << xc[2 * k + 0] << " " << xc[2 * k + 1] << " " << u[k] << endl; + ff << endl; + } + + ff.close(); + return; +} + +void Mesh_2d_3_square::GetCoordsInRectangle(int const nx, int const ny, + double const xl, double const xr, double const yb, double const yt, + double xc[]) +{ + const double hx = (xr - xl) / nx, + hy = (yt - yb) / ny; + + int k = 0; + for (int j = 0; j <= ny; ++j) + { + const double y0 = yb + j * hy; + for (int i = 0; i <= nx; ++i, k += 2) + { + xc[k ] = xl + i * hx; + xc[k + 1] = y0; + } + } + + return; +} + +void Mesh_2d_3_square::GetConnectivityInRectangle(int const nx, int const ny, int ia[]) +{ + const int dx = nx + 1; + int k = 0; + int l = 0; + for (int j = 0; j < ny; ++j, ++k) + { + for (int i = 0; i < nx; ++i, ++k) + { + ia[l ] = k; + ia[l + 1] = k + 1; + ia[l + 2] = k + dx + 1; + l += 3; + ia[l ] = k; + ia[l + 1] = k + dx; + ia[l + 2] = k + dx + 1; + l += 3; + } + } + return; +} + +// #################### still some old code (--> MPI) ############################ + + +// Copies 
the values of w corresponding to the boundary +// South (ib==1), East (ib==2), North (ib==3), West (ib==4) + +void GetBound(int const ib, int const nx, int const ny, double const w[], double s[]) +{ + const int //dx = 1, + dy = nx + 1, + bl = 0, + br = nx, + tl = ny * (nx + 1), + tr = (ny + 1) * (nx + 1) - 1; + switch (ib) + { + case 1: + { + for (int i = bl, j = 0; i <= br; ++i, ++j) + s[j] = w[i]; + break; + } + case 3: + { + for (int i = tl, j = 0; i <= tr; ++i, ++j) + s[j] = w[i]; + break; + } + case 4: + { + for (int i = bl, j = 0; i <= tl; i += dy, ++j) + s[j] = w[i]; + break; + } + case 2: + { + for (int i = br, j = 0; i <= tr; i += dy, ++j) + s[j] = w[i]; + break; + } + default: + { + cout << endl << "Wrong parameter ib in " << __FILE__ << ":" << __LINE__ << endl; + } + } + return; +} + +// ---------------------------------------------------------------------------------------------------------- +// Computes w: = w + s at nodes on the boundary +// South (ib == 1), East (ib == 2), North (ib == 3), West (ib == 4) + +void AddBound(int const ib, int const nx, int const ny, double w[], double const s[]) +{ + int const dy = nx + 1, + bl = 0, + br = nx, + tl = ny * (nx + 1), + tr = (ny + 1) * (nx + 1) - 1; + switch (ib) + { + case 1: + { + for (int i = bl, j = 0; i <= br; ++i, ++j) + w[i] += s[j]; + break; + } + case 3: + { + for (int i = tl, j = 0; i <= tr; ++i, ++j) + w[i] += s[j]; + break; + } + case 4: + { + for (int i = bl, j = 0; i <= tl; i += dy, ++j) + w[i] += s[j]; + break; + } + case 2: + { + for (int i = br, j = 0; i <= tr; i += dy, ++j) + w[i] += s[j]; + break; + } + default: + { + cout << endl << "Wrong parameter ib in " << __FILE__ << ":" << __LINE__ << endl; + } + } + return; +} + + +// #################################################################### + +Mesh_2d_3_matlab::Mesh_2d_3_matlab(string const &fname) + : Mesh(2, 3, 3), // two dimensions, 3 vertices, 3 dofs + bedges(0) +{ + ifstream ifs(fname); + if (!(ifs.is_open() && ifs.good())) + { + 
cerr << "Mesh_2d_3_matlab: Error cannot open file " << fname << endl; + assert(ifs.is_open()); + } + + int const OFFSET(1); // Matlab to C indexing + cout << "ASCI file " << fname << " opened" << endl; + + // Read some mesh constants + int nnode, ndim, nelem, nvert_e; + ifs >> nnode >> ndim >> nelem >> nvert_e; + cout << nnode << " " << ndim << " " << nelem << " " << nvert_e << endl; + assert(ndim == 2 && nvert_e == 3); + + // Allocate memory + Resize_Coords(nnode, ndim); // coordinates in 2D [nnode][ndim] + Resize_Connectivity(nelem, nvert_e); // connectivity matrix [nelem][nvert] + + // Read ccordinates + auto &xc = GetCoords(); + for (int k = 0; k < nnode * ndim; ++k) + { + ifs >> xc[k]; + } + + // Read connectivity + auto &ia = GetConnectivity(); + for (int k = 0; k < nelem * nvert_e; ++k) + { + ifs >> ia[k]; + ia[k] -= OFFSET; // Matlab to C indexing + } + + // additional read of boundary information (only start/end point) + int nbedges; + ifs >> nbedges; + + bedges.resize(nbedges * 2); + for (int k = 0; k < nbedges * 2; ++k) + { + ifs >> bedges[k]; + bedges[k] -= OFFSET; // Matlab to C indexing + } + + return; +} + +// binary +//{ +//ifstream ifs(fname, ios_base::in | ios_base::binary); +//if(!(ifs.is_open() && ifs.good())) +//{ +//cerr << "ReadBinMatrix: Error cannot open file " << file << endl; +//assert(ifs.is_open()); +//} +//cout << "ReadBinMatrix: file opened" << file << endl; + + +//} + +// binaryIO.cpp +//void read_binMatrix(const string& file, vector &cnt, vector &col, vector &ele) +//{ + +//ifstream ifs(file, ios_base::in | ios_base::binary); + +//if(!(ifs.is_open() && ifs.good())) +//{ +//cerr << "ReadBinMatrix: Error cannot open file " << file << endl; +//assert(ifs.is_open()); +//} +//cout << "ReadBinMatrix: Opened file " << file << endl; + +//int _size; + +//ifs.read(reinterpret_cast(&_size), sizeof(int)); // old: ifs.read((char*)&_size, sizeof(int)); +//cnt.resize(_size); +//cout << "ReadBinMatrix: cnt size: " << _size << endl; + 
+//ifs.read((char*)&_size, sizeof(int)); +//col.resize(_size); +//cout << "ReadBinMatrix: col size: " << _size << endl; + +//ifs.read((char*)&_size, sizeof(int)); +//ele.resize(_size); +//cout << "ReadBinMatrix: ele size: " << _size << endl; + + +//ifs.read((char*)cnt.data(), cnt.size() * sizeof(int)); +//ifs.read((char*)col.data(), col.size() * sizeof(int)); +//ifs.read((char*)ele.data(), ele.size() * sizeof(double)); + +//ifs.close(); +//cout << "ReadBinMatrix: Finished reading matrix.." << endl; + +//} + + +std::vector Mesh_2d_3_matlab::Index_DirichletNodes() const +{ + vector idx(bedges); // copy + + sort(idx.begin(), idx.end()); // sort + idx.erase( unique(idx.begin(), idx.end()), idx.end() ); // remove duplicate data + + return idx; +} + + + + + + + + + + + + + + + + + + + diff --git a/ex3/ex3/geom.h b/ex3/ex3/geom.h new file mode 100644 index 0000000..14328fa --- /dev/null +++ b/ex3/ex3/geom.h @@ -0,0 +1,381 @@ +#ifndef GEOM_FILE +#define GEOM_FILE +#include +#include // function; C++11 +#include +#include + +/** + * Basis class for finite element meshes. + */ +class Mesh +{ +public: + /** + * Constructor initializing the members with default values. + * + * @param[in] ndim space dimensions (dimension for coordinates) + * @param[in] nvert_e number of vertices per element (dimension for connectivity) + * @param[in] ndof_e degrees of freedom per element (= @p nvert_e for linear elements) + */ + explicit Mesh(int ndim, int nvert_e = 0, int ndof_e = 0); + + /** + * Destructor. + * + * See clang warning on + * weak-vtables. + */ + virtual ~Mesh(); + + /** + * Number of finite elements in (sub)domain. + * @return number of elements. + */ + int Nelems() const + { + return _nelem; + } + + /** + * Global number of vertices for each finite element. + * @return number of vertices per element. + */ + int NverticesElements() const + { + return _nvert_e; + } + + /** + * Global number of degrees of freedom (dof) for each finite element. 
+ * @return degrees of freedom per element. + */ + int NdofsElement() const + { + return _ndof_e; + } + + /** + * Number of vertices in mesh. + * @return number of vertices. + */ + int Nnodes() const + { + return _nnode; + } + + /** + * Space dimension. + * @return number of dimensions. + */ + int Ndims() const + { + return _ndim; + } + + /** + * (Re-)Allocates memory for the element connectivity and redefines the appropriate dimensions. + * + * @param[in] nelem number of elements + * @param[in] nvert_e number of vertices per element + */ + void Resize_Connectivity(int nelem, int nvert_e) + { + SetNelem(nelem); // number of elements + SetNverticesElement(nvert_e); // vertices per element + _ia.resize(nelem * nvert_e); + } + + /** + * Read connectivity information (g1,g2,g3)_i. + * @return convectivity vector [nelems*ndofs]. + */ + const std::vector &GetConnectivity() const + { + return _ia; + } + + /** + * Access/Change connectivity information (g1,g2,g3)_i. + * @return convectivity vector [nelems*ndofs]. + */ + std::vector &GetConnectivity() + { + return _ia; + } + + /** + * (Re-)Allocates memory for the element connectivity and redefines the appropriate dimensions. + * + * @param[in] nnodes number of nodes + * @param[in] ndim space dimension + */ + void Resize_Coords(int nnodes, int ndim) + { + SetNnode(nnodes); // number of nodes + SetNdim(ndim); // space dimension + _xc.resize(nnodes * ndim); + } + + /** + * Read coordinates of vertices (x,y)_i. + * @return coordinates vector [nnodes*2]. + */ + const std::vector &GetCoords() const + { + return _xc; + } + + /** + * Access/Change coordinates of vertices (x,y)_i. + * @return coordinates vector [nnodes*2]. + */ + std::vector &GetCoords() + { + return _xc; + } + + /** + * Calculate values in vector @p v via function @p func(x,y) + * @param[in] v vector + * @param[in] func function of (x,y) returning a double value. 
+ */ + void SetValues(std::vector &v, const std::function &func) const; + + /** + * Prints the information for a finite element mesh + */ + void Debug() const; + + /** + * Determines the indices of those vertices with Dirichlet boundary conditions + * @return index vector. + */ + virtual std::vector Index_DirichletNodes() const = 0; + + /** + * Write vector @p v toghether with its mesh information to an ASCii file @p fname. + * + * The data are written in C-style. + * + * @param[in] fname file name + * @param[in] v vector + */ + void Write_ascii_matlab(std::string const &fname, std::vector const &v) const; + + /** + * Visualize @p v together with its mesh information via matlab or octave. + * + * Comment/uncomment those code lines in method Mesh:Visualize (geom.cpp) + * that are supported on your system. + * + * @param[in] v vector + * + * @warning matlab files ascii_read_meshvector.m visualize_results.m + * must be in the executing directory. + */ + void Visualize(std::vector const &v) const; + + +protected: + void SetNelem(int nelem) + { + _nelem = nelem; + } + + void SetNverticesElement(int nvert) + { + _nvert_e = nvert; + } + + void SetNdofsElement(int ndof) + { + _ndof_e = ndof; + } + + void SetNnode(int nnode) + { + _nnode = nnode; + } + + void SetNdim(int ndim) + { + _ndim = ndim; + } + +private: + int _nelem; //!< number elements + int _nvert_e; //!< number of vertices per element + int _ndof_e; //!< degrees of freedom (d.o.f.) per element + int _nnode; //!< number nodes/vertices + int _ndim; //!< space dimension of the problem (1, 2, or 3) + std::vector _ia; //!< element connectivity + std::vector _xc; //!< coordinates +}; + +/** + * 2D finite element mesh of the square consiting of linear triangular elements. + */ +class Mesh_2d_3_square: public Mesh +{ +public: + /** + * Generates the f.e. mesh for the unit square. 
+ * + * @param[in] nx number of discretization intervals in x-direction + * @param[in] ny number of discretization intervals in y-direction + * @param[in] myid my MPI-rank / subdomain + * @param[in] procx number of ranks/subdomains in x-direction + * @param[in] procy number of processes in y-direction + */ + Mesh_2d_3_square(int nx, int ny, int myid = 0, int procx = 1, int procy = 1); + + /** + * Destructor + */ + ~Mesh_2d_3_square() override + {} + + /** + * Set solution vector based on a tensor product grid in the rectangle. + * @param[in] u solution vector + */ + void SetU(std::vector &u) const; + + /** + * Set right hand side (rhs) vector on a tensor product grid in the rectangle. + * @param[in] f rhs vector + */ + void SetF(std::vector &f) const; + + /** + * Determines the indices of those vertices with Dirichlet boundary conditions + * @return index vector. + */ + std::vector Index_DirichletNodes() const override; + + /** + * Stores the values of vector @p u of (sub)domain into a file @p name for further processing in gnuplot. + * The file stores rowise the x- and y- coordinates together with the value from @p u . + * The domain [@p xl, @p xr] x [@p yb, @p yt] is discretized into @p nx x @p ny intervals. + * + * @param[in] name basename of file name (file name will be extended by the rank number) + * @param[in] u local vector + * + * @warning Assumes tensor product grid in unit square; rowise numbered + * (as generated in class constructor). + * The output is provided for tensor product grid visualization + * ( similar to Matlab-surf() ). 
+ * + * @see Mesh_2d_3_square + */ + void SaveVectorP(std::string const &name, std::vector const &u) const; + + // here will still need to implement in the class + // GetBound(), AddBound() + // or better a generalized way with indices and their appropriate ranks for MPI communication + +private: + /** + * Determines the coordinates of the dicretization nodes of the domain [@p xl, @p xr] x [@p yb, @p yt] + * which is discretized into @p nx x @p ny intervals. + * + * @param[in] ny number of discretization intervals in y-direction + * @param[in] xl x-coordinate of left boundary + * @param[in] xr x-coordinate of right boundary + * @param[in] yb y-coordinate of lower boundary + * @param[in] yt y-coordinate of upper boundary + * @param[out] xc coordinate vector of length 2n with x(2*k,2*k+1) as coodinates of node k + */ + + void GetCoordsInRectangle(int nx, int ny, double xl, double xr, double yb, double yt, + double xc[]); + /** + * Determines the element connectivity of linear triangular elements of a FEM discretization + * of a rectangle using @p nx x @p ny equidistant intervals for discretization. + * @param[in] nx number of discretization intervals in x-direction + * @param[in] ny number of discretization intervals in y-direction + * @param[out] ia element connectivity matrix with ia(3*s,3*s+1,3*s+2) as node numbers od element s + */ + void GetConnectivityInRectangle(int nx, int ny, int ia[]); + +private: + int _myid; //!< my MPI rank + int _procx; //!< number of MPI ranks in x-direction + int _procy; //!< number of MPI ranks in y-direction + std::array _neigh; //!< MPI ranks of neighbors (negative: no neighbor but b.c.) 
+ int _color; //!< red/black coloring (checker board) of subdomains + + double _xl; //!< x coordinate of lower left corner of square + double _xr; //!< x coordinate of lower right corner of square + double _yb; //!< y coordinate or lower left corner of square + double _yt; //!< y coordinate of upper right corner of square + int _nx; //!< number of intervals in x-direction + int _ny; //!< number of intervals in y-direction +}; + +// #################### still some old code (--> MPI) ############################ +/** + * Copies the values of @p w corresponding to boundary @p ib + * onto vector s. South (ib==1), East (ib==2), North (ib==3), West (ib==4). + * The vector @p s has to be long enough!! + * @param[in] ib my local boundary + * @param[in] nx number of discretization intervals in x-direction + * @param[in] ny number of discretization intervals in y-direction + * @param[in] w vector for all nodes of local discretization + * @param[out] s short vector with values on boundary @p ib +*/ +// GH_NOTE: Absicherung bei s !! +void GetBound(int ib, int nx, int ny, double const w[], double s[]); + + +/** + * Computes @p w := @p w + @p s at the interface/boundary nodes on the + * boundary @p ib . South (ib==1), East (ib==2), North (ib==3), West (ib==4) + * @param[in] ib my local boundary + * @param[in] nx number of discretization intervals in x-direction + * @param[in] ny number of discretization intervals in y-direction + * @param[in,out] w vector for all nodes of local discretization + * @param[in] s short vector with values on boundary @p ib +*/ +void AddBound(int ib, int nx, int ny, double w[], double const s[]); + +// #################### Mesh from Matlab ############################ +/** + * 2D finite element mesh of the square consiting of linear triangular elements. + */ +class Mesh_2d_3_matlab: public Mesh +{ +public: + /** + * Reads mesh data from a binary file. 
+ * + * File format, see ascii_write_mesh.m + * + * @param[in] fname file name + */ + explicit Mesh_2d_3_matlab(std::string const &fname); + + /** + * Determines the indices of those vertices with Dirichlet boundary conditions. + * @return index vector. + * + * @warning All boundary nodes are considered as Dirchlet nodes. + */ + std::vector Index_DirichletNodes() const override; + +private: + /** + * Determines the indices of those vertices with Dirichlet boundary conditions + * @return index vector. + */ + int Nnbedges() const + { + return static_cast(bedges.size()); + } + + std::vector bedges; //!< boundary edges [nbedges][2] storing start/end vertex + +}; + +#endif diff --git a/ex3/ex3/getmatrix.cpp b/ex3/ex3/getmatrix.cpp new file mode 100644 index 0000000..479caa5 --- /dev/null +++ b/ex3/ex3/getmatrix.cpp @@ -0,0 +1,347 @@ +#include "getmatrix.h" +#include "userset.h" + +#include +#include +#include +#include +#include +#include +#include +using namespace std; + + +// general routine for lin. 
triangular elements + +void CalcElem(int const ial[3], double const xc[], double ske[3][3], double fe[3]) +//void CalcElem(const int* __restrict__ ial, const double* __restrict__ xc, double* __restrict__ ske[3], double* __restrict__ fe) +{ + const int i1 = 2 * ial[0], i2 = 2 * ial[1], i3 = 2 * ial[2]; + const double x13 = xc[i3 + 0] - xc[i1 + 0], y13 = xc[i3 + 1] - xc[i1 + 1], + x21 = xc[i1 + 0] - xc[i2 + 0], y21 = xc[i1 + 1] - xc[i2 + 1], + x32 = xc[i2 + 0] - xc[i3 + 0], y32 = xc[i2 + 1] - xc[i3 + 1]; + const double jac = fabs(x21 * y13 - x13 * y21); + + ske[0][0] = 0.5 / jac * (y32 * y32 + x32 * x32); + ske[0][1] = 0.5 / jac * (y13 * y32 + x13 * x32); + ske[0][2] = 0.5 / jac * (y21 * y32 + x21 * x32); + ske[1][0] = ske[0][1]; + ske[1][1] = 0.5 / jac * (y13 * y13 + x13 * x13); + ske[1][2] = 0.5 / jac * (y21 * y13 + x21 * x13); + ske[2][0] = ske[0][2]; + ske[2][1] = ske[1][2]; + ske[2][2] = 0.5 / jac * (y21 * y21 + x21 * x21); + + const double xm = (xc[i1 + 0] + xc[i2 + 0] + xc[i3 + 0]) / 3.0, + ym = (xc[i1 + 1] + xc[i2 + 1] + xc[i3 + 1]) / 3.0; + //fe[0] = fe[1] = fe[2] = 0.5 * jac * FunctF(xm, ym) / 3.0; + fe[0] = fe[1] = fe[2] = 0.5 * jac * fNice(xm, ym) / 3.0; +} + + +// general routine for lin. triangular elements, +// non-symm. matrix +// node numbering in element: a s c e n d i n g indices !! +// GH: deprecated +void AddElem(int const ial[3], double const ske[3][3], double const fe[3], + int const id[], int const ik[], double sk[], double f[]) +{ + for (int i = 0; i < 3; ++i) + { + const int ii = ial[i], // row ii (global index) + id1 = id[ii], // start and + id2 = id[ii + 1]; // end of row ii in matrix + int ip = id1; + for (int j = 0; j < 3; ++j) // no symmetry assumed + { + const int jj = ial[j]; + bool not_found = true; + do // find entry jj (global index) in row ii + { + not_found = (ik[ip] != jj); + ++ip; + } + while (not_found && ip < id2); + +#ifndef NDEBUG // compiler option -DNDEBUG switches off the check + if (not_found) // no entry found !! 
+ { + cout << "Error in AddElem: (" << ii << "," << jj << ") [" + << ial[0] << "," << ial[1] << "," << ial[2] << "]\n"; + assert(!not_found); + } +#endif + sk[ip - 1] += ske[i][j]; + } + f[ii] += fe[i]; + } +} + + +// ---------------------------------------------------------------------------- + + + + +// #################################################################### + +CRS_Matrix::CRS_Matrix(Mesh const &mesh) + : _mesh(mesh), _nrows(0), _nnz(0), _id(0), _ik(0), _sk(0) +{ + Derive_Matrix_Pattern(); + return; +} + +void CRS_Matrix::Derive_Matrix_Pattern() +{ + int const nelem(_mesh.Nelems()); + int const ndof_e(_mesh.NdofsElement()); + auto const &ia(_mesh.GetConnectivity()); +// Determine the number of matrix rows + _nrows = *max_element(ia.cbegin(), ia.cbegin() + ndof_e * nelem); + ++_nrows; // node numberng: 0 ... nnode-1 + assert(*min_element(ia.cbegin(), ia.cbegin() + ndof_e * nelem) == 0); // numbering starts with 0 ? + +// Collect for each node those nodes it is connected to (multiple entries) +// Detect the neighboring nodes + vector< list > cc(_nrows); // cc[i] is the list of nodes a node i is connected to + for (int i = 0; i < nelem; ++i) + { + int const idx = ndof_e * i; + for (int k = 0; k < ndof_e; ++k) + { + list &cck = cc.at(ia[idx + k]); + cck.insert( cck.end(), ia.cbegin() + idx, ia.cbegin() + idx + ndof_e ); + } + } +// Delete the multiple entries + _nnz = 0; + for (auto &it : cc) + { + it.sort(); + it.unique(); + _nnz += static_cast(it.size()); + // cout << it.size() << " :: "; copy(it->begin(),it->end(), ostream_iterator(cout," ")); cout << endl; + } + +// CSR data allocation + _id.resize(_nrows + 1); // Allocate memory for CSR row pointer + _ik.resize(_nnz); // Allocate memory for CSR column index vector + +// copy CSR data + _id[0] = 0; // begin of first row + for (size_t i = 0; i < cc.size(); ++i) + { + //cout << i << " " << nid.at(i) << endl;; + const list &ci = cc.at(i); + const auto nci = static_cast(ci.size()); + _id[i + 1] = _id[i] 
+ nci; // begin of next line + copy(ci.begin(), ci.end(), _ik.begin() + _id[i] ); + } + + assert(_nnz == _id[_nrows]); + _sk.resize(_nnz); // Allocate memory for CSR column index vector + return; +} + + +void CRS_Matrix::Debug() const +{ +// ID points to first entry of row +// no symmetry assumed + cout << "\nMatrix (nnz = " << _id[_nrows] << ")\n"; + + for (int row = 0; row < _nrows; ++row) + { + cout << "Row " << row << " : "; + int const id1 = _id[row]; + int const id2 = _id[row + 1]; + for (int j = id1; j < id2; ++j) + { + cout.setf(ios::right, ios::adjustfield); + cout << "[" << setw(2) << _ik[j] << "] " << setw(4) << _sk[j] << " "; + } + cout << endl; + } + return; +} + +void CRS_Matrix::CalculateLaplace(vector &f) +{ + assert(_mesh.NdofsElement() == 3); // only for triangular, linear elements + //cout << _nnz << " vs. " << _id[_nrows] << " " << _nrows<< endl; + assert(_nnz == _id[_nrows]); + + for (int k = 0; k < _nrows; ++k) + { + _sk[k] = 0.0; + } + for (int k = 0; k < _nrows; ++k) + { + f[k] = 0.0; + } + + double ske[3][3], fe[3]; + // Loop over all elements + auto const nelem = _mesh.Nelems(); + auto const &ia = _mesh.GetConnectivity(); + auto const &xc = _mesh.GetCoords(); + + for (int i = 0; i < nelem; ++i) + { + CalcElem(ia.data() + 3 * i, xc.data(), ske, fe); + //AddElem(ia.data()+3 * i, ske, fe, _id.data(), _ik.data(), _sk.data(), f.data()); // GH: deprecated + AddElem_3(ia.data() + 3 * i, ske, fe, f); + } + + //Debug(); + + return; +} + +void CRS_Matrix::ApplyDirichletBC(std::vector const &u, std::vector &f) +{ + double const PENALTY = 1e6; + auto const idx = _mesh.Index_DirichletNodes(); + int const nidx = static_cast(idx.size()); + + for (int row = 0; row < nidx; ++row) + { + int const k = idx[row]; + int const id1 = fetch(k, k); // Find diagonal entry of row + assert(id1 >= 0); + _sk[id1] += PENALTY; // matrix weighted scaling feasible + f[k] += PENALTY * u[k]; + } + + return; +} + +void CRS_Matrix::GetDiag(vector &d) const +{ + assert( _nrows 
== static_cast(d.size()) ); + + for (int row = 0; row < _nrows; ++row) + { + const int ia = fetch(row, row); // Find diagonal entry of row + assert(ia >= 0); + d[row] = _sk[ia]; + } + return; +} + +bool CRS_Matrix::Compare2Old(int nnode, int const id[], int const ik[], double const sk[]) const +{ + bool bn = (nnode == _nrows); // number of rows + if (!bn) + { + cout << "######### Error: " << "number of rows" << endl; + } + + bool bz = (id[nnode] == _nnz); // number of non zero elements + if (!bz) + { + cout << "######### Error: " << "number of non zero elements" << endl; + } + + bool bd = equal(id, id + nnode + 1, _id.cbegin()); // row starts + if (!bd) + { + cout << "######### Error: " << "row starts" << endl; + } + + bool bk = equal(ik, ik + id[nnode], _ik.cbegin()); // column indices + if (!bk) + { + cout << "######### Error: " << "column indices" << endl; + } + + bool bv = equal(sk, sk + id[nnode], _sk.cbegin()); // values + if (!bv) + { + cout << "######### Error: " << "values" << endl; + } + + return bn && bz && bd && bk && bv; +} + + +void CRS_Matrix::Mult(vector &w, vector const &u) const +{ + assert( _nrows == static_cast(w.size()) ); + assert( w.size() == u.size() ); + + for (int row = 0; row < _nrows; ++row) + { + double wi = 0.0; + for (int ij = _id[row]; ij < _id[row + 1]; ++ij) + { + wi += _sk[ij] * u[ _ik[ij] ]; + } + w[row] = wi; + } + return; +} + +void CRS_Matrix::Defect(vector &w, + vector const &f, vector const &u) const +{ + assert( _nrows == static_cast(w.size()) ); + assert( w.size() == u.size() && u.size() == f.size() ); + + for (int row = 0; row < _nrows; ++row) + { + double wi = f[row]; + for (int ij = _id[row]; ij < _id[row + 1]; ++ij) + { + wi -= _sk[ij] * u[ _ik[ij] ]; + } + w[row] = wi; + } + return; +} + +int CRS_Matrix::fetch(int const row, int const col) const +{ + int const id2 = _id[row + 1]; // end and + int ip = _id[row]; // start of recent row (global index) + + while (ip < id2 && _ik[ip] != col) // find index col (global 
index) + { + ++ip; + } + if (ip >= id2) + { + ip = -1; +#ifndef NDEBUG // compiler option -DNDEBUG switches off the check + cout << "No column " << col << " in row " << row << endl; + assert(ip >= id2); +#endif + } + return ip; +} + + +void CRS_Matrix::AddElem_3(int const ial[3], double const ske[3][3], double const fe[3], vector &f) +{ + for (int i = 0; i < 3; ++i) + { + const int ii = ial[i]; // row ii (global index) + for (int j = 0; j < 3; ++j) // no symmetry assumed + { + const int jj = ial[j]; // column jj (global index) + int ip = fetch(ii, jj); // find column entry jj in row ii +#ifndef NDEBUG // compiler option -DNDEBUG switches off the check + if (ip < 0) // no entry found !! + { + cout << "Error in AddElem: (" << ii << "," << jj << ") [" + << ial[0] << "," << ial[1] << "," << ial[2] << "]\n"; + assert(ip >= 0); + } +#endif + _sk[ip] += ske[i][j]; + } + f[ii] += fe[i]; + } +} + diff --git a/ex3/ex3/getmatrix.h b/ex3/ex3/getmatrix.h new file mode 100644 index 0000000..922606c --- /dev/null +++ b/ex3/ex3/getmatrix.h @@ -0,0 +1,178 @@ +#ifndef GETMATRIX_FILE +#define GETMATRIX_FILE + +#include "geom.h" +#include + +/** + * Calculates the element stiffness matrix @p ske and the element load vector @p fe + * of one triangular element with linear shape functions. 
+ * @param[in] ial node indices of the three element vertices + * @param[in] xc vector of node coordinates with x(2*k,2*k+1) as coodinates of node k + * @param[out] ske element stiffness matrix + * @param[out] fe element load vector + */ +void CalcElem(int const ial[3], double const xc[], double ske[3][3], double fe[3]); + +/** + * Adds the element stiffness matrix @p ske and the element load vector @p fe + * of one triangular element with linear shape functions to the appropriate positions in + * the symmetric stiffness matrix, stored as CSR matrix K(@p sk,@p id, @p ik) + * + * @param[in] ial node indices of the three element vertices + * @param[in] ske element stiffness matrix + * @param[in] fe element load vector + * @param[out] sk vector non-zero entries of CSR matrix + * @param[in] id index vector containing the first entry in a CSR row + * @param[in] ik column index vector of CSR matrix + * @param[out] f distributed local vector storing the right hand side + * + * @warning Algorithm requires indices in connectivity @p ial in ascending order. + * Currently deprecated. +*/ +void AddElem(int const ial[3], double const ske[3][3], double const fe[3], + int const id[], int const ik[], double sk[], double f[]); + + +// ##################################################################### +/** + * Square matrix in CRS format (compressed row storage; also named CSR), + * see an introduction. + */ +class CRS_Matrix +{ + public: + /** + * Intializes the CRS matrix structure from the given discetization in @p mesh. + * + * The sparse matrix pattern is generated but the values are 0. + * + * @param[in] mesh given discretization + * + * @warning A reference to the discretization @p mesh is stored inside this class. + * Therefore, changing @p mesh outside requires also + * to call method @p Derive_Matrix_Pattern explicitely. + * + * @see Derive_Matrix_Pattern + */ + explicit CRS_Matrix(Mesh const & mesh); + + /** + * Destructor. 
+ */ + ~CRS_Matrix() + {} + + /** + * Generates the sparse matrix pattern and overwrites the existing pattern. + * + * The sparse matrix pattern is generated but the values are 0. + */ + void Derive_Matrix_Pattern(); + + /** + * Calculates the entries of f.e. stiffness matrix and load/rhs vector @p f for the Laplace operator in 2D. + * No memory is allocated. + * + * @param[in,out] f (preallocated) rhs/load vector + */ + void CalculateLaplace(std::vector &f); + + /** + * Applies Dirichlet boundary conditions to stiffness matrix and to load vector @p f. + * The penalty method + * is used for incorporating the given values @p u. + * + * @param[in] u (global) vector with Dirichlet data + * @param[in,out] f load vector + */ + void ApplyDirichletBC(std::vector const &u, std::vector &f); + + /** + * Extracts the diagonal elemenst of the sparse matrix. + * + * @param[in,out] d (prellocated) vector of diagonal elements + */ + void GetDiag(std::vector &d) const; + + /** + * Performs the matrix-vector product w := K*u. + * + * @param[in,out] w resulting vector (preallocated) + * @param[in] u vector + */ + void Mult(std::vector &w, std::vector const &u) const; + + /** + * Calculates the defect/residuum w := f - K*u. + * + * @param[in,out] w resulting vector (preallocated) + * @param[in] f load vector + * @param[in] u vector + */ + void Defect(std::vector &w, + std::vector const &f, std::vector const &u) const; + + /** + * Number rows in matrix. + * @return number of rows. + */ + int Nrows() const + {return _nrows;} + + /** + * Show the matrix entries. + */ + void Debug() const; + + /** + * Finds in a CRS matrix the access index for an entry at row @p row and column @p col. + * + * @param[in] row row index + * @param[in] col column index + * @return index for element (@p row, @p col). If no appropriate entry exists then -1 will be returned. + * + * @warning assert() stops the function in case that matrix element (@p row, @p col) doesn't exist. 
+ */ + int fetch(int row, int col) const; + + /** + * Adds the element stiffness matrix @p ske and the element load vector @p fe + * of one triangular element with linear shape functions to the appropriate positions in + * the stiffness matrix, stored as CSR matrix K(@p sk,@p id, @p ik). + * + * @param[in] ial node indices of the three element vertices + * @param[in] ske element stiffness matrix + * @param[in] fe element load vector + * @param[in,out] f distributed local vector storing the right hand side + * + * @warning Algorithm assumes linear triangular elements (ndof_e==3). + */ + void AddElem_3(int const ial[3], double const ske[3][3], double const fe[3], std::vector &f); + + /** + * Compare @p this CRS matrix with an external CRS matrix stored in C-Style. + * + * The method prints statements on differences found. + * + * @param[in] nnode row number of external matrix + * @param[in] id start indices of matrix rows of external matrix + * @param[in] ik column indices of external matrix + * @param[in] sk non-zero values of external matrix + * + * @return true iff all data are identical. 
+ */ + bool Compare2Old(int nnode, int const id[], int const ik[], double const sk[]) const; + + private: + Mesh const & _mesh; //!< reference to discretization + int _nrows; //!< number of rows in matrix + int _nnz; //!< number of non-zero entries + std::vector _id; //!< start indices of matrix rows + std::vector _ik; //!< column indices + std::vector _sk; //!< non-zero values + +}; + + +#endif diff --git a/ex3/ex3/jacsolve.cpp b/ex3/ex3/jacsolve.cpp new file mode 100644 index 0000000..30766d2 --- /dev/null +++ b/ex3/ex3/jacsolve.cpp @@ -0,0 +1,61 @@ +#include "vdop.h" +#include "getmatrix.h" +#include "jacsolve.h" + +#include +#include +#include +#include +using namespace std; + +// ##################################################################### +// const int neigh[], const int color, +// const MPI::Intracomm& icomm, +void JacobiSolve(CRS_Matrix const &SK, vector const &f, vector &u) +{ + const double omega = 1.0; + const int maxiter = 1000; + const double tol = 1e-5, // tolerance + tol2 = tol * tol; // tolerance^2 + + int nrows = SK.Nrows(); // number of rows == number of columns + assert( nrows == static_cast(f.size()) && f.size() == u.size() ); + + cout << endl << " Start Jacobi solver for " << nrows << " d.o.f.s" << endl; + // Choose initial guess + for (int k = 0; k < nrows; ++k) + { + u[k] = 0.0; // u := 0 + } + + vector dd(nrows); // matrix diagonal + vector r(nrows); // residual + vector w(nrows); // correction + + SK.GetDiag(dd); // dd := diag(K) + ////DebugVector(dd);{int ijk; cin >> ijk;} + + // Initial sweep + SK.Defect(r, f, u); // r := f - K*u + + vddiv(w, r, dd); // w := D^{-1}*r + double sigma0 = dscapr(w, r); // s0 := + + // Iteration sweeps + int iter = 0; + double sigma = sigma0; + while ( sigma > tol2 * sigma0 && maxiter > iter) + { + ++iter; + vdaxpy(u, u, omega, w ); // u := u + om*w + SK.Defect(r, f, u); // r := f - K*u + vddiv(w, r, dd); // w := D^{-1}*r + sigma = dscapr(w, r); // s0 := +// cout << "Iteration " << iter << " : " << 
sqrt(sigma/sigma0) << endl; + } + cout << "aver. Jacobi rate : " << exp(log(sqrt(sigma / sigma0)) / iter) << " (" << iter << " iter)" << endl; + cout << "final error: " << sqrt(sigma / sigma0) << " (rel) " << sqrt(sigma) << " (abs)\n"; + + return; +} + diff --git a/ex3/ex3/jacsolve.h b/ex3/ex3/jacsolve.h new file mode 100644 index 0000000..dfa3802 --- /dev/null +++ b/ex3/ex3/jacsolve.h @@ -0,0 +1,18 @@ +#ifndef JACSOLVE_FILE +#define JACSOLVE_FILE +#include "getmatrix.h" +#include + + +/** + * Solves linear system of equations K @p u = @p f via the Jacobi iteration. + * We use a distributed symmetric CSR matrix @p SK and initial guess of the + * solution is set to 0. + * @param[in] SK CSR matrix + * @param[in] f distributed local vector storing the right hand side + * @param[out] u accumulated local vector storing the solution. +*/ +void JacobiSolve(CRS_Matrix const &SK, std::vector const &f, std::vector &u); + + +#endif diff --git a/ex3/ex3/main.cpp b/ex3/ex3/main.cpp new file mode 100644 index 0000000..088efa4 --- /dev/null +++ b/ex3/ex3/main.cpp @@ -0,0 +1,101 @@ +#include "benchmarks.h" +#include "benchmark_tests.h" +#include "factorization_solve_tests.h" +#include + +int main() +{ + // ---------------------------------- 1. ---------------------------------- + // results in file "test_system.txt" + + + + // ---------------------------------- 2. ---------------------------------- + // Memory FLOPS Read-write operations + // (A) 2N 2N 2N + // (B) MN + N 2NM 2NM + M + // (C) ML + LM 2LNM 2LNM + MN + // (D) (p+1) + N 3N(p+1) N(2 + 3(p+1)) + + + + // ---------------------------------- 3. ---------------------------------- + // implementation in file "benchmarks.cpp" + + + + // ---------------------------------- 4.& 6. 
---------------------------------- + vector> results; + + results.push_back(test_A(250, 50000000, scalar)); + results.push_back(test_A(250, 50000000, Kahan_skalar)); + results.push_back(test_A(250, 50000000, scalar_cBLAS)); + // Timing GFLOPS GiByte/s + // ------------------------------------------ + // scalar 0.039 2.4 19 + // Kahan_skalar 0.037 2.5 20 + // scalar_cBLAS 0.032 2.9 23 + + + results.push_back(test_B(100, 20000, 10000, MatVec)); + results.push_back(test_B(100, 20000, 10000, MatVec_cBLAS)); + // Timing GFLOPS GiByte/s + // ------------------------------------------ + // MatVec 0.1 3.6 29 + // MatVec_cBLAS 0.074 5 40 + + + results.push_back(test_C(25, 500, 1000, 1500, MatMat)); + results.push_back(test_C(25, 500, 1000, 1500, MatMat_cBLAS)); + // Timing GFLOPS GiByte/s + // ------------------------------------------ + // MatMat 0.57 2.5 20 + // MatMat_cBLAS 0.019 75 6e+02 // unrealistic + + + results.push_back(test_D(100, 100, 1000000)); + // Timing GFLOPS GiByte/s + // ------------------------------------------ + // 0.11 2.5 20 + + + cout << endl << "Timing\tGFLOPS\tGiByte/s" << endl; + cout << "------------------------------" << endl; + for (size_t i = 0; i < results.size(); ++i) + cout << results[i][0] << "\t" << results[i][1] << "\t" << results[i][2] << endl; + cout << endl; + + + + // ---------------------------------- 5. ---------------------------------- + // 5.(a) Observation: time to calculate norm is approximately half the time as for the scalar product. + // Reason: only have to access entries of x, so less memory that has to be accessed + // + // 5.(b) Runtime for Kahan_scalar is roughly the same as for the normal scalar product + + + + // ---------------------------------- 6. ---------------------------------- + // see 4. + + + // ---------------------------------- 7. 
---------------------------------- + CheckCorrectness(); + // Checked correctness by computing the inverse of A + + CheckDuration(5000); + // The solving time per RHS scales roughly with factor 1/n_rhs + + + + // ---------------------------------- 8. ---------------------------------- + // done seperately + + + + + + + return 0; + +} diff --git a/ex3/ex3/userset.cpp b/ex3/ex3/userset.cpp new file mode 100644 index 0000000..b263b8f --- /dev/null +++ b/ex3/ex3/userset.cpp @@ -0,0 +1,16 @@ +#include "userset.h" +#include + + +double FunctF(double const x, double const y) +{ +// return std::sin(3.14159*1*x)*std::sin(3.14159*1*y); +// return 16.0*1024. ; +// return (double)1.0 ; + return x * x * std::sin(2.5 * 3.14159 * y); +} + +double FunctU(const double /* x */, double const /* y */) +{ + return 1.0 ; +} diff --git a/ex3/ex3/userset.h b/ex3/ex3/userset.h new file mode 100644 index 0000000..a734e1f --- /dev/null +++ b/ex3/ex3/userset.h @@ -0,0 +1,44 @@ +#ifndef USERSET_FILE +#define USERSET_FILE +#include +/** + * User function: f(@p x,@p y) + * @param[in] x x-coordinate of discretization point + * @param[in] y y-coordinate of discretization point + * @return value for right hand side f(@p x,@p y) + */ +double FunctF(double const x, double const y); + +/** + * User function: u(@p x,@p y) + * @param[in] x x-coordinate of discretization point + * @param[in] y y-coordinate of discretization point + * @return value for solution vector u(@p x,@p y) + */ +double FunctU(double const x, double const y); + + +/** + * User function: f(@p x,@p y) = @f$ x^2 \sin(2.5\pi y)@f$. + * @param[in] x x-coordinate of discretization point + * @param[in] y y-coordinate of discretization point + * @return value f(@p x,@p y) + */ +inline double fNice(double const x, double const y) +{ + return x * x * std::sin(2.5 * 3.14159 * y); +} + +/** + * User function: f(@p x,@p y) = 0$. 
+ * @param[in] x x-coordinate of discretization point + * @param[in] y y-coordinate of discretization point + * @return value 0 + */ +inline double f_zero(double const x, double const y) +//double f_zero(double const /*x*/, double const /*y*/) +{ + return 0.0 + 0.0*(x+y); +} + +#endif diff --git a/ex3/ex3/vdop.cpp b/ex3/ex3/vdop.cpp new file mode 100644 index 0000000..55fac1a --- /dev/null +++ b/ex3/ex3/vdop.cpp @@ -0,0 +1,84 @@ +#include "vdop.h" +#include // assert() +#include +#include +#include +using namespace std; + + +void vddiv(vector &x, vector const &y, + vector const &z) +{ + assert( x.size() == y.size() && y.size() == z.size() ); + size_t n = x.size(); + + for (size_t k = 0; k < n; ++k) + { + x[k] = y[k] / z[k]; + } + return; +} + +//****************************************************************************** + +void vdaxpy(std::vector &x, std::vector const &y, + double alpha, std::vector const &z ) +{ + assert( x.size() == y.size() && y.size() == z.size() ); + size_t n = x.size(); + + for (size_t k = 0; k < n; ++k) + { + x[k] = y[k] + alpha * z[k]; + } + return; +} +//****************************************************************************** + +double dscapr(std::vector const &x, std::vector const &y) +{ + assert( x.size() == y.size()); + size_t n = x.size(); + + double s = 0.0; + for (size_t k = 0; k < n; ++k) + { + s += x[k] * y[k]; + } + + return s; +} + +//****************************************************************************** +void DebugVector(vector const &v) +{ + cout << "\nVector (nnode = " << v.size() << ")\n"; + for (size_t j = 0; j < v.size(); ++j) + { + cout.setf(ios::right, ios::adjustfield); + cout << v[j] << " "; + } + cout << endl; + + return; +} +//****************************************************************************** +bool CompareVectors(std::vector const &x, int const n, double const y[], double const eps) +{ + bool bn = (static_cast(x.size()) == n); + if (!bn) + { + cout << "######### Error: " << "number of 
elements" << endl; + } + //bool bv = equal(x.cbegin(),x.cend(),y); + bool bv = equal(x.cbegin(), x.cend(), y, + [eps](double a, double b) -> bool + { return std::abs(a - b) < eps * (1.0 + 0.5 * (std::abs(a) + std::abs(a))); } + ); + if (!bv) + { + assert(static_cast(x.size()) == n); + cout << "######### Error: " << "values" << endl; + } + return bn && bv; +} diff --git a/ex3/ex3/vdop.h b/ex3/ex3/vdop.h new file mode 100644 index 0000000..b2d0adb --- /dev/null +++ b/ex3/ex3/vdop.h @@ -0,0 +1,58 @@ +#ifndef VDOP_FILE +#define VDOP_FILE +#include + +/** @brief Element-wise vector divison x_k = y_k/z_k. + * + * @param[out] x target vector + * @param[in] y source vector + * @param[in] z source vector + * + */ +void vddiv(std::vector & x, std::vector const& y, + std::vector const& z); + +/** @brief Element-wise daxpy operation x(k) = y(k) + alpha*z(k). + * + * @param[out] x target vector + * @param[in] y source vector + * @param[in] alpha scalar + * @param[in] z source vector + * + */ +void vdaxpy(std::vector & x, std::vector const& y, + double alpha, std::vector const& z ); + + +/** @brief Calculates the Euclidian inner product of two vectors. + * + * @param[in] x vector + * @param[in] y vector + * @return Euclidian inner product @f$\langle x,y \rangle@f$ + * + */ +double dscapr(std::vector const& x, std::vector const& y); + + +/** + * Print entries of a vector. + * @param[in] v vector values +*/ +void DebugVector(std::vector const &v); + +/** @brief Compares an STL vector with POD vector. + * + * The accuracy criteria @f$ |x_k-y_k| < \varepsilon \left({1+0.5(|x_k|+|y_k|)}\right) @f$ + * follows the book by + * Stoyan/Baran, p.8. + * + * @param[in] x STL vector + * @param[in] n length of POD vector + * @param[in] y POD vector + * @param[in] eps relative accuracy criteria (default := 0.0). + * @return true iff pairwise vector elements are relatively close to each other. 
+ * + */ +bool CompareVectors(std::vector const& x, int n, double const y[], double const eps=0.0); + +#endif diff --git a/ex4/ex_4A.py b/ex4/ex_4A.py new file mode 100644 index 0000000..e8dd84e --- /dev/null +++ b/ex4/ex_4A.py @@ -0,0 +1,76 @@ +import numpy as np +import matplotlib.pyplot as plt + +np.set_printoptions(precision=2) + + +N = 10 # number of elements +a = 1 +alpha = 1 +g_b = 1 +f_const = 1 + +x = np.linspace(0, 1, N + 1) + + +A = np.zeros((N + 1, N + 1)) +f_vec = np.zeros(N + 1) + +for i in range(1, N + 1): + h = x[i] - x[i - 1] + + + a_11 = 1./h + a*h/3. + a_12 = -1./h + a*h/6. + a_21 = -1./h + a*h/6. + a_22 = 1./h + a*h/3 + + A[i - 1, i - 1] += a_11 + A[i - 1, i] += a_12 + A[i, i - 1] += a_21 + A[i, i] += a_22 + + + f_vec[i] = f_const*h + +print("A =\n", A) + +# take Neumann data into account +A[N, N] += alpha +f_vec[N] += alpha*g_b + + +# take Dirichlet data into account +u_g = np.zeros(N + 1) +u_g[0] = 0 +print("u_g =\n", u_g) + +# remove first row of A +A_g = A[1:N+1, :] +#print("A_g =\n", A_g) + +# remove first row of f_vec +f_g = f_vec[1:N+1] + +# assemble RHS with dirichlet data +f_g -= A_g.dot(u_g) +#print(f_g) + +# matrix for the inner nodes (excluding nodes with dirichlet bcs) +A_0 = A[1:N+1, 1:N+1] +#print(A_0) + +# solve for u_0 (free dofs) +u_0 = np.linalg.solve(A_0, f_g) + +# assemble "u = u_0 + u_g" +u = np.concatenate([[0], u_0]) + + +print("u =\n", u) + +plt.plot(x, u, '-') +plt.xlabel('x') +plt.ylabel('u_h(x)') +plt.grid() +plt.show() \ No newline at end of file diff --git a/ex4/ex_4B.py b/ex4/ex_4B.py new file mode 100644 index 0000000..cd58310 --- /dev/null +++ b/ex4/ex_4B.py @@ -0,0 +1,83 @@ +import numpy as np +import matplotlib.pyplot as plt + +np.set_printoptions(precision=2) + + + +N = 2 # number of elements +split = np.sqrt(2)/2 +#split = 0.5 + + +N_lower = round(N*split) # number of elements in (0, split) +print(N_lower, "elements below sqrt(2)/2") +N_upper = N - N_lower # number of elements in (split, 1) +print(N_upper, 
"elements above sqrt(2)/2") + +assert(N == N_lower + N_upper) + + + +x_lower = np.linspace(0, split, N_lower + 1) +x_upper = np.linspace(split, 1, N_upper + 1) +x = np.concatenate([x_lower, x_upper[1:]]) +print(x) + + +A = np.zeros((N + 1, N + 1)) + +for i in range(1, N + 1): + h = x[i] - x[i - 1] + + lam = 1 + if(x[i] > split): + lam = 10 + + + a_11 = lam/h + a_12 = -lam/h + a_21 = -lam/h + a_22 = lam/h + + A[i - 1, i - 1] += a_11 + A[i - 1, i] += a_12 + A[i, i - 1] += a_21 + A[i, i] += a_22 + +print("A =\n", A) + + + +# take dirichlet data into account +u_g = np.zeros(N + 1) +u_g[0] = 0 +u_g[N] = 1 +print("u_g =\n", u_g) + +# remove first and last row of A +A_g = A[1:N, :] +#print("A_g =\n", A_g) + +# assemble RHS with dirichlet data +f = -A_g.dot(u_g) +#print(f) + +# matrix for the inner nodes (excluding nodes with dirichlet bcs) +A_0 = A[1:N, 1:N] +#print(A_0) + +# solve for u_0 (free dofs) +u_0 = np.linalg.solve(A_0, f) + +# assemble "u = u_0 + u_g" +u = np.concatenate([[0], u_0, [1]]) + + +print("u =\n", u) + +plt.plot(x, u, '-') +plt.xlabel('x') +plt.ylabel('u_h(x)') +plt.grid() +plt.show() \ No newline at end of file diff --git a/ex4/ex_4C.py b/ex4/ex_4C.py new file mode 100644 index 0000000..5c8184e --- /dev/null +++ b/ex4/ex_4C.py @@ -0,0 +1,76 @@ +import numpy as np +import matplotlib.pyplot as plt + +np.set_printoptions(precision=2) + + +p = 70 +N_vec = [10, 20, 30, 40, 70] +for N in N_vec: + + x = np.linspace(0, 1, N + 1) + + A = np.zeros((N + 1, N + 1)) + + for i in range(1, N + 1): + h = x[i] - x[i - 1] + + + a_11 = 1./h - p/2. + a_12 = -1./h + p/2. + a_21 = -1./h - p/2. + a_22 = 1./h + p/2. 
+ + A[i - 1, i - 1] += a_11 + A[i - 1, i] += a_12 + A[i, i - 1] += a_21 + A[i, i] += a_22 + + print("A =\n", A) + + + + # take dirichlet data into account + u_g = np.zeros(N + 1) + u_g[0] = 0 + u_g[N] = 1 + print("u_g =\n", u_g) + + # remove first and last row of A + A_g = A[1:N, :] + #print("A_g =\n", A_g) + + # assemble RHS with dirichlet data + f = -A_g.dot(u_g) + #print(f) + + # matrix for the inner nodes (excluding nodes with dirichlet bcs) + A_0 = A[1:N, 1:N] + #print(A_0) + + # solve for u_0 (free dofs) + u_0 = np.linalg.solve(A_0, f) + + # assemble "u = u_0 + u_g" + u = np.concatenate([[0], u_0, [1]]) + + + print("u =\n", u) + + plt.plot(x, u, '-') + + +# plotting the exact solution +plt.plot(x, (np.exp(p*x) - 1.)/(np.exp(p) - 1.)) + +plt.xlabel('x') +plt.ylabel('u_h(x)') +plt.legend(N_vec + ['exact']) +plt.title("Comparing discrete solution for increasing number of elements") +plt.grid() +plt.show() + + + + + diff --git a/ex4/ex_4_results.pdf b/ex4/ex_4_results.pdf new file mode 100644 index 0000000..2018312 Binary files /dev/null and b/ex4/ex_4_results.pdf differ diff --git a/ex5/CLANG_default.mk b/ex5/CLANG_default.mk new file mode 100644 index 0000000..d27bbe6 --- /dev/null +++ b/ex5/CLANG_default.mk @@ -0,0 +1,131 @@ +# Basic Defintions for using GNU-compiler suite sequentially +# requires setting of COMPILER=CLANG_ +# https://llvm.org/docs/CompileCudaWithLLVM.html +# https://llvm.org/docs/NVPTXUsage.html + +#CLANGPATH=//usr/lib/llvm-10/bin/ +CC = ${CLANGPATH}clang +CXX = ${CLANGPATH}clang++ +#CXX = ${CLANGPATH}clang++ -lomptarget -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=/opt/pgi/linux86-64/2017/cuda/8.0 +#F77 = gfortran +LINKER = ${CXX} + +#http://clang.llvm.org/docs/UsersManual.html#options-to-control-error-and-warning-messages +WARNINGS += -pedantic -Weverything -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion +WARNINGS += -Wno-c++98-compat -Wno-sign-conversion -Wno-date-time -Wno-shorten-64-to-32 -Wno-padded -ferror-limit=1 
+WARNINGS += -Wno-unsafe-buffer-usage +#-fsyntax-only -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic + +CXXFLAGS += -O3 -std=c++17 -ferror-limit=1 ${WARNINGS} +# don't use -Ofast +# -ftrapv +LINKFLAGS += -O3 + +# different libraries in Ubuntu or manajaró +ifndef UBUNTU +UBUNTU=1 +endif + +# BLAS, LAPACK +LINKFLAGS += -llapack -lblas +# -lopenblas +ifeq ($(UBUNTU),1) +# ubuntu +else +# on archlinux +LINKFLAGS += -lcblas +endif + +# interprocedural optimization +CXXFLAGS += -flto +LINKFLAGS += -flto + +#sudo apt install libomp-dev +# OpenMP +CXXFLAGS += -fopenmp +LINKFLAGS += -fopenmp + +# very good check +# http://clang.llvm.org/extra/clang-tidy/ +# good check, see: http://llvm.org/docs/CodingStandards.html#include-style +SWITCH_OFF=,-readability-magic-numbers,-readability-redundant-control-flow,-readability-redundant-member-init +SWITCH_OFF+=,-readability-redundant-member-init,-readability-isolate-declaration +#READABILITY=,readability*${SWITCH_OFF} +#TIDYFLAGS = -checks=llvm-*,-llvm-header-guard -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp" +TIDYFLAGS = -checks=llvm-*,-llvm-header-guard${READABILITY} -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp" +#TIDYFLAGS += -checks='modernize* +# ??? +#TIDYFLAGS = -checks='cert*' -header-filter=.* +# MPI checks ?? +#TIDYFLAGS = -checks='mpi*' +# ?? 
+#TIDYFLAGS = -checks='performance*' -header-filter=.* +#TIDYFLAGS = -checks='portability-*' -header-filter=.* +#TIDYFLAGS = -checks='readability-*' -header-filter=.* + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + @rm -f ${PROGRAM} ${OBJECTS} + +clean_all:: clean + @rm -f *_ *~ *.bak *.log *.out *.tar + +codecheck: tidy_check +tidy_check: + clang-tidy ${SOURCES} ${TIDYFLAGS} -- ${SOURCES} +# see also http://clang-developers.42468.n3.nabble.com/Error-while-trying-to-load-a-compilation-database-td4049722.html + +run: clean ${PROGRAM} +# time ./${PROGRAM} ${PARAMS} + ./${PROGRAM} ${PARAMS} + +# tar the current directory +MY_DIR = `basename ${PWD}` +tar: clean_all + @echo "Tar the directory: " ${MY_DIR} + @cd .. ;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * + +doc: + doxygen Doxyfile + +######################################################################### + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# some tools +# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags) +cache: ${PROGRAM} + valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS} +# kcachegrind callgrind.out. & + kcachegrind `ls -1tr callgrind.out.* |tail -1` + +# Check for wrong memory accesses, memory leaks, ... 
+# use smaller data sets +mem: ${PROGRAM} + valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS} + +# Simple run time profiling of your code +# CXXFLAGS += -g -pg +# LINKFLAGS += -pg +prof: ${PROGRAM} + perf record ./$^ ${PARAMS} + perf report +# gprof -b ./$^ > gp.out +# kprof -f gp.out -p gprof & + +codecheck: tidy_check diff --git a/ex5/GCCMKL_default.mk b/ex5/GCCMKL_default.mk new file mode 100644 index 0000000..e16d98f --- /dev/null +++ b/ex5/GCCMKL_default.mk @@ -0,0 +1,212 @@ +# Basic Defintions for using GNU-compiler suite with OpenMP und MKL +# requires setting of COMPILER=GCCMKL_ + +# install MKL in manjaro +# https://linux-packages.com/manjaro-linux/package/intel-mkl +# > sudo pacman -Sy +# > sudo pacman -S intel-mkl + +ifeq ($(ONEAPI),1) +MKL_INCLUDE=/opt/intel/oneapi/mkl/2024.0/include +MKL_LIB=/opt/intel/oneapi/2024.0/lib +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/intel/oneapi/2024.0/lib +else +MKL_INCLUDE=/usr/include/mkl +MKL_LIB=/usr/lib/x86_64-linux-gnu/mkl +endif + +CC = gcc +CXX = g++ +F77 = gfortran +LINKER = ${CXX} + +WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \ + -Wredundant-decls -Winline -fmax-errors=1 +# -Wunreachable-code +CXXFLAGS += -ffast-math -O3 -march=native -std=c++17 ${WARNINGS} +#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS} +#-msse3 +# -ftree-vectorizer-verbose=2 -DNDEBUG +# -ftree-vectorizer-verbose=5 +# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump -fdump-tree-pre=stderr + +# CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details +# CFLAGS = -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2 +# #CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp +# FFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp +# LFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp +LINKFLAGS += -O3 + +#architecture +CPU = -march=znver2 +#CPU = 
-march=core-avx2 +CXXFLAGS += ${CPU} +LINKFLAGS += ${CPU} + +# MKL +#CXXFLAGS += -I/usr/include/mkl -DUSE_MKL -Wno-redundant-decls +CXXFLAGS += -I${MKL_INCLUDE} -DUSE_MKL -Wno-redundant-decls +#LINKFLAGS += -lmkl_intel_lp64 -lmkl_tbb_thread -ltbb -lmkl_core +#LINKFLAGS += -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -L/usr/lib/x86_64-linux-gnu/mkl +LINKFLAGS += -L${MKL_LIB} -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core +#LINKFLAGS += -lmkl_intel_lp64 -lmkl_sequential -lmkl_core + +# workaround for MKL slow down on AMD hardware +# https://danieldk.eu/Posts/2020-08-31-MKL-Zen.html +default: run +libfakeintel.so: + gcc -shared -fPIC -o libfakeintel.so fakeintel.c + echo "call: export LD_PRELOAD=./libfakeintel.so " + +# different libraries in Ubuntu or manajaro +#ifndef UBUNTU +#UBUNTU=1 +#endif + +## BLAS, LAPACK +#ifeq ($(UBUNTU),1) +#LINKFLAGS += -llapack -lblas +## -lopenblas +#else +## on archlinux +#LINKFLAGS += -llapack -lopenblas -lcblas +#endif + +# interprocedural optimization +CXXFLAGS += -flto +LINKFLAGS += -flto + +# for debugging purpose (save code) +# -fsanitize=leak # only one out the three can be used +# -fsanitize=address +# -fsanitize=thread +SANITARY = -fsanitize=address -fsanitize=undefined -fsanitize=null -fsanitize=return \ + -fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \ + -fsanitize=bool -fsanitize=enum -fsanitize=vptr +#CXXFLAGS += ${SANITARY} +#LINKFLAGS += ${SANITARY} + +# OpenMP +CXXFLAGS += -fopenmp +LINKFLAGS += -fopenmp + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + @rm -f ${PROGRAM} ${OBJECTS} + +clean_all:: clean + -@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt + -@rm -rf html + +run: clean ${PROGRAM} libfakeintel.so +#run: ${PROGRAM} +# time ./${PROGRAM} ${PARAMS} + ./${PROGRAM} ${PARAMS} + +# tar the current directory +MY_DIR = `basename ${PWD}` +tar: clean_all + @echo "Tar the directory: " ${MY_DIR} + @cd .. 
;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * +#find . -size +10M > large_files +#--exclude-from ${MY_DIR}/large_files + +zip: clean + @echo "Zip the directory: " ${MY_DIR} + @cd .. ;\ + zip -r ${MY_DIR}.zip ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} + +doc: + doxygen Doxyfile + +######################################################################### +.SUFFIXES: .f90 + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< +# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $<.log +# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $(<:.cpp=.log) + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +.f90.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# some tools +# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags) +cache: ${PROGRAM} + valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS} +# kcachegrind callgrind.out. & + kcachegrind `ls -1tr callgrind.out.* |tail -1` + +# Check for wrong memory accesses, memory leaks, ... 
+# use smaller data sets +# no "-pg" in compile/link options +mem: ${PROGRAM} + valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS} +# Graphical interface +# valkyrie + +# Simple run time profiling of your code +# CXXFLAGS += -g -pg +# LINKFLAGS += -pg +prof: ${PROGRAM} + perf record ./$^ ${PARAMS} + perf report +# gprof -b ./$^ > gp.out +# kprof -f gp.out -p gprof & + +# perf in Ubuntu 20.04: https://www.howtoforge.com/how-to-install-perf-performance-analysis-tool-on-ubuntu-20-04/ +# * install +# * sudo vi /etc/sysctl.conf +# add kernel.perf_event_paranoid = 0 + +#Trace your heap: +#> heaptrack ./main.GCC_ +#> heaptrack_gui heaptrack.main.GCC_..gz +heap: ${PROGRAM} + heaptrack ./$^ ${PARAMS} 11 + heaptrack_gui `ls -1tr heaptrack.$^.* |tail -1` & + +codecheck: $(SOURCES) + cppcheck --enable=all --inconclusive --std=c++17 --suppress=missingIncludeSystem $^ + + +######################################################################## +# get the detailed status of all optimization flags +info: + echo "detailed status of all optimization flags" + $(CXX) --version + $(CXX) -Q $(CXXFLAGS) --help=optimizers + lscpu + inxi -C + lstopo + +# Excellent hardware info +# hardinfo +# Life monitoring of CPU frequency etc. 
+# sudo i7z + +# Memory consumption +# vmstat -at -SM 3 +# xfce4-taskmanager + + +# https://www.tecmint.com/check-linux-cpu-information/ +#https://www.tecmint.com/monitor-cpu-and-gpu-temperature-in-ubuntu/ + +# Debugging: +# https://wiki.archlinux.org/index.php/Debugging diff --git a/ex5/GCC_SINGLE_default.mk b/ex5/GCC_SINGLE_default.mk new file mode 100644 index 0000000..5ee06e1 --- /dev/null +++ b/ex5/GCC_SINGLE_default.mk @@ -0,0 +1,111 @@ +# Basic Definitions for using GNU-compiler suite sequentially +# requires setting of COMPILER=GCC_ + +CC = gcc +CXX = g++ +F77 = gfortran +LINKER = ${CXX} + +# on mephisto: +#CXXFLAGS += -I/share/apps/atlas/include +#LINKFLAGS += -L/share/apps/atlas/lib -L/usr/lib64/atlas +#LINKFLAGS += -latlas -lcblas +#LINKFLAGS += -lblas +# Der Header muss mit extern "C" versehen werden, damit g++ alles findet. + + +WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \ + -Wredundant-decls -Winline -fmax-errors=1 +# -Wunreachable-code +#CXXFLAGS += -std=c++17 -ffast-math -O3 -march=native -DNDEBUG ${WARNINGS} +CXXFLAGS += -std=c++17 -ffast-math -O3 -march=native ${WARNINGS} + +# info on vectorization +#VECTORIZE = -ftree-vectorize -fdump-tree-vect-blocks=foo.dump +#-fdump-tree-pre=stderr +VECTORIZE = -ftree-vectorize -fopt-info -ftree-vectorizer-verbose=5 +#CXXFLAGS += ${VECTORIZE} +# -funroll-all-loops -msse3 +#GCC -march=knl -march=broadwell -march=haswell + +# interprocedural optimization +#CXXFLAGS += -flto +LINKFLAGS += -flto + +# for debugging purpose (save code) +# -fsanitize=leak # only one out of the three can be used +# -fsanitize=address +# -fsanitize=thread +SANITARY = -fsanitize=address -fsanitize=undefined -fsanitize=null -fsanitize=return \ + -fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \ + -fsanitize=bool -fsanitize=enum -fsanitize=vptr +#CXXFLAGS += ${SANITARY} +#LINKFLAGS +=${SANITARY} + +# OpenMP but no OpenMP in Single mode 
+#CXXFLAGS += -fopenmp +LINKFLAGS += -fopenmp + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + @rm -f ${PROGRAM} ${OBJECTS} + +clean_all:: clean + @rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt + @rm -rf html + +run: clean ${PROGRAM} +# time ./${PROGRAM} + ./${PROGRAM} + +# tar the current directory +MY_DIR = `basename ${PWD}` +tar: + @echo "Tar the directory: " ${MY_DIR} + @cd .. ;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * + +doc: + doxygen Doxyfile + +######################################################################### + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# some tools +# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags) +cache: ${PROGRAM} + valgrind --tool=callgrind --simulate-cache=yes ./$^ +# kcachegrind callgrind.out. & + kcachegrind `ls -1tr callgrind.out.* |tail -1` + +# Check for wrong memory accesses, memory leaks, ... 
+# use smaller data sets +mem: ${PROGRAM} + valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ + + +thread:${PROGRAM} + valgrind -v --tool=helgrind --log-file=$^.thread.out ./$^ + +# Simple run time profiling of your code +# CXXFLAGS += -g -pg +# LINKFLAGS += -pg +prof: ${PROGRAM} + ./$^ + gprof -b ./$^ > gp.out +# kprof -f gp.out -p gprof & diff --git a/ex5/GCC_default.mk b/ex5/GCC_default.mk new file mode 100644 index 0000000..c2fbde3 --- /dev/null +++ b/ex5/GCC_default.mk @@ -0,0 +1,182 @@ +# Basic Defintions for using GNU-compiler suite sequentially +# requires setting of COMPILER=GCC_ + +CC = gcc +CXX = g++ +F77 = gfortran +LINKER = ${CXX} + +WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \ + -Wredundant-decls -Winline -fmax-errors=1 +# -Wunreachable-code +CXXFLAGS += -ffast-math -O3 -march=native -std=c++20 ${WARNINGS} +#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS} +#-msse3 +# -ftree-vectorizer-verbose=2 -DNDEBUG +# -ftree-vectorizer-verbose=5 +# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump -fdump-tree-pre=stderr + +# CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details +# CFLAGS = -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2 +# #CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp +# FFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp +# LFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp +LINKFLAGS += -O3 + +#architecture +#CPU = -march=znver2 +CXXFLAGS += ${CPU} +LINKFLAGS += ${CPU} -ltbb + +# different libraries in Ubuntu or manajaró +ifndef UBUNTU +UBUNTU=1 +endif + +# BLAS, LAPACK +ifeq ($(UBUNTU),1) +LINKFLAGS += -llapack -lblas +# -lopenblas +else +# on archlinux +LINKFLAGS += -llapack -lopenblas -lcblas +endif + +# interprocedural optimization +CXXFLAGS += -flto +LINKFLAGS += -flto + +# for debugging purpose (save code) +# -fsanitize=leak # only one 
out the three can be used +# -fsanitize=address +# -fsanitize=thread +SANITARY = -fsanitize=address -fsanitize=undefined -fsanitize=null -fsanitize=return \ + -fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \ + -fsanitize=bool -fsanitize=enum -fsanitize=vptr +#CXXFLAGS += ${SANITARY} +#LINKFLAGS += ${SANITARY} + +# OpenMP +CXXFLAGS += -fopenmp +LINKFLAGS += -fopenmp + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + @rm -f ${PROGRAM} ${OBJECTS} + +clean_all:: clean + -@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt + -@rm -rf html + +run: clean ${PROGRAM} +#run: ${PROGRAM} +# time ./${PROGRAM} ${PARAMS} + ./${PROGRAM} ${PARAMS} + +# tar the current directory +MY_DIR = `basename ${PWD}` +tar: clean_all + @echo "Tar the directory: " ${MY_DIR} + @cd .. ;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * +#find . -size +10M > large_files +#--exclude-from ${MY_DIR}/large_files + +zip: clean + @echo "Zip the directory: " ${MY_DIR} + @cd .. ;\ + zip -r ${MY_DIR}.zip ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} + +doc: + doxygen Doxyfile + +######################################################################### +.SUFFIXES: .f90 + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< +# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $<.log +# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $(<:.cpp=.log) + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +.f90.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# some tools +# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags) +cache: ${PROGRAM} + valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS} +# kcachegrind callgrind.out. & + kcachegrind `ls -1tr callgrind.out.* |tail -1` + +# Check for wrong memory accesses, memory leaks, ... 
+# use smaller data sets +# no "-pg" in compile/link options +mem: ${PROGRAM} + valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS} +# Graphical interface +# valkyrie + +# Simple run time profiling of your code +# CXXFLAGS += -g -pg +# LINKFLAGS += -pg +prof: ${PROGRAM} + perf record ./$^ ${PARAMS} + perf report +# gprof -b ./$^ > gp.out +# kprof -f gp.out -p gprof & + +# perf in Ubuntu 20.04: https://www.howtoforge.com/how-to-install-perf-performance-analysis-tool-on-ubuntu-20-04/ +# * install +# * sudo vi /etc/sysctl.conf +# add kernel.perf_event_paranoid = 0 + +#Trace your heap: +#> heaptrack ./main.GCC_ +#> heaptrack_gui heaptrack.main.GCC_..gz +heap: ${PROGRAM} + heaptrack ./$^ ${PARAMS} 11 + heaptrack_gui `ls -1tr heaptrack.$^.* |tail -1` & + +codecheck: $(SOURCES) + cppcheck --enable=all --inconclusive --std=c++17 -I${CUDA_INC} --suppress=missingIncludeSystem $^ + + +######################################################################## +# get the detailed status of all optimization flags +info: + echo "detailed status of all optimization flags" + $(CXX) --version + $(CXX) -Q $(CXXFLAGS) --help=optimizers + lscpu + inxi -C + lstopo + +# Excellent hardware info +# hardinfo +# Life monitoring of CPU frequency etc. 
+# sudo i7z + +# Memory consumption +# vmstat -at -SM 3 +# xfce4-taskmanager + + +# https://www.tecmint.com/check-linux-cpu-information/ +#https://www.tecmint.com/monitor-cpu-and-gpu-temperature-in-ubuntu/ + +# Debugging: +# https://wiki.archlinux.org/index.php/Debugging diff --git a/ex5/ICC_default.mk b/ex5/ICC_default.mk new file mode 100644 index 0000000..56c1275 --- /dev/null +++ b/ex5/ICC_default.mk @@ -0,0 +1,151 @@ +# Basic Definitions for using INTEL compiler suite sequentially +# requires setting of COMPILER=ICC_ + +# special on my sony [GH] +#BINDIR = /opt/save.intel/bin/ +# very special on my sony [GH] +# FIND_LIBS = -L /opt/save.intel/composer_xe_2013.1.117/mkl/lib/intel64/libmkl_intel_lp64.so + +#export KMP_AFFINITY=verbose,compact + +CC = ${BINDIR}icc +CXX = ${BINDIR}icpc +F77 = ${BINDIR}ifort +LINKER = ${CXX} + +WARNINGS = -pedantic -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -wd2015,2012 + #-Winline -Wunreachable-code -Wredundant-decls +CXXFLAGS += -std=c++17 -O3 -fma -DNDEBUG ${WARNINGS} -mkl +#CXXFLAGS += -std=c++17 -O3 -march=core-avx2 -fma -ftz -fomit-frame-pointer -DNDEBUG ${WARNINGS} -mkl +# -fast # fast includes also -ipo ! 
+CXXFLAGS += -fargument-noalias -fargument-noalias-global -ansi-alias +CXXFLAGS += -align -qopt-dynamic-align +#CXXFLAGS += -xCore-AVX2 +#CXXFLAGS += -tp=zen +# -qopt-subscript-in-range +# -vec-threshold0 +# -xCORE-AVX2 +# -axcode COMMON-AVX512 -axcode MIC-AVX512 -axcode CORE-AVX512 -axcode CORE-AVX2 +# -ipo + +# Reports: https://software.intel.com/en-us/articles/getting-the-most-out-of-your-intel-compiler-with-the-new-optimization-reports +#CXXFLAGS += -qopt-report=5 -qopt-report-phase=vec,par + +#CXXFLAGS += -qopt-report=5 -qopt-report-phase=cg +# Redirect report from *.optrpt to stderr +# -qopt-report-file=stderr +# Guided paralellization +# -guide -parallel +# -guide-opts=string -guide-par[=n] -guide-vec[=n] +# -auto-p32 -simd + +# interprocedural optimization +#CXXFLAGS += -ipo +#LINKFLAGS += -ipo + +# annotated Assembler file +ANNOTED = -fsource-asm -S + +# OpenMP +CXXFLAGS += -qopenmp +# -qopt-report-phase=openmp +# -diag-enable=sc-full -diag-file=filename -diag-file-append[=filename] +LINKFLAGS += -qopenmp + +# use MKL by INTEL +# LINKFLAGS += -L${BINDIR}../composer_xe_2013.1.117/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread +LINKFLAGS += -O2 -mkl +# -ipo + + + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + rm -f ${PROGRAM} ${OBJECTS} *.optrpt + +clean_all:: clean + @rm -f *_ *~ *.bak *.log *.out *.tar + +run: clean ${PROGRAM} + ./${PROGRAM} + +# tar the current directory +MY_DIR = `basename ${PWD}` +tar: clean_all + @echo "Tar the directory: " ${MY_DIR} + @cd .. 
;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * + +doc: + doxygen Doxyfile + +######################################################################### + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# # some tools +# # Cache behaviour (CXXFLAGS += -g tracks down to source lines) +# cache: ${PROGRAM} +# valgrind --tool=callgrind --simulate-cache=yes ./$^ +# # kcachegrind callgrind.out. & +# +# # Check for wrong memory accesses, memory leaks, ... +# # use smaller data sets +# mem: ${PROGRAM} +# valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ +# +# # Simple run time profiling of your code +# # CXXFLAGS += -g -pg +# # LINKFLAGS += -pg +# prof: ${PROGRAM} +# ./$^ +# gprof -b ./$^ > gp.out +# # kprof -f gp.out -p gprof & +# + + +mem: inspector +prof: amplifier +cache: amplifier + +gap_par_report: + ${CXX} -c -guide -parallel $(SOURCES) 2> gap.txt + +# GUI for performance report +amplifier: ${PROGRAM} + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope + echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid + amplxe-gui & + +# GUI for Memory and Thread analyzer (race condition) +inspector: ${PROGRAM} + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope + #${BINDIR}../inspector_xe_2013/bin64/inspxe-gui & + inspxe-gui & + +advisor: + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope +# https://software.intel.com/en-us/articles/intel-advisor-2017-update-1-what-s-new + export ADVIXE_EXPERIMENTAL=roofline + advixe-gui & + +icc-info: + icpc -# main.cpp + + + + diff --git a/ex5/ONEAPI_default.mk b/ex5/ONEAPI_default.mk new file mode 100644 index 0000000..c4c79ab --- /dev/null +++ b/ex5/ONEAPI_default.mk @@ -0,0 +1,181 @@ +# Basic Defintions for using 
INTEL compiler suite sequentially +# requires setting of COMPILER=ONEAPI_ + +# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html +# requires +# source /opt/intel/oneapi/setvars.sh +# on AMD: export MKL_DEBUG_CPU_TYPE=5 + +#BINDIR = /opt/intel/oneapi/compiler/latest/linux/bin/ +#MKL_ROOT = /opt/intel/oneapi/mkl/latest/ +#export KMP_AFFINITY=verbose,compact + +CC = ${BINDIR}icc +CXX = ${BINDIR}dpcpp +F77 = ${BINDIR}ifort +LINKER = ${CXX} + +## Compiler flags +WARNINGS = -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -pedantic +WARNINGS += -Wpessimizing-move -Wredundant-move +#-wd2015,2012,2014 -wn3 +# -Winline -Wredundant-decls -Wunreachable-code +# -qopt-subscript-in-range +# -vec-threshold0 + +CXXFLAGS += -O3 -std=c++17 -tbb ${WARNINGS} +# https://stackoverflow.com/questions/67923287/how-to-resolve-no-member-named-task-in-namespace-tbb-error-when-using-oned +# needed on Desktop-PC Haase (not needed on mephisto) +CXXFLAGS += -D_GLIBCXX_USE_TBB_PAR_BACKEND=0 +#CXXFLAGS += -DMKL_ILP64 -I"${MKLROOT}/include" +#CXXFLAGS += -DMKL_ILP32 -I"${MKLROOT}/include" +LINKFLAGS += -O3 -tbb +#LINKFLAGS += -no-prec-div + +# interprocedural optimization +CXXFLAGS += -ipo +LINKFLAGS += -ipo +LINKFLAGS += -flto + +# annotated Assembler file +ANNOTED = -fsource-asm -S + +#architecture +CPU = -march=core-avx2 +#CPU += -mtp=zen +# -xCORE-AVX2 +# -axcode COMMON-AVX512 -axcode MIC-AVX512 -axcode CORE-AVX512 -axcode CORE-AVX2 +CXXFLAGS += ${CPU} +LINKFLAGS += ${CPU} + +# use MKL by INTEL +# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html +# sequential MKL +# use the 32 bit interface (LP64) instead of 64 bit interface (ILP64) +CXXFLAGS += -qmkl=sequential -UMKL_ILP64 +LINKFLAGS += -O3 -qmkl=sequential -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread +#LINKFLAGS += -O3 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread + +# shared libs: 
https://aur.archlinux.org/packages/intel-oneapi-compiler-static +# install intel-oneapi-compiler-static +# or +LINKFLAGS += -shared-intel + + +OPENMP = -qopenmp +CXXFLAGS += ${OPENMP} +LINKFLAGS += ${OPENMP} + + +# profiling tools +#CXXFLAGS += -pg +#LINKFLAGS += -pg +# -vec-report=3 +# -qopt-report=5 -qopt-report-phase=vec -qopt-report-phase=openmp +# -guide -parallel +# -guide-opts=string -guide-par[=n] -guide-vec[=n] +# -auto-p32 -simd + +# Reports: https://software.intel.com/en-us/articles/getting-the-most-out-of-your-intel-compiler-with-the-new-optimization-reports +#CXXFLAGS += -qopt-report=5 -qopt-report-phase=vec,par +#CXXFLAGS += -qopt-report=5 -qopt-report-phase=cg +# Redirect report from *.optrpt to stderr +# -qopt-report-file=stderr +# Guided paralellization +# -guide -parallel +# -guide-opts=string -guide-par[=n] -guide-vec[=n] +# -auto-p32 -simd + +## run time checks +# https://www.intel.com/content/www/us/en/develop/documentation/fortran-compiler-oneapi-dev-guide-and-reference/top/compiler-reference/compiler-options/offload-openmp-and-parallel-processing-options/par-runtime-control-qpar-runtime-control.html + + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + rm -f ${PROGRAM} ${OBJECTS} *.optrpt + +clean_all:: clean + @rm -f *_ *~ *.bak *.log *.out *.tar + +run: clean ${PROGRAM} + ./${PROGRAM} ${PARAMS} + +# tar the current directory +MY_DIR = `basename ${PWD}` +tar: clean_all + @echo "Tar the directory: " ${MY_DIR} + @cd .. 
;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * + +doc: + doxygen Doxyfile + +######################################################################### + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# some tools +# Cache behaviour (CXXFLAGS += -g tracks down to source lines) +# https://software.intel.com/content/www/us/en/develop/documentation/vtune-help/top/analyze-performance/microarchitecture-analysis-group/memory-access-analysis.html + +mem: inspector +prof: vtune +cache: inspector + +gap_par_report: + ${CXX} -c -guide -parallel $(SOURCES) 2> gap.txt + +# GUI for performance report +amplifier: ${PROGRAM} + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope + echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid + amplxe-gui & + +# GUI for Memory and Thread analyzer (race condition) +inspector: ${PROGRAM} + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope +# inspxe-gui & + vtune-gui ./${PROGRAM} & + +advisor: +# source /opt/intel/oneapi/advisor/2021.2.0/advixe-vars.sh +# /opt/intel/oneapi/advisor/latest/bin64/advixe-gui & + advisor --collect=survey ./${PROGRAM} ${PARAMS} +# advisor --collect=roofline ./${PROGRAM} ${PARAMS} + advisor --report=survey --project-dir=./ src:r=./ --format=csv --report-output=./out/survey.csv +# advisor-gui + +vtune: + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope +# https://software.intel.com/en-us/articles/intel-advisor-2017-update-1-what-s-new + export ADVIXE_EXPERIMENTAL=roofline + vtune -collect hotspots ./${PROGRAM} ${PARAMS} + vtune -report hotspots -r r000hs > vtune.out +# vtune-gui ./${PROGRAM} & + +icc-info: + icpc -# main.cpp + +# MKL on AMD +# https://www.computerbase.de/2019-11/mkl-workaround-erhoeht-leistung-auf-amd-ryzen/ +# +# 
https://sites.google.com/a/uci.edu/mingru-yang/programming/mkl-has-bad-performance-on-an-amd-cpu +# export MKL_DEBUG_CPU_TYPE=5 +# export MKL_NUM_THREADS=1 +# export MKL_DYNAMIC=false +# on Intel compiler +# http://publicclu2.blogspot.com/2013/05/intel-complier-suite-reference-card.html diff --git a/ex5/PGI_default.mk b/ex5/PGI_default.mk new file mode 100644 index 0000000..e06cb3c --- /dev/null +++ b/ex5/PGI_default.mk @@ -0,0 +1,96 @@ +# Basic Definitions for using PGI-compiler suite sequentially +# requires setting of COMPILER=PGI_ +# OPTIRUN = optirun +# on mephisto: +#CXXFLAGS += -I/share/apps/atlas/include +#LINKFLAGS += -L/share/apps/atlas/lib +#LINKFLAGS += -lcblas -latlas + +LINKFLAGS += -lblas + +CC = pgcc +CXX = pgc++ +F77 = pgfortran +LINKER = ${CXX} + + +WARNINGS = -Minform=warn + +#PGI_PROFILING = -Minfo=loop,vect,opt,intensity,mp,accel +PGI_PROFILING = -Minfo=ccff,accel,ipa,loop,lre,mp,opt,par,unified,vect,intensity + +# -Minfo +# -Mprof=lines + +CXXFLAGS += -std=c++14 -O3 -fast -DNDEBUG ${PGI_PROFILING} ${WARNINGS} +CXXFLAGS += -Mvect -Mcache_align -Msafeptr -Mprefetch -Mlre -Mdepchk +#-Msmart + +LINKFLAGS += ${PGI_PROFILING} +#-lcblas +# OpenMP +CXXFLAGS += -mp=align,bind,numa -Mneginfo=mp +LINKFLAGS += -mp=allcores,bind,numa + +default: ${PROGRAM} + +${PROGRAM}: ${OBJECTS} + $(LINKER) $^ ${LINKFLAGS} -o $@ + +clean: + @rm -f ${PROGRAM} ${OBJECTS} + +clean_all:: clean + @rm -f *_ *~ *.bak *.log *.out *.tar + +run: clean ${PROGRAM} + ./${PROGRAM} + +# tar the current directory +MY_DIR = `basename ${PWD}` +tar: clean_all + @echo "Tar the directory: " ${MY_DIR} + @cd .. 
;\ + tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\ + cd ${MY_DIR} +# tar cf `basename ${PWD}`.tar * + +doc: + doxygen Doxyfile + +######################################################################### + +.cpp.o: + $(CXX) -c $(CXXFLAGS) -o $@ $< + +.c.o: + $(CC) -c $(CFLAGS) -o $@ $< + +.f.o: + $(F77) -c $(FFLAGS) -o $@ $< + +################################################################################################## +# # some tools +# # Simple run time profiling of your code +# # CXXFLAGS += -g +# # LINKFLAGS += + + +# Profiling options PGI, see: pgprof -h +PROF_FILE = jac.pgprof +# CPU_PROF = -allcache +CPU_PROF = --cpu-profiling on --analysis-metrics +# GPU_PROF = -cuda=gmem,branch,cc13 -cudainit +#GPU_PROF = -cuda=branch:cc20 +# + +cache: prof + +prof: ${PROGRAM} +# ./$^ +# $(CUDA_HOME)/bin/nvvp & +# more /opt/pgi/linux86-64/16.10/bin/pgcollectrc + ${OPTIRUN} ${BINDIR}pgprof ${CPU_PROF} -o $(PROF_FILE) ./$^ + ${OPTIRUN} ${BINDIR}pgprof -i $(PROF_FILE) 2> prof.out + + diff --git a/ex5/ex5_1/Makefile b/ex5/ex5_1/Makefile new file mode 100644 index 0000000..f5bc097 --- /dev/null +++ b/ex5/ex5_1/Makefile @@ -0,0 +1,30 @@ +# +# use GNU-Compiler tools +COMPILER=GCC_ +# alternatively from the shell +# export COMPILER=GCC_ +# or, alternatively from the shell +# make COMPILER=GCC_ + +# use Intel compilers +#COMPILER=ICC_ + +# use PGI compilers +# COMPILER=PGI_ + + +SOURCES = main.cpp mylib.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +PROGRAM = main.${COMPILER} + +# uncomment the next to lines for debugging and detailed performance analysis +CXXFLAGS += -g +LINKFLAGS += -g +# do not use -pg with PGI compilers + +ifndef COMPILER + COMPILER=GCC_ +endif + +include ../${COMPILER}default.mk diff --git a/ex5/ex5_1/check_env.h b/ex5/ex5_1/check_env.h new file mode 100644 index 0000000..41bd99d --- /dev/null +++ b/ex5/ex5_1/check_env.h @@ -0,0 +1,99 @@ +#pragma once + +#include +#ifdef _OPENMP +#include +#endif +#include + +//##################################### +// G.Haase +// 
See https://sourceforge.net/p/predef/wiki/Compilers/ +// http://www.cplusplus.com/doc/tutorial/preprocessor/ +// also: export OMP_DISPLAY_ENV=VERBOSE +//##################################### +/** Checks for compilers, its versions, threads etc. + * + @param[in] argc number of command line arguemnts + @param[in] argv command line arguments as array of C-strings +*/ +template +void check_env(T argc, char const *argv[]) +{ + std::cout << "\n#######################################################################\n"; + std::cout << "Code :"; + for (T k = 0; k < argc; ++k) std::cout << " " << argv[k]; + std::cout << std::endl; + + // compiler: https://sourceforge.net/p/predef/wiki/Compilers/ + std::cout << "Compiler: "; +#if defined __INTEL_COMPILER +#pragma message(" ########## INTEL ###############") + std::cout << "INTEL " << __INTEL_COMPILER; + // Ignore warnings for #pragma acc unrecognice +#pragma warning disable 161 + // Ignore warnings for #pragma omp unrecognice +#pragma warning disable 3180 + +#elif defined __PGI +#pragma message(" ########## PGI ###############") + std::cout << "PGI " << __PGIC__ << "." << __PGIC_MINOR__ << "." << __PGIC_PATCHLEVEL__; +#elif defined __clang__ +#pragma message(" ########## CLANG ###############") + std::cout << "CLANG " << __clang_major__ << "." << __clang_minor__ << "."; // << __clang_patchlevel__; +#elif defined __GNUC__ +#pragma message(" ########## Gnu ###############") + std::cout << "Gnu " << __GNUC__ << "." << __GNUC_MINOR__ << "." << __GNUC_PATCHLEVEL__; +#else +#pragma message(" ########## unknown Compiler ###############") + std::cout << "unknown"; +#endif + std::cout << " C++ standard: " << __cplusplus << std::endl; + + // Parallel environments + std::cout << "Parallel: "; +#if defined MPI_VERSION +#pragma message(" ########## MPI ###############") +#ifdef OPEN_MPI + std::cout << "OpenMPI "; +#else + std::cout << "MPI "; +#endif + std::cout << MPI_VERSION << "." 
<< MPI_SUBVERSION << " "; +#endif + +#ifdef _OPENMP +//https://www.openmp.org/specifications/ +//https://stackoverflow.com/questions/1304363/how-to-check-the-version-of-openmp-on-linux + std::unordered_map const map{ + {200505, "2.5"}, {200805, "3.0"}, {201107, "3.1"}, {201307, "4.0"}, {201511, "4.5"}, {201611, "5.0"}, {201811, "5.0"}}; +#pragma message(" ########## OPENMP ###############") + //std::cout << _OPENMP; + std::cout << "OpenMP "; + try { + std::cout << map.at(_OPENMP); + } + catch (...) { + std::cout << _OPENMP; + } + #pragma omp parallel + { + #pragma omp master + { + const int nn = omp_get_num_threads(); // OpenMP + std::cout << " ---> " << nn << " Threads "; + } + #pragma omp barrier + } + +#endif +#ifdef _OPENACC +#pragma message(" ########## OPENACC ###############") + std::cout << "OpenACC "; +#endif + std::cout << std::endl; + std::cout << "Date : " << __DATE__ << " " << __TIME__; + std::cout << "\n#######################################################################\n"; +} +// HG + diff --git a/ex5/ex5_1/main.cpp b/ex5/ex5_1/main.cpp new file mode 100644 index 0000000..c261c29 --- /dev/null +++ b/ex5/ex5_1/main.cpp @@ -0,0 +1,142 @@ +#include "check_env.h" +#include "mylib.h" +#include // atoi() +#include // strncmp() +#include +#include +#include // OpenMP +#include +#include +using namespace std; + +int main(int argc, char const *argv[]) +{ + omp_set_schedule(omp_sched_static, 2000000); + //omp_set_schedule(omp_sched_dynamic, 1000000); + //omp_set_schedule(omp_sched_guided, 1000000); + //omp_set_schedule(omp_sched_auto, 1); // chunk size does not matter for auto + + + // Speedup for different number of cores (incl. hyperthreading) + omp_set_num_threads(8); + + // Print number of available processors + cout << "Number of available processors: " << omp_get_num_procs() << endl; + + // Currently executing parallel code? -> no + cout << "Currently in parallel? 
" << omp_in_parallel() << endl; + + + int const NLOOPS = 10; // chose a value such that the benchmark runs at least 10 sec. + unsigned int N = 500000001; + + +//########################################################################## +// Read Parameter from command line (C++ style) + cout << "Checking command line parameters for: -n " << endl; + for (int i = 1; i < argc; i++) + { + cout << " arg[" << i << "] = " << argv[i] << endl; + string ss(argv[i]); + if ("-n"==ss && i + 1 < argc) // found "-n" followed by another parameter + { + N = static_cast(atoi(argv[i + 1])); + } + else + { + cout << "Corect call: " << argv[0] << " -n \n"; + } + } + + cout << "\nN = " << N << endl; + + check_env(argc, argv); +//######################################################################## + int nthreads; // OpenMP + #pragma omp parallel default(none) shared(cout,nthreads) + { + stringstream inparallel; + inparallel << "Currently in parallel? " << omp_in_parallel() << endl; + + + int const th_id = omp_get_thread_num(); // OpenMP + int const nthrds = omp_get_num_threads(); // OpenMP + stringstream ss; + ss << "C++: Hello World from thread " << th_id << " / " << nthrds << endl; + #pragma omp critical + { + cout << ss.str(); // output to a shared ressource + cout << inparallel.str() << endl; + } + #pragma omp master + nthreads = nthrds; // transfer nn to to master thread + } + cout << " " << nthreads << " threads have been started." 
<< endl; + +//########################################################################## +// Memory allocation + cout << "Memory allocation\n"; + + vector x(N), y(N); + + cout.precision(2); + cout << 2.0 * N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n"; + cout.precision(6); + +//########################################################################## +// Data initialization +// Special: x_i = i+1; y_i = 1/x_i ==> == N + for (unsigned int i = 0; i < N; ++i) + { + x[i] = i + 1; + y[i] = 1.0 / x[i]; + } + +//########################################################################## + cout << "\nStart Benchmarking\n"; + +// Do calculation + double tstart = omp_get_wtime(); // OpenMP + + double sk(0.0); + for (int i = 0; i < NLOOPS; ++i) + { + //sk = scalar(x, y); + sk = scalar_parallel(x, y); + //sk = scalar_trans(x, y); + //sk = norm(x); + } + + double t1 = omp_get_wtime() - tstart; // OpenMP + + t1 /= NLOOPS; // divide by number of function calls + +//########################################################################## +// Check the correct result + cout << "\n = " << sk << endl; + if (static_cast(sk) != N) + { + cout << " !! W R O N G result !!\n"; + } + cout << endl; + +//########################################################################## +// Timings and Performance + cout << endl; + cout.precision(2); + cout << "Total benchmarking time: " << t1*NLOOPS << endl; + cout << "Timing in sec. 
: " << t1 << endl; + cout << "GFLOPS : " << 2.0 * N / t1 / 1024 / 1024 / 1024 << endl; + cout << "GiByte/s : " << 2.0 * N / t1 / 1024 / 1024 / 1024 * sizeof(x[0]) << endl; + +//######################################################################### + + cout << "\n Try the reduction with an STL-vektor \n"; + + auto vr = reduction_vec_append(5); + cout << "done\n"; + cout << vr << endl; + + + return 0; +} // memory for x and y will be deallocated their destructors diff --git a/ex5/ex5_1/mylib.cpp b/ex5/ex5_1/mylib.cpp new file mode 100644 index 0000000..b5f2697 --- /dev/null +++ b/ex5/ex5_1/mylib.cpp @@ -0,0 +1,137 @@ +#include "mylib.h" +#include // assert() +#include +#include +#include // multiplies<>{} +#include +#include // iota() +#ifdef _OPENMP +#include +#endif +#include +using namespace std; + + +double scalar_parallel(vector const &x, vector const &y) +{ + assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG + size_t const N = x.size(); + double sum = 0.0; +#pragma omp parallel default(none) shared(x,y,N, cout) reduction(+:sum) + { + const size_t nthreads = omp_get_num_threads(); + const size_t threadnum = omp_get_thread_num(); + const size_t chunksize = N/nthreads; + + + size_t start = threadnum*chunksize; + size_t end = start + chunksize; + if (threadnum == nthreads - 1) + end = N; + + + for (size_t i = start; i < end; ++i) + { + sum += x[i] * y[i]; + } + + } + return sum; +} + + +vector reduction_vec_append(int n) +{ + vector vec(n); +#pragma omp parallel default(none) shared(cout) reduction(VecAppend:vec) + { + #pragma omp barrier + #pragma omp critical + cout << omp_get_thread_num() << " : " << vec.size() << endl; + #pragma omp barrier + iota( vec.begin(),vec.end(), omp_get_thread_num() ); + #pragma omp barrier + + } + return vec; +} + + + + + + + + + + + +double scalar(vector const &x, vector const &y) +{ + assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG + size_t const N = x.size(); + double sum = 0.0; 
+#pragma omp parallel for default(none) shared(x,y,N) reduction(+:sum) schedule(runtime) // added schedule(runtime) + for (size_t i = 0; i < N; ++i) + { + sum += x[i] * y[i]; + //sum += exp(x[i])*log(y[i]); + } + return sum; +} + + +double norm(vector const &x) +{ + size_t const N = x.size(); + double sum = 0.0; +#pragma omp parallel for default(none) shared(x,N) reduction(+:sum) schedule(runtime) // added schedule(runtime) + for (size_t i = 0; i < N; ++i) + { + sum += x[i]*x[i]; + } + return sum; +} + + +vector reduction_vec(int n) +{ + vector vec(n); +#pragma omp parallel default(none) shared(cout) reduction(VecAdd:vec) + { + #pragma omp barrier + #pragma omp critical + cout << omp_get_thread_num() << " : " << vec.size() << endl; + #pragma omp barrier + iota( vec.begin(),vec.end(), omp_get_thread_num() ); + #pragma omp barrier + + } + return vec; +} + + +double scalar_trans(vector const &x, vector const &y) +{ + assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG + vector z(x.size()); + //list z(x.size()); // parallel for-loop on iterators not possible (missing 'operator-') + // c++-20 CLANG_, ONEAPI_:condition of OpenMP for loop must be a relational comparison + + transform(cbegin(x),cend(x),cbegin(y),begin(z),std::multiplies<>{}); + + double sum = 0.0; +#pragma omp parallel for default(none) shared(z) reduction(+:sum) schedule(runtime) // added schedule(runtime) + for (auto pi = cbegin(z); pi!=cend(z); ++pi) + { + sum += *pi; + } + //for (auto val: z) + //{ + //sum += val; + //} + return sum; +} + + + diff --git a/ex5/ex5_1/mylib.h b/ex5/ex5_1/mylib.h new file mode 100644 index 0000000..fe1948a --- /dev/null +++ b/ex5/ex5_1/mylib.h @@ -0,0 +1,88 @@ +#pragma once +#include +#include // setw() +#include +#include +#include + +/** Inner product + @param[in] x vector + @param[in] y vector + @return resulting Euclidian inner product +*/ +double scalar_parallel(std::vector const &x, std::vector const &y); +double scalar(std::vector const &x, 
std::vector const &y); +double scalar_trans(std::vector const &x, std::vector const &y); + + + +// Declare additional reduction operation in OpenMP for STL-vector +#pragma omp declare reduction(VecAppend : std::vector : omp_out.insert(omp_out.end(), omp_in.begin(), omp_in.end())) \ + initializer (omp_priv=omp_orig) + +std::vector reduction_vec_append(int n); + + +/** l2-norm + @param[in] x vector + @return resulting Euclidian norm +*/ +double norm(std::vector const &x); + +/** Vector @p b adds its elements to vector @p a . + @param[in] a vector + @param[in] b vector + @return a+=b componentwise +*/ +template +std::vector &operator+=(std::vector &a, std::vector const &b) +{ + assert(a.size()==b.size()); + for (size_t k = 0; k < a.size(); ++k) { + a[k] += b[k]; + } + return a; +} + +// Declare the reduction operation in OpenMP for an STL-vector +// omp_out += omp_in requires operator+=(vector &, vector const &) from above +// ------------------------------------------------------------ +// https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm +// https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c +// https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size +// see also p.74ff in https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf +#pragma omp declare reduction(VecAdd : std::vector : omp_out += omp_in) \ + initializer (omp_priv=omp_orig) + +// Templates are n o t possible, i.e. the reduction has to be declared fore a specified type. +//template +//#pragma omp declare reduction(VecAdd : std::vector : omp_out += omp_in) initializer (omp_priv(omp_orig)) +// MS: template nach #pragma !? + +// ------------------------------------------------------------ + +/** Test for vector reduction. + * + * The thread-private vectors of size @p n are initialized via @f$v_k^{tID}=tID+k@f$. 
+ * Afterwards these vectors are accumulated, i.e., + * @f$v_k= \sum_{tID=0}^{numThreads} v_k^{tID}@f$. + * + * @param[in] n size of global/private vector + * @return resulting global vector. +*/ +std::vector reduction_vec(int n); + + + +/** Output of a vector. + @param[in,out] s output stream + @param[in] x vector + @return modified output stream +*/ +template +std::ostream &operator<<(std::ostream &s, std::vector const &x) +{ + for (auto const &v : x) s << std::setw(4) << v << " "; + return s; +} + diff --git a/ex5/ex5_1/timing.h b/ex5/ex5_1/timing.h new file mode 100644 index 0000000..11d8da2 --- /dev/null +++ b/ex5/ex5_1/timing.h @@ -0,0 +1,70 @@ +#pragma once +#include // timing +#include + +using Clock = std::chrono::system_clock; //!< The wall clock timer chosen +//using Clock = std::chrono::high_resolution_clock; +using TPoint= std::chrono::time_point; + +// [Galowicz, C++17 STL Cookbook, p. 29] +inline +std::stack MyStopWatch; //!< starting time of stopwatch + +/** Starts stopwatch timer. + * Use as @code tic(); myfunction(...) ; double tsec = toc(); @endcode + * + * The timining is allowed to be nested and the recent time is stored on top of the stack. + * + * @return recent time + * @see toc + */ +inline auto tic() +{ + MyStopWatch.push(Clock::now()); + return MyStopWatch.top(); +} + +/** Returns the elapsed time from stopwatch. + * + * The time from top of the stack is used + * if time point @p t_b is not passed as input parameter. + * Use as @code tic(); myfunction(...) ; double tsec = toc(); @endcode + * or as @code auto t_b = tic(); myfunction(...) ; double tsec = toc(t_b); @endcode + * The last option is to be used in the case of + * non-nested but overlapping time measurements. + * + * @param[in] t_b start time of some stop watch + * @return elapsed time in seconds. 
+ * +*/ +inline double toc(TPoint const &t_b = MyStopWatch.top()) +{ + // https://en.cppreference.com/w/cpp/chrono/treat_as_floating_point + using Unit = std::chrono::seconds; + using FpSeconds = std::chrono::duration; + auto t_e = Clock::now(); + MyStopWatch.pop(); + return FpSeconds(t_e-t_b).count(); +} + +#include +#include +/** Executes function @p f and measures/prints elapsed wall clock time in seconds + * + * Call as + * @code measure("Time for (b = b + 1)", [&]() { + thrust::transform(b.begin(), b.end(), b.begin(), increment()); + }); @endcode + * + * @param[in] label additional string to be printed with the measurement. + * @param[in] f function to execute. + * @author Therese Bösmüller, 2025 + * +*/ +auto measure = [](const std::string& label, auto&& f) { + auto start = std::chrono::high_resolution_clock::now(); + f(); + auto stop = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(stop - start).count(); + std::cout << label << ": " << duration << " microseconds" << std::endl; +}; // ';' is needed for a visible documentation of this lambda-function diff --git a/ex5/ex5_2/Makefile b/ex5/ex5_2/Makefile new file mode 100644 index 0000000..7363b62 --- /dev/null +++ b/ex5/ex5_2/Makefile @@ -0,0 +1,31 @@ +# +# use GNU-Compiler tools +COMPILER=GCC_ +# alternatively from the shell +# export COMPILER=GCC_ +# or, alternatively from the shell +# make COMPILER=GCC_ + +# use Intel compilers +#COMPILER=ICC_ + +# use PGI compilers +# COMPILER=PGI_ + + +SOURCES = main.cpp mylib.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +PROGRAM = main.${COMPILER} + +# uncomment the next to lines for debugging and detailed performance analysis +CXXFLAGS += -g +LINKFLAGS += -g +# do not use -pg with PGI compilers + +ifndef COMPILER + COMPILER=GCC_ +endif + + +include ../${COMPILER}default.mk diff --git a/ex5/ex5_2/data_1.txt b/ex5/ex5_2/data_1.txt new file mode 100644 index 0000000..0cc4bb8 --- /dev/null +++ b/ex5/ex5_2/data_1.txt @@ -0,0 +1,501 @@ +141 +261 
+87 +430 +258 +298 +425 +120 +496 +707 +244 +786 +75 +394 +4 +221 +2 +190 +143 +269 +175 +139 +599 +902 +940 +222 +483 +377 +524 +265 +69 +437 +174 +27 +955 +431 +962 +763 +8 +681 +706 +646 +553 +219 +773 +229 +371 +891 +857 +403 +319 +609 +911 +910 +592 +333 +854 +443 +905 +34 +533 +717 +180 +337 +188 +322 +404 +549 +49 +553 +275 +242 +244 +155 +957 +936 +819 +729 +176 +361 +189 +2 +317 +700 +626 +544 +440 +288 +502 +762 +763 +577 +748 +646 +124 +505 +348 +93 +148 +199 +673 +432 +695 +257 +10 +533 +280 +947 +907 +393 +25 +672 +838 +972 +57 +451 +583 +687 +720 +651 +727 +374 +582 +117 +58 +980 +285 +595 +963 +186 +194 +342 +933 +391 +274 +152 +398 +375 +132 +436 +92 +615 +11 +574 +790 +236 +449 +570 +62 +497 +643 +222 +838 +972 +847 +506 +279 +747 +237 +958 +621 +601 +173 +91 +256 +859 +912 +700 +726 +230 +577 +811 +404 +989 +90 +321 +512 +61 +726 +557 +530 +830 +859 +790 +318 +453 +753 +110 +110 +270 +525 +973 +711 +312 +292 +851 +912 +640 +256 +89 +839 +585 +949 +62 +585 +286 +828 +191 +443 +394 +827 +677 +208 +319 +134 +672 +571 +170 +148 +477 +909 +553 +33 +54 +806 +452 +383 +790 +365 +533 +712 +872 +329 +651 +975 +76 +588 +414 +310 +264 +759 +996 +187 +782 +196 +993 +803 +425 +729 +499 +809 +357 +74 +591 +911 +194 +433 +750 +40 +947 +764 +559 +184 +498 +518 +995 +855 +963 +679 +404 +935 +480 +232 +397 +706 +559 +757 +996 +963 +536 +964 +116 +52 +305 +581 +531 +902 +541 +432 +543 +713 +17 +801 +143 +479 +257 +370 +662 +170 +279 +199 +196 +327 +881 +472 +404 +180 +969 +408 +845 +616 +377 +878 +785 +465 +814 +899 +430 +335 +597 +902 +703 +378 +735 +955 +543 +541 +312 +72 +182 +93 +464 +10 +916 +643 +2 +31 +209 +455 +128 +9 +728 +355 +781 +437 +437 +50 +50 +92 +595 +242 +842 +858 +964 +489 +221 +227 +537 +763 +348 +462 +640 +918 +162 +716 +578 +434 +885 +394 +179 +634 +625 +328 +803 +1000 +981 +128 +233 +24 +608 +111 +408 +885 +549 +370 +209 +441 +957 +125 +471 +857 +44 +692 +979 +284 +134 +686 +910 +611 +900 +194 +755 +347 +419 +156 +820 +625 +739 +806 +68 +951 
+498 +756 +743 +832 +157 +458 +619 +933 +836 +896 +583 +583 +855 +35 +886 +408 +37 +747 +155 +144 +606 +255 +325 +402 +407 +387 +610 +167 +189 +95 +324 +770 +235 +741 +693 +825 +828 +294 +310 +524 +326 +832 +811 +557 +263 +681 +234 +457 +385 +539 +992 +756 +981 +235 +529 +52 +757 +602 +858 +989 +930 +410 +1 +541 +208 +220 +326 +96 +748 +749 +544 +339 +833 +553 +958 +893 +357 +547 +347 +623 +797 +746 +126 +823 +26 +415 +732 +782 +368 + diff --git a/ex5/ex5_2/main.cpp b/ex5/ex5_2/main.cpp new file mode 100644 index 0000000..1bea099 --- /dev/null +++ b/ex5/ex5_2/main.cpp @@ -0,0 +1,130 @@ +#include "mylib.h" +#include +#include +#include +#include +using namespace std; + + +int main() +{ + // read vector from file + vector data_vector = {}; + + ifstream input_stream("data_1.txt"); + + size_t line; + while(input_stream >> line) + { + data_vector.push_back(line); + } + data_vector.shrink_to_fit(); + + + + + // specify loops + size_t NLOOPS = 10000; + + + + // ############# Parallelization with openMP ############# + // calculate arithmetic mean, geometric mean and harmonic mean + double am_omp, gm_omp, hm_omp; + + double tstart = omp_get_wtime(); + + for (size_t i = 0; i < NLOOPS; ++i) + means_omp(data_vector, am_omp, gm_omp, hm_omp); + + double t_means_omp = (omp_get_wtime() - tstart)/NLOOPS; + + // calculate minimum and maximum + size_t min, max; + + tstart = omp_get_wtime(); + + for (size_t i = 0; i < NLOOPS; ++i) + minmax_omp(data_vector, min, max); + + double t_minmax_omp = (omp_get_wtime() - tstart)/NLOOPS; + + + + + + + // ############# Parallelization with C++ algorithms ############# + // calculate arithmetic mean, geometric mean and harmonic mean + double am_cpp, gm_cpp, hm_cpp; + + tstart = omp_get_wtime(); + + for (size_t i = 0; i < NLOOPS; ++i) + means_cpp(data_vector, am_cpp, gm_cpp, hm_cpp); + + double t_means_cpp = (omp_get_wtime() - tstart)/NLOOPS; + + // calculate minimum and maximum + size_t min_cpp, max_cpp; + + tstart = omp_get_wtime(); + + for 
(size_t i = 0; i < NLOOPS; ++i) + minmax_cpp(data_vector, min_cpp, max_cpp); + + double t_minmax_cpp = (omp_get_wtime() - tstart)/NLOOPS; + + + + + + // print results + cout << "####### OpenMP #######" << endl; + cout << "minimum: " << min << endl; + cout << "maximum: " << max << endl; + cout << "duration: " << t_minmax_omp << endl << endl; + + cout << "arithmetic mean: " << am_omp << endl; + cout << "geometric mean: " << gm_omp << endl; + cout << "harmonic mean: " << hm_omp << endl; + cout << "duration: " << t_means_omp << endl << endl; + + + cout << "####### C++ #######" << endl; + cout << "minimum: " << min_cpp << endl; + cout << "maximum: " << max_cpp << endl; + cout << "duration: " << t_minmax_cpp << endl << endl; + + cout << "arithmetic mean: " << am_cpp << endl; + cout << "geometric mean: " << gm_cpp << endl; + cout << "harmonic mean: " << hm_cpp << endl; + cout << "duration: " << t_means_cpp << endl << endl; + + + + // ####### OpenMP ####### + // minimum: 1 + // maximum: 1000 + // duration: 3.52086e-06 + + // arithmetic mean: 498.184 + // geometric mean: 364.412 + // harmonic mean: 95.6857 + // duration: 5.90171e-06 + + // ####### C++ ####### + // minimum: 1 + // maximum: 1000 + // duration: 1.76816e-05 + + // arithmetic mean: 498.184 + // geometric mean: 364.412 + // harmonic mean: 95.6857 + // duration: 2.35728e-05 + + // --> the openMP variant is faster in both cases + + + return 0; +} diff --git a/ex5/ex5_2/mylib.cpp b/ex5/ex5_2/mylib.cpp new file mode 100644 index 0000000..34e5992 --- /dev/null +++ b/ex5/ex5_2/mylib.cpp @@ -0,0 +1,103 @@ +#include "mylib.h" +#include +#include +#include +#include +#include +#include + +using namespace std; + + +void means_omp(const std::vector numbers, double &am, double &gm, double &hm) +{ + size_t const n = numbers.size(); + + am = 0.; + gm = 0.; + hm = 0.; + +#pragma omp parallel for shared(numbers, n, cout) reduction(+:am, gm, hm) + for (size_t i = 0; i < n; ++i) + { + am += numbers[i]; + gm += log(numbers[i]); + 
hm += 1.0/numbers[i]; + + // #pragma omp critical + // { + // cout << "Thread number " << omp_get_thread_num() << " processes value " << numbers[i] << endl; + // } + } + + am /= n; + gm = exp(gm/n); + hm = n/hm; +} + + +void minmax_omp(const std::vector numbers, size_t &global_min, size_t &global_max) +{ + size_t const n = numbers.size(); + + global_min = -1; // gives the maximum size_t value + global_max = 0; + +#pragma omp parallel shared(numbers, n, global_min, global_max) + { + const size_t nthreads = omp_get_num_threads(); + const size_t threadnum = omp_get_thread_num(); + const size_t chunksize = n/nthreads; + + + size_t start = threadnum*chunksize; + size_t end = start + chunksize; + if (threadnum == nthreads - 1) + end = n; + + + size_t local_min = -1; + size_t local_max = 0; + for (size_t i = start; i < end ; ++i) + { + if (numbers[i] < local_min) + local_min = numbers[i]; + + if (numbers[i] > local_max) + local_max = numbers[i]; + } + + #pragma omp critical + { + if (local_min < global_min) + global_min = local_min; + + if (local_max > global_max) + global_max = local_max; + } + } +} + + +void means_cpp(const std::vector numbers, double &am, double &gm, double &hm) +{ + size_t const n = numbers.size(); + + am = reduce(std::execution::par, numbers.begin(), numbers.end()); + gm = transform_reduce(std::execution::par, numbers.begin(), numbers.end(), 0.0, plus{}, [] (size_t x) -> double { return log(x); } ); + hm = transform_reduce(std::execution::par, numbers.begin(), numbers.end(), 0.0, plus{}, [] (size_t x) -> double { return 1.0/x; }); + + am /= n; + gm = exp(gm/n); + hm = n/hm; +} + + +void minmax_cpp(const std::vector numbers, size_t &global_min, size_t &global_max) +{ + auto min_it = min_element(std::execution::par, numbers.begin(), numbers.end()); + auto max_it = max_element(std::execution::par, numbers.begin(), numbers.end()); + + global_min = *min_it; + global_max = *max_it; +} \ No newline at end of file diff --git a/ex5/ex5_2/mylib.h 
b/ex5/ex5_2/mylib.h new file mode 100644 index 0000000..01d2fa9 --- /dev/null +++ b/ex5/ex5_2/mylib.h @@ -0,0 +1,42 @@ +#include + +/** + This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector. + Uses openMP parallelization. + @param[in] numbers vector containing integers + @param[out] am arithmetic mean + @param[out] gm geometric mean + @param[out] hm harmonic mean +*/ +void means_omp(const std::vector numbers, double &am, double &gm, double &hm); + + +/** + This function calculates the minimum and maximum of a vector. + Uses openMP parallelization. + @param[in] numbers vector containing integers + @param[out] global_min minimum + @param[out] global_max maximum +*/ +void minmax_omp(const std::vector numbers, size_t &global_min, size_t &global_max); + + +/** + This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector. + Uses C++ parallelization. + @param[in] numbers vector containing integers + @param[out] am arithmetic mean + @param[out] gm geometric mean + @param[out] hm harmonic mean +*/ +void means_cpp(const std::vector numbers, double &am, double &gm, double &hm); + + +/** + This function calculates the minimum and maximum of a vector. + Uses C++ parallelization. 
+ @param[in] numbers vector containing integers + @param[out] global_min minimum + @param[out] global_max maximum +*/ +void minmax_cpp(const std::vector numbers, size_t &global_min, size_t &global_max); \ No newline at end of file diff --git a/ex5/ex5_3/Makefile b/ex5/ex5_3/Makefile new file mode 100644 index 0000000..4a714a3 --- /dev/null +++ b/ex5/ex5_3/Makefile @@ -0,0 +1,30 @@ +# +# use GNU-Compiler tools +COMPILER=GCC_ +# alternatively from the shell +# export COMPILER=GCC_ +# or, alternatively from the shell +# make COMPILER=GCC_ + +# use Intel compilers +#COMPILER=ICC_ + +# use PGI compilers +# COMPILER=PGI_ + + +SOURCES = main.cpp goldbach.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +PROGRAM = main.${COMPILER} + +# uncomment the next to lines for debugging and detailed performance analysis +CXXFLAGS += -g +LINKFLAGS += -g +# do not use -pg with PGI compilers + +ifndef COMPILER + COMPILER=GCC_ +endif + +include ../${COMPILER}default.mk diff --git a/ex5/ex5_3/goldbach.cpp b/ex5/ex5_3/goldbach.cpp new file mode 100644 index 0000000..55709c3 --- /dev/null +++ b/ex5/ex5_3/goldbach.cpp @@ -0,0 +1,46 @@ +#include "goldbach.h" +#include +#include +#include + +size_t single_goldbach(size_t k) +{ + const std::vector relevant_primes = get_primes(k); + size_t m = relevant_primes.size(); + + size_t counter = 0; + +#pragma omp parallel for shared(relevant_primes, m, k) reduction(+:counter) + for(size_t i = 0; i < m; ++i) + { + for(size_t j = i; j < m; ++j) + { + if(relevant_primes[i] + relevant_primes[j] == k) + ++counter; + } + } + + return counter; +} + + +std::vector count_goldbach(size_t n) +{ + const std::vector relevant_primes = get_primes(n); + size_t m = relevant_primes.size(); + + std::vector counter_vector(n + 1, 0); + +#pragma omp parallel for shared(relevant_primes, m, n) reduction(VecAdd:counter_vector) + for(size_t i = 0; i < m; ++i) + { + for(size_t j = i; j < m; ++j) + { + size_t sum = relevant_primes[i] + relevant_primes[j]; + if(sum <= n) + 
++counter_vector[relevant_primes[i] + relevant_primes[j]]; + } + } + + return counter_vector; +} diff --git a/ex5/ex5_3/goldbach.h b/ex5/ex5_3/goldbach.h new file mode 100644 index 0000000..15df6d4 --- /dev/null +++ b/ex5/ex5_3/goldbach.h @@ -0,0 +1,45 @@ +#pragma once +#include "mayer_primes.h" +#include +#include + + +/** + This function returns the number of possible decompositions of an integer into a sum of two prime numbers. + @param[in] k first integer + @param[out] count number of decompositions +*/ +size_t single_goldbach(size_t k); + + +/** + This function returns the number of possible decompositions into a sum of two prime numbers of all even integers in the interval [4,n]. + @param[in] n upper integer bound + @param[out] count_vector vector containing the number of decompositions for a natural number the corresponding index +*/ +std::vector count_goldbach(size_t n); + + +/** Vector @p b adds its elements to vector @p a . + @param[in] a vector + @param[in] b vector + @return a+=b componentwise +*/ +template +std::vector &operator+=(std::vector &a, std::vector const &b) +{ + assert(a.size()==b.size()); + for (size_t k = 0; k < a.size(); ++k) { + a[k] += b[k]; + } + return a; +} + +// Declare the reduction operation in OpenMP for an STL-vector +// omp_out += omp_in requires operator+=(vector &, vector const &) from above +// ------------------------------------------------------------ +// https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm +// https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c +// https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size +// see also p.74ff in https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf +#pragma omp declare reduction(VecAdd : std::vector : omp_out += omp_in) initializer (omp_priv=omp_orig) \ No newline at end of file diff --git 
a/ex5/ex5_3/main.cpp b/ex5/ex5_3/main.cpp new file mode 100644 index 0000000..1be01c2 --- /dev/null +++ b/ex5/ex5_3/main.cpp @@ -0,0 +1,45 @@ +#include "goldbach.h" +#include +#include +#include +using namespace std; + + +int main() +{ + cout << "Check: 694 has "<< single_goldbach(694) << " decompositions." << endl << "----------------------------------------" << endl; + + for(size_t n : {10000, 100000, 400000, 1000000, 2000000}) + { + double t_start = omp_get_wtime(); + + auto goldbach_vector = count_goldbach(n); + + auto max_it = max_element(goldbach_vector.begin(), goldbach_vector.end()); + size_t max_number = distance(goldbach_vector.begin(), max_it); + + double t_end = omp_get_wtime() - t_start; + + cout << "The number " << max_number << " has " << *max_it << " decompositions. Duration: " << t_end << endl; + } + + /* + ###### WITHOUT PARALLELIZATION ###### + The number 9240 has 329 decompositions. Duration: 0.00307696 + The number 99330 has 2168 decompositions. Duration: 0.189839 + The number 390390 has 7094 decompositions. Duration: 1.3042 + The number 990990 has 15594 decompositions. Duration: 5.45034 + The number 1981980 has 27988 decompositions. Duration: 47.1807 + + + ###### WITH PARALLELIZATION ###### + The number 9240 has 329 decompositions. Duration: 0.000734854 + The number 99330 has 2168 decompositions. Duration: 0.0251322 + The number 390390 has 7094 decompositions. Duration: 0.487375 + The number 990990 has 15594 decompositions. Duration: 6.16972 + The number 1981980 has 27988 decompositions. Duration: 31.5699 + */ + + + return 0; +} \ No newline at end of file diff --git a/ex5/ex5_3/mayer_primes.h b/ex5/ex5_3/mayer_primes.h new file mode 100644 index 0000000..6c6dc23 --- /dev/null +++ b/ex5/ex5_3/mayer_primes.h @@ -0,0 +1,73 @@ +#pragma once + +#include //memset +#include +//using namespace std; + +/** \brief Determines all prime numbers in interval [2, @p max]. + * + * The sieve of Eratosthenes is used. 
+ * + * The implementation originates from Florian Mayer. + * + * \param[in] max end of interval for the prime number search. + * \return vector of prime numbers @f$2,3,5, ..., p<=max @f$. + * + * \copyright + * Copyright (c) 2008 Florian Mayer (adapted by Gundolf Haase 2018) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ +template +std::vector get_primes(T max) +{ + std::vector primes; + char *sieve; + sieve = new char[max / 8 + 1]; + // Fill sieve with 1 + memset(sieve, 0xFF, (max / 8 + 1) * sizeof(char)); + for (T x = 2; x <= max; x++) + { + if (sieve[x / 8] & (0x01 << (x % 8))) { + primes.push_back(x); + // Is prime. Mark multiplicates. 
//============================================================================
// Reconstruction note: this chunk of the patch was line-collapsed and every
// character sequence between angle brackets was stripped (template arguments
// and the targets of #include <...> directives).  They are restored below.
//
// Two fragments carried in this chunk are NOT part of these sources:
//  * the tail of a get_primes() template from ex1 (its head lies before this
//    chunk) -- preserved under "#if 0" so nothing is lost;
//  * ex5/ex5_4/Makefile -- preserved verbatim in the block comment below.
//============================================================================

#if 0  // orphaned tail of ex1's get_primes<T>() -- definition starts before this chunk
    for (T j = 2 * x; j <= max; j += x)
    {
      sieve[j / 8] &= ~(0x01 << (j % 8));
    }
    }
  }
  delete[] sieve;
  return primes;
}
// (a commented-out demo main() "by Florian Mayer" followed here in the patch)
#endif

/* ===== ex5/ex5_4/Makefile (build glue carried in this chunk of the patch) ==
COMPILER=GCC_
SOURCES = main.cpp benchmarks.cpp benchmark_tests.cpp
OBJECTS = $(SOURCES:.cpp=.o)
PROGRAM = main.${COMPILER}
CXXFLAGS += -g
LINKFLAGS += -g
ifndef COMPILER
  COMPILER=GCC_
endif
include ../${COMPILER}default.mk
============================================================================ */

#include <cassert>   // assert()
#include <chrono>    // system_clock, duration_cast
#include <cmath>     // pow(), abs()
#include <cstddef>   // size_t
#include <iostream>
#include <vector>

using namespace std;
using namespace std::chrono;

//===========================================================================
// benchmarks.h / benchmark_tests.h -- declarations
//===========================================================================

/** (A) Inner product of two vectors (from skalar_stl).
 *  @param[in] x vector
 *  @param[in] y vector (same length as @p x)
 *  @return Euclidean inner product <x,y>
 */
double scalar_parallel(vector<double> const &x, vector<double> const &y);

/** (A) Sum of all entries of a vector.
 *  @param[in] x vector
 *  @return sum of entries
 */
double sum(vector<double> const &x);

/** (B) Dense matrix-vector product (row-major 1D storage).
 *  @param[in] A MxN dense matrix (1D access); M is derived as A.size()/x.size()
 *  @param[in] x vector of length N
 *  @return resulting vector b = A*x of length M
 */
vector<double> MatVec_parallel(vector<double> const &A, vector<double> const &x);

/** (C) Dense matrix-matrix product (row-major 1D storage).
 *  @param[in] A MxL dense matrix (1D access)
 *  @param[in] B LxN dense matrix (1D access)
 *  @param[in] shared_dim shared dimension L
 *  @return resulting MxN matrix C = A*B
 */
vector<double> MatMat_parallel(vector<double> const &A, vector<double> const &B,
                               size_t const &shared_dim);

/** (D) Evaluation of a polynomial at many points using Horner's scheme.
 *  @param[in] a coefficient vector, a[k] multiplies x^k
 *  @param[in] x vector of evaluation points
 *  @return vector y with y[i] = poly(x[i])
 */
vector<double> poly_parallel(vector<double> const &a, vector<double> const &x);

// benchmark drivers (print timings, return {t_per_loop, GFLOPS, GiByte/s})
vector<double> test_A(const size_t &NLOOPS, const size_t &N);
vector<double> test_A_sum(const size_t &NLOOPS, const size_t &N);
vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M);
vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N);
vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p);

//===========================================================================
// benchmarks.cpp -- OpenMP kernels
//===========================================================================

// (A) Inner product of two vectors (from skalar_stl)
double scalar_parallel(vector<double> const &x, vector<double> const &y)
{
    assert(x.size() == y.size());
    size_t const N = x.size();
    double dot = 0.0;
#pragma omp parallel for shared(x, y, N) reduction(+:dot)
    for (size_t i = 0; i < N; ++i)
    {
        dot += x[i] * y[i];
    }
    return dot;
}

// (A) Vector entry sum
double sum(vector<double> const &x)
{
    size_t const N = x.size();  // hoisted: OpenMP wants an invariant loop bound
    double total = 0.0;         // renamed: the original local shadowed the function name
#pragma omp parallel for shared(x, N) reduction(+:total)
    for (size_t i = 0; i < N; ++i)
    {
        total += x[i];
    }
    return total;
}

// (B) Matrix-vector product (from intro_vector_densematrix)
vector<double> MatVec_parallel(vector<double> const &A, vector<double> const &x)
{
    size_t const nelem = A.size();
    size_t const N = x.size();
    assert(nelem % N == 0);          // make sure multiplication is possible
    size_t const M = nelem / N;

    vector<double> b(M);

#pragma omp parallel for shared(A, x, N, M, b)
    for (size_t i = 0; i < M; ++i)
    {
        double tmp = 0.0;            // private accumulator avoids writing b[i] in the inner loop
        for (size_t j = 0; j < N; ++j)
        {
            tmp += A[N * i + j] * x[j];
        }
        b[i] = tmp;
    }

    return b;
}

// (C) Matrix-matrix product, i-k-j loop order (unit stride on B and C rows)
vector<double> MatMat_parallel(vector<double> const &A, vector<double> const &B,
                               size_t const &L)
{
    size_t const nelem_A = A.size();
    size_t const nelem_B = B.size();
    assert(nelem_A % L == 0 && nelem_B % L == 0);

    size_t const M = nelem_A / L;
    size_t const N = nelem_B / L;

    vector<double> C(M * N);         // value-initialized to 0.0

#pragma omp parallel for shared(A, B, M, N, L, C)
    for (size_t i = 0; i < M; ++i)   // rows are independent -> no write conflicts on C
    {
        for (size_t k = 0; k < L; ++k)
        {
            for (size_t j = 0; j < N; ++j)
            {
                C[N * i + j] += A[L * i + k] * B[N * k + j];
            }
        }
    }

    return C;
}

// (D) Evaluation of a polynomial function via Horner's scheme
vector<double> poly_parallel(vector<double> const &a, vector<double> const &x)
{
    size_t const N = x.size();
    size_t const p = a.size() - 1;   // polynomial degree
    vector<double> y(N, 0.0);

#pragma omp parallel for shared(a, x, N, p, y)
    for (size_t i = 0; i < N; ++i)
    {
        double const xi = x[i];
        // Horner: y = ( ... (a[p]*x + a[p-1])*x + ... )*x + a[0]
        // BUG FIX: the original used  y_temp += xi*y_temp + a[p-k],
        // i.e. y <- y*(1+x) + a, which is not Horner's recurrence.
        double acc = 0.0;
        for (size_t k = 0; k < p + 1; ++k)
        {
            acc = acc * xi + a[p - k];
        }
        y[i] = acc;
    }

    return y;
}

//===========================================================================
// benchmark_tests.cpp -- timing drivers
//===========================================================================

vector<double> test_A(const size_t &NLOOPS, const size_t &N)
{
    cout << "#################### (A) ####################" << endl;
    cout << "\nLOOPS = " << NLOOPS << endl;
    cout << "\nN = " << N << endl;

// Memory allocation
    cout << "Memory allocation\n";

    vector<double> x(N), y(N);

    cout.precision(2);
    cout << 2.0 * N * sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
    cout.precision(6);

// Data initialization
// Special: x_i = i+1; y_i = 1/x_i  ==>  <x,y> == N
    for (size_t i = 0; i < N; ++i)
    {
        x[i] = i % 219 + 1;
        y[i] = 1.0 / x[i];
    }

    cout << "\nStart Benchmarking scalar\n";

    auto t1 = system_clock::now();                       // start timer
// Do calculation
    double check(0.0), ss(0.0);
    for (size_t i = 0; i < NLOOPS; ++i)
    {
        check = scalar_parallel(x, y);
        ss += check;  // prevents the optimizer from removing unused calculation results.
    }

    auto t2 = system_clock::now();                       // stop timer
    auto duration = duration_cast<microseconds>(t2 - t1);
    double t_diff = static_cast<double>(duration.count()) / 1e6;  // seconds overall
    t_diff = t_diff / NLOOPS;                            // seconds per loop

// Check the correct result
    cout << "\n <x,y> = " << check << endl;
    if (static_cast<size_t>(check) != N)
        cout << "  !!   W R O N G  result  !!\n";
    cout << endl;

// Timings and Performance (1 mul + 1 add per element)
    cout << endl;
    cout.precision(2);
    double Gflops = 2.0 * N / t_diff / 1024 / 1024 / 1024;
    double MemBandwidth = 2.0 * N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);

    cout << "Total duration  : " << t_diff * NLOOPS << endl;
    cout << "Timing in sec.  : " << t_diff << endl;
    cout << "GFLOPS          : " << Gflops << endl;
    cout << "GiByte/s        : " << MemBandwidth << endl;

    return vector<double>{t_diff, Gflops, MemBandwidth};
}

vector<double> test_A_sum(const size_t &NLOOPS, const size_t &N)
{
    cout << "#################### (A) sum ####################" << endl;
    cout << "\nLOOPS = " << NLOOPS << endl;
    cout << "\nN = " << N << endl;

// Memory allocation
    cout << "Memory allocation\n";

    vector<double> x(N);

    cout.precision(2);
    cout << 1.0 * N * sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
    cout.precision(6);

// Data initialization: all ones  ==>  sum(x) == N
    for (size_t i = 0; i < N; ++i)
    {
        x[i] = 1;
    }

    cout << "\nStart Benchmarking sum\n";

    auto t1 = system_clock::now();                       // start timer
// Do calculation
    double check(0.0), ss(0.0);
    for (size_t i = 0; i < NLOOPS; ++i)
    {
        check = sum(x);
        ss += check;  // prevents the optimizer from removing unused calculation results.
    }

    auto t2 = system_clock::now();                       // stop timer
    auto duration = duration_cast<microseconds>(t2 - t1);
    double t_diff = static_cast<double>(duration.count()) / 1e6;
    t_diff = t_diff / NLOOPS;

// Check the correct result
    cout << "\n sum = " << check << endl;
    if (static_cast<size_t>(check) != N)
        cout << "  !!   W R O N G  result  !!\n";
    cout << endl;

// Timings and Performance (1 add per element)
    cout << endl;
    cout.precision(2);
    double Gflops = 1.0 * N / t_diff / 1024 / 1024 / 1024;
    double MemBandwidth = 1.0 * N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);

    cout << "Total duration  : " << t_diff * NLOOPS << endl;
    cout << "Timing in sec.  : " << t_diff << endl;
    cout << "GFLOPS          : " << Gflops << endl;
    cout << "GiByte/s        : " << MemBandwidth << endl;

    return vector<double>{t_diff, Gflops, MemBandwidth};
}

vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M)
{
    cout << "#################### (B) ####################" << endl;
    cout << "\nLOOPS = " << NLOOPS << endl;
    cout << "\nN = " << N << endl;
    cout << "\nM = " << M << endl;

// Memory allocation
    cout << "Memory allocation\n";

    vector<double> A(M * N);
    vector<double> x(N);

    cout.precision(2);
    cout << (1.0 * M * N + N) * sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
    cout.precision(6);

// Data initialization: x is the element-wise inverse of row 17  ==>  b[17] == N
    for (size_t i = 0; i < M; ++i)
        for (size_t j = 0; j < N; ++j)
            A[N * i + j] = (i + j) % 219 + 1;

    for (size_t j = 0; j < N; ++j)
    {
        x[j] = 1.0 / A[N * 17 + j];
    }

    cout << "\nStart Benchmarking MatVec\n";

    auto t1 = system_clock::now();                       // start timer
// Do calculation
    vector<double> b(M);
    for (size_t i = 0; i < NLOOPS; ++i)
    {
        b = MatVec_parallel(A, x);
    }

    auto t2 = system_clock::now();                       // stop timer
    auto duration = duration_cast<microseconds>(t2 - t1);
    double t_diff = static_cast<double>(duration.count()) / 1e6;
    t_diff = t_diff / NLOOPS;

// Check the correct result
    cout << "\n b[17] = " << b[17] << endl;
    if (static_cast<size_t>(b[17]) != N)
    {
        cout << "  !!   W R O N G  result  !!\n";
    }
    cout << endl;

// Timings and Performance
    cout << endl;
    cout.precision(2);
    double Gflops = (2.0 * N * M) / t_diff / 1024 / 1024 / 1024;
    double MemBandwidth = (2.0 * N * M + M) / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);

    cout << "Total duration  : " << t_diff * NLOOPS << endl;
    cout << "Timing in sec.  : " << t_diff << endl;
    cout << "GFLOPS          : " << Gflops << endl;
    cout << "GiByte/s        : " << MemBandwidth << endl;

    return vector<double>{t_diff, Gflops, MemBandwidth};
}

vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N)
{
    cout << "#################### (C) ####################" << endl;
    cout << "\nLOOPS = " << NLOOPS << endl;
    cout << "\nL = " << L << endl;
    cout << "\nM = " << M << endl;
    cout << "\nN = " << N << endl;

// Memory allocation
    cout << "Memory allocation\n";

    vector<double> A(M * L);
    vector<double> B(L * N);

    cout.precision(2);
    cout << (1.0 * M * L + L * N) * sizeof(A[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
    cout.precision(6);

// Data initialization: every column of B inverts row 17 of A  ==>  C[17][j] == L
    for (size_t i = 0; i < M; ++i)
        for (size_t k = 0; k < L; ++k)
            A[L * i + k] = (i + k) % 219 + 1;

    for (size_t k = 0; k < L; ++k)
        for (size_t j = 0; j < N; ++j)
            B[N * k + j] = 1.0 / A[L * 17 + k];

    cout << "\nStart Benchmarking MatMat\n";

    auto t1 = system_clock::now();                       // start timer
// Do calculation
    vector<double> C(M * N);
    double check(0.0);                                   // BUG FIX: defined value even when NLOOPS == 0
    double check_sum = 0;

    for (size_t i = 0; i < NLOOPS; ++i)
    {
        C = MatMat_parallel(A, B, L);
        check = C[N * 17];
        check_sum += check;  // prevents the optimizer from removing unused calculation results.
    }

    auto t2 = system_clock::now();                       // stop timer
    // BUG FIX: this output originally ran BEFORE the timer stop, so the
    // I/O was included in the measured time.
    cout << check_sum;
    auto duration = duration_cast<microseconds>(t2 - t1);
    double t_diff = static_cast<double>(duration.count()) / 1e6;
    t_diff = t_diff / NLOOPS;

// Check the correct result
    cout << "\n C[17,0] = " << check << endl;
    if (static_cast<size_t>(check) != L)
    {
        cout << "  !!   W R O N G  result  !!, should be " << L << "\n";
    }
    cout << endl;

// Timings and Performance
    cout << endl;
    cout.precision(2);
    double Gflops = (2.0 * L * N * M) / t_diff / 1024 / 1024 / 1024;
    double MemBandwidth = (2.0 * L * N * M + M * N) / t_diff / 1024 / 1024 / 1024 * sizeof(A[0]);

    cout << "Total duration  : " << t_diff * NLOOPS << endl;
    cout << "Timing in sec.  : " << t_diff << endl;
    cout << "GFLOPS          : " << Gflops << endl;
    cout << "GiByte/s        : " << MemBandwidth << endl;

    return vector<double>{t_diff, Gflops, MemBandwidth};
}

vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p)
{
    cout << "#################### (D) ####################" << endl;
    cout << "\nLOOPS = " << NLOOPS << endl;
    cout << "\nN = " << N << endl;
    cout << "\np = " << p << endl;

// Memory allocation
    cout << "Memory allocation\n";

    vector<double> a(p + 1, 0);
    vector<double> x(N);

    cout.precision(2);
    cout << (1.0 * (p + 1) + N) * sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
    cout.precision(6);

// Data initialization
    for (size_t j = 0; j < N; ++j)
        x[j] = 1.0 * j;

    for (size_t k = 0; k < p + 1; ++k)
        a[k] = pow(-1.0, k);  // poly(x) = 1 - x + x^2 - x^3 + x^4 - ...

    cout << "\nStart Benchmarking poly\n";

    auto t1 = system_clock::now();                       // start timer
// Do calculation
    vector<double> y(N);
    double check(0.0);      // BUG FIX: defined value even when NLOOPS == 0
    double check_sum(0.0);  // BUG FIX: was read via '+=' while uninitialized (UB)

    for (size_t i = 0; i < NLOOPS; ++i)
    {
        y = poly_parallel(a, x);
        check = y[0];
        check_sum += check;  // prevents the optimizer from removing unused calculation results.
    }

    auto t2 = system_clock::now();                       // stop timer
    auto duration = duration_cast<microseconds>(t2 - t1);
    double t_diff = static_cast<double>(duration.count()) / 1e6;
    t_diff = t_diff / NLOOPS;

// Check the correct result: x[0] == 0, so poly(0) == a[0] == 1
    cout << "\n poly(" << x[0] << ") = " << check << endl;
    if (abs(check - 1.0) > 1.0 / 1e6)
    {
        cout << "  !!   W R O N G  result  !!\n";
    }
    cout << endl;

// Timings and Performance
    cout << endl;
    cout.precision(2);
    // Horner costs 1 mul + 1 add per coefficient (the original counted 3,
    // matching the faulty pre-fix kernel).
    double Gflops = (N * (p + 1) * 2.0) / t_diff / 1024 / 1024 / 1024;
    // NOTE(review): traffic model kept from the original -- verify its 2/3 factors.
    double MemBandwidth = (N * (2.0 + 3.0 * (p + 1))) / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);

    cout << "Total duration  : " << t_diff * NLOOPS << endl;
    cout << "Timing in sec.  : " << t_diff << endl;
    cout << "GFLOPS          : " << Gflops << endl;
    cout << "GiByte/s        : " << MemBandwidth << endl;

    return vector<double>{t_diff, Gflops, MemBandwidth};
}
scalar_parallel(vector const &x, vector const &y); + + +/** (A) Sum entries of vector + @param[in] x vector + @return sum +*/ +double sum(vector const &x); + + +/** (B) Matrix-vector product (from intro_vector_densematrix) + * @param[in] A dense matrix (1D access) + * @param[in] u vector + * + * @return resulting vector +*/ +vector MatVec_parallel(vector const &A, vector const &x); + + +/** (C) Matrix-matrix product + * @param[in] A MxL dense matrix (1D access) + * @param[in] B LxN dense matrix (1D access) + * @param[in] shared_dim shared dimension L + * + * @return resulting MxN matrix +*/ +vector MatMat_parallel(vector const &A, vector const &B, size_t const &shared_dim); + + +/** (D) Evaluation of a polynomial function using Horner's scheme + * @param[in] a coefficient vector + * @param[in] x vector with input values + * + * @return vector with output values +*/ +vector poly_parallel(vector const &a, vector const &x); + + + + + + + + + + diff --git a/ex5/ex5_4/main.cpp b/ex5/ex5_4/main.cpp new file mode 100644 index 0000000..d8fa63d --- /dev/null +++ b/ex5/ex5_4/main.cpp @@ -0,0 +1,84 @@ +#include "benchmark_tests.h" +#include +#include + +int main() +{ + vector> results_scalar; + results_scalar.push_back(test_A(2000000, pow(10,3))); + results_scalar.push_back(test_A(1000000, pow(10,4))); + results_scalar.push_back(test_A(100000, pow(10,5))); + results_scalar.push_back(test_A(10000, pow(10,6))); + results_scalar.push_back(test_A(750, pow(10,7))); + results_scalar.push_back(test_A(125, pow(10,8))); + + + vector> results_sum; + results_sum.push_back(test_A_sum(3000000, pow(10,3))); + results_sum.push_back(test_A_sum(2000000, pow(10,4))); + results_sum.push_back(test_A_sum(1000000, pow(10,5))); + results_sum.push_back(test_A_sum(50000, pow(10,6))); + results_sum.push_back(test_A_sum(2000, pow(10,7))); + results_sum.push_back(test_A_sum(250, pow(10,8))); + + + test_B(100, 20000, 10000); + + test_C(25, 500, 1000, 1500); + + test_D(100, 100, 1000000); + + + + cout << 
endl << "###### Scalar ######" << endl; + cout << "Timing\tGFLOPS\tGiByte/s" << endl; + cout << "------------------------------" << endl; + for (size_t i = 0; i < results_scalar.size(); ++i) + cout << results_scalar[i][0] << "\t" << results_scalar[i][1] << "\t" << results_scalar[i][2] << endl; + + cout << endl << "###### Sum ######" << endl; + cout << "Timing\tGFLOPS\tGiByte/s" << endl; + cout << "------------------------------" << endl; + for (size_t i = 0; i < results_sum.size(); ++i) + cout << results_sum[i][0] << "\t" << results_sum[i][1] << "\t" << results_sum[i][2] << endl; + + + + + // ###### Scalar ###### + // Timing GFLOPS GiByte/s + // ------------------------------ + // 3.4e-06 0.54 4.3 + // 4.6e-06 4 32 + // 1.6e-05 12 95 + // 0.0011 1.7 13 + // 0.0097 1.9 15 + // 0.075 2.5 20 + + + // ###### Sum ###### + // Timing GFLOPS GiByte/s + // ------------------------------ + // 5.5e-06 0.17 1.3 + // 5.4e-06 1.7 14 + // 1.5e-05 6.1 49 + // 0.00013 7.2 57 + // 0.0033 2.8 23 + // 0.032 2.9 23 + + + + + + // ######### NOT PARALLEL (from exercise sheet 2) ######### + // Timing GFLOPS GiByte/s + // ---------------------------------- + // (A) 0.038 2.5 20 + // (B) 0.13 2.9 23 + // (C) 0.44 3.2 25 + // (D) 0.19 1.5 12 + + + + return 0; +}