Pushing everything again, accidentally deleted my remote repository

2025-12-09 22:06:13 +01:00 · 2025-12-09 22:06:13 +01:00 · 1bee3e8e5b
commit 1bee3e8e5b
101 changed files with 9428 additions and 0 deletions
--- a/ex3/.vscode/settings.json
+++ b/ex3/.vscode/settings.json
@ -0,0 +1,71 @@
+{
+    "files.associations": {
+        "array": "cpp",
+        "atomic": "cpp",
+        "bit": "cpp",
+        "cctype": "cpp",
+        "charconv": "cpp",
+        "chrono": "cpp",
+        "clocale": "cpp",
+        "cmath": "cpp",
+        "compare": "cpp",
+        "complex": "cpp",
+        "concepts": "cpp",
+        "condition_variable": "cpp",
+        "cstdarg": "cpp",
+        "cstddef": "cpp",
+        "cstdint": "cpp",
+        "cstdio": "cpp",
+        "cstdlib": "cpp",
+        "cstring": "cpp",
+        "ctime": "cpp",
+        "cwchar": "cpp",
+        "cwctype": "cpp",
+        "deque": "cpp",
+        "forward_list": "cpp",
+        "string": "cpp",
+        "unordered_map": "cpp",
+        "unordered_set": "cpp",
+        "vector": "cpp",
+        "exception": "cpp",
+        "algorithm": "cpp",
+        "functional": "cpp",
+        "iterator": "cpp",
+        "memory": "cpp",
+        "memory_resource": "cpp",
+        "numeric": "cpp",
+        "optional": "cpp",
+        "random": "cpp",
+        "ratio": "cpp",
+        "string_view": "cpp",
+        "system_error": "cpp",
+        "tuple": "cpp",
+        "type_traits": "cpp",
+        "utility": "cpp",
+        "format": "cpp",
+        "initializer_list": "cpp",
+        "iomanip": "cpp",
+        "iosfwd": "cpp",
+        "iostream": "cpp",
+        "istream": "cpp",
+        "limits": "cpp",
+        "mutex": "cpp",
+        "new": "cpp",
+        "numbers": "cpp",
+        "ostream": "cpp",
+        "semaphore": "cpp",
+        "span": "cpp",
+        "sstream": "cpp",
+        "stdexcept": "cpp",
+        "stop_token": "cpp",
+        "streambuf": "cpp",
+        "thread": "cpp",
+        "cinttypes": "cpp",
+        "typeindex": "cpp",
+        "typeinfo": "cpp",
+        "variant": "cpp",
+        "cassert": "cpp",
+        "list": "cpp",
+        "fstream": "cpp"
+    }
+}
--- a/ex3/CLANG_default.mk
+++ b/ex3/CLANG_default.mk
@ -0,0 +1,123 @@
+# Basic Defintions for using GNU-compiler suite sequentially
+# requires setting of COMPILER=CLANG_
+
+#CLANGPATH=//usr/lib/llvm-10/bin/
+CC     = ${CLANGPATH}clang
+CXX    = ${CLANGPATH}clang++
+#CXX   = ${CLANGPATH}clang++ -lomptarget  -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=/opt/pgi/linux86-64/2017/cuda/8.0
+#F77   = gfortran
+LINKER = ${CXX}
+
+#http://clang.llvm.org/docs/UsersManual.html#options-to-control-error-and-warning-messages
+WARNINGS += -Weverything -Wno-c++98-compat -Wno-sign-conversion -Wno-date-time -Wno-shorten-64-to-32 -Wno-padded -ferror-limit=1
+WARNINGS += -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic
+#-fsyntax-only -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic
+
+CXXFLAGS += -O3 -std=c++17 -ferror-limit=1 ${WARNINGS}
+# don't use -Ofast
+# -ftrapv
+LINKFLAGS += -O3
+
+# different libraries in Ubuntu or manajaró
+ifndef UBUNTU
+UBUNTU=1
+endif
+
+# BLAS, LAPACK
+LINKFLAGS += -llapack -lblas
+# -lopenblas
+ifeq ($(UBUNTU),1)
+# ubuntu
+else
+# on  archlinux
+LINKFLAGS += -lcblas
+endif
+
+# interprocedural optimization
+CXXFLAGS  += -flto
+LINKFLAGS += -flto
+
+#   very good check
+# http://clang.llvm.org/extra/clang-tidy/
+#   good check, see:  http://llvm.org/docs/CodingStandards.html#include-style
+SWITCH_OFF=,-readability-magic-numbers,-readability-redundant-control-flow,-readability-redundant-member-init
+SWITCH_OFF+=,-readability-redundant-member-init,-readability-isolate-declaration
+#READABILITY=,readability*${SWITCH_OFF}
+#TIDYFLAGS = -checks=llvm-*,-llvm-header-guard -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
+TIDYFLAGS = -checks=llvm-*,-llvm-header-guard${READABILITY} -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
+#TIDYFLAGS += -checks='modernize*
+#   ???
+#TIDYFLAGS = -checks='cert*'  -header-filter=.*
+#   MPI checks ??
+#TIDYFLAGS = -checks='mpi*'
+#   ??
+#TIDYFLAGS = -checks='performance*'   -header-filter=.*
+#TIDYFLAGS = -checks='portability-*'  -header-filter=.*
+#TIDYFLAGS = -checks='readability-*'  -header-filter=.*
+
+default: ${PROGRAM}
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER)  $^  ${LINKFLAGS} -o $@
+
+clean:
+	@rm -f ${PROGRAM} ${OBJECTS}
+
+clean_all:: clean
+	@rm -f *_ *~ *.bak *.log *.out *.tar
+
+codecheck: tidy_check
+tidy_check:
+	clang-tidy ${SOURCES} ${TIDYFLAGS} -- ${SOURCES}
+# see also http://clang-developers.42468.n3.nabble.com/Error-while-trying-to-load-a-compilation-database-td4049722.html
+
+run: clean ${PROGRAM}
+#	time  ./${PROGRAM} ${PARAMS}
+	./${PROGRAM} ${PARAMS}
+
+# tar the current directory
+MY_DIR = `basename ${PWD}`
+tar: clean_all
+	@echo "Tar the directory: " ${MY_DIR}
+	@cd .. ;\
+	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+# 	tar cf `basename ${PWD}`.tar *
+
+doc:
+	doxygen Doxyfile
+
+#########################################################################
+
+.cpp.o:
+	$(CXX) -c $(CXXFLAGS) -o $@ $<
+
+.c.o:
+	$(CC) -c $(CFLAGS) -o $@ $<
+
+.f.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+##################################################################################################
+#    some tools
+# Cache behaviour (CXXFLAGS += -g  tracks down to source lines; no -pg in linkflags)
+cache: ${PROGRAM}
+	valgrind --tool=callgrind --simulate-cache=yes ./$^  ${PARAMS}
+#	kcachegrind callgrind.out.<pid> &
+	kcachegrind `ls -1tr  callgrind.out.* |tail -1`
+
+# Check for wrong memory accesses, memory leaks, ...
+# use smaller data sets
+mem: ${PROGRAM}
+	valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^  ${PARAMS}
+
+#  Simple run time profiling of your code
+#  CXXFLAGS += -g -pg
+#  LINKFLAGS += -pg
+prof: ${PROGRAM}
+	perf record ./$^  ${PARAMS}
+	perf report
+#	gprof -b ./$^ > gp.out
+#	kprof -f gp.out -p gprof &
+
+codecheck: tidy_check
--- a/ex3/GCC_AMD32_default.mk
+++ b/ex3/GCC_AMD32_default.mk
@ -0,0 +1,130 @@
+# Basic Defintions for using GNU-compiler suite sequentially
+# requires setting of COMPILER=GCC_
+
+CC	= gcc
+CXX     = g++
+F77	= gfortran
+LINKER  = ${CXX}
+
+# on mephisto:
+#CXXFLAGS  += -I/share/apps/atlas/include
+#LINKFLAGS += -L/share/apps/atlas/lib
+#LINKFLAGS   += -lcblas -latlas
+
+#LINKFLAGS   += -lblas
+# Der <cblas.h> Header muss mit extern "C" versehen werden, damit g++ alles findet.
+
+
+#WARNINGS = -pedantic -pedantic-errors -Wall -Wextra -Werror -Wconversion -Weffc++ -Woverloaded-virtual  -Wfloat-equal -Wshadow
+WARNINGS = -pedantic -Wall -Wextra -Wconversion -Weffc++ -Woverloaded-virtual  -Wfloat-equal -Wshadow \
+           -Wredundant-decls -Winline -fmax-errors=1
+#           -Wunreachable-code
+#  -Wunreachable-code
+CXXFLAGS += -ffast-math -O3 -march=native -std=c++17 ${WARNINGS}
+#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS}
+#-msse3
+# -ftree-vectorizer-verbose=2  -DNDEBUG
+# -ftree-vectorizer-verbose=5
+# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump  -fdump-tree-pre=stderr
+
+# CFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details
+# CFLAGS	= -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2
+# #CFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
+# FFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
+# LFLAGS  = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
+LINKFLAGS   += -O3
+
+# BLAS, LAPACK
+OPENBLAS_DIR = /opt/openblas_GCCseq
+#OPENBLAS_DIR = /opt/openblas_GCC
+OPENBLAS_LIBDIR = ${OPENBLAS_DIR}/lib
+OPENBLAS_INCDIR = ${OPENBLAS_DIR}/include
+CXXFLAGS += -I${OPENBLAS_INCDIR}
+LINKFLAGS += -L${OPENBLAS_LIBDIR} -lopenblas
+
+# interprocedural optimization
+CXXFLAGS += -flto
+LINKFLAGS += -flto
+
+# profiling tools
+#CXXFLAGS  += -pg
+#LINKFLAGS += -pg
+
+default: ${PROGRAM}
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER)  $^  ${LINKFLAGS} -o $@
+
+clean:
+	@rm -f ${PROGRAM} ${OBJECTS}
+
+clean_all:: clean
+	-@rm -f *_ *~ *.bak *.log *.out *.tar *.orig
+	-@rm -r html
+
+run: clean ${PROGRAM}
+#	time  ./${PROGRAM}
+#	./${PROGRAM}
+	( export LD_LIBRARY_PATH=${OPENBLAS_LIBDIR}:${LD_LIBRARY_PATH} ; ./${PROGRAM} )
+#            or  'export LD_LIBRARY_PATH=/opt/openblas_gcc/lib:${LD_LIBRARY_PATH}'  in your ~/.bashrc 
+
+# tar the current directory
+MY_DIR = `basename ${PWD}`
+tar:
+	@echo "Tar the directory: " ${MY_DIR}
+	@cd .. ;\
+	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+# 	tar cf `basename ${PWD}`.tar *
+
+doc:
+	doxygen Doxyfile
+
+#########################################################################
+
+.cpp.o:
+	$(CXX) -c $(CXXFLAGS) -o $@ $<
+
+.c.o:
+	$(CC) -c $(CFLAGS) -o $@ $<
+
+.f.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+##################################################################################################
+#    some tools
+# Cache behaviour (CXXFLAGS += -g  tracks down to source lines; no -pg in linkflags)
+cache: ${PROGRAM}
+	valgrind --tool=callgrind --simulate-cache=yes ./$^
+#	kcachegrind callgrind.out.<pid> &
+	kcachegrind `ls -1tr  callgrind.out.* |tail -1`
+
+# Check for wrong memory accesses, memory leaks, ...
+# use smaller data sets
+# no "-pg"  in compile/link options
+mem: ${PROGRAM}
+	valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^
+
+#  Simple run time profiling of your code
+#  CXXFLAGS += -g -pg
+#  LINKFLAGS += -pg
+prof: ${PROGRAM}
+	./$^
+	gprof -b ./$^ > gp.out
+#	kprof -f gp.out -p gprof &
+
+#Trace your heap:
+#> heaptrack ./main.GCC_
+#> heaptrack_gui heaptrack.main.GCC_.<pid>.gz
+heap: ${PROGRAM}
+	heaptrack ./$^ 11
+	heaptrack_gui  `ls -1tr  heaptrack.$^.* |tail -1` &
+
+
+
+########################################################################
+#  get the detailed  status of all optimization flags
+info:
+	echo "detailed  status of all optimization flags"
+	$(CXX) --version
+	$(CXX) -Q $(CXXFLAGS) --help=optimizers
--- a/ex3/GCC_default.mk
+++ b/ex3/GCC_default.mk
@ -0,0 +1,183 @@
+# Basic Defintions for using GNU-compiler suite sequentially
+# requires setting of COMPILER=GCC_
+
+CC	= gcc
+CXX     = g++
+F77	= gfortran
+LINKER  = ${CXX}
+
+WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
+           -Wredundant-decls -Winline -fmax-errors=1
+#  -Wunreachable-code
+CXXFLAGS += -ffast-math -O3 -march=native -std=c++17 ${WARNINGS}
+#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS}
+#-msse3
+# -ftree-vectorizer-verbose=2  -DNDEBUG
+# -ftree-vectorizer-verbose=5
+# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump  -fdump-tree-pre=stderr
+
+# CFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details
+# CFLAGS	= -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2
+# #CFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
+# FFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
+# LFLAGS  = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
+LINKFLAGS   += -O3
+
+#architecture
+#CPU = -march=znver2
+CXXFLAGS  += ${CPU}
+LINKFLAGS += ${CPU}
+
+# different libraries in Ubuntu or manajaró
+ifndef UBUNTU
+UBUNTU=1
+endif
+
+# BLAS, LAPACK
+ifeq ($(UBUNTU),1)
+LINKFLAGS += -llapack -lblas
+# -lopenblas
+else
+# on  archlinux
+LINKFLAGS += -llapack -lopenblas -lcblas
+endif
+
+# interprocedural optimization
+CXXFLAGS  += -flto
+LINKFLAGS += -flto
+
+# for debugging purpose (save code)
+# -fsanitize=leak         # only one out the three can be used
+# -fsanitize=address
+# -fsanitize=thread
+SANITARY =  -fsanitize=address  -fsanitize=undefined -fsanitize=null -fsanitize=return \
+ -fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \
+ -fsanitize=bool -fsanitize=enum -fsanitize=vptr
+#CXXFLAGS  += ${SANITARY}
+#LINKFLAGS += ${SANITARY}
+
+# profiling tools
+#CXXFLAGS  += -pg
+#LINKFLAGS += -pg
+
+
+default: ${PROGRAM}
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER)  $^  ${LINKFLAGS} -o $@
+
+clean:
+	@rm -f ${PROGRAM} ${OBJECTS}
+
+clean_all:: clean
+	-@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt
+	-@rm -rf html
+
+run: clean ${PROGRAM}
+#run: ${PROGRAM}
+#	time  ./${PROGRAM} ${PARAMS}
+	./${PROGRAM} ${PARAMS}
+
+# tar the current directory
+MY_DIR = `basename ${PWD}`
+tar: clean_all
+	@echo "Tar the directory: " ${MY_DIR}
+	@cd .. ;\
+	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+# 	tar cf `basename ${PWD}`.tar *
+#find . -size +10M > large_files
+#--exclude-from ${MY_DIR}/large_files
+
+zip: clean
+	@echo "Zip the directory: " ${MY_DIR}
+	@cd .. ;\
+	zip -r ${MY_DIR}.zip ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+
+doc:
+	doxygen Doxyfile
+
+#########################################################################
+.SUFFIXES: .f90
+
+.cpp.o:
+	$(CXX) -c $(CXXFLAGS) -o $@ $<
+#	$(CXX) -c $(CXXFLAGS) -o $@ $<  2>&1 | tee -a $<.log 
+#	$(CXX) -c $(CXXFLAGS) -o $@ $<  2>&1 | tee -a $(<:.cpp=.log)
+
+.c.o:
+	$(CC) -c $(CFLAGS) -o $@ $<
+
+.f.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+.f90.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+##################################################################################################
+#    some tools
+# Cache behaviour (CXXFLAGS += -g  tracks down to source lines; no -pg in linkflags)
+cache: ${PROGRAM}
+	valgrind --tool=callgrind --simulate-cache=yes ./$^  ${PARAMS}
+#	kcachegrind callgrind.out.<pid> &
+	kcachegrind `ls -1tr  callgrind.out.* |tail -1`
+
+# Check for wrong memory accesses, memory leaks, ...
+# use smaller data sets
+# no "-pg"  in compile/link options
+mem: ${PROGRAM}
+	valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^  ${PARAMS}
+# Graphical interface
+# valkyrie
+
+#  Simple run time profiling of your code
+#  CXXFLAGS += -g -pg
+#  LINKFLAGS += -pg
+prof: ${PROGRAM}
+	perf record ./$^  ${PARAMS}
+	perf report
+#	gprof -b ./$^ > gp.out
+#	kprof -f gp.out -p gprof &
+
+#  perf in Ubuntu 20.04:   https://www.howtoforge.com/how-to-install-perf-performance-analysis-tool-on-ubuntu-20-04/
+#  * install 
+#  * sudo vi /etc/sysctl.conf
+#                add   kernel.perf_event_paranoid = 0
+
+#Trace your heap:
+#> heaptrack ./main.GCC_
+#> heaptrack_gui heaptrack.main.GCC_.<pid>.gz
+heap: ${PROGRAM}
+	heaptrack ./$^  ${PARAMS}
+	heaptrack_gui  `ls -1tr  heaptrack.$^.* |tail -1` &
+
+codecheck: $(SOURCES)
+	cppcheck --enable=all --inconclusive --std=c++17 --suppress=missingIncludeSystem $^
+
+
+########################################################################
+#  get the detailed  status of all optimization flags
+info:
+	echo "detailed  status of all optimization flags"
+	$(CXX) --version
+	$(CXX) -Q $(CXXFLAGS) --help=optimizers
+	lscpu
+	inxi -C
+	lstopo
+
+# Excellent hardware info
+#	hardinfo
+# Life monitoring of CPU frequency etc.
+#	sudo i7z
+
+# Memory  consumption
+#	vmstat -at -SM 3
+#	xfce4-taskmanager
+
+
+# https://www.tecmint.com/check-linux-cpu-information/
+#https://www.tecmint.com/monitor-cpu-and-gpu-temperature-in-ubuntu/
+
+# Debugging:
+# https://wiki.archlinux.org/index.php/Debugging
--- a/ex3/ICC_default.mk
+++ b/ex3/ICC_default.mk
@ -0,0 +1,137 @@
+# Basic Defintions for using INTEL compiler suite sequentially
+# requires setting of COMPILER=ICC_
+
+#BINDIR = /opt/intel/bin/
+
+# special on my sony [GH]
+#BINDIR = /opt/save.intel/bin/
+# very special on my sony [GH]
+# FIND_LIBS = -L /opt/save.intel/composer_xe_2013.1.117/mkl/lib/intel64/libmkl_intel_lp64.so
+
+# Error with g++-4.8 using icpc14.0,x:
+#   find directory wherein bits/c++config.h is located
+#   'locate bits/c++config.h'
+#FOUND_CONFIG = -I/usr/include/x86_64-linux-gnu/c++/4.8
+
+
+CC	= ${BINDIR}icc
+CXX     = ${BINDIR}icpc
+F77	= ${BINDIR}ifort
+LINKER  = ${CXX}
+
+
+WARNINGS = -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -wd2015,2012 -wn3
+#    -Winline -Wredundant-decls -Wunreachable-code
+CXXFLAGS +=  -O3 -fargument-noalias -std=c++17 -DNDEBUG ${WARNINGS} -mkl ${FOUND_CONFIG}
+# profiling tools
+#CXXFLAGS  += -pg
+#LINKFLAGS += -pg
+# -vec-report=3
+# -qopt-report=5 -qopt-report-phase=vec
+# -guide -parallel
+# -guide-opts=string  -guide-par[=n]  -guide-vec[=n]
+# -auto-p32 -simd
+CXXFLAGS += -align
+
+# use MKL by INTEL
+#  https://software.intel.com/content/www/us/en/develop/documentation/mkl-linux-developer-guide/top/linking-your-application-with-the-intel-math-kernel-library/linking-quick-start/using-the-mkl-compiler-option.html
+# https://software.intel.com/content/www/us/en/develop/articles/intel-mkl-link-line-advisor.html
+# LINKFLAGS += -L${BINDIR}../composer_xe_2013.1.117/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
+#LINKFLAGS += -O3 -L/opt/intel/mkl/lib -mkl
+LINKFLAGS += -O3 -mkl=sequential
+
+# interprocedural optimization
+CXXFLAGS += -ipo
+LINKFLAGS += -ipo
+
+# annotated assembler file
+ANNOTED = -fsource-asm -S
+
+default:	${PROGRAM}
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER)  $^  ${LINKFLAGS} -o $@
+
+clean:
+	rm -f ${PROGRAM} ${OBJECTS}
+
+clean_all:: clean
+	@rm -f *_ *~ *.bak *.log *.out *.tar
+
+run: clean ${PROGRAM}
+	./${PROGRAM}
+
+# tar the current directory
+MY_DIR = `basename ${PWD}`
+tar: clean_all
+	@echo "Tar the directory: " ${MY_DIR}
+	@cd .. ;\
+	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+# 	tar cf `basename ${PWD}`.tar *
+
+doc:
+	doxygen Doxyfile
+
+#########################################################################
+
+.cpp.o:
+	$(CXX) -c $(CXXFLAGS) -o $@ $<
+
+.c.o:
+	$(CC) -c $(CFLAGS) -o $@ $<
+
+.f.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+##################################################################################################
+# #    some tools
+# # Cache behaviour (CXXFLAGS += -g  tracks down to source lines)
+# cache: ${PROGRAM}
+# 	valgrind --tool=callgrind --simulate-cache=yes ./$^
+# #	kcachegrind callgrind.out.<pid> &
+#
+# # Check for wrong memory accesses, memory leaks, ...
+# # use smaller data sets
+# mem: ${PROGRAM}
+# 	valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^
+#
+# #  Simple run time profiling of your code
+# #  CXXFLAGS += -g -pg
+# #  LINKFLAGS += -pg
+# prof: ${PROGRAM}
+# 	./$^
+# 	gprof -b ./$^ > gp.out
+# #	kprof -f gp.out -p gprof &
+#
+
+
+mem: inspector
+prof: amplifier
+cache: amplifier
+
+gap_par_report:
+	${CXX}  -c -guide -parallel $(SOURCES) 2> gap.txt
+
+# GUI for performance report
+amplifier: ${PROGRAM}
+	echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
+#	alternatively to the solution above:
+            #edit file  /etc/sysctl.d/10-ptrace.conf     and set variable   kernel.yama.ptrace_scope   variable to 0 .
+	amplxe-gui &
+
+# GUI for Memory and Thread analyzer (race condition)
+inspector: ${PROGRAM}
+	echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
+	inspxe-gui &
+
+advisor:
+	echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
+	advixe-gui &
+
+icc-info:
+	icpc -# main.cpp
+
+
+
+
--- a/ex3/ONEAPI_default.mk
+++ b/ex3/ONEAPI_default.mk
@ -0,0 +1,176 @@
+# Basic Defintions for using INTEL compiler suite sequentially
+# requires setting of COMPILER=ONEAPI_
+
+#         https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html
+# requires
+# source /opt/intel/oneapi/setvars.sh
+# on  AMD:    export MKL_DEBUG_CPU_TYPE=5
+
+#BINDIR = /opt/intel/oneapi/compiler/latest/linux/bin/
+#MKL_ROOT = /opt/intel/oneapi/mkl/latest/
+#export KMP_AFFINITY=verbose,compact
+
+CC	= ${BINDIR}icc
+CXX     = ${BINDIR}dpcpp
+F77	= ${BINDIR}ifort
+LINKER  = ${CXX}
+
+## Compiler flags
+WARNINGS = -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -pedantic
+WARNINGS += -Wpessimizing-move -Wredundant-move
+#-wd2015,2012,2014 -wn3
+#    -Winline -Wredundant-decls -Wunreachable-code
+# -qopt-subscript-in-range
+# -vec-threshold0
+
+CXXFLAGS += -O3 -std=c++17  ${WARNINGS}
+#CXXFLAGS += -DMKL_ILP64  -I"${MKLROOT}/include"
+#CXXFLAGS += -DMKL_ILP32  -I"${MKLROOT}/include"
+LINKFLAGS += -O3
+
+# interprocedural optimization
+CXXFLAGS  += -ipo
+LINKFLAGS += -ipo
+LINKFLAGS += -flto
+
+# annotated Assembler file
+ANNOTED = -fsource-asm -S 
+
+#architecture
+CPU  = -march=core-avx2
+#CPU += -mtp=zen
+# -xCORE-AVX2
+# -axcode COMMON-AVX512 -axcode MIC-AVX512 -axcode CORE-AVX512 -axcode CORE-AVX2
+CXXFLAGS  += ${CPU}
+LINKFLAGS += ${CPU}
+
+# use MKL by INTEL
+# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html
+# sequential MKL
+#                              use the 32 bit interface (LP64) instead of 64 bit interface (ILP64)
+CXXFLAGS +=  -qmkl=sequential  -UMKL_ILP64
+LINKFLAGS += -O3 -qmkl=sequential -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
+#LINKFLAGS += -O3 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
+
+# shared libs:  https://aur.archlinux.org/packages/intel-oneapi-compiler-static
+#     install intel-oneapi-compiler-static   
+# or 
+LINKFLAGS += -shared-intel
+
+
+OPENMP = -qopenmp
+CXXFLAGS += ${OPENMP}
+LINKFLAGS += ${OPENMP}
+
+
+# profiling tools
+#CXXFLAGS  += -pg
+#LINKFLAGS += -pg
+# -vec-report=3
+# -qopt-report=5 -qopt-report-phase=vec -qopt-report-phase=openmp
+# -guide -parallel
+# -guide-opts=string  -guide-par[=n]  -guide-vec[=n]
+# -auto-p32 -simd
+
+# Reports: https://software.intel.com/en-us/articles/getting-the-most-out-of-your-intel-compiler-with-the-new-optimization-reports
+#CXXFLAGS +=  -qopt-report=5 -qopt-report-phase=vec,par
+#CXXFLAGS +=  -qopt-report=5 -qopt-report-phase=cg
+# Redirect report from *.optrpt to stderr
+#    -qopt-report-file=stderr
+# Guided paralellization
+#    -guide -parallel
+#    -guide-opts=string  -guide-par[=n]  -guide-vec[=n]
+#    -auto-p32 -simd
+
+## run time checks
+# https://www.intel.com/content/www/us/en/develop/documentation/fortran-compiler-oneapi-dev-guide-and-reference/top/compiler-reference/compiler-options/offload-openmp-and-parallel-processing-options/par-runtime-control-qpar-runtime-control.html
+
+
+default:	${PROGRAM}
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER)  $^  ${LINKFLAGS} -o $@
+
+clean:
+	rm -f ${PROGRAM} ${OBJECTS} *.optrpt
+
+clean_all:: clean
+	@rm -f *_ *~ *.bak *.log *.out *.tar
+
+run: clean ${PROGRAM}
+	./${PROGRAM}
+
+# tar the current directory
+MY_DIR = `basename ${PWD}`
+tar: clean_all
+	@echo "Tar the directory: " ${MY_DIR}
+	@cd .. ;\
+	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+# 	tar cf `basename ${PWD}`.tar *
+
+doc:
+	doxygen Doxyfile
+
+#########################################################################
+
+.cpp.o:
+	$(CXX) -c $(CXXFLAGS) -o $@ $<
+
+.c.o:
+	$(CC) -c $(CFLAGS) -o $@ $<
+
+.f.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+##################################################################################################
+#    some tools
+# Cache behaviour (CXXFLAGS += -g  tracks down to source lines)
+# https://software.intel.com/content/www/us/en/develop/documentation/vtune-help/top/analyze-performance/microarchitecture-analysis-group/memory-access-analysis.html
+
+mem: inspector
+prof: vtune
+cache: inspector
+
+gap_par_report:
+	${CXX}  -c -guide -parallel $(SOURCES) 2> gap.txt
+
+# GUI for performance report
+amplifier: ${PROGRAM}
+	echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
+	echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid
+	amplxe-gui &
+
+# GUI for Memory and Thread analyzer (race condition)
+inspector: ${PROGRAM}
+	echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
+#	inspxe-gui &
+	vtune-gui ./${PROGRAM} &
+
+advisor:
+	source /opt/intel/oneapi/advisor/2021.2.0/advixe-vars.sh
+#	/opt/intel/oneapi/advisor/latest/bin64/advixe-gui &
+	advisor --collect=survey ./${PROGRAM} 
+#	advisor --collect=roofline ./${PROGRAM} 
+	advisor --report=survey --project-dir=./ src:r=./ --format=csv --report-output=./out/survey.csv
+
+vtune:
+	echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
+#	https://software.intel.com/en-us/articles/intel-advisor-2017-update-1-what-s-new
+	export ADVIXE_EXPERIMENTAL=roofline
+	vtune -collect hotspots ./${PROGRAM}
+	vtune -report hotspots -r r000hs > vtune.out
+#	vtune-gui ./${PROGRAM} &	
+
+icc-info:
+	icpc -# main.cpp
+
+# MKL on AMD
+# https://www.computerbase.de/2019-11/mkl-workaround-erhoeht-leistung-auf-amd-ryzen/
+#
+# https://sites.google.com/a/uci.edu/mingru-yang/programming/mkl-has-bad-performance-on-an-amd-cpu
+# export MKL_DEBUG_CPU_TYPE=5
+# export MKL_NUM_THRAEDS=1
+# export MKL_DYNAMIC=false
+#  on Intel compiler
+# http://publicclu2.blogspot.com/2013/05/intel-complier-suite-reference-card.html
--- a/ex3/PGI_default.mk
+++ b/ex3/PGI_default.mk
@ -0,0 +1,93 @@
+# Basic Defintions for using PGI-compiler suite sequentially
+# requires setting of COMPILER=PGI_
+# OPTIRUN = optirun
+
+
+CC	= pgcc
+CXX     = pgc++
+F77	= pgfortran
+LINKER  = ${CXX}
+
+# on mephisto:
+#CXXFLAGS  += -I/share/apps/atlas/include
+#LINKFLAGS += -L/share/apps/atlas/lib
+#LINKFLAGS   += -lcblas -latlas
+
+#LINKFLAGS   += -lblas
+# Der <cblas.h> Header muss mit extern "C" versehen werden, damit g++ alles findet.
+
+WARNINGS = -Minform=warn
+# -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -W -Wfloat-equal -Wshadow  -Wredundant-decls
+#           -pedantic -Wunreachable-code -Wextra -Winline
+#  -Wunreachable-code
+
+#PGI_PROFILING = -Minfo=ccff,loop,vect,opt,intensity,mp,accel
+PGI_PROFILING = -Minfo=ccff,accel,ipa,loop,lre,mp,opt,par,unified,vect,intensity
+# -Minfo
+# -Mprof=time
+# -Mprof=lines
+#        take care with option      -Msafeptr
+CXXFLAGS += -O3 -std=c++17  ${WARNINGS}
+#CXXFLAGS += -O3 -std=c++11  -DNDEBUG ${PGI_PROFILING} ${WARNINGS}
+#  -fastsse  -fargument-noalias ${WARNINGS}  -msse3 -vec-report=3
+
+default:	${PROGRAM}
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER)  $^  ${LINKFLAGS} -o $@
+
+clean:
+	@rm -f ${PROGRAM} ${OBJECTS}
+
+clean_all:: clean
+	@rm -f *_ *~ *.bak *.log *.out *.tar
+
+run: clean ${PROGRAM}
+	./${PROGRAM}
+
+# tar the current directory
+MY_DIR = `basename ${PWD}`
+tar: clean_all
+	@echo "Tar the directory: " ${MY_DIR}
+	@cd .. ;\
+	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+# 	tar cf `basename ${PWD}`.tar *
+
+doc:
+	doxygen Doxyfile
+
+#########################################################################
+
+.cpp.o:
+	$(CXX) -c $(CXXFLAGS) -o $@ $<
+
+.c.o:
+	$(CC) -c $(CFLAGS) -o $@ $<
+
+.f.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+##################################################################################################
+# #    some tools
+# #  Simple run time profiling of your code
+# #  CXXFLAGS += -g -pg
+# #  LINKFLAGS += -pg
+
+
+# Profiling options PGI, see: pgcollect -help
+# CPU_PROF = -allcache
+CPU_PROF = -time
+# GPU_PROF = -cuda=gmem,branch,cc13 -cudainit
+#GPU_PROF = -cuda=branch:cc20
+#
+PROF_FILE = pgprof.out
+
+cache: prof
+
+prof: ${PROGRAM}
+	${OPTIRUN} ${BINDIR}pgcollect $(CPU_PROF) ./$^
+	${OPTIRUN} ${BINDIR}pgprof -exe ./$^  $(PROF_FILE) &
+
+info:
+	pgaccelinfo -v
--- a/ex3/ex3/Makefile
+++ b/ex3/ex3/Makefile
@ -0,0 +1,30 @@
+#
+# use GNU-Compiler tools
+COMPILER=GCC_
+# alternatively from the shell
+# export COMPILER=GCC_
+# or, alternatively from the shell
+# make COMPILER=GCC_
+
+# use Intel compilers
+#COMPILER=ICC_
+
+# use PGI compilers
+# COMPILER=PGI_
+
+
+SOURCES = main.cpp benchmarks.cpp benchmark_tests.cpp factorization_solve.cpp factorization_solve_tests.cpp vdop.cpp getmatrix.cpp
+OBJECTS = $(SOURCES:.cpp=.o)
+
+PROGRAM	= main.${COMPILER}
+
+# uncomment the next to lines for debugging and detailed performance analysis
+CXXFLAGS += -g
+LINKFLAGS += -g
+# do not use -pg with PGI compilers
+
+ifndef COMPILER
+  COMPILER=GCC_
+endif
+
+include ../${COMPILER}default.mk
--- a/ex3/ex3/benchmark_tests.cpp
+++ b/ex3/ex3/benchmark_tests.cpp
@ -0,0 +1,333 @@
+#include "benchmark_tests.h"
+#include "benchmarks.h"
+#include <chrono>
+#include <iostream>
+#include <math.h>       
+using namespace std::chrono; 
+
+vector<double> test_A(const size_t &NLOOPS, const size_t &N, const function<double(const vector<double>&, const vector<double>&)>& scalar_function)
+{
+    cout << "#################### (A) ####################" << endl;
+    cout << "\nLOOPS = " << NLOOPS << endl;
+    cout << "\nN = " << N << endl;
+
+
+//  Memory allocation
+    cout << "Memory allocation\n";
+
+    vector<double> x(N), y(N);
+
+    cout.precision(2);
+    cout << 2.0*N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
+    cout.precision(6);
+
+
+//  Data initialization
+//  Special:  x_i = i+1;  y_i = 1/x_i  ==> <x,y> == N
+
+    for (size_t i = 0; i < N; ++i)
+    {
+        x[i] = i % 219 + 1;
+        y[i] = 1.0/x[i];
+    }
+
+
+    cout << "\nStart Benchmarking scalar\n";
+
+    auto t1 = system_clock::now(); // start timer
+// Do calculation
+    double check(0.0),ss(0.0);
+    for (size_t i = 0; i < NLOOPS; ++i)
+    {
+        check = scalar_function(x, y);
+        ss += check;                   // prevents the optimizer from removing unused calculation results.
+    }
+
+    auto t2 = system_clock::now();  // stop timer
+    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
+    
+
+
+// Check the correct result
+    cout << "\n <x,y> = " << check << endl;
+    if (static_cast<unsigned int>(check) != N)
+        cout << "  !!   W R O N G  result   !!\n";
+    cout << endl;
+
+
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+
+
+    double Gflops = 2.0*N / t_diff / 1024 / 1024 / 1024;
+    double MemBandwidth = 2.0*N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
+
+    cout << "Total duration : " << t_diff*NLOOPS << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << Gflops << endl;
+    cout << "GiByte/s       : " << MemBandwidth << endl;
+
+//##########################################################################
+    cout << "\nStart Benchmarking norm\n";
+
+    auto t3 = system_clock::now(); // start timer
+// Do calculation
+    double ss2(0.0);
+    for (size_t i = 0; i < NLOOPS; ++i)
+    {
+        auto sk1 = sqrt(scalar(x, x));
+        ss2 += sk1;                   // prevents the optimizer from removing unused calculation results.
+    }
+
+    auto t4 = system_clock::now();  // stop timer
+    auto duration2 = duration_cast<microseconds>(t4 - t3);        // duration in microseconds
+    double t_diff2 = static_cast<double>(duration2.count()) / 1e6; // overall duration in seconds
+    t_diff2 = t_diff2/NLOOPS;                                      // duration per loop seconds
+    
+
+    cout << "ss(norm): " << ss2 << endl;
+    cout << "Timing in sec. : " << t_diff2 << endl;
+
+
+
+    
+
+
+    return vector<double>{t_diff, Gflops, MemBandwidth};
+}
+
+
+vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M, const function<vector<double>(const vector<double>&, const vector<double>&)>& MatVec_function)
+{
+    cout << "#################### (B) ####################" << endl;
+
+    cout << "\nLOOPS = " << NLOOPS << endl;
+    cout << "\nN = " << N << endl;
+    cout << "\nM = " << M << endl;
+
+//  Memory allocation
+    cout << "Memory allocation\n";
+
+    vector<double> A(M*N);
+    vector<double> x(N);
+
+    cout.precision(2);
+    cout << (1.0*M*N + N) * sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
+    cout.precision(6);
+
+//  Data initialization
+
+    for (size_t i = 0; i < M; ++i)
+        for (size_t j = 0; j < N; ++j)
+            A[N*i + j] = (i + j) % 219 + 1;
+
+
+    for (size_t j = 0; j < N; ++j)
+    {
+        x[j] = 1.0/A[N*17 + j];
+    }
+
+    cout << "\nStart Benchmarking MatVec\n";
+
+    auto t1 = system_clock::now(); // start timer
+// Do calculation
+    vector<double> b(M);
+    
+    for (size_t i = 0; i < NLOOPS; ++i)
+    {
+        b = MatVec_function(A, x);
+    }
+
+    auto t2 = system_clock::now();  // stop timer
+    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
+
+
+// Check the correct result
+    cout << "\n <A[17,*],x> = " << b[17] << endl;
+    if (static_cast<size_t>(b[17]) != N)
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+
+    double Gflops = (2.0*N*M) / t_diff / 1024 / 1024 / 1024;
+    double MemBandwidth = (2.0*N*M + M)/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
+
+    cout << "Total duration : " << t_diff*NLOOPS << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << Gflops << endl;
+    cout << "GiByte/s       : " << MemBandwidth << endl;
+
+
+
+    return vector<double>{t_diff, Gflops, MemBandwidth};
+}
+
+
+vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N, const function<vector<double>(const vector<double>&, const vector<double>&, size_t const &shared_dim)>& MatMat_function)
+{
+    cout << "#################### (C) ####################" << endl;
+    cout << "\nLOOPS = " << NLOOPS << endl;
+    cout << "\nL = " << L << endl;
+    cout << "\nM = " << M << endl;
+    cout << "\nN = " << N << endl;
+
+
+//  Memory allocation
+    cout << "Memory allocation\n";
+
+    vector<double> A(M*L);
+    vector<double> B(L*N);
+
+    cout.precision(2);
+    cout << (1.0*M*L + L*N) *sizeof(A[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
+    cout.precision(6);
+
+
+//  Data initialization
+
+    for (size_t i = 0; i < M; ++i)
+        for (size_t k = 0; k < L; ++k)
+            A[L*i + k] = (i + k) % 219 + 1;
+
+    for (size_t k = 0; k < L; ++k)
+        for (size_t j = 0; j < N; ++j)
+            B[N*k + j] = 1.0/A[L*17 + k];
+
+
+    cout << "\nStart Benchmarking MatMat\n";
+
+    auto t1 = system_clock::now(); // start timer
+// Do calculation
+    vector<double> C(M*N);
+    double check;
+    double check_sum = 0;
+
+    for (size_t i = 0; i < NLOOPS; ++i)
+    {
+        C = MatMat_function(A, B, L);
+
+        check = C[N*17];
+        check_sum += check; // prevents the optimizer from removing unused calculation results.
+    }
+    cout << check_sum;
+
+    auto t2 = system_clock::now();  // stop timer
+    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
+
+
+// Check the correct result
+    cout << "\n C[17,0] = " << check << endl;
+    if (static_cast<unsigned int>(check) != L)
+    {
+        cout << "  !!   W R O N G  result   !!, should be " << L <<"\n";
+    }
+    cout << endl;
+
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+
+
+    double Gflops = (2.0*L*N*M) / t_diff / 1024 / 1024 / 1024;
+    double MemBandwidth = (2.0*L*N*M + M*N)/ t_diff / 1024 / 1024 / 1024 * sizeof(A[0]);
+
+    cout << "Total duration : " << t_diff*NLOOPS << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << Gflops << endl;
+    cout << "GiByte/s       : " << MemBandwidth << endl;
+
+
+
+    return vector<double>{t_diff, Gflops, MemBandwidth};
+}
+
+
+vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p)
+{
+    cout << "#################### (D) ####################" << endl;
+    cout << "\nLOOPS = " << NLOOPS << endl;
+    cout << "\nN = " << N << endl;
+    cout << "\np = " << p << endl;
+
+//  Memory allocation
+    cout << "Memory allocation\n";
+
+    vector<double> a(p + 1, 0);
+    vector<double> x(N);
+
+    cout.precision(2);
+    cout << (1.0*(p + 1) + N) *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
+    cout.precision(6);
+
+//  Data initialization
+
+    for (size_t j = 0; j < N; ++j)
+        x[j] = 1.0*j;
+    
+    for (size_t k = 0; k < p + 1; ++k)
+        a[k] = pow(-1.0, k);         // poly(x) = 1 - x + x^2 - x^3 + x^4 - ...
+
+
+
+    cout << "\nStart Benchmarking poly\n";
+
+    auto t1 = system_clock::now(); // start timer
+// Do calculation
+    vector<double> y(N);
+    double check;
+    double check_sum;
+    
+    for (size_t i = 0; i < NLOOPS; ++i)
+    {
+        y = poly(a, x);
+        check = y[0];
+
+        check_sum += check; // prevents the optimizer from removing unused calculation results.
+    }
+
+    auto t2 = system_clock::now();  // stop timer
+    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
+    
+
+
+// Check the correct result
+    cout << "\n poly(" << x[0] << ") = " << check << endl;
+    if (abs(check - 1.0) > 1.0/1e6)
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+    
+
+    double Gflops = (N*(p + 1)*3.0) / t_diff / 1024 / 1024 / 1024;
+    double MemBandwidth = (N*(2.0 + 3.0*(p + 1)))/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
+
+    cout << "Total duration : " << t_diff*NLOOPS << endl;
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << Gflops << endl;
+    cout << "GiByte/s       : " << MemBandwidth << endl;
+
+
+
+    return vector<double>{t_diff, Gflops, MemBandwidth};
+}
--- a/ex3/ex3/benchmark_tests.h
+++ b/ex3/ex3/benchmark_tests.h
@ -0,0 +1,15 @@
+#pragma once
+#include <vector>
+#include <functional>
+using namespace std;
+
+
+
+
+vector<double> test_A(const size_t &NLOOPS, const size_t &N, const function<double(const vector<double>&, const vector<double>&)>& scalar_function);
+
+vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M, const function<vector<double>(const vector<double>&, const vector<double>&)>& MatVec_function);
+
+vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N, const function<vector<double>(const vector<double>&, const vector<double>&, size_t const &shared_dim)>& MatMat_function);
+
+vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p);
--- a/ex3/ex3/benchmarks.cpp
+++ b/ex3/ex3/benchmarks.cpp
@ -0,0 +1,245 @@
+#include "benchmarks.h"
+#include "vdop.h"
+#include <iostream>
+#include <vector>
+#include <cmath>
+#include <cassert>       // assert()
+
+
+#ifdef __INTEL_CLANG_COMPILER
+#pragma message(" ##########  Use of MKL  ###############")
+#include <mkl.h>
+#else
+#pragma message(" ##########  Use of CBLAS  ###############")
+
+#include <cblas.h>               // cBLAS Library
+#include <lapacke.h>             // Lapack
+
+#endif
+
+
+
+// (A) Inner product of two vectors (from skalar_stl)
+double scalar(vector<double> const &x, vector<double> const &y)
+{
+    assert(x.size() == y.size());
+    size_t const N = x.size();
+    double sum = 0.0;
+    for (size_t i = 0; i < N; ++i)
+    {
+        sum += x[i] * y[i];
+    }
+    return sum;
+}
+
+// (A) 5.(b) Kahan scalar product
+double Kahan_skalar(vector<double> const &x, vector<double> const &y)
+{
+    double sum = 0.0;
+    double c = 0.0;
+    size_t n = x.size();
+    for (size_t i = 0; i < n; ++i)
+    {
+        double z = x[i]*y[i] - c;    // c is the part that got lost in the last iteration
+        double t = sum + z;          // when adding sum + z, the lower digits are lost if sum is large
+        c = (t - sum) - z;           // now we recover the lower digits to add in the next iteration
+        sum = t;
+    }
+    return sum;
+}
+
+// (A) 6. cBLAS scalar product
+double scalar_cBLAS(vector<double> const &x, vector<double> const &y)
+{
+    return cblas_ddot(x.size(), x.data(), 1, y.data(), 1); // x.data() = &x[0]
+}
+
+
+// (B) Matrix-vector product (from intro_vector_densematrix)
+vector<double> MatVec(vector<double> const &A, vector<double> const &x)
+{
+    size_t const nelem = A.size();
+    size_t const N = x.size();
+    assert(nelem % N == 0); // make sure multiplication is possible
+    size_t const M = nelem/N;          
+
+
+    vector<double> b(M);    
+
+    for (size_t i = 0; i < M; ++i)
+    {
+        double tmp = 0.0;
+        for (size_t j = 0; j < N; ++j)
+            tmp += A[N*i + j] * x[j];
+        b[i] = tmp;
+    }
+
+    return b;
+}
+
+// (B) cBLAS Matrix-vector product
+vector<double> MatVec_cBLAS(vector<double> const &A, vector<double> const &x)
+{
+    size_t const nelem = A.size();
+    size_t const N = x.size();
+    assert(nelem % N == 0); // make sure multiplication is possible
+    size_t const M = nelem/N;
+
+
+    vector<double> b(M);
+
+    cblas_dgemv(CblasRowMajor, CblasNoTrans, M, N, 1.0, A.data(), N, x.data(), 1, 0.0, b.data(), 1);
+
+    return b;
+}
+
+
+// (C) Matrix-matrix product
+vector<double> MatMat(vector<double> const &A, vector<double> const &B, size_t const &L)
+{
+    size_t const nelem_A = A.size();
+    size_t const nelem_B = B.size();
+
+    assert(nelem_A % L == 0 && nelem_B % L == 0);
+
+    size_t const M = nelem_A/L;
+    size_t const N = nelem_B/L;
+
+
+    vector<double> C(M*N);
+
+            
+
+    for (size_t i = 0; i < M; ++i)
+    {
+        for (size_t k = 0; k < L; ++k)
+        {
+            for (size_t j = 0; j < N; ++j)
+            {
+                C[N*i + j] += A[L*i + k]*B[N*k + j];
+            }
+            
+        }
+    }
+
+    return C;
+}
+
+// (C) cBLAS matrix-matrix product
+vector<double> MatMat_cBLAS(vector<double> const &A, vector<double> const &B, size_t const &L)
+{
+    size_t const nelem_A = A.size();
+    size_t const nelem_B = B.size();
+
+    assert(nelem_A % L == 0 && nelem_B % L == 0);
+
+    size_t const M = nelem_A/L;
+    size_t const N = nelem_B/L;
+
+
+    vector<double> C(M*N);
+
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, L, 1.0, A.data(), L, B.data(), N, 0.0, C.data(), N);
+
+    return C;
+}
+
+
+// (D) Evaluation of a polynomial function
+vector<double> poly(vector<double> const &a, vector<double> const &x)
+{
+    size_t const N = x.size();
+    size_t const p = a.size() - 1;
+    vector<double> y(N, 0);
+
+    for (size_t i = 0; i < N; ++i)
+    {
+        double x_temp = x[i];
+        double y_temp = 0;
+        for (size_t k = 0; k < p + 1; ++k)
+        {
+            y_temp += x_temp*y_temp + a[p - k];
+        }
+        y[i] = y_temp;
+    }
+
+    return y;
+}
+
+
+// (E) Solves linear system of equations
+void JacobiSolve(CRS_Matrix const &SK, vector<double> const &f, vector<double> &u)
+{
+    const double omega   = 1.0;
+    const int    maxiter = 1000;
+    const double tol  = 1e-5,                // tolerance
+                 tol2 = tol * tol;           // tolerance^2
+
+    int nrows = SK.Nrows();                  // number of rows == number of columns
+    assert( nrows == static_cast<int>(f.size()) && f.size() == u.size() );
+
+    cout << endl << " Start Jacobi solver for " << nrows << " d.o.f.s"  << endl;
+    //  Choose initial guess
+    for (int k = 0; k < nrows; ++k)
+    {
+        u[k] = 0.0;                          //  u := 0
+    }
+
+    vector<double> dd(nrows);                // matrix diagonal
+    vector<double>  r(nrows);                // residual
+    vector<double>  w(nrows);                // correction
+
+    SK.GetDiag(dd);                          //  dd := diag(K)
+    ////DebugVector(dd);{int ijk; cin >> ijk;}
+
+    //  Initial sweep
+    SK.Defect(r, f, u);                      //  r := f - K*u
+
+    vddiv(w, r, dd);                         //  w := D^{-1}*r
+    double sigma0 = dscapr(w, r);            // s0 := <w,r>
+
+    // Iteration sweeps
+    int iter  = 0;
+    double sigma = sigma0;
+    while ( sigma > tol2 * sigma0 && maxiter > iter)
+    {
+        ++iter;
+        vdaxpy(u, u, omega, w );             //  u := u + om*w
+        SK.Defect(r, f, u);                  //  r := f - K*u
+        vddiv(w, r, dd);                     //  w := D^{-1}*r
+        sigma = dscapr(w, r);                // s0 := <w,r>
+//      	cout << "Iteration " << iter << " : " << sqrt(sigma/sigma0) << endl;
+    }
+    cout << "aver. Jacobi rate :  " << exp(log(sqrt(sigma / sigma0)) / iter) << "  (" << iter << " iter)" << endl;
+    cout << "final error: " << sqrt(sigma / sigma0) << " (rel)   " << sqrt(sigma) << " (abs)\n";
+
+    return;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/ex3/ex3/benchmarks.h
+++ b/ex3/ex3/benchmarks.h
@ -0,0 +1,89 @@
+#pragma once
+#include "getmatrix.h"
+#include <vector>
+
+using namespace std;
+
+/**     (A) Inner product of two vectors (from skalar_stl)
+	@param[in] x	vector
+	@param[in] y	vector
+	@return 	    resulting Euclidian inner product <x,y>
+*/
+double scalar(vector<double> const &x, vector<double> const &y);
+
+/**     (A) 5.(b) Inner product of two vectors using the Kahan scalar product
+	@param[in] x	vector
+	@param[in] y	vector
+	@return 	    resulting Euclidian inner product <x,y>
+*/
+double Kahan_skalar(vector<double> const &x,  vector<double> const &y);
+
+/** 	(A) 6. cBLAS scalar product of two vectors
+	@param[in] x	vector
+	@param[in] y	vector
+	@return 	    resulting Euclidian inner product <x,y>
+*/
+double scalar_cBLAS(vector<double> const &x, vector<double> const &y);
+
+
+/** 	(B) Matrix-vector product (from intro_vector_densematrix)
+ * 	@param[in] A	dense matrix (1D access)
+ *  @param[in] u	vector
+ *
+ *	@return    resulting vector
+*/
+vector<double> MatVec(vector<double> const &A, vector<double> const &x);
+
+/** 	(B) 6. cBLAS Matrix-vector product
+ * 	@param[in] A	dense matrix (1D access)
+ *  @param[in] u	vector
+ *
+ *	@return    resulting vector
+*/
+vector<double> MatVec_cBLAS(vector<double> const &A, vector<double> const &x);
+
+
+/** 	(C) Matrix-matrix product
+ * 	@param[in] A			MxL dense matrix (1D access)
+ *  @param[in] B			LxN dense matrix (1D access)
+ *  @param[in] shared_dim 	shared dimension L
+ *
+ *	@return    resulting MxN matrix
+*/
+vector<double> MatMat(vector<double> const &A, vector<double> const &B, size_t const &shared_dim);
+
+/** 	(C) 6. cBLAS Matrix-matrix product
+ * 	@param[in] A			MxL dense matrix (1D access)
+ *  @param[in] B			LxN dense matrix (1D access)
+ *  @param[in] shared_dim 	shared dimension L
+ *
+ *	@return    resulting MxN matrix
+*/
+vector<double> MatMat_cBLAS(vector<double> const &A, vector<double> const &B, size_t const &shared_dim);
+
+
+/** 	(D) Evaluation of a polynomial function using Horner's scheme
+ * 	@param[in] a	coefficient vector
+ *  @param[in] x	vector with input values
+ *
+ *	@return    vector with output values
+*/
+vector<double> poly(vector<double> const &a, vector<double> const &x);
+
+
+/**     (E) Solves linear system of equations  K @p u = @p f  via the Jacobi iteration (from jaboci_oo_stl)
+ * We use a distributed symmetric  CSR matrix @p SK and initial guess of the
+ * solution is set to 0.
+ * @param[in] SK	CSR matrix
+ * @param[in] f		distributed local vector storing the right hand side
+ * @param[out] u	accumulated local vector storing the solution.
+*/
+void JacobiSolve(CRS_Matrix const &SK, vector<double> const &f, vector<double> &u);
+
+
+
+
+
+
+
+
--- a/ex3/ex3/factorization_solve.cpp
+++ b/ex3/ex3/factorization_solve.cpp
@ -0,0 +1,39 @@
+#include "factorization_solve.h"
+#include <vector>
+#include <math.h>
+#include <assert.h>
+#include <lapack.h>
+#include <iostream>
+
+using namespace std;
+
+
+void factorization_solve(vector<double> &A, vector<double> &b, const int32_t &n_rhs)
+{
+    int32_t nelem = A.size();
+    int32_t N = sqrt(nelem);
+    assert(N == static_cast<int32_t>(b.size())/n_rhs);
+
+    vector<int> IPIV(N); // pivot indices
+    int info_factorization;
+
+    dgetrf_(&N, &N, A.data(), &N, IPIV.data(), &info_factorization);
+
+    const char transp = 'N';
+    int info_solve;
+    dgetrs_(&transp, &N, &n_rhs, A.data(), &N, IPIV.data(), b.data(), &N, &info_solve, 1); // 1 is length of parameter 'N'
+
+}
+
+void print_matrix(vector<double> &A, size_t M, size_t N)
+{
+    for (size_t i = 0; i < M; ++i)
+    {
+        for (size_t j = 0; j < N; ++j)
+        {
+            cout << A[N*i + j] << "\t";
+        }
+        cout << endl;
+    }
+    cout << endl;
+}
--- a/ex3/ex3/factorization_solve.h
+++ b/ex3/ex3/factorization_solve.h
@ -0,0 +1,21 @@
+#pragma once
+#include <vector>
+#include <cstdint>
+using namespace std;
+
+
+/** 	Solve linear system of equations with multiple right hand sides using LU factorization
+ * 	@param[inout] A			NxN Matrix (1D access), gets modified to contain the LU decomposition of A
+ *  @param[inout] B			N x n_rhs Matrix of right-hand-sides (1D access), gets modified to contain the solution vectors x
+ *  @param[in] n_rhs 	    number of right-hand-sides b
+ *
+*/
+void factorization_solve(vector<double> &A, vector<double> &b, const int32_t &n_rhs);
+
+/** 	Print a matrix to console
+ * 	@param[in] A			MxN Matrix (1D access)
+ *  @param[in] M            rows			
+ *  @param[in] N            columns 	    
+ *
+*/
+void print_matrix(vector<double> &A, size_t M, size_t N);
--- a/ex3/ex3/factorization_solve_tests.cpp
+++ b/ex3/ex3/factorization_solve_tests.cpp
@ -0,0 +1,89 @@
+#include "factorization_solve_tests.h"
+#include "factorization_solve.h"
+#include "benchmarks.h"
+#include <chrono>
+#include <iostream>
+
+using namespace std::chrono; 
+
+void CheckCorrectness()
+{
+    cout.precision(4);
+
+    size_t N = 5;
+    size_t n_rhs = 5;
+    vector<double> A(N*N);
+    vector<double> b(N*n_rhs);
+
+    // initialize A
+    for (size_t i = 0; i < N; ++i)
+    {
+        for (size_t j = 0; j < N; ++j)
+            if (i == j)
+                A[N*i + j] = 4.0;
+            else
+                A[N*i + j] = 1.0/((i - j)*(i - j));
+    }
+    const vector<double> A_copy = A;
+        
+    // initialize b as identity matrix
+    for (size_t j = 0; j < N; ++j)
+    {
+        for (size_t l = 0; l < n_rhs; ++l)
+            if (l == j)
+                b[N*l + j] = 1.0;
+            else
+                b[N*l + j] = 0.0;
+    }
+    
+    cout << "A = " << endl;
+    print_matrix(A, N, N);
+    cout << "b = " << endl;
+    print_matrix(b, N, n_rhs);
+
+    // solve system
+    factorization_solve(A, b, n_rhs);
+
+    vector<double> check_matrix_identity = MatMat_cBLAS(A_copy, b, N);
+    cout << "A*A^{-1} = " << endl;
+    print_matrix(check_matrix_identity, N, n_rhs);
+}
+
+
+void CheckDuration(const size_t &N)
+{
+    for (size_t n_rhs : {1, 2, 4, 8, 16, 32})
+    {
+        auto time_start = system_clock::now();
+
+        vector<double> A(N*N);
+        vector<double> b(N*n_rhs);
+
+        // initialize A
+        for (size_t i = 0; i < N; ++i)
+        {
+            for (size_t j = 0; j < N; ++j)
+                if (i == j)
+                    A[N*i + j] = 4.0;
+                else
+                    A[N*i + j] = 1.0/((i - j)*(i - j));
+        }
+
+        // initialize b
+        for (size_t j = 0; j < N; ++j)
+            for (size_t l = 0; l < n_rhs; ++l)
+                b[N*l + j] = 1.0;
+
+
+        // solve system
+        factorization_solve(A, b, n_rhs);
+
+
+
+        auto time_end = system_clock::now();
+        auto duration = duration_cast<microseconds>(time_end - time_start);        // duration in microseconds
+        double t_diff = static_cast<double>(duration.count()) / 1e6;               // overall duration in seconds
+
+        cout << "n_rhs = " << n_rhs << "\tTime per rhs: " << t_diff/n_rhs << endl;
+    }
+}
--- a/ex3/ex3/factorization_solve_tests.h
+++ b/ex3/ex3/factorization_solve_tests.h
@ -0,0 +1,7 @@
+#pragma once
+#include <cstddef>
+
+
+void CheckCorrectness();
+
+void CheckDuration(const size_t &N);
--- a/ex3/ex3/geom.cpp
+++ b/ex3/ex3/geom.cpp
@ -0,0 +1,522 @@
+// see:   http://llvm.org/docs/CodingStandards.html#include-style
+#include "geom.h"
+
+#include <algorithm>
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <list>
+#include <string>
+#include <vector>
+using namespace std;
+
+Mesh::Mesh(int ndim, int nvert_e, int ndof_e)
+    : _nelem(0), _nvert_e(nvert_e), _ndof_e(ndof_e), _nnode(0), _ndim(ndim), _ia(0), _xc(0)
+{
+}
+
+Mesh::~Mesh()
+{}
+
+void Mesh::SetValues(std::vector<double> &v, const std::function<double(double, double)> &func) const
+{
+    int const nnode = Nnodes();            // number of vertices in mesh
+    assert( nnode == static_cast<int>(v.size()) );
+    for (int k = 0; k < nnode; ++k)
+    {
+        v[k] = func( _xc[2 * k], _xc[2 * k + 1] );
+    }
+}
+
+
+void Mesh::Debug() const
+{
+    cout << "\n ############### Debug  M E S H  ###################\n";
+    cout << "\n ...............    Coordinates       ...................\n";
+    for (int k = 0; k < _nnode; ++k)
+    {
+        cout << k << " : " ;
+        for (int i = 0; i < _ndof_e; ++i )
+        {
+            cout << _xc[2*k+i] << "  ";
+        }
+        cout << endl;
+    }
+    cout << "\n ...............    Elements        ...................\n";
+    for (int k = 0; k < _nelem; ++k)
+    {
+        cout << k << " : ";
+        for (int i = 0; i < _ndof_e; ++i )
+            cout << _ia[_ndof_e * k + i] << "  ";
+        cout << endl;
+    }
+    return;
+}
+
+void Mesh::Write_ascii_matlab(std::string const &fname, std::vector<double> const &v) const
+{
+    assert(Nnodes() ==  static_cast<int>(v.size()));  // fits vector length to mesh information?
+
+    ofstream fout(fname);                             // open file ASCII mode
+    if ( !fout.is_open() )
+    {
+        cout << "\nFile " << fname << " has not been opened.\n\n" ;
+        assert( fout.is_open() && "File not opened."  );
+    }
+
+    string const DELIMETER(" ");    // define the same delimeter as in matlab/ascii_read*.m
+    int const    OFFSET(1);         // convert C-indexing to matlab
+
+    // Write data: #nodes, #space dimensions, #elements, #vertices per element
+    fout << Nnodes() << DELIMETER << Ndims() << DELIMETER << Nelems() << DELIMETER << NverticesElements() << endl;
+
+    // Write cordinates: x_k, y_k   in seperate lines
+    assert( Nnodes()*Ndims() ==  static_cast<int>(_xc.size()));
+    for (int k = 0, kj = 0; k < Nnodes(); ++k)
+    {
+        for (int j = 0; j < Ndims(); ++j, ++kj)
+        {
+            fout << _xc[kj] << DELIMETER;
+        }
+        fout << endl;
+    }
+
+    // Write connectivity: ia_k,0, ia_k,1 etc  in seperate lines
+    assert( Nelems()*NverticesElements() ==  static_cast<int>(_ia.size()));
+    for (int k = 0, kj = 0; k < Nelems(); ++k)
+    {
+        for (int j = 0; j < NverticesElements(); ++j, ++kj)
+        {
+            fout << _ia[kj] + OFFSET << DELIMETER;     // C to matlab
+        }
+        fout << endl;
+    }
+
+    // Write vector
+    for (int k = 0; k < Nnodes(); ++k)
+    {
+        fout << v[k] << endl;
+    }
+
+    fout.close();
+    return;
+}
+
+
+void Mesh::Visualize(std::vector<double> const &v) const
+{
+    // define external command
+    const string exec_m("matlab -nosplash < visualize_results.m");                 // Matlab
+    //const string exec_m("octave --no-window-system --no-gui visualize_results.m"); // Octave
+    //const string exec_m("flatpak run org.octave.Octave visualize_results.m");      // Octave (flatpak): desktop GH
+
+    const string fname("uv.txt");
+    Write_ascii_matlab(fname, v);
+
+    int ierror = system(exec_m.c_str());                                 // call external command
+
+    if (ierror != 0)
+    {
+        cout << endl << "Check path to Matlab/octave on your system" << endl;
+    }
+    cout << endl;
+    return;
+}
+
+
+
+
+// #####################################################################
+Mesh_2d_3_square::Mesh_2d_3_square(int nx, int ny, int myid, int procx, int procy)
+    : Mesh(2, 3, 3), // two dimensions, 3 vertices, 3 dofs
+      _myid(myid), _procx(procx), _procy(procy), _neigh{{-1, -1, -1, -1}}, _color(0),
+_xl(0.0), _xr(1.0), _yb(0.0), _yt(1.0), _nx(nx), _ny(ny)
+{
+    //void IniGeom(int const myid, int const procx, int const procy, int neigh[], int &color)
+    int const ky = _myid / _procx;
+    int const kx = _myid % _procy;  //    MOD(myid,procx)
+    // Determine the neighbors of domain/rank myid
+    _neigh[0] = (ky == 0)       ?  -1 : _myid - _procx;    //   South
+    _neigh[1] = (kx == _procx - 1) ?  -1 : _myid + 1;      //   East
+    _neigh[2] = (ky == _procy - 1) ?  -1 : _myid + _procx;  //   North
+    _neigh[3] = (kx == 0)       ?  -1 : _myid - 1;        //   West
+
+    _color = (kx + ky) & 1 ;
+
+    // subdomain is part of unit square
+    double const hx = 1. / _procx;
+    double const hy = 1. / _procy;
+    _xl = kx * hx;                      //  left
+    _xr = (kx + 1) * hx;                //  right
+    _yb = ky * hy;                      //  bottom
+    _yt = (ky + 1) * hy;                //  top
+
+    // Calculate coordinates
+    int const nnode = (_nx + 1) * (_ny + 1); // number of nodes
+    Resize_Coords(nnode, 2);                 // coordinates in 2D [nnode][ndim]
+    GetCoordsInRectangle(_nx, _ny, _xl, _xr, _yb, _yt, GetCoords().data());
+
+    // Calculate element connectivity (linear triangles)
+    int const nelem = 2 * _nx * _ny;         // number of elements
+    Resize_Connectivity(nelem, 3);           // connectivity matrix [nelem][3]
+    GetConnectivityInRectangle(_nx, _ny, GetConnectivity().data());
+
+    return;
+}
+
+
+void Mesh_2d_3_square::SetU(std::vector<double> &u) const
+{
+    int dx    = _nx + 1;
+    for (int j = 0; j <= _ny; ++j)
+    {
+        int k = j * dx;
+        for (int i = 0; i <= _nx; ++i, ++k)
+        {
+            u[k] = 0.0;
+        }
+    }
+
+}
+
+void Mesh_2d_3_square::SetF(std::vector<double> &f) const
+{
+    int dx    = _nx + 1;
+    for (int j = 0; j <= _ny; ++j)
+    {
+        int k = j * dx;
+        for (int i = 0; i <= _nx; ++i, ++k)
+        {
+            f[k] = 1.0;
+        }
+    }
+
+}
+
+
+std::vector<int> Mesh_2d_3_square::Index_DirichletNodes() const
+{
+    int const dx = 1,
+              dy = _nx + 1,
+              bl  = 0,
+              br  = _nx,
+              tl  = _ny * (_nx + 1),
+              tr  = (_ny + 1) * (_nx + 1) - 1;
+    int const start[4] = { bl, br, tl, bl},
+                         end[4] = { br, tr, tr, tl},
+                                  step[4] = { dx, dy, dx, dy};
+
+    vector<int> idx(0);
+    for (int j = 0; j < 4; j++)
+    {
+        if (_neigh[j] < 0)
+        {
+            for (int i = start[j]; i <= end[j]; i += step[j])
+            {
+                idx.push_back(i);        // node i is Dirichlet node
+            }
+        }
+    }
+    //    remove multiple elements
+    sort(idx.begin(), idx.end());                          // sort
+    idx.erase( unique(idx.begin(), idx.end()), idx.end() ); // remove duplicate data
+
+    return idx;
+}
+
+void Mesh_2d_3_square::SaveVectorP(std::string const &name, vector<double> const &u) const
+{
+//  construct the file name for subdomain myid
+    const string tmp( std::to_string(_myid / 100) + to_string((_myid % 100) / 10) + to_string(_myid % 10) );
+
+    const string namep = name + "." + tmp;
+    ofstream ff(namep.c_str());
+    ff.precision(6);
+    ff.setf(ios::fixed, ios::floatfield);
+
+    // assumes tensor product grid in unit square; rowise numbered (as generated in class constructor)
+    // output is provided for tensor product grid visualization ( similar to Matlab-surf() )
+    auto const &xc = GetCoords();
+    int k = 0;
+    for (int j = 0; j <= _ny; ++j)
+    {
+        for (int i = 0; i <= _nx; ++i, ++k)
+            ff << xc[2 * k + 0] << "   " << xc[2 * k + 1] << "   " << u[k] << endl;
+        ff << endl;
+    }
+
+    ff.close();
+    return;
+}
+
+void Mesh_2d_3_square::GetCoordsInRectangle(int const nx, int const ny,
+        double const xl, double const xr, double const yb, double const yt,
+        double xc[])
+{
+    const double hx = (xr - xl) / nx,
+                 hy = (yt - yb) / ny;
+
+    int k = 0;
+    for (int j = 0; j <= ny; ++j)
+    {
+        const double y0 = yb + j * hy;
+        for (int i = 0; i <= nx; ++i, k += 2)
+        {
+            xc[k  ] = xl + i * hx;
+            xc[k + 1] = y0;
+        }
+    }
+
+    return;
+}
+
+void Mesh_2d_3_square::GetConnectivityInRectangle(int const nx, int const ny, int ia[])
+{
+    const int dx = nx + 1;
+    int k  = 0;
+    int l  = 0;
+    for (int j = 0; j < ny; ++j, ++k)
+    {
+        for (int i = 0; i < nx; ++i, ++k)
+        {
+            ia[l  ] = k;
+            ia[l + 1] = k + 1;
+            ia[l + 2] = k + dx + 1;
+            l += 3;
+            ia[l  ] = k;
+            ia[l + 1] = k + dx;
+            ia[l + 2] = k + dx + 1;
+            l += 3;
+        }
+    }
+    return;
+}
+
+// #################### still some old code (--> MPI) ############################
+
+
+//  Copies the values of w corresponding to the boundary
+//  South (ib==1), East (ib==2), North (ib==3), West (ib==4)
+
+void GetBound(int const ib, int const nx, int const ny, double const w[], double s[])
+{
+    const int //dx = 1,
+    dy = nx + 1,
+    bl  = 0,
+    br  = nx,
+    tl  = ny * (nx + 1),
+    tr  = (ny + 1) * (nx + 1) - 1;
+    switch (ib)
+    {
+        case 1:
+        {
+            for (int i = bl, j = 0; i <= br; ++i, ++j)
+                s[j] = w[i];
+            break;
+        }
+        case 3:
+        {
+            for (int i = tl, j = 0; i <= tr; ++i, ++j)
+                s[j] = w[i];
+            break;
+        }
+        case 4:
+        {
+            for (int i = bl, j = 0; i <= tl; i += dy, ++j)
+                s[j] = w[i];
+            break;
+        }
+        case 2:
+        {
+            for (int i = br, j = 0; i <= tr; i += dy, ++j)
+                s[j] = w[i];
+            break;
+        }
+        default:
+        {
+            cout << endl << "Wrong parameter  ib in " << __FILE__ << ":" << __LINE__ << endl;
+        }
+    }
+    return;
+}
+
+// ----------------------------------------------------------------------------------------------------------
+// Computes w:  = w + s at nodes on  the boundary
+// South (ib == 1), East (ib == 2), North (ib == 3), West (ib == 4)
+
+void AddBound(int const ib, int const nx, int const ny, double w[], double const s[])
+{
+    int const dy = nx + 1,
+              bl  = 0,
+              br  = nx,
+              tl  = ny * (nx + 1),
+              tr  = (ny + 1) * (nx + 1) - 1;
+    switch (ib)
+    {
+        case 1:
+        {
+            for (int i = bl, j = 0; i <= br; ++i, ++j)
+                w[i] += s[j];
+            break;
+        }
+        case 3:
+        {
+            for (int i = tl, j = 0; i <= tr; ++i, ++j)
+                w[i] += s[j];
+            break;
+        }
+        case 4:
+        {
+            for (int i = bl, j = 0; i <= tl; i += dy, ++j)
+                w[i] += s[j];
+            break;
+        }
+        case 2:
+        {
+            for (int i = br, j = 0; i <= tr; i += dy, ++j)
+                w[i] += s[j];
+            break;
+        }
+        default:
+        {
+            cout << endl << "Wrong parameter  ib in " << __FILE__ << ":" << __LINE__ << endl;
+        }
+    }
+    return;
+}
+
+
+// ####################################################################
+
+Mesh_2d_3_matlab::Mesh_2d_3_matlab(string const &fname)
+    : Mesh(2, 3, 3),  // two dimensions, 3 vertices, 3 dofs
+      bedges(0)
+{
+    ifstream ifs(fname);
+    if (!(ifs.is_open() && ifs.good()))
+    {
+        cerr << "Mesh_2d_3_matlab: Error cannot open file " << fname << endl;
+        assert(ifs.is_open());
+    }
+
+    int const OFFSET(1);             // Matlab to C indexing
+    cout << "ASCI file  " << fname << "  opened" << endl;
+
+    // Read some mesh constants
+    int nnode, ndim, nelem, nvert_e;
+    ifs >> nnode >> ndim >> nelem >> nvert_e;
+    cout << nnode << "  " << ndim << "  " << nelem << "  " << nvert_e << endl;
+    assert(ndim == 2 && nvert_e == 3);
+
+    // Allocate memory
+    Resize_Coords(nnode, ndim);                 // coordinates in 2D [nnode][ndim]
+    Resize_Connectivity(nelem, nvert_e);        // connectivity matrix [nelem][nvert]
+
+    // Read ccordinates
+    auto &xc = GetCoords();
+    for (int k = 0; k < nnode * ndim; ++k)
+    {
+        ifs >> xc[k];
+    }
+
+    // Read connectivity
+    auto &ia = GetConnectivity();
+    for (int k = 0; k < nelem * nvert_e; ++k)
+    {
+        ifs >> ia[k];
+        ia[k] -= OFFSET;                // Matlab to C indexing
+    }
+
+    // additional read of boundary information (only start/end point)
+    int nbedges;
+    ifs >> nbedges;
+
+    bedges.resize(nbedges * 2);
+    for (int k = 0; k < nbedges * 2; ++k)
+    {
+        ifs >> bedges[k];
+        bedges[k] -= OFFSET;            // Matlab to C indexing
+    }
+
+    return;
+}
+
+// binary
+//{
+//ifstream ifs(fname, ios_base::in | ios_base::binary);
+//if(!(ifs.is_open() && ifs.good()))
+//{
+//cerr << "ReadBinMatrix: Error cannot open file " << file << endl;
+//assert(ifs.is_open());
+//}
+//cout << "ReadBinMatrix: file opened" << file << endl;
+
+
+//}
+
+// binaryIO.cpp
+//void read_binMatrix(const string& file, vector<int> &cnt, vector<int> &col, vector<double> &ele)
+//{
+
+//ifstream ifs(file, ios_base::in | ios_base::binary);
+
+//if(!(ifs.is_open() && ifs.good()))
+//{
+//cerr << "ReadBinMatrix: Error cannot open file " << file << endl;
+//assert(ifs.is_open());
+//}
+//cout << "ReadBinMatrix: Opened file " << file << endl;
+
+//int _size;
+
+//ifs.read(reinterpret_cast<char*>(&_size), sizeof(int));   // old: ifs.read((char*)&_size, sizeof(int));
+//cnt.resize(_size);
+//cout << "ReadBinMatrix: cnt size: " << _size << endl;
+
+//ifs.read((char*)&_size, sizeof(int));
+//col.resize(_size);
+//cout << "ReadBinMatrix: col size: " << _size << endl;
+
+//ifs.read((char*)&_size, sizeof(int));
+//ele.resize(_size);
+//cout << "ReadBinMatrix: ele size: " << _size << endl;
+
+
+//ifs.read((char*)cnt.data(), cnt.size() * sizeof(int));
+//ifs.read((char*)col.data(), col.size() * sizeof(int));
+//ifs.read((char*)ele.data(), ele.size() * sizeof(double));
+
+//ifs.close();
+//cout << "ReadBinMatrix: Finished reading matrix.." << endl;
+
+//}
+
+
+std::vector<int> Mesh_2d_3_matlab::Index_DirichletNodes() const
+{
+    vector<int> idx(bedges);                              // copy
+
+    sort(idx.begin(), idx.end());                         // sort
+    idx.erase( unique(idx.begin(), idx.end()), idx.end() ); // remove duplicate data
+
+    return idx;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/ex3/ex3/geom.h
+++ b/ex3/ex3/geom.h
@ -0,0 +1,381 @@
+#ifndef GEOM_FILE
+#define GEOM_FILE
+#include <array>
+#include <functional>             // function; C++11
+#include <string>
+#include <vector>
+
+/**
+ * Basis class for finite element meshes.
+ */
+class Mesh
+{
+public:
+    /**
+     * Constructor initializing the members with default values.
+      *
+     * @param[in] ndim  space dimensions (dimension for coordinates)
+      * @param[in] nvert_e  number of vertices per element (dimension for connectivity)
+      * @param[in] ndof_e   degrees of freedom per element (= @p nvert_e for linear elements)
+     */
+    explicit Mesh(int ndim, int nvert_e = 0, int ndof_e = 0);
+
+    /**
+     * Destructor.
+     *
+     * See clang warning on
+     * <a href="https://stackoverflow.com/questions/28786473/clang-no-out-of-line-virtual-method-definitions-pure-abstract-c-class/40550578">weak-vtables</a>.
+     */
+    virtual ~Mesh();
+
+    /**
+     * Number of finite elements in (sub)domain.
+     * @return number of elements.
+     */
+    int Nelems() const
+    {
+        return _nelem;
+    }
+
+    /**
+     * Global number of vertices for each finite element.
+     * @return number of vertices per element.
+     */
+    int NverticesElements() const
+    {
+        return _nvert_e;
+    }
+
+    /**
+     * Global number of degrees of freedom (dof) for each finite element.
+     * @return degrees of freedom per element.
+     */
+    int NdofsElement() const
+    {
+        return _ndof_e;
+    }
+
+    /**
+     * Number of vertices in mesh.
+     * @return number of vertices.
+     */
+    int Nnodes() const
+    {
+        return _nnode;
+    }
+
+    /**
+     * Space dimension.
+     * @return number of dimensions.
+     */
+    int Ndims() const
+    {
+        return _ndim;
+    }
+
+    /**
+     * (Re-)Allocates memory for the element connectivity and redefines the appropriate dimensions.
+      *
+     * @param[in] nelem    number of elements
+      * @param[in] nvert_e  number of vertices per element
+     */
+    void Resize_Connectivity(int nelem, int nvert_e)
+    {
+        SetNelem(nelem);               // number of elements
+        SetNverticesElement(nvert_e);  // vertices per element
+        _ia.resize(nelem * nvert_e);
+    }
+
+    /**
+     * Read connectivity information (g1,g2,g3)_i.
+     * @return convectivity vector [nelems*ndofs].
+     */
+    const std::vector<int>  &GetConnectivity() const
+    {
+        return _ia;
+    }
+
+    /**
+     * Access/Change connectivity information (g1,g2,g3)_i.
+     * @return convectivity vector [nelems*ndofs].
+     */
+    std::vector<int>  &GetConnectivity()
+    {
+        return _ia;
+    }
+
+    /**
+     * (Re-)Allocates memory for the element connectivity and redefines the appropriate dimensions.
+      *
+     * @param[in] nnodes    number of nodes
+      * @param[in] ndim      space dimension
+     */
+    void Resize_Coords(int nnodes, int ndim)
+    {
+        SetNnode(nnodes);       // number of nodes
+        SetNdim(ndim);          // space dimension
+        _xc.resize(nnodes * ndim);
+    }
+
+    /**
+     * Read coordinates of vertices (x,y)_i.
+     * @return coordinates vector [nnodes*2].
+     */
+    const std::vector<double> &GetCoords() const
+    {
+        return _xc;
+    }
+
+    /**
+     * Access/Change coordinates of vertices (x,y)_i.
+     * @return coordinates vector [nnodes*2].
+     */
+    std::vector<double> &GetCoords()
+    {
+        return _xc;
+    }
+
+    /**
+     * Calculate values in vector @p v via function @p func(x,y)
+     * @param[in] v     vector
+      * @param[in] func  function of (x,y) returning a double value.
+     */
+    void SetValues(std::vector<double> &v, const std::function<double(double, double)> &func) const;
+
+    /**
+     * Prints the information for a finite element mesh
+     */
+    void Debug() const;
+
+    /**
+     * Determines the indices of those vertices with Dirichlet boundary conditions
+     * @return index vector.
+     */
+    virtual std::vector<int> Index_DirichletNodes() const = 0;
+
+    /**
+     * Write vector @p v toghether with its mesh information to an ASCii file @p fname.
+      *
+      * The data are written in C-style.
+      *
+      * @param[in] fname  file name
+      * @param[in] v      vector
+     */
+    void Write_ascii_matlab(std::string const &fname, std::vector<double> const &v) const;
+
+    /**
+     * Visualize @p v together with its mesh information via matlab or octave.
+      *
+      * Comment/uncomment those code lines in method Mesh:Visualize (geom.cpp)
+      * that are supported on your system.
+      *
+      * @param[in] v    vector
+      *
+      * @warning matlab files ascii_read_meshvector.m  visualize_results.m
+      *          must be in the executing directory.
+     */
+    void Visualize(std::vector<double> const &v) const;
+
+
+protected:
+    void SetNelem(int nelem)
+    {
+        _nelem = nelem;
+    }
+
+    void SetNverticesElement(int nvert)
+    {
+        _nvert_e = nvert;
+    }
+
+    void SetNdofsElement(int ndof)
+    {
+        _ndof_e = ndof;
+    }
+
+    void SetNnode(int nnode)
+    {
+        _nnode = nnode;
+    }
+
+    void SetNdim(int ndim)
+    {
+        _ndim = ndim;
+    }
+
+private:
+    int _nelem;         //!< number elements
+    int _nvert_e;       //!< number of vertices per element
+    int _ndof_e;        //!< degrees of freedom (d.o.f.) per element
+    int _nnode;         //!< number nodes/vertices
+    int _ndim;          //!< space dimension of the problem (1, 2, or 3)
+    std::vector<int> _ia;    //!< element connectivity
+    std::vector<double> _xc; //!< coordinates
+};
+
+/**
+ * 2D finite element mesh of the square consiting of linear triangular elements.
+ */
+class Mesh_2d_3_square: public Mesh
+{
+public:
+    /**
+     * Generates the f.e. mesh for the unit square.
+     *
+     * @param[in] nx    number of discretization intervals in x-direction
+     * @param[in] ny    number of discretization intervals in y-direction
+     * @param[in] myid  my MPI-rank / subdomain
+     * @param[in] procx number of ranks/subdomains in x-direction
+     * @param[in] procy number of processes in y-direction
+    */
+    Mesh_2d_3_square(int nx, int ny, int myid = 0, int procx = 1, int procy = 1);
+
+    /**
+     * Destructor
+     */
+    ~Mesh_2d_3_square() override
+    {}
+
+    /**
+     * Set solution vector based on a tensor product grid in the rectangle.
+     * @param[in] u solution vector
+     */
+    void SetU(std::vector<double> &u) const;
+
+    /**
+     * Set right hand side (rhs) vector on a tensor product grid in the rectangle.
+     * @param[in] f rhs vector
+     */
+    void SetF(std::vector<double> &f) const;
+
+    /**
+     * Determines the indices of those vertices with Dirichlet boundary conditions
+     * @return index vector.
+     */
+    std::vector<int> Index_DirichletNodes() const override;
+
+    /**
+      * Stores the values of vector @p u of (sub)domain into a file @p name for further processing in gnuplot.
+      * The file stores rowise the x- and y- coordinates together with the value from  @p u .
+      * The domain [@p xl, @p xr] x [@p yb, @p yt] is discretized into @p nx x @p ny intervals.
+      *
+      * @param[in] name basename of file name (file name will be extended by the rank number)
+      * @param[in] u    local vector
+      *
+      * @warning   Assumes tensor product grid in unit square; rowise numbered
+      *            (as generated in class constructor).
+      *            The output is provided for tensor product grid visualization
+      *            ( similar to Matlab-surf() ).
+      *
+      * @see Mesh_2d_3_square
+      */
+    void SaveVectorP(std::string const &name, std::vector<double> const &u) const;
+
+    // here will still need to implement in the class
+    //  GetBound(), AddBound()
+    //  or better a generalized way with indices and their appropriate ranks for MPI communication
+
+private:
+    /**
+      * Determines the coordinates of the dicretization nodes of the domain [@p xl, @p xr] x [@p yb, @p yt]
+      * which is discretized into @p nx x @p ny intervals.
+      *
+      * @param[in] ny   number of discretization intervals in y-direction
+      * @param[in] xl   x-coordinate of left boundary
+      * @param[in] xr   x-coordinate of right boundary
+      * @param[in] yb   y-coordinate of lower boundary
+      * @param[in] yt   y-coordinate of upper boundary
+      * @param[out] xc  coordinate vector of length 2n with x(2*k,2*k+1) as coodinates of node k
+      */
+
+    void GetCoordsInRectangle(int nx, int ny, double xl, double xr, double yb, double yt,
+                              double xc[]);
+    /**
+      * Determines the element connectivity of linear triangular elements of a FEM discretization
+      * of a rectangle using @p nx x @p ny equidistant intervals for discretization.
+      * @param[in] nx   number of discretization intervals in x-direction
+      * @param[in] ny   number of discretization intervals in y-direction
+      * @param[out] ia  element connectivity matrix with ia(3*s,3*s+1,3*s+2) as node numbers od element s
+      */
+    void GetConnectivityInRectangle(int nx, int ny, int ia[]);
+
+private:
+    int _myid;          //!< my MPI rank
+    int _procx;         //!< number of MPI ranks in x-direction
+    int _procy;         //!< number of MPI ranks in y-direction
+    std::array<int, 4> _neigh; //!< MPI ranks of neighbors (negative: no neighbor but b.c.)
+    int _color;         //!< red/black coloring (checker board) of subdomains
+
+    double _xl;         //!< x coordinate of lower left  corner of square
+    double _xr;         //!< x coordinate of lower right corner of square
+    double _yb;         //!< y coordinate or lower left  corner of square
+    double _yt;         //!< y coordinate of upper right corner of square
+    int    _nx;         //!< number of intervals in x-direction
+    int    _ny;         //!< number of intervals in y-direction
+};
+
+// #################### still some old code (--> MPI) ############################
+/**
+ * Copies the values of @p w corresponding to boundary @p ib
+ * onto vector s.  South (ib==1), East (ib==2), North (ib==3), West (ib==4).
+ * The vector @p s has to be long enough!!
+ * @param[in] ib    my local boundary
+ * @param[in] nx    number of discretization intervals in x-direction
+ * @param[in] ny    number of discretization intervals in y-direction
+ * @param[in] w     vector for all nodes of local discretization
+ * @param[out] s    short vector with values on boundary @p ib
+*/
+// GH_NOTE: Absicherung bei s !!
+void GetBound(int ib, int nx, int ny, double const w[], double s[]);
+
+
+/**
+ * Computes @p w := @p w + @p s  at the interface/boundary nodes on the
+ * boundary @p ib .  South (ib==1), East (ib==2), North (ib==3), West (ib==4)
+ * @param[in] ib    my local boundary
+ * @param[in] nx    number of discretization intervals in x-direction
+ * @param[in] ny    number of discretization intervals in y-direction
+ * @param[in,out] w vector for all nodes of local discretization
+ * @param[in] s     short vector with values on boundary @p ib
+*/
+void AddBound(int ib, int nx, int ny, double w[], double const s[]);
+
+// #################### Mesh from Matlab ############################
+/**
+ * 2D finite element mesh of the square consiting of linear triangular elements.
+ */
+class Mesh_2d_3_matlab: public Mesh
+{
+public:
+    /**
+     * Reads mesh data from a binary file.
+     *
+     * File format, see ascii_write_mesh.m
+     *
+     * @param[in] fname file name
+    */
+    explicit Mesh_2d_3_matlab(std::string const &fname);
+
+    /**
+     * Determines the indices of those vertices with Dirichlet boundary conditions.
+     * @return index vector.
+      *
+      * @warning All boundary nodes are considered as Dirchlet nodes.
+     */
+    std::vector<int> Index_DirichletNodes() const override;
+
+private:
+    /**
+     * Determines the indices of those vertices with Dirichlet boundary conditions
+     * @return index vector.
+     */
+    int Nnbedges() const
+    {
+        return static_cast<int>(bedges.size());
+    }
+
+    std::vector<int> bedges;     //!< boundary edges [nbedges][2] storing start/end vertex
+
+};
+
+#endif
--- a/ex3/ex3/getmatrix.cpp
+++ b/ex3/ex3/getmatrix.cpp
@ -0,0 +1,347 @@
+#include "getmatrix.h"
+#include "userset.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <list>
+#include <vector>
+using namespace std;
+
+
+//  general routine for lin. triangular elements
+
+void CalcElem(int const ial[3], double const xc[], double ske[3][3], double fe[3])
+//void CalcElem(const int* __restrict__ ial, const double* __restrict__ xc, double* __restrict__ ske[3], double* __restrict__ fe)
+{
+    const int  i1  = 2 * ial[0],   i2 = 2 * ial[1],   i3 = 2 * ial[2];
+    const double x13 = xc[i3 + 0] - xc[i1 + 0],  y13 = xc[i3 + 1] - xc[i1 + 1],
+                 x21 = xc[i1 + 0] - xc[i2 + 0],  y21 = xc[i1 + 1] - xc[i2 + 1],
+                 x32 = xc[i2 + 0] - xc[i3 + 0],  y32 = xc[i2 + 1] - xc[i3 + 1];
+    const double jac = fabs(x21 * y13 - x13 * y21);
+
+    ske[0][0] = 0.5 / jac * (y32 * y32 + x32 * x32);
+    ske[0][1] = 0.5 / jac * (y13 * y32 + x13 * x32);
+    ske[0][2] = 0.5 / jac * (y21 * y32 + x21 * x32);
+    ske[1][0] = ske[0][1];
+    ske[1][1] = 0.5 / jac * (y13 * y13 + x13 * x13);
+    ske[1][2] = 0.5 / jac * (y21 * y13 + x21 * x13);
+    ske[2][0] = ske[0][2];
+    ske[2][1] = ske[1][2];
+    ske[2][2] = 0.5 / jac * (y21 * y21 + x21 * x21);
+
+    const double xm    = (xc[i1 + 0] + xc[i2 + 0] + xc[i3 + 0]) / 3.0,
+                 ym    = (xc[i1 + 1] + xc[i2 + 1] + xc[i3 + 1]) / 3.0;
+    //fe[0] = fe[1] = fe[2] = 0.5 * jac * FunctF(xm, ym) / 3.0;
+    fe[0] = fe[1] = fe[2] = 0.5 * jac * fNice(xm, ym) / 3.0;
+}
+
+
+// general routine for lin. triangular elements,
+// non-symm. matrix
+// node numbering in element:  a s c e n d i n g   indices !!
+// GH: deprecated
+void AddElem(int const ial[3], double const ske[3][3], double const fe[3],
+             int const id[], int const ik[], double sk[], double f[])
+{
+    for (int i = 0; i < 3; ++i)
+    {
+        const int ii  = ial[i],           // row ii (global index)
+                  id1 = id[ii],           // start and
+                  id2 = id[ii + 1];       // end of row ii in matrix
+        int ip  = id1;
+        for (int j = 0; j < 3; ++j)       // no symmetry assumed
+        {
+            const int jj = ial[j];
+            bool not_found = true;
+            do       // find entry jj (global index) in row ii
+            {
+                not_found = (ik[ip] != jj);
+                ++ip;
+            }
+            while (not_found && ip < id2);
+
+#ifndef NDEBUG                 // compiler option -DNDEBUG switches off the check
+            if (not_found)     // no entry found !!
+            {
+                cout << "Error in AddElem: (" << ii << "," << jj << ") ["
+                     << ial[0] << "," << ial[1] << "," << ial[2] << "]\n";
+                assert(!not_found);
+            }
+#endif
+            sk[ip - 1] += ske[i][j];
+        }
+        f[ii] += fe[i];
+    }
+}
+
+
+// ----------------------------------------------------------------------------
+
+
+
+
+// ####################################################################
+
+CRS_Matrix::CRS_Matrix(Mesh const &mesh)
+    : _mesh(mesh), _nrows(0), _nnz(0), _id(0), _ik(0), _sk(0)
+{
+    Derive_Matrix_Pattern();
+    return;
+}
+
+void CRS_Matrix::Derive_Matrix_Pattern()
+{
+    int const nelem(_mesh.Nelems());
+    int const ndof_e(_mesh.NdofsElement());
+    auto const &ia(_mesh.GetConnectivity());
+//  Determine the number of matrix rows
+    _nrows = *max_element(ia.cbegin(), ia.cbegin() + ndof_e * nelem);
+    ++_nrows;                                 // node numberng: 0 ... nnode-1
+    assert(*min_element(ia.cbegin(), ia.cbegin() + ndof_e * nelem) == 0); // numbering starts with 0 ?
+
+//  Collect for each node those nodes it is connected to (multiple entries)
+//  Detect the neighboring nodes
+    vector< list<int> > cc(_nrows);             //  cc[i] is the  list of nodes a node i is connected to
+    for (int i = 0; i < nelem; ++i)
+    {
+        int const idx = ndof_e * i;
+        for (int k = 0; k < ndof_e; ++k)
+        {
+            list<int> &cck = cc.at(ia[idx + k]);
+            cck.insert( cck.end(), ia.cbegin() + idx, ia.cbegin() + idx + ndof_e );
+        }
+    }
+//  Delete the multiple entries
+    _nnz = 0;
+    for (auto &it : cc)
+    {
+        it.sort();
+        it.unique();
+        _nnz += static_cast<int>(it.size());
+        // cout << it.size() << " :: "; copy(it->begin(),it->end(), ostream_iterator<int,char>(cout,"  ")); cout << endl;
+    }
+
+// CSR data allocation
+    _id.resize(_nrows + 1);                  // Allocate memory for CSR row pointer
+    _ik.resize(_nnz);                        // Allocate memory for CSR column index vector
+
+//  copy CSR data
+    _id[0] = 0;                                 // begin of first row
+    for (size_t i = 0; i < cc.size(); ++i)
+    {
+        //cout << i << "   " << nid.at(i) << endl;;
+        const list<int> &ci = cc.at(i);
+        const auto nci = static_cast<int>(ci.size());
+        _id[i + 1] = _id[i] + nci; // begin of next line
+        copy(ci.begin(), ci.end(), _ik.begin() + _id[i] );
+    }
+
+    assert(_nnz == _id[_nrows]);
+    _sk.resize(_nnz);                      // Allocate memory for CSR column index vector
+    return;
+}
+
+
+void CRS_Matrix::Debug() const
+{
+//  ID points to first entry of row
+//  no symmetry assumed
+    cout << "\nMatrix  (nnz = " << _id[_nrows] << ")\n";
+
+    for (int row = 0; row < _nrows; ++row)
+    {
+        cout << "Row " << row << " : ";
+        int const id1 = _id[row];
+        int const id2 = _id[row + 1];
+        for (int j = id1; j < id2; ++j)
+        {
+            cout.setf(ios::right, ios::adjustfield);
+            cout << "[" << setw(2) << _ik[j] << "]  " << setw(4) << _sk[j] << "  ";
+        }
+        cout << endl;
+    }
+    return;
+}
+
+void CRS_Matrix::CalculateLaplace(vector<double> &f)
+{
+    assert(_mesh.NdofsElement() == 3);               // only for triangular, linear elements
+    //cout << _nnz << " vs. " << _id[_nrows] << "  " << _nrows<< endl;
+    assert(_nnz == _id[_nrows]);
+
+    for (int k = 0; k < _nrows; ++k)
+    {
+        _sk[k] = 0.0;
+    }
+    for (int k = 0; k < _nrows; ++k)
+    {
+        f[k] = 0.0;
+    }
+
+    double ske[3][3], fe[3];
+    //  Loop over all elements
+    auto const nelem = _mesh.Nelems();
+    auto const &ia   = _mesh.GetConnectivity();
+    auto const &xc   = _mesh.GetCoords();
+
+    for (int i = 0; i < nelem; ++i)
+    {
+        CalcElem(ia.data() + 3 * i, xc.data(), ske, fe);
+        //AddElem(ia.data()+3 * i, ske, fe, _id.data(), _ik.data(), _sk.data(), f.data()); // GH: deprecated
+        AddElem_3(ia.data() + 3 * i, ske, fe, f);
+    }
+
+    //Debug();
+
+    return;
+}
+
+void CRS_Matrix::ApplyDirichletBC(std::vector<double> const &u, std::vector<double> &f)
+{
+    double const PENALTY = 1e6;
+    auto const idx = _mesh.Index_DirichletNodes();
+    int const nidx = static_cast<int>(idx.size());
+
+    for (int row = 0; row < nidx; ++row)
+    {
+        int const k = idx[row];
+        int const id1 = fetch(k, k); // Find diagonal entry of row
+        assert(id1 >= 0);
+        _sk[id1] += PENALTY;		// matrix weighted scaling feasible
+        f[k] += PENALTY * u[k];
+    }
+
+    return;
+}
+
+void CRS_Matrix::GetDiag(vector<double> &d) const
+{
+    assert( _nrows == static_cast<int>(d.size()) );
+
+    for (int row = 0; row < _nrows; ++row)
+    {
+        const int ia = fetch(row, row); // Find diagonal entry of row
+        assert(ia >= 0);
+        d[row] = _sk[ia];
+    }
+    return;
+}
+
+bool CRS_Matrix::Compare2Old(int nnode, int const id[], int const ik[], double const sk[]) const
+{
+    bool bn = (nnode == _nrows);     // number of rows
+    if (!bn)
+    {
+        cout << "#########   Error: " << "number of rows" << endl;
+    }
+
+    bool bz = (id[nnode] == _nnz);   // number of non zero elements
+    if (!bz)
+    {
+        cout << "#########   Error: " << "number of non zero elements" << endl;
+    }
+
+    bool bd = equal(id, id + nnode + 1, _id.cbegin()); // row starts
+    if (!bd)
+    {
+        cout << "#########   Error: " << "row starts" << endl;
+    }
+
+    bool bk = equal(ik, ik + id[nnode], _ik.cbegin()); // column indices
+    if (!bk)
+    {
+        cout << "#########   Error: " << "column indices" << endl;
+    }
+
+    bool bv = equal(sk, sk + id[nnode], _sk.cbegin()); // values
+    if (!bv)
+    {
+        cout << "#########   Error: " << "values" << endl;
+    }
+
+    return bn && bz && bd && bk && bv;
+}
+
+
+void CRS_Matrix::Mult(vector<double> &w, vector<double> const &u) const
+{
+    assert( _nrows == static_cast<int>(w.size()) );
+    assert( w.size() == u.size() );
+
+    for (int row = 0; row < _nrows; ++row)
+    {
+        double wi = 0.0;
+        for (int ij = _id[row]; ij < _id[row + 1]; ++ij)
+        {
+            wi += _sk[ij] * u[ _ik[ij] ];
+        }
+        w[row] = wi;
+    }
+    return;
+}
+
+void CRS_Matrix::Defect(vector<double> &w,
+                        vector<double> const &f, vector<double> const &u) const
+{
+    assert( _nrows == static_cast<int>(w.size()) );
+    assert( w.size() == u.size() && u.size() == f.size() );
+
+    for (int row = 0; row < _nrows; ++row)
+    {
+        double wi = f[row];
+        for (int ij = _id[row]; ij < _id[row + 1]; ++ij)
+        {
+            wi -= _sk[ij] * u[ _ik[ij] ];
+        }
+        w[row] = wi;
+    }
+    return;
+}
+
+int CRS_Matrix::fetch(int const row, int const col) const
+{
+    int const id2 = _id[row + 1];    // end   and
+    int       ip  = _id[row];        // start of recent row (global index)
+
+    while (ip < id2 && _ik[ip] != col)  // find index col (global index)
+    {
+        ++ip;
+    }
+    if (ip >= id2)
+    {
+        ip = -1;
+#ifndef NDEBUG                 // compiler option -DNDEBUG switches off the check
+        cout << "No column  " << col << "  in row  " << row << endl;
+        assert(ip >= id2);
+#endif
+    }
+    return ip;
+}
+
+
+void CRS_Matrix::AddElem_3(int const ial[3], double const ske[3][3], double const fe[3], vector<double> &f)
+{
+    for (int i = 0; i < 3; ++i)
+    {
+        const int ii  = ial[i];           // row ii (global index)
+        for (int j = 0; j < 3; ++j)       // no symmetry assumed
+        {
+            const int jj = ial[j];        // column jj (global index)
+            int ip = fetch(ii, jj);       // find column entry jj in row ii
+#ifndef NDEBUG                 // compiler option -DNDEBUG switches off the check
+            if (ip < 0)        // no entry found !!
+            {
+                cout << "Error in AddElem: (" << ii << "," << jj << ") ["
+                     << ial[0] << "," << ial[1] << "," << ial[2] << "]\n";
+                assert(ip >= 0);
+            }
+#endif
+            _sk[ip] += ske[i][j];
+        }
+        f[ii] += fe[i];
+    }
+}
+
--- a/ex3/ex3/getmatrix.h
+++ b/ex3/ex3/getmatrix.h
@ -0,0 +1,178 @@
+#ifndef GETMATRIX_FILE
+#define GETMATRIX_FILE
+
+#include "geom.h"
+#include <vector>
+
+/**
+ * Calculates the element stiffness matrix @p ske and the element load vector @p fe
+ * of one triangular element with linear shape functions.
+ * @param[in]	ial	node indices of the three element vertices
+ * @param[in]	xc	vector of node coordinates with x(2*k,2*k+1) as coodinates of node k
+ * @param[out] ske	element stiffness matrix
+ * @param[out] fe	element load vector
+ */
+void CalcElem(int const ial[3], double const xc[], double ske[3][3], double fe[3]);
+
+/**
+ * Adds the element stiffness matrix @p ske and the element load vector @p fe
+ * of one triangular element with linear shape functions to the appropriate positions in
+ * the symmetric stiffness matrix, stored as CSR matrix K(@p sk,@p id, @p ik)
+ *
+ * @param[in] ial   node indices of the three element vertices
+ * @param[in] ske	element stiffness matrix
+ * @param[in] fe	element load vector
+ * @param[out] sk	vector non-zero entries of CSR matrix
+ * @param[in] id	index vector containing the first entry in a CSR row
+ * @param[in] ik	column index vector of CSR matrix
+ * @param[out] f	distributed local vector storing the right hand side
+ *
+ * @warning Algorithm requires indices in connectivity @p ial in ascending order.
+ *          Currently deprecated.
+*/
+void AddElem(int const ial[3], double const ske[3][3], double const fe[3],
+             int const id[], int const ik[], double sk[], double f[]);
+
+
+// #####################################################################
+/**
+ * Square matrix in CRS format (compressed row storage; also named CSR),
+ * see an <a href="https://en.wikipedia.org/wiki/Sparse_matrix">introduction</a>.
+ */
+class CRS_Matrix
+{
+    public:
+       /**
+        * Intializes the CRS matrix structure from the given discetization in @p mesh.
+        *
+        * The sparse matrix pattern is generated but the values are 0.
+        *
+        * @param[in] mesh given discretization
+        *
+        * @warning A reference to the discretization @p mesh is stored inside this class.
+        *          Therefore, changing @p mesh outside requires also
+        *          to call method @p Derive_Matrix_Pattern explicitely.
+        *
+        * @see Derive_Matrix_Pattern
+        */
+       explicit CRS_Matrix(Mesh const & mesh);
+
+      /**
+        * Destructor.
+        */
+       ~CRS_Matrix()
+       {}
+
+       /**
+        * Generates the sparse matrix pattern and overwrites the existing pattern.
+        *
+        * The sparse matrix pattern is generated but the values are 0.
+       */
+       void Derive_Matrix_Pattern();
+
+        /**
+        * Calculates the entries of f.e. stiffness matrix and load/rhs vector @p f for the Laplace operator in 2D.
+        * No memory is allocated.
+        *
+        * @param[in,out] f (preallocated) rhs/load vector
+        */
+       void CalculateLaplace(std::vector<double> &f);
+
+       /**
+        * Applies Dirichlet boundary conditions to stiffness matrix and to load vector @p f.
+        * The <a href="https://www.jstor.org/stable/2005611?seq=1#metadata_info_tab_contents">penalty method</a>
+        * is used for incorporating the given values @p u.
+        *
+        * @param[in]     u (global) vector with Dirichlet data
+        * @param[in,out] f load vector
+        */
+       void ApplyDirichletBC(std::vector<double> const &u, std::vector<double> &f);
+
+       /**
+        * Extracts the diagonal elemenst of the sparse matrix.
+        *
+        * @param[in,out]  d  (prellocated) vector of diagonal elements
+        */
+       void GetDiag(std::vector<double> &d) const;
+
+       /**
+        * Performs the matrix-vector product  w := K*u.
+        *
+        * @param[in,out] w resulting vector (preallocated)
+        * @param[in]     u vector
+        */
+       void Mult(std::vector<double> &w, std::vector<double> const &u) const;
+
+        /**
+        * Calculates the defect/residuum w := f - K*u.
+        *
+        * @param[in,out] w resulting vector (preallocated)
+        * @param[in]     f load vector
+        * @param[in]     u vector
+        */
+       void Defect(std::vector<double> &w,
+                   std::vector<double> const &f, std::vector<double> const &u) const;
+
+       /**
+		 * Number rows in matrix.
+		 * @return number of rows.
+		 */
+       int Nrows() const
+          {return _nrows;}
+
+       /**
+		 * Show the matrix entries.
+		 */
+       void Debug() const;
+
+       /**
+		 * Finds in a CRS matrix the access index for an entry at row @p row and column @p col.
+		 *
+		 * @param[in] row	row index
+		 * @param[in] col	column index
+		 * @return index for element (@p row, @p col). If no appropriate entry exists then -1 will be returned.
+		 *
+		 * @warning assert() stops the function in case that matrix element (@p row, @p col) doesn't exist.
+		*/
+       int fetch(int row, int col) const;
+
+      /**
+        * Adds the element stiffness matrix @p ske and the element load vector @p fe
+        * of one triangular element with linear shape functions to the appropriate positions in
+        * the stiffness matrix, stored as CSR matrix K(@p sk,@p id, @p ik).
+        *
+        * @param[in]     ial   node indices of the three element vertices
+        * @param[in]     ske   element stiffness matrix
+        * @param[in]     fe    element load vector
+        * @param[in,out] f	   distributed local vector storing the right hand side
+        *
+        * @warning Algorithm assumes  linear triangular elements (ndof_e==3).
+       */
+       void AddElem_3(int const ial[3], double const ske[3][3], double const fe[3], std::vector<double> &f);
+
+        /**
+        * Compare @p this CRS matrix with an external CRS matrix stored in C-Style.
+        *
+        * The method prints statements on differences found.
+        *
+        * @param[in]     nnode  row number of external matrix
+        * @param[in]     id     start indices of matrix rows of external matrix
+        * @param[in]     ik     column indices of external matrix
+        * @param[in]     sk     non-zero values of external matrix
+        *
+        * @return true iff all data are identical.
+        */
+       bool Compare2Old(int nnode, int const id[], int const ik[], double const sk[]) const;
+
+    private:
+       Mesh const & _mesh;      //!< reference to discretization
+       int _nrows;              //!< number of rows in matrix
+       int _nnz;                //!< number of non-zero entries
+       std::vector<int> _id;    //!< start indices of matrix rows
+       std::vector<int> _ik;    //!< column indices
+       std::vector<double> _sk; //!< non-zero values
+
+};
+
+
+#endif
--- a/ex3/ex3/jacsolve.cpp
+++ b/ex3/ex3/jacsolve.cpp
@ -0,0 +1,61 @@
+#include "vdop.h"
+#include "getmatrix.h"
+#include "jacsolve.h"
+
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <vector>
+using namespace std;
+
+// #####################################################################
+// 	const int neigh[], const int color,
+// 	const MPI::Intracomm& icomm,
+void JacobiSolve(CRS_Matrix const &SK, vector<double> const &f, vector<double> &u)
+{
+    const double omega   = 1.0;
+    const int    maxiter = 1000;
+    const double tol  = 1e-5,                // tolerance
+                 tol2 = tol * tol;           // tolerance^2
+
+    int nrows = SK.Nrows();                  // number of rows == number of columns
+    assert( nrows == static_cast<int>(f.size()) && f.size() == u.size() );
+
+    cout << endl << " Start Jacobi solver for " << nrows << " d.o.f.s"  << endl;
+    //  Choose initial guess
+    for (int k = 0; k < nrows; ++k)
+    {
+        u[k] = 0.0;                          //  u := 0
+    }
+
+    vector<double> dd(nrows);                // matrix diagonal
+    vector<double>  r(nrows);                // residual
+    vector<double>  w(nrows);                // correction
+
+    SK.GetDiag(dd);                          //  dd := diag(K)
+    ////DebugVector(dd);{int ijk; cin >> ijk;}
+
+    //  Initial sweep
+    SK.Defect(r, f, u);                      //  r := f - K*u
+
+    vddiv(w, r, dd);                         //  w := D^{-1}*r
+    double sigma0 = dscapr(w, r);            // s0 := <w,r>
+
+    // Iteration sweeps
+    int iter  = 0;
+    double sigma = sigma0;
+    while ( sigma > tol2 * sigma0 && maxiter > iter)
+    {
+        ++iter;
+        vdaxpy(u, u, omega, w );             //  u := u + om*w
+        SK.Defect(r, f, u);                  //  r := f - K*u
+        vddiv(w, r, dd);                     //  w := D^{-1}*r
+        sigma = dscapr(w, r);                // s0 := <w,r>
+//      	cout << "Iteration " << iter << " : " << sqrt(sigma/sigma0) << endl;
+    }
+    cout << "aver. Jacobi rate :  " << exp(log(sqrt(sigma / sigma0)) / iter) << "  (" << iter << " iter)" << endl;
+    cout << "final error: " << sqrt(sigma / sigma0) << " (rel)   " << sqrt(sigma) << " (abs)\n";
+
+    return;
+}
+
--- a/ex3/ex3/jacsolve.h
+++ b/ex3/ex3/jacsolve.h
@ -0,0 +1,18 @@
+#ifndef JACSOLVE_FILE
+#define JACSOLVE_FILE
+#include "getmatrix.h"
+#include <vector>
+
+
+/**
+ * Solves linear system of equations  K @p u = @p f  via the Jacobi iteration.
+ * We use a distributed symmetric  CSR matrix @p SK and initial guess of the
+ * solution is set to 0.
+ * @param[in] SK	CSR matrix
+ * @param[in] f		distributed local vector storing the right hand side
+ * @param[out] u	accumulated local vector storing the solution.
+*/
+void JacobiSolve(CRS_Matrix const &SK, std::vector<double> const &f, std::vector<double> &u);
+
+
+#endif
--- a/ex3/ex3/main.cpp
+++ b/ex3/ex3/main.cpp
@ -0,0 +1,101 @@
+#include "benchmarks.h"
+#include "benchmark_tests.h"
+#include "factorization_solve_tests.h"
+#include <iostream>
+
+int main()
+{   
+    // ---------------------------------- 1. ----------------------------------
+    // results in file "test_system.txt"
+    
+    
+
+    // ---------------------------------- 2. ----------------------------------
+    //      Memory      FLOPS       Read-write operations
+    // (A)  2N          2N          2N
+    // (B)  MN + N      2NM         2NM + M
+    // (C)  ML + LM     2LNM        2LNM + MN
+    // (D)  (p+1) + N   3N(p+1)     N(2 + 3(p+1))
+
+
+
+    // ---------------------------------- 3. ----------------------------------
+    // implementation in file "benchmarks.cpp"
+
+
+
+    // ---------------------------------- 4.& 6. ----------------------------------
+    vector<vector<double>> results;
+
+    results.push_back(test_A(250, 50000000, scalar));       
+    results.push_back(test_A(250, 50000000, Kahan_skalar));
+    results.push_back(test_A(250, 50000000, scalar_cBLAS));
+    //                  Timing  GFLOPS  GiByte/s
+    // ------------------------------------------
+    // scalar           0.039   2.4     19
+    // Kahan_skalar     0.037   2.5     20
+    // scalar_cBLAS     0.032   2.9     23
+
+
+    results.push_back(test_B(100, 20000, 10000, MatVec));
+    results.push_back(test_B(100, 20000, 10000, MatVec_cBLAS));
+    //                  Timing  GFLOPS  GiByte/s
+    // ------------------------------------------
+    // MatVec           0.1     3.6     29
+    // MatVec_cBLAS     0.074   5       40
+
+
+    results.push_back(test_C(25, 500, 1000, 1500, MatMat));
+    results.push_back(test_C(25, 500, 1000, 1500, MatMat_cBLAS));
+    //                  Timing  GFLOPS  GiByte/s
+    // ------------------------------------------
+    // MatMat           0.57    2.5     20
+    // MatMat_cBLAS     0.019   75      6e+02   // unrealistic
+
+
+    results.push_back(test_D(100, 100, 1000000));
+    //                  Timing  GFLOPS  GiByte/s
+    // ------------------------------------------
+    //                  0.11    2.5     20
+    
+
+    cout << endl << "Timing\tGFLOPS\tGiByte/s" << endl;
+    cout << "------------------------------" << endl;
+    for (size_t i = 0; i < results.size(); ++i)
+        cout << results[i][0] << "\t" << results[i][1] << "\t" << results[i][2] << endl;
+    cout << endl;
+    
+
+    
+    // ---------------------------------- 5. ----------------------------------
+    // 5.(a) Observation: time to calculate norm is approximately half the time as for the scalar product.
+    // Reason: only have to access entries of x, so less memory that has to be accessed
+    //
+    // 5.(b) Runtime for Kahan_scalar is roughly the same as for the normal scalar product
+
+
+
+    // ---------------------------------- 6. ----------------------------------
+    // see 4.
+
+
+    // ---------------------------------- 7. ----------------------------------
+    CheckCorrectness();
+    // Checked correctness by computing the inverse of A
+    
+    CheckDuration(5000);
+    // The solving time per RHS scales roughly with factor 1/n_rhs
+
+
+
+    // ---------------------------------- 8. ----------------------------------
+    // done seperately
+    
+
+
+
+
+
+    return 0;
+    
+}
--- a/ex3/ex3/userset.cpp
+++ b/ex3/ex3/userset.cpp
@ -0,0 +1,16 @@
+#include "userset.h"
+#include <cmath>
+
+
+double FunctF(double const x, double const y)
+{
+// return  std::sin(3.14159*1*x)*std::sin(3.14159*1*y);
+//  return 16.0*1024. ;
+// return (double)1.0 ;
+    return x * x * std::sin(2.5 * 3.14159 * y);
+}
+
+double FunctU(const double /* x */, double const /* y */)
+{
+    return 1.0 ;
+}
--- a/ex3/ex3/userset.h
+++ b/ex3/ex3/userset.h
@ -0,0 +1,44 @@
+#ifndef USERSET_FILE
+#define USERSET_FILE
+#include <cmath>
+/**
+ * User function: f(@p x,@p y)
+ * @param[in] x		x-coordinate of discretization point
+ * @param[in] y		y-coordinate of discretization point
+ * @return  value for right hand side f(@p x,@p y)
+ */
+double FunctF(double const x, double const y);
+
+/**
+ * User function: u(@p x,@p y)
+ * @param[in] x		x-coordinate of discretization point
+ * @param[in] y		y-coordinate of discretization point
+ * @return  value for solution vector u(@p x,@p y)
+ */
+double FunctU(double const x, double const y);
+
+
+/**
+ * User function: f(@p x,@p y) = @f$ x^2 \sin(2.5\pi y)@f$.
+ * @param[in] x		x-coordinate of discretization point
+ * @param[in] y		y-coordinate of discretization point
+ * @return  value f(@p x,@p y)
+ */
+inline double fNice(double const x, double const y)
+{
+    return x * x * std::sin(2.5 * 3.14159 * y);
+}
+
+/**
+ * User function: f(@p x,@p y) = 0$.
+ * @param[in] x		x-coordinate of discretization point
+ * @param[in] y		y-coordinate of discretization point
+ * @return  value 0
+ */
+inline double f_zero(double const x, double const y)
+//double f_zero(double const  /*x*/, double const /*y*/)
+{
+    return 0.0 + 0.0*(x+y);
+}
+
+#endif
--- a/ex3/ex3/vdop.cpp
+++ b/ex3/ex3/vdop.cpp
@ -0,0 +1,84 @@
+#include "vdop.h"
+#include <cassert>               // assert()
+#include <cmath>
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+void vddiv(vector<double> &x, vector<double> const &y,
+           vector<double> const &z)
+{
+    assert( x.size() == y.size() && y.size() == z.size() );
+    size_t n = x.size();
+
+    for (size_t k = 0; k < n; ++k)
+    {
+        x[k] = y[k] / z[k];
+    }
+    return;
+}
+
+//******************************************************************************
+
+void vdaxpy(std::vector<double> &x, std::vector<double> const &y,
+            double alpha, std::vector<double> const &z )
+{
+    assert( x.size() == y.size() && y.size() == z.size() );
+    size_t n = x.size();
+
+    for (size_t k = 0; k < n; ++k)
+    {
+        x[k] = y[k] + alpha * z[k];
+    }
+    return;
+}
+//******************************************************************************
+
+double dscapr(std::vector<double> const &x, std::vector<double> const &y)
+{
+    assert( x.size() == y.size());
+    size_t n = x.size();
+
+    double    s = 0.0;
+    for (size_t k = 0; k < n; ++k)
+    {
+        s += x[k] * y[k];
+    }
+
+    return s;
+}
+
+//******************************************************************************
+void DebugVector(vector<double> const &v)
+{
+    cout << "\nVector  (nnode = " << v.size() << ")\n";
+    for (size_t j = 0; j < v.size(); ++j)
+    {
+        cout.setf(ios::right, ios::adjustfield);
+        cout << v[j] << "   ";
+    }
+    cout << endl;
+
+    return;
+}
+//******************************************************************************
+bool CompareVectors(std::vector<double> const &x, int const n, double const y[], double const eps)
+{
+    bool bn = (static_cast<int>(x.size()) == n);
+    if (!bn)
+    {
+        cout << "#########   Error: " << "number of elements" << endl;
+    }
+    //bool bv = equal(x.cbegin(),x.cend(),y);
+    bool bv = equal(x.cbegin(), x.cend(), y,
+                    [eps](double a, double b) -> bool
+    { return std::abs(a - b) < eps * (1.0 + 0.5 * (std::abs(a) + std::abs(a))); }
+                   );
+    if (!bv)
+    {
+        assert(static_cast<int>(x.size()) == n);
+        cout << "#########   Error: " << "values" << endl;
+    }
+    return bn && bv;
+}
--- a/ex3/ex3/vdop.h
+++ b/ex3/ex3/vdop.h
@ -0,0 +1,58 @@
+#ifndef VDOP_FILE
+#define VDOP_FILE
+#include <vector>
+
+/** @brief  Element-wise vector divison x_k = y_k/z_k.
+ *
+ * @param[out] x  target vector
+ * @param[in]  y  source vector
+ * @param[in]  z  source vector
+ *
+ */
+void vddiv(std::vector<double> & x, std::vector<double> const& y,
+                                    std::vector<double> const& z);
+
+/** @brief  Element-wise daxpy operation x(k) = y(k) + alpha*z(k).
+ *
+ * @param[out] x  target vector
+ * @param[in]  y  source vector
+ * @param[in]  alpha  scalar
+ * @param[in]  z  source vector
+ *
+ */
+void vdaxpy(std::vector<double> & x, std::vector<double> const& y,
+                       double alpha, std::vector<double> const& z );
+
+
+/** @brief  Calculates the Euclidian inner product of two vectors.
+ *
+ * @param[in]  x vector
+ * @param[in]  y vector
+ * @return Euclidian inner product @f$\langle x,y \rangle@f$
+ *
+ */
+double dscapr(std::vector<double> const& x, std::vector<double> const& y);
+
+
+/**
+ * Print entries of a vector.
+ * @param[in] v	    vector values
+*/
+void DebugVector(std::vector<double> const &v);
+
+/** @brief  Compares an STL vector with POD vector.
+ *
+ * The accuracy criteria @f$ |x_k-y_k| < \varepsilon \left({1+0.5(|x_k|+|y_k|)}\right) @f$
+ * follows the book by
+ * <a href="https://www.springer.com/la/book/9783319446592">Stoyan/Baran</a>, p.8.
+ *
+ * @param[in]  x    STL vector
+ * @param[in]  n    length of POD vector
+ * @param[in]  y    POD vector
+ * @param[in]  eps  relative accuracy criteria (default := 0.0).
+ * @return true iff pairwise vector elements are relatively close to each other.
+ *
+ */
+bool CompareVectors(std::vector<double> const& x, int n, double const y[], double const eps=0.0);
+
+#endif