task8

2025-11-11 15:50:51 +01:00 · 2025-11-11 15:50:18 +01:00
72 changed files with 160045 additions and 0 deletions
--- a/ex3/ex3_results.txt
+++ b/ex3/ex3_results.txt
@ -0,0 +1,222 @@
+
+-------------- Task 1 --------------
+
+-------------------------------------------------------------
+STREAM version $Revision: 5.10 $
+-------------------------------------------------------------
+This system uses 8 bytes per array element.
+-------------------------------------------------------------
+Array size = 80000000 (elements), Offset = 0 (elements)
+Memory per array = 610.4 MiB (= 0.6 GiB).
+Total memory required = 1831.1 MiB (= 1.8 GiB).
+Each kernel will be executed 20 times.
+    The *best* time for each kernel (excluding the first iteration)
+    will be used to compute the reported bandwidth.
+-------------------------------------------------------------
+Your clock granularity/precision appears to be 1 microseconds.
+Each test below will take on the order of 116886 microseconds.
+    (= 116886 clock ticks)
+Increase the size of the arrays if this shows that
+you are not getting at least 20 clock ticks per test.
+-------------------------------------------------------------
+WARNING -- The above is only a rough guideline.
+For best results, please be sure you know the
+precision of your system timer.
+-------------------------------------------------------------
+Function    Best Rate MB/s  Avg time     Min time     Max time
+Copy:           29569.4     0.048585     0.043288     0.059164
+Scale:          17644.0     0.082248     0.072546     0.102548
+Add:            21030.1     0.100620     0.091298     0.124700
+Triad:          21230.7     0.100758     0.090435     0.120631
+-------------------------------------------------------------
+Solution Validates: avg error less than 1.000000e-13 on all three arrays
+-------------------------------------------------------------
+./flops.exe
+
+    FLOPS C Program (Double Precision), V2.0 18 Dec 1992
+
+    Module     Error        RunTime      MFLOPS
+                            (usec)
+        1      4.0146e-13      0.0024   5827.9076
+        2     -1.4166e-13      0.0007  10037.8942
+        3      4.7184e-14      0.0039   4371.9185
+        4     -1.2557e-13      0.0034   4355.5711
+        5     -1.3800e-13      0.0066   4415.6439
+        6      3.2380e-13      0.0065   4441.6299
+        7     -8.4583e-11      0.0053   2277.1707
+        8      3.4867e-13      0.0069   4367.6094
+
+    Iterations      =  512000000
+    NullTime (usec) =     0.0000
+    MFLOPS(1)       =  7050.6178
+    MFLOPS(2)       =  3461.6233
+    MFLOPS(3)       =  4175.0442
+    MFLOPS(4)       =  4389.7311
+
+-------------- Task 2 --------------
+
+Memory needed (double 64-bit, 8 bytes):
+(A) (2N + 1) * 8 bytes
+(B) (M*N + M + N) * 8 bytes
+(C) (M*L + L*N + M*N) * 8 bytes
+(D) (N + N + p) * 8 bytes
+
+Floating point operations:
+(A) 2N
+(B) M * 2N
+(C) M * 2L * N
+(D) 2 * N * p (Horner Schema)
+
+Read/Write operations:
+(A) Read: 2N         Write: 1
+(B) Read: M*2N       Write: M*N
+(C) Read: M*2L*N     Write: M*L*N
+(D) Read: 2*N*p      Write: N*P
+
+-------------- Task 3 --------------
+
+Functions implemented in task_3.cpp
+
+-------------- Task 4 --------------
+
+----- Benchmark (A) -----
+Memory allocated  : 0.745 GByte
+Duration per loop : 0.036 sec
+GFLOPS            : 2.579
+GiByte/s          : 20.630
+-------------------------
+----- Benchmark (B) -----
+Memory allocated  : 0.715 GByte
+Duration per loop : 0.105 sec
+GFLOPS            : 1.704
+GiByte/s          : 6.818
+-------------------------
+----- Benchmark (C) -----
+Memory allocated  : 0.026 GByte
+Duration per loop : 0.459 sec
+GFLOPS            : 4.062
+GiByte/s          : 0.057
+-------------------------
+----- Benchmark (D) -----
+Memory allocated  : 0.015 GByte
+Duration per loop : 0.310 sec
+GFLOPS            : 1.201
+GiByte/s          : 0.048
+-------------------------
+
+
+-------------- Task 5 --------------
+
+----- Benchmark norm -----
+||x|| = 897124.301552
+Memory allocated  : 0.373 GByte
+Duration per loop : 0.022 sec
+GFLOPS            : 4.222
+GiByte/s          : 16.890
+-------------------------
+What do you observe? Why?
+-> Faster per loop than scalar product, only loads elements of 1 vector, instead of 2.
+
+-------------- Task 6 --------------
+
+Benchmarks using cBLAS
+----- Benchmark (A) -----
+Memory allocated  : 0.745 GByte
+Duration per loop : 0.023 sec
+GFLOPS            : 4.006
+GiByte/s          : 32.052
+-------------------------
+----- Benchmark (B) -----
+Memory allocated  : 0.715 GByte
+Duration per loop : 0.026 sec
+GFLOPS            : 7.010
+GiByte/s          : 28.045
+-------------------------
+----- Benchmark (C) -----
+Memory allocated  : 0.026 GByte
+Duration per loop : 0.020 sec
+GFLOPS            : 91.320
+GiByte/s          : 1.278
+-------------------------
+
+
+-------------- Task 7 --------------
+
+A =
+4.000000 1.000000 0.250000 0.111111 0.062500 
+1.000000 4.000000 1.000000 0.250000 0.111111 
+0.250000 1.000000 4.000000 1.000000 0.250000 
+0.111111 0.250000 1.000000 4.000000 1.000000 
+0.062500 0.111111 0.250000 1.000000 4.000000 
+
+
+b =
+0.000000 1.000000 
+0.000000 1.000000 
+0.000000 1.000000 
+0.000000 1.000000 
+0.000000 1.000000 
+
+
+L + U =
+4.000000 1.000000 0.250000 0.111111 0.062500 
+0.250000 3.750000 0.937500 0.222222 0.095486 
+0.062500 0.250000 3.750000 0.937500 0.222222 
+0.027778 0.059259 0.250000 3.749370 0.937050 
+0.015625 0.025463 0.059259 0.249922 3.749234 
+
+
+x =
+0.000000 0.196259 
+0.000000 0.148391 
+0.000000 0.151272 
+0.000000 0.148391 
+0.000000 0.196259 
+
+
+Check solution:
+A * x = 
+0.000000 1.000000 
+0.000000 1.000000 
+0.000000 1.000000 
+0.000000 1.000000 
+0.000000 1.000000 
+
+
+N    =   | 1      | 2      | 4      | 8      | 16     | 32 
+---------|--------|--------|--------|--------|--------|-------
+Nrhs = 2 | 0.0047 | 0.0045 | 0.0046 | 0.0130 | 0.0203 | 0.0476
+Nrhs = 4 | 0.0027 | 0.0031 | 0.0033 | 0.0046 | 0.0085 | 0.0250
+Nrhs = 8 | 0.0035 | 0.0035 | 0.0045 | 0.0061 | 0.0119 | 0.0300
+Nrhs = 16 | 0.0085 | 0.0062 | 0.0221 | 0.0113 | 0.0599 | 0.0757
+Nrhs = 32 | 0.0122 | 0.0165 | 0.0112 | 0.0123 | 0.0238 | 0.0834
+Nrhs = 64 | 0.0072 | 0.0078 | 0.0164 | 0.0133 | 0.0421 | 0.0666
+Nrhs = 128 | 0.0073 | 0.0189 | 0.0269 | 0.0199 | 0.0337 | 0.1041
+Nrhs = 256 | 0.0107 | 0.0135 | 0.0279 | 0.0351 | 0.0582 | 0.1438
+Nrhs = 512 | 0.0276 | 0.0174 | 0.0237 | 0.1027 | 0.1113 | 0.2417
+
+For fixed n, the solution time per rhs does not slow down consistently and scales very well.
+Its faster than expected.
+
+
+-------------- Task 8 --------------
+
+
+ There are 1 processes running.
+
+Intervalls: 100 x 100
+
+ Start Jacobi solver for 10201 d.o.f.s
+aver. Jacobi rate :  0.997922  (1000 iter)
+final error: 0.124971 (rel)   0.000194029 (abs)
+JacobiSolve: timing in sec. : 0.079399
+ASCI file  square_100.txt  opened
+17361  2  34320  3
+
+ Start Jacobi solver for 17361 d.o.f.s
+aver. Jacobi rate :  0.998401  (1000 iter)
+final error: 0.201744 (rel)   0.000265133 (abs)
+JacobiSolve: timing in sec. : 0.18853
+
+
+
--- a/ex3/ex3_sheet.pdf
+++ b/ex3/ex3_sheet.pdf
--- a/ex3/seq/CLANG_default.mk
+++ b/ex3/seq/CLANG_default.mk
@ -0,0 +1,123 @@
+# Basic Defintions for using GNU-compiler suite sequentially
+# requires setting of COMPILER=CLANG_
+
+#CLANGPATH=//usr/lib/llvm-10/bin/
+CC     = ${CLANGPATH}clang
+CXX    = ${CLANGPATH}clang++
+#CXX   = ${CLANGPATH}clang++ -lomptarget  -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=/opt/pgi/linux86-64/2017/cuda/8.0
+#F77   = gfortran
+LINKER = ${CXX}
+
+#http://clang.llvm.org/docs/UsersManual.html#options-to-control-error-and-warning-messages
+WARNINGS += -Weverything -Wno-c++98-compat -Wno-sign-conversion -Wno-date-time -Wno-shorten-64-to-32 -Wno-padded -ferror-limit=1
+WARNINGS += -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic
+#-fsyntax-only -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic
+
+CXXFLAGS += -O3 -std=c++17 -ferror-limit=1 ${WARNINGS}
+# don't use -Ofast
+# -ftrapv
+LINKFLAGS += -O3
+
+# different libraries in Ubuntu or manajaró
+ifndef UBUNTU
+UBUNTU=1
+endif
+
+# BLAS, LAPACK
+LINKFLAGS += -llapack -lblas
+# -lopenblas
+ifeq ($(UBUNTU),1)
+# ubuntu
+else
+# on  archlinux
+LINKFLAGS += -lcblas
+endif
+
+# interprocedural optimization
+CXXFLAGS  += -flto
+LINKFLAGS += -flto
+
+#   very good check
+# http://clang.llvm.org/extra/clang-tidy/
+#   good check, see:  http://llvm.org/docs/CodingStandards.html#include-style
+SWITCH_OFF=,-readability-magic-numbers,-readability-redundant-control-flow,-readability-redundant-member-init
+SWITCH_OFF+=,-readability-redundant-member-init,-readability-isolate-declaration
+#READABILITY=,readability*${SWITCH_OFF}
+#TIDYFLAGS = -checks=llvm-*,-llvm-header-guard -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
+TIDYFLAGS = -checks=llvm-*,-llvm-header-guard${READABILITY} -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
+#TIDYFLAGS += -checks='modernize*
+#   ???
+#TIDYFLAGS = -checks='cert*'  -header-filter=.*
+#   MPI checks ??
+#TIDYFLAGS = -checks='mpi*'
+#   ??
+#TIDYFLAGS = -checks='performance*'   -header-filter=.*
+#TIDYFLAGS = -checks='portability-*'  -header-filter=.*
+#TIDYFLAGS = -checks='readability-*'  -header-filter=.*
+
+default: ${PROGRAM}
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER)  $^  ${LINKFLAGS} -o $@
+
+clean:
+	@rm -f ${PROGRAM} ${OBJECTS}
+
+clean_all:: clean
+	@rm -f *_ *~ *.bak *.log *.out *.tar
+
+codecheck: tidy_check
+tidy_check:
+	clang-tidy ${SOURCES} ${TIDYFLAGS} -- ${SOURCES}
+# see also http://clang-developers.42468.n3.nabble.com/Error-while-trying-to-load-a-compilation-database-td4049722.html
+
+run: clean ${PROGRAM}
+#	time  ./${PROGRAM} ${PARAMS}
+	./${PROGRAM} ${PARAMS}
+
+# tar the current directory
+MY_DIR = `basename ${PWD}`
+tar: clean_all
+	@echo "Tar the directory: " ${MY_DIR}
+	@cd .. ;\
+	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+# 	tar cf `basename ${PWD}`.tar *
+
+doc:
+	doxygen Doxyfile
+
+#########################################################################
+
+.cpp.o:
+	$(CXX) -c $(CXXFLAGS) -o $@ $<
+
+.c.o:
+	$(CC) -c $(CFLAGS) -o $@ $<
+
+.f.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+##################################################################################################
+#    some tools
+# Cache behaviour (CXXFLAGS += -g  tracks down to source lines; no -pg in linkflags)
+cache: ${PROGRAM}
+	valgrind --tool=callgrind --simulate-cache=yes ./$^  ${PARAMS}
+#	kcachegrind callgrind.out.<pid> &
+	kcachegrind `ls -1tr  callgrind.out.* |tail -1`
+
+# Check for wrong memory accesses, memory leaks, ...
+# use smaller data sets
+mem: ${PROGRAM}
+	valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^  ${PARAMS}
+
+#  Simple run time profiling of your code
+#  CXXFLAGS += -g -pg
+#  LINKFLAGS += -pg
+prof: ${PROGRAM}
+	perf record ./$^  ${PARAMS}
+	perf report
+#	gprof -b ./$^ > gp.out
+#	kprof -f gp.out -p gprof &
+
+codecheck: tidy_check
--- a/ex3/seq/GCC_default.mk
+++ b/ex3/seq/GCC_default.mk
@ -0,0 +1,196 @@
+# Basic Defintions for using GNU-compiler suite sequentially
+# requires setting of COMPILER=GCC_
+
+CC	= gcc
+CXX = g++
+F77	= gfortran
+LINKER  = ${CXX}
+
+WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
+           -Wredundant-decls -fmax-errors=1
+#  -Wunreachable-code  -Winline
+CXXFLAGS += -ffast-math -O3 -march=native -std=c++17 ${WARNINGS}
+#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS}
+#-msse3
+# -ftree-vectorizer-verbose=2  -DNDEBUG
+# -ftree-vectorizer-verbose=5
+# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump  -fdump-tree-pre=stderr
+
+# CFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details
+# CFLAGS	= -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2
+# #CFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
+# FFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
+# LFLAGS  = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
+LINKFLAGS   += -O3
+
+#architecture
+#CPU = -march=znver2
+CXXFLAGS  += ${CPU}
+LINKFLAGS += ${CPU}
+
+# different libraries in Ubuntu or manajaró
+ifndef UBUNTU
+UBUNTU=1
+endif
+
+# BLAS, LAPACK
+ifeq ($(UBUNTU),1)
+LINKFLAGS += -llapack -lblas
+# -lopenblas
+else
+# on  archlinux
+LINKFLAGS += -llapack -lopenblas -lcblas
+endif
+
+# interprocedural optimization
+#CXXFLAGS  += -flto
+#LINKFLAGS += -flto
+
+# for debugging purpose (save code)
+# -fsanitize=leak         # only one out the three can be used
+# -fsanitize=address
+# -fsanitize=thread
+SANITARY =  -fsanitize=address  -fsanitize=undefined -fsanitize=null -fsanitize=return \
+ -fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \
+ -fsanitize=bool -fsanitize=enum -fsanitize=vptr
+#CXXFLAGS  += ${SANITARY}
+#LINKFLAGS += ${SANITARY}
+
+# profiling tools
+#CXXFLAGS  += -pg
+#LINKFLAGS += -pg
+
+
+default: ${PROGRAM}
+
+${PROGRAM}:	${OBJECTS}
+	$(LINKER)  $^  ${LINKFLAGS} -o $@
+
+clean:
+	@rm -f ${PROGRAM} ${OBJECTS}
+
+clean_all:: clean
+	-@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt
+	-@rm -rf html
+
+run: clean ${PROGRAM}
+#run: ${PROGRAM}
+#	time  ./${PROGRAM} ${PARAMS}
+	./${PROGRAM} ${PARAMS}
+
+# tar the current directory
+MY_DIR = `basename ${PWD}`
+tar: clean_all
+	@echo "Tar the directory: " ${MY_DIR}
+	@cd .. ;\
+	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+# 	tar cf `basename ${PWD}`.tar *
+#find . -size +10M > large_files
+#--exclude-from ${MY_DIR}/large_files
+
+zip: clean
+	@echo "Zip the directory: " ${MY_DIR}
+	@cd .. ;\
+	zip -r ${MY_DIR}.zip ${MY_DIR} *default.mk ;\
+	cd ${MY_DIR}
+
+doc:
+	doxygen Doxyfile
+
+#########################################################################
+.SUFFIXES: .f90
+
+.cpp.o:
+	$(CXX) -c $(CXXFLAGS) -o $@ $<
+#	$(CXX) -c $(CXXFLAGS) -o $@ $<  2>&1 | tee -a $<.log 
+#	$(CXX) -c $(CXXFLAGS) -o $@ $<  2>&1 | tee -a $(<:.cpp=.log)
+
+.c.o:
+	$(CC) -c $(CFLAGS) -o $@ $<
+
+.f.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+.f90.o:
+	$(F77) -c $(FFLAGS) -o $@ $<
+
+##################################################################################################
+#    some tools
+# Cache behaviour (CXXFLAGS += -g  tracks down to source lines; no -pg in linkflags)
+cache: ${PROGRAM}
+	valgrind --tool=callgrind --simulate-cache=yes ./$^  ${PARAMS}
+#	kcachegrind callgrind.out.<pid> &
+	kcachegrind `ls -1tr  callgrind.out.* |tail -1`
+
+# Check for wrong memory accesses, memory leaks, ...
+# use smaller data sets
+# no "-pg"  in compile/link options
+mem: ${PROGRAM}
+	valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^  ${PARAMS}
+# Graphical interface
+# valkyrie
+
+#  Simple run time profiling of your code
+CXXFLAGS += -g -pg
+LINKFLAGS += -pg
+prof: ${PROGRAM}
+	./$^  ${PARAMS}
+	gprof -b ./$^ > gp.out
+#	kprof -f gp.out -p gprof &
+
+#                   sudo apt install gprofng-gui
+#                   https://parallel.computer/presentations/PPoPP2023/2023-Ruud-Slides.pdf
+#   read §3 in      https://sourceware.org/binutils/docs/gprofng.html
+# /usr/bin/gp-collect-app -o test.1.er -p on -S on /home/ghaase/Lectures/Math2CPP/Codes/seq/jacobi_oo_stl/main.GCC_ 
+prof2: ${PROGRAM}
+	gprofng collect app -h auto ./$^  ${PARAMS}
+#	gprofng display text -functions `ls -1tdr test.*.er |tail -1`
+	gprofng display text -script gprofng_script2 `ls -1tdr test.*.er |tail -1`
+#	gprofng display text -script gprofng_script2 test.*.er
+#	gprofng display gui &
+	
+prof3: ${PROGRAM}
+	perf record ./$^  ${PARAMS}
+	perf report
+#  perf in Ubuntu 20.04:   https://www.howtoforge.com/how-to-install-perf-performance-analysis-tool-on-ubuntu-20-04/
+#  * install 
+#  * sudo vi /etc/sysctl.conf
+#                add   kernel.perf_event_paranoid = 0
+
+#Trace your heap:
+#> heaptrack ./main.GCC_
+#> heaptrack_gui heaptrack.main.GCC_.<pid>.gz
+heap: ${PROGRAM}
+	heaptrack ./$^  ${PARAMS}
+	heaptrack_gui  `ls -1tr  heaptrack.$^.* |tail -1` &
+
+codecheck: $(SOURCES)
+	cppcheck --enable=all --inconclusive --std=c++17 --suppress=missingIncludeSystem $^
+
+
+########################################################################
+#  get the detailed  status of all optimization flags
+info:
+	echo "detailed  status of all optimization flags"
+	$(CXX) --version
+	$(CXX) -Q $(CXXFLAGS) --help=optimizers
+	lscpu
+	inxi -C
+	lstopo
+
+# Excellent hardware info
+#	hardinfo
+# Life monitoring of CPU frequency etc.
+#	sudo i7z
+
+# Memory  consumption
+#	vmstat -at -SM 3
+#	xfce4-taskmanager
+
+
+# https://www.tecmint.com/check-linux-cpu-information/
+#https://www.tecmint.com/monitor-cpu-and-gpu-temperature-in-ubuntu/
+
+# Debugging:
+# https://wiki.archlinux.org/index.php/Debugging
--- a/ex3/seq/Makefile
+++ b/ex3/seq/Makefile
@ -0,0 +1,36 @@
+#DIRS=skalar skalar_stl jacobi generate_mesh jacobi_oo_stl
+DIRS=skalar skalar_stl jacobi jacobi_oo_stl thread_17 densematrices_libs generate_mesh mgrid
+#
+#WWW_ROOT=${HOME}/public_html/Lectures/Math2CPP/Codes/seq
+WWW_ROOT=../../html/Codes/seq
+
+
+clean:
+	@for i in ${DIRS}; do cd $${i}; make clean_all; cd ..; done
+#	rm *.tar
+
+doc:
+	@for i in ${DIRS}; do cd $${i}; make doc; cd ..; done
+
+tar:
+	@for i in ${DIRS}; do cd $${i}; make tar; cd ..; done
+
+zip:
+	@for i in ${DIRS}; do cd $${i}; make zip; cd ..; done
+
+www: clean tar zip doc
+	mkdir -p ${WWW_ROOT}
+	cp -up *_default.mk ${WWW_ROOT}
+	@for i in ${DIRS};\
+	do \
+	  mv $${i}.tar $${i}.zip ${WWW_ROOT}; \
+	  cp -rup $${i} ${WWW_ROOT}; 	\
+	  done
+
+
+#	@for i in ${DIRS};\
+#	do \
+#	  tar -czf $${i}.tgz $${i} *default*.mk; \
+#	  mv $${i}.tgz  ${WWW_ROOT}; \
+#	  cp -r $${i} ${WWW_ROOT}; 	\
+#	  find $${i} -name html -exec cp -r {} ${WWW_ROOT}/$${i}  \; ; done
--- a/ex3/seq/check_all
+++ b/ex3/seq/check_all
@ -0,0 +1,32 @@
+#!/bin/bash
+
+EXAMPLES='skalar skalar_stl jacobi jacobi_stl jacobi_oo_stl thread_17 densematrices_libs'
+# EXAMPLES='template_seq'
+#COMPTYPE='GCC_ CLANG_'
+COMPTYPE='ONEAPI_'
+
+LOG_FILE='compile.log'
+rm -f ${LOG_FILE}
+
+echo
+echo '     Compile examples'
+echo
+
+
+for verz in ${EXAMPLES}
+  do echo                                          2>&1 | tee -a  ${LOG_FILE}
+    echo '-----------  '  $verz  '  -----------'   2>&1 | tee -a  ${LOG_FILE}
+    pushd $verz
+    echo                                           2>&1 | tee -a  ${LOG_FILE}
+    pwd                                            >>  ../${LOG_FILE}
+    for comp in ${COMPTYPE}
+    do echo                                        2>&1 | tee -a  ../${LOG_FILE}
+      echo '###########  '  $comp '  ###########'  2>&1 | tee -a  ../${LOG_FILE}
+      echo                                         2>&1 | tee -a  ../${LOG_FILE}
+      make clean COMPILER=${comp}                  2>&1 | tee -a  ../${LOG_FILE}
+      make COMPILER=${comp}                        2>&1 | tee -a  ../${LOG_FILE}
+#       make run COMPILER=${comp}                    2>&1 | tee -a  ../${LOG_FILE}
+    done
+    popd
+  done
+    
--- a/ex3/seq/compile.log
+++ b/ex3/seq/compile.log
@ -0,0 +1,49 @@
+
+-----------   skalar   -----------
+/home/dino/scf_celebic/ex3/seq/skalar
+
+###########   ONEAPI_   ###########
+
+Makefile:32: ../ONEAPI_default.mk: No such file or directory
+make: *** No rule to make target '../ONEAPI_default.mk'.  Stop.
+Makefile:32: ../ONEAPI_default.mk: No such file or directory
+make: *** No rule to make target '../ONEAPI_default.mk'.  Stop.
+
+-----------   skalar_stl   -----------
+/home/dino/scf_celebic/ex3/seq/skalar_stl
+
+###########   ONEAPI_   ###########
+
+Makefile:30: ../ONEAPI_default.mk: No such file or directory
+make: *** No rule to make target '../ONEAPI_default.mk'.  Stop.
+Makefile:30: ../ONEAPI_default.mk: No such file or directory
+make: *** No rule to make target '../ONEAPI_default.mk'.  Stop.
+
+-----------   jacobi   -----------
+
+
+-----------   jacobi_stl   -----------
+
+
+-----------   jacobi_oo_stl   -----------
+/home/dino/scf_celebic/ex3/seq/jacobi_oo_stl
+
+###########   ONEAPI_   ###########
+
+Makefile:26: ../ONEAPI_default.mk: No such file or directory
+make: *** No rule to make target '../ONEAPI_default.mk'.  Stop.
+Makefile:26: ../ONEAPI_default.mk: No such file or directory
+make: *** No rule to make target '../ONEAPI_default.mk'.  Stop.
+
+-----------   thread_17   -----------
+/home/dino/scf_celebic/ex3/seq/thread_17
+
+###########   ONEAPI_   ###########
+
+Makefile:42: ../ONEAPI_default.mk: No such file or directory
+make: *** No rule to make target '../ONEAPI_default.mk'.  Stop.
+Makefile:42: ../ONEAPI_default.mk: No such file or directory
+make: *** No rule to make target '../ONEAPI_default.mk'.  Stop.
+
+-----------   densematrices_libs   -----------
+
--- a/ex3/seq/generate_mesh/L_shape.m
+++ b/ex3/seq/generate_mesh/L_shape.m
@ -0,0 +1,26 @@
+% Copyright: Reza Mokhtari
+clear all
+clc
+%% plot L-shape
+g=[2 0 2 0 0 1 0;
+   2 2 2 0 1 1 0;
+   2 2 1 1 1 1 0;
+   2 1 1 1 2 1 0;
+   2 1 0 2 2 1 0;
+   2 0 0 2 0 1 0]';
+[p,e,t] = initmesh(g,'hmax',0.5); 
+pdemesh(p,e,t)
+
+%% GH
+% output from <https://de.mathworks.com/help/pde/ug/initmesh.html initmesh>
+%
+% coordinates  p: [2][nnode]
+% connectivity t: [4][nelem]   with  t(4,:) are the subdomain numbers
+% edges        e: [7][nedges]  boundary edges
+%                              e([1,2],:) - start/end vertex of edge
+%                              e([3,4],:) - start/end values
+%                              e(5,:)     - segment number
+%                              e([6,7],:) - left/right subdomain
+
+% flatpak run org.octave.Octave <filename>
+ascii_write_mesh( p, t, e, mfilename);
--- a/ex3/seq/generate_mesh/ascii_read_meshvector.m
+++ b/ex3/seq/generate_mesh/ascii_read_meshvector.m
@ -0,0 +1,43 @@
+function [ xc, ia, v ] = ascii_read_meshvector( fname )
+%
+% Loads the 2D triangular mesh (coordinates, vertex connectivity) 
+%  together with values on its vertices from an ASCII file.
+%   Matlab indexing is stored  (starts with 1).
+% 
+% The input file format is compatible 
+%  with Mesh_2d_3_matlab:Write_ascii_matlab(..) in jacobi_oo_stl/geom.h
+%
+%
+%  IN: fname - filename
+% OUT: xc    - coordinates
+%      ia    - mesh connectivity
+%      v     - solution vector
+
+DELIMETER = ' ';
+
+fprintf('Read file  %s\n',fname)
+
+% Read mesh constants
+nn = dlmread(fname,DELIMETER,[0 0 0 3]);  %% row_1, col_1, row_2, col_2  in C indexing!!!
+nnode = nn(1);
+ndim  = nn(2);
+nelem = nn(3);
+nvert = nn(4);
+
+% Read coordinates
+row_start = 0+1;
+row_end   = 0+nnode;
+xc = dlmread(fname,DELIMETER,[row_start 0 row_end ndim-1]);
+
+% Read connectivity
+row_start = row_end+1;
+row_end   = row_end+nelem;
+ia = dlmread(fname,DELIMETER,[row_start 0 row_end nvert-1]);
+
+% Read solution
+row_start = row_end+1;
+row_end   = row_end+nnode;
+v = dlmread(fname,DELIMETER,[row_start 0 row_end 0]);
+end
+
+
--- a/ex3/seq/generate_mesh/ascii_write_mesh.m
+++ b/ex3/seq/generate_mesh/ascii_write_mesh.m
@ -0,0 +1,49 @@
+function ascii_write_mesh( xc, ia, e, basename)
+%
+% Saves the 2D triangular mesh in the minimal way (only coordinates, vertex connectivity, minimal boundary edge info)
+%  in an ASCII file.
+%  Matlab indexing is stored  (starts with 1).
+% 
+% The output file format is compatible with Mesh_2d_3_matlab:Mesh_2d_3_matlab(std::string const &fname) in jacobi_oo_stl/geom.h
+%
+% IN:
+% coordinates  xc: [2][nnode]
+% connectivity ia: [4][nelem]   with  t(4,:) are the subdomain numbers
+% edges         e: [7][nedges]  boundary edges
+%                              e([1,2],:) - start/end vertex of edge
+%                              e([3,4],:) - start/end values
+%                              e(5,:)     - segment number
+%                              e([6,7],:) - left/right subdomain
+%        basename: file name without extension
+% 
+% Data have been generated via <https://de.mathworks.com/help/pde/ug/initmesh.html initmesh>.
+%
+fname = [basename, '.txt'];
+
+nnode = int32(size(xc,2));
+ndim  = int32(size(xc,1));
+nelem = int32(size(ia,2));
+nvert_e = int32(3);
+
+
+dlmwrite(fname,nnode,'delimiter','\t','precision',16)                 % number of nodes
+dlmwrite(fname,ndim,'-append','delimiter','\t','precision',16)        % space dimension
+dlmwrite(fname,nelem,'-append','delimiter','\t','precision',16)       % number of elements
+dlmwrite(fname,nvert_e,'-append','delimiter','\t','precision',16)     % number of vertices per element
+
+% dlmwrite(fname,xc(:),'-append','delimiter','\t','precision',16)       % coordinates
+dlmwrite(fname,xc([1,2],:).','-append','delimiter','\t','precision',16) % coordinates
+
+% no subdomain info transferred
+tmp=int32(ia(1:3,:));
+% dlmwrite(fname,tmp(:),'-append','delimiter','\t','precision',16)      % connectivity in Matlab indexing
+dlmwrite(fname,tmp(:,:).','-append','delimiter','\t','precision',16)    % connectivity in Matlab indexing
+
+% store only start and end point of boundary edges,
+nbedges = size(e,2);
+dlmwrite(fname,nbedges,'-append','delimiter','\t','precision',16)     % number boundary edges
+tmp=int32(e(1:2,:));
+% dlmwrite(fname,tmp(:),'-append','delimiter','\t','precision',16)    % boundary edges in Matlab indexing
+dlmwrite(fname,tmp(:,:).','-append','delimiter','\t','precision',16)  % boundary edges in Matlab indexing
+
+end
--- a/ex3/seq/generate_mesh/chip_2materials.asv
+++ b/ex3/seq/generate_mesh/chip_2materials.asv
@ -0,0 +1,56 @@
+% Square: 
+%   flatpak run org.octave.Octave <filename>
+%      or
+%   octave --no-window-system --no-gui  -qf <filename>
+
+clear all
+clc
+% %% L-shape
+% g=[2 0 2 0 0 1 0;        % #vertices,v_1x, v_2x, v_1y, v_2y, subdomain_left, subdomain_right
+%    2 2 2 0 1 1 0;
+%    2 2 1 1 0.5 1 0;
+%    2 1 1 0.5 2 1 0;
+%    2 1 0 2 2 1 0;
+%    2 0 0 2 0 1 0]';
+
+%% square
+% g=[2 0 1 0 0 1 0;        % #vertices,v_1x, v_2x, v_1y, v_2y, subdomain_left, subdomain_right
+%    2 1 1 0 1 1 0;
+%    2 1 0 1 1 1 0;
+%    2 0 0 1 0 1 0]';
+
+g=[2 0.00 1.00  0.00 0.00  1  0;        % #vertices,v_1x, v_2x, v_1y, v_2y, subdomain_left, subdomain_right
+   2 1.00 1.00  0.00 0.60  1  0;
+   2 1.00 0.83  0.60 0.60  1  0;
+   2 0.83 0.17  0.60 0.60  1  2;
+   2 0.17 0.00  0.60 0.60  1  0;
+   2 0.00 0.00  0.60 0.00  1  0;
+   2 0.83 0.83  0.60 0.80  2  0;
+   2 0.83 0.17  0.80 0.80  2  0;
+   2 0.17 0.17  0.80 0.60  2  0;
+   2 
+   ]';
+
+
+[p,e,t] = initmesh(g,'hmax',0.1); 
+%[p,e,t] = initmesh(g,'hmax',0.6); 
+pdemesh(p,e,t)
+% pdemesh(p,e,t,"NodeLabels","on")
+
+%% GH
+% output from <https://de.mathworks.com/help/pde/ug/initmesh.html initmesh>
+%
+% coordinates  p: [2][nnode]
+% connectivity t: [4][nelem]   with  t(4,:) are the subdomain numbers
+% edges        e: [7][nedges]  boundary edges
+%                              e([1,2],:) - start/end vertex of edge
+%                              e([3,4],:) - start/end values
+%                              e(5,:)     - segment number
+%                              e([6,7],:) - left/right subdomain
+
+ascii_write_mesh( p, t, e, mfilename);
+
+
+
+% tmp=t(1:3,:)
+
--- a/ex3/seq/generate_mesh/chip_2materials.m
+++ b/ex3/seq/generate_mesh/chip_2materials.m
@ -0,0 +1,59 @@
+% Square: 
+%   flatpak run org.octave.Octave <filename>
+%      or
+%   octave --no-window-system --no-gui  -qf <filename>
+
+clear all
+clc
+% %% L-shape
+% g=[2 0 2 0 0 1 0;        % #vertices,v_1x, v_2x, v_1y, v_2y, subdomain_left, subdomain_right
+%    2 2 2 0 1 1 0;
+%    2 2 1 1 0.5 1 0;
+%    2 1 1 0.5 2 1 0;
+%    2 1 0 2 2 1 0;
+%    2 0 0 2 0 1 0]';
+
+%% square
+% g=[2 0 1 0 0 1 0;        % #vertices,v_1x, v_2x, v_1y, v_2y, subdomain_left, subdomain_right
+%    2 1 1 0 1 1 0;
+%    2 1 0 1 1 1 0;
+%    2 0 0 1 0 1 0]';
+
+g=[2 0.00 1.00  0.00 0.00  1  0;        % #vertices,v_1x, v_2x, v_1y, v_2y, subdomain_left, subdomain_right
+   2 1.00 1.00  0.00 0.60  1  0;
+   2 1.00 0.83  0.60 0.60  1  0;
+   2 0.83 0.17  0.60 0.60  1  2;
+   2 0.17 0.00  0.60 0.60  1  0;
+   2 0.00 0.00  0.60 0.00  1  0;
+   2 0.83 0.83  0.60 0.80  2  0;
+   2 0.83 0.17  0.80 0.80  2  0;
+   2 0.17 0.17  0.80 0.60  2  0;
+   2 0.50 0.65  0.15 0.30  1  0;
+   2 0.65 0.50  0.30 0.45  1  0;
+   2 0.50 0.35  0.45 0.30  1  0;
+   2 0.35 0.50  0.30 0.15  1  0
+   ]';
+
+
+[p,e,t] = initmesh(g,'hmax',0.1); 
+%[p,e,t] = initmesh(g,'hmax',0.6); 
+pdemesh(p,e,t)
+% pdemesh(p,e,t,"NodeLabels","on")
+
+%% GH
+% output from <https://de.mathworks.com/help/pde/ug/initmesh.html initmesh>
+%
+% coordinates  p: [2][nnode]
+% connectivity t: [4][nelem]   with  t(4,:) are the subdomain numbers
+% edges        e: [7][nedges]  boundary edges
+%                              e([1,2],:) - start/end vertex of edge
+%                              e([3,4],:) - start/end values
+%                              e(5,:)     - segment number
+%                              e([6,7],:) - left/right subdomain
+
+ascii_write_mesh( p, t, e, mfilename);
+
+
+
+% tmp=t(1:3,:)
+
--- a/ex3/seq/generate_mesh/chip_2materials.txt
+++ b/ex3/seq/generate_mesh/chip_2materials.txt
@ -0,0 +1,442 @@
+143
+2
+235
+3
+0	0
+1	0
+1	0.6
+0.83	0.6
+0.17	0.6
+0	0.6
+0.83	0.8
+0.17	0.8
+0.5	0.15
+0.65	0.3
+0.5	0.45
+0.35	0.3
+0.5	0.6
+0.1	0
+0.2	0
+0.3	0
+0.4	0
+0.5	0
+0.6	0
+0.7	0
+0.8	0
+0.9	0
+1	0.09999999999999999
+1	0.2
+1	0.3
+1	0.4
+1	0.5
+0.915	0.6
+0.7474999999999999	0.6
+0.665	0.6
+0.5825	0.6
+0.08500000000000001	0.6
+0	0.5
+0	0.4
+0	0.3
+0	0.2
+0	0.09999999999999998
+0.83	0.6666666666666666
+0.83	0.7333333333333334
+0.7357142857142857	0.8
+0.6414285714285715	0.8
+0.5471428571428572	0.8
+0.4528571428571429	0.8
+0.3585714285714286	0.8
+0.2642857142857143	0.8
+0.17	0.7333333333333334
+0.17	0.6666666666666667
+0.55	0.2
+0.6	0.25
+0.6	0.35
+0.55	0.4
+0.45	0.4
+0.4	0.35
+0.4	0.25
+0.45	0.2
+0.4175	0.6
+0.335	0.6
+0.2525000000000001	0.6
+0.1830860259018252	0.2575705887159407
+0.8214251519279839	0.3400910341194828
+0.7367356972768339	0.1545851641648327
+0.2747763926099513	0.4393279804701981
+0.3380125422979773	0.1490600318288089
+0.6665431110538135	0.4469555482086437
+0.296713959802076	0.6824750976249487
+0.6823887570446404	0.6988381196783868
+0.4119540982524374	0.7139680155366132
+0.5440762920546971	0.6844981967009994
+0.5488863849623231	0.08065734809935345
+0.4593284290899015	0.5303257942069471
+0.04745577342231752	0.04558193320280449
+0.9559081045112219	0.04706196837962208
+0.05571697787757442	0.5345441974182679
+0.9416966513970901	0.5479893966941878
+0.1309631118646534	0.4236995315406518
+0.8774128939624196	0.1557894847622768
+0.1554747804750385	0.1294581622541261
+0.8486232790977517	0.4679895901706703
+0.6534945367503546	0.09649470230789628
+0.3483977396498218	0.4781934195718026
+0.2483039178930304	0.335106363798257
+0.7530934478381617	0.2592989646850651
+0.7295364126041498	0.3701003238893228
+0.2683233659762636	0.2447136364527485
+0.9143800070052596	0.3450037074593404
+0.08822015826359252	0.2520686350157191
+0.4537059191295885	0.0741756455143039
+0.5413507019951419	0.5280898867902438
+0.7096450191787081	0.527961160509049
+0.6159682017017379	0.6625663124322685
+0.3592388637053299	0.07610221596732396
+0.765108404514371	0.6997816378729095
+0.4717177437761203	0.6663200430010416
+0.4854679354770992	0.7329870341912208
+0.3877365265315627	0.1908548279285711
+0.6099395334451564	0.4066137391340408
+0.2306877192446715	0.6964953882347311
+0.3598161445533513	0.6657912069407611
+0.3382100905575304	0.732458120815991
+0.1406562191310727	0.3365009009737865
+0.8627012941703919	0.2566782703816746
+0.2195359644497203	0.524181436329583
+0.2504057789554006	0.09656440959640891
+0.8609688932847446	0.07036423491723381
+0.0792332959701036	0.1598167751696082
+0.6633419637279835	0.2041432332102987
+0.7388720146622824	0.07177243733659958
+0.7591410509968218	0.4592275943922892
+0.9241864378600493	0.4352795696806243
+0.3282800786487374	0.3684755144155539
+0.6245001642495908	0.5273679175283577
+0.2130990814013797	0.1830560609252357
+0.5860181519062729	0.1635732828628243
+0.6061461985296173	0.729233213750633
+0.8749038211558559	0.5412768202367876
+0.06388571113668819	0.4576480010754793
+0.1589795129567917	0.06144175630803391
+0.9474526566112427	0.1539503712832068
+0.4177070399492428	0.4686211281102174
+0.4145875508081387	0.1396001690291554
+0.5820204934211082	0.4600303136127304
+0.6511979703975962	0.3749576639917713
+0.3372051416899794	0.2181481953330509
+0.3797444568480455	0.5346313255081716
+0.1963130627907262	0.3991956927382116
+0.1304008926063841	0.5145372655094962
+0.8068811529542688	0.1870998775716188
+0.7950607772011259	0.5328892529726835
+0.3014379962113258	0.5295700784399427
+0.936043535437924	0.2249829012752652
+0.06786843512715206	0.3723100832440311
+0.3696426508429986	0.4176130572447811
+0.8035652723323531	0.1069250059276683
+0.2809344756318412	0.1774925274550569
+0.1433346549100507	0.1960253034218661
+0.0900292624482301	0.08257958227890218
+0.8629115739649835	0.3994504619777429
+0.8044908795850402	0.4077054699969835
+0.9235971226529696	0.08805709002620099
+0.9180779212435105	0.4985449668591332
+0.9284411152026488	0.2819128596134682
+0.07377871769355998	0.3152655928916893
+0.1898942752170752	0.4604566556725514
+27	3	74
+47	5	58
+28	4	115
+58	5	102
+29	4	38
+22	2	72
+52	11	119
+50	10	122
+39	7	40
+69	9	87
+124	57	129
+71	14	136
+48	9	113
+104	22	139
+17	18	87
+51	50	96
+18	19	69
+79	20	107
+20	21	107
+21	22	104
+23	24	118
+25	26	85
+85	26	109
+54	12	123
+70	11	88
+74	28	115
+31	13	88
+37	1	71
+32	6	73
+1	14	71
+55	54	95
+14	15	117
+105	37	136
+102	5	126
+73	33	116
+33	34	116
+30	29	66
+31	30	90
+56	13	93
+66	29	92
+29	38	92
+13	31	68
+38	39	92
+45	8	46
+68	42	94
+89	30	111
+68	31	90
+66	41	114
+116	34	131
+43	44	67
+69	19	79
+2	23	72
+19	20	79
+115	4	128
+4	29	128
+67	44	99
+65	45	97
+15	16	103
+16	17	91
+45	46	97
+6	33	73
+57	56	98
+53	52	132
+81	12	110
+12	53	110
+87	9	120
+13	56	70
+30	31	111
+88	11	121
+82	10	106
+72	23	139
+70	56	124
+5	32	126
+9	55	120
+81	59	84
+11	51	121
+82	60	83
+58	57	65
+46	47	97
+40	41	66
+42	43	94
+65	57	98
+44	45	99
+41	42	114
+67	56	93
+49	48	113
+10	49	106
+56	57	124
+57	58	129
+86	36	105
+91	63	103
+118	24	130
+26	27	109
+35	36	86
+36	37	105
+3	28	74
+29	30	89
+86	59	100
+32	73	126
+85	60	101
+107	21	133
+84	59	112
+112	59	135
+96	64	121
+83	60	138
+127	61	133
+106	49	113
+81	62	125
+110	53	132
+80	62	132
+125	62	143
+79	61	106
+82	61	127
+137	78	138
+10	82	83
+103	63	134
+12	81	84
+104	76	133
+109	27	140
+100	81	125
+37	71	136
+95	54	123
+18	69	87
+96	50	122
+13	70	88
+89	64	108
+109	78	137
+30	66	90
+90	66	114
+95	63	120
+17	87	91
+39	40	92
+40	66	92
+43	67	94
+13	68	93
+93	68	94
+67	93	94
+123	84	134
+91	87	120
+83	64	122
+31	88	111
+47	58	97
+58	65	97
+45	65	99
+56	67	98
+98	67	99
+65	98	99
+59	81	100
+34	35	131
+60	82	101
+24	25	130
+116	75	126
+102	62	129
+105	77	135
+16	91	103
+22	72	139
+101	76	130
+117	77	136
+59	86	135
+9	69	113
+61	82	106
+61	79	107
+21	104	133
+64	83	108
+108	78	128
+27	74	140
+60	85	137
+11	70	119
+62	81	110
+64	89	111
+111	88	121
+112	103	134
+77	103	112
+69	79	113
+79	106	113
+42	68	114
+68	90	114
+29	89	128
+115	78	140
+100	75	131
+131	35	142
+103	77	117
+15	103	117
+130	25	141
+118	76	139
+119	70	124
+119	80	132
+63	91	120
+55	95	120
+51	96	121
+64	111	121
+10	83	122
+64	96	122
+12	84	123
+63	95	123
+62	80	129
+80	119	124
+75	100	125
+126	75	143
+62	102	143
+73	116	126
+101	82	127
+76	101	127
+89	108	128
+78	115	128
+58	102	129
+80	124	129
+25	85	141
+76	118	130
+35	86	142
+75	116	131
+62	110	132
+52	119	132
+61	107	133
+76	127	133
+84	112	134
+63	123	134
+86	105	135
+77	112	135
+77	105	136
+14	117	136
+108	83	138
+85	109	137
+78	108	138
+60	137	138
+76	104	139
+23	118	139
+78	109	140
+74	115	140
+85	101	141
+101	130	141
+86	100	142
+100	131	142
+75	125	143
+102	126	143
+59
+1	14
+14	15
+15	16
+16	17
+17	18
+18	19
+19	20
+20	21
+21	22
+22	2
+2	23
+23	24
+24	25
+25	26
+26	27
+27	3
+3	28
+28	4
+4	29
+29	30
+30	31
+31	13
+5	32
+32	6
+6	33
+33	34
+34	35
+35	36
+36	37
+37	1
+4	38
+38	39
+39	7
+7	40
+40	41
+41	42
+42	43
+43	44
+44	45
+45	8
+8	46
+46	47
+47	5
+9	48
+48	49
+49	10
+10	50
+50	51
+51	11
+11	52
+52	53
+53	12
+12	54
+54	55
+55	9
+13	56
+56	57
+57	58
+58	5
--- a/ex3/seq/generate_mesh/save_mesh2_mini.m
+++ b/ex3/seq/generate_mesh/save_mesh2_mini.m
@ -0,0 +1,35 @@
+function save_mesh2_mini( xc, ia, e, basename)
+% Save the 3D triangular mesh in the minimal way (only coordinates and vertex connectivity)
+% in binary file.
+% The indexing in the connectivity is changed to C-style (starts with 0)
+%
+% coordinates  xc: [2][nnode]
+% connectivity ia: [4][nelem]   with  t(4,:) are the subdomain numbers
+% edges         e: [7][nedges]  boundary edges
+%                              e([1,2],:) - start/end vertex of edge
+%                              e([3,4],:) - start/end values
+%                              e(5,:)     - segment number
+%                              e([6,7],:) - left/right subdomain
+%        basename: file name without extension
+% data from
+% output from <https://de.mathworks.com/help/pde/ug/initmesh.html initmesh>
+%
+fname = [basename, '.bin'];
+offset = 1;         % index difference from C to Matlab
+
+nnode = size(xc,2);
+ndim  = size(xc,1);
+nelem = size(ia,2);
+nvert_e = 3;
+
+fileID = fopen(fname,'w');
+fwrite(fileID, nnode, 'int');          % number of nodes
+fwrite(fileID, ndim,  'int');          % space dimension
+fwrite(fileID, nelem, 'int');          % number of elements
+fwrite(fileID, nvert_e, 'int');        % number of vertices per element
+
+fwrite(fileID, xc(:), 'double');       % coordinates
+tmp=ia(1:3,:)-offset;
+fwrite(fileID, tmp(:), 'double');      % connectivity
+
+end
--- a/ex3/seq/generate_mesh/square.m
+++ b/ex3/seq/generate_mesh/square.m
@ -0,0 +1,43 @@
+% Square: 
+%   flatpak run org.octave.Octave <filename>
+%      or
+%   octave --no-window-system --no-gui  -qf <filename>
+
+clear all
+clc
+% %% L-shape
+% g=[2 0 2 0 0 1 0;        % #vertices,v_1x, v_2x, v_1y, v_2y, subdomain_left, subdomain_right
+%    2 2 2 0 1 1 0;
+%    2 2 1 1 0.5 1 0;
+%    2 1 1 0.5 2 1 0;
+%    2 1 0 2 2 1 0;
+%    2 0 0 2 0 1 0]';
+
+%% square
+g=[2 0 1 0 0 1 0;        % #vertices,v_1x, v_2x, v_1y, v_2y, subdomain_left, subdomain_right
+   2 1 1 0 1 1 0;
+   2 1 0 1 1 1 0;
+   2 0 0 1 0 1 0]';
+
+[p,e,t] = initmesh(g,'hmax',0.01); 
+%[p,e,t] = initmesh(g,'hmax',0.6); 
+pdemesh(p,e,t)
+% pdemesh(p,e,t,"NodeLabels","on")
+
+%% GH
+% output from <https://de.mathworks.com/help/pde/ug/initmesh.html initmesh>
+%
+% coordinates  p: [2][nnode]
+% connectivity t: [4][nelem]   with  t(4,:) are the subdomain numbers
+% edges        e: [7][nedges]  boundary edges
+%                              e([1,2],:) - start/end vertex of edge
+%                              e([3,4],:) - start/end values
+%                              e(5,:)     - segment number
+%                              e([6,7],:) - left/right subdomain
+
+ascii_write_mesh( p, t, e, mfilename);
+
+
+
+% tmp=t(1:3,:)
+
--- a/ex3/seq/generate_mesh/square.txt
+++ b/ex3/seq/generate_mesh/square.txt
@ -0,0 +1,30 @@
+9
+2
+8
+3
+0	0
+1	0
+1	1
+0	1
+0.5	0
+1	0.5
+0.5	1
+0	0.5
+0.5	0.5
+8	1	9
+5	2	9
+6	3	9
+7	4	9
+1	5	9
+2	6	9
+3	7	9
+4	8	9
+8
+1	5
+5	2
+2	6
+6	3
+3	7
+7	4
+4	8
+8	1
--- a/ex3/seq/generate_mesh/square_06.jpg
+++ b/ex3/seq/generate_mesh/square_06.jpg
--- a/ex3/seq/generate_mesh/square_06_0.txt
+++ b/ex3/seq/generate_mesh/square_06_0.txt
@ -0,0 +1,30 @@
+9
+2
+8
+3
+0	0
+1	0
+1	1
+0	1
+0.5	0
+1	0.5
+0.5	1
+0	0.5
+0.5	0.5
+8	1	9
+5	2	9
+6	3	9
+7	4	9
+1	5	9
+2	6	9
+3	7	9
+4	8	9
+8
+1	5
+5	2
+2	6
+6	3
+3	7
+7	4
+4	8
+8	1
--- a/ex3/seq/generate_mesh/visualize_results.m
+++ b/ex3/seq/generate_mesh/visualize_results.m
@ -0,0 +1,20 @@
+%% Visualize results
+%
+%   flatpak run org.octave.Octave <filename>
+%      or
+%   octave --no-window-system --no-gui  -qf <filename>
+%
+%      or
+%   matlab -nosplash <   <filename>
+
+clear all
+clc
+
+%%
+fname = 'uv.txt';
+
+[xc,ia,v] = ascii_read_meshvector(fname);
+
+h = trisurf(ia, xc(:,1), xc(:,2), v);
+
+waitfor(h)                     % wait for closing the figure
--- a/ex3/seq/jacobi_oo_stl/Doxyfile
+++ b/ex3/seq/jacobi_oo_stl/Doxyfile
--- a/ex3/seq/jacobi_oo_stl/Makefile
+++ b/ex3/seq/jacobi_oo_stl/Makefile
@ -0,0 +1,54 @@
+#
+# use GNU-Compiler tools
+COMPILER=GCC_
+# COMPILER=GCC_SEQ_
+# alternatively from the shell
+# export COMPILER=GCC_
+# or, alternatively from the shell
+# make COMPILER=GCC_
+
+MAIN = main
+SOURCES = ${MAIN}.cpp vdop.cpp geom.cpp\
+	getmatrix.cpp jacsolve.cpp userset.cpp
+# 	dexx.cpp debugd.cpp skalar.cpp  vecaccu.cpp accudiag.cpp
+
+OBJECTS = $(SOURCES:.cpp=.o)
+
+PROGRAM	= ${MAIN}.${COMPILER}
+
+# uncomment the next to lines for debugging and detailed performance analysis
+CXXFLAGS += -g
+# -pg slows down the code on my laptop when using CLANG_
+#LINKFLAGS += -pg
+#CXXFLAGS += -Q --help=optimizers
+#CXXFLAGS += -fopt-info
+
+include ../${COMPILER}default.mk
+
+#############################################################################
+# additional specific cleaning in this directory
+clean_all::
+	@rm -f t.dat*
+
+
+#############################################################################
+# special testing
+# NPROCS	= 4
+#
+TFILE	= t.dat
+# TTMP	= t.tmp
+#
+graph: $(PROGRAM)
+# 	@rm -f $(TFILE).*
+	# next two lines only sequentially
+	./$(PROGRAM)
+	@mv  $(TFILE).000 $(TFILE)
+# 	$(MPIRUN) $(MPIFLAGS) -np $(NPROCS) $(PROGRAM)
+# 	@echo " "; echo "Manipulate data for graphics."; echo " "
+# 	@cat $(TFILE).* > $(TTMP)
+# 	@sort -b -k 2    $(TTMP)   -o $(TTMP).1
+# 	@sort -b -k 1    $(TTMP).1 -o $(TTMP).2
+# 	@awk  -f nl.awk  $(TTMP).2  > $(TFILE)
+# 	@rm -f $(TTMP).* $(TTMP) $(TFILE).*
+#
+	-gnuplot jac.dem
--- a/ex3/seq/jacobi_oo_stl/ToDo.txt
+++ b/ex3/seq/jacobi_oo_stl/ToDo.txt
@ -0,0 +1,5 @@
+// Jan 15, 2019
+
+geom.h:75        void SetValues(std::vector<double> &v) const;  // GH: TODO with functor
+Set vector values using a functor ff(x,y).
+ See solution in Progs/cds
--- a/ex3/seq/jacobi_oo_stl/ascii_read_meshvector.m
+++ b/ex3/seq/jacobi_oo_stl/ascii_read_meshvector.m
@ -0,0 +1,43 @@
+function [ xc, ia, v ] = ascii_read_meshvector( fname )
+%
+% Loads the 2D triangular mesh (coordinates, vertex connectivity) 
+%  together with values on its vertices from an ASCII file.
+%   Matlab indexing is stored  (starts with 1).
+% 
+% The input file format is compatible 
+%  with Mesh_2d_3_matlab:Write_ascii_matlab(..) in jacobi_oo_stl/geom.h
+%
+%
+%  IN: fname - filename
+% OUT: xc    - coordinates
+%      ia    - mesh connectivity
+%      v     - solution vector
+
+DELIMETER = ' ';
+
+fprintf('Read file  %s\n',fname)
+
+% Read mesh constants
+nn = dlmread(fname,DELIMETER,[0 0 0 3]);  %% row_1, col_1, row_2, col_2  in C indexing!!!
+nnode = nn(1);
+ndim  = nn(2);
+nelem = nn(3);
+nvert = nn(4);
+
+% Read coordinates
+row_start = 0+1;
+row_end   = 0+nnode;
+xc = dlmread(fname,DELIMETER,[row_start 0 row_end ndim-1]);
+
+% Read connectivity
+row_start = row_end+1;
+row_end   = row_end+nelem;
+ia = dlmread(fname,DELIMETER,[row_start 0 row_end nvert-1]);
+
+% Read solution
+row_start = row_end+1;
+row_end   = row_end+nnode;
+v = dlmread(fname,DELIMETER,[row_start 0 row_end 0]);
+end
+
+
--- a/ex3/seq/jacobi_oo_stl/ascii_write_mesh.m
+++ b/ex3/seq/jacobi_oo_stl/ascii_write_mesh.m
@ -0,0 +1,49 @@
+function ascii_write_mesh( xc, ia, e, basename)
+%
+% Saves the 2D triangular mesh in the minimal way (only coordinates, vertex connectivity, minimal boundary edge info)
+%  in an ASCII file.
+%  Matlab indexing is stored  (starts with 1).
+% 
+% The output file format is compatible with Mesh_2d_3_matlab:Mesh_2d_3_matlab(std::string const &fname) in jacobi_oo_stl/geom.h
+%
+% IN:
+% coordinates  xc: [2][nnode]
+% connectivity ia: [4][nelem]   with  t(4,:) are the subdomain numbers
+% edges         e: [7][nedges]  boundary edges
+%                              e([1,2],:) - start/end vertex of edge
+%                              e([3,4],:) - start/end values
+%                              e(5,:)     - segment number
+%                              e([6,7],:) - left/right subdomain
+%        basename: file name without extension
+% 
+% Data have been generated via <https://de.mathworks.com/help/pde/ug/initmesh.html initmesh>.
+%
+fname = [basename, '.txt'];
+
+nnode = int32(size(xc,2));
+ndim  = int32(size(xc,1));
+nelem = int32(size(ia,2));
+nvert_e = int32(3);
+
+
+dlmwrite(fname,nnode,'delimiter','\t','precision',16)                 % number of nodes
+dlmwrite(fname,ndim,'-append','delimiter','\t','precision',16)        % space dimension
+dlmwrite(fname,nelem,'-append','delimiter','\t','precision',16)       % number of elements
+dlmwrite(fname,nvert_e,'-append','delimiter','\t','precision',16)     % number of vertices per element
+
+% dlmwrite(fname,xc(:),'-append','delimiter','\t','precision',16)       % coordinates
+dlmwrite(fname,xc([1,2],:).','-append','delimiter','\t','precision',16) % coordinates
+
+% no subdomain info transferred
+tmp=int32(ia(1:3,:));
+% dlmwrite(fname,tmp(:),'-append','delimiter','\t','precision',16)      % connectivity in Matlab indexing
+dlmwrite(fname,tmp(:,:).','-append','delimiter','\t','precision',16)    % connectivity in Matlab indexing
+
+% store only start and end point of boundary edges,
+nbedges = size(e,2);
+dlmwrite(fname,nbedges,'-append','delimiter','\t','precision',16)     % number boundary edges
+tmp=int32(e(1:2,:));
+% dlmwrite(fname,tmp(:),'-append','delimiter','\t','precision',16)    % boundary edges in Matlab indexing
+dlmwrite(fname,tmp(:,:).','-append','delimiter','\t','precision',16)  % boundary edges in Matlab indexing
+
+end
--- a/ex3/seq/jacobi_oo_stl/compile.log
+++ b/ex3/seq/jacobi_oo_stl/compile.log
@ -0,0 +1 @@
+
--- a/ex3/seq/jacobi_oo_stl/geom.cpp
+++ b/ex3/seq/jacobi_oo_stl/geom.cpp
@ -0,0 +1,522 @@
+// see:   http://llvm.org/docs/CodingStandards.html#include-style
+#include "geom.h"
+
+#include <algorithm>
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <list>
+#include <string>
+#include <vector>
+using namespace std;
+
+Mesh::Mesh(int ndim, int nvert_e, int ndof_e)
+    : _nelem(0), _nvert_e(nvert_e), _ndof_e(ndof_e), _nnode(0), _ndim(ndim), _ia(0), _xc(0)
+{
+}
+
+Mesh::~Mesh()
+{}
+
+void Mesh::SetValues(std::vector<double> &v, const std::function<double(double, double)> &func) const
+{
+    int const nnode = Nnodes();            // number of vertices in mesh
+    assert( nnode == static_cast<int>(v.size()) );
+    for (int k = 0; k < nnode; ++k)
+    {
+        v[k] = func( _xc[2 * k], _xc[2 * k + 1] );
+    }
+}
+
+
+void Mesh::Debug() const
+{
+    cout << "\n ############### Debug  M E S H  ###################\n";
+    cout << "\n ...............    Coordinates       ...................\n";
+    for (int k = 0; k < _nnode; ++k)
+    {
+        cout << k << " : " ;
+        for (int i = 0; i < _ndof_e; ++i )
+        {
+            cout << _xc[2*k+i] << "  ";
+        }
+        cout << endl;
+    }
+    cout << "\n ...............    Elements        ...................\n";
+    for (int k = 0; k < _nelem; ++k)
+    {
+        cout << k << " : ";
+        for (int i = 0; i < _ndof_e; ++i )
+            cout << _ia[_ndof_e * k + i] << "  ";
+        cout << endl;
+    }
+    return;
+}
+
+void Mesh::Write_ascii_matlab(std::string const &fname, std::vector<double> const &v) const
+{
+    assert(Nnodes() ==  static_cast<int>(v.size()));  // fits vector length to mesh information?
+
+    ofstream fout(fname);                             // open file ASCII mode
+    if ( !fout.is_open() )
+    {
+        cout << "\nFile " << fname << " has not been opened.\n\n" ;
+        assert( fout.is_open() && "File not opened."  );
+    }
+
+    string const DELIMETER(" ");    // define the same delimeter as in matlab/ascii_read*.m
+    int const    OFFSET(1);         // convert C-indexing to matlab
+
+    // Write data: #nodes, #space dimensions, #elements, #vertices per element
+    fout << Nnodes() << DELIMETER << Ndims() << DELIMETER << Nelems() << DELIMETER << NverticesElements() << endl;
+
+    // Write cordinates: x_k, y_k   in seperate lines
+    assert( Nnodes()*Ndims() ==  static_cast<int>(_xc.size()));
+    for (int k = 0, kj = 0; k < Nnodes(); ++k)
+    {
+        for (int j = 0; j < Ndims(); ++j, ++kj)
+        {
+            fout << _xc[kj] << DELIMETER;
+        }
+        fout << endl;
+    }
+
+    // Write connectivity: ia_k,0, ia_k,1 etc  in seperate lines
+    assert( Nelems()*NverticesElements() ==  static_cast<int>(_ia.size()));
+    for (int k = 0, kj = 0; k < Nelems(); ++k)
+    {
+        for (int j = 0; j < NverticesElements(); ++j, ++kj)
+        {
+            fout << _ia[kj] + OFFSET << DELIMETER;     // C to matlab
+        }
+        fout << endl;
+    }
+
+    // Write vector
+    for (int k = 0; k < Nnodes(); ++k)
+    {
+        fout << v[k] << endl;
+    }
+
+    fout.close();
+    return;
+}
+
+
+void Mesh::Visualize(std::vector<double> const &v) const
+{
+    // define external command
+    const string exec_m("matlab -nosplash < visualize_results.m");                 // Matlab
+    //const string exec_m("octave --no-window-system --no-gui visualize_results.m"); // Octave
+    //const string exec_m("flatpak run org.octave.Octave visualize_results.m");      // Octave (flatpak): desktop GH
+
+    const string fname("uv.txt");
+    Write_ascii_matlab(fname, v);
+
+    int ierror = system(exec_m.c_str());                                 // call external command
+
+    if (ierror != 0)
+    {
+        cout << endl << "Check path to Matlab/octave on your system" << endl;
+    }
+    cout << endl;
+    return;
+}
+
+
+
+
+// #####################################################################
+Mesh_2d_3_square::Mesh_2d_3_square(int nx, int ny, int myid, int procx, int procy)
+    : Mesh(2, 3, 3), // two dimensions, 3 vertices, 3 dofs
+      _myid(myid), _procx(procx), _procy(procy), _neigh{{-1, -1, -1, -1}}, _color(0),
+_xl(0.0), _xr(1.0), _yb(0.0), _yt(1.0), _nx(nx), _ny(ny)
+{
+    //void IniGeom(int const myid, int const procx, int const procy, int neigh[], int &color)
+    int const ky = _myid / _procx;
+    int const kx = _myid % _procy;  //    MOD(myid,procx)
+    // Determine the neighbors of domain/rank myid
+    _neigh[0] = (ky == 0)       ?  -1 : _myid - _procx;    //   South
+    _neigh[1] = (kx == _procx - 1) ?  -1 : _myid + 1;      //   East
+    _neigh[2] = (ky == _procy - 1) ?  -1 : _myid + _procx;  //   North
+    _neigh[3] = (kx == 0)       ?  -1 : _myid - 1;        //   West
+
+    _color = (kx + ky) & 1 ;
+
+    // subdomain is part of unit square
+    double const hx = 1. / _procx;
+    double const hy = 1. / _procy;
+    _xl = kx * hx;                      //  left
+    _xr = (kx + 1) * hx;                //  right
+    _yb = ky * hy;                      //  bottom
+    _yt = (ky + 1) * hy;                //  top
+
+    // Calculate coordinates
+    int const nnode = (_nx + 1) * (_ny + 1); // number of nodes
+    Resize_Coords(nnode, 2);                 // coordinates in 2D [nnode][ndim]
+    GetCoordsInRectangle(_nx, _ny, _xl, _xr, _yb, _yt, GetCoords().data());
+
+    // Calculate element connectivity (linear triangles)
+    int const nelem = 2 * _nx * _ny;         // number of elements
+    Resize_Connectivity(nelem, 3);           // connectivity matrix [nelem][3]
+    GetConnectivityInRectangle(_nx, _ny, GetConnectivity().data());
+
+    return;
+}
+
+
+void Mesh_2d_3_square::SetU(std::vector<double> &u) const
+{
+    int dx    = _nx + 1;
+    for (int j = 0; j <= _ny; ++j)
+    {
+        int k = j * dx;
+        for (int i = 0; i <= _nx; ++i, ++k)
+        {
+            u[k] = 0.0;
+        }
+    }
+
+}
+
+void Mesh_2d_3_square::SetF(std::vector<double> &f) const
+{
+    int dx    = _nx + 1;
+    for (int j = 0; j <= _ny; ++j)
+    {
+        int k = j * dx;
+        for (int i = 0; i <= _nx; ++i, ++k)
+        {
+            f[k] = 1.0;
+        }
+    }
+
+}
+
+
+std::vector<int> Mesh_2d_3_square::Index_DirichletNodes() const
+{
+    int const dx = 1,
+              dy = _nx + 1,
+              bl  = 0,
+              br  = _nx,
+              tl  = _ny * (_nx + 1),
+              tr  = (_ny + 1) * (_nx + 1) - 1;
+    int const start[4] = { bl, br, tl, bl},
+                         end[4] = { br, tr, tr, tl},
+                                  step[4] = { dx, dy, dx, dy};
+
+    vector<int> idx(0);
+    for (int j = 0; j < 4; j++)
+    {
+        if (_neigh[j] < 0)
+        {
+            for (int i = start[j]; i <= end[j]; i += step[j])
+            {
+                idx.push_back(i);        // node i is Dirichlet node
+            }
+        }
+    }
+    //    remove multiple elements
+    sort(idx.begin(), idx.end());                          // sort
+    idx.erase( unique(idx.begin(), idx.end()), idx.end() ); // remove duplicate data
+
+    return idx;
+}
+
+void Mesh_2d_3_square::SaveVectorP(std::string const &name, vector<double> const &u) const
+{
+//  construct the file name for subdomain myid
+    const string tmp( std::to_string(_myid / 100) + to_string((_myid % 100) / 10) + to_string(_myid % 10) );
+
+    const string namep = name + "." + tmp;
+    ofstream ff(namep.c_str());
+    ff.precision(6);
+    ff.setf(ios::fixed, ios::floatfield);
+
+    // assumes tensor product grid in unit square; rowise numbered (as generated in class constructor)
+    // output is provided for tensor product grid visualization ( similar to Matlab-surf() )
+    auto const &xc = GetCoords();
+    int k = 0;
+    for (int j = 0; j <= _ny; ++j)
+    {
+        for (int i = 0; i <= _nx; ++i, ++k)
+            ff << xc[2 * k + 0] << "   " << xc[2 * k + 1] << "   " << u[k] << endl;
+        ff << endl;
+    }
+
+    ff.close();
+    return;
+}
+
+void Mesh_2d_3_square::GetCoordsInRectangle(int const nx, int const ny,
+        double const xl, double const xr, double const yb, double const yt,
+        double xc[])
+{
+    const double hx = (xr - xl) / nx,
+                 hy = (yt - yb) / ny;
+
+    int k = 0;
+    for (int j = 0; j <= ny; ++j)
+    {
+        const double y0 = yb + j * hy;
+        for (int i = 0; i <= nx; ++i, k += 2)
+        {
+            xc[k  ] = xl + i * hx;
+            xc[k + 1] = y0;
+        }
+    }
+
+    return;
+}
+
+void Mesh_2d_3_square::GetConnectivityInRectangle(int const nx, int const ny, int ia[])
+{
+    const int dx = nx + 1;
+    int k  = 0;
+    int l  = 0;
+    for (int j = 0; j < ny; ++j, ++k)
+    {
+        for (int i = 0; i < nx; ++i, ++k)
+        {
+            ia[l  ] = k;
+            ia[l + 1] = k + 1;
+            ia[l + 2] = k + dx + 1;
+            l += 3;
+            ia[l  ] = k;
+            ia[l + 1] = k + dx;
+            ia[l + 2] = k + dx + 1;
+            l += 3;
+        }
+    }
+    return;
+}
+
+// #################### still some old code (--> MPI) ############################
+
+
+//  Copies the values of w corresponding to the boundary
+//  South (ib==1), East (ib==2), North (ib==3), West (ib==4)
+
+void GetBound(int const ib, int const nx, int const ny, double const w[], double s[])
+{
+    const int //dx = 1,
+    dy = nx + 1,
+    bl  = 0,
+    br  = nx,
+    tl  = ny * (nx + 1),
+    tr  = (ny + 1) * (nx + 1) - 1;
+    switch (ib)
+    {
+        case 1:
+        {
+            for (int i = bl, j = 0; i <= br; ++i, ++j)
+                s[j] = w[i];
+            break;
+        }
+        case 3:
+        {
+            for (int i = tl, j = 0; i <= tr; ++i, ++j)
+                s[j] = w[i];
+            break;
+        }
+        case 4:
+        {
+            for (int i = bl, j = 0; i <= tl; i += dy, ++j)
+                s[j] = w[i];
+            break;
+        }
+        case 2:
+        {
+            for (int i = br, j = 0; i <= tr; i += dy, ++j)
+                s[j] = w[i];
+            break;
+        }
+        default:
+        {
+            cout << endl << "Wrong parameter  ib in " << __FILE__ << ":" << __LINE__ << endl;
+        }
+    }
+    return;
+}
+
+// ----------------------------------------------------------------------------------------------------------
+// Computes w:  = w + s at nodes on  the boundary
+// South (ib == 1), East (ib == 2), North (ib == 3), West (ib == 4)
+
+void AddBound(int const ib, int const nx, int const ny, double w[], double const s[])
+{
+    int const dy = nx + 1,
+              bl  = 0,
+              br  = nx,
+              tl  = ny * (nx + 1),
+              tr  = (ny + 1) * (nx + 1) - 1;
+    switch (ib)
+    {
+        case 1:
+        {
+            for (int i = bl, j = 0; i <= br; ++i, ++j)
+                w[i] += s[j];
+            break;
+        }
+        case 3:
+        {
+            for (int i = tl, j = 0; i <= tr; ++i, ++j)
+                w[i] += s[j];
+            break;
+        }
+        case 4:
+        {
+            for (int i = bl, j = 0; i <= tl; i += dy, ++j)
+                w[i] += s[j];
+            break;
+        }
+        case 2:
+        {
+            for (int i = br, j = 0; i <= tr; i += dy, ++j)
+                w[i] += s[j];
+            break;
+        }
+        default:
+        {
+            cout << endl << "Wrong parameter  ib in " << __FILE__ << ":" << __LINE__ << endl;
+        }
+    }
+    return;
+}
+
+
+// ####################################################################
+
+Mesh_2d_3_matlab::Mesh_2d_3_matlab(string const &fname)
+    : Mesh(2, 3, 3),  // two dimensions, 3 vertices, 3 dofs
+      bedges(0)
+{
+    ifstream ifs(fname);
+    if (!(ifs.is_open() && ifs.good()))
+    {
+        cerr << "Mesh_2d_3_matlab: Error cannot open file " << fname << endl;
+        assert(ifs.is_open());
+    }
+
+    int const OFFSET(1);             // Matlab to C indexing
+    cout << "ASCI file  " << fname << "  opened" << endl;
+
+    // Read some mesh constants
+    int nnode, ndim, nelem, nvert_e;
+    ifs >> nnode >> ndim >> nelem >> nvert_e;
+    cout << nnode << "  " << ndim << "  " << nelem << "  " << nvert_e << endl;
+    assert(ndim == 2 && nvert_e == 3);
+
+    // Allocate memory
+    Resize_Coords(nnode, ndim);                 // coordinates in 2D [nnode][ndim]
+    Resize_Connectivity(nelem, nvert_e);        // connectivity matrix [nelem][nvert]
+
+    // Read ccordinates
+    auto &xc = GetCoords();
+    for (int k = 0; k < nnode * ndim; ++k)
+    {
+        ifs >> xc[k];
+    }
+
+    // Read connectivity
+    auto &ia = GetConnectivity();
+    for (int k = 0; k < nelem * nvert_e; ++k)
+    {
+        ifs >> ia[k];
+        ia[k] -= OFFSET;                // Matlab to C indexing
+    }
+
+    // additional read of boundary information (only start/end point)
+    int nbedges;
+    ifs >> nbedges;
+
+    bedges.resize(nbedges * 2);
+    for (int k = 0; k < nbedges * 2; ++k)
+    {
+        ifs >> bedges[k];
+        bedges[k] -= OFFSET;            // Matlab to C indexing
+    }
+
+    return;
+}
+
+// binary
+//{
+//ifstream ifs(fname, ios_base::in | ios_base::binary);
+//if(!(ifs.is_open() && ifs.good()))
+//{
+//cerr << "ReadBinMatrix: Error cannot open file " << file << endl;
+//assert(ifs.is_open());
+//}
+//cout << "ReadBinMatrix: file opened" << file << endl;
+
+
+//}
+
+// binaryIO.cpp
+//void read_binMatrix(const string& file, vector<int> &cnt, vector<int> &col, vector<double> &ele)
+//{
+
+//ifstream ifs(file, ios_base::in | ios_base::binary);
+
+//if(!(ifs.is_open() && ifs.good()))
+//{
+//cerr << "ReadBinMatrix: Error cannot open file " << file << endl;
+//assert(ifs.is_open());
+//}
+//cout << "ReadBinMatrix: Opened file " << file << endl;
+
+//int _size;
+
+//ifs.read(reinterpret_cast<char*>(&_size), sizeof(int));   // old: ifs.read((char*)&_size, sizeof(int));
+//cnt.resize(_size);
+//cout << "ReadBinMatrix: cnt size: " << _size << endl;
+
+//ifs.read((char*)&_size, sizeof(int));
+//col.resize(_size);
+//cout << "ReadBinMatrix: col size: " << _size << endl;
+
+//ifs.read((char*)&_size, sizeof(int));
+//ele.resize(_size);
+//cout << "ReadBinMatrix: ele size: " << _size << endl;
+
+
+//ifs.read((char*)cnt.data(), cnt.size() * sizeof(int));
+//ifs.read((char*)col.data(), col.size() * sizeof(int));
+//ifs.read((char*)ele.data(), ele.size() * sizeof(double));
+
+//ifs.close();
+//cout << "ReadBinMatrix: Finished reading matrix.." << endl;
+
+//}
+
+
+std::vector<int> Mesh_2d_3_matlab::Index_DirichletNodes() const
+{
+    vector<int> idx(bedges);                              // copy
+
+    sort(idx.begin(), idx.end());                         // sort
+    idx.erase( unique(idx.begin(), idx.end()), idx.end() ); // remove duplicate data
+
+    return idx;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/ex3/seq/jacobi_oo_stl/geom.h
+++ b/ex3/seq/jacobi_oo_stl/geom.h
@ -0,0 +1,381 @@
+#ifndef GEOM_FILE
+#define GEOM_FILE
+#include <array>
+#include <functional>             // function; C++11
+#include <string>
+#include <vector>
+
+/**
+ * Basis class for finite element meshes.
+ */
+class Mesh
+{
+public:
+    /**
+     * Constructor initializing the members with default values.
+      *
+     * @param[in] ndim  space dimensions (dimension for coordinates)
+      * @param[in] nvert_e  number of vertices per element (dimension for connectivity)
+      * @param[in] ndof_e   degrees of freedom per element (= @p nvert_e for linear elements)
+     */
+    explicit Mesh(int ndim, int nvert_e = 0, int ndof_e = 0);
+
+    /**
+     * Destructor.
+     *
+     * See clang warning on
+     * <a href="https://stackoverflow.com/questions/28786473/clang-no-out-of-line-virtual-method-definitions-pure-abstract-c-class/40550578">weak-vtables</a>.
+     */
+    virtual ~Mesh();
+
+    /**
+     * Number of finite elements in (sub)domain.
+     * @return number of elements.
+     */
+    int Nelems() const
+    {
+        return _nelem;
+    }
+
+    /**
+     * Global number of vertices for each finite element.
+     * @return number of vertices per element.
+     */
+    int NverticesElements() const
+    {
+        return _nvert_e;
+    }
+
+    /**
+     * Global number of degrees of freedom (dof) for each finite element.
+     * @return degrees of freedom per element.
+     */
+    int NdofsElement() const
+    {
+        return _ndof_e;
+    }
+
+    /**
+     * Number of vertices in mesh.
+     * @return number of vertices.
+     */
+    int Nnodes() const
+    {
+        return _nnode;
+    }
+
+    /**
+     * Space dimension.
+     * @return number of dimensions.
+     */
+    int Ndims() const
+    {
+        return _ndim;
+    }
+
+    /**
+     * (Re-)Allocates memory for the element connectivity and redefines the appropriate dimensions.
+      *
+     * @param[in] nelem    number of elements
+      * @param[in] nvert_e  number of vertices per element
+     */
+    void Resize_Connectivity(int nelem, int nvert_e)
+    {
+        SetNelem(nelem);               // number of elements
+        SetNverticesElement(nvert_e);  // vertices per element
+        _ia.resize(nelem * nvert_e);
+    }
+
+    /**
+     * Read connectivity information (g1,g2,g3)_i.
+     * @return convectivity vector [nelems*ndofs].
+     */
+    const std::vector<int>  &GetConnectivity() const
+    {
+        return _ia;
+    }
+
+    /**
+     * Access/Change connectivity information (g1,g2,g3)_i.
+     * @return convectivity vector [nelems*ndofs].
+     */
+    std::vector<int>  &GetConnectivity()
+    {
+        return _ia;
+    }
+
+    /**
+     * (Re-)Allocates memory for the element connectivity and redefines the appropriate dimensions.
+      *
+     * @param[in] nnodes    number of nodes
+      * @param[in] ndim      space dimension
+     */
+    void Resize_Coords(int nnodes, int ndim)
+    {
+        SetNnode(nnodes);       // number of nodes
+        SetNdim(ndim);          // space dimension
+        _xc.resize(nnodes * ndim);
+    }
+
+    /**
+     * Read coordinates of vertices (x,y)_i.
+     * @return coordinates vector [nnodes*2].
+     */
+    const std::vector<double> &GetCoords() const
+    {
+        return _xc;
+    }
+
+    /**
+     * Access/Change coordinates of vertices (x,y)_i.
+     * @return coordinates vector [nnodes*2].
+     */
+    std::vector<double> &GetCoords()
+    {
+        return _xc;
+    }
+
+    /**
+     * Calculate values in vector @p v via function @p func(x,y)
+     * @param[in] v     vector
+      * @param[in] func  function of (x,y) returning a double value.
+     */
+    void SetValues(std::vector<double> &v, const std::function<double(double, double)> &func) const;
+
+    /**
+     * Prints the information for a finite element mesh
+     */
+    void Debug() const;
+
+    /**
+     * Determines the indices of those vertices with Dirichlet boundary conditions
+     * @return index vector.
+     */
+    virtual std::vector<int> Index_DirichletNodes() const = 0;
+
+    /**
+     * Write vector @p v toghether with its mesh information to an ASCii file @p fname.
+      *
+      * The data are written in C-style.
+      *
+      * @param[in] fname  file name
+      * @param[in] v      vector
+     */
+    void Write_ascii_matlab(std::string const &fname, std::vector<double> const &v) const;
+
+    /**
+     * Visualize @p v together with its mesh information via matlab or octave.
+      *
+      * Comment/uncomment those code lines in method Mesh:Visualize (geom.cpp)
+      * that are supported on your system.
+      *
+      * @param[in] v    vector
+      *
+      * @warning matlab files ascii_read_meshvector.m  visualize_results.m
+      *          must be in the executing directory.
+     */
+    void Visualize(std::vector<double> const &v) const;
+
+
+protected:
+    void SetNelem(int nelem)
+    {
+        _nelem = nelem;
+    }
+
+    void SetNverticesElement(int nvert)
+    {
+        _nvert_e = nvert;
+    }
+
+    void SetNdofsElement(int ndof)
+    {
+        _ndof_e = ndof;
+    }
+
+    void SetNnode(int nnode)
+    {
+        _nnode = nnode;
+    }
+
+    void SetNdim(int ndim)
+    {
+        _ndim = ndim;
+    }
+
+private:
+    int _nelem;         //!< number elements
+    int _nvert_e;       //!< number of vertices per element
+    int _ndof_e;        //!< degrees of freedom (d.o.f.) per element
+    int _nnode;         //!< number nodes/vertices
+    int _ndim;          //!< space dimension of the problem (1, 2, or 3)
+    std::vector<int> _ia;    //!< element connectivity
+    std::vector<double> _xc; //!< coordinates
+};
+
+/**
+ * 2D finite element mesh of the square consiting of linear triangular elements.
+ */
+class Mesh_2d_3_square: public Mesh
+{
+public:
+    /**
+     * Generates the f.e. mesh for the unit square.
+     *
+     * @param[in] nx    number of discretization intervals in x-direction
+     * @param[in] ny    number of discretization intervals in y-direction
+     * @param[in] myid  my MPI-rank / subdomain
+     * @param[in] procx number of ranks/subdomains in x-direction
+     * @param[in] procy number of processes in y-direction
+    */
+    Mesh_2d_3_square(int nx, int ny, int myid = 0, int procx = 1, int procy = 1);
+
+    /**
+     * Destructor
+     */
+    ~Mesh_2d_3_square() override
+    {}
+
+    /**
+     * Set solution vector based on a tensor product grid in the rectangle.
+     * @param[in] u solution vector
+     */
+    void SetU(std::vector<double> &u) const;
+
+    /**
+     * Set right hand side (rhs) vector on a tensor product grid in the rectangle.
+     * @param[in] f rhs vector
+     */
+    void SetF(std::vector<double> &f) const;
+
+    /**
+     * Determines the indices of those vertices with Dirichlet boundary conditions
+     * @return index vector.
+     */
+    std::vector<int> Index_DirichletNodes() const override;
+
+    /**
+      * Stores the values of vector @p u of (sub)domain into a file @p name for further processing in gnuplot.
+      * The file stores rowise the x- and y- coordinates together with the value from  @p u .
+      * The domain [@p xl, @p xr] x [@p yb, @p yt] is discretized into @p nx x @p ny intervals.
+      *
+      * @param[in] name basename of file name (file name will be extended by the rank number)
+      * @param[in] u    local vector
+      *
+      * @warning   Assumes tensor product grid in unit square; rowise numbered
+      *            (as generated in class constructor).
+      *            The output is provided for tensor product grid visualization
+      *            ( similar to Matlab-surf() ).
+      *
+      * @see Mesh_2d_3_square
+      */
+    void SaveVectorP(std::string const &name, std::vector<double> const &u) const;
+
+    // here will still need to implement in the class
+    //  GetBound(), AddBound()
+    //  or better a generalized way with indices and their appropriate ranks for MPI communication
+
+private:
+    /**
+      * Determines the coordinates of the dicretization nodes of the domain [@p xl, @p xr] x [@p yb, @p yt]
+      * which is discretized into @p nx x @p ny intervals.
+      *
+      * @param[in] ny   number of discretization intervals in y-direction
+      * @param[in] xl   x-coordinate of left boundary
+      * @param[in] xr   x-coordinate of right boundary
+      * @param[in] yb   y-coordinate of lower boundary
+      * @param[in] yt   y-coordinate of upper boundary
+      * @param[out] xc  coordinate vector of length 2n with x(2*k,2*k+1) as coodinates of node k
+      */
+
+    void GetCoordsInRectangle(int nx, int ny, double xl, double xr, double yb, double yt,
+                              double xc[]);
+    /**
+      * Determines the element connectivity of linear triangular elements of a FEM discretization
+      * of a rectangle using @p nx x @p ny equidistant intervals for discretization.
+      * @param[in] nx   number of discretization intervals in x-direction
+      * @param[in] ny   number of discretization intervals in y-direction
+      * @param[out] ia  element connectivity matrix with ia(3*s,3*s+1,3*s+2) as node numbers od element s
+      */
+    void GetConnectivityInRectangle(int nx, int ny, int ia[]);
+
+private:
+    int _myid;          //!< my MPI rank
+    int _procx;         //!< number of MPI ranks in x-direction
+    int _procy;         //!< number of MPI ranks in y-direction
+    std::array<int, 4> _neigh; //!< MPI ranks of neighbors (negative: no neighbor but b.c.)
+    int _color;         //!< red/black coloring (checker board) of subdomains
+
+    double _xl;         //!< x coordinate of lower left  corner of square
+    double _xr;         //!< x coordinate of lower right corner of square
+    double _yb;         //!< y coordinate or lower left  corner of square
+    double _yt;         //!< y coordinate of upper right corner of square
+    int    _nx;         //!< number of intervals in x-direction
+    int    _ny;         //!< number of intervals in y-direction
+};
+
+// #################### still some old code (--> MPI) ############################
+/**
+ * Copies the values of @p w corresponding to boundary @p ib
+ * onto vector s.  South (ib==1), East (ib==2), North (ib==3), West (ib==4).
+ * The vector @p s has to be long enough!!
+ * @param[in] ib    my local boundary
+ * @param[in] nx    number of discretization intervals in x-direction
+ * @param[in] ny    number of discretization intervals in y-direction
+ * @param[in] w     vector for all nodes of local discretization
+ * @param[out] s    short vector with values on boundary @p ib
+*/
+// GH_NOTE: Absicherung bei s !!
+void GetBound(int ib, int nx, int ny, double const w[], double s[]);
+
+
+/**
+ * Computes @p w := @p w + @p s  at the interface/boundary nodes on the
+ * boundary @p ib .  South (ib==1), East (ib==2), North (ib==3), West (ib==4)
+ * @param[in] ib    my local boundary
+ * @param[in] nx    number of discretization intervals in x-direction
+ * @param[in] ny    number of discretization intervals in y-direction
+ * @param[in,out] w vector for all nodes of local discretization
+ * @param[in] s     short vector with values on boundary @p ib
+*/
+void AddBound(int ib, int nx, int ny, double w[], double const s[]);
+
+// #################### Mesh from Matlab ############################
+/**
+ * 2D finite element mesh of the square consiting of linear triangular elements.
+ */
+class Mesh_2d_3_matlab: public Mesh
+{
+public:
+    /**
+     * Reads mesh data from a binary file.
+     *
+     * File format, see ascii_write_mesh.m
+     *
+     * @param[in] fname file name
+    */
+    explicit Mesh_2d_3_matlab(std::string const &fname);
+
+    /**
+     * Determines the indices of those vertices with Dirichlet boundary conditions.
+     * @return index vector.
+      *
+      * @warning All boundary nodes are considered as Dirchlet nodes.
+     */
+    std::vector<int> Index_DirichletNodes() const override;
+
+private:
+    /**
+     * Determines the indices of those vertices with Dirichlet boundary conditions
+     * @return index vector.
+     */
+    int Nnbedges() const
+    {
+        return static_cast<int>(bedges.size());
+    }
+
+    std::vector<int> bedges;     //!< boundary edges [nbedges][2] storing start/end vertex
+
+};
+
+#endif
--- a/ex3/seq/jacobi_oo_stl/getmatrix.cpp
+++ b/ex3/seq/jacobi_oo_stl/getmatrix.cpp
@ -0,0 +1,348 @@
+#include "getmatrix.h"
+#include "userset.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <list>
+#include <vector>
+using namespace std;
+
+
+//  general routine for lin. triangular elements
+
+void CalcElem(int const ial[3], double const xc[], double ske[3][3], double fe[3])
+//void CalcElem(const int* __restrict__ ial, const double* __restrict__ xc, double* __restrict__ ske[3], double* __restrict__ fe)
+{
+    const int  i1  = 2 * ial[0],   i2 = 2 * ial[1],   i3 = 2 * ial[2];
+    const double x13 = xc[i3 + 0] - xc[i1 + 0],  y13 = xc[i3 + 1] - xc[i1 + 1],
+                 x21 = xc[i1 + 0] - xc[i2 + 0],  y21 = xc[i1 + 1] - xc[i2 + 1],
+                 x32 = xc[i2 + 0] - xc[i3 + 0],  y32 = xc[i2 + 1] - xc[i3 + 1];
+    const double jac = fabs(x21 * y13 - x13 * y21);
+
+    ske[0][0] = 0.5 / jac * (y32 * y32 + x32 * x32);
+    ske[0][1] = 0.5 / jac * (y13 * y32 + x13 * x32);
+    ske[0][2] = 0.5 / jac * (y21 * y32 + x21 * x32);
+    ske[1][0] = ske[0][1];
+    ske[1][1] = 0.5 / jac * (y13 * y13 + x13 * x13);
+    ske[1][2] = 0.5 / jac * (y21 * y13 + x21 * x13);
+    ske[2][0] = ske[0][2];
+    ske[2][1] = ske[1][2];
+    ske[2][2] = 0.5 / jac * (y21 * y21 + x21 * x21);
+
+    const double xm    = (xc[i1 + 0] + xc[i2 + 0] + xc[i3 + 0]) / 3.0,
+                 ym    = (xc[i1 + 1] + xc[i2 + 1] + xc[i3 + 1]) / 3.0;
+    //fe[0] = fe[1] = fe[2] = 0.5 * jac * FunctF(xm, ym) / 3.0;
+    fe[0] = fe[1] = fe[2] = 0.5 * jac * fNice(xm, ym) / 3.0;
+}
+
+
+// general routine for lin. triangular elements,
+// non-symm. matrix
+// node numbering in element:  a s c e n d i n g   indices !!
+[[deprecated("Use CRS_Matrix::AddElem_3(...) instead.")]]
+void AddElem(int const ial[3], double const ske[3][3], double const fe[3],
+             int const id[], int const ik[], double sk[], double f[])
+{
+    for (int i = 0; i < 3; ++i)
+    {
+        const int ii  = ial[i],           // row ii (global index)
+                  id1 = id[ii],           // start and
+                  id2 = id[ii + 1];       // end of row ii in matrix
+        int ip  = id1;
+        for (int j = 0; j < 3; ++j)       // no symmetry assumed
+        {
+            const int jj = ial[j];
+            bool not_found = true;
+            do       // find entry jj (global index) in row ii
+            {
+                not_found = (ik[ip] != jj);
+                ++ip;
+            }
+            while (not_found && ip < id2);
+
+#ifndef NDEBUG                 // compiler option -DNDEBUG switches off the check
+            if (not_found)     // no entry found !!
+            {
+                cout << "Error in AddElem: (" << ii << "," << jj << ") ["
+                     << ial[0] << "," << ial[1] << "," << ial[2] << "]\n";
+                assert(!not_found);
+            }
+#endif
+            sk[ip - 1] += ske[i][j];
+        }
+        f[ii] += fe[i];
+    }
+}
+
+
+// ----------------------------------------------------------------------------
+
+
+
+
+// ####################################################################
+
+CRS_Matrix::CRS_Matrix(Mesh const &mesh)
+    : _mesh(mesh), _nrows(0), _nnz(0), _id(0), _ik(0), _sk(0)
+{
+    Derive_Matrix_Pattern();
+    return;
+}
+
+void CRS_Matrix::Derive_Matrix_Pattern()
+{
+    int const nelem(_mesh.Nelems());
+    int const ndof_e(_mesh.NdofsElement());
+    auto const &ia(_mesh.GetConnectivity());
+//  Determine the number of matrix rows
+    _nrows = *max_element(ia.cbegin(), ia.cbegin() + ndof_e * nelem);
+    ++_nrows;                                 // node numberng: 0 ... nnode-1
+    assert(*min_element(ia.cbegin(), ia.cbegin() + ndof_e * nelem) == 0); // numbering starts with 0 ?
+
+//  Collect for each node those nodes it is connected to (multiple entries)
+//  Detect the neighboring nodes
+    vector< list<int> > cc(_nrows);             //  cc[i] is the  list of nodes a node i is connected to
+    for (int i = 0; i < nelem; ++i)
+    {
+        int const idx = ndof_e * i;
+        for (int k = 0; k < ndof_e; ++k)
+        {
+            list<int> &cck = cc.at(ia[idx + k]);
+            cck.insert( cck.end(), ia.cbegin() + idx, ia.cbegin() + idx + ndof_e );
+        }
+    }
+//  Delete the multiple entries
+    _nnz = 0;
+    for (auto &it : cc)
+    {
+        it.sort();
+        it.unique();
+        _nnz += static_cast<int>(it.size());
+        // cout << it.size() << " :: "; copy(it->begin(),it->end(), ostream_iterator<int,char>(cout,"  ")); cout << endl;
+    }
+
+// CSR data allocation
+    _id.resize(_nrows + 1);                  // Allocate memory for CSR row pointer
+    _ik.resize(_nnz);                        // Allocate memory for CSR column index vector
+
+//  copy CSR data
+    _id[0] = 0;                                 // begin of first row
+    for (size_t i = 0; i < cc.size(); ++i)
+    {
+        //cout << i << "   " << nid.at(i) << endl;;
+        const list<int> &ci = cc.at(i);
+        const auto nci = static_cast<int>(ci.size());
+        _id[i + 1] = _id[i] + nci; // begin of next line
+        copy(ci.begin(), ci.end(), _ik.begin() + _id[i] );
+    }
+
+    assert(_nnz == _id[_nrows]);
+    _sk.resize(_nnz);                      // Allocate memory for CSR column index vector
+    return;
+}
+
+
+void CRS_Matrix::Debug() const
+{
+//  ID points to first entry of row
+//  no symmetry assumed
+    cout << "\nMatrix  (nnz = " << _id[_nrows] << ")\n";
+
+    for (int row = 0; row < _nrows; ++row)
+    {
+        cout << "Row " << row << " : ";
+        int const id1 = _id[row];
+        int const id2 = _id[row + 1];
+        for (int j = id1; j < id2; ++j)
+        {
+            cout.setf(ios::right, ios::adjustfield);
+            cout << "[" << setw(2) << _ik[j] << "]  " << setw(4) << _sk[j] << "  ";
+        }
+        cout << endl;
+    }
+    return;
+}
+
+void CRS_Matrix::CalculateLaplace(vector<double> &f)
+{
+    assert(_mesh.NdofsElement() == 3);               // only for triangular, linear elements
+    //cout << _nnz << " vs. " << _id[_nrows] << "  " << _nrows<< endl;
+    assert(_nnz == _id[_nrows]);
+
+    for (int k = 0; k < _nrows; ++k)
+    {
+        _sk[k] = 0.0;
+    }
+    for (int k = 0; k < _nrows; ++k)
+    {
+        f[k] = 0.0;
+    }
+
+    double ske[3][3], fe[3];
+    //  Loop over all elements
+    auto const nelem = _mesh.Nelems();
+    auto const &ia   = _mesh.GetConnectivity();
+    auto const &xc   = _mesh.GetCoords();
+
+    for (int i = 0; i < nelem; ++i)
+    {
+        CalcElem(ia.data() + 3 * i, xc.data(), ske, fe);
+        AddElem_3(ia.data() + 3 * i, ske, fe, f);
+    }
+    //Debug();
+
+    return;
+}
+
+void CRS_Matrix::ApplyDirichletBC(std::vector<double> const &u, std::vector<double> &f)
+{
+    double const PENALTY = 1e6;
+    auto const idx = _mesh.Index_DirichletNodes();
+    int const nidx = static_cast<int>(idx.size());
+
+    for (int row = 0; row < nidx; ++row)
+    {
+        int const k = idx[row];
+        int const id1 = fetch(k, k); // Find diagonal entry of row
+        assert(id1 >= 0);
+        _sk[id1] += PENALTY;		// matrix weighted scaling feasible
+        f[k] += PENALTY * u[k];
+    }
+
+    return;
+}
+
+void CRS_Matrix::GetDiag(vector<double> &d) const
+{
+    assert( _nrows == static_cast<int>(d.size()) );
+
+    for (int row = 0; row < _nrows; ++row)
+    {
+        const int ia = fetch(row, row); // Find diagonal entry of row
+        assert(ia >= 0);
+        d[row] = _sk[ia];
+    }
+    return;
+}
+
+bool CRS_Matrix::Compare2Old(int nnode, int const id[], int const ik[], double const sk[]) const
+{
+    bool bn = (nnode == _nrows);     // number of rows
+    if (!bn)
+    {
+        cout << "#########   Error: " << "number of rows" << endl;
+    }
+
+    bool bz = (id[nnode] == _nnz);   // number of non zero elements
+    if (!bz)
+    {
+        cout << "#########   Error: " << "number of non zero elements" << endl;
+    }
+
+    bool bd = equal(id, id + nnode + 1, _id.cbegin()); // row starts
+    if (!bd)
+    {
+        cout << "#########   Error: " << "row starts" << endl;
+    }
+
+    bool bk = equal(ik, ik + id[nnode], _ik.cbegin()); // column indices
+    if (!bk)
+    {
+        cout << "#########   Error: " << "column indices" << endl;
+    }
+
+    bool bv = equal(sk, sk + id[nnode], _sk.cbegin()); // values
+    if (!bv)
+    {
+        cout << "#########   Error: " << "values" << endl;
+    }
+
+    return bn && bz && bd && bk && bv;
+}
+
+
+void CRS_Matrix::Mult(vector<double> &w, vector<double> const &u) const
+{
+    assert( _nrows == static_cast<int>(w.size()) );
+    assert( w.size() == u.size() );
+
+    for (int row = 0; row < _nrows; ++row)
+    {
+        double wi = 0.0;
+        for (int ij = _id[row]; ij < _id[row + 1]; ++ij)
+        {
+            wi += _sk[ij] * u[ _ik[ij] ];
+        }
+        w[row] = wi;
+    }
+    return;
+}
+
+void CRS_Matrix::Defect(vector<double> &w,
+                        vector<double> const &f, vector<double> const &u) const
+{
+    assert( _nrows == static_cast<int>(w.size()) );
+    assert( w.size() == u.size() && u.size() == f.size() );
+
+    for (int row = 0; row < _nrows; ++row)
+    {
+        double wi = f[row];
+        for (int ij = _id[row]; ij < _id[row + 1]; ++ij)
+        {
+            wi -= _sk[ij] * u[ _ik[ij] ];
+        }
+        w[row] = wi;
+    }
+    return;
+}
+
+int CRS_Matrix::fetch(int const row, int const col) const
+{
+    int const id2 = _id[row + 1];    // end   and
+    int       ip  = _id[row];        // start of recent row (global index)
+
+    while (ip < id2 && _ik[ip] != col)  // find index col (global index)
+    {
+        ++ip;
+    }
+    if (ip >= id2)
+    {
+        ip = -1;
+#ifndef NDEBUG                 // compiler option -DNDEBUG switches off the check
+        cout << "No column  " << col << "  in row  " << row << endl;
+        assert(ip >= id2);
+#endif
+    }
+    return ip;
+}
+
+
+// general routine for lin. triangular elements,
+// non-symm. matrix
+// node numbering in element:  a s c e n d i n g   indices !!
+void CRS_Matrix::AddElem_3(int const ial[3], double const ske[3][3], double const fe[3], vector<double> &f)
+{
+    for (int i = 0; i < 3; ++i)
+    {
+        const int ii  = ial[i];           // row ii (global index)
+        for (int j = 0; j < 3; ++j)       // no symmetry assumed
+        {
+            const int jj = ial[j];        // column jj (global index)
+            int ip = fetch(ii, jj);       // find column entry jj in row ii
+#ifndef NDEBUG                 // compiler option -DNDEBUG switches off the check
+            if (ip < 0)        // no entry found !!
+            {
+                cout << "Error in AddElem: (" << ii << "," << jj << ") ["
+                     << ial[0] << "," << ial[1] << "," << ial[2] << "]\n";
+                assert(ip >= 0);
+            }
+#endif
+            _sk[ip] += ske[i][j];
+        }
+        f[ii] += fe[i];
+    }
+}
+
--- a/ex3/seq/jacobi_oo_stl/getmatrix.h
+++ b/ex3/seq/jacobi_oo_stl/getmatrix.h
@ -0,0 +1,178 @@
+#ifndef GETMATRIX_FILE
+#define GETMATRIX_FILE
+
+#include "geom.h"
+#include <vector>
+
+/**
+ * Calculates the element stiffness matrix @p ske and the element load vector @p fe
+ * of one triangular element with linear shape functions.
+ * @param[in]	ial	node indices of the three element vertices
+ * @param[in]	xc	vector of node coordinates with x(2*k,2*k+1) as coodinates of node k
+ * @param[out] ske	element stiffness matrix
+ * @param[out] fe	element load vector
+ */
+void CalcElem(int const ial[3], double const xc[], double ske[3][3], double fe[3]);
+
+/**
+ * Adds the element stiffness matrix @p ske and the element load vector @p fe
+ * of one triangular element with linear shape functions to the appropriate positions in
+ * the symmetric stiffness matrix, stored as CSR matrix K(@p sk,@p id, @p ik)
+ *
+ * @param[in] ial   node indices of the three element vertices
+ * @param[in] ske	element stiffness matrix
+ * @param[in] fe	element load vector
+ * @param[out] sk	vector non-zero entries of CSR matrix
+ * @param[in] id	index vector containing the first entry in a CSR row
+ * @param[in] ik	column index vector of CSR matrix
+ * @param[out] f	distributed local vector storing the right hand side
+ *
+ * @warning Algorithm requires indices in connectivity @p ial in ascending order.
+ *          Currently deprecated.
+*/
+void AddElem(int const ial[3], double const ske[3][3], double const fe[3],
+             int const id[], int const ik[], double sk[], double f[]);
+
+
+// #####################################################################
+/**
+ * Square matrix in CRS format (compressed row storage; also named CSR),
+ * see an <a href="https://en.wikipedia.org/wiki/Sparse_matrix">introduction</a>.
+ */
+class CRS_Matrix
+{
+    public:
+       /**
+        * Intializes the CRS matrix structure from the given discetization in @p mesh.
+        *
+        * The sparse matrix pattern is generated but the values are 0.
+        *
+        * @param[in] mesh given discretization
+        *
+        * @warning A reference to the discretization @p mesh is stored inside this class.
+        *          Therefore, changing @p mesh outside requires also
+        *          to call method @p Derive_Matrix_Pattern explicitely.
+        *
+        * @see Derive_Matrix_Pattern
+        */
+       explicit CRS_Matrix(Mesh const & mesh);
+
+      /**
+        * Destructor.
+        */
+       ~CRS_Matrix()
+       {}
+
+       /**
+        * Generates the sparse matrix pattern and overwrites the existing pattern.
+        *
+        * The sparse matrix pattern is generated but the values are 0.
+       */
+       void Derive_Matrix_Pattern();
+
+        /**
+        * Calculates the entries of f.e. stiffness matrix and load/rhs vector @p f for the Laplace operator in 2D.
+        * No memory is allocated.
+        *
+        * @param[in,out] f (preallocated) rhs/load vector
+        */
+       void CalculateLaplace(std::vector<double> &f);
+
+       /**
+        * Applies Dirichlet boundary conditions to stiffness matrix and to load vector @p f.
+        * The <a href="https://www.jstor.org/stable/2005611?seq=1#metadata_info_tab_contents">penalty method</a>
+        * is used for incorporating the given values @p u.
+        *
+        * @param[in]     u (global) vector with Dirichlet data
+        * @param[in,out] f load vector
+        */
+       void ApplyDirichletBC(std::vector<double> const &u, std::vector<double> &f);
+
+       /**
+        * Extracts the diagonal elemenst of the sparse matrix.
+        *
+        * @param[in,out]  d  (prellocated) vector of diagonal elements
+        */
+       void GetDiag(std::vector<double> &d) const;
+
+       /**
+        * Performs the matrix-vector product  w := K*u.
+        *
+        * @param[in,out] w resulting vector (preallocated)
+        * @param[in]     u vector
+        */
+       void Mult(std::vector<double> &w, std::vector<double> const &u) const;
+
+        /**
+        * Calculates the defect/residuum w := f - K*u.
+        *
+        * @param[in,out] w resulting vector (preallocated)
+        * @param[in]     f load vector
+        * @param[in]     u vector
+        */
+       void Defect(std::vector<double> &w,
+                   std::vector<double> const &f, std::vector<double> const &u) const;
+
+       /**
+		 * Number rows in matrix.
+		 * @return number of rows.
+		 */
+       int Nrows() const
+          {return _nrows;}
+
+       /**
+		 * Show the matrix entries.
+		 */
+       void Debug() const;
+
+       /**
+		 * Finds in a CRS matrix the access index for an entry at row @p row and column @p col.
+		 *
+		 * @param[in] row	row index
+		 * @param[in] col	column index
+		 * @return index for element (@p row, @p col). If no appropriate entry exists then -1 will be returned.
+		 *
+		 * @warning assert() stops the function in case that matrix element (@p row, @p col) doesn't exist.
+		*/
+       int fetch(int row, int col) const;
+
+      /**
+        * Adds the element stiffness matrix @p ske and the element load vector @p fe
+        * of one triangular element with linear shape functions to the appropriate positions in
+        * the stiffness matrix, stored as CSR matrix K(@p sk,@p id, @p ik).
+        *
+        * @param[in]     ial   node indices of the three element vertices
+        * @param[in]     ske   element stiffness matrix
+        * @param[in]     fe    element load vector
+        * @param[in,out] f	   distributed local vector storing the right hand side
+        *
+        * @warning Algorithm assumes  linear triangular elements (ndof_e==3).
+       */
+       void AddElem_3(int const ial[3], double const ske[3][3], double const fe[3], std::vector<double> &f);
+
+        /**
+        * Compare @p this CRS matrix with an external CRS matrix stored in C-Style.
+        *
+        * The method prints statements on differences found.
+        *
+        * @param[in]     nnode  row number of external matrix
+        * @param[in]     id     start indices of matrix rows of external matrix
+        * @param[in]     ik     column indices of external matrix
+        * @param[in]     sk     non-zero values of external matrix
+        *
+        * @return true iff all data are identical.
+        */
+       bool Compare2Old(int nnode, int const id[], int const ik[], double const sk[]) const;
+
+    private:
+       Mesh const & _mesh;      //!< reference to discretization
+       int _nrows;              //!< number of rows in matrix
+       int _nnz;                //!< number of non-zero entries
+       std::vector<int> _id;    //!< start indices of matrix rows
+       std::vector<int> _ik;    //!< column indices
+       std::vector<double> _sk; //!< non-zero values
+
+};
+
+
+#endif
--- a/ex3/seq/jacobi_oo_stl/gmon.out
+++ b/ex3/seq/jacobi_oo_stl/gmon.out
--- a/ex3/seq/jacobi_oo_stl/gnuplot.rot
+++ b/ex3/seq/jacobi_oo_stl/gnuplot.rot
@ -0,0 +1,5 @@
+zrot=(zrot+10)%360
+xrot=(xrot+17)%180
+set view xrot,zrot
+replot
+reread
--- a/ex3/seq/jacobi_oo_stl/gprofng_script1
+++ b/ex3/seq/jacobi_oo_stl/gprofng_script1
@ -0,0 +1,7 @@
+# max 5 lines
+limit 5
+# define metrics
+metrics name:e.llm
+# show absolute numbers
+compare on
+functions
--- a/ex3/seq/jacobi_oo_stl/gprofng_script2
+++ b/ex3/seq/jacobi_oo_stl/gprofng_script2
@ -0,0 +1,7 @@
+# max 5 lines
+limit 5
+# define metrics
+metrics name:e.llm
+# show absolute numbers
+compare ratio
+functions
--- a/ex3/seq/jacobi_oo_stl/jac.dem
+++ b/ex3/seq/jacobi_oo_stl/jac.dem
@ -0,0 +1,21 @@
+set style data lines
+set parametric
+set hidden3d
+set nokey
+#set xrange [0:1]
+#set yrange [-0:1]
+#set zrange [-2:2]
+set cntrparam levels 15
+set contour base
+set title "Solution"
+xrot=60
+zrot=0
+splot "t.dat"
+#splot "lsg.gnu"
+pause -1 "Press ENTER to continue."
+#load "gnuplot.rot"
+#set title ""
+#set autosc
+#set nohidden
+#set nopara
+#set key
--- a/ex3/seq/jacobi_oo_stl/jacobi.cbp
+++ b/ex3/seq/jacobi_oo_stl/jacobi.cbp
@ -0,0 +1,75 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
+<CodeBlocks_project_file>
+	<FileVersion major="1" minor="6" />
+	<Project>
+		<Option title="jacobi" />
+		<Option pch_mode="2" />
+		<Option compiler="gcc" />
+		<Build>
+			<Target title="Debug">
+				<Option output="bin/Debug/jacobi" prefix_auto="1" extension_auto="1" />
+				<Option object_output="obj/Debug/" />
+				<Option type="1" />
+				<Option compiler="gcc" />
+				<Compiler>
+					<Add option="-g" />
+				</Compiler>
+			</Target>
+			<Target title="Release">
+				<Option output="bin/Release/jacobi" prefix_auto="1" extension_auto="1" />
+				<Option object_output="obj/Release/" />
+				<Option type="1" />
+				<Option compiler="gcc" />
+				<Compiler>
+					<Add option="-O2" />
+				</Compiler>
+				<Linker>
+					<Add option="-s" />
+				</Linker>
+			</Target>
+		</Build>
+		<Compiler>
+			<Add option="-Wshadow" />
+			<Add option="-Winit-self" />
+			<Add option="-Wredundant-decls" />
+			<Add option="-Wcast-align" />
+			<Add option="-Wundef" />
+			<Add option="-Wfloat-equal" />
+			<Add option="-Wunreachable-code" />
+			<Add option="-Wmissing-declarations" />
+			<Add option="-Wswitch-default" />
+			<Add option="-Weffc++" />
+			<Add option="-Wmain" />
+			<Add option="-pedantic" />
+			<Add option="-Wextra" />
+			<Add option="-Wall" />
+			<Add option="-fexceptions" />
+		</Compiler>
+		<Unit filename="geom.cpp" />
+		<Unit filename="geom.h" />
+		<Unit filename="getmatrix.cpp" />
+		<Unit filename="getmatrix.h" />
+		<Unit filename="jacsolve.cpp" />
+		<Unit filename="jacsolve.h" />
+		<Unit filename="main.cpp" />
+		<Unit filename="userset.cpp" />
+		<Unit filename="userset.h" />
+		<Unit filename="vdop.cpp" />
+		<Unit filename="vdop.h" />
+		<Extensions>
+			<code_completion />
+			<envvars />
+			<lib_finder disable_auto="1" />
+			<debugger />
+			<DoxyBlocks>
+				<comment_style block="0" line="0" />
+				<doxyfile_project />
+				<doxyfile_build extract_all="1" />
+				<doxyfile_warnings />
+				<doxyfile_output />
+				<doxyfile_dot class_diagrams="1" have_dot="1" />
+				<general use_at_in_tags="1" />
+			</DoxyBlocks>
+		</Extensions>
+	</Project>
+</CodeBlocks_project_file>
--- a/ex3/seq/jacobi_oo_stl/jacobi.layout
+++ b/ex3/seq/jacobi_oo_stl/jacobi.layout
@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
+<CodeBlocks_layout_file>
+	<ActiveTarget name="Debug" />
+</CodeBlocks_layout_file>
--- a/ex3/seq/jacobi_oo_stl/jacsolve.cpp
+++ b/ex3/seq/jacobi_oo_stl/jacsolve.cpp
@ -0,0 +1,61 @@
+#include "vdop.h"
+#include "getmatrix.h"
+#include "jacsolve.h"
+
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <vector>
+using namespace std;
+
+// #####################################################################
+// 	const int neigh[], const int color,
+// 	const MPI::Intracomm& icomm,
+void JacobiSolve(CRS_Matrix const &SK, vector<double> const &f, vector<double> &u)
+{
+    const double omega   = 1.0;
+    const int    maxiter = 1000;
+    const double tol  = 1e-5,                // tolerance
+                 tol2 = tol * tol;           // tolerance^2
+
+    int nrows = SK.Nrows();                  // number of rows == number of columns
+    assert( nrows == static_cast<int>(f.size()) && f.size() == u.size() );
+
+    cout << endl << " Start Jacobi solver for " << nrows << " d.o.f.s"  << endl;
+    //  Choose initial guess
+    for (int k = 0; k < nrows; ++k)
+    {
+        u[k] = 0.0;                          //  u := 0
+    }
+
+    vector<double> dd(nrows);                // matrix diagonal
+    vector<double>  r(nrows);                // residual
+    vector<double>  w(nrows);                // correction
+
+    SK.GetDiag(dd);                          //  dd := diag(K)
+    ////DebugVector(dd);{int ijk; cin >> ijk;}
+
+    //  Initial sweep
+    SK.Defect(r, f, u);                      //  r := f - K*u
+
+    vddiv(w, r, dd);                         //  w := D^{-1}*r
+    double sigma0 = dscapr(w, r);            // s0 := <w,r>
+
+    // Iteration sweeps
+    int iter  = 0;
+    double sigma = sigma0;
+    while ( sigma > tol2 * sigma0 && maxiter > iter)
+    {
+        ++iter;
+        vdaxpy(u, u, omega, w );             //  u := u + om*w
+        SK.Defect(r, f, u);                  //  r := f - K*u
+        vddiv(w, r, dd);                     //  w := D^{-1}*r
+        sigma = dscapr(w, r);                // s0 := <w,r>
+//      	cout << "Iteration " << iter << " : " << sqrt(sigma/sigma0) << endl;
+    }
+    cout << "aver. Jacobi rate :  " << exp(log(sqrt(sigma / sigma0)) / iter) << "  (" << iter << " iter)" << endl;
+    cout << "final error: " << sqrt(sigma / sigma0) << " (rel)   " << sqrt(sigma) << " (abs)\n";
+
+    return;
+}
+
--- a/ex3/seq/jacobi_oo_stl/jacsolve.h
+++ b/ex3/seq/jacobi_oo_stl/jacsolve.h
@ -0,0 +1,18 @@
+#ifndef JACSOLVE_FILE
+#define JACSOLVE_FILE
+#include "getmatrix.h"
+#include <vector>
+
+
+/**
+ * Solves linear system of equations  K @p u = @p f  via the Jacobi iteration.
+ * We use a distributed symmetric  CSR matrix @p SK and initial guess of the
+ * solution is set to 0.
+ * @param[in] SK	CSR matrix
+ * @param[in] f		distributed local vector storing the right hand side
+ * @param[out] u	accumulated local vector storing the solution.
+*/
+void JacobiSolve(CRS_Matrix const &SK, std::vector<double> const &f, std::vector<double> &u);
+
+
+#endif
--- a/ex3/seq/jacobi_oo_stl/main.GCC_
+++ b/ex3/seq/jacobi_oo_stl/main.GCC_
--- a/ex3/seq/jacobi_oo_stl/main.cpp
+++ b/ex3/seq/jacobi_oo_stl/main.cpp
@ -0,0 +1,129 @@
+//		MPI code in C++.
+//		See [Gropp/Lusk/Skjellum, "Using MPI", p.33/41 etc.]
+//		and  /opt/mpich/include/mpi2c++/comm.h  for details
+
+#include "geom.h"
+#include "getmatrix.h"
+#include "jacsolve.h"
+#include "userset.h"
+#include "vdop.h"
+
+#include <chrono>             // timing
+#include <cmath>
+#include <iostream>
+using namespace std;
+using namespace std::chrono;  // timing
+
+
+int main(int, char ** )
+{
+    const int numprocs = 1;
+    const int myrank   = 0;
+
+    if (myrank == 0)
+    {
+        cout << "\n There are " << numprocs << " processes running.\n \n";
+    }
+
+    const auto procx = static_cast<int>(sqrt(numprocs + 0.0));
+    const int  procy = procx;
+
+    if (procy * procx != numprocs)
+    {
+        cout << "\n Wrong number of processors !\n \n";
+    }
+    else
+    {
+// #####################################################################
+//      Here starts the real code
+// #####################################################################
+        //bool ScaleUp = !true;
+        int nx, ny, NXglob, NYglob; /* number of local intervals on (xl,xr)=:nx, (yb,yt)=:ny */
+        //nx = 1024;
+        //ny = 1024;
+        nx = 100;
+        ny = 100;
+        NXglob = nx * procx;
+        NYglob = ny * procy;
+        cout << "Intervalls: " << NXglob << " x " << NYglob << endl;
+
+// ##################### STL ###########################################
+        {
+            Mesh_2d_3_square const mesh(nx, ny);
+            //mesh.Debug();
+
+            CRS_Matrix SK(mesh);                   // CRS matrix
+            //SK.Debug();
+            vector<double> uv(SK.Nrows(), 0.0);    // temperature
+            vector<double> fv(SK.Nrows(), 0.0);    // r.h.s.
+
+            SK.CalculateLaplace(fv);
+            //SK.Debug();
+
+            //mesh.SetU(uv);         // deprecated
+            //mesh.SetF(fv);         // deprecated
+            // Two ways to initialize the vector
+            //mesh.SetValues(uv,f_zero);             // functional
+            mesh.SetValues(uv, [](double x, double y) -> double {return 0.0 * x *y;} ); // lambda function
+
+            SK.ApplyDirichletBC(uv, fv);
+            //SK.Compare2Old(nnode, id, ik, sk);
+            //SK.Debug();
+
+            auto tstart = system_clock::now();       // start timer
+
+            JacobiSolve(SK, fv, uv );          // solve the system of equations
+
+            auto tend = system_clock::now();         // end timer
+            auto duration = duration_cast<microseconds>(tend - tstart);
+            auto t1 = static_cast<double>(duration.count()) / 1e6 ;   // t1 in seconds
+            cout << "JacobiSolve: timing in sec. : " << t1 << endl;
+
+            //CompareVectors(uv, nnode, u, 1e-6);    // Check correctness
+
+            //mesh.SaveVectorP("t.dat", uv);
+            //mesh.Visualize(uv);
+
+        }
+// ##################### STL ###########################################
+        {
+            //Mesh_2d_3_matlab const mesh("square_tiny.txt");
+            Mesh_2d_3_matlab const mesh("square_100.txt");
+            //Mesh_2d_3_matlab const mesh("L_shape.txt");
+            //mesh.Debug();
+
+            CRS_Matrix SK(mesh);                   // CRS matrix
+            //SK.Debug();
+
+            vector<double> uv(SK.Nrows(), 0.0);    // temperature
+            vector<double> fv(SK.Nrows(), 0.0);    // r.h.s.
+
+            SK.CalculateLaplace(fv);
+            //SK.Debug();
+
+            //mesh.SetU(uv);         // deprecated
+            // Two ways to initialize the vector
+            //mesh.SetValues(uv,f_zero);             // user function
+            mesh.SetValues(uv, [](double x, double y) -> double {return 0.0 * x *y;} ); // lambda function
+
+            SK.ApplyDirichletBC(uv, fv);
+            //SK.Compare2Old(nnode, id, ik, sk);
+            //SK.Debug();
+
+            auto tstart = system_clock::now();       // start timer
+
+            JacobiSolve(SK, fv, uv );          // solve the system of equations
+
+            auto tend = system_clock::now();         // end timer
+            auto duration = duration_cast<microseconds>(tend - tstart);
+            auto t1 = static_cast<double>(duration.count()) / 1e6 ;   // t1 in seconds
+            cout << "JacobiSolve: timing in sec. : " << t1 << endl;
+
+            //mesh.Write_ascii_matlab("uv.txt", uv);
+            //mesh.Visualize(uv);
+        }
+        return 0;
+    }
+}
+
+
--- a/ex3/seq/jacobi_oo_stl/nl.awk
+++ b/ex3/seq/jacobi_oo_stl/nl.awk
@ -0,0 +1,20 @@
+#
+#	Have to add a newline for a new row of coordinates
+#
+BEGIN { OFS="	"; YO=-1.23456789; X=YO; Y=YO; Z=YO }
+      { 
+        if ($1!="")
+	{
+	  if ($1!=YO) { print " "; YO=$1 }
+          if ($1==X && $2==Y)
+	   {
+#	     print $1,$2,($3+Z)/2
+	   }
+	   else
+	   {
+	     print $1,$2,$3
+	   }
+	  X=$1; Y=$2; Z=$3;
+	}
+      }
+END {}
--- a/ex3/seq/jacobi_oo_stl/out_100_GCC.txt
+++ b/ex3/seq/jacobi_oo_stl/out_100_GCC.txt
@ -0,0 +1,9 @@
+
+ There are 1 processes running.
+ 
+Intervalls: 100 x 100
+
+ Start Jacobi solver for 10201 d.o.f.s
+aver. Jacobi rate :  0.997922  (1000 iter)
+final error: 0.124971 (rel)   0.000194029 (abs)
+JacobiSolve: timing in sec. : 0.155127
--- a/ex3/seq/jacobi_oo_stl/small_Doxyfile
+++ b/ex3/seq/jacobi_oo_stl/small_Doxyfile
--- a/ex3/seq/jacobi_oo_stl/square.m
+++ b/ex3/seq/jacobi_oo_stl/square.m
@ -0,0 +1,41 @@
+% Square: 
+%   flatpak run org.octave.Octave <filename>
+%      or
+%   octave --no-window-system --no-gui  -qf <filename>
+
+clear all
+clc
+% %% L-shape
+% g=[2 0 2 0 0 1 0;        % #vertices,v_1x, v_2x, v_1y, v_2y, subdomain_left, subdomain_right
+%    2 2 2 0 1 1 0;
+%    2 2 1 1 0.5 1 0;
+%    2 1 1 0.5 2 1 0;
+%    2 1 0 2 2 1 0;
+%    2 0 0 2 0 1 0]';
+
+%% square
+g=[2 0 1 0 0 1 0;        % #vertices,v_1x, v_2x, v_1y, v_2y, subdomain_left, subdomain_right
+   2 1 1 0 1 1 0;
+   2 1 0 1 1 1 0;
+   2 0 0 1 0 1 0]';
+
+[p,e,t] = initmesh(g,'hmax',0.01); 
+pdemesh(p,e,t)
+
+%% GH
+% output from <https://de.mathworks.com/help/pde/ug/initmesh.html initmesh>
+%
+% coordinates  p: [2][nnode]
+% connectivity t: [4][nelem]   with  t(4,:) are the subdomain numbers
+% edges        e: [7][nedges]  boundary edges
+%                              e([1,2],:) - start/end vertex of edge
+%                              e([3,4],:) - start/end values
+%                              e(5,:)     - segment number
+%                              e([6,7],:) - left/right subdomain
+
+ascii_write_mesh( p, t, e, mfilename);
+
+
+
+% tmp=t(1:3,:)
+
--- a/ex3/seq/jacobi_oo_stl/square_100.txt
+++ b/ex3/seq/jacobi_oo_stl/square_100.txt
--- a/ex3/seq/jacobi_oo_stl/square_tiny.txt
+++ b/ex3/seq/jacobi_oo_stl/square_tiny.txt
@ -0,0 +1,95 @@
+13
+2
+16
+3
+0
+0
+1
+0
+1
+1
+0
+1
+0.5
+0
+1
+0.5
+0.5
+1
+0
+0.5
+0.4999999999999999
+0.4999999999999999
+0.3333333333333333
+0.6666666666666666
+0.6666666666666666
+0.6666666666666666
+0.6666666666666666
+0.3333333333333333
+0.3333333333333333
+0.3333333333333333
+8
+1
+13
+5
+2
+12
+6
+3
+11
+7
+4
+10
+1
+5
+13
+10
+8
+13
+2
+6
+12
+3
+7
+11
+4
+8
+10
+12
+9
+13
+10
+9
+11
+7
+10
+11
+11
+9
+12
+6
+11
+12
+9
+10
+13
+5
+12
+13
+8
+1
+5
+5
+2
+2
+6
+6
+3
+3
+7
+7
+4
+4
+8
+8
+1
--- a/ex3/seq/jacobi_oo_stl/userset.cpp
+++ b/ex3/seq/jacobi_oo_stl/userset.cpp
@ -0,0 +1,16 @@
+#include "userset.h"
+#include <cmath>
+
+
+double FunctF(double const x, double const y)
+{
+// return  std::sin(3.14159*1*x)*std::sin(3.14159*1*y);
+//  return 16.0*1024. ;
+// return (double)1.0 ;
+    return x * x * std::sin(2.5 * 3.14159 * y);
+}
+
+double FunctU(const double /* x */, double const /* y */)
+{
+    return 1.0 ;
+}
--- a/ex3/seq/jacobi_oo_stl/userset.h
+++ b/ex3/seq/jacobi_oo_stl/userset.h
@ -0,0 +1,44 @@
+#ifndef USERSET_FILE
+#define USERSET_FILE
+#include <cmath>
+/**
+ * User function: f(@p x,@p y)
+ * @param[in] x		x-coordinate of discretization point
+ * @param[in] y		y-coordinate of discretization point
+ * @return  value for right hand side f(@p x,@p y)
+ */
+double FunctF(double const x, double const y);
+
+/**
+ * User function: u(@p x,@p y)
+ * @param[in] x		x-coordinate of discretization point
+ * @param[in] y		y-coordinate of discretization point
+ * @return  value for solution vector u(@p x,@p y)
+ */
+double FunctU(double const x, double const y);
+
+
+/**
+ * User function: f(@p x,@p y) = @f$ x^2 \sin(2.5\pi y)@f$.
+ * @param[in] x		x-coordinate of discretization point
+ * @param[in] y		y-coordinate of discretization point
+ * @return  value f(@p x,@p y)
+ */
+inline double fNice(double const x, double const y)
+{
+    return x * x * std::sin(2.5 * 3.14159 * y);
+}
+
+/**
+ * User function: f(@p x,@p y) = 0$.
+ * @param[in] x		x-coordinate of discretization point
+ * @param[in] y		y-coordinate of discretization point
+ * @return  value 0
+ */
+inline double f_zero(double const x, double const y)
+//double f_zero(double const  /*x*/, double const /*y*/)
+{
+    return 0.0 + 0.0*(x+y);
+}
+
+#endif
--- a/ex3/seq/jacobi_oo_stl/vdop.cpp
+++ b/ex3/seq/jacobi_oo_stl/vdop.cpp
@ -0,0 +1,84 @@
+#include "vdop.h"
+#include <cassert>               // assert()
+#include <cmath>
+#include <iostream>
+#include <vector>
+using namespace std;
+
+
+void vddiv(vector<double> &x, vector<double> const &y,
+           vector<double> const &z)
+{
+    assert( x.size() == y.size() && y.size() == z.size() );
+    size_t n = x.size();
+
+    for (size_t k = 0; k < n; ++k)
+    {
+        x[k] = y[k] / z[k];
+    }
+    return;
+}
+
+//******************************************************************************
+
+void vdaxpy(std::vector<double> &x, std::vector<double> const &y,
+            double alpha, std::vector<double> const &z )
+{
+    assert( x.size() == y.size() && y.size() == z.size() );
+    size_t n = x.size();
+
+    for (size_t k = 0; k < n; ++k)
+    {
+        x[k] = y[k] + alpha * z[k];
+    }
+    return;
+}
+//******************************************************************************
+
+double dscapr(std::vector<double> const &x, std::vector<double> const &y)
+{
+    assert( x.size() == y.size());
+    size_t n = x.size();
+
+    double    s = 0.0;
+    for (size_t k = 0; k < n; ++k)
+    {
+        s += x[k] * y[k];
+    }
+
+    return s;
+}
+
+//******************************************************************************
+void DebugVector(vector<double> const &v)
+{
+    cout << "\nVector  (nnode = " << v.size() << ")\n";
+    for (size_t j = 0; j < v.size(); ++j)
+    {
+        cout.setf(ios::right, ios::adjustfield);
+        cout << v[j] << "   ";
+    }
+    cout << endl;
+
+    return;
+}
+//******************************************************************************
+bool CompareVectors(std::vector<double> const &x, int const n, double const y[], double const eps)
+{
+    bool bn = (static_cast<int>(x.size()) == n);
+    if (!bn)
+    {
+        cout << "#########   Error: " << "number of elements" << endl;
+    }
+    //bool bv = equal(x.cbegin(),x.cend(),y);
+    bool bv = equal(x.cbegin(), x.cend(), y,
+                    [eps](double a, double b) -> bool
+    { return std::abs(a - b) < eps * (1.0 + 0.5 * (std::abs(a) + std::abs(a))); }
+                   );
+    if (!bv)
+    {
+        assert(static_cast<int>(x.size()) == n);
+        cout << "#########   Error: " << "values" << endl;
+    }
+    return bn && bv;
+}
--- a/ex3/seq/jacobi_oo_stl/vdop.h
+++ b/ex3/seq/jacobi_oo_stl/vdop.h
@ -0,0 +1,58 @@
+#ifndef VDOP_FILE
+#define VDOP_FILE
+#include <vector>
+
+/** @brief  Element-wise vector divison x_k = y_k/z_k.
+ *
+ * @param[out] x  target vector
+ * @param[in]  y  source vector
+ * @param[in]  z  source vector
+ *
+ */
+void vddiv(std::vector<double> & x, std::vector<double> const& y,
+                                    std::vector<double> const& z);
+
+/** @brief  Element-wise daxpy operation x(k) = y(k) + alpha*z(k).
+ *
+ * @param[out] x  target vector
+ * @param[in]  y  source vector
+ * @param[in]  alpha  scalar
+ * @param[in]  z  source vector
+ *
+ */
+void vdaxpy(std::vector<double> & x, std::vector<double> const& y,
+                       double alpha, std::vector<double> const& z );
+
+
+/** @brief  Calculates the Euclidian inner product of two vectors.
+ *
+ * @param[in]  x vector
+ * @param[in]  y vector
+ * @return Euclidian inner product @f$\langle x,y \rangle@f$
+ *
+ */
+double dscapr(std::vector<double> const& x, std::vector<double> const& y);
+
+
+/**
+ * Print entries of a vector.
+ * @param[in] v	    vector values
+*/
+void DebugVector(std::vector<double> const &v);
+
+/** @brief  Compares an STL vector with POD vector.
+ *
+ * The accuracy criteria @f$ |x_k-y_k| < \varepsilon \left({1+0.5(|x_k|+|y_k|)}\right) @f$
+ * follows the book by
+ * <a href="https://www.springer.com/la/book/9783319446592">Stoyan/Baran</a>, p.8.
+ *
+ * @param[in]  x    STL vector
+ * @param[in]  n    length of POD vector
+ * @param[in]  y    POD vector
+ * @param[in]  eps  relative accuracy criteria (default := 0.0).
+ * @return true iff pairwise vector elements are relatively close to each other.
+ *
+ */
+bool CompareVectors(std::vector<double> const& x, int n, double const y[], double const eps=0.0);
+
+#endif
--- a/ex3/seq/jacobi_oo_stl/visualize_results.m
+++ b/ex3/seq/jacobi_oo_stl/visualize_results.m
@ -0,0 +1,20 @@
+%% Visualize results
+%
+%   flatpak run org.octave.Octave <filename>
+%      or
+%   octave --no-window-system --no-gui  -qf <filename>
+%
+%      or
+%   matlab -nosplash <   <filename>
+
+clear all
+clc
+
+%%
+fname = 'uv.txt';
+
+[xc,ia,v] = ascii_read_meshvector(fname);
+
+h = trisurf(ia, xc(:,1), xc(:,2), v);
+
+waitfor(h)                     % wait for closing the figure
--- a/ex3/seq/skalar/Doxyfile
+++ b/ex3/seq/skalar/Doxyfile
--- a/ex3/seq/skalar/Makefile
+++ b/ex3/seq/skalar/Makefile
@ -0,0 +1,32 @@
+#
+# use GNU-Compiler tools
+COMPILER=GCC_
+# alternatively from the shell
+# export COMPILER=GCC_
+# or, alternatively from the shell
+# make COMPILER=GCC_
+
+# use Intel compilers
+#COMPILER=ICC_
+
+# use PGI compilers
+# COMPILER=PGI_
+
+
+SOURCES = main.cpp mylib.cpp
+OBJECTS = $(SOURCES:.cpp=.o)
+
+PROGRAM	= main.${COMPILER}
+
+# uncomment the next to lines for debugging and detailed performance analysis
+CXXFLAGS += -O3 -ftree-vectorize -fopt-info-vec-missed -fopt-info-vec-optimized 
+CXXFLAGS += -mavx2 -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store
+#-funroll-loops
+LINKFLAGS += -g -ltbb
+# do not use -pg with PGI compilers
+
+ifndef COMPILER
+  COMPILER=GCC_
+endif
+
+include ../${COMPILER}default.mk
--- a/ex3/seq/skalar/c_main.cpp
+++ b/ex3/seq/skalar/c_main.cpp
@ -0,0 +1,97 @@
+#include "mylib.h"
+
+#include <chrono>             // timing
+#include <cstdlib>            // atoi()
+#include <cstring>            // strncmp()
+#include <iostream>
+#include <sstream>
+
+using namespace std;
+using namespace std::chrono;  // timing
+
+int main(int argc, char **argv)
+{
+    int const NLOOPS = 50;        // chose a value such that the benchmark runs at least 10 sec.
+    unsigned int N = 50000001;
+//##########################################################################
+//   Read Paramater from command line  (C++ style)
+    cout << "Checking command line parameters for: -n <number> " << endl;
+    for (int i = 1; i < argc; i++)
+    {
+        cout << " arg[" << i << "] = " << argv[i] << endl;
+        if (std::strncmp(argv[i], "-n", 2) == 0 && i + 1 < argc) // found "-n" followed by another parameter
+        {
+            N = static_cast<unsigned int>(atoi(argv[i + 1]));
+        }
+        else
+        {
+            cout << "Corect call: " << argv[0] << " -n  <number>\n";
+        }
+    }
+
+    cout << "\nN = " << N << endl;
+
+//##########################################################################
+//  Memory allocation
+    cout << "Memory allocation\n";
+
+    double *x, *y;
+    x = new double [N];
+    y = new double [N];
+
+    cout.precision(2);
+    cout << 2.0 * N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
+    cout.precision(6);
+
+//##########################################################################
+//  Data initialization
+//  Special:  x_i = i+1;  y_i = 1/x_i  ==> <x,y> == N
+    for (unsigned int i = 0; i < N; ++i)
+    {
+        x[i] = i + 1;
+        y[i] = 1.0 / x[i];
+    }
+
+//##########################################################################
+    cout << "\nStart Benchmarking\n";
+
+    auto tstart = system_clock::now();       // start timer
+
+// Do calculation
+    double sk(0.0);
+
+    for (int i = 0; i < NLOOPS; ++i)
+    {
+        sk = scalar(N, x, y);
+//      sk = norm(N,x);
+    }
+
+    auto tend = system_clock::now();         // end timer
+    auto duration = duration_cast<microseconds>(tend - tstart);
+    auto t1 = static_cast<double>(duration.count()) / 1e6 ;   // t1 in seconds
+    t1 /= NLOOPS;           // divide by number of function calls
+
+//##########################################################################
+// Check the correct result
+    cout << "\n <x,y> = " << sk << endl;
+    if (static_cast<unsigned int>(sk) != N)
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+//##########################################################################
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+    cout << "Timing in sec. : " << t1 << endl;
+    cout << "GFLOPS         : " << 2.0 * N / t1 / 1024 / 1024 / 1024 << endl;
+    cout << "GiByte/s        : " << 2.0 * N / t1 / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
+
+//##########################################################################
+//  Free allocated memory
+    delete [] y;
+    delete [] x;
+
+    return 0;
+}
--- a/ex3/seq/skalar/compile.log
+++ b/ex3/seq/skalar/compile.log
@ -0,0 +1 @@
+
--- a/ex3/seq/skalar/main.cpp
+++ b/ex3/seq/skalar/main.cpp
@ -0,0 +1,105 @@
+#include "mylib.h"
+
+#include <chrono>             // timing
+#include <cstdlib>            // atoi()
+#include <cstring>            // strncmp()
+#include <execution>          // policy
+#include <iostream>
+#include <numeric>            // transform_reduce
+#include <sstream>
+#include <vector>
+
+using namespace std;
+using namespace std::chrono;  // timing
+
+int main(int argc, char **argv)
+{
+    int const NLOOPS = 50;        // chose a value such that the benchmark runs at least 10 sec.
+    unsigned int N = 50000001;
+//##########################################################################
+//   Read Paramater from command line  (C++ style)
+    cout << "Checking command line parameters for: -n <number> " << endl;
+    for (int i = 1; i < argc; i++)
+    {
+        cout << " arg[" << i << "] = " << argv[i] << endl;
+        if (std::strncmp(argv[i], "-n", 2) == 0 && i + 1 < argc) // found "-n" followed by another parameter
+        {
+            N = static_cast<unsigned int>(atoi(argv[i + 1]));
+        }
+        else
+        {
+            cout << "Corect call: " << argv[0] << " -n  <number>\n";
+        }
+    }
+
+    cout << "\nN = " << N << endl;
+
+//##########################################################################
+//  Memory allocation
+    cout << "Memory allocation\n";
+
+    //double *x = new double [N];
+    //double *y = new double [N];
+    vector<double> x(N);
+    vector<double> y(N);
+    //alignas(16) double x[N], y[N];
+
+    cout.precision(2);
+    cout << 2.0 * N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GiByte Memory allocated\n";
+    cout.precision(6);
+
+//##########################################################################
+//  Data initialization
+//  Special:  x_i = i+1;  y_i = 1/x_i  ==> <x,y> == N
+    for (unsigned int i = 0; i < N; ++i)
+    {
+        x[i] = i + 1;
+        y[i] = 1.0 / x[i];
+    }
+
+//##########################################################################
+    cout << "\nStart Benchmarking\n";
+
+    auto tstart = system_clock::now();       // start timer
+
+// Do calculation
+    double sk(0.0);
+
+    for (int i = 0; i < NLOOPS; ++i)
+    {
+        //sk = scalar(N, x, y);
+        sk += scalar_unroll(N, x.data(), y.data());
+        //sk += transform_reduce(std::execution::par_unseq,cbegin(x),cend(x),cbegin(y),0.0);
+//      sk = norm(N,x);
+    }
+
+    auto tend = system_clock::now();         // end timer
+    auto duration = duration_cast<microseconds>(tend - tstart);
+    auto t1 = static_cast<double>(duration.count()) / 1e6 ;   // t1 in seconds
+    t1 /= NLOOPS;           // divide by number of function calls
+
+//##########################################################################
+// Check the correct result
+    sk /= NLOOPS;
+    cout << "\n <x,y> = " << sk << endl;
+    if (static_cast<unsigned int>(sk) != N)
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+//##########################################################################
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+    cout << "Timing in sec. : " << t1 << endl;
+    cout << "GFLOPS         : " << 2.0 * N / t1 / 1024 / 1024 / 1024 << endl;
+    cout << "GiByte/s        : " << 2.0 * N / t1 / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
+
+//##########################################################################
+//  Free allocated memory
+    //delete [] y;
+    //delete [] x;
+
+    return 0;
+}
--- a/ex3/seq/skalar/mylib.cpp
+++ b/ex3/seq/skalar/mylib.cpp
@ -0,0 +1,56 @@
+#include "mylib.h"
+#include <cassert>                  // assert()
+#include <cmath>
+#include <cstdlib>                  // alignof()
+//#include <iostream>
+#include <numeric>
+#include <vector>
+using namespace std;
+
+
+double scalar(unsigned int const N, double const x[], double const y[])
+{
+    double sum = 0.0;
+    for (unsigned int i = 0; i < N; ++i)
+    {
+        sum += x[i] * y[i];
+//     sum += exp(x[i])*log(y[i]);
+    }
+    return sum;
+}
+
+double scalar_unroll(unsigned int const N, double const x[], double const y[])
+{
+    constexpr unsigned int Stride{8};
+    alignas(32) double sk[Stride] = {0.0};
+    assert(alignof(sk)==32);
+    assert(alignof(x)==8);
+    assert(alignof(y)==8);
+    for (unsigned int i = 0; i < (N/Stride)*Stride; i+=Stride)
+    {
+        for (unsigned int k=0; k<Stride; ++k)
+        {
+            sk[k] += x[i+k] * y[i+k];
+        }
+    }
+    double sum = std::accumulate(sk, sk+Stride,0.0);
+    for (unsigned int i = (N/Stride)*Stride; i < N; ++i)
+    {
+        sum += x[i]*y[i];
+    }
+    
+    return sum;
+}
+
+
+
+double norm(unsigned int const N, double const x[])
+{
+    double sum = 0.0;
+    for (unsigned int i = 0; i < N; ++i)
+    {
+        sum += x[i] * x[i];
+    }
+    return std::sqrt(sum);
+}
+
--- a/ex3/seq/skalar/mylib.h
+++ b/ex3/seq/skalar/mylib.h
@ -0,0 +1,17 @@
+#pragma once
+
+/** 	Inner product
+	@param[in] N	number of vector elements
+	@param[in] x	vector
+	@param[in] y	vector
+	@return 	    resulting Euclidian inner product <x,y>
+*/
+double scalar(unsigned int N, double const x[], double const y[]);
+double scalar_unroll(unsigned int const N, double const x[], double const y[]);
+
+/** 	L_2 Norm of a vector
+	@param[in] N	number of vector elements
+	@param[in] x	vector
+	@return 	    resulting Euclidian norm <x,y>
+*/
+double norm(unsigned int N, double const x[]);
--- a/ex3/seq/skalar/small_Doxyfile
+++ b/ex3/seq/skalar/small_Doxyfile
--- a/ex3/seq/skalar_stl/Doxyfile
+++ b/ex3/seq/skalar_stl/Doxyfile
--- a/ex3/seq/skalar_stl/Makefile
+++ b/ex3/seq/skalar_stl/Makefile
@ -0,0 +1,30 @@
+#
+# use GNU-Compiler tools
+COMPILER=GCC_
+# alternatively from the shell
+# export COMPILER=GCC_
+# or, alternatively from the shell
+# make COMPILER=GCC_
+
+# use Intel compilers
+#COMPILER=ICC_
+
+# use PGI compilers
+# COMPILER=PGI_
+
+
+SOURCES = main.cpp mylib.cpp
+OBJECTS = $(SOURCES:.cpp=.o)
+
+PROGRAM	= main.${COMPILER}
+
+# uncomment the next to lines for debugging and detailed performance analysis
+CXXFLAGS += -g
+LINKFLAGS += -g
+# do not use -pg with PGI compilers
+
+ifndef COMPILER
+  COMPILER=GCC_
+endif
+
+include ../${COMPILER}default.mk
--- a/ex3/seq/skalar_stl/compile.log
+++ b/ex3/seq/skalar_stl/compile.log
@ -0,0 +1 @@
+
--- a/ex3/seq/skalar_stl/main.cpp
+++ b/ex3/seq/skalar_stl/main.cpp
@ -0,0 +1,118 @@
+#include "mylib.h"
+#include <cassert>
+#include <chrono>           // timing
+#include <cmath>            // sqrt()
+#include <cstdlib>          // atoi()
+#include <cstring>          // strncmp()
+#include <ctime>
+#include <iostream>
+#include <sstream>
+using namespace std;
+using namespace std::chrono;  // timing
+
+int main(int argc, char **argv)
+{
+    int const NLOOPS = 50;        // chose a value such that the benchmark runs at least 10 sec.
+    unsigned int N = 50000001;
+//##########################################################################
+//   Read Paramater from command line  (C++ style)
+    cout << "Checking command line parameters for: -n <number> " << endl;
+    for (int i = 1; i < argc; i++)
+    {
+        cout << " arg[" << i << "] = " << argv[i] << endl;
+        if (std::strncmp(argv[i], "-n", 2) == 0 && i + 1 < argc) // found "-n" followed by another parameter
+        {
+            N = static_cast<unsigned int>(atoi(argv[i + 1]));
+        }
+        else
+        {
+            cout << "Corect call: " << argv[0] << " -n  <number>\n";
+        }
+    }
+
+    cout << "\nN = " << N << endl;
+
+//##########################################################################
+//  Memory allocation
+    cout << "Memory allocation\n";
+
+    vector<double> x(N), y(N);
+
+    cout.precision(2);
+    cout << 2.0 * N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
+    cout.precision(6);
+
+//##########################################################################
+//  Data initialization
+//  Special:  x_i = i+1;  y_i = 1/x_i  ==> <x,y> == N
+    for (unsigned int i = 0; i < N; ++i)
+    {
+        x[i] = i + 1;
+        y[i] = 1.0 / x[i];
+    }
+
+//##########################################################################
+    cout << "\nStart Benchmarking\n";
+
+    auto t1 = system_clock::now(); // start timer
+// Do calculation
+    double sk(0.0),ss(0.0);
+    for (int i = 0; i < NLOOPS; ++i)
+    {
+        sk = scalar(x, y);
+        //sk = scalar_cblas(x, y);
+      //sk = norm(x);
+        ss += sk;                   // prevents the optimizer from removing unused calculation results.
+    }
+
+    auto t2 = system_clock::now();  // stop timer
+    auto duration = duration_cast<microseconds>(t2 - t1);        // duration in microseconds
+    double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
+    t_diff = t_diff/NLOOPS;                                      // duration per loop seconds
+    
+    //assert(std::abs(ss/NLOOPS-sk)<1e-5);  // avoids unsafe floating point comparison "==" 
+
+//##########################################################################
+// Check the correct result
+    cout << "\n <x,y> = " << sk << endl;
+    if (static_cast<unsigned int>(sk) != N)
+    {
+        cout << "  !!   W R O N G  result   !!\n";
+    }
+    cout << endl;
+
+//##########################################################################
+// Timings  and Performance
+    cout << endl;
+    cout.precision(2);
+    cout << "Timing in sec. : " << t_diff << endl;
+    cout << "GFLOPS         : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 << endl;
+    cout << "GiByte/s       : " << 2.0 * N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
+
+//##########################################################################
+    cout << "\nStart Benchmarking norm\n";
+
+    auto t3 = system_clock::now(); // start timer
+// Do calculation
+    double ss2(0.0);
+    for (int i = 0; i < NLOOPS; ++i)
+    {
+        auto sk = sqrt(scalar(x, x));
+        //auto sk = norm(x);         // the same timing as scalar(x, x)
+        ss2 += sk;                   // prevents the optimizer from removing unused calculation results.
+    }
+
+    auto t4 = system_clock::now();  // stop timer
+    auto duration2 = duration_cast<microseconds>(t4 - t3);        // duration in microseconds
+    double t_diff2 = static_cast<double>(duration2.count()) / 1e6; // overall duration in seconds
+    t_diff2 = t_diff2/NLOOPS;                                      // duration per loop seconds
+    
+    //assert(std::abs(ss/NLOOPS-sk)<1e-5);  // avoids unsafe floating point comparison "==" 
+    cout << "ss(norm): " << ss2 << endl;
+    cout << "Timing in sec. : " << t_diff2 << endl;
+
+//##########################################################################
+
+
+    return 0;
+}  // memory for x and y will be deallocated by their destructors
--- a/ex3/seq/skalar_stl/mylib.cpp
+++ b/ex3/seq/skalar_stl/mylib.cpp
@ -0,0 +1,65 @@
+#include "mylib.h"
+#include <cassert>       // assert()
+#include <cmath>
+#include <vector>
+
+#ifdef __INTEL_CLANG_COMPILER
+#pragma message(" ##########  Use of MKL  ###############")
+#include <mkl.h>
+#else
+#pragma message(" ##########  Use of CBLAS  ###############")
+//extern "C"
+//{
+#include <cblas.h>               // cBLAS Library
+#include <lapacke.h>             // Lapack
+//}
+#endif
+
+using namespace std;
+
+double scalar(vector<double> const &x, vector<double> const &y)
+{
+    assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
+    size_t const N = x.size();
+    double sum = 0.0;
+    for (size_t i = 0; i < N; ++i)
+    {
+        sum += x[i] * y[i];
+        //sum += exp(x[i])*log(y[i]);
+    }
+    return sum;
+}
+
+
+double scalar_cblas(vector<double> const &x, vector<double> const &y)
+{
+    int const asize = static_cast<int>(size(x));
+    int const bsize = static_cast<int>(size(y));
+    assert(asize == bsize); // switch off via compile flag: -DNDEBUG
+	return cblas_ddot(asize,x.data(),1,y.data(),1);    
+    //assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
+	//return cblas_ddot(x.size(),x.data(),1,y.data(),1);
+}
+
+float scalar_cblas(vector<float> const &x, vector<float> const &y)
+{
+    int const asize = static_cast<int>(size(x));
+    int const bsize = static_cast<int>(size(y));
+    assert(asize == bsize); // switch off via compile flag: -DNDEBUG
+	return cblas_sdot(asize,x.data(),1,y.data(),1);    
+    //assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
+	//return cblas_ddot(x.size(),x.data(),1,y.data(),1);
+}
+
+
+double norm(vector<double> const &x)
+{
+    size_t const N = x.size();
+    double sum = 0.0;
+    for (size_t i = 0; i < N; ++i)
+    {
+        sum += x[i] * x[i];
+    }
+    return std::sqrt(sum);
+}
+
--- a/ex3/seq/skalar_stl/mylib.h
+++ b/ex3/seq/skalar_stl/mylib.h
@ -0,0 +1,30 @@
+#ifndef FILE_MYLIB
+#define FILE_MYLIB
+#include <vector>
+
+/** 	Inner product
+	@param[in] x	vector
+	@param[in] y	vector
+	@return 	    resulting Euclidian inner product <x,y>
+*/
+double scalar(std::vector<double> const &x, std::vector<double> const &y);
+
+/** 	Inner product using BLAS routines
+	@param[in] x	vector
+	@param[in] y	vector
+	@return 	    resulting Euclidian inner product <x,y>
+*/
+double scalar_cblas(std::vector<double> const &x, std::vector<double> const &y);
+float scalar_cblas(std::vector<float> const &x, std::vector<float> const &y);
+
+
+/** 	L_2 Norm of a vector
+	@param[in] x	vector
+	@return 	    resulting Euclidian norm <x,y>
+*/
+double norm(std::vector<double> const &x);
+
+
+
+
+#endif
--- a/ex3/seq/skalar_stl/small_Doxyfile
+++ b/ex3/seq/skalar_stl/small_Doxyfile
--- a/ex3/seq/thread_17/Doxyfile
+++ b/ex3/seq/thread_17/Doxyfile
--- a/ex3/seq/thread_17/Makefile
+++ b/ex3/seq/thread_17/Makefile
@ -0,0 +1,48 @@
+#
+# use GNU-Compiler tools
+COMPILER=GCC_
+# alternatively from the shell
+# export COMPILER=GCC_
+# or, alternatively from the shell
+# make COMPILER=GCC_
+
+# use Intel compilers
+#COMPILER=ICC_
+
+# use PGI compilers
+# COMPILER=PGI_
+
+# use CLANG compilers
+# COMPILER=CLANG_
+
+
+#SOURCES = main.cpp
+SOURCES = gh_main.cpp
+OBJECTS = $(SOURCES:.cpp=.o)
+
+PROGRAM	= main.${COMPILER}
+
+# uncomment the next to lines for debugging and detailed performance analysis
+CXXFLAGS += -g
+LINKFLAGS += -g
+# needed for parallel threads in C++-17
+LINKFLAGS += -ltbb
+
+#https://stackoverflow.com/questions/67923287/how-to-resolve-no-member-named-task-in-namespace-tbb-error-when-using-oned
+#dpcpp my_app.cpp -DPSTL_USE_PARALLEL_POLICIES=0 # for libstdc++ 9
+#dpcpp my_app.cpp -D_GLIBCXX_USE_TBB_PAR_BACKEND=0 # for libstdc++ 10
+
+
+# do not use -pg with PGI compilers
+
+ifndef COMPILER
+  COMPILER=GCC_
+endif
+
+include ../${COMPILER}default.mk
+
+
+task:
+	@pdflatex task
+	@pdflatex task
+
--- a/ex3/seq/thread_17/compile.log
+++ b/ex3/seq/thread_17/compile.log
@ -0,0 +1 @@
+
--- a/ex3/seq/thread_17/gh_main.cpp
+++ b/ex3/seq/thread_17/gh_main.cpp
@ -0,0 +1,101 @@
+// C++ Vorlesung  xxx
+// C++-17: Threads, execution policies
+//   Great web pages, great book by Filipek:   https://www.bfilipek.com/2018/06/parstl-tests.html
+
+/*
+ g++ -O3 -std=c++17 gh_main.cpp -ltbb
+ g++ -O3 -std=c++17 -pedantic -Weffc++ -Wall -Wextra -pedantic -Wswitch-default -Wfloat-equal -Wundef -Wredundant-decls -Winit-self -Wshadow -Wparentheses -Wshadow -Wunreachable-code -Wuninitialized -Wmaybe-uninitialized gh_main.cpp -ltbb
+ ---
+ cppcheck --enable=all --inconclusive --std=c++11 --std=posix --suppress=missingIncludeSystem gh_main.cpp
+ clang++ -O3 -std=c++17 -ltbb gh_main.cpp
+ clang++ -std=c++17 -fsyntax-only -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic gh_main.cpp
+ clang++ -std=c++17 -Weverything -Wno-c++98-compat -Wno-padded -ltbb gh_main.cpp
+ clang++ -cc1 --help
+ ---
+ icpc -O3 -std=c++17 -ltbb -Wall -Wextra -pedantic gh_main.cpp
+*/
+#include <algorithm>
+#include <chrono>
+#include <execution>          // execution policy
+#include <iostream>
+#include <list>               // list
+#include <numeric>            // accumulate
+#include <random>
+#include <vector>
+using namespace std;
+using namespace std::chrono;  // timing
+
+
+/** \brief Prints the whole vector of base class pointers
+ *  \param[in,out]  s   output stream
+ *  \param[in]      v   vector
+ *  \return        changed output stream
+ */
+template <class T>
+ostream& operator<<(ostream &s, const vector<T>& v)
+{
+    for (const auto& it: v)             // Reference is required with unique_ptr. No copy constructor for unique_ptr available!
+    {
+        cout << *it << " ";
+    }
+    return s;
+}
+
+
+int main()
+{
+    cout << "Threads C++17" << endl;
+
+    size_t const N = 1<<25;
+    vector<double> v(N);
+    //list<double> v(N);       // execution::par not possible : missing 'operator-'
+    iota(v.begin(), v.end(), 1);
+    std::shuffle(v.begin(), v.end(), std::mt19937{std::random_device{}()});
+
+    auto const v_bak(v);
+
+    {
+        v = v_bak;
+        cout << "----   sort old   ----" << endl;
+        auto t1 = system_clock::now();
+        sort(v.begin(), v.end());
+        auto t2 = system_clock::now();
+        auto duration = duration_cast<microseconds>(t2 - t1);
+        cout << "sort old   : " << static_cast<double>(duration.count()) / 1e6 << " sec." << endl;
+    }
+
+    {
+        v = v_bak;
+        cout << "----   sort seq   ----" << endl;
+        auto t1 = system_clock::now();
+        sort(std::execution::seq, v.begin(), v.end());
+        auto t2 = system_clock::now();
+        auto duration = duration_cast<microseconds>(t2 - t1);
+        cout << "sort seq   : " << static_cast<double>(duration.count()) / 1e6 << " sec." << endl;
+    }
+
+    {
+        v = v_bak;
+        cout << "----   sort par   ----" << endl;
+        auto t1 = system_clock::now();
+        sort(std::execution::par, v.begin(), v.end());
+        //auto cnt = count(std::execution::par, v.begin(), v.end(), 17);
+        auto t2 = system_clock::now();
+        auto duration = duration_cast<microseconds>(t2 - t1);
+        cout << "sort par   : " << static_cast<double>(duration.count()) / 1e6 << " sec." << endl;
+    }
+    
+    {
+        v = v_bak;
+        cout << "----   sort par_unseq   ----" << endl;
+        auto t1 = system_clock::now();
+        sort(std::execution::par_unseq, v.begin(), v.end());
+        auto t2 = system_clock::now();
+        auto duration = duration_cast<microseconds>(t2 - t1);
+        cout << "sort par_unseq: " << static_cast<double>(duration.count()) / 1e6 << " sec." << endl;
+    }
+
+
+    return 0;
+}
+
--- a/ex3/seq/thread_17/main.cpp
+++ b/ex3/seq/thread_17/main.cpp
@ -0,0 +1,84 @@
+// C++ Vorlesung  xxx
+// C++-17: Threads, execution policies
+//   Great web pages, great book by Filipek:   https://www.bfilipek.com/2018/06/parstl-tests.html
+
+/*
+ g++ -O3 -std=c++17 main.cpp -ltbb
+ g++ -O3 -std=c++17 -pedantic -Weffc++ -Wall -Wextra -pedantic -Wswitch-default -Wfloat-equal -Wundef -Wredundant-decls -Winit-self -Wshadow -Wparentheses -Wshadow -Wunreachable-code -Wuninitialized -Wmaybe-uninitialized main.cpp -ltbb
+ ---
+ cppcheck --enable=all --inconclusive --std=c++11 --std=posix --suppress=missingIncludeSystem main.cpp
+ clang++ -O3 -std=c++17 -ltbb main.cpp
+ clang++ -std=c++17 -fsyntax-only -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic main.cpp
+ clang++ -std=c++17 -Weverything -Wno-c++98-compat -Wno-padded -ltbb main.cpp
+ clang++ -cc1 --help
+ ---
+ icpc -O3 -std=c++17 -ltbb -Wall -Wextra -pedantic main.cpp
+*/
+
+#include <algorithm>
+#include <chrono>
+#include <execution>          // execution policy
+#include <iostream>
+#include <numeric>            //  accumulate
+#include <random>
+#include <vector>
+using namespace std;
+using namespace std::chrono;  // timing
+
+// Great web pages, great book by Filipek
+// https://www.bfilipek.com/2018/06/parstl-tests.html
+template <typename TFunc> void RunAndMeasure(const char *title, TFunc func)
+{
+    const auto start = std::chrono::steady_clock::now();
+    auto ret = func();
+    const auto end = std::chrono::steady_clock::now();
+    std::cout << title << ": " <<
+              std::chrono::duration <double, std::milli>(end - start).count()
+              << " ms, res " << ret << "\n";
+}
+
+int main()
+{
+    //std::vector<double> v(6000000, 0.5);
+    std::vector<double> v(1<<30, 0.5);
+
+    RunAndMeasure("std::warm up", [&v]
+    {
+        return std::reduce(std::execution::seq, v.begin(), v.end(), 0.0);
+    });
+
+    RunAndMeasure("std::accumulate", [&v]
+    {
+        return std::accumulate(v.begin(), v.end(), 0.0);
+    });
+
+    RunAndMeasure("std::reduce, seq", [&v]
+    {
+        return std::reduce(std::execution::seq, v.begin(), v.end(), 0.0);
+    });
+
+    RunAndMeasure("std::reduce, par", [&v]
+    {
+        return std::reduce(std::execution::par, v.begin(), v.end(), 0.0);
+    });
+
+    RunAndMeasure("std::reduce, par_unseq", [&v]
+    {
+        return std::reduce(std::execution::par_unseq, v.begin(), v.end(), 0.0);
+    });
+
+    RunAndMeasure("std::find, seq", [&v]
+    {
+        auto res = std::find(std::execution::seq, std::begin(v), std::end(v), 0.6);
+        return res == std::end(v) ? 0.0 : 1.0;
+    });
+
+    RunAndMeasure("std::find, par", [&v]
+    {
+        auto res = std::find(std::execution::par, std::begin(v), std::end(v), 0.6);
+        return res == std::end(v) ? 0.0 : 1.0;
+    });
+
+    return 0;
+}
+
Author	SHA1	Message	Date
dino.celebic	3882aee07a	task8	2025-11-11 15:50:51 +01:00
dino.celebic	3763c53dab	task8	2025-11-11 15:50:18 +01:00