Pushing everything again, accidentally deleted my remote repository

This commit is contained in:
jakob.schratter 2025-12-09 22:06:13 +01:00
commit 1bee3e8e5b
101 changed files with 9428 additions and 0 deletions

71
ex3/.vscode/settings.json vendored Normal file
View file

@ -0,0 +1,71 @@
{
"files.associations": {
"array": "cpp",
"atomic": "cpp",
"bit": "cpp",
"cctype": "cpp",
"charconv": "cpp",
"chrono": "cpp",
"clocale": "cpp",
"cmath": "cpp",
"compare": "cpp",
"complex": "cpp",
"concepts": "cpp",
"condition_variable": "cpp",
"cstdarg": "cpp",
"cstddef": "cpp",
"cstdint": "cpp",
"cstdio": "cpp",
"cstdlib": "cpp",
"cstring": "cpp",
"ctime": "cpp",
"cwchar": "cpp",
"cwctype": "cpp",
"deque": "cpp",
"forward_list": "cpp",
"string": "cpp",
"unordered_map": "cpp",
"unordered_set": "cpp",
"vector": "cpp",
"exception": "cpp",
"algorithm": "cpp",
"functional": "cpp",
"iterator": "cpp",
"memory": "cpp",
"memory_resource": "cpp",
"numeric": "cpp",
"optional": "cpp",
"random": "cpp",
"ratio": "cpp",
"string_view": "cpp",
"system_error": "cpp",
"tuple": "cpp",
"type_traits": "cpp",
"utility": "cpp",
"format": "cpp",
"initializer_list": "cpp",
"iomanip": "cpp",
"iosfwd": "cpp",
"iostream": "cpp",
"istream": "cpp",
"limits": "cpp",
"mutex": "cpp",
"new": "cpp",
"numbers": "cpp",
"ostream": "cpp",
"semaphore": "cpp",
"span": "cpp",
"sstream": "cpp",
"stdexcept": "cpp",
"stop_token": "cpp",
"streambuf": "cpp",
"thread": "cpp",
"cinttypes": "cpp",
"typeindex": "cpp",
"typeinfo": "cpp",
"variant": "cpp",
"cassert": "cpp",
"list": "cpp",
"fstream": "cpp"
}
}

123
ex3/CLANG_default.mk Normal file
View file

@ -0,0 +1,123 @@
# Basic Defintions for using GNU-compiler suite sequentially
# requires setting of COMPILER=CLANG_
#CLANGPATH=//usr/lib/llvm-10/bin/
CC = ${CLANGPATH}clang
CXX = ${CLANGPATH}clang++
#CXX = ${CLANGPATH}clang++ -lomptarget -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=/opt/pgi/linux86-64/2017/cuda/8.0
#F77 = gfortran
LINKER = ${CXX}
#http://clang.llvm.org/docs/UsersManual.html#options-to-control-error-and-warning-messages
WARNINGS += -Weverything -Wno-c++98-compat -Wno-sign-conversion -Wno-date-time -Wno-shorten-64-to-32 -Wno-padded -ferror-limit=1
WARNINGS += -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic
#-fsyntax-only -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic
CXXFLAGS += -O3 -std=c++17 -ferror-limit=1 ${WARNINGS}
# don't use -Ofast
# -ftrapv
LINKFLAGS += -O3
# different libraries in Ubuntu or manajaró
ifndef UBUNTU
UBUNTU=1
endif
# BLAS, LAPACK
LINKFLAGS += -llapack -lblas
# -lopenblas
ifeq ($(UBUNTU),1)
# ubuntu
else
# on archlinux
LINKFLAGS += -lcblas
endif
# interprocedural optimization
CXXFLAGS += -flto
LINKFLAGS += -flto
# very good check
# http://clang.llvm.org/extra/clang-tidy/
# good check, see: http://llvm.org/docs/CodingStandards.html#include-style
SWITCH_OFF=,-readability-magic-numbers,-readability-redundant-control-flow,-readability-redundant-member-init
SWITCH_OFF+=,-readability-redundant-member-init,-readability-isolate-declaration
#READABILITY=,readability*${SWITCH_OFF}
#TIDYFLAGS = -checks=llvm-*,-llvm-header-guard -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
TIDYFLAGS = -checks=llvm-*,-llvm-header-guard${READABILITY} -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
#TIDYFLAGS += -checks='modernize*
# ???
#TIDYFLAGS = -checks='cert*' -header-filter=.*
# MPI checks ??
#TIDYFLAGS = -checks='mpi*'
# ??
#TIDYFLAGS = -checks='performance*' -header-filter=.*
#TIDYFLAGS = -checks='portability-*' -header-filter=.*
#TIDYFLAGS = -checks='readability-*' -header-filter=.*
default: ${PROGRAM}
${PROGRAM}: ${OBJECTS}
$(LINKER) $^ ${LINKFLAGS} -o $@
clean:
@rm -f ${PROGRAM} ${OBJECTS}
clean_all:: clean
@rm -f *_ *~ *.bak *.log *.out *.tar
codecheck: tidy_check
tidy_check:
clang-tidy ${SOURCES} ${TIDYFLAGS} -- ${SOURCES}
# see also http://clang-developers.42468.n3.nabble.com/Error-while-trying-to-load-a-compilation-database-td4049722.html
run: clean ${PROGRAM}
# time ./${PROGRAM} ${PARAMS}
./${PROGRAM} ${PARAMS}
# tar the current directory
MY_DIR = `basename ${PWD}`
tar: clean_all
@echo "Tar the directory: " ${MY_DIR}
@cd .. ;\
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
# tar cf `basename ${PWD}`.tar *
doc:
doxygen Doxyfile
#########################################################################
.cpp.o:
$(CXX) -c $(CXXFLAGS) -o $@ $<
.c.o:
$(CC) -c $(CFLAGS) -o $@ $<
.f.o:
$(F77) -c $(FFLAGS) -o $@ $<
##################################################################################################
# some tools
# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags)
cache: ${PROGRAM}
valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS}
# kcachegrind callgrind.out.<pid> &
kcachegrind `ls -1tr callgrind.out.* |tail -1`
# Check for wrong memory accesses, memory leaks, ...
# use smaller data sets
mem: ${PROGRAM}
valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS}
# Simple run time profiling of your code
# CXXFLAGS += -g -pg
# LINKFLAGS += -pg
prof: ${PROGRAM}
perf record ./$^ ${PARAMS}
perf report
# gprof -b ./$^ > gp.out
# kprof -f gp.out -p gprof &
codecheck: tidy_check

130
ex3/GCC_AMD32_default.mk Normal file
View file

@ -0,0 +1,130 @@
# Basic Defintions for using GNU-compiler suite sequentially
# requires setting of COMPILER=GCC_
CC = gcc
CXX = g++
F77 = gfortran
LINKER = ${CXX}
# on mephisto:
#CXXFLAGS += -I/share/apps/atlas/include
#LINKFLAGS += -L/share/apps/atlas/lib
#LINKFLAGS += -lcblas -latlas
#LINKFLAGS += -lblas
# Der <cblas.h> Header muss mit extern "C" versehen werden, damit g++ alles findet.
#WARNINGS = -pedantic -pedantic-errors -Wall -Wextra -Werror -Wconversion -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow
WARNINGS = -pedantic -Wall -Wextra -Wconversion -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
-Wredundant-decls -Winline -fmax-errors=1
# -Wunreachable-code
# -Wunreachable-code
CXXFLAGS += -ffast-math -O3 -march=native -std=c++17 ${WARNINGS}
#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS}
#-msse3
# -ftree-vectorizer-verbose=2 -DNDEBUG
# -ftree-vectorizer-verbose=5
# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump -fdump-tree-pre=stderr
# CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details
# CFLAGS = -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2
# #CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
# FFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
# LFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
LINKFLAGS += -O3
# BLAS, LAPACK
OPENBLAS_DIR = /opt/openblas_GCCseq
#OPENBLAS_DIR = /opt/openblas_GCC
OPENBLAS_LIBDIR = ${OPENBLAS_DIR}/lib
OPENBLAS_INCDIR = ${OPENBLAS_DIR}/include
CXXFLAGS += -I${OPENBLAS_INCDIR}
LINKFLAGS += -L${OPENBLAS_LIBDIR} -lopenblas
# interprocedural optimization
CXXFLAGS += -flto
LINKFLAGS += -flto
# profiling tools
#CXXFLAGS += -pg
#LINKFLAGS += -pg
default: ${PROGRAM}
${PROGRAM}: ${OBJECTS}
$(LINKER) $^ ${LINKFLAGS} -o $@
clean:
@rm -f ${PROGRAM} ${OBJECTS}
clean_all:: clean
-@rm -f *_ *~ *.bak *.log *.out *.tar *.orig
-@rm -r html
run: clean ${PROGRAM}
# time ./${PROGRAM}
# ./${PROGRAM}
( export LD_LIBRARY_PATH=${OPENBLAS_LIBDIR}:${LD_LIBRARY_PATH} ; ./${PROGRAM} )
# or 'export LD_LIBRARY_PATH=/opt/openblas_gcc/lib:${LD_LIBRARY_PATH}' in your ~/.bashrc
# tar the current directory
MY_DIR = `basename ${PWD}`
tar:
@echo "Tar the directory: " ${MY_DIR}
@cd .. ;\
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
# tar cf `basename ${PWD}`.tar *
doc:
doxygen Doxyfile
#########################################################################
.cpp.o:
$(CXX) -c $(CXXFLAGS) -o $@ $<
.c.o:
$(CC) -c $(CFLAGS) -o $@ $<
.f.o:
$(F77) -c $(FFLAGS) -o $@ $<
##################################################################################################
# some tools
# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags)
cache: ${PROGRAM}
valgrind --tool=callgrind --simulate-cache=yes ./$^
# kcachegrind callgrind.out.<pid> &
kcachegrind `ls -1tr callgrind.out.* |tail -1`
# Check for wrong memory accesses, memory leaks, ...
# use smaller data sets
# no "-pg" in compile/link options
mem: ${PROGRAM}
valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^
# Simple run time profiling of your code
# CXXFLAGS += -g -pg
# LINKFLAGS += -pg
prof: ${PROGRAM}
./$^
gprof -b ./$^ > gp.out
# kprof -f gp.out -p gprof &
#Trace your heap:
#> heaptrack ./main.GCC_
#> heaptrack_gui heaptrack.main.GCC_.<pid>.gz
heap: ${PROGRAM}
heaptrack ./$^ 11
heaptrack_gui `ls -1tr heaptrack.$^.* |tail -1` &
########################################################################
# get the detailed status of all optimization flags
info:
echo "detailed status of all optimization flags"
$(CXX) --version
$(CXX) -Q $(CXXFLAGS) --help=optimizers

183
ex3/GCC_default.mk Normal file
View file

@ -0,0 +1,183 @@
# Basic Defintions for using GNU-compiler suite sequentially
# requires setting of COMPILER=GCC_
CC = gcc
CXX = g++
F77 = gfortran
LINKER = ${CXX}
WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
-Wredundant-decls -Winline -fmax-errors=1
# -Wunreachable-code
CXXFLAGS += -ffast-math -O3 -march=native -std=c++17 ${WARNINGS}
#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS}
#-msse3
# -ftree-vectorizer-verbose=2 -DNDEBUG
# -ftree-vectorizer-verbose=5
# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump -fdump-tree-pre=stderr
# CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details
# CFLAGS = -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2
# #CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
# FFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
# LFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
LINKFLAGS += -O3
#architecture
#CPU = -march=znver2
CXXFLAGS += ${CPU}
LINKFLAGS += ${CPU}
# different libraries in Ubuntu or manajaró
ifndef UBUNTU
UBUNTU=1
endif
# BLAS, LAPACK
ifeq ($(UBUNTU),1)
LINKFLAGS += -llapack -lblas
# -lopenblas
else
# on archlinux
LINKFLAGS += -llapack -lopenblas -lcblas
endif
# interprocedural optimization
CXXFLAGS += -flto
LINKFLAGS += -flto
# for debugging purpose (save code)
# -fsanitize=leak # only one out the three can be used
# -fsanitize=address
# -fsanitize=thread
SANITARY = -fsanitize=address -fsanitize=undefined -fsanitize=null -fsanitize=return \
-fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \
-fsanitize=bool -fsanitize=enum -fsanitize=vptr
#CXXFLAGS += ${SANITARY}
#LINKFLAGS += ${SANITARY}
# profiling tools
#CXXFLAGS += -pg
#LINKFLAGS += -pg
default: ${PROGRAM}
${PROGRAM}: ${OBJECTS}
$(LINKER) $^ ${LINKFLAGS} -o $@
clean:
@rm -f ${PROGRAM} ${OBJECTS}
clean_all:: clean
-@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt
-@rm -rf html
run: clean ${PROGRAM}
#run: ${PROGRAM}
# time ./${PROGRAM} ${PARAMS}
./${PROGRAM} ${PARAMS}
# tar the current directory
MY_DIR = `basename ${PWD}`
tar: clean_all
@echo "Tar the directory: " ${MY_DIR}
@cd .. ;\
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
# tar cf `basename ${PWD}`.tar *
#find . -size +10M > large_files
#--exclude-from ${MY_DIR}/large_files
zip: clean
@echo "Zip the directory: " ${MY_DIR}
@cd .. ;\
zip -r ${MY_DIR}.zip ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
doc:
doxygen Doxyfile
#########################################################################
.SUFFIXES: .f90
.cpp.o:
$(CXX) -c $(CXXFLAGS) -o $@ $<
# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $<.log
# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $(<:.cpp=.log)
.c.o:
$(CC) -c $(CFLAGS) -o $@ $<
.f.o:
$(F77) -c $(FFLAGS) -o $@ $<
.f90.o:
$(F77) -c $(FFLAGS) -o $@ $<
##################################################################################################
# some tools
# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags)
cache: ${PROGRAM}
valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS}
# kcachegrind callgrind.out.<pid> &
kcachegrind `ls -1tr callgrind.out.* |tail -1`
# Check for wrong memory accesses, memory leaks, ...
# use smaller data sets
# no "-pg" in compile/link options
mem: ${PROGRAM}
valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS}
# Graphical interface
# valkyrie
# Simple run time profiling of your code
# CXXFLAGS += -g -pg
# LINKFLAGS += -pg
prof: ${PROGRAM}
perf record ./$^ ${PARAMS}
perf report
# gprof -b ./$^ > gp.out
# kprof -f gp.out -p gprof &
# perf in Ubuntu 20.04: https://www.howtoforge.com/how-to-install-perf-performance-analysis-tool-on-ubuntu-20-04/
# * install
# * sudo vi /etc/sysctl.conf
# add kernel.perf_event_paranoid = 0
#Trace your heap:
#> heaptrack ./main.GCC_
#> heaptrack_gui heaptrack.main.GCC_.<pid>.gz
heap: ${PROGRAM}
heaptrack ./$^ ${PARAMS}
heaptrack_gui `ls -1tr heaptrack.$^.* |tail -1` &
codecheck: $(SOURCES)
cppcheck --enable=all --inconclusive --std=c++17 --suppress=missingIncludeSystem $^
########################################################################
# get the detailed status of all optimization flags
info:
echo "detailed status of all optimization flags"
$(CXX) --version
$(CXX) -Q $(CXXFLAGS) --help=optimizers
lscpu
inxi -C
lstopo
# Excellent hardware info
# hardinfo
# Life monitoring of CPU frequency etc.
# sudo i7z
# Memory consumption
# vmstat -at -SM 3
# xfce4-taskmanager
# https://www.tecmint.com/check-linux-cpu-information/
#https://www.tecmint.com/monitor-cpu-and-gpu-temperature-in-ubuntu/
# Debugging:
# https://wiki.archlinux.org/index.php/Debugging

137
ex3/ICC_default.mk Normal file
View file

@ -0,0 +1,137 @@
# Basic Defintions for using INTEL compiler suite sequentially
# requires setting of COMPILER=ICC_
#BINDIR = /opt/intel/bin/
# special on my sony [GH]
#BINDIR = /opt/save.intel/bin/
# very special on my sony [GH]
# FIND_LIBS = -L /opt/save.intel/composer_xe_2013.1.117/mkl/lib/intel64/libmkl_intel_lp64.so
# Error with g++-4.8 using icpc14.0,x:
# find directory wherein bits/c++config.h is located
# 'locate bits/c++config.h'
#FOUND_CONFIG = -I/usr/include/x86_64-linux-gnu/c++/4.8
CC = ${BINDIR}icc
CXX = ${BINDIR}icpc
F77 = ${BINDIR}ifort
LINKER = ${CXX}
WARNINGS = -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -wd2015,2012 -wn3
# -Winline -Wredundant-decls -Wunreachable-code
CXXFLAGS += -O3 -fargument-noalias -std=c++17 -DNDEBUG ${WARNINGS} -mkl ${FOUND_CONFIG}
# profiling tools
#CXXFLAGS += -pg
#LINKFLAGS += -pg
# -vec-report=3
# -qopt-report=5 -qopt-report-phase=vec
# -guide -parallel
# -guide-opts=string -guide-par[=n] -guide-vec[=n]
# -auto-p32 -simd
CXXFLAGS += -align
# use MKL by INTEL
# https://software.intel.com/content/www/us/en/develop/documentation/mkl-linux-developer-guide/top/linking-your-application-with-the-intel-math-kernel-library/linking-quick-start/using-the-mkl-compiler-option.html
# https://software.intel.com/content/www/us/en/develop/articles/intel-mkl-link-line-advisor.html
# LINKFLAGS += -L${BINDIR}../composer_xe_2013.1.117/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
#LINKFLAGS += -O3 -L/opt/intel/mkl/lib -mkl
LINKFLAGS += -O3 -mkl=sequential
# interprocedural optimization
CXXFLAGS += -ipo
LINKFLAGS += -ipo
# annotated assembler file
ANNOTED = -fsource-asm -S
default: ${PROGRAM}
${PROGRAM}: ${OBJECTS}
$(LINKER) $^ ${LINKFLAGS} -o $@
clean:
rm -f ${PROGRAM} ${OBJECTS}
clean_all:: clean
@rm -f *_ *~ *.bak *.log *.out *.tar
run: clean ${PROGRAM}
./${PROGRAM}
# tar the current directory
MY_DIR = `basename ${PWD}`
tar: clean_all
@echo "Tar the directory: " ${MY_DIR}
@cd .. ;\
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
# tar cf `basename ${PWD}`.tar *
doc:
doxygen Doxyfile
#########################################################################
.cpp.o:
$(CXX) -c $(CXXFLAGS) -o $@ $<
.c.o:
$(CC) -c $(CFLAGS) -o $@ $<
.f.o:
$(F77) -c $(FFLAGS) -o $@ $<
##################################################################################################
# # some tools
# # Cache behaviour (CXXFLAGS += -g tracks down to source lines)
# cache: ${PROGRAM}
# valgrind --tool=callgrind --simulate-cache=yes ./$^
# # kcachegrind callgrind.out.<pid> &
#
# # Check for wrong memory accesses, memory leaks, ...
# # use smaller data sets
# mem: ${PROGRAM}
# valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^
#
# # Simple run time profiling of your code
# # CXXFLAGS += -g -pg
# # LINKFLAGS += -pg
# prof: ${PROGRAM}
# ./$^
# gprof -b ./$^ > gp.out
# # kprof -f gp.out -p gprof &
#
mem: inspector
prof: amplifier
cache: amplifier
gap_par_report:
${CXX} -c -guide -parallel $(SOURCES) 2> gap.txt
# GUI for performance report
amplifier: ${PROGRAM}
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
# alternatively to the solution above:
#edit file /etc/sysctl.d/10-ptrace.conf and set variable kernel.yama.ptrace_scope variable to 0 .
amplxe-gui &
# GUI for Memory and Thread analyzer (race condition)
inspector: ${PROGRAM}
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
inspxe-gui &
advisor:
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
advixe-gui &
icc-info:
icpc -# main.cpp

176
ex3/ONEAPI_default.mk Normal file
View file

@ -0,0 +1,176 @@
# Basic Defintions for using INTEL compiler suite sequentially
# requires setting of COMPILER=ONEAPI_
# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html
# requires
# source /opt/intel/oneapi/setvars.sh
# on AMD: export MKL_DEBUG_CPU_TYPE=5
#BINDIR = /opt/intel/oneapi/compiler/latest/linux/bin/
#MKL_ROOT = /opt/intel/oneapi/mkl/latest/
#export KMP_AFFINITY=verbose,compact
CC = ${BINDIR}icc
CXX = ${BINDIR}dpcpp
F77 = ${BINDIR}ifort
LINKER = ${CXX}
## Compiler flags
WARNINGS = -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -pedantic
WARNINGS += -Wpessimizing-move -Wredundant-move
#-wd2015,2012,2014 -wn3
# -Winline -Wredundant-decls -Wunreachable-code
# -qopt-subscript-in-range
# -vec-threshold0
CXXFLAGS += -O3 -std=c++17 ${WARNINGS}
#CXXFLAGS += -DMKL_ILP64 -I"${MKLROOT}/include"
#CXXFLAGS += -DMKL_ILP32 -I"${MKLROOT}/include"
LINKFLAGS += -O3
# interprocedural optimization
CXXFLAGS += -ipo
LINKFLAGS += -ipo
LINKFLAGS += -flto
# annotated Assembler file
ANNOTED = -fsource-asm -S
#architecture
CPU = -march=core-avx2
#CPU += -mtp=zen
# -xCORE-AVX2
# -axcode COMMON-AVX512 -axcode MIC-AVX512 -axcode CORE-AVX512 -axcode CORE-AVX2
CXXFLAGS += ${CPU}
LINKFLAGS += ${CPU}
# use MKL by INTEL
# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html
# sequential MKL
# use the 32 bit interface (LP64) instead of 64 bit interface (ILP64)
CXXFLAGS += -qmkl=sequential -UMKL_ILP64
LINKFLAGS += -O3 -qmkl=sequential -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
#LINKFLAGS += -O3 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
# shared libs: https://aur.archlinux.org/packages/intel-oneapi-compiler-static
# install intel-oneapi-compiler-static
# or
LINKFLAGS += -shared-intel
OPENMP = -qopenmp
CXXFLAGS += ${OPENMP}
LINKFLAGS += ${OPENMP}
# profiling tools
#CXXFLAGS += -pg
#LINKFLAGS += -pg
# -vec-report=3
# -qopt-report=5 -qopt-report-phase=vec -qopt-report-phase=openmp
# -guide -parallel
# -guide-opts=string -guide-par[=n] -guide-vec[=n]
# -auto-p32 -simd
# Reports: https://software.intel.com/en-us/articles/getting-the-most-out-of-your-intel-compiler-with-the-new-optimization-reports
#CXXFLAGS += -qopt-report=5 -qopt-report-phase=vec,par
#CXXFLAGS += -qopt-report=5 -qopt-report-phase=cg
# Redirect report from *.optrpt to stderr
# -qopt-report-file=stderr
# Guided paralellization
# -guide -parallel
# -guide-opts=string -guide-par[=n] -guide-vec[=n]
# -auto-p32 -simd
## run time checks
# https://www.intel.com/content/www/us/en/develop/documentation/fortran-compiler-oneapi-dev-guide-and-reference/top/compiler-reference/compiler-options/offload-openmp-and-parallel-processing-options/par-runtime-control-qpar-runtime-control.html
default: ${PROGRAM}
${PROGRAM}: ${OBJECTS}
$(LINKER) $^ ${LINKFLAGS} -o $@
clean:
rm -f ${PROGRAM} ${OBJECTS} *.optrpt
clean_all:: clean
@rm -f *_ *~ *.bak *.log *.out *.tar
run: clean ${PROGRAM}
./${PROGRAM}
# tar the current directory
MY_DIR = `basename ${PWD}`
tar: clean_all
@echo "Tar the directory: " ${MY_DIR}
@cd .. ;\
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
# tar cf `basename ${PWD}`.tar *
doc:
doxygen Doxyfile
#########################################################################
.cpp.o:
$(CXX) -c $(CXXFLAGS) -o $@ $<
.c.o:
$(CC) -c $(CFLAGS) -o $@ $<
.f.o:
$(F77) -c $(FFLAGS) -o $@ $<
##################################################################################################
# some tools
# Cache behaviour (CXXFLAGS += -g tracks down to source lines)
# https://software.intel.com/content/www/us/en/develop/documentation/vtune-help/top/analyze-performance/microarchitecture-analysis-group/memory-access-analysis.html
mem: inspector
prof: vtune
cache: inspector
gap_par_report:
${CXX} -c -guide -parallel $(SOURCES) 2> gap.txt
# GUI for performance report
amplifier: ${PROGRAM}
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid
amplxe-gui &
# GUI for Memory and Thread analyzer (race condition)
inspector: ${PROGRAM}
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
# inspxe-gui &
vtune-gui ./${PROGRAM} &
advisor:
source /opt/intel/oneapi/advisor/2021.2.0/advixe-vars.sh
# /opt/intel/oneapi/advisor/latest/bin64/advixe-gui &
advisor --collect=survey ./${PROGRAM}
# advisor --collect=roofline ./${PROGRAM}
advisor --report=survey --project-dir=./ src:r=./ --format=csv --report-output=./out/survey.csv
vtune:
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
# https://software.intel.com/en-us/articles/intel-advisor-2017-update-1-what-s-new
export ADVIXE_EXPERIMENTAL=roofline
vtune -collect hotspots ./${PROGRAM}
vtune -report hotspots -r r000hs > vtune.out
# vtune-gui ./${PROGRAM} &
icc-info:
icpc -# main.cpp
# MKL on AMD
# https://www.computerbase.de/2019-11/mkl-workaround-erhoeht-leistung-auf-amd-ryzen/
#
# https://sites.google.com/a/uci.edu/mingru-yang/programming/mkl-has-bad-performance-on-an-amd-cpu
# export MKL_DEBUG_CPU_TYPE=5
# export MKL_NUM_THRAEDS=1
# export MKL_DYNAMIC=false
# on Intel compiler
# http://publicclu2.blogspot.com/2013/05/intel-complier-suite-reference-card.html

93
ex3/PGI_default.mk Normal file
View file

@ -0,0 +1,93 @@
# Basic Defintions for using PGI-compiler suite sequentially
# requires setting of COMPILER=PGI_
# OPTIRUN = optirun
CC = pgcc
CXX = pgc++
F77 = pgfortran
LINKER = ${CXX}
# on mephisto:
#CXXFLAGS += -I/share/apps/atlas/include
#LINKFLAGS += -L/share/apps/atlas/lib
#LINKFLAGS += -lcblas -latlas
#LINKFLAGS += -lblas
# Der <cblas.h> Header muss mit extern "C" versehen werden, damit g++ alles findet.
WARNINGS = -Minform=warn
# -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -W -Wfloat-equal -Wshadow -Wredundant-decls
# -pedantic -Wunreachable-code -Wextra -Winline
# -Wunreachable-code
#PGI_PROFILING = -Minfo=ccff,loop,vect,opt,intensity,mp,accel
PGI_PROFILING = -Minfo=ccff,accel,ipa,loop,lre,mp,opt,par,unified,vect,intensity
# -Minfo
# -Mprof=time
# -Mprof=lines
# take care with option -Msafeptr
CXXFLAGS += -O3 -std=c++17 ${WARNINGS}
#CXXFLAGS += -O3 -std=c++11 -DNDEBUG ${PGI_PROFILING} ${WARNINGS}
# -fastsse -fargument-noalias ${WARNINGS} -msse3 -vec-report=3
default: ${PROGRAM}
${PROGRAM}: ${OBJECTS}
$(LINKER) $^ ${LINKFLAGS} -o $@
clean:
@rm -f ${PROGRAM} ${OBJECTS}
clean_all:: clean
@rm -f *_ *~ *.bak *.log *.out *.tar
run: clean ${PROGRAM}
./${PROGRAM}
# tar the current directory
MY_DIR = `basename ${PWD}`
tar: clean_all
@echo "Tar the directory: " ${MY_DIR}
@cd .. ;\
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
# tar cf `basename ${PWD}`.tar *
doc:
doxygen Doxyfile
#########################################################################
.cpp.o:
$(CXX) -c $(CXXFLAGS) -o $@ $<
.c.o:
$(CC) -c $(CFLAGS) -o $@ $<
.f.o:
$(F77) -c $(FFLAGS) -o $@ $<
##################################################################################################
# # some tools
# # Simple run time profiling of your code
# # CXXFLAGS += -g -pg
# # LINKFLAGS += -pg
# Profiling options PGI, see: pgcollect -help
# CPU_PROF = -allcache
CPU_PROF = -time
# GPU_PROF = -cuda=gmem,branch,cc13 -cudainit
#GPU_PROF = -cuda=branch:cc20
#
PROF_FILE = pgprof.out
cache: prof
prof: ${PROGRAM}
${OPTIRUN} ${BINDIR}pgcollect $(CPU_PROF) ./$^
${OPTIRUN} ${BINDIR}pgprof -exe ./$^ $(PROF_FILE) &
info:
pgaccelinfo -v

30
ex3/ex3/Makefile Normal file
View file

@ -0,0 +1,30 @@
#
# use GNU-Compiler tools
COMPILER=GCC_
# alternatively from the shell
# export COMPILER=GCC_
# or, alternatively from the shell
# make COMPILER=GCC_
# use Intel compilers
#COMPILER=ICC_
# use PGI compilers
# COMPILER=PGI_
SOURCES = main.cpp benchmarks.cpp benchmark_tests.cpp factorization_solve.cpp factorization_solve_tests.cpp vdop.cpp getmatrix.cpp
OBJECTS = $(SOURCES:.cpp=.o)
PROGRAM = main.${COMPILER}
# uncomment the next to lines for debugging and detailed performance analysis
CXXFLAGS += -g
LINKFLAGS += -g
# do not use -pg with PGI compilers
ifndef COMPILER
COMPILER=GCC_
endif
include ../${COMPILER}default.mk

333
ex3/ex3/benchmark_tests.cpp Normal file
View file

@ -0,0 +1,333 @@
#include "benchmark_tests.h"
#include "benchmarks.h"
#include <chrono>
#include <iostream>
#include <math.h>
using namespace std::chrono;
vector<double> test_A(const size_t &NLOOPS, const size_t &N, const function<double(const vector<double>&, const vector<double>&)>& scalar_function)
{
cout << "#################### (A) ####################" << endl;
cout << "\nLOOPS = " << NLOOPS << endl;
cout << "\nN = " << N << endl;
// Memory allocation
cout << "Memory allocation\n";
vector<double> x(N), y(N);
cout.precision(2);
cout << 2.0*N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
cout.precision(6);
// Data initialization
// Special: x_i = i+1; y_i = 1/x_i ==> <x,y> == N
for (size_t i = 0; i < N; ++i)
{
x[i] = i % 219 + 1;
y[i] = 1.0/x[i];
}
cout << "\nStart Benchmarking scalar\n";
auto t1 = system_clock::now(); // start timer
// Do calculation
double check(0.0),ss(0.0);
for (size_t i = 0; i < NLOOPS; ++i)
{
check = scalar_function(x, y);
ss += check; // prevents the optimizer from removing unused calculation results.
}
auto t2 = system_clock::now(); // stop timer
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
t_diff = t_diff/NLOOPS; // duration per loop seconds
// Check the correct result
cout << "\n <x,y> = " << check << endl;
if (static_cast<unsigned int>(check) != N)
cout << " !! W R O N G result !!\n";
cout << endl;
// Timings and Performance
cout << endl;
cout.precision(2);
double Gflops = 2.0*N / t_diff / 1024 / 1024 / 1024;
double MemBandwidth = 2.0*N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
cout << "Total duration : " << t_diff*NLOOPS << endl;
cout << "Timing in sec. : " << t_diff << endl;
cout << "GFLOPS : " << Gflops << endl;
cout << "GiByte/s : " << MemBandwidth << endl;
//##########################################################################
cout << "\nStart Benchmarking norm\n";
auto t3 = system_clock::now(); // start timer
// Do calculation
double ss2(0.0);
for (size_t i = 0; i < NLOOPS; ++i)
{
auto sk1 = sqrt(scalar(x, x));
ss2 += sk1; // prevents the optimizer from removing unused calculation results.
}
auto t4 = system_clock::now(); // stop timer
auto duration2 = duration_cast<microseconds>(t4 - t3); // duration in microseconds
double t_diff2 = static_cast<double>(duration2.count()) / 1e6; // overall duration in seconds
t_diff2 = t_diff2/NLOOPS; // duration per loop seconds
cout << "ss(norm): " << ss2 << endl;
cout << "Timing in sec. : " << t_diff2 << endl;
return vector<double>{t_diff, Gflops, MemBandwidth};
}
vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M, const function<vector<double>(const vector<double>&, const vector<double>&)>& MatVec_function)
{
cout << "#################### (B) ####################" << endl;
cout << "\nLOOPS = " << NLOOPS << endl;
cout << "\nN = " << N << endl;
cout << "\nM = " << M << endl;
// Memory allocation
cout << "Memory allocation\n";
vector<double> A(M*N);
vector<double> x(N);
cout.precision(2);
cout << (1.0*M*N + N) * sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
cout.precision(6);
// Data initialization
for (size_t i = 0; i < M; ++i)
for (size_t j = 0; j < N; ++j)
A[N*i + j] = (i + j) % 219 + 1;
for (size_t j = 0; j < N; ++j)
{
x[j] = 1.0/A[N*17 + j];
}
cout << "\nStart Benchmarking MatVec\n";
auto t1 = system_clock::now(); // start timer
// Do calculation
vector<double> b(M);
for (size_t i = 0; i < NLOOPS; ++i)
{
b = MatVec_function(A, x);
}
auto t2 = system_clock::now(); // stop timer
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
t_diff = t_diff/NLOOPS; // duration per loop seconds
// Check the correct result
cout << "\n <A[17,*],x> = " << b[17] << endl;
if (static_cast<size_t>(b[17]) != N)
{
cout << " !! W R O N G result !!\n";
}
cout << endl;
// Timings and Performance
cout << endl;
cout.precision(2);
double Gflops = (2.0*N*M) / t_diff / 1024 / 1024 / 1024;
double MemBandwidth = (2.0*N*M + M)/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
cout << "Total duration : " << t_diff*NLOOPS << endl;
cout << "Timing in sec. : " << t_diff << endl;
cout << "GFLOPS : " << Gflops << endl;
cout << "GiByte/s : " << MemBandwidth << endl;
return vector<double>{t_diff, Gflops, MemBandwidth};
}
vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N, const function<vector<double>(const vector<double>&, const vector<double>&, size_t const &shared_dim)>& MatMat_function)
{
cout << "#################### (C) ####################" << endl;
cout << "\nLOOPS = " << NLOOPS << endl;
cout << "\nL = " << L << endl;
cout << "\nM = " << M << endl;
cout << "\nN = " << N << endl;
// Memory allocation
cout << "Memory allocation\n";
vector<double> A(M*L);
vector<double> B(L*N);
cout.precision(2);
cout << (1.0*M*L + L*N) *sizeof(A[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
cout.precision(6);
// Data initialization
for (size_t i = 0; i < M; ++i)
for (size_t k = 0; k < L; ++k)
A[L*i + k] = (i + k) % 219 + 1;
for (size_t k = 0; k < L; ++k)
for (size_t j = 0; j < N; ++j)
B[N*k + j] = 1.0/A[L*17 + k];
cout << "\nStart Benchmarking MatMat\n";
auto t1 = system_clock::now(); // start timer
// Do calculation
vector<double> C(M*N);
double check;
double check_sum = 0;
for (size_t i = 0; i < NLOOPS; ++i)
{
C = MatMat_function(A, B, L);
check = C[N*17];
check_sum += check; // prevents the optimizer from removing unused calculation results.
}
cout << check_sum;
auto t2 = system_clock::now(); // stop timer
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
t_diff = t_diff/NLOOPS; // duration per loop seconds
// Check the correct result
cout << "\n C[17,0] = " << check << endl;
if (static_cast<unsigned int>(check) != L)
{
cout << " !! W R O N G result !!, should be " << L <<"\n";
}
cout << endl;
// Timings and Performance
cout << endl;
cout.precision(2);
double Gflops = (2.0*L*N*M) / t_diff / 1024 / 1024 / 1024;
double MemBandwidth = (2.0*L*N*M + M*N)/ t_diff / 1024 / 1024 / 1024 * sizeof(A[0]);
cout << "Total duration : " << t_diff*NLOOPS << endl;
cout << "Timing in sec. : " << t_diff << endl;
cout << "GFLOPS : " << Gflops << endl;
cout << "GiByte/s : " << MemBandwidth << endl;
return vector<double>{t_diff, Gflops, MemBandwidth};
}
vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p)
{
cout << "#################### (D) ####################" << endl;
cout << "\nLOOPS = " << NLOOPS << endl;
cout << "\nN = " << N << endl;
cout << "\np = " << p << endl;
// Memory allocation
cout << "Memory allocation\n";
vector<double> a(p + 1, 0);
vector<double> x(N);
cout.precision(2);
cout << (1.0*(p + 1) + N) *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
cout.precision(6);
// Data initialization
for (size_t j = 0; j < N; ++j)
x[j] = 1.0*j;
for (size_t k = 0; k < p + 1; ++k)
a[k] = pow(-1.0, k); // poly(x) = 1 - x + x^2 - x^3 + x^4 - ...
cout << "\nStart Benchmarking poly\n";
auto t1 = system_clock::now(); // start timer
// Do calculation
vector<double> y(N);
double check;
double check_sum;
for (size_t i = 0; i < NLOOPS; ++i)
{
y = poly(a, x);
check = y[0];
check_sum += check; // prevents the optimizer from removing unused calculation results.
}
auto t2 = system_clock::now(); // stop timer
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
t_diff = t_diff/NLOOPS; // duration per loop seconds
// Check the correct result
cout << "\n poly(" << x[0] << ") = " << check << endl;
if (abs(check - 1.0) > 1.0/1e6)
{
cout << " !! W R O N G result !!\n";
}
cout << endl;
// Timings and Performance
cout << endl;
cout.precision(2);
double Gflops = (N*(p + 1)*3.0) / t_diff / 1024 / 1024 / 1024;
double MemBandwidth = (N*(2.0 + 3.0*(p + 1)))/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
cout << "Total duration : " << t_diff*NLOOPS << endl;
cout << "Timing in sec. : " << t_diff << endl;
cout << "GFLOPS : " << Gflops << endl;
cout << "GiByte/s : " << MemBandwidth << endl;
return vector<double>{t_diff, Gflops, MemBandwidth};
}

15
ex3/ex3/benchmark_tests.h Normal file
View file

@ -0,0 +1,15 @@
#pragma once
#include <vector>
#include <functional>
using namespace std;
vector<double> test_A(const size_t &NLOOPS, const size_t &N, const function<double(const vector<double>&, const vector<double>&)>& scalar_function);
vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M, const function<vector<double>(const vector<double>&, const vector<double>&)>& MatVec_function);
vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N, const function<vector<double>(const vector<double>&, const vector<double>&, size_t const &shared_dim)>& MatMat_function);
vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p);

245
ex3/ex3/benchmarks.cpp Normal file
View file

@ -0,0 +1,245 @@
#include "benchmarks.h"
#include "vdop.h"
#include <iostream>
#include <vector>
#include <cmath>
#include <cassert> // assert()
#ifdef __INTEL_CLANG_COMPILER
#pragma message(" ########## Use of MKL ###############")
#include <mkl.h>
#else
#pragma message(" ########## Use of CBLAS ###############")
#include <cblas.h> // cBLAS Library
#include <lapacke.h> // Lapack
#endif
// (A) Inner product of two vectors (from skalar_stl)
double scalar(vector<double> const &x, vector<double> const &y)
{
assert(x.size() == y.size());
size_t const N = x.size();
double sum = 0.0;
for (size_t i = 0; i < N; ++i)
{
sum += x[i] * y[i];
}
return sum;
}
// (A) 5.(b) Kahan scalar product
double Kahan_skalar(vector<double> const &x, vector<double> const &y)
{
double sum = 0.0;
double c = 0.0;
size_t n = x.size();
for (size_t i = 0; i < n; ++i)
{
double z = x[i]*y[i] - c; // c is the part that got lost in the last iteration
double t = sum + z; // when adding sum + z, the lower digits are lost if sum is large
c = (t - sum) - z; // now we recover the lower digits to add in the next iteration
sum = t;
}
return sum;
}
// (A) 6. cBLAS scalar product
double scalar_cBLAS(vector<double> const &x, vector<double> const &y)
{
return cblas_ddot(x.size(), x.data(), 1, y.data(), 1); // x.data() = &x[0]
}
// (B) Matrix-vector product (from intro_vector_densematrix)
vector<double> MatVec(vector<double> const &A, vector<double> const &x)
{
size_t const nelem = A.size();
size_t const N = x.size();
assert(nelem % N == 0); // make sure multiplication is possible
size_t const M = nelem/N;
vector<double> b(M);
for (size_t i = 0; i < M; ++i)
{
double tmp = 0.0;
for (size_t j = 0; j < N; ++j)
tmp += A[N*i + j] * x[j];
b[i] = tmp;
}
return b;
}
// (B) cBLAS Matrix-vector product
vector<double> MatVec_cBLAS(vector<double> const &A, vector<double> const &x)
{
size_t const nelem = A.size();
size_t const N = x.size();
assert(nelem % N == 0); // make sure multiplication is possible
size_t const M = nelem/N;
vector<double> b(M);
cblas_dgemv(CblasRowMajor, CblasNoTrans, M, N, 1.0, A.data(), N, x.data(), 1, 0.0, b.data(), 1);
return b;
}
// (C) Matrix-matrix product
vector<double> MatMat(vector<double> const &A, vector<double> const &B, size_t const &L)
{
size_t const nelem_A = A.size();
size_t const nelem_B = B.size();
assert(nelem_A % L == 0 && nelem_B % L == 0);
size_t const M = nelem_A/L;
size_t const N = nelem_B/L;
vector<double> C(M*N);
for (size_t i = 0; i < M; ++i)
{
for (size_t k = 0; k < L; ++k)
{
for (size_t j = 0; j < N; ++j)
{
C[N*i + j] += A[L*i + k]*B[N*k + j];
}
}
}
return C;
}
// (C) cBLAS matrix-matrix product
vector<double> MatMat_cBLAS(vector<double> const &A, vector<double> const &B, size_t const &L)
{
size_t const nelem_A = A.size();
size_t const nelem_B = B.size();
assert(nelem_A % L == 0 && nelem_B % L == 0);
size_t const M = nelem_A/L;
size_t const N = nelem_B/L;
vector<double> C(M*N);
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, L, 1.0, A.data(), L, B.data(), N, 0.0, C.data(), N);
return C;
}
// (D) Evaluation of a polynomial function
vector<double> poly(vector<double> const &a, vector<double> const &x)
{
size_t const N = x.size();
size_t const p = a.size() - 1;
vector<double> y(N, 0);
for (size_t i = 0; i < N; ++i)
{
double x_temp = x[i];
double y_temp = 0;
for (size_t k = 0; k < p + 1; ++k)
{
y_temp += x_temp*y_temp + a[p - k];
}
y[i] = y_temp;
}
return y;
}
// (E) Solves linear system of equations
void JacobiSolve(CRS_Matrix const &SK, vector<double> const &f, vector<double> &u)
{
const double omega = 1.0;
const int maxiter = 1000;
const double tol = 1e-5, // tolerance
tol2 = tol * tol; // tolerance^2
int nrows = SK.Nrows(); // number of rows == number of columns
assert( nrows == static_cast<int>(f.size()) && f.size() == u.size() );
cout << endl << " Start Jacobi solver for " << nrows << " d.o.f.s" << endl;
// Choose initial guess
for (int k = 0; k < nrows; ++k)
{
u[k] = 0.0; // u := 0
}
vector<double> dd(nrows); // matrix diagonal
vector<double> r(nrows); // residual
vector<double> w(nrows); // correction
SK.GetDiag(dd); // dd := diag(K)
////DebugVector(dd);{int ijk; cin >> ijk;}
// Initial sweep
SK.Defect(r, f, u); // r := f - K*u
vddiv(w, r, dd); // w := D^{-1}*r
double sigma0 = dscapr(w, r); // s0 := <w,r>
// Iteration sweeps
int iter = 0;
double sigma = sigma0;
while ( sigma > tol2 * sigma0 && maxiter > iter)
{
++iter;
vdaxpy(u, u, omega, w ); // u := u + om*w
SK.Defect(r, f, u); // r := f - K*u
vddiv(w, r, dd); // w := D^{-1}*r
sigma = dscapr(w, r); // s0 := <w,r>
// cout << "Iteration " << iter << " : " << sqrt(sigma/sigma0) << endl;
}
cout << "aver. Jacobi rate : " << exp(log(sqrt(sigma / sigma0)) / iter) << " (" << iter << " iter)" << endl;
cout << "final error: " << sqrt(sigma / sigma0) << " (rel) " << sqrt(sigma) << " (abs)\n";
return;
}

89
ex3/ex3/benchmarks.h Normal file
View file

@ -0,0 +1,89 @@
#pragma once
#include "getmatrix.h"
#include <vector>
using namespace std;
/** (A) Inner product of two vectors (from skalar_stl)
@param[in] x vector
@param[in] y vector
@return resulting Euclidian inner product <x,y>
*/
double scalar(vector<double> const &x, vector<double> const &y);
/** (A) 5.(b) Inner product of two vectors using the Kahan scalar product
@param[in] x vector
@param[in] y vector
@return resulting Euclidian inner product <x,y>
*/
double Kahan_skalar(vector<double> const &x, vector<double> const &y);
/** (A) 6. cBLAS scalar product of two vectors
@param[in] x vector
@param[in] y vector
@return resulting Euclidian inner product <x,y>
*/
double scalar_cBLAS(vector<double> const &x, vector<double> const &y);
/** (B) Matrix-vector product (from intro_vector_densematrix)
* @param[in] A dense matrix (1D access)
* @param[in] u vector
*
* @return resulting vector
*/
vector<double> MatVec(vector<double> const &A, vector<double> const &x);
/** (B) 6. cBLAS Matrix-vector product
* @param[in] A dense matrix (1D access)
* @param[in] u vector
*
* @return resulting vector
*/
vector<double> MatVec_cBLAS(vector<double> const &A, vector<double> const &x);
/** (C) Matrix-matrix product
* @param[in] A MxL dense matrix (1D access)
* @param[in] B LxN dense matrix (1D access)
* @param[in] shared_dim shared dimension L
*
* @return resulting MxN matrix
*/
vector<double> MatMat(vector<double> const &A, vector<double> const &B, size_t const &shared_dim);
/** (C) 6. cBLAS Matrix-matrix product
* @param[in] A MxL dense matrix (1D access)
* @param[in] B LxN dense matrix (1D access)
* @param[in] shared_dim shared dimension L
*
* @return resulting MxN matrix
*/
vector<double> MatMat_cBLAS(vector<double> const &A, vector<double> const &B, size_t const &shared_dim);
/** (D) Evaluation of a polynomial function using Horner's scheme
* @param[in] a coefficient vector
* @param[in] x vector with input values
*
* @return vector with output values
*/
vector<double> poly(vector<double> const &a, vector<double> const &x);
/** (E) Solves linear system of equations K @p u = @p f via the Jacobi iteration (from jaboci_oo_stl)
* We use a distributed symmetric CSR matrix @p SK and initial guess of the
* solution is set to 0.
* @param[in] SK CSR matrix
* @param[in] f distributed local vector storing the right hand side
* @param[out] u accumulated local vector storing the solution.
*/
void JacobiSolve(CRS_Matrix const &SK, vector<double> const &f, vector<double> &u);

View file

@ -0,0 +1,39 @@
#include "factorization_solve.h"
#include <vector>
#include <math.h>
#include <assert.h>
#include <lapack.h>
#include <iostream>
using namespace std;
void factorization_solve(vector<double> &A, vector<double> &b, const int32_t &n_rhs)
{
int32_t nelem = A.size();
int32_t N = sqrt(nelem);
assert(N == static_cast<int32_t>(b.size())/n_rhs);
vector<int> IPIV(N); // pivot indices
int info_factorization;
dgetrf_(&N, &N, A.data(), &N, IPIV.data(), &info_factorization);
const char transp = 'N';
int info_solve;
dgetrs_(&transp, &N, &n_rhs, A.data(), &N, IPIV.data(), b.data(), &N, &info_solve, 1); // 1 is length of parameter 'N'
}
void print_matrix(vector<double> &A, size_t M, size_t N)
{
for (size_t i = 0; i < M; ++i)
{
for (size_t j = 0; j < N; ++j)
{
cout << A[N*i + j] << "\t";
}
cout << endl;
}
cout << endl;
}

View file

@ -0,0 +1,21 @@
#pragma once
#include <vector>
#include <cstdint>
using namespace std;
/** Solve linear system of equations with multiple right hand sides using LU factorization
* @param[inout] A NxN Matrix (1D access), gets modified to contain the LU decomposition of A
* @param[inout] B N x n_rhs Matrix of right-hand-sides (1D access), gets modified to contain the solution vectors x
* @param[in] n_rhs number of right-hand-sides b
*
*/
void factorization_solve(vector<double> &A, vector<double> &b, const int32_t &n_rhs);
/** Print a matrix to console
* @param[in] A MxN Matrix (1D access)
* @param[in] M rows
* @param[in] N columns
*
*/
void print_matrix(vector<double> &A, size_t M, size_t N);

View file

@ -0,0 +1,89 @@
#include "factorization_solve_tests.h"
#include "factorization_solve.h"
#include "benchmarks.h"
#include <chrono>
#include <iostream>
using namespace std::chrono;
void CheckCorrectness()
{
cout.precision(4);
size_t N = 5;
size_t n_rhs = 5;
vector<double> A(N*N);
vector<double> b(N*n_rhs);
// initialize A
for (size_t i = 0; i < N; ++i)
{
for (size_t j = 0; j < N; ++j)
if (i == j)
A[N*i + j] = 4.0;
else
A[N*i + j] = 1.0/((i - j)*(i - j));
}
const vector<double> A_copy = A;
// initialize b as identity matrix
for (size_t j = 0; j < N; ++j)
{
for (size_t l = 0; l < n_rhs; ++l)
if (l == j)
b[N*l + j] = 1.0;
else
b[N*l + j] = 0.0;
}
cout << "A = " << endl;
print_matrix(A, N, N);
cout << "b = " << endl;
print_matrix(b, N, n_rhs);
// solve system
factorization_solve(A, b, n_rhs);
vector<double> check_matrix_identity = MatMat_cBLAS(A_copy, b, N);
cout << "A*A^{-1} = " << endl;
print_matrix(check_matrix_identity, N, n_rhs);
}
void CheckDuration(const size_t &N)
{
for (size_t n_rhs : {1, 2, 4, 8, 16, 32})
{
auto time_start = system_clock::now();
vector<double> A(N*N);
vector<double> b(N*n_rhs);
// initialize A
for (size_t i = 0; i < N; ++i)
{
for (size_t j = 0; j < N; ++j)
if (i == j)
A[N*i + j] = 4.0;
else
A[N*i + j] = 1.0/((i - j)*(i - j));
}
// initialize b
for (size_t j = 0; j < N; ++j)
for (size_t l = 0; l < n_rhs; ++l)
b[N*l + j] = 1.0;
// solve system
factorization_solve(A, b, n_rhs);
auto time_end = system_clock::now();
auto duration = duration_cast<microseconds>(time_end - time_start); // duration in microseconds
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
cout << "n_rhs = " << n_rhs << "\tTime per rhs: " << t_diff/n_rhs << endl;
}
}

View file

@ -0,0 +1,7 @@
#pragma once
#include <cstddef>
void CheckCorrectness();
void CheckDuration(const size_t &N);

522
ex3/ex3/geom.cpp Normal file
View file

@ -0,0 +1,522 @@
// see: http://llvm.org/docs/CodingStandards.html#include-style
#include "geom.h"
#include <algorithm>
#include <cassert>
#include <fstream>
#include <iostream>
#include <list>
#include <string>
#include <vector>
using namespace std;
Mesh::Mesh(int ndim, int nvert_e, int ndof_e)
: _nelem(0), _nvert_e(nvert_e), _ndof_e(ndof_e), _nnode(0), _ndim(ndim), _ia(0), _xc(0)
{
}
Mesh::~Mesh()
{}
void Mesh::SetValues(std::vector<double> &v, const std::function<double(double, double)> &func) const
{
int const nnode = Nnodes(); // number of vertices in mesh
assert( nnode == static_cast<int>(v.size()) );
for (int k = 0; k < nnode; ++k)
{
v[k] = func( _xc[2 * k], _xc[2 * k + 1] );
}
}
void Mesh::Debug() const
{
cout << "\n ############### Debug M E S H ###################\n";
cout << "\n ............... Coordinates ...................\n";
for (int k = 0; k < _nnode; ++k)
{
cout << k << " : " ;
for (int i = 0; i < _ndof_e; ++i )
{
cout << _xc[2*k+i] << " ";
}
cout << endl;
}
cout << "\n ............... Elements ...................\n";
for (int k = 0; k < _nelem; ++k)
{
cout << k << " : ";
for (int i = 0; i < _ndof_e; ++i )
cout << _ia[_ndof_e * k + i] << " ";
cout << endl;
}
return;
}
void Mesh::Write_ascii_matlab(std::string const &fname, std::vector<double> const &v) const
{
assert(Nnodes() == static_cast<int>(v.size())); // fits vector length to mesh information?
ofstream fout(fname); // open file ASCII mode
if ( !fout.is_open() )
{
cout << "\nFile " << fname << " has not been opened.\n\n" ;
assert( fout.is_open() && "File not opened." );
}
string const DELIMETER(" "); // define the same delimeter as in matlab/ascii_read*.m
int const OFFSET(1); // convert C-indexing to matlab
// Write data: #nodes, #space dimensions, #elements, #vertices per element
fout << Nnodes() << DELIMETER << Ndims() << DELIMETER << Nelems() << DELIMETER << NverticesElements() << endl;
// Write cordinates: x_k, y_k in seperate lines
assert( Nnodes()*Ndims() == static_cast<int>(_xc.size()));
for (int k = 0, kj = 0; k < Nnodes(); ++k)
{
for (int j = 0; j < Ndims(); ++j, ++kj)
{
fout << _xc[kj] << DELIMETER;
}
fout << endl;
}
// Write connectivity: ia_k,0, ia_k,1 etc in seperate lines
assert( Nelems()*NverticesElements() == static_cast<int>(_ia.size()));
for (int k = 0, kj = 0; k < Nelems(); ++k)
{
for (int j = 0; j < NverticesElements(); ++j, ++kj)
{
fout << _ia[kj] + OFFSET << DELIMETER; // C to matlab
}
fout << endl;
}
// Write vector
for (int k = 0; k < Nnodes(); ++k)
{
fout << v[k] << endl;
}
fout.close();
return;
}
void Mesh::Visualize(std::vector<double> const &v) const
{
// define external command
const string exec_m("matlab -nosplash < visualize_results.m"); // Matlab
//const string exec_m("octave --no-window-system --no-gui visualize_results.m"); // Octave
//const string exec_m("flatpak run org.octave.Octave visualize_results.m"); // Octave (flatpak): desktop GH
const string fname("uv.txt");
Write_ascii_matlab(fname, v);
int ierror = system(exec_m.c_str()); // call external command
if (ierror != 0)
{
cout << endl << "Check path to Matlab/octave on your system" << endl;
}
cout << endl;
return;
}
// #####################################################################
Mesh_2d_3_square::Mesh_2d_3_square(int nx, int ny, int myid, int procx, int procy)
: Mesh(2, 3, 3), // two dimensions, 3 vertices, 3 dofs
_myid(myid), _procx(procx), _procy(procy), _neigh{{-1, -1, -1, -1}}, _color(0),
_xl(0.0), _xr(1.0), _yb(0.0), _yt(1.0), _nx(nx), _ny(ny)
{
//void IniGeom(int const myid, int const procx, int const procy, int neigh[], int &color)
int const ky = _myid / _procx;
int const kx = _myid % _procy; // MOD(myid,procx)
// Determine the neighbors of domain/rank myid
_neigh[0] = (ky == 0) ? -1 : _myid - _procx; // South
_neigh[1] = (kx == _procx - 1) ? -1 : _myid + 1; // East
_neigh[2] = (ky == _procy - 1) ? -1 : _myid + _procx; // North
_neigh[3] = (kx == 0) ? -1 : _myid - 1; // West
_color = (kx + ky) & 1 ;
// subdomain is part of unit square
double const hx = 1. / _procx;
double const hy = 1. / _procy;
_xl = kx * hx; // left
_xr = (kx + 1) * hx; // right
_yb = ky * hy; // bottom
_yt = (ky + 1) * hy; // top
// Calculate coordinates
int const nnode = (_nx + 1) * (_ny + 1); // number of nodes
Resize_Coords(nnode, 2); // coordinates in 2D [nnode][ndim]
GetCoordsInRectangle(_nx, _ny, _xl, _xr, _yb, _yt, GetCoords().data());
// Calculate element connectivity (linear triangles)
int const nelem = 2 * _nx * _ny; // number of elements
Resize_Connectivity(nelem, 3); // connectivity matrix [nelem][3]
GetConnectivityInRectangle(_nx, _ny, GetConnectivity().data());
return;
}
void Mesh_2d_3_square::SetU(std::vector<double> &u) const
{
int dx = _nx + 1;
for (int j = 0; j <= _ny; ++j)
{
int k = j * dx;
for (int i = 0; i <= _nx; ++i, ++k)
{
u[k] = 0.0;
}
}
}
void Mesh_2d_3_square::SetF(std::vector<double> &f) const
{
int dx = _nx + 1;
for (int j = 0; j <= _ny; ++j)
{
int k = j * dx;
for (int i = 0; i <= _nx; ++i, ++k)
{
f[k] = 1.0;
}
}
}
std::vector<int> Mesh_2d_3_square::Index_DirichletNodes() const
{
int const dx = 1,
dy = _nx + 1,
bl = 0,
br = _nx,
tl = _ny * (_nx + 1),
tr = (_ny + 1) * (_nx + 1) - 1;
int const start[4] = { bl, br, tl, bl},
end[4] = { br, tr, tr, tl},
step[4] = { dx, dy, dx, dy};
vector<int> idx(0);
for (int j = 0; j < 4; j++)
{
if (_neigh[j] < 0)
{
for (int i = start[j]; i <= end[j]; i += step[j])
{
idx.push_back(i); // node i is Dirichlet node
}
}
}
// remove multiple elements
sort(idx.begin(), idx.end()); // sort
idx.erase( unique(idx.begin(), idx.end()), idx.end() ); // remove duplicate data
return idx;
}
void Mesh_2d_3_square::SaveVectorP(std::string const &name, vector<double> const &u) const
{
// construct the file name for subdomain myid
const string tmp( std::to_string(_myid / 100) + to_string((_myid % 100) / 10) + to_string(_myid % 10) );
const string namep = name + "." + tmp;
ofstream ff(namep.c_str());
ff.precision(6);
ff.setf(ios::fixed, ios::floatfield);
// assumes tensor product grid in unit square; rowise numbered (as generated in class constructor)
// output is provided for tensor product grid visualization ( similar to Matlab-surf() )
auto const &xc = GetCoords();
int k = 0;
for (int j = 0; j <= _ny; ++j)
{
for (int i = 0; i <= _nx; ++i, ++k)
ff << xc[2 * k + 0] << " " << xc[2 * k + 1] << " " << u[k] << endl;
ff << endl;
}
ff.close();
return;
}
void Mesh_2d_3_square::GetCoordsInRectangle(int const nx, int const ny,
double const xl, double const xr, double const yb, double const yt,
double xc[])
{
const double hx = (xr - xl) / nx,
hy = (yt - yb) / ny;
int k = 0;
for (int j = 0; j <= ny; ++j)
{
const double y0 = yb + j * hy;
for (int i = 0; i <= nx; ++i, k += 2)
{
xc[k ] = xl + i * hx;
xc[k + 1] = y0;
}
}
return;
}
void Mesh_2d_3_square::GetConnectivityInRectangle(int const nx, int const ny, int ia[])
{
const int dx = nx + 1;
int k = 0;
int l = 0;
for (int j = 0; j < ny; ++j, ++k)
{
for (int i = 0; i < nx; ++i, ++k)
{
ia[l ] = k;
ia[l + 1] = k + 1;
ia[l + 2] = k + dx + 1;
l += 3;
ia[l ] = k;
ia[l + 1] = k + dx;
ia[l + 2] = k + dx + 1;
l += 3;
}
}
return;
}
// #################### still some old code (--> MPI) ############################
// Copies the values of w corresponding to the boundary
// South (ib==1), East (ib==2), North (ib==3), West (ib==4)
void GetBound(int const ib, int const nx, int const ny, double const w[], double s[])
{
const int //dx = 1,
dy = nx + 1,
bl = 0,
br = nx,
tl = ny * (nx + 1),
tr = (ny + 1) * (nx + 1) - 1;
switch (ib)
{
case 1:
{
for (int i = bl, j = 0; i <= br; ++i, ++j)
s[j] = w[i];
break;
}
case 3:
{
for (int i = tl, j = 0; i <= tr; ++i, ++j)
s[j] = w[i];
break;
}
case 4:
{
for (int i = bl, j = 0; i <= tl; i += dy, ++j)
s[j] = w[i];
break;
}
case 2:
{
for (int i = br, j = 0; i <= tr; i += dy, ++j)
s[j] = w[i];
break;
}
default:
{
cout << endl << "Wrong parameter ib in " << __FILE__ << ":" << __LINE__ << endl;
}
}
return;
}
// ----------------------------------------------------------------------------------------------------------
// Computes w: = w + s at nodes on the boundary
// South (ib == 1), East (ib == 2), North (ib == 3), West (ib == 4)
void AddBound(int const ib, int const nx, int const ny, double w[], double const s[])
{
int const dy = nx + 1,
bl = 0,
br = nx,
tl = ny * (nx + 1),
tr = (ny + 1) * (nx + 1) - 1;
switch (ib)
{
case 1:
{
for (int i = bl, j = 0; i <= br; ++i, ++j)
w[i] += s[j];
break;
}
case 3:
{
for (int i = tl, j = 0; i <= tr; ++i, ++j)
w[i] += s[j];
break;
}
case 4:
{
for (int i = bl, j = 0; i <= tl; i += dy, ++j)
w[i] += s[j];
break;
}
case 2:
{
for (int i = br, j = 0; i <= tr; i += dy, ++j)
w[i] += s[j];
break;
}
default:
{
cout << endl << "Wrong parameter ib in " << __FILE__ << ":" << __LINE__ << endl;
}
}
return;
}
// ####################################################################
Mesh_2d_3_matlab::Mesh_2d_3_matlab(string const &fname)
: Mesh(2, 3, 3), // two dimensions, 3 vertices, 3 dofs
bedges(0)
{
ifstream ifs(fname);
if (!(ifs.is_open() && ifs.good()))
{
cerr << "Mesh_2d_3_matlab: Error cannot open file " << fname << endl;
assert(ifs.is_open());
}
int const OFFSET(1); // Matlab to C indexing
cout << "ASCI file " << fname << " opened" << endl;
// Read some mesh constants
int nnode, ndim, nelem, nvert_e;
ifs >> nnode >> ndim >> nelem >> nvert_e;
cout << nnode << " " << ndim << " " << nelem << " " << nvert_e << endl;
assert(ndim == 2 && nvert_e == 3);
// Allocate memory
Resize_Coords(nnode, ndim); // coordinates in 2D [nnode][ndim]
Resize_Connectivity(nelem, nvert_e); // connectivity matrix [nelem][nvert]
// Read ccordinates
auto &xc = GetCoords();
for (int k = 0; k < nnode * ndim; ++k)
{
ifs >> xc[k];
}
// Read connectivity
auto &ia = GetConnectivity();
for (int k = 0; k < nelem * nvert_e; ++k)
{
ifs >> ia[k];
ia[k] -= OFFSET; // Matlab to C indexing
}
// additional read of boundary information (only start/end point)
int nbedges;
ifs >> nbedges;
bedges.resize(nbedges * 2);
for (int k = 0; k < nbedges * 2; ++k)
{
ifs >> bedges[k];
bedges[k] -= OFFSET; // Matlab to C indexing
}
return;
}
// binary
//{
//ifstream ifs(fname, ios_base::in | ios_base::binary);
//if(!(ifs.is_open() && ifs.good()))
//{
//cerr << "ReadBinMatrix: Error cannot open file " << file << endl;
//assert(ifs.is_open());
//}
//cout << "ReadBinMatrix: file opened" << file << endl;
//}
// binaryIO.cpp
//void read_binMatrix(const string& file, vector<int> &cnt, vector<int> &col, vector<double> &ele)
//{
//ifstream ifs(file, ios_base::in | ios_base::binary);
//if(!(ifs.is_open() && ifs.good()))
//{
//cerr << "ReadBinMatrix: Error cannot open file " << file << endl;
//assert(ifs.is_open());
//}
//cout << "ReadBinMatrix: Opened file " << file << endl;
//int _size;
//ifs.read(reinterpret_cast<char*>(&_size), sizeof(int)); // old: ifs.read((char*)&_size, sizeof(int));
//cnt.resize(_size);
//cout << "ReadBinMatrix: cnt size: " << _size << endl;
//ifs.read((char*)&_size, sizeof(int));
//col.resize(_size);
//cout << "ReadBinMatrix: col size: " << _size << endl;
//ifs.read((char*)&_size, sizeof(int));
//ele.resize(_size);
//cout << "ReadBinMatrix: ele size: " << _size << endl;
//ifs.read((char*)cnt.data(), cnt.size() * sizeof(int));
//ifs.read((char*)col.data(), col.size() * sizeof(int));
//ifs.read((char*)ele.data(), ele.size() * sizeof(double));
//ifs.close();
//cout << "ReadBinMatrix: Finished reading matrix.." << endl;
//}
std::vector<int> Mesh_2d_3_matlab::Index_DirichletNodes() const
{
vector<int> idx(bedges); // copy
sort(idx.begin(), idx.end()); // sort
idx.erase( unique(idx.begin(), idx.end()), idx.end() ); // remove duplicate data
return idx;
}

381
ex3/ex3/geom.h Normal file
View file

@ -0,0 +1,381 @@
#ifndef GEOM_FILE
#define GEOM_FILE
#include <array>
#include <functional> // function; C++11
#include <string>
#include <vector>
/**
* Basis class for finite element meshes.
*/
class Mesh
{
public:
/**
* Constructor initializing the members with default values.
*
* @param[in] ndim space dimensions (dimension for coordinates)
* @param[in] nvert_e number of vertices per element (dimension for connectivity)
* @param[in] ndof_e degrees of freedom per element (= @p nvert_e for linear elements)
*/
explicit Mesh(int ndim, int nvert_e = 0, int ndof_e = 0);
/**
* Destructor.
*
* See clang warning on
* <a href="https://stackoverflow.com/questions/28786473/clang-no-out-of-line-virtual-method-definitions-pure-abstract-c-class/40550578">weak-vtables</a>.
*/
virtual ~Mesh();
/**
* Number of finite elements in (sub)domain.
* @return number of elements.
*/
int Nelems() const
{
return _nelem;
}
/**
* Global number of vertices for each finite element.
* @return number of vertices per element.
*/
int NverticesElements() const
{
return _nvert_e;
}
/**
* Global number of degrees of freedom (dof) for each finite element.
* @return degrees of freedom per element.
*/
int NdofsElement() const
{
return _ndof_e;
}
/**
* Number of vertices in mesh.
* @return number of vertices.
*/
int Nnodes() const
{
return _nnode;
}
/**
* Space dimension.
* @return number of dimensions.
*/
int Ndims() const
{
return _ndim;
}
/**
* (Re-)Allocates memory for the element connectivity and redefines the appropriate dimensions.
*
* @param[in] nelem number of elements
* @param[in] nvert_e number of vertices per element
*/
void Resize_Connectivity(int nelem, int nvert_e)
{
SetNelem(nelem); // number of elements
SetNverticesElement(nvert_e); // vertices per element
_ia.resize(nelem * nvert_e);
}
/**
* Read connectivity information (g1,g2,g3)_i.
* @return convectivity vector [nelems*ndofs].
*/
const std::vector<int> &GetConnectivity() const
{
return _ia;
}
/**
* Access/Change connectivity information (g1,g2,g3)_i.
* @return convectivity vector [nelems*ndofs].
*/
std::vector<int> &GetConnectivity()
{
return _ia;
}
/**
* (Re-)Allocates memory for the element connectivity and redefines the appropriate dimensions.
*
* @param[in] nnodes number of nodes
* @param[in] ndim space dimension
*/
void Resize_Coords(int nnodes, int ndim)
{
SetNnode(nnodes); // number of nodes
SetNdim(ndim); // space dimension
_xc.resize(nnodes * ndim);
}
/**
* Read coordinates of vertices (x,y)_i.
* @return coordinates vector [nnodes*2].
*/
const std::vector<double> &GetCoords() const
{
return _xc;
}
/**
* Access/Change coordinates of vertices (x,y)_i.
* @return coordinates vector [nnodes*2].
*/
std::vector<double> &GetCoords()
{
return _xc;
}
/**
* Calculate values in vector @p v via function @p func(x,y)
* @param[in] v vector
* @param[in] func function of (x,y) returning a double value.
*/
void SetValues(std::vector<double> &v, const std::function<double(double, double)> &func) const;
/**
* Prints the information for a finite element mesh
*/
void Debug() const;
/**
* Determines the indices of those vertices with Dirichlet boundary conditions
* @return index vector.
*/
virtual std::vector<int> Index_DirichletNodes() const = 0;
/**
* Write vector @p v toghether with its mesh information to an ASCii file @p fname.
*
* The data are written in C-style.
*
* @param[in] fname file name
* @param[in] v vector
*/
void Write_ascii_matlab(std::string const &fname, std::vector<double> const &v) const;
/**
* Visualize @p v together with its mesh information via matlab or octave.
*
* Comment/uncomment those code lines in method Mesh:Visualize (geom.cpp)
* that are supported on your system.
*
* @param[in] v vector
*
* @warning matlab files ascii_read_meshvector.m visualize_results.m
* must be in the executing directory.
*/
void Visualize(std::vector<double> const &v) const;
protected:
void SetNelem(int nelem)
{
_nelem = nelem;
}
void SetNverticesElement(int nvert)
{
_nvert_e = nvert;
}
void SetNdofsElement(int ndof)
{
_ndof_e = ndof;
}
void SetNnode(int nnode)
{
_nnode = nnode;
}
void SetNdim(int ndim)
{
_ndim = ndim;
}
private:
int _nelem; //!< number elements
int _nvert_e; //!< number of vertices per element
int _ndof_e; //!< degrees of freedom (d.o.f.) per element
int _nnode; //!< number nodes/vertices
int _ndim; //!< space dimension of the problem (1, 2, or 3)
std::vector<int> _ia; //!< element connectivity
std::vector<double> _xc; //!< coordinates
};
/**
* 2D finite element mesh of the square consiting of linear triangular elements.
*/
class Mesh_2d_3_square: public Mesh
{
public:
/**
* Generates the f.e. mesh for the unit square.
*
* @param[in] nx number of discretization intervals in x-direction
* @param[in] ny number of discretization intervals in y-direction
* @param[in] myid my MPI-rank / subdomain
* @param[in] procx number of ranks/subdomains in x-direction
* @param[in] procy number of processes in y-direction
*/
Mesh_2d_3_square(int nx, int ny, int myid = 0, int procx = 1, int procy = 1);
/**
* Destructor
*/
~Mesh_2d_3_square() override
{}
/**
* Set solution vector based on a tensor product grid in the rectangle.
* @param[in] u solution vector
*/
void SetU(std::vector<double> &u) const;
/**
* Set right hand side (rhs) vector on a tensor product grid in the rectangle.
* @param[in] f rhs vector
*/
void SetF(std::vector<double> &f) const;
/**
* Determines the indices of those vertices with Dirichlet boundary conditions
* @return index vector.
*/
std::vector<int> Index_DirichletNodes() const override;
/**
* Stores the values of vector @p u of (sub)domain into a file @p name for further processing in gnuplot.
* The file stores rowise the x- and y- coordinates together with the value from @p u .
* The domain [@p xl, @p xr] x [@p yb, @p yt] is discretized into @p nx x @p ny intervals.
*
* @param[in] name basename of file name (file name will be extended by the rank number)
* @param[in] u local vector
*
* @warning Assumes tensor product grid in unit square; rowise numbered
* (as generated in class constructor).
* The output is provided for tensor product grid visualization
* ( similar to Matlab-surf() ).
*
* @see Mesh_2d_3_square
*/
void SaveVectorP(std::string const &name, std::vector<double> const &u) const;
// here will still need to implement in the class
// GetBound(), AddBound()
// or better a generalized way with indices and their appropriate ranks for MPI communication
private:
/**
* Determines the coordinates of the dicretization nodes of the domain [@p xl, @p xr] x [@p yb, @p yt]
* which is discretized into @p nx x @p ny intervals.
*
* @param[in] ny number of discretization intervals in y-direction
* @param[in] xl x-coordinate of left boundary
* @param[in] xr x-coordinate of right boundary
* @param[in] yb y-coordinate of lower boundary
* @param[in] yt y-coordinate of upper boundary
* @param[out] xc coordinate vector of length 2n with x(2*k,2*k+1) as coodinates of node k
*/
void GetCoordsInRectangle(int nx, int ny, double xl, double xr, double yb, double yt,
double xc[]);
/**
* Determines the element connectivity of linear triangular elements of a FEM discretization
* of a rectangle using @p nx x @p ny equidistant intervals for discretization.
* @param[in] nx number of discretization intervals in x-direction
* @param[in] ny number of discretization intervals in y-direction
* @param[out] ia element connectivity matrix with ia(3*s,3*s+1,3*s+2) as node numbers od element s
*/
void GetConnectivityInRectangle(int nx, int ny, int ia[]);
private:
int _myid; //!< my MPI rank
int _procx; //!< number of MPI ranks in x-direction
int _procy; //!< number of MPI ranks in y-direction
std::array<int, 4> _neigh; //!< MPI ranks of neighbors (negative: no neighbor but b.c.)
int _color; //!< red/black coloring (checker board) of subdomains
double _xl; //!< x coordinate of lower left corner of square
double _xr; //!< x coordinate of lower right corner of square
double _yb; //!< y coordinate or lower left corner of square
double _yt; //!< y coordinate of upper right corner of square
int _nx; //!< number of intervals in x-direction
int _ny; //!< number of intervals in y-direction
};
// #################### still some old code (--> MPI) ############################
/**
* Copies the values of @p w corresponding to boundary @p ib
* onto vector s. South (ib==1), East (ib==2), North (ib==3), West (ib==4).
* The vector @p s has to be long enough!!
* @param[in] ib my local boundary
* @param[in] nx number of discretization intervals in x-direction
* @param[in] ny number of discretization intervals in y-direction
* @param[in] w vector for all nodes of local discretization
* @param[out] s short vector with values on boundary @p ib
*/
// GH_NOTE: Absicherung bei s !!
void GetBound(int ib, int nx, int ny, double const w[], double s[]);
/**
* Computes @p w := @p w + @p s at the interface/boundary nodes on the
* boundary @p ib . South (ib==1), East (ib==2), North (ib==3), West (ib==4)
* @param[in] ib my local boundary
* @param[in] nx number of discretization intervals in x-direction
* @param[in] ny number of discretization intervals in y-direction
* @param[in,out] w vector for all nodes of local discretization
* @param[in] s short vector with values on boundary @p ib
*/
void AddBound(int ib, int nx, int ny, double w[], double const s[]);
// #################### Mesh from Matlab ############################
/**
* 2D finite element mesh of the square consiting of linear triangular elements.
*/
class Mesh_2d_3_matlab: public Mesh
{
public:
/**
* Reads mesh data from a binary file.
*
* File format, see ascii_write_mesh.m
*
* @param[in] fname file name
*/
explicit Mesh_2d_3_matlab(std::string const &fname);
/**
* Determines the indices of those vertices with Dirichlet boundary conditions.
* @return index vector.
*
* @warning All boundary nodes are considered as Dirchlet nodes.
*/
std::vector<int> Index_DirichletNodes() const override;
private:
/**
* Determines the indices of those vertices with Dirichlet boundary conditions
* @return index vector.
*/
int Nnbedges() const
{
return static_cast<int>(bedges.size());
}
std::vector<int> bedges; //!< boundary edges [nbedges][2] storing start/end vertex
};
#endif

347
ex3/ex3/getmatrix.cpp Normal file
View file

@ -0,0 +1,347 @@
#include "getmatrix.h"
#include "userset.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <iomanip>
#include <iostream>
#include <list>
#include <vector>
using namespace std;
// general routine for lin. triangular elements
void CalcElem(int const ial[3], double const xc[], double ske[3][3], double fe[3])
//void CalcElem(const int* __restrict__ ial, const double* __restrict__ xc, double* __restrict__ ske[3], double* __restrict__ fe)
{
const int i1 = 2 * ial[0], i2 = 2 * ial[1], i3 = 2 * ial[2];
const double x13 = xc[i3 + 0] - xc[i1 + 0], y13 = xc[i3 + 1] - xc[i1 + 1],
x21 = xc[i1 + 0] - xc[i2 + 0], y21 = xc[i1 + 1] - xc[i2 + 1],
x32 = xc[i2 + 0] - xc[i3 + 0], y32 = xc[i2 + 1] - xc[i3 + 1];
const double jac = fabs(x21 * y13 - x13 * y21);
ske[0][0] = 0.5 / jac * (y32 * y32 + x32 * x32);
ske[0][1] = 0.5 / jac * (y13 * y32 + x13 * x32);
ske[0][2] = 0.5 / jac * (y21 * y32 + x21 * x32);
ske[1][0] = ske[0][1];
ske[1][1] = 0.5 / jac * (y13 * y13 + x13 * x13);
ske[1][2] = 0.5 / jac * (y21 * y13 + x21 * x13);
ske[2][0] = ske[0][2];
ske[2][1] = ske[1][2];
ske[2][2] = 0.5 / jac * (y21 * y21 + x21 * x21);
const double xm = (xc[i1 + 0] + xc[i2 + 0] + xc[i3 + 0]) / 3.0,
ym = (xc[i1 + 1] + xc[i2 + 1] + xc[i3 + 1]) / 3.0;
//fe[0] = fe[1] = fe[2] = 0.5 * jac * FunctF(xm, ym) / 3.0;
fe[0] = fe[1] = fe[2] = 0.5 * jac * fNice(xm, ym) / 3.0;
}
// general routine for lin. triangular elements,
// non-symm. matrix
// node numbering in element: a s c e n d i n g indices !!
// GH: deprecated
void AddElem(int const ial[3], double const ske[3][3], double const fe[3],
int const id[], int const ik[], double sk[], double f[])
{
for (int i = 0; i < 3; ++i)
{
const int ii = ial[i], // row ii (global index)
id1 = id[ii], // start and
id2 = id[ii + 1]; // end of row ii in matrix
int ip = id1;
for (int j = 0; j < 3; ++j) // no symmetry assumed
{
const int jj = ial[j];
bool not_found = true;
do // find entry jj (global index) in row ii
{
not_found = (ik[ip] != jj);
++ip;
}
while (not_found && ip < id2);
#ifndef NDEBUG // compiler option -DNDEBUG switches off the check
if (not_found) // no entry found !!
{
cout << "Error in AddElem: (" << ii << "," << jj << ") ["
<< ial[0] << "," << ial[1] << "," << ial[2] << "]\n";
assert(!not_found);
}
#endif
sk[ip - 1] += ske[i][j];
}
f[ii] += fe[i];
}
}
// ----------------------------------------------------------------------------
// ####################################################################
CRS_Matrix::CRS_Matrix(Mesh const &mesh)
: _mesh(mesh), _nrows(0), _nnz(0), _id(0), _ik(0), _sk(0)
{
Derive_Matrix_Pattern();
return;
}
void CRS_Matrix::Derive_Matrix_Pattern()
{
int const nelem(_mesh.Nelems());
int const ndof_e(_mesh.NdofsElement());
auto const &ia(_mesh.GetConnectivity());
// Determine the number of matrix rows
_nrows = *max_element(ia.cbegin(), ia.cbegin() + ndof_e * nelem);
++_nrows; // node numberng: 0 ... nnode-1
assert(*min_element(ia.cbegin(), ia.cbegin() + ndof_e * nelem) == 0); // numbering starts with 0 ?
// Collect for each node those nodes it is connected to (multiple entries)
// Detect the neighboring nodes
vector< list<int> > cc(_nrows); // cc[i] is the list of nodes a node i is connected to
for (int i = 0; i < nelem; ++i)
{
int const idx = ndof_e * i;
for (int k = 0; k < ndof_e; ++k)
{
list<int> &cck = cc.at(ia[idx + k]);
cck.insert( cck.end(), ia.cbegin() + idx, ia.cbegin() + idx + ndof_e );
}
}
// Delete the multiple entries
_nnz = 0;
for (auto &it : cc)
{
it.sort();
it.unique();
_nnz += static_cast<int>(it.size());
// cout << it.size() << " :: "; copy(it->begin(),it->end(), ostream_iterator<int,char>(cout," ")); cout << endl;
}
// CSR data allocation
_id.resize(_nrows + 1); // Allocate memory for CSR row pointer
_ik.resize(_nnz); // Allocate memory for CSR column index vector
// copy CSR data
_id[0] = 0; // begin of first row
for (size_t i = 0; i < cc.size(); ++i)
{
//cout << i << " " << nid.at(i) << endl;;
const list<int> &ci = cc.at(i);
const auto nci = static_cast<int>(ci.size());
_id[i + 1] = _id[i] + nci; // begin of next line
copy(ci.begin(), ci.end(), _ik.begin() + _id[i] );
}
assert(_nnz == _id[_nrows]);
_sk.resize(_nnz); // Allocate memory for CSR column index vector
return;
}
void CRS_Matrix::Debug() const
{
// ID points to first entry of row
// no symmetry assumed
cout << "\nMatrix (nnz = " << _id[_nrows] << ")\n";
for (int row = 0; row < _nrows; ++row)
{
cout << "Row " << row << " : ";
int const id1 = _id[row];
int const id2 = _id[row + 1];
for (int j = id1; j < id2; ++j)
{
cout.setf(ios::right, ios::adjustfield);
cout << "[" << setw(2) << _ik[j] << "] " << setw(4) << _sk[j] << " ";
}
cout << endl;
}
return;
}
void CRS_Matrix::CalculateLaplace(vector<double> &f)
{
assert(_mesh.NdofsElement() == 3); // only for triangular, linear elements
//cout << _nnz << " vs. " << _id[_nrows] << " " << _nrows<< endl;
assert(_nnz == _id[_nrows]);
for (int k = 0; k < _nrows; ++k)
{
_sk[k] = 0.0;
}
for (int k = 0; k < _nrows; ++k)
{
f[k] = 0.0;
}
double ske[3][3], fe[3];
// Loop over all elements
auto const nelem = _mesh.Nelems();
auto const &ia = _mesh.GetConnectivity();
auto const &xc = _mesh.GetCoords();
for (int i = 0; i < nelem; ++i)
{
CalcElem(ia.data() + 3 * i, xc.data(), ske, fe);
//AddElem(ia.data()+3 * i, ske, fe, _id.data(), _ik.data(), _sk.data(), f.data()); // GH: deprecated
AddElem_3(ia.data() + 3 * i, ske, fe, f);
}
//Debug();
return;
}
void CRS_Matrix::ApplyDirichletBC(std::vector<double> const &u, std::vector<double> &f)
{
double const PENALTY = 1e6;
auto const idx = _mesh.Index_DirichletNodes();
int const nidx = static_cast<int>(idx.size());
for (int row = 0; row < nidx; ++row)
{
int const k = idx[row];
int const id1 = fetch(k, k); // Find diagonal entry of row
assert(id1 >= 0);
_sk[id1] += PENALTY; // matrix weighted scaling feasible
f[k] += PENALTY * u[k];
}
return;
}
void CRS_Matrix::GetDiag(vector<double> &d) const
{
assert( _nrows == static_cast<int>(d.size()) );
for (int row = 0; row < _nrows; ++row)
{
const int ia = fetch(row, row); // Find diagonal entry of row
assert(ia >= 0);
d[row] = _sk[ia];
}
return;
}
bool CRS_Matrix::Compare2Old(int nnode, int const id[], int const ik[], double const sk[]) const
{
bool bn = (nnode == _nrows); // number of rows
if (!bn)
{
cout << "######### Error: " << "number of rows" << endl;
}
bool bz = (id[nnode] == _nnz); // number of non zero elements
if (!bz)
{
cout << "######### Error: " << "number of non zero elements" << endl;
}
bool bd = equal(id, id + nnode + 1, _id.cbegin()); // row starts
if (!bd)
{
cout << "######### Error: " << "row starts" << endl;
}
bool bk = equal(ik, ik + id[nnode], _ik.cbegin()); // column indices
if (!bk)
{
cout << "######### Error: " << "column indices" << endl;
}
bool bv = equal(sk, sk + id[nnode], _sk.cbegin()); // values
if (!bv)
{
cout << "######### Error: " << "values" << endl;
}
return bn && bz && bd && bk && bv;
}
void CRS_Matrix::Mult(vector<double> &w, vector<double> const &u) const
{
assert( _nrows == static_cast<int>(w.size()) );
assert( w.size() == u.size() );
for (int row = 0; row < _nrows; ++row)
{
double wi = 0.0;
for (int ij = _id[row]; ij < _id[row + 1]; ++ij)
{
wi += _sk[ij] * u[ _ik[ij] ];
}
w[row] = wi;
}
return;
}
void CRS_Matrix::Defect(vector<double> &w,
vector<double> const &f, vector<double> const &u) const
{
assert( _nrows == static_cast<int>(w.size()) );
assert( w.size() == u.size() && u.size() == f.size() );
for (int row = 0; row < _nrows; ++row)
{
double wi = f[row];
for (int ij = _id[row]; ij < _id[row + 1]; ++ij)
{
wi -= _sk[ij] * u[ _ik[ij] ];
}
w[row] = wi;
}
return;
}
int CRS_Matrix::fetch(int const row, int const col) const
{
int const id2 = _id[row + 1]; // end and
int ip = _id[row]; // start of recent row (global index)
while (ip < id2 && _ik[ip] != col) // find index col (global index)
{
++ip;
}
if (ip >= id2)
{
ip = -1;
#ifndef NDEBUG // compiler option -DNDEBUG switches off the check
cout << "No column " << col << " in row " << row << endl;
assert(ip >= id2);
#endif
}
return ip;
}
void CRS_Matrix::AddElem_3(int const ial[3], double const ske[3][3], double const fe[3], vector<double> &f)
{
for (int i = 0; i < 3; ++i)
{
const int ii = ial[i]; // row ii (global index)
for (int j = 0; j < 3; ++j) // no symmetry assumed
{
const int jj = ial[j]; // column jj (global index)
int ip = fetch(ii, jj); // find column entry jj in row ii
#ifndef NDEBUG // compiler option -DNDEBUG switches off the check
if (ip < 0) // no entry found !!
{
cout << "Error in AddElem: (" << ii << "," << jj << ") ["
<< ial[0] << "," << ial[1] << "," << ial[2] << "]\n";
assert(ip >= 0);
}
#endif
_sk[ip] += ske[i][j];
}
f[ii] += fe[i];
}
}

178
ex3/ex3/getmatrix.h Normal file
View file

@ -0,0 +1,178 @@
#ifndef GETMATRIX_FILE
#define GETMATRIX_FILE
#include "geom.h"
#include <vector>
/**
* Calculates the element stiffness matrix @p ske and the element load vector @p fe
* of one triangular element with linear shape functions.
* @param[in] ial node indices of the three element vertices
* @param[in] xc vector of node coordinates with x(2*k,2*k+1) as coodinates of node k
* @param[out] ske element stiffness matrix
* @param[out] fe element load vector
*/
void CalcElem(int const ial[3], double const xc[], double ske[3][3], double fe[3]);
/**
* Adds the element stiffness matrix @p ske and the element load vector @p fe
* of one triangular element with linear shape functions to the appropriate positions in
* the symmetric stiffness matrix, stored as CSR matrix K(@p sk,@p id, @p ik)
*
* @param[in] ial node indices of the three element vertices
* @param[in] ske element stiffness matrix
* @param[in] fe element load vector
* @param[out] sk vector non-zero entries of CSR matrix
* @param[in] id index vector containing the first entry in a CSR row
* @param[in] ik column index vector of CSR matrix
* @param[out] f distributed local vector storing the right hand side
*
* @warning Algorithm requires indices in connectivity @p ial in ascending order.
* Currently deprecated.
*/
void AddElem(int const ial[3], double const ske[3][3], double const fe[3],
int const id[], int const ik[], double sk[], double f[]);
// #####################################################################
/**
* Square matrix in CRS format (compressed row storage; also named CSR),
* see an <a href="https://en.wikipedia.org/wiki/Sparse_matrix">introduction</a>.
*/
class CRS_Matrix
{
public:
/**
* Intializes the CRS matrix structure from the given discetization in @p mesh.
*
* The sparse matrix pattern is generated but the values are 0.
*
* @param[in] mesh given discretization
*
* @warning A reference to the discretization @p mesh is stored inside this class.
* Therefore, changing @p mesh outside requires also
* to call method @p Derive_Matrix_Pattern explicitely.
*
* @see Derive_Matrix_Pattern
*/
explicit CRS_Matrix(Mesh const & mesh);
/**
* Destructor.
*/
~CRS_Matrix()
{}
/**
* Generates the sparse matrix pattern and overwrites the existing pattern.
*
* The sparse matrix pattern is generated but the values are 0.
*/
void Derive_Matrix_Pattern();
/**
* Calculates the entries of f.e. stiffness matrix and load/rhs vector @p f for the Laplace operator in 2D.
* No memory is allocated.
*
* @param[in,out] f (preallocated) rhs/load vector
*/
void CalculateLaplace(std::vector<double> &f);
/**
* Applies Dirichlet boundary conditions to stiffness matrix and to load vector @p f.
* The <a href="https://www.jstor.org/stable/2005611?seq=1#metadata_info_tab_contents">penalty method</a>
* is used for incorporating the given values @p u.
*
* @param[in] u (global) vector with Dirichlet data
* @param[in,out] f load vector
*/
void ApplyDirichletBC(std::vector<double> const &u, std::vector<double> &f);
/**
* Extracts the diagonal elemenst of the sparse matrix.
*
* @param[in,out] d (prellocated) vector of diagonal elements
*/
void GetDiag(std::vector<double> &d) const;
/**
* Performs the matrix-vector product w := K*u.
*
* @param[in,out] w resulting vector (preallocated)
* @param[in] u vector
*/
void Mult(std::vector<double> &w, std::vector<double> const &u) const;
/**
* Calculates the defect/residuum w := f - K*u.
*
* @param[in,out] w resulting vector (preallocated)
* @param[in] f load vector
* @param[in] u vector
*/
void Defect(std::vector<double> &w,
std::vector<double> const &f, std::vector<double> const &u) const;
/**
* Number rows in matrix.
* @return number of rows.
*/
int Nrows() const
{return _nrows;}
/**
* Show the matrix entries.
*/
void Debug() const;
/**
* Finds in a CRS matrix the access index for an entry at row @p row and column @p col.
*
* @param[in] row row index
* @param[in] col column index
* @return index for element (@p row, @p col). If no appropriate entry exists then -1 will be returned.
*
* @warning assert() stops the function in case that matrix element (@p row, @p col) doesn't exist.
*/
int fetch(int row, int col) const;
/**
* Adds the element stiffness matrix @p ske and the element load vector @p fe
* of one triangular element with linear shape functions to the appropriate positions in
* the stiffness matrix, stored as CSR matrix K(@p sk,@p id, @p ik).
*
* @param[in] ial node indices of the three element vertices
* @param[in] ske element stiffness matrix
* @param[in] fe element load vector
* @param[in,out] f distributed local vector storing the right hand side
*
* @warning Algorithm assumes linear triangular elements (ndof_e==3).
*/
void AddElem_3(int const ial[3], double const ske[3][3], double const fe[3], std::vector<double> &f);
/**
* Compare @p this CRS matrix with an external CRS matrix stored in C-Style.
*
* The method prints statements on differences found.
*
* @param[in] nnode row number of external matrix
* @param[in] id start indices of matrix rows of external matrix
* @param[in] ik column indices of external matrix
* @param[in] sk non-zero values of external matrix
*
* @return true iff all data are identical.
*/
bool Compare2Old(int nnode, int const id[], int const ik[], double const sk[]) const;
private:
Mesh const & _mesh; //!< reference to discretization
int _nrows; //!< number of rows in matrix
int _nnz; //!< number of non-zero entries
std::vector<int> _id; //!< start indices of matrix rows
std::vector<int> _ik; //!< column indices
std::vector<double> _sk; //!< non-zero values
};
#endif

61
ex3/ex3/jacsolve.cpp Normal file
View file

@ -0,0 +1,61 @@
#include "vdop.h"
#include "getmatrix.h"
#include "jacsolve.h"
#include <cassert>
#include <cmath>
#include <iostream>
#include <vector>
using namespace std;
// #####################################################################
// const int neigh[], const int color,
// const MPI::Intracomm& icomm,
void JacobiSolve(CRS_Matrix const &SK, vector<double> const &f, vector<double> &u)
{
const double omega = 1.0;
const int maxiter = 1000;
const double tol = 1e-5, // tolerance
tol2 = tol * tol; // tolerance^2
int nrows = SK.Nrows(); // number of rows == number of columns
assert( nrows == static_cast<int>(f.size()) && f.size() == u.size() );
cout << endl << " Start Jacobi solver for " << nrows << " d.o.f.s" << endl;
// Choose initial guess
for (int k = 0; k < nrows; ++k)
{
u[k] = 0.0; // u := 0
}
vector<double> dd(nrows); // matrix diagonal
vector<double> r(nrows); // residual
vector<double> w(nrows); // correction
SK.GetDiag(dd); // dd := diag(K)
////DebugVector(dd);{int ijk; cin >> ijk;}
// Initial sweep
SK.Defect(r, f, u); // r := f - K*u
vddiv(w, r, dd); // w := D^{-1}*r
double sigma0 = dscapr(w, r); // s0 := <w,r>
// Iteration sweeps
int iter = 0;
double sigma = sigma0;
while ( sigma > tol2 * sigma0 && maxiter > iter)
{
++iter;
vdaxpy(u, u, omega, w ); // u := u + om*w
SK.Defect(r, f, u); // r := f - K*u
vddiv(w, r, dd); // w := D^{-1}*r
sigma = dscapr(w, r); // s0 := <w,r>
// cout << "Iteration " << iter << " : " << sqrt(sigma/sigma0) << endl;
}
cout << "aver. Jacobi rate : " << exp(log(sqrt(sigma / sigma0)) / iter) << " (" << iter << " iter)" << endl;
cout << "final error: " << sqrt(sigma / sigma0) << " (rel) " << sqrt(sigma) << " (abs)\n";
return;
}

18
ex3/ex3/jacsolve.h Normal file
View file

@ -0,0 +1,18 @@
#ifndef JACSOLVE_FILE
#define JACSOLVE_FILE
#include "getmatrix.h"
#include <vector>
/**
* Solves linear system of equations K @p u = @p f via the Jacobi iteration.
* We use a distributed symmetric CSR matrix @p SK and initial guess of the
* solution is set to 0.
* @param[in] SK CSR matrix
* @param[in] f distributed local vector storing the right hand side
* @param[out] u accumulated local vector storing the solution.
*/
void JacobiSolve(CRS_Matrix const &SK, std::vector<double> const &f, std::vector<double> &u);
#endif

101
ex3/ex3/main.cpp Normal file
View file

@ -0,0 +1,101 @@
#include "benchmarks.h"
#include "benchmark_tests.h"
#include "factorization_solve_tests.h"
#include <iostream>
int main()
{
// ---------------------------------- 1. ----------------------------------
// results in file "test_system.txt"
// ---------------------------------- 2. ----------------------------------
// Memory FLOPS Read-write operations
// (A) 2N 2N 2N
// (B) MN + N 2NM 2NM + M
// (C) ML + LM 2LNM 2LNM + MN
// (D) (p+1) + N 3N(p+1) N(2 + 3(p+1))
// ---------------------------------- 3. ----------------------------------
// implementation in file "benchmarks.cpp"
// ---------------------------------- 4.& 6. ----------------------------------
vector<vector<double>> results;
results.push_back(test_A(250, 50000000, scalar));
results.push_back(test_A(250, 50000000, Kahan_skalar));
results.push_back(test_A(250, 50000000, scalar_cBLAS));
// Timing GFLOPS GiByte/s
// ------------------------------------------
// scalar 0.039 2.4 19
// Kahan_skalar 0.037 2.5 20
// scalar_cBLAS 0.032 2.9 23
results.push_back(test_B(100, 20000, 10000, MatVec));
results.push_back(test_B(100, 20000, 10000, MatVec_cBLAS));
// Timing GFLOPS GiByte/s
// ------------------------------------------
// MatVec 0.1 3.6 29
// MatVec_cBLAS 0.074 5 40
results.push_back(test_C(25, 500, 1000, 1500, MatMat));
results.push_back(test_C(25, 500, 1000, 1500, MatMat_cBLAS));
// Timing GFLOPS GiByte/s
// ------------------------------------------
// MatMat 0.57 2.5 20
// MatMat_cBLAS 0.019 75 6e+02 // unrealistic
results.push_back(test_D(100, 100, 1000000));
// Timing GFLOPS GiByte/s
// ------------------------------------------
// 0.11 2.5 20
cout << endl << "Timing\tGFLOPS\tGiByte/s" << endl;
cout << "------------------------------" << endl;
for (size_t i = 0; i < results.size(); ++i)
cout << results[i][0] << "\t" << results[i][1] << "\t" << results[i][2] << endl;
cout << endl;
// ---------------------------------- 5. ----------------------------------
// 5.(a) Observation: time to calculate norm is approximately half the time as for the scalar product.
// Reason: only have to access entries of x, so less memory that has to be accessed
//
// 5.(b) Runtime for Kahan_scalar is roughly the same as for the normal scalar product
// ---------------------------------- 6. ----------------------------------
// see 4.
// ---------------------------------- 7. ----------------------------------
CheckCorrectness();
// Checked correctness by computing the inverse of A
CheckDuration(5000);
// The solving time per RHS scales roughly with factor 1/n_rhs
// ---------------------------------- 8. ----------------------------------
// done seperately
return 0;
}

16
ex3/ex3/userset.cpp Normal file
View file

@ -0,0 +1,16 @@
#include "userset.h"
#include <cmath>
double FunctF(double const x, double const y)
{
// return std::sin(3.14159*1*x)*std::sin(3.14159*1*y);
// return 16.0*1024. ;
// return (double)1.0 ;
return x * x * std::sin(2.5 * 3.14159 * y);
}
double FunctU(const double /* x */, double const /* y */)
{
return 1.0 ;
}

44
ex3/ex3/userset.h Normal file
View file

@ -0,0 +1,44 @@
#ifndef USERSET_FILE
#define USERSET_FILE
#include <cmath>
/**
* User function: f(@p x,@p y)
* @param[in] x x-coordinate of discretization point
* @param[in] y y-coordinate of discretization point
* @return value for right hand side f(@p x,@p y)
*/
double FunctF(double const x, double const y);
/**
* User function: u(@p x,@p y)
* @param[in] x x-coordinate of discretization point
* @param[in] y y-coordinate of discretization point
* @return value for solution vector u(@p x,@p y)
*/
double FunctU(double const x, double const y);
/**
* User function: f(@p x,@p y) = @f$ x^2 \sin(2.5\pi y)@f$.
* @param[in] x x-coordinate of discretization point
* @param[in] y y-coordinate of discretization point
* @return value f(@p x,@p y)
*/
inline double fNice(double const x, double const y)
{
return x * x * std::sin(2.5 * 3.14159 * y);
}
/**
* User function: f(@p x,@p y) = 0$.
* @param[in] x x-coordinate of discretization point
* @param[in] y y-coordinate of discretization point
* @return value 0
*/
inline double f_zero(double const x, double const y)
//double f_zero(double const /*x*/, double const /*y*/)
{
return 0.0 + 0.0*(x+y);
}
#endif

84
ex3/ex3/vdop.cpp Normal file
View file

@ -0,0 +1,84 @@
#include "vdop.h"
#include <cassert> // assert()
#include <cmath>
#include <iostream>
#include <vector>
using namespace std;
void vddiv(vector<double> &x, vector<double> const &y,
vector<double> const &z)
{
assert( x.size() == y.size() && y.size() == z.size() );
size_t n = x.size();
for (size_t k = 0; k < n; ++k)
{
x[k] = y[k] / z[k];
}
return;
}
//******************************************************************************
void vdaxpy(std::vector<double> &x, std::vector<double> const &y,
double alpha, std::vector<double> const &z )
{
assert( x.size() == y.size() && y.size() == z.size() );
size_t n = x.size();
for (size_t k = 0; k < n; ++k)
{
x[k] = y[k] + alpha * z[k];
}
return;
}
//******************************************************************************
double dscapr(std::vector<double> const &x, std::vector<double> const &y)
{
assert( x.size() == y.size());
size_t n = x.size();
double s = 0.0;
for (size_t k = 0; k < n; ++k)
{
s += x[k] * y[k];
}
return s;
}
//******************************************************************************
void DebugVector(vector<double> const &v)
{
cout << "\nVector (nnode = " << v.size() << ")\n";
for (size_t j = 0; j < v.size(); ++j)
{
cout.setf(ios::right, ios::adjustfield);
cout << v[j] << " ";
}
cout << endl;
return;
}
//******************************************************************************
bool CompareVectors(std::vector<double> const &x, int const n, double const y[], double const eps)
{
bool bn = (static_cast<int>(x.size()) == n);
if (!bn)
{
cout << "######### Error: " << "number of elements" << endl;
}
//bool bv = equal(x.cbegin(),x.cend(),y);
bool bv = equal(x.cbegin(), x.cend(), y,
[eps](double a, double b) -> bool
{ return std::abs(a - b) < eps * (1.0 + 0.5 * (std::abs(a) + std::abs(a))); }
);
if (!bv)
{
assert(static_cast<int>(x.size()) == n);
cout << "######### Error: " << "values" << endl;
}
return bn && bv;
}

58
ex3/ex3/vdop.h Normal file
View file

@ -0,0 +1,58 @@
#ifndef VDOP_FILE
#define VDOP_FILE
#include <vector>
/** @brief Element-wise vector divison x_k = y_k/z_k.
*
* @param[out] x target vector
* @param[in] y source vector
* @param[in] z source vector
*
*/
void vddiv(std::vector<double> & x, std::vector<double> const& y,
std::vector<double> const& z);
/** @brief Element-wise daxpy operation x(k) = y(k) + alpha*z(k).
*
* @param[out] x target vector
* @param[in] y source vector
* @param[in] alpha scalar
* @param[in] z source vector
*
*/
void vdaxpy(std::vector<double> & x, std::vector<double> const& y,
double alpha, std::vector<double> const& z );
/** @brief Calculates the Euclidian inner product of two vectors.
*
* @param[in] x vector
* @param[in] y vector
* @return Euclidian inner product @f$\langle x,y \rangle@f$
*
*/
double dscapr(std::vector<double> const& x, std::vector<double> const& y);
/**
* Print entries of a vector.
* @param[in] v vector values
*/
void DebugVector(std::vector<double> const &v);
/** @brief Compares an STL vector with POD vector.
*
* The accuracy criteria @f$ |x_k-y_k| < \varepsilon \left({1+0.5(|x_k|+|y_k|)}\right) @f$
* follows the book by
* <a href="https://www.springer.com/la/book/9783319446592">Stoyan/Baran</a>, p.8.
*
* @param[in] x STL vector
* @param[in] n length of POD vector
* @param[in] y POD vector
* @param[in] eps relative accuracy criteria (default := 0.0).
* @return true iff pairwise vector elements are relatively close to each other.
*
*/
bool CompareVectors(std::vector<double> const& x, int n, double const y[], double const eps=0.0);
#endif