Pushing everything again, accidentally deleted my remote repository
This commit is contained in:
commit
1bee3e8e5b
101 changed files with 9428 additions and 0 deletions
123
ex1/CLANG_default.mk
Normal file
123
ex1/CLANG_default.mk
Normal file
|
|
@ -0,0 +1,123 @@
|
|||
# Basic Defintions for using GNU-compiler suite sequentially
|
||||
# requires setting of COMPILER=CLANG_
|
||||
|
||||
#CLANGPATH=//usr/lib/llvm-10/bin/
|
||||
CC = ${CLANGPATH}clang
|
||||
CXX = ${CLANGPATH}clang++
|
||||
#CXX = ${CLANGPATH}clang++ -lomptarget -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=/opt/pgi/linux86-64/2017/cuda/8.0
|
||||
#F77 = gfortran
|
||||
LINKER = ${CXX}
|
||||
|
||||
#http://clang.llvm.org/docs/UsersManual.html#options-to-control-error-and-warning-messages
|
||||
WARNINGS += -Weverything -Wno-c++98-compat -Wno-sign-conversion -Wno-date-time -Wno-shorten-64-to-32 -Wno-padded -ferror-limit=1
|
||||
WARNINGS += -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic
|
||||
#-fsyntax-only -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic
|
||||
|
||||
CXXFLAGS += -O3 -std=c++17 -ferror-limit=1 ${WARNINGS}
|
||||
# don't use -Ofast
|
||||
# -ftrapv
|
||||
LINKFLAGS += -O3
|
||||
|
||||
# different libraries in Ubuntu or manajaró
|
||||
ifndef UBUNTU
|
||||
UBUNTU=1
|
||||
endif
|
||||
|
||||
# BLAS, LAPACK
|
||||
LINKFLAGS += -llapack -lblas
|
||||
# -lopenblas
|
||||
ifeq ($(UBUNTU),1)
|
||||
# ubuntu
|
||||
else
|
||||
# on archlinux
|
||||
LINKFLAGS += -lcblas
|
||||
endif
|
||||
|
||||
# interprocedural optimization
|
||||
CXXFLAGS += -flto
|
||||
LINKFLAGS += -flto
|
||||
|
||||
# very good check
|
||||
# http://clang.llvm.org/extra/clang-tidy/
|
||||
# good check, see: http://llvm.org/docs/CodingStandards.html#include-style
|
||||
SWITCH_OFF=,-readability-magic-numbers,-readability-redundant-control-flow,-readability-redundant-member-init
|
||||
SWITCH_OFF+=,-readability-redundant-member-init,-readability-isolate-declaration
|
||||
#READABILITY=,readability*${SWITCH_OFF}
|
||||
#TIDYFLAGS = -checks=llvm-*,-llvm-header-guard -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
|
||||
TIDYFLAGS = -checks=llvm-*,-llvm-header-guard${READABILITY} -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
|
||||
#TIDYFLAGS += -checks='modernize*
|
||||
# ???
|
||||
#TIDYFLAGS = -checks='cert*' -header-filter=.*
|
||||
# MPI checks ??
|
||||
#TIDYFLAGS = -checks='mpi*'
|
||||
# ??
|
||||
#TIDYFLAGS = -checks='performance*' -header-filter=.*
|
||||
#TIDYFLAGS = -checks='portability-*' -header-filter=.*
|
||||
#TIDYFLAGS = -checks='readability-*' -header-filter=.*
|
||||
|
||||
default: ${PROGRAM}
|
||||
|
||||
${PROGRAM}: ${OBJECTS}
|
||||
$(LINKER) $^ ${LINKFLAGS} -o $@
|
||||
|
||||
clean:
|
||||
@rm -f ${PROGRAM} ${OBJECTS}
|
||||
|
||||
clean_all:: clean
|
||||
@rm -f *_ *~ *.bak *.log *.out *.tar
|
||||
|
||||
codecheck: tidy_check
|
||||
tidy_check:
|
||||
clang-tidy ${SOURCES} ${TIDYFLAGS} -- ${SOURCES}
|
||||
# see also http://clang-developers.42468.n3.nabble.com/Error-while-trying-to-load-a-compilation-database-td4049722.html
|
||||
|
||||
run: clean ${PROGRAM}
|
||||
# time ./${PROGRAM} ${PARAMS}
|
||||
./${PROGRAM} ${PARAMS}
|
||||
|
||||
# tar the current directory
|
||||
MY_DIR = `basename ${PWD}`
|
||||
tar: clean_all
|
||||
@echo "Tar the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
# tar cf `basename ${PWD}`.tar *
|
||||
|
||||
doc:
|
||||
doxygen Doxyfile
|
||||
|
||||
#########################################################################
|
||||
|
||||
.cpp.o:
|
||||
$(CXX) -c $(CXXFLAGS) -o $@ $<
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) -o $@ $<
|
||||
|
||||
.f.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
##################################################################################################
|
||||
# some tools
|
||||
# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags)
|
||||
cache: ${PROGRAM}
|
||||
valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS}
|
||||
# kcachegrind callgrind.out.<pid> &
|
||||
kcachegrind `ls -1tr callgrind.out.* |tail -1`
|
||||
|
||||
# Check for wrong memory accesses, memory leaks, ...
|
||||
# use smaller data sets
|
||||
mem: ${PROGRAM}
|
||||
valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS}
|
||||
|
||||
# Simple run time profiling of your code
|
||||
# CXXFLAGS += -g -pg
|
||||
# LINKFLAGS += -pg
|
||||
prof: ${PROGRAM}
|
||||
perf record ./$^ ${PARAMS}
|
||||
perf report
|
||||
# gprof -b ./$^ > gp.out
|
||||
# kprof -f gp.out -p gprof &
|
||||
|
||||
codecheck: tidy_check
|
||||
130
ex1/GCC_AMD32_default.mk
Normal file
130
ex1/GCC_AMD32_default.mk
Normal file
|
|
@ -0,0 +1,130 @@
|
|||
# Basic Defintions for using GNU-compiler suite sequentially
|
||||
# requires setting of COMPILER=GCC_
|
||||
|
||||
CC = gcc
|
||||
CXX = g++
|
||||
F77 = gfortran
|
||||
LINKER = ${CXX}
|
||||
|
||||
# on mephisto:
|
||||
#CXXFLAGS += -I/share/apps/atlas/include
|
||||
#LINKFLAGS += -L/share/apps/atlas/lib
|
||||
#LINKFLAGS += -lcblas -latlas
|
||||
|
||||
#LINKFLAGS += -lblas
|
||||
# Der <cblas.h> Header muss mit extern "C" versehen werden, damit g++ alles findet.
|
||||
|
||||
|
||||
#WARNINGS = -pedantic -pedantic-errors -Wall -Wextra -Werror -Wconversion -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow
|
||||
WARNINGS = -pedantic -Wall -Wextra -Wconversion -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
|
||||
-Wredundant-decls -Winline -fmax-errors=1
|
||||
# -Wunreachable-code
|
||||
# -Wunreachable-code
|
||||
CXXFLAGS += -ffast-math -O3 -march=native -std=c++17 ${WARNINGS}
|
||||
#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS}
|
||||
#-msse3
|
||||
# -ftree-vectorizer-verbose=2 -DNDEBUG
|
||||
# -ftree-vectorizer-verbose=5
|
||||
# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump -fdump-tree-pre=stderr
|
||||
|
||||
# CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details
|
||||
# CFLAGS = -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2
|
||||
# #CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
|
||||
# FFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
|
||||
# LFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
|
||||
LINKFLAGS += -O3
|
||||
|
||||
# BLAS, LAPACK
|
||||
OPENBLAS_DIR = /opt/openblas_GCCseq
|
||||
#OPENBLAS_DIR = /opt/openblas_GCC
|
||||
OPENBLAS_LIBDIR = ${OPENBLAS_DIR}/lib
|
||||
OPENBLAS_INCDIR = ${OPENBLAS_DIR}/include
|
||||
CXXFLAGS += -I${OPENBLAS_INCDIR}
|
||||
LINKFLAGS += -L${OPENBLAS_LIBDIR} -lopenblas
|
||||
|
||||
# interprocedural optimization
|
||||
CXXFLAGS += -flto
|
||||
LINKFLAGS += -flto
|
||||
|
||||
# profiling tools
|
||||
#CXXFLAGS += -pg
|
||||
#LINKFLAGS += -pg
|
||||
|
||||
default: ${PROGRAM}
|
||||
|
||||
${PROGRAM}: ${OBJECTS}
|
||||
$(LINKER) $^ ${LINKFLAGS} -o $@
|
||||
|
||||
clean:
|
||||
@rm -f ${PROGRAM} ${OBJECTS}
|
||||
|
||||
clean_all:: clean
|
||||
-@rm -f *_ *~ *.bak *.log *.out *.tar *.orig
|
||||
-@rm -r html
|
||||
|
||||
run: clean ${PROGRAM}
|
||||
# time ./${PROGRAM}
|
||||
# ./${PROGRAM}
|
||||
( export LD_LIBRARY_PATH=${OPENBLAS_LIBDIR}:${LD_LIBRARY_PATH} ; ./${PROGRAM} )
|
||||
# or 'export LD_LIBRARY_PATH=/opt/openblas_gcc/lib:${LD_LIBRARY_PATH}' in your ~/.bashrc
|
||||
|
||||
# tar the current directory
|
||||
MY_DIR = `basename ${PWD}`
|
||||
tar:
|
||||
@echo "Tar the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
# tar cf `basename ${PWD}`.tar *
|
||||
|
||||
doc:
|
||||
doxygen Doxyfile
|
||||
|
||||
#########################################################################
|
||||
|
||||
.cpp.o:
|
||||
$(CXX) -c $(CXXFLAGS) -o $@ $<
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) -o $@ $<
|
||||
|
||||
.f.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
##################################################################################################
|
||||
# some tools
|
||||
# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags)
|
||||
cache: ${PROGRAM}
|
||||
valgrind --tool=callgrind --simulate-cache=yes ./$^
|
||||
# kcachegrind callgrind.out.<pid> &
|
||||
kcachegrind `ls -1tr callgrind.out.* |tail -1`
|
||||
|
||||
# Check for wrong memory accesses, memory leaks, ...
|
||||
# use smaller data sets
|
||||
# no "-pg" in compile/link options
|
||||
mem: ${PROGRAM}
|
||||
valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^
|
||||
|
||||
# Simple run time profiling of your code
|
||||
# CXXFLAGS += -g -pg
|
||||
# LINKFLAGS += -pg
|
||||
prof: ${PROGRAM}
|
||||
./$^
|
||||
gprof -b ./$^ > gp.out
|
||||
# kprof -f gp.out -p gprof &
|
||||
|
||||
#Trace your heap:
|
||||
#> heaptrack ./main.GCC_
|
||||
#> heaptrack_gui heaptrack.main.GCC_.<pid>.gz
|
||||
heap: ${PROGRAM}
|
||||
heaptrack ./$^ 11
|
||||
heaptrack_gui `ls -1tr heaptrack.$^.* |tail -1` &
|
||||
|
||||
|
||||
|
||||
########################################################################
|
||||
# get the detailed status of all optimization flags
|
||||
info:
|
||||
echo "detailed status of all optimization flags"
|
||||
$(CXX) --version
|
||||
$(CXX) -Q $(CXXFLAGS) --help=optimizers
|
||||
183
ex1/GCC_default.mk
Normal file
183
ex1/GCC_default.mk
Normal file
|
|
@ -0,0 +1,183 @@
|
|||
# Basic Defintions for using GNU-compiler suite sequentially
|
||||
# requires setting of COMPILER=GCC_
|
||||
|
||||
CC = gcc
|
||||
CXX = g++
|
||||
F77 = gfortran
|
||||
LINKER = ${CXX}
|
||||
|
||||
WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
|
||||
-Wredundant-decls -Winline -fmax-errors=1
|
||||
# -Wunreachable-code
|
||||
CXXFLAGS += -ffast-math -O3 -march=native -std=c++17 ${WARNINGS}
|
||||
#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS}
|
||||
#-msse3
|
||||
# -ftree-vectorizer-verbose=2 -DNDEBUG
|
||||
# -ftree-vectorizer-verbose=5
|
||||
# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump -fdump-tree-pre=stderr
|
||||
|
||||
# CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details
|
||||
# CFLAGS = -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2
|
||||
# #CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
|
||||
# FFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
|
||||
# LFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
|
||||
LINKFLAGS += -O3
|
||||
|
||||
#architecture
|
||||
#CPU = -march=znver2
|
||||
CXXFLAGS += ${CPU}
|
||||
LINKFLAGS += ${CPU}
|
||||
|
||||
# different libraries in Ubuntu or manajaró
|
||||
ifndef UBUNTU
|
||||
UBUNTU=1
|
||||
endif
|
||||
|
||||
# BLAS, LAPACK
|
||||
ifeq ($(UBUNTU),1)
|
||||
LINKFLAGS += -llapack -lblas
|
||||
# -lopenblas
|
||||
else
|
||||
# on archlinux
|
||||
LINKFLAGS += -llapack -lopenblas -lcblas
|
||||
endif
|
||||
|
||||
# interprocedural optimization
|
||||
CXXFLAGS += -flto
|
||||
LINKFLAGS += -flto
|
||||
|
||||
# for debugging purpose (save code)
|
||||
# -fsanitize=leak # only one out the three can be used
|
||||
# -fsanitize=address
|
||||
# -fsanitize=thread
|
||||
SANITARY = -fsanitize=address -fsanitize=undefined -fsanitize=null -fsanitize=return \
|
||||
-fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \
|
||||
-fsanitize=bool -fsanitize=enum -fsanitize=vptr
|
||||
#CXXFLAGS += ${SANITARY}
|
||||
#LINKFLAGS += ${SANITARY}
|
||||
|
||||
# profiling tools
|
||||
#CXXFLAGS += -pg
|
||||
#LINKFLAGS += -pg
|
||||
|
||||
|
||||
default: ${PROGRAM}
|
||||
|
||||
${PROGRAM}: ${OBJECTS}
|
||||
$(LINKER) $^ ${LINKFLAGS} -o $@
|
||||
|
||||
clean:
|
||||
@rm -f ${PROGRAM} ${OBJECTS}
|
||||
|
||||
clean_all:: clean
|
||||
-@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt
|
||||
-@rm -rf html
|
||||
|
||||
run: clean ${PROGRAM}
|
||||
#run: ${PROGRAM}
|
||||
# time ./${PROGRAM} ${PARAMS}
|
||||
./${PROGRAM} ${PARAMS}
|
||||
|
||||
# tar the current directory
|
||||
MY_DIR = `basename ${PWD}`
|
||||
tar: clean_all
|
||||
@echo "Tar the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
# tar cf `basename ${PWD}`.tar *
|
||||
#find . -size +10M > large_files
|
||||
#--exclude-from ${MY_DIR}/large_files
|
||||
|
||||
zip: clean
|
||||
@echo "Zip the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
zip -r ${MY_DIR}.zip ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
|
||||
doc:
|
||||
doxygen Doxyfile
|
||||
|
||||
#########################################################################
|
||||
.SUFFIXES: .f90
|
||||
|
||||
.cpp.o:
|
||||
$(CXX) -c $(CXXFLAGS) -o $@ $<
|
||||
# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $<.log
|
||||
# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $(<:.cpp=.log)
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) -o $@ $<
|
||||
|
||||
.f.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
.f90.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
##################################################################################################
|
||||
# some tools
|
||||
# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags)
|
||||
cache: ${PROGRAM}
|
||||
valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS}
|
||||
# kcachegrind callgrind.out.<pid> &
|
||||
kcachegrind `ls -1tr callgrind.out.* |tail -1`
|
||||
|
||||
# Check for wrong memory accesses, memory leaks, ...
|
||||
# use smaller data sets
|
||||
# no "-pg" in compile/link options
|
||||
mem: ${PROGRAM}
|
||||
valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS}
|
||||
# Graphical interface
|
||||
# valkyrie
|
||||
|
||||
# Simple run time profiling of your code
|
||||
# CXXFLAGS += -g -pg
|
||||
# LINKFLAGS += -pg
|
||||
prof: ${PROGRAM}
|
||||
perf record ./$^ ${PARAMS}
|
||||
perf report
|
||||
# gprof -b ./$^ > gp.out
|
||||
# kprof -f gp.out -p gprof &
|
||||
|
||||
# perf in Ubuntu 20.04: https://www.howtoforge.com/how-to-install-perf-performance-analysis-tool-on-ubuntu-20-04/
|
||||
# * install
|
||||
# * sudo vi /etc/sysctl.conf
|
||||
# add kernel.perf_event_paranoid = 0
|
||||
|
||||
#Trace your heap:
|
||||
#> heaptrack ./main.GCC_
|
||||
#> heaptrack_gui heaptrack.main.GCC_.<pid>.gz
|
||||
heap: ${PROGRAM}
|
||||
heaptrack ./$^ ${PARAMS}
|
||||
heaptrack_gui `ls -1tr heaptrack.$^.* |tail -1` &
|
||||
|
||||
codecheck: $(SOURCES)
|
||||
cppcheck --enable=all --inconclusive --std=c++17 --suppress=missingIncludeSystem $^
|
||||
|
||||
|
||||
########################################################################
|
||||
# get the detailed status of all optimization flags
|
||||
info:
|
||||
echo "detailed status of all optimization flags"
|
||||
$(CXX) --version
|
||||
$(CXX) -Q $(CXXFLAGS) --help=optimizers
|
||||
lscpu
|
||||
inxi -C
|
||||
lstopo
|
||||
|
||||
# Excellent hardware info
|
||||
# hardinfo
|
||||
# Life monitoring of CPU frequency etc.
|
||||
# sudo i7z
|
||||
|
||||
# Memory consumption
|
||||
# vmstat -at -SM 3
|
||||
# xfce4-taskmanager
|
||||
|
||||
|
||||
# https://www.tecmint.com/check-linux-cpu-information/
|
||||
#https://www.tecmint.com/monitor-cpu-and-gpu-temperature-in-ubuntu/
|
||||
|
||||
# Debugging:
|
||||
# https://wiki.archlinux.org/index.php/Debugging
|
||||
137
ex1/ICC_default.mk
Normal file
137
ex1/ICC_default.mk
Normal file
|
|
@ -0,0 +1,137 @@
|
|||
# Basic Defintions for using INTEL compiler suite sequentially
|
||||
# requires setting of COMPILER=ICC_
|
||||
|
||||
#BINDIR = /opt/intel/bin/
|
||||
|
||||
# special on my sony [GH]
|
||||
#BINDIR = /opt/save.intel/bin/
|
||||
# very special on my sony [GH]
|
||||
# FIND_LIBS = -L /opt/save.intel/composer_xe_2013.1.117/mkl/lib/intel64/libmkl_intel_lp64.so
|
||||
|
||||
# Error with g++-4.8 using icpc14.0,x:
|
||||
# find directory wherein bits/c++config.h is located
|
||||
# 'locate bits/c++config.h'
|
||||
#FOUND_CONFIG = -I/usr/include/x86_64-linux-gnu/c++/4.8
|
||||
|
||||
|
||||
CC = ${BINDIR}icc
|
||||
CXX = ${BINDIR}icpc
|
||||
F77 = ${BINDIR}ifort
|
||||
LINKER = ${CXX}
|
||||
|
||||
|
||||
WARNINGS = -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -wd2015,2012 -wn3
|
||||
# -Winline -Wredundant-decls -Wunreachable-code
|
||||
CXXFLAGS += -O3 -fargument-noalias -std=c++17 -DNDEBUG ${WARNINGS} -mkl ${FOUND_CONFIG}
|
||||
# profiling tools
|
||||
#CXXFLAGS += -pg
|
||||
#LINKFLAGS += -pg
|
||||
# -vec-report=3
|
||||
# -qopt-report=5 -qopt-report-phase=vec
|
||||
# -guide -parallel
|
||||
# -guide-opts=string -guide-par[=n] -guide-vec[=n]
|
||||
# -auto-p32 -simd
|
||||
CXXFLAGS += -align
|
||||
|
||||
# use MKL by INTEL
|
||||
# https://software.intel.com/content/www/us/en/develop/documentation/mkl-linux-developer-guide/top/linking-your-application-with-the-intel-math-kernel-library/linking-quick-start/using-the-mkl-compiler-option.html
|
||||
# https://software.intel.com/content/www/us/en/develop/articles/intel-mkl-link-line-advisor.html
|
||||
# LINKFLAGS += -L${BINDIR}../composer_xe_2013.1.117/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
|
||||
#LINKFLAGS += -O3 -L/opt/intel/mkl/lib -mkl
|
||||
LINKFLAGS += -O3 -mkl=sequential
|
||||
|
||||
# interprocedural optimization
|
||||
CXXFLAGS += -ipo
|
||||
LINKFLAGS += -ipo
|
||||
|
||||
# annotated assembler file
|
||||
ANNOTED = -fsource-asm -S
|
||||
|
||||
default: ${PROGRAM}
|
||||
|
||||
${PROGRAM}: ${OBJECTS}
|
||||
$(LINKER) $^ ${LINKFLAGS} -o $@
|
||||
|
||||
clean:
|
||||
rm -f ${PROGRAM} ${OBJECTS}
|
||||
|
||||
clean_all:: clean
|
||||
@rm -f *_ *~ *.bak *.log *.out *.tar
|
||||
|
||||
run: clean ${PROGRAM}
|
||||
./${PROGRAM}
|
||||
|
||||
# tar the current directory
|
||||
MY_DIR = `basename ${PWD}`
|
||||
tar: clean_all
|
||||
@echo "Tar the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
# tar cf `basename ${PWD}`.tar *
|
||||
|
||||
doc:
|
||||
doxygen Doxyfile
|
||||
|
||||
#########################################################################
|
||||
|
||||
.cpp.o:
|
||||
$(CXX) -c $(CXXFLAGS) -o $@ $<
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) -o $@ $<
|
||||
|
||||
.f.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
##################################################################################################
|
||||
# # some tools
|
||||
# # Cache behaviour (CXXFLAGS += -g tracks down to source lines)
|
||||
# cache: ${PROGRAM}
|
||||
# valgrind --tool=callgrind --simulate-cache=yes ./$^
|
||||
# # kcachegrind callgrind.out.<pid> &
|
||||
#
|
||||
# # Check for wrong memory accesses, memory leaks, ...
|
||||
# # use smaller data sets
|
||||
# mem: ${PROGRAM}
|
||||
# valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^
|
||||
#
|
||||
# # Simple run time profiling of your code
|
||||
# # CXXFLAGS += -g -pg
|
||||
# # LINKFLAGS += -pg
|
||||
# prof: ${PROGRAM}
|
||||
# ./$^
|
||||
# gprof -b ./$^ > gp.out
|
||||
# # kprof -f gp.out -p gprof &
|
||||
#
|
||||
|
||||
|
||||
mem: inspector
|
||||
prof: amplifier
|
||||
cache: amplifier
|
||||
|
||||
gap_par_report:
|
||||
${CXX} -c -guide -parallel $(SOURCES) 2> gap.txt
|
||||
|
||||
# GUI for performance report
|
||||
amplifier: ${PROGRAM}
|
||||
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
|
||||
# alternatively to the solution above:
|
||||
#edit file /etc/sysctl.d/10-ptrace.conf and set variable kernel.yama.ptrace_scope variable to 0 .
|
||||
amplxe-gui &
|
||||
|
||||
# GUI for Memory and Thread analyzer (race condition)
|
||||
inspector: ${PROGRAM}
|
||||
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
|
||||
inspxe-gui &
|
||||
|
||||
advisor:
|
||||
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
|
||||
advixe-gui &
|
||||
|
||||
icc-info:
|
||||
icpc -# main.cpp
|
||||
|
||||
|
||||
|
||||
|
||||
176
ex1/ONEAPI_default.mk
Normal file
176
ex1/ONEAPI_default.mk
Normal file
|
|
@ -0,0 +1,176 @@
|
|||
# Basic Defintions for using INTEL compiler suite sequentially
|
||||
# requires setting of COMPILER=ONEAPI_
|
||||
|
||||
# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html
|
||||
# requires
|
||||
# source /opt/intel/oneapi/setvars.sh
|
||||
# on AMD: export MKL_DEBUG_CPU_TYPE=5
|
||||
|
||||
#BINDIR = /opt/intel/oneapi/compiler/latest/linux/bin/
|
||||
#MKL_ROOT = /opt/intel/oneapi/mkl/latest/
|
||||
#export KMP_AFFINITY=verbose,compact
|
||||
|
||||
CC = ${BINDIR}icc
|
||||
CXX = ${BINDIR}dpcpp
|
||||
F77 = ${BINDIR}ifort
|
||||
LINKER = ${CXX}
|
||||
|
||||
## Compiler flags
|
||||
WARNINGS = -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -pedantic
|
||||
WARNINGS += -Wpessimizing-move -Wredundant-move
|
||||
#-wd2015,2012,2014 -wn3
|
||||
# -Winline -Wredundant-decls -Wunreachable-code
|
||||
# -qopt-subscript-in-range
|
||||
# -vec-threshold0
|
||||
|
||||
CXXFLAGS += -O3 -std=c++17 ${WARNINGS}
|
||||
#CXXFLAGS += -DMKL_ILP64 -I"${MKLROOT}/include"
|
||||
#CXXFLAGS += -DMKL_ILP32 -I"${MKLROOT}/include"
|
||||
LINKFLAGS += -O3
|
||||
|
||||
# interprocedural optimization
|
||||
CXXFLAGS += -ipo
|
||||
LINKFLAGS += -ipo
|
||||
LINKFLAGS += -flto
|
||||
|
||||
# annotated Assembler file
|
||||
ANNOTED = -fsource-asm -S
|
||||
|
||||
#architecture
|
||||
CPU = -march=core-avx2
|
||||
#CPU += -mtp=zen
|
||||
# -xCORE-AVX2
|
||||
# -axcode COMMON-AVX512 -axcode MIC-AVX512 -axcode CORE-AVX512 -axcode CORE-AVX2
|
||||
CXXFLAGS += ${CPU}
|
||||
LINKFLAGS += ${CPU}
|
||||
|
||||
# use MKL by INTEL
|
||||
# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html
|
||||
# sequential MKL
|
||||
# use the 32 bit interface (LP64) instead of 64 bit interface (ILP64)
|
||||
CXXFLAGS += -qmkl=sequential -UMKL_ILP64
|
||||
LINKFLAGS += -O3 -qmkl=sequential -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
|
||||
#LINKFLAGS += -O3 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
|
||||
|
||||
# shared libs: https://aur.archlinux.org/packages/intel-oneapi-compiler-static
|
||||
# install intel-oneapi-compiler-static
|
||||
# or
|
||||
LINKFLAGS += -shared-intel
|
||||
|
||||
|
||||
OPENMP = -qopenmp
|
||||
CXXFLAGS += ${OPENMP}
|
||||
LINKFLAGS += ${OPENMP}
|
||||
|
||||
|
||||
# profiling tools
|
||||
#CXXFLAGS += -pg
|
||||
#LINKFLAGS += -pg
|
||||
# -vec-report=3
|
||||
# -qopt-report=5 -qopt-report-phase=vec -qopt-report-phase=openmp
|
||||
# -guide -parallel
|
||||
# -guide-opts=string -guide-par[=n] -guide-vec[=n]
|
||||
# -auto-p32 -simd
|
||||
|
||||
# Reports: https://software.intel.com/en-us/articles/getting-the-most-out-of-your-intel-compiler-with-the-new-optimization-reports
|
||||
#CXXFLAGS += -qopt-report=5 -qopt-report-phase=vec,par
|
||||
#CXXFLAGS += -qopt-report=5 -qopt-report-phase=cg
|
||||
# Redirect report from *.optrpt to stderr
|
||||
# -qopt-report-file=stderr
|
||||
# Guided paralellization
|
||||
# -guide -parallel
|
||||
# -guide-opts=string -guide-par[=n] -guide-vec[=n]
|
||||
# -auto-p32 -simd
|
||||
|
||||
## run time checks
|
||||
# https://www.intel.com/content/www/us/en/develop/documentation/fortran-compiler-oneapi-dev-guide-and-reference/top/compiler-reference/compiler-options/offload-openmp-and-parallel-processing-options/par-runtime-control-qpar-runtime-control.html
|
||||
|
||||
|
||||
default: ${PROGRAM}
|
||||
|
||||
${PROGRAM}: ${OBJECTS}
|
||||
$(LINKER) $^ ${LINKFLAGS} -o $@
|
||||
|
||||
clean:
|
||||
rm -f ${PROGRAM} ${OBJECTS} *.optrpt
|
||||
|
||||
clean_all:: clean
|
||||
@rm -f *_ *~ *.bak *.log *.out *.tar
|
||||
|
||||
run: clean ${PROGRAM}
|
||||
./${PROGRAM}
|
||||
|
||||
# tar the current directory
|
||||
MY_DIR = `basename ${PWD}`
|
||||
tar: clean_all
|
||||
@echo "Tar the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
# tar cf `basename ${PWD}`.tar *
|
||||
|
||||
doc:
|
||||
doxygen Doxyfile
|
||||
|
||||
#########################################################################
|
||||
|
||||
.cpp.o:
|
||||
$(CXX) -c $(CXXFLAGS) -o $@ $<
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) -o $@ $<
|
||||
|
||||
.f.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
##################################################################################################
|
||||
# some tools
|
||||
# Cache behaviour (CXXFLAGS += -g tracks down to source lines)
|
||||
# https://software.intel.com/content/www/us/en/develop/documentation/vtune-help/top/analyze-performance/microarchitecture-analysis-group/memory-access-analysis.html
|
||||
|
||||
mem: inspector
|
||||
prof: vtune
|
||||
cache: inspector
|
||||
|
||||
gap_par_report:
|
||||
${CXX} -c -guide -parallel $(SOURCES) 2> gap.txt
|
||||
|
||||
# GUI for performance report
|
||||
amplifier: ${PROGRAM}
|
||||
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
|
||||
echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid
|
||||
amplxe-gui &
|
||||
|
||||
# GUI for Memory and Thread analyzer (race condition)
|
||||
inspector: ${PROGRAM}
|
||||
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
|
||||
# inspxe-gui &
|
||||
vtune-gui ./${PROGRAM} &
|
||||
|
||||
advisor:
|
||||
source /opt/intel/oneapi/advisor/2021.2.0/advixe-vars.sh
|
||||
# /opt/intel/oneapi/advisor/latest/bin64/advixe-gui &
|
||||
advisor --collect=survey ./${PROGRAM}
|
||||
# advisor --collect=roofline ./${PROGRAM}
|
||||
advisor --report=survey --project-dir=./ src:r=./ --format=csv --report-output=./out/survey.csv
|
||||
|
||||
vtune:
|
||||
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
|
||||
# https://software.intel.com/en-us/articles/intel-advisor-2017-update-1-what-s-new
|
||||
export ADVIXE_EXPERIMENTAL=roofline
|
||||
vtune -collect hotspots ./${PROGRAM}
|
||||
vtune -report hotspots -r r000hs > vtune.out
|
||||
# vtune-gui ./${PROGRAM} &
|
||||
|
||||
icc-info:
|
||||
icpc -# main.cpp
|
||||
|
||||
# MKL on AMD
|
||||
# https://www.computerbase.de/2019-11/mkl-workaround-erhoeht-leistung-auf-amd-ryzen/
|
||||
#
|
||||
# https://sites.google.com/a/uci.edu/mingru-yang/programming/mkl-has-bad-performance-on-an-amd-cpu
|
||||
# export MKL_DEBUG_CPU_TYPE=5
|
||||
# export MKL_NUM_THRAEDS=1
|
||||
# export MKL_DYNAMIC=false
|
||||
# on Intel compiler
|
||||
# http://publicclu2.blogspot.com/2013/05/intel-complier-suite-reference-card.html
|
||||
93
ex1/PGI_default.mk
Normal file
93
ex1/PGI_default.mk
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
# Basic Defintions for using PGI-compiler suite sequentially
|
||||
# requires setting of COMPILER=PGI_
|
||||
# OPTIRUN = optirun
|
||||
|
||||
|
||||
CC = pgcc
|
||||
CXX = pgc++
|
||||
F77 = pgfortran
|
||||
LINKER = ${CXX}
|
||||
|
||||
# on mephisto:
|
||||
#CXXFLAGS += -I/share/apps/atlas/include
|
||||
#LINKFLAGS += -L/share/apps/atlas/lib
|
||||
#LINKFLAGS += -lcblas -latlas
|
||||
|
||||
#LINKFLAGS += -lblas
|
||||
# Der <cblas.h> Header muss mit extern "C" versehen werden, damit g++ alles findet.
|
||||
|
||||
WARNINGS = -Minform=warn
|
||||
# -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -W -Wfloat-equal -Wshadow -Wredundant-decls
|
||||
# -pedantic -Wunreachable-code -Wextra -Winline
|
||||
# -Wunreachable-code
|
||||
|
||||
#PGI_PROFILING = -Minfo=ccff,loop,vect,opt,intensity,mp,accel
|
||||
PGI_PROFILING = -Minfo=ccff,accel,ipa,loop,lre,mp,opt,par,unified,vect,intensity
|
||||
# -Minfo
|
||||
# -Mprof=time
|
||||
# -Mprof=lines
|
||||
# take care with option -Msafeptr
|
||||
CXXFLAGS += -O3 -std=c++17 ${WARNINGS}
|
||||
#CXXFLAGS += -O3 -std=c++11 -DNDEBUG ${PGI_PROFILING} ${WARNINGS}
|
||||
# -fastsse -fargument-noalias ${WARNINGS} -msse3 -vec-report=3
|
||||
|
||||
default: ${PROGRAM}
|
||||
|
||||
${PROGRAM}: ${OBJECTS}
|
||||
$(LINKER) $^ ${LINKFLAGS} -o $@
|
||||
|
||||
clean:
|
||||
@rm -f ${PROGRAM} ${OBJECTS}
|
||||
|
||||
clean_all:: clean
|
||||
@rm -f *_ *~ *.bak *.log *.out *.tar
|
||||
|
||||
run: clean ${PROGRAM}
|
||||
./${PROGRAM}
|
||||
|
||||
# tar the current directory
|
||||
MY_DIR = `basename ${PWD}`
|
||||
tar: clean_all
|
||||
@echo "Tar the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
# tar cf `basename ${PWD}`.tar *
|
||||
|
||||
doc:
|
||||
doxygen Doxyfile
|
||||
|
||||
#########################################################################
|
||||
|
||||
.cpp.o:
|
||||
$(CXX) -c $(CXXFLAGS) -o $@ $<
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) -o $@ $<
|
||||
|
||||
.f.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
##################################################################################################
|
||||
# # some tools
|
||||
# # Simple run time profiling of your code
|
||||
# # CXXFLAGS += -g -pg
|
||||
# # LINKFLAGS += -pg
|
||||
|
||||
|
||||
# Profiling options PGI, see: pgcollect -help
|
||||
# CPU_PROF = -allcache
|
||||
CPU_PROF = -time
|
||||
# GPU_PROF = -cuda=gmem,branch,cc13 -cudainit
|
||||
#GPU_PROF = -cuda=branch:cc20
|
||||
#
|
||||
PROF_FILE = pgprof.out
|
||||
|
||||
cache: prof
|
||||
|
||||
prof: ${PROGRAM}
|
||||
${OPTIRUN} ${BINDIR}pgcollect $(CPU_PROF) ./$^
|
||||
${OPTIRUN} ${BINDIR}pgprof -exe ./$^ $(PROF_FILE) &
|
||||
|
||||
info:
|
||||
pgaccelinfo -v
|
||||
28
ex1/ex1A_mean_values/.vscode/tasks.json
vendored
Normal file
28
ex1/ex1A_mean_values/.vscode/tasks.json
vendored
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
{
|
||||
"tasks": [
|
||||
{
|
||||
"type": "cppbuild",
|
||||
"label": "C/C++: gcc build active file",
|
||||
"command": "/usr/bin/gcc",
|
||||
"args": [
|
||||
"-fdiagnostics-color=always",
|
||||
"-g",
|
||||
"${file}",
|
||||
"-o",
|
||||
"${fileDirname}/${fileBasenameNoExtension}"
|
||||
],
|
||||
"options": {
|
||||
"cwd": "${fileDirname}"
|
||||
},
|
||||
"problemMatcher": [
|
||||
"$gcc"
|
||||
],
|
||||
"group": {
|
||||
"kind": "build",
|
||||
"isDefault": true
|
||||
},
|
||||
"detail": "Task generated by Debugger."
|
||||
}
|
||||
],
|
||||
"version": "2.0.0"
|
||||
}
|
||||
30
ex1/ex1A_mean_values/Makefile
Normal file
30
ex1/ex1A_mean_values/Makefile
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
#
|
||||
# use GNU-Compiler tools
|
||||
COMPILER=GCC_
|
||||
# alternatively from the shell
|
||||
# export COMPILER=GCC_
|
||||
# or, alternatively from the shell
|
||||
# make COMPILER=GCC_
|
||||
|
||||
# use Intel compilers
|
||||
#COMPILER=ICC_
|
||||
|
||||
# use PGI compilers
|
||||
# COMPILER=PGI_
|
||||
|
||||
|
||||
SOURCES = main.cpp ../ex1A_mean_values/means.cpp
|
||||
OBJECTS = $(SOURCES:.cpp=.o)
|
||||
|
||||
PROGRAM = main.${COMPILER}
|
||||
|
||||
# uncomment the next to lines for debugging and detailed performance analysis
|
||||
CXXFLAGS += -g
|
||||
LINKFLAGS += -g
|
||||
# do not use -pg with PGI compilers
|
||||
|
||||
ifndef COMPILER
|
||||
COMPILER=GCC_
|
||||
endif
|
||||
|
||||
include ../${COMPILER}default.mk
|
||||
BIN
ex1/ex1A_mean_values/main.GCC_
Executable file
BIN
ex1/ex1A_mean_values/main.GCC_
Executable file
Binary file not shown.
35
ex1/ex1A_mean_values/main.cpp
Normal file
35
ex1/ex1A_mean_values/main.cpp
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
#include "means.h"
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
double arithmetic_mean, geometric_mean, harmonic_mean;
|
||||
|
||||
// Fixed version
|
||||
calculate_means(1, 4, 16, arithmetic_mean, geometric_mean, harmonic_mean);
|
||||
cout << arithmetic_mean << ", " << geometric_mean << ", " << harmonic_mean << endl;
|
||||
|
||||
calculate_means(2, 3, 5, arithmetic_mean, geometric_mean, harmonic_mean);
|
||||
cout << arithmetic_mean << ", " << geometric_mean << ", " << harmonic_mean << endl;
|
||||
|
||||
calculate_means(1000, 4000, 16000, arithmetic_mean, geometric_mean, harmonic_mean);
|
||||
cout << arithmetic_mean << ", " << geometric_mean << ", " << harmonic_mean << endl;
|
||||
cout << "--------------------------------" << endl;
|
||||
|
||||
|
||||
|
||||
// Scalable version
|
||||
calculate_means(vector<int> {1, 4, 16}, arithmetic_mean, geometric_mean, harmonic_mean);
|
||||
cout << arithmetic_mean << ", " << geometric_mean << ", " << harmonic_mean << endl;
|
||||
|
||||
calculate_means(vector<int> {2, 3, 5}, arithmetic_mean, geometric_mean, harmonic_mean);
|
||||
cout << arithmetic_mean << ", " << geometric_mean << ", " << harmonic_mean << endl;
|
||||
|
||||
calculate_means(vector<int> {1000, 4000, 16000}, arithmetic_mean, geometric_mean, harmonic_mean);
|
||||
cout << arithmetic_mean << ", " << geometric_mean << ", " << harmonic_mean << endl;
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
BIN
ex1/ex1A_mean_values/main.o
Normal file
BIN
ex1/ex1A_mean_values/main.o
Normal file
Binary file not shown.
30
ex1/ex1A_mean_values/means.cpp
Normal file
30
ex1/ex1A_mean_values/means.cpp
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
#include "../ex1A_mean_values/means.h"
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
|
||||
void calculate_means(int a, int b, int c, double &am, double &gm, double &hm)
|
||||
{
|
||||
am = (a + b + c)/3.0;
|
||||
gm = exp((log(a)+log(b)+log(c))/3);
|
||||
hm = 3.0/(1.0/a + 1.0/b + 1.0/c);
|
||||
}
|
||||
|
||||
void calculate_means(std::vector<int> numbers, double &am, double &gm, double &hm)
|
||||
{
|
||||
int n = numbers.size();
|
||||
|
||||
am = 0.;
|
||||
gm = 0.;
|
||||
hm = 0.;
|
||||
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
am += numbers[i];
|
||||
gm += log(numbers[i]);
|
||||
hm += 1.0/numbers[i];
|
||||
}
|
||||
|
||||
am /= n;
|
||||
gm = exp(gm/n);
|
||||
hm = n/hm;
|
||||
}
|
||||
23
ex1/ex1A_mean_values/means.h
Normal file
23
ex1/ex1A_mean_values/means.h
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
#pragma once
|
||||
#include <vector>
|
||||
|
||||
/**
|
||||
This function calculates arithmetic mean, geometric mean and harmonic mean of three integers.
|
||||
@param[in] a first integer
|
||||
@param[in] b second integer
|
||||
@param[in] c third integer
|
||||
@param[out] am arithmetic mean
|
||||
@param[out] gm geometric mean
|
||||
@param[out] hm harmonic mean
|
||||
*/
|
||||
void calculate_means(int a, int b, int c, double &am, double &gm, double &hm);
|
||||
|
||||
/**
|
||||
This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector.
|
||||
@param[in] numbers vector containing integers
|
||||
@param[out] am arithmetic mean
|
||||
@param[out] gm geometric mean
|
||||
@param[out] hm harmonic mean
|
||||
*/
|
||||
void calculate_means(std::vector<int> numbers, double &am, double &gm, double &hm);
|
||||
|
||||
BIN
ex1/ex1A_mean_values/means.o
Normal file
BIN
ex1/ex1A_mean_values/means.o
Normal file
Binary file not shown.
30
ex1/ex1B_data-IO_and_vectors/Makefile
Normal file
30
ex1/ex1B_data-IO_and_vectors/Makefile
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
#
|
||||
# use GNU-Compiler tools
|
||||
COMPILER=GCC_
|
||||
# alternatively from the shell
|
||||
# export COMPILER=GCC_
|
||||
# or, alternatively from the shell
|
||||
# make COMPILER=GCC_
|
||||
|
||||
# use Intel compilers
|
||||
#COMPILER=ICC_
|
||||
|
||||
# use PGI compilers
|
||||
# COMPILER=PGI_
|
||||
|
||||
|
||||
SOURCES = main.cpp ../ex1A_mean_values/means.cpp
|
||||
OBJECTS = $(SOURCES:.cpp=.o)
|
||||
|
||||
PROGRAM = main.${COMPILER}
|
||||
|
||||
# uncomment the next to lines for debugging and detailed performance analysis
|
||||
CXXFLAGS += -g
|
||||
LINKFLAGS += -g
|
||||
# do not use -pg with PGI compilers
|
||||
|
||||
ifndef COMPILER
|
||||
COMPILER=GCC_
|
||||
endif
|
||||
|
||||
include ../${COMPILER}default.mk
|
||||
501
ex1/ex1B_data-IO_and_vectors/data_1.txt
Normal file
501
ex1/ex1B_data-IO_and_vectors/data_1.txt
Normal file
|
|
@ -0,0 +1,501 @@
|
|||
141
|
||||
261
|
||||
87
|
||||
430
|
||||
258
|
||||
298
|
||||
425
|
||||
120
|
||||
496
|
||||
707
|
||||
244
|
||||
786
|
||||
75
|
||||
394
|
||||
4
|
||||
221
|
||||
2
|
||||
190
|
||||
143
|
||||
269
|
||||
175
|
||||
139
|
||||
599
|
||||
902
|
||||
940
|
||||
222
|
||||
483
|
||||
377
|
||||
524
|
||||
265
|
||||
69
|
||||
437
|
||||
174
|
||||
27
|
||||
955
|
||||
431
|
||||
962
|
||||
763
|
||||
8
|
||||
681
|
||||
706
|
||||
646
|
||||
553
|
||||
219
|
||||
773
|
||||
229
|
||||
371
|
||||
891
|
||||
857
|
||||
403
|
||||
319
|
||||
609
|
||||
911
|
||||
910
|
||||
592
|
||||
333
|
||||
854
|
||||
443
|
||||
905
|
||||
34
|
||||
533
|
||||
717
|
||||
180
|
||||
337
|
||||
188
|
||||
322
|
||||
404
|
||||
549
|
||||
49
|
||||
553
|
||||
275
|
||||
242
|
||||
244
|
||||
155
|
||||
957
|
||||
936
|
||||
819
|
||||
729
|
||||
176
|
||||
361
|
||||
189
|
||||
2
|
||||
317
|
||||
700
|
||||
626
|
||||
544
|
||||
440
|
||||
288
|
||||
502
|
||||
762
|
||||
763
|
||||
577
|
||||
748
|
||||
646
|
||||
124
|
||||
505
|
||||
348
|
||||
93
|
||||
148
|
||||
199
|
||||
673
|
||||
432
|
||||
695
|
||||
257
|
||||
10
|
||||
533
|
||||
280
|
||||
947
|
||||
907
|
||||
393
|
||||
25
|
||||
672
|
||||
838
|
||||
972
|
||||
57
|
||||
451
|
||||
583
|
||||
687
|
||||
720
|
||||
651
|
||||
727
|
||||
374
|
||||
582
|
||||
117
|
||||
58
|
||||
980
|
||||
285
|
||||
595
|
||||
963
|
||||
186
|
||||
194
|
||||
342
|
||||
933
|
||||
391
|
||||
274
|
||||
152
|
||||
398
|
||||
375
|
||||
132
|
||||
436
|
||||
92
|
||||
615
|
||||
11
|
||||
574
|
||||
790
|
||||
236
|
||||
449
|
||||
570
|
||||
62
|
||||
497
|
||||
643
|
||||
222
|
||||
838
|
||||
972
|
||||
847
|
||||
506
|
||||
279
|
||||
747
|
||||
237
|
||||
958
|
||||
621
|
||||
601
|
||||
173
|
||||
91
|
||||
256
|
||||
859
|
||||
912
|
||||
700
|
||||
726
|
||||
230
|
||||
577
|
||||
811
|
||||
404
|
||||
989
|
||||
90
|
||||
321
|
||||
512
|
||||
61
|
||||
726
|
||||
557
|
||||
530
|
||||
830
|
||||
859
|
||||
790
|
||||
318
|
||||
453
|
||||
753
|
||||
110
|
||||
110
|
||||
270
|
||||
525
|
||||
973
|
||||
711
|
||||
312
|
||||
292
|
||||
851
|
||||
912
|
||||
640
|
||||
256
|
||||
89
|
||||
839
|
||||
585
|
||||
949
|
||||
62
|
||||
585
|
||||
286
|
||||
828
|
||||
191
|
||||
443
|
||||
394
|
||||
827
|
||||
677
|
||||
208
|
||||
319
|
||||
134
|
||||
672
|
||||
571
|
||||
170
|
||||
148
|
||||
477
|
||||
909
|
||||
553
|
||||
33
|
||||
54
|
||||
806
|
||||
452
|
||||
383
|
||||
790
|
||||
365
|
||||
533
|
||||
712
|
||||
872
|
||||
329
|
||||
651
|
||||
975
|
||||
76
|
||||
588
|
||||
414
|
||||
310
|
||||
264
|
||||
759
|
||||
996
|
||||
187
|
||||
782
|
||||
196
|
||||
993
|
||||
803
|
||||
425
|
||||
729
|
||||
499
|
||||
809
|
||||
357
|
||||
74
|
||||
591
|
||||
911
|
||||
194
|
||||
433
|
||||
750
|
||||
40
|
||||
947
|
||||
764
|
||||
559
|
||||
184
|
||||
498
|
||||
518
|
||||
995
|
||||
855
|
||||
963
|
||||
679
|
||||
404
|
||||
935
|
||||
480
|
||||
232
|
||||
397
|
||||
706
|
||||
559
|
||||
757
|
||||
996
|
||||
963
|
||||
536
|
||||
964
|
||||
116
|
||||
52
|
||||
305
|
||||
581
|
||||
531
|
||||
902
|
||||
541
|
||||
432
|
||||
543
|
||||
713
|
||||
17
|
||||
801
|
||||
143
|
||||
479
|
||||
257
|
||||
370
|
||||
662
|
||||
170
|
||||
279
|
||||
199
|
||||
196
|
||||
327
|
||||
881
|
||||
472
|
||||
404
|
||||
180
|
||||
969
|
||||
408
|
||||
845
|
||||
616
|
||||
377
|
||||
878
|
||||
785
|
||||
465
|
||||
814
|
||||
899
|
||||
430
|
||||
335
|
||||
597
|
||||
902
|
||||
703
|
||||
378
|
||||
735
|
||||
955
|
||||
543
|
||||
541
|
||||
312
|
||||
72
|
||||
182
|
||||
93
|
||||
464
|
||||
10
|
||||
916
|
||||
643
|
||||
2
|
||||
31
|
||||
209
|
||||
455
|
||||
128
|
||||
9
|
||||
728
|
||||
355
|
||||
781
|
||||
437
|
||||
437
|
||||
50
|
||||
50
|
||||
92
|
||||
595
|
||||
242
|
||||
842
|
||||
858
|
||||
964
|
||||
489
|
||||
221
|
||||
227
|
||||
537
|
||||
763
|
||||
348
|
||||
462
|
||||
640
|
||||
918
|
||||
162
|
||||
716
|
||||
578
|
||||
434
|
||||
885
|
||||
394
|
||||
179
|
||||
634
|
||||
625
|
||||
328
|
||||
803
|
||||
1000
|
||||
981
|
||||
128
|
||||
233
|
||||
24
|
||||
608
|
||||
111
|
||||
408
|
||||
885
|
||||
549
|
||||
370
|
||||
209
|
||||
441
|
||||
957
|
||||
125
|
||||
471
|
||||
857
|
||||
44
|
||||
692
|
||||
979
|
||||
284
|
||||
134
|
||||
686
|
||||
910
|
||||
611
|
||||
900
|
||||
194
|
||||
755
|
||||
347
|
||||
419
|
||||
156
|
||||
820
|
||||
625
|
||||
739
|
||||
806
|
||||
68
|
||||
951
|
||||
498
|
||||
756
|
||||
743
|
||||
832
|
||||
157
|
||||
458
|
||||
619
|
||||
933
|
||||
836
|
||||
896
|
||||
583
|
||||
583
|
||||
855
|
||||
35
|
||||
886
|
||||
408
|
||||
37
|
||||
747
|
||||
155
|
||||
144
|
||||
606
|
||||
255
|
||||
325
|
||||
402
|
||||
407
|
||||
387
|
||||
610
|
||||
167
|
||||
189
|
||||
95
|
||||
324
|
||||
770
|
||||
235
|
||||
741
|
||||
693
|
||||
825
|
||||
828
|
||||
294
|
||||
310
|
||||
524
|
||||
326
|
||||
832
|
||||
811
|
||||
557
|
||||
263
|
||||
681
|
||||
234
|
||||
457
|
||||
385
|
||||
539
|
||||
992
|
||||
756
|
||||
981
|
||||
235
|
||||
529
|
||||
52
|
||||
757
|
||||
602
|
||||
858
|
||||
989
|
||||
930
|
||||
410
|
||||
1
|
||||
541
|
||||
208
|
||||
220
|
||||
326
|
||||
96
|
||||
748
|
||||
749
|
||||
544
|
||||
339
|
||||
833
|
||||
553
|
||||
958
|
||||
893
|
||||
357
|
||||
547
|
||||
347
|
||||
623
|
||||
797
|
||||
746
|
||||
126
|
||||
823
|
||||
26
|
||||
415
|
||||
732
|
||||
782
|
||||
368
|
||||
|
||||
BIN
ex1/ex1B_data-IO_and_vectors/main.GCC_
Executable file
BIN
ex1/ex1B_data-IO_and_vectors/main.GCC_
Executable file
Binary file not shown.
53
ex1/ex1B_data-IO_and_vectors/main.cpp
Normal file
53
ex1/ex1B_data-IO_and_vectors/main.cpp
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
#include "../ex1A_mean_values/means.h"
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
using namespace std;
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
// read vector from file
|
||||
vector<int> data_vector = {};
|
||||
|
||||
ifstream input_stream("data_1.txt");
|
||||
|
||||
int line;
|
||||
while(input_stream >> line)
|
||||
{
|
||||
data_vector.push_back(line);
|
||||
}
|
||||
data_vector.shrink_to_fit();
|
||||
|
||||
|
||||
// calculate minimum and maximum
|
||||
vector<int>::iterator min_it = min_element(data_vector.begin(), data_vector.end());
|
||||
vector<int>::iterator max_it = max_element(data_vector.begin(), data_vector.end());
|
||||
|
||||
// calculate arithmetic mean, geometric mean and harmonic mean
|
||||
double am, gm, hm;
|
||||
calculate_means(data_vector, am, gm, hm);
|
||||
|
||||
|
||||
// calculate standard deviation
|
||||
double sd = 0.;
|
||||
int n = data_vector.size();
|
||||
for (int i = 0; i < n; ++i)
|
||||
{
|
||||
sd += pow(data_vector[i] - am, 2);
|
||||
}
|
||||
sd = sqrt(sd/n);
|
||||
|
||||
|
||||
// print results
|
||||
cout << "minimum: " << *min_it << endl;
|
||||
cout << "maximum: " << *max_it << endl;
|
||||
cout << "arithmetic mean: " << am << endl;
|
||||
cout << "geometric mean: " << gm << endl;
|
||||
cout << "harmonic mean: " << hm << endl;
|
||||
cout << "standard deviation: " << sd << endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
BIN
ex1/ex1B_data-IO_and_vectors/main.o
Normal file
BIN
ex1/ex1B_data-IO_and_vectors/main.o
Normal file
Binary file not shown.
31
ex1/ex1C_summation_of_specified_numbers/main.cpp
Normal file
31
ex1/ex1C_summation_of_specified_numbers/main.cpp
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
#include "special_sum.h"
|
||||
#include "../utils/timing.h"
|
||||
#include <iostream>
|
||||
#include <chrono>
|
||||
#include <stack>
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
// check results and compare speeds
|
||||
for(size_t n : {15, 1001, 1432987})
|
||||
{
|
||||
cout << "n = " << n << endl;
|
||||
size_t sum_1, sum_2;
|
||||
|
||||
tic();
|
||||
for(size_t i = 0; i < 1000; ++i)
|
||||
sum_1 = special_sum_loop(n);
|
||||
double time_1 = toc();
|
||||
|
||||
tic();
|
||||
for(size_t i = 0; i < 1000; ++i)
|
||||
sum_2 = special_sum_noloop(n);
|
||||
double time_2 = toc();
|
||||
|
||||
cout << "loop: " << sum_1 << "\t\tDuration: " << time_1 << endl;
|
||||
cout << "no loop: " << sum_2 << "\t\tDuration: " << time_2 << endl << "---------------------------------------------------" << endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
28
ex1/ex1C_summation_of_specified_numbers/special_sum.cpp
Normal file
28
ex1/ex1C_summation_of_specified_numbers/special_sum.cpp
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
#include "special_sum.h"
|
||||
|
||||
size_t gauss_sum(size_t n)
|
||||
{
|
||||
return (n*(n+1))/2;
|
||||
}
|
||||
|
||||
size_t special_sum_loop(size_t n)
|
||||
{
|
||||
size_t sum = 0;
|
||||
for (size_t i = 1; i < n+1; ++i)
|
||||
{
|
||||
if (i % 3 == 0 || i % 5 == 0)
|
||||
{
|
||||
sum += i;
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
size_t special_sum_noloop(size_t n)
|
||||
{
|
||||
size_t factor_3 = gauss_sum(n/3); // dividing int by int automatically gets rounded off
|
||||
size_t factor_5 = gauss_sum(n/5);
|
||||
size_t factor_15 = gauss_sum(n/15);
|
||||
|
||||
return factor_3*3 + factor_5*5 - factor_15*15;
|
||||
}
|
||||
18
ex1/ex1C_summation_of_specified_numbers/special_sum.h
Normal file
18
ex1/ex1C_summation_of_specified_numbers/special_sum.h
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
#include <cstddef>
|
||||
|
||||
/**
|
||||
This function returns the sum of all positive integers less or equal n which are a multiples of 3 or of 5, WITH using a loop.
|
||||
@param[in] n
|
||||
@param[out] M
|
||||
*/
|
||||
size_t special_sum_loop(size_t n);
|
||||
|
||||
|
||||
/**
|
||||
This function returns the sum of all positive integers less or equal n which are a multiples of 3 or of 5, WITHOUT using a loop.
|
||||
Example: For n=15, we have 60 = 3+5+6+9+10+12+15 = (1+2+3+4+5)*3 + (1+2+3)*5 - 1*15
|
||||
Formula: M = (\sum_{i=1}^{k_3} i)*3 + (\sum_{i=1}^{k_5} i)*5 - (\sum_{i=1}^{k_15} i)*15
|
||||
@param[in] n
|
||||
@param[out] M
|
||||
*/
|
||||
size_t special_sum_noloop(size_t n);
|
||||
BIN
ex1/ex1D_kahan_summation/main
Executable file
BIN
ex1/ex1D_kahan_summation/main
Executable file
Binary file not shown.
33
ex1/ex1D_kahan_summation/main.cpp
Normal file
33
ex1/ex1D_kahan_summation/main.cpp
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
#include "mylib.h"
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
for(size_t i = 1; i < 8; ++i)
|
||||
{
|
||||
size_t n = pow(10,i);
|
||||
vector<double> x(n);
|
||||
for (size_t k = 0; k < n; ++k)
|
||||
x[k] = 1.0/(k + 1);
|
||||
|
||||
|
||||
// compute scalar products
|
||||
double sum_1 = scalar(x, x);
|
||||
double sum_2 = Kahan_skalar(x, x);
|
||||
|
||||
// compute error
|
||||
double err_1 = abs(sum_1 - pow(M_PI,2)/6);
|
||||
double err_2 = abs(sum_2 - pow(M_PI,2)/6);
|
||||
|
||||
cout << "n = " << n << endl;
|
||||
cout << "Normal scalar product: " << sum_1 << "\terror: " << err_1 << endl;
|
||||
cout << "Kahan scalar product: " << sum_2 << "\terror: " << err_2 << endl;
|
||||
cout << endl;
|
||||
}
|
||||
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
34
ex1/ex1D_kahan_summation/mylib.cpp
Normal file
34
ex1/ex1D_kahan_summation/mylib.cpp
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
#include "mylib.h"
|
||||
#include <cassert> // assert()
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
double scalar(vector<double> const &x, vector<double> const &y)
|
||||
{
|
||||
assert(x.size() == y.size());
|
||||
size_t const N = x.size();
|
||||
double sum = 0.0;
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
sum += x[i] * y[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
|
||||
double Kahan_skalar(vector<double> const &x, vector<double> const &y)
|
||||
{
|
||||
double sum = 0;
|
||||
double c = 0;
|
||||
size_t n = x.size();
|
||||
for (size_t i = 0; i < n; ++i)
|
||||
{
|
||||
double z = x[i]*y[i] - c; // c is the part that got lost in the last iteration
|
||||
double t = sum + z; // when adding sum + z, the lower digits are lost if sum is large
|
||||
c = (t - sum) - z; // now we recover the lower digits to add in the next iteration
|
||||
sum = t;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
30
ex1/ex1D_kahan_summation/mylib.h
Normal file
30
ex1/ex1D_kahan_summation/mylib.h
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
#ifndef FILE_MYLIB
|
||||
#define FILE_MYLIB
|
||||
#include <vector>
|
||||
|
||||
/** Inner product
|
||||
@param[in] x vector
|
||||
@param[in] y vector
|
||||
@return resulting Euclidian inner product <x,y>
|
||||
*/
|
||||
double scalar(std::vector<double> const &x, std::vector<double> const &y);
|
||||
|
||||
/** Inner product using BLAS routines
|
||||
@param[in] x vector
|
||||
@param[in] y vector
|
||||
@return resulting Euclidian inner product <x,y>
|
||||
*/
|
||||
double scalar_cblas(std::vector<double> const &x, std::vector<double> const &y);
|
||||
float scalar_cblas(std::vector<float> const &x, std::vector<float> const &y);
|
||||
|
||||
|
||||
/** L_2 Norm of a vector
|
||||
@param[in] x vector
|
||||
@return resulting Euclidian norm <x,y>
|
||||
*/
|
||||
double norm(std::vector<double> const &x);
|
||||
|
||||
double Kahan_skalar(std::vector<double> const &x, std::vector<double> const &y);
|
||||
|
||||
|
||||
#endif
|
||||
BIN
ex1/ex1E_vector_vs_list/main
Executable file
BIN
ex1/ex1E_vector_vs_list/main
Executable file
Binary file not shown.
63
ex1/ex1E_vector_vs_list/main.cpp
Normal file
63
ex1/ex1E_vector_vs_list/main.cpp
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
#include "../utils/timing.h"
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
#include <chrono>
|
||||
#include <vector>
|
||||
#include <list>
|
||||
#include <algorithm>
|
||||
using namespace std;
|
||||
|
||||
|
||||
size_t random_integer(int lower_bound, int upper_bound)
|
||||
{
|
||||
unsigned seed = chrono::system_clock::now().time_since_epoch().count();
|
||||
|
||||
minstd_rand0 generator (seed);
|
||||
|
||||
return lower_bound + generator() % (upper_bound - lower_bound + 1);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
// start with generating a sorted vector/list
|
||||
size_t n = 10000;
|
||||
vector<int> x_vec(n);
|
||||
list<int> x_list(n);
|
||||
for(size_t k = 0; k < n; ++k)
|
||||
{
|
||||
x_vec[k] = k + 1;
|
||||
x_list.push_back(k + 1);
|
||||
}
|
||||
|
||||
|
||||
// insert new random entries such that the container stays sorted
|
||||
tic();
|
||||
for(size_t i = 0; i < n; ++i)
|
||||
{
|
||||
size_t new_entry = random_integer(1,n);
|
||||
auto it = lower_bound(x_vec.begin(), x_vec.end(), new_entry);
|
||||
x_vec.insert(it, new_entry);
|
||||
}
|
||||
double time_1 = toc();
|
||||
|
||||
tic();
|
||||
for(size_t i = 0; i < n; ++i)
|
||||
{
|
||||
size_t new_entry = random_integer(1,n);
|
||||
auto it = lower_bound(x_list.begin(), x_list.end(), new_entry);
|
||||
x_list.insert(it, new_entry);
|
||||
}
|
||||
double time_2 = toc();
|
||||
|
||||
|
||||
// check results
|
||||
cout << "New vector is sorted: " << std::boolalpha << is_sorted(x_vec.cbegin(), x_vec.cend()) << "\tsize: " << x_vec.size() << "\tduration: " << time_1 << endl;
|
||||
cout << "New list is sorted: " << std::boolalpha << is_sorted(x_list.cbegin(), x_list.cend()) << "\tsize: " << x_list.size() << "\tduration: " << time_2 << endl;
|
||||
|
||||
// Vector stores 3 pointers
|
||||
// List stores two pointers for every element: one to the previous, one to the next element
|
||||
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
BIN
ex1/ex1F_goldbachs_conjecture/goldbach
Executable file
BIN
ex1/ex1F_goldbachs_conjecture/goldbach
Executable file
Binary file not shown.
42
ex1/ex1F_goldbachs_conjecture/goldbach.cpp
Normal file
42
ex1/ex1F_goldbachs_conjecture/goldbach.cpp
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
#include "goldbach.h"
|
||||
|
||||
size_t single_goldbach(size_t k)
|
||||
{
|
||||
const std::vector<size_t> relevant_primes = get_primes(k);
|
||||
size_t m = relevant_primes.size();
|
||||
|
||||
size_t counter = 0;
|
||||
|
||||
for(size_t i = 0; i < m; ++i)
|
||||
{
|
||||
for(size_t j = i; j < m; ++j)
|
||||
{
|
||||
if(relevant_primes[i] + relevant_primes[j] == k)
|
||||
++counter;
|
||||
}
|
||||
}
|
||||
|
||||
return counter;
|
||||
}
|
||||
|
||||
|
||||
std::vector<size_t> count_goldbach(size_t n)
|
||||
{
|
||||
const std::vector<size_t> relevant_primes = get_primes(n);
|
||||
size_t m = relevant_primes.size();
|
||||
|
||||
std::vector<size_t> counter_vector(n + 1, 0);
|
||||
|
||||
|
||||
for(size_t i = 0; i < m; ++i)
|
||||
{
|
||||
for(size_t j = i; j < m; ++j)
|
||||
{
|
||||
size_t sum = relevant_primes[i] + relevant_primes[j];
|
||||
if(sum <= n)
|
||||
++counter_vector[relevant_primes[i] + relevant_primes[j]];
|
||||
}
|
||||
}
|
||||
|
||||
return counter_vector;
|
||||
}
|
||||
21
ex1/ex1F_goldbachs_conjecture/goldbach.h
Normal file
21
ex1/ex1F_goldbachs_conjecture/goldbach.h
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
#pragma once
|
||||
#include "mayer_primes.h"
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <iterator>
|
||||
#include <cassert>
|
||||
|
||||
|
||||
/**
|
||||
This function returns the number of possible decompositions of an integer into a sum of two prime numbers.
|
||||
@param[in] k first integer
|
||||
@param[out] count number of decompositions
|
||||
*/
|
||||
size_t single_goldbach(size_t k);
|
||||
|
||||
/**
|
||||
This function returns the number of possible decompositions into a sum of two prime numbers of all even integers in the interval [4,n].
|
||||
@param[in] n upper integer bound
|
||||
@param[out] count_vector vector containing the number of decompositions for a natural number the corresponding index
|
||||
*/
|
||||
std::vector<size_t> count_goldbach(size_t n);
|
||||
BIN
ex1/ex1F_goldbachs_conjecture/main
Executable file
BIN
ex1/ex1F_goldbachs_conjecture/main
Executable file
Binary file not shown.
37
ex1/ex1F_goldbachs_conjecture/main.cpp
Normal file
37
ex1/ex1F_goldbachs_conjecture/main.cpp
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
#include "../utils/timing.h"
|
||||
#include "goldbach.h"
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
using namespace std;
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
cout << "Check: 694 has "<< single_goldbach(694) << " decompositions." << endl << "----------------------------------------" << endl;
|
||||
|
||||
|
||||
for(size_t n : {10000, 100000, 400000, 1000000, 2000000})
|
||||
{
|
||||
tic();
|
||||
|
||||
auto goldbach_vector = count_goldbach(n);
|
||||
|
||||
auto max_it = max_element(goldbach_vector.begin(), goldbach_vector.end());
|
||||
size_t max_number = distance(goldbach_vector.begin(), max_it);
|
||||
|
||||
double time = toc();
|
||||
|
||||
cout << "The number " << max_number << " has " << *max_it << " decompositions. Duration: " << time << endl;
|
||||
}
|
||||
|
||||
/*
|
||||
The number 9240 has 329 decompositions. Duration: 0.00572876
|
||||
The number 99330 has 2168 decompositions. Duration: 0.3342
|
||||
The number 390390 has 7094 decompositions. Duration: 4.23734
|
||||
The number 990990 has 15594 decompositions. Duration: 29.5817
|
||||
The number 1981980 has 27988 decompositions. Duration: 135.985
|
||||
*/
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
73
ex1/ex1F_goldbachs_conjecture/mayer_primes.h
Normal file
73
ex1/ex1F_goldbachs_conjecture/mayer_primes.h
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstring> //memset
|
||||
#include <vector>
|
||||
//using namespace std;
|
||||
|
||||
/** \brief Determines all prime numbers in interval [2, @p max].
|
||||
*
|
||||
* The sieve of Eratosthenes is used.
|
||||
*
|
||||
* The implementation originates from <a href="http://code.activestate.com/recipes/576559-fast-prime-generator/">Florian Mayer</a>.
|
||||
*
|
||||
* \param[in] max end of interval for the prime number search.
|
||||
* \return vector of prime numbers @f$2,3,5, ..., p<=max @f$.
|
||||
*
|
||||
* \copyright
|
||||
* Copyright (c) 2008 Florian Mayer (adapted by Gundolf Haase 2018)
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
template <class T>
|
||||
std::vector<T> get_primes(T max)
|
||||
{
|
||||
std::vector<T> primes;
|
||||
char *sieve;
|
||||
sieve = new char[max / 8 + 1];
|
||||
// Fill sieve with 1
|
||||
memset(sieve, 0xFF, (max / 8 + 1) * sizeof(char));
|
||||
for (T x = 2; x <= max; x++)
|
||||
{
|
||||
if (sieve[x / 8] & (0x01 << (x % 8))) {
|
||||
primes.push_back(x);
|
||||
// Is prime. Mark multiplicates.
|
||||
for (T j = 2 * x; j <= max; j += x)
|
||||
{
|
||||
sieve[j / 8] &= ~(0x01 << (j % 8));
|
||||
}
|
||||
}
|
||||
}
|
||||
delete[] sieve;
|
||||
return primes;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------
|
||||
//int main() // by Florian Mayer
|
||||
//{g++ -O3 -std=c++14 -fopenmp main.cpp && ./a.out
|
||||
// vector<unsigned long> primes;
|
||||
// primes = get_primes(10000000);
|
||||
// // return 0;
|
||||
// // Print out result.
|
||||
// vector<unsigned long>::iterator it;
|
||||
// for(it=primes.begin(); it < primes.end(); it++)
|
||||
// cout << *it << " ";
|
||||
//
|
||||
// cout << endl;
|
||||
// return 0;
|
||||
//}
|
||||
71
ex1/ex1G_dense_matrices_access/DenseMatrix.h
Normal file
71
ex1/ex1G_dense_matrices_access/DenseMatrix.h
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
#pragma once
|
||||
#include "sigmoid.h"
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
class DenseMatrix
|
||||
{
|
||||
private:
|
||||
vector<double> M;
|
||||
size_t n;
|
||||
size_t m;
|
||||
|
||||
|
||||
public:
|
||||
vector<double> Mult(const vector<double> &x) const
|
||||
{
|
||||
vector<double> y(n,0);
|
||||
for(size_t i = 0; i < n; ++i) // iterate row
|
||||
{
|
||||
for(size_t j = 0; j < m; ++j) // iterate column
|
||||
{
|
||||
y[i] += M[i*m + j]*x[j];
|
||||
}
|
||||
}
|
||||
return y;
|
||||
}
|
||||
|
||||
vector<double> MultT(const vector<double> &y) const
|
||||
{
|
||||
vector<double> x(m,0);
|
||||
for(size_t j = 0; j < m; ++j) // iterate column
|
||||
{
|
||||
for(size_t i = 0; i < n; ++i) // iterate row
|
||||
{
|
||||
x[j] += M[i*m + j]*y[i];
|
||||
}
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
void Print() const
|
||||
{
|
||||
for(size_t i = 0; i < n; ++i) // iterate row
|
||||
{
|
||||
for(size_t j = 0; j < m; ++j) // iterate column
|
||||
{
|
||||
cout << M[i*m + j] << " ";
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
|
||||
|
||||
DenseMatrix(size_t n, size_t m)
|
||||
{
|
||||
this->n = n;
|
||||
this->m = m;
|
||||
M = vector<double>(n*m);
|
||||
size_t nm = max(n,m);
|
||||
|
||||
for(size_t i = 0; i < n; ++i) // iterate row
|
||||
{
|
||||
for(size_t j = 0; j < m; ++j) // iterate column
|
||||
{
|
||||
M[i*m + j] = sigmoid(x_entry(i,nm))*sigmoid(x_entry(j,nm));
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
52
ex1/ex1G_dense_matrices_access/ProductMatrix.h
Normal file
52
ex1/ex1G_dense_matrices_access/ProductMatrix.h
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
#pragma once
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
class ProductMatrix
|
||||
{
|
||||
private:
|
||||
vector<double> u;
|
||||
vector<double> v;
|
||||
size_t n;
|
||||
size_t m;
|
||||
|
||||
public:
|
||||
vector<double> Mult(const vector<double> &x) const
|
||||
{
|
||||
vector<double> y(n,0);
|
||||
for(int i = 0; i < n; ++i)
|
||||
{
|
||||
for(int j = 0; j < m; ++j)
|
||||
{
|
||||
y[i] += v[j]*x[j];
|
||||
}
|
||||
y[i] *= u[i];
|
||||
}
|
||||
return y;
|
||||
}
|
||||
|
||||
vector<double> MultT(const vector<double> &y) const
|
||||
{
|
||||
vector<double> x(m,0);
|
||||
for(int j = 0; j < m; ++j)
|
||||
{
|
||||
for(int i = 0; i < n; ++i)
|
||||
{
|
||||
x[j] += y[i]*u[i];
|
||||
}
|
||||
x[j] *= v[j];
|
||||
}
|
||||
return x;
|
||||
}
|
||||
|
||||
ProductMatrix(const vector<double> &u, const vector<double> &v)
|
||||
{
|
||||
n = u.size();
|
||||
m = v.size();
|
||||
this->u = u;
|
||||
this->v = v;
|
||||
|
||||
}
|
||||
};
|
||||
BIN
ex1/ex1G_dense_matrices_access/main
Executable file
BIN
ex1/ex1G_dense_matrices_access/main
Executable file
Binary file not shown.
93
ex1/ex1G_dense_matrices_access/main.cpp
Normal file
93
ex1/ex1G_dense_matrices_access/main.cpp
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
#include "../utils/timing.h"
|
||||
#include "DenseMatrix.h"
|
||||
#include "ProductMatrix.h"
|
||||
#include <algorithm>
|
||||
|
||||
int main()
|
||||
{
|
||||
// b) ------------------------------------------------------------------------------------------------------
|
||||
DenseMatrix const M(5,3);
|
||||
vector<double> const u{{1, 2, 3}};
|
||||
vector<double> f1 = M.Mult(u);
|
||||
vector<double> const v{{-1, 2, -3, 4, -5}};
|
||||
vector<double> f2 = M.MultT(v);
|
||||
|
||||
M.Print();
|
||||
|
||||
for(size_t i = 0; i < f1.size(); ++i)
|
||||
cout << f1[i] << endl;
|
||||
cout << endl;
|
||||
|
||||
|
||||
for(size_t j = 0; j < f2.size(); ++j)
|
||||
cout << f2[j] << " ";
|
||||
cout << endl << "-------------------------------------------------" << endl;
|
||||
|
||||
// c) ------------------------------------------------------------------------------------------------------
|
||||
size_t n = pow(10,3);
|
||||
DenseMatrix const M_1(n,n);
|
||||
vector<double> x(n, 1.0);
|
||||
|
||||
size_t n_loops = 100;
|
||||
vector<double> y_1;
|
||||
vector<double> y_2;
|
||||
|
||||
double time_1 = 0;
|
||||
double time_2 = 0;
|
||||
|
||||
tic();
|
||||
for(int l = 0; l < n_loops; ++l)
|
||||
y_1 = M_1.Mult(x);
|
||||
time_1 += toc();
|
||||
|
||||
tic();
|
||||
for(int l = 0; l < n_loops; ++l)
|
||||
y_2 = M_1.MultT(x);
|
||||
time_2 += toc();
|
||||
|
||||
vector<double> error_vec(n,0);
|
||||
for(int i = 0; i < n; ++i)
|
||||
error_vec[i] = abs(y_1[i] - y_2[i]);
|
||||
double sup_error = *max_element(error_vec.begin(), error_vec.end());
|
||||
|
||||
|
||||
cout << "n = " << n << endl;
|
||||
cout << "Average duration for Mult: " << time_1/n_loops << endl;
|
||||
cout << "Average duration for MultT: " << time_2/n_loops << endl;
|
||||
cout << "sup-error: " << sup_error << endl;
|
||||
cout << "-------------------------------------------------" << endl;
|
||||
|
||||
// d) ------------------------------------------------------------------------------------------------------
|
||||
vector<double> u_M(n,0);
|
||||
for(int i = 0; i < n; ++i)
|
||||
u_M[i] = sigmoid(x_entry(i, n));
|
||||
|
||||
ProductMatrix const M_2(u_M, u_M);
|
||||
|
||||
time_1 = 0;
|
||||
time_2 = 0;
|
||||
|
||||
tic();
|
||||
for(int l = 0; l < n_loops; ++l)
|
||||
y_1 = M_2.Mult(x);
|
||||
time_1 += toc();
|
||||
|
||||
tic();
|
||||
for(int l = 0; l < n_loops; ++l)
|
||||
y_2 = M_2.MultT(x);
|
||||
time_2 += toc();
|
||||
|
||||
for(int i = 0; i < n; ++i)
|
||||
error_vec[i] = abs(y_1[i] - y_2[i]);
|
||||
sup_error = *max_element(error_vec.begin(), error_vec.end());
|
||||
|
||||
|
||||
cout << "n = " << n << endl;
|
||||
cout << "Average duration for Mult: " << time_1/n_loops << endl;
|
||||
cout << "Average duration for MultT: " << time_2/n_loops << endl;
|
||||
cout << "sup-error: " << sup_error << endl;
|
||||
cout << "-------------------------------------------------" << endl;
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
12
ex1/ex1G_dense_matrices_access/sigmoid.h
Normal file
12
ex1/ex1G_dense_matrices_access/sigmoid.h
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
#pragma once
|
||||
#include <cmath>
|
||||
|
||||
double sigmoid(double x)
|
||||
{
|
||||
return 1./(1. + exp(-x));
|
||||
}
|
||||
|
||||
double x_entry(size_t k, size_t nm)
|
||||
{
|
||||
return (10.*k)/(nm - 1) - 5.;
|
||||
}
|
||||
18
ex2/ex2_C_main.py
Normal file
18
ex2/ex2_C_main.py
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
def u(x,p):
|
||||
return (np.exp(p*x) - 1)/(np.exp(p) - 1)
|
||||
|
||||
x = np.linspace(0, 1, 200)
|
||||
|
||||
|
||||
plt.figure(figsize=(8, 5))
|
||||
for p in [ -3, -1, -0.5, 0.5, 2, 4]:
|
||||
plt.plot(x, u(x, p), label="p ="+ str(p))
|
||||
|
||||
plt.xlabel('x')
|
||||
plt.ylabel('u(x)')
|
||||
plt.legend()
|
||||
plt.grid(True)
|
||||
plt.show()
|
||||
BIN
ex2/ex_2_Corrected.pdf
Normal file
BIN
ex2/ex_2_Corrected.pdf
Normal file
Binary file not shown.
71
ex3/.vscode/settings.json
vendored
Normal file
71
ex3/.vscode/settings.json
vendored
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
{
|
||||
"files.associations": {
|
||||
"array": "cpp",
|
||||
"atomic": "cpp",
|
||||
"bit": "cpp",
|
||||
"cctype": "cpp",
|
||||
"charconv": "cpp",
|
||||
"chrono": "cpp",
|
||||
"clocale": "cpp",
|
||||
"cmath": "cpp",
|
||||
"compare": "cpp",
|
||||
"complex": "cpp",
|
||||
"concepts": "cpp",
|
||||
"condition_variable": "cpp",
|
||||
"cstdarg": "cpp",
|
||||
"cstddef": "cpp",
|
||||
"cstdint": "cpp",
|
||||
"cstdio": "cpp",
|
||||
"cstdlib": "cpp",
|
||||
"cstring": "cpp",
|
||||
"ctime": "cpp",
|
||||
"cwchar": "cpp",
|
||||
"cwctype": "cpp",
|
||||
"deque": "cpp",
|
||||
"forward_list": "cpp",
|
||||
"string": "cpp",
|
||||
"unordered_map": "cpp",
|
||||
"unordered_set": "cpp",
|
||||
"vector": "cpp",
|
||||
"exception": "cpp",
|
||||
"algorithm": "cpp",
|
||||
"functional": "cpp",
|
||||
"iterator": "cpp",
|
||||
"memory": "cpp",
|
||||
"memory_resource": "cpp",
|
||||
"numeric": "cpp",
|
||||
"optional": "cpp",
|
||||
"random": "cpp",
|
||||
"ratio": "cpp",
|
||||
"string_view": "cpp",
|
||||
"system_error": "cpp",
|
||||
"tuple": "cpp",
|
||||
"type_traits": "cpp",
|
||||
"utility": "cpp",
|
||||
"format": "cpp",
|
||||
"initializer_list": "cpp",
|
||||
"iomanip": "cpp",
|
||||
"iosfwd": "cpp",
|
||||
"iostream": "cpp",
|
||||
"istream": "cpp",
|
||||
"limits": "cpp",
|
||||
"mutex": "cpp",
|
||||
"new": "cpp",
|
||||
"numbers": "cpp",
|
||||
"ostream": "cpp",
|
||||
"semaphore": "cpp",
|
||||
"span": "cpp",
|
||||
"sstream": "cpp",
|
||||
"stdexcept": "cpp",
|
||||
"stop_token": "cpp",
|
||||
"streambuf": "cpp",
|
||||
"thread": "cpp",
|
||||
"cinttypes": "cpp",
|
||||
"typeindex": "cpp",
|
||||
"typeinfo": "cpp",
|
||||
"variant": "cpp",
|
||||
"cassert": "cpp",
|
||||
"list": "cpp",
|
||||
"fstream": "cpp"
|
||||
}
|
||||
}
|
||||
123
ex3/CLANG_default.mk
Normal file
123
ex3/CLANG_default.mk
Normal file
|
|
@ -0,0 +1,123 @@
|
|||
# Basic Defintions for using GNU-compiler suite sequentially
|
||||
# requires setting of COMPILER=CLANG_
|
||||
|
||||
#CLANGPATH=//usr/lib/llvm-10/bin/
|
||||
CC = ${CLANGPATH}clang
|
||||
CXX = ${CLANGPATH}clang++
|
||||
#CXX = ${CLANGPATH}clang++ -lomptarget -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=/opt/pgi/linux86-64/2017/cuda/8.0
|
||||
#F77 = gfortran
|
||||
LINKER = ${CXX}
|
||||
|
||||
#http://clang.llvm.org/docs/UsersManual.html#options-to-control-error-and-warning-messages
|
||||
WARNINGS += -Weverything -Wno-c++98-compat -Wno-sign-conversion -Wno-date-time -Wno-shorten-64-to-32 -Wno-padded -ferror-limit=1
|
||||
WARNINGS += -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic
|
||||
#-fsyntax-only -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic
|
||||
|
||||
CXXFLAGS += -O3 -std=c++17 -ferror-limit=1 ${WARNINGS}
|
||||
# don't use -Ofast
|
||||
# -ftrapv
|
||||
LINKFLAGS += -O3
|
||||
|
||||
# different libraries in Ubuntu or manajaró
|
||||
ifndef UBUNTU
|
||||
UBUNTU=1
|
||||
endif
|
||||
|
||||
# BLAS, LAPACK
|
||||
LINKFLAGS += -llapack -lblas
|
||||
# -lopenblas
|
||||
ifeq ($(UBUNTU),1)
|
||||
# ubuntu
|
||||
else
|
||||
# on archlinux
|
||||
LINKFLAGS += -lcblas
|
||||
endif
|
||||
|
||||
# interprocedural optimization
|
||||
CXXFLAGS += -flto
|
||||
LINKFLAGS += -flto
|
||||
|
||||
# very good check
|
||||
# http://clang.llvm.org/extra/clang-tidy/
|
||||
# good check, see: http://llvm.org/docs/CodingStandards.html#include-style
|
||||
SWITCH_OFF=,-readability-magic-numbers,-readability-redundant-control-flow,-readability-redundant-member-init
|
||||
SWITCH_OFF+=,-readability-redundant-member-init,-readability-isolate-declaration
|
||||
#READABILITY=,readability*${SWITCH_OFF}
|
||||
#TIDYFLAGS = -checks=llvm-*,-llvm-header-guard -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
|
||||
TIDYFLAGS = -checks=llvm-*,-llvm-header-guard${READABILITY} -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
|
||||
#TIDYFLAGS += -checks='modernize*
|
||||
# ???
|
||||
#TIDYFLAGS = -checks='cert*' -header-filter=.*
|
||||
# MPI checks ??
|
||||
#TIDYFLAGS = -checks='mpi*'
|
||||
# ??
|
||||
#TIDYFLAGS = -checks='performance*' -header-filter=.*
|
||||
#TIDYFLAGS = -checks='portability-*' -header-filter=.*
|
||||
#TIDYFLAGS = -checks='readability-*' -header-filter=.*
|
||||
|
||||
default: ${PROGRAM}
|
||||
|
||||
${PROGRAM}: ${OBJECTS}
|
||||
$(LINKER) $^ ${LINKFLAGS} -o $@
|
||||
|
||||
clean:
|
||||
@rm -f ${PROGRAM} ${OBJECTS}
|
||||
|
||||
clean_all:: clean
|
||||
@rm -f *_ *~ *.bak *.log *.out *.tar
|
||||
|
||||
codecheck: tidy_check
|
||||
tidy_check:
|
||||
clang-tidy ${SOURCES} ${TIDYFLAGS} -- ${SOURCES}
|
||||
# see also http://clang-developers.42468.n3.nabble.com/Error-while-trying-to-load-a-compilation-database-td4049722.html
|
||||
|
||||
run: clean ${PROGRAM}
|
||||
# time ./${PROGRAM} ${PARAMS}
|
||||
./${PROGRAM} ${PARAMS}
|
||||
|
||||
# tar the current directory
|
||||
MY_DIR = `basename ${PWD}`
|
||||
tar: clean_all
|
||||
@echo "Tar the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
# tar cf `basename ${PWD}`.tar *
|
||||
|
||||
doc:
|
||||
doxygen Doxyfile
|
||||
|
||||
#########################################################################
|
||||
|
||||
.cpp.o:
|
||||
$(CXX) -c $(CXXFLAGS) -o $@ $<
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) -o $@ $<
|
||||
|
||||
.f.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
##################################################################################################
|
||||
# some tools
|
||||
# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags)
|
||||
cache: ${PROGRAM}
|
||||
valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS}
|
||||
# kcachegrind callgrind.out.<pid> &
|
||||
kcachegrind `ls -1tr callgrind.out.* |tail -1`
|
||||
|
||||
# Check for wrong memory accesses, memory leaks, ...
|
||||
# use smaller data sets
|
||||
mem: ${PROGRAM}
|
||||
valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS}
|
||||
|
||||
# Simple run time profiling of your code
|
||||
# CXXFLAGS += -g -pg
|
||||
# LINKFLAGS += -pg
|
||||
prof: ${PROGRAM}
|
||||
perf record ./$^ ${PARAMS}
|
||||
perf report
|
||||
# gprof -b ./$^ > gp.out
|
||||
# kprof -f gp.out -p gprof &
|
||||
|
||||
codecheck: tidy_check
|
||||
130
ex3/GCC_AMD32_default.mk
Normal file
130
ex3/GCC_AMD32_default.mk
Normal file
|
|
@ -0,0 +1,130 @@
|
|||
# Basic Defintions for using GNU-compiler suite sequentially
|
||||
# requires setting of COMPILER=GCC_
|
||||
|
||||
CC = gcc
|
||||
CXX = g++
|
||||
F77 = gfortran
|
||||
LINKER = ${CXX}
|
||||
|
||||
# on mephisto:
|
||||
#CXXFLAGS += -I/share/apps/atlas/include
|
||||
#LINKFLAGS += -L/share/apps/atlas/lib
|
||||
#LINKFLAGS += -lcblas -latlas
|
||||
|
||||
#LINKFLAGS += -lblas
|
||||
# Der <cblas.h> Header muss mit extern "C" versehen werden, damit g++ alles findet.
|
||||
|
||||
|
||||
#WARNINGS = -pedantic -pedantic-errors -Wall -Wextra -Werror -Wconversion -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow
|
||||
WARNINGS = -pedantic -Wall -Wextra -Wconversion -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
|
||||
-Wredundant-decls -Winline -fmax-errors=1
|
||||
# -Wunreachable-code
|
||||
# -Wunreachable-code
|
||||
CXXFLAGS += -ffast-math -O3 -march=native -std=c++17 ${WARNINGS}
|
||||
#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS}
|
||||
#-msse3
|
||||
# -ftree-vectorizer-verbose=2 -DNDEBUG
|
||||
# -ftree-vectorizer-verbose=5
|
||||
# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump -fdump-tree-pre=stderr
|
||||
|
||||
# CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details
|
||||
# CFLAGS = -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2
|
||||
# #CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
|
||||
# FFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
|
||||
# LFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
|
||||
LINKFLAGS += -O3
|
||||
|
||||
# BLAS, LAPACK
|
||||
OPENBLAS_DIR = /opt/openblas_GCCseq
|
||||
#OPENBLAS_DIR = /opt/openblas_GCC
|
||||
OPENBLAS_LIBDIR = ${OPENBLAS_DIR}/lib
|
||||
OPENBLAS_INCDIR = ${OPENBLAS_DIR}/include
|
||||
CXXFLAGS += -I${OPENBLAS_INCDIR}
|
||||
LINKFLAGS += -L${OPENBLAS_LIBDIR} -lopenblas
|
||||
|
||||
# interprocedural optimization
|
||||
CXXFLAGS += -flto
|
||||
LINKFLAGS += -flto
|
||||
|
||||
# profiling tools
|
||||
#CXXFLAGS += -pg
|
||||
#LINKFLAGS += -pg
|
||||
|
||||
default: ${PROGRAM}
|
||||
|
||||
${PROGRAM}: ${OBJECTS}
|
||||
$(LINKER) $^ ${LINKFLAGS} -o $@
|
||||
|
||||
clean:
|
||||
@rm -f ${PROGRAM} ${OBJECTS}
|
||||
|
||||
clean_all:: clean
|
||||
-@rm -f *_ *~ *.bak *.log *.out *.tar *.orig
|
||||
-@rm -r html
|
||||
|
||||
run: clean ${PROGRAM}
|
||||
# time ./${PROGRAM}
|
||||
# ./${PROGRAM}
|
||||
( export LD_LIBRARY_PATH=${OPENBLAS_LIBDIR}:${LD_LIBRARY_PATH} ; ./${PROGRAM} )
|
||||
# or 'export LD_LIBRARY_PATH=/opt/openblas_gcc/lib:${LD_LIBRARY_PATH}' in your ~/.bashrc
|
||||
|
||||
# tar the current directory
|
||||
MY_DIR = `basename ${PWD}`
|
||||
tar:
|
||||
@echo "Tar the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
# tar cf `basename ${PWD}`.tar *
|
||||
|
||||
doc:
|
||||
doxygen Doxyfile
|
||||
|
||||
#########################################################################
|
||||
|
||||
.cpp.o:
|
||||
$(CXX) -c $(CXXFLAGS) -o $@ $<
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) -o $@ $<
|
||||
|
||||
.f.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
##################################################################################################
|
||||
# some tools
|
||||
# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags)
|
||||
cache: ${PROGRAM}
|
||||
valgrind --tool=callgrind --simulate-cache=yes ./$^
|
||||
# kcachegrind callgrind.out.<pid> &
|
||||
kcachegrind `ls -1tr callgrind.out.* |tail -1`
|
||||
|
||||
# Check for wrong memory accesses, memory leaks, ...
|
||||
# use smaller data sets
|
||||
# no "-pg" in compile/link options
|
||||
mem: ${PROGRAM}
|
||||
valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^
|
||||
|
||||
# Simple run time profiling of your code
|
||||
# CXXFLAGS += -g -pg
|
||||
# LINKFLAGS += -pg
|
||||
prof: ${PROGRAM}
|
||||
./$^
|
||||
gprof -b ./$^ > gp.out
|
||||
# kprof -f gp.out -p gprof &
|
||||
|
||||
#Trace your heap:
|
||||
#> heaptrack ./main.GCC_
|
||||
#> heaptrack_gui heaptrack.main.GCC_.<pid>.gz
|
||||
heap: ${PROGRAM}
|
||||
heaptrack ./$^ 11
|
||||
heaptrack_gui `ls -1tr heaptrack.$^.* |tail -1` &
|
||||
|
||||
|
||||
|
||||
########################################################################
|
||||
# get the detailed status of all optimization flags
|
||||
info:
|
||||
echo "detailed status of all optimization flags"
|
||||
$(CXX) --version
|
||||
$(CXX) -Q $(CXXFLAGS) --help=optimizers
|
||||
183
ex3/GCC_default.mk
Normal file
183
ex3/GCC_default.mk
Normal file
|
|
@ -0,0 +1,183 @@
|
|||
# Basic Defintions for using GNU-compiler suite sequentially
|
||||
# requires setting of COMPILER=GCC_
|
||||
|
||||
CC = gcc
|
||||
CXX = g++
|
||||
F77 = gfortran
|
||||
LINKER = ${CXX}
|
||||
|
||||
WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
|
||||
-Wredundant-decls -Winline -fmax-errors=1
|
||||
# -Wunreachable-code
|
||||
CXXFLAGS += -ffast-math -O3 -march=native -std=c++17 ${WARNINGS}
|
||||
#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS}
|
||||
#-msse3
|
||||
# -ftree-vectorizer-verbose=2 -DNDEBUG
|
||||
# -ftree-vectorizer-verbose=5
|
||||
# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump -fdump-tree-pre=stderr
|
||||
|
||||
# CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details
|
||||
# CFLAGS = -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2
|
||||
# #CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
|
||||
# FFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
|
||||
# LFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
|
||||
LINKFLAGS += -O3
|
||||
|
||||
#architecture
|
||||
#CPU = -march=znver2
|
||||
CXXFLAGS += ${CPU}
|
||||
LINKFLAGS += ${CPU}
|
||||
|
||||
# different libraries in Ubuntu or manajaró
|
||||
ifndef UBUNTU
|
||||
UBUNTU=1
|
||||
endif
|
||||
|
||||
# BLAS, LAPACK
|
||||
ifeq ($(UBUNTU),1)
|
||||
LINKFLAGS += -llapack -lblas
|
||||
# -lopenblas
|
||||
else
|
||||
# on archlinux
|
||||
LINKFLAGS += -llapack -lopenblas -lcblas
|
||||
endif
|
||||
|
||||
# interprocedural optimization
|
||||
CXXFLAGS += -flto
|
||||
LINKFLAGS += -flto
|
||||
|
||||
# for debugging purpose (save code)
|
||||
# -fsanitize=leak # only one out the three can be used
|
||||
# -fsanitize=address
|
||||
# -fsanitize=thread
|
||||
SANITARY = -fsanitize=address -fsanitize=undefined -fsanitize=null -fsanitize=return \
|
||||
-fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \
|
||||
-fsanitize=bool -fsanitize=enum -fsanitize=vptr
|
||||
#CXXFLAGS += ${SANITARY}
|
||||
#LINKFLAGS += ${SANITARY}
|
||||
|
||||
# profiling tools
|
||||
#CXXFLAGS += -pg
|
||||
#LINKFLAGS += -pg
|
||||
|
||||
|
||||
default: ${PROGRAM}
|
||||
|
||||
${PROGRAM}: ${OBJECTS}
|
||||
$(LINKER) $^ ${LINKFLAGS} -o $@
|
||||
|
||||
clean:
|
||||
@rm -f ${PROGRAM} ${OBJECTS}
|
||||
|
||||
clean_all:: clean
|
||||
-@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt
|
||||
-@rm -rf html
|
||||
|
||||
run: clean ${PROGRAM}
|
||||
#run: ${PROGRAM}
|
||||
# time ./${PROGRAM} ${PARAMS}
|
||||
./${PROGRAM} ${PARAMS}
|
||||
|
||||
# tar the current directory
|
||||
MY_DIR = `basename ${PWD}`
|
||||
tar: clean_all
|
||||
@echo "Tar the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
# tar cf `basename ${PWD}`.tar *
|
||||
#find . -size +10M > large_files
|
||||
#--exclude-from ${MY_DIR}/large_files
|
||||
|
||||
zip: clean
|
||||
@echo "Zip the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
zip -r ${MY_DIR}.zip ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
|
||||
doc:
|
||||
doxygen Doxyfile
|
||||
|
||||
#########################################################################
|
||||
.SUFFIXES: .f90
|
||||
|
||||
.cpp.o:
|
||||
$(CXX) -c $(CXXFLAGS) -o $@ $<
|
||||
# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $<.log
|
||||
# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $(<:.cpp=.log)
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) -o $@ $<
|
||||
|
||||
.f.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
.f90.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
##################################################################################################
|
||||
# some tools
|
||||
# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags)
|
||||
cache: ${PROGRAM}
|
||||
valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS}
|
||||
# kcachegrind callgrind.out.<pid> &
|
||||
kcachegrind `ls -1tr callgrind.out.* |tail -1`
|
||||
|
||||
# Check for wrong memory accesses, memory leaks, ...
|
||||
# use smaller data sets
|
||||
# no "-pg" in compile/link options
|
||||
mem: ${PROGRAM}
|
||||
valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS}
|
||||
# Graphical interface
|
||||
# valkyrie
|
||||
|
||||
# Simple run time profiling of your code
|
||||
# CXXFLAGS += -g -pg
|
||||
# LINKFLAGS += -pg
|
||||
prof: ${PROGRAM}
|
||||
perf record ./$^ ${PARAMS}
|
||||
perf report
|
||||
# gprof -b ./$^ > gp.out
|
||||
# kprof -f gp.out -p gprof &
|
||||
|
||||
# perf in Ubuntu 20.04: https://www.howtoforge.com/how-to-install-perf-performance-analysis-tool-on-ubuntu-20-04/
|
||||
# * install
|
||||
# * sudo vi /etc/sysctl.conf
|
||||
# add kernel.perf_event_paranoid = 0
|
||||
|
||||
#Trace your heap:
|
||||
#> heaptrack ./main.GCC_
|
||||
#> heaptrack_gui heaptrack.main.GCC_.<pid>.gz
|
||||
heap: ${PROGRAM}
|
||||
heaptrack ./$^ ${PARAMS}
|
||||
heaptrack_gui `ls -1tr heaptrack.$^.* |tail -1` &
|
||||
|
||||
codecheck: $(SOURCES)
|
||||
cppcheck --enable=all --inconclusive --std=c++17 --suppress=missingIncludeSystem $^
|
||||
|
||||
|
||||
########################################################################
|
||||
# get the detailed status of all optimization flags
|
||||
info:
|
||||
echo "detailed status of all optimization flags"
|
||||
$(CXX) --version
|
||||
$(CXX) -Q $(CXXFLAGS) --help=optimizers
|
||||
lscpu
|
||||
inxi -C
|
||||
lstopo
|
||||
|
||||
# Excellent hardware info
|
||||
# hardinfo
|
||||
# Life monitoring of CPU frequency etc.
|
||||
# sudo i7z
|
||||
|
||||
# Memory consumption
|
||||
# vmstat -at -SM 3
|
||||
# xfce4-taskmanager
|
||||
|
||||
|
||||
# https://www.tecmint.com/check-linux-cpu-information/
|
||||
#https://www.tecmint.com/monitor-cpu-and-gpu-temperature-in-ubuntu/
|
||||
|
||||
# Debugging:
|
||||
# https://wiki.archlinux.org/index.php/Debugging
|
||||
137
ex3/ICC_default.mk
Normal file
137
ex3/ICC_default.mk
Normal file
|
|
@ -0,0 +1,137 @@
|
|||
# Basic Defintions for using INTEL compiler suite sequentially
|
||||
# requires setting of COMPILER=ICC_
|
||||
|
||||
#BINDIR = /opt/intel/bin/
|
||||
|
||||
# special on my sony [GH]
|
||||
#BINDIR = /opt/save.intel/bin/
|
||||
# very special on my sony [GH]
|
||||
# FIND_LIBS = -L /opt/save.intel/composer_xe_2013.1.117/mkl/lib/intel64/libmkl_intel_lp64.so
|
||||
|
||||
# Error with g++-4.8 using icpc14.0,x:
|
||||
# find directory wherein bits/c++config.h is located
|
||||
# 'locate bits/c++config.h'
|
||||
#FOUND_CONFIG = -I/usr/include/x86_64-linux-gnu/c++/4.8
|
||||
|
||||
|
||||
CC = ${BINDIR}icc
|
||||
CXX = ${BINDIR}icpc
|
||||
F77 = ${BINDIR}ifort
|
||||
LINKER = ${CXX}
|
||||
|
||||
|
||||
WARNINGS = -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -wd2015,2012 -wn3
|
||||
# -Winline -Wredundant-decls -Wunreachable-code
|
||||
CXXFLAGS += -O3 -fargument-noalias -std=c++17 -DNDEBUG ${WARNINGS} -mkl ${FOUND_CONFIG}
|
||||
# profiling tools
|
||||
#CXXFLAGS += -pg
|
||||
#LINKFLAGS += -pg
|
||||
# -vec-report=3
|
||||
# -qopt-report=5 -qopt-report-phase=vec
|
||||
# -guide -parallel
|
||||
# -guide-opts=string -guide-par[=n] -guide-vec[=n]
|
||||
# -auto-p32 -simd
|
||||
CXXFLAGS += -align
|
||||
|
||||
# use MKL by INTEL
|
||||
# https://software.intel.com/content/www/us/en/develop/documentation/mkl-linux-developer-guide/top/linking-your-application-with-the-intel-math-kernel-library/linking-quick-start/using-the-mkl-compiler-option.html
|
||||
# https://software.intel.com/content/www/us/en/develop/articles/intel-mkl-link-line-advisor.html
|
||||
# LINKFLAGS += -L${BINDIR}../composer_xe_2013.1.117/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
|
||||
#LINKFLAGS += -O3 -L/opt/intel/mkl/lib -mkl
|
||||
LINKFLAGS += -O3 -mkl=sequential
|
||||
|
||||
# interprocedural optimization
|
||||
CXXFLAGS += -ipo
|
||||
LINKFLAGS += -ipo
|
||||
|
||||
# annotated assembler file
|
||||
ANNOTED = -fsource-asm -S
|
||||
|
||||
default: ${PROGRAM}
|
||||
|
||||
${PROGRAM}: ${OBJECTS}
|
||||
$(LINKER) $^ ${LINKFLAGS} -o $@
|
||||
|
||||
clean:
|
||||
rm -f ${PROGRAM} ${OBJECTS}
|
||||
|
||||
clean_all:: clean
|
||||
@rm -f *_ *~ *.bak *.log *.out *.tar
|
||||
|
||||
run: clean ${PROGRAM}
|
||||
./${PROGRAM}
|
||||
|
||||
# tar the current directory
|
||||
MY_DIR = `basename ${PWD}`
|
||||
tar: clean_all
|
||||
@echo "Tar the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
# tar cf `basename ${PWD}`.tar *
|
||||
|
||||
doc:
|
||||
doxygen Doxyfile
|
||||
|
||||
#########################################################################
|
||||
|
||||
.cpp.o:
|
||||
$(CXX) -c $(CXXFLAGS) -o $@ $<
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) -o $@ $<
|
||||
|
||||
.f.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
##################################################################################################
|
||||
# # some tools
|
||||
# # Cache behaviour (CXXFLAGS += -g tracks down to source lines)
|
||||
# cache: ${PROGRAM}
|
||||
# valgrind --tool=callgrind --simulate-cache=yes ./$^
|
||||
# # kcachegrind callgrind.out.<pid> &
|
||||
#
|
||||
# # Check for wrong memory accesses, memory leaks, ...
|
||||
# # use smaller data sets
|
||||
# mem: ${PROGRAM}
|
||||
# valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^
|
||||
#
|
||||
# # Simple run time profiling of your code
|
||||
# # CXXFLAGS += -g -pg
|
||||
# # LINKFLAGS += -pg
|
||||
# prof: ${PROGRAM}
|
||||
# ./$^
|
||||
# gprof -b ./$^ > gp.out
|
||||
# # kprof -f gp.out -p gprof &
|
||||
#
|
||||
|
||||
|
||||
mem: inspector
|
||||
prof: amplifier
|
||||
cache: amplifier
|
||||
|
||||
gap_par_report:
|
||||
${CXX} -c -guide -parallel $(SOURCES) 2> gap.txt
|
||||
|
||||
# GUI for performance report
|
||||
amplifier: ${PROGRAM}
|
||||
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
|
||||
# alternatively to the solution above:
|
||||
#edit file /etc/sysctl.d/10-ptrace.conf and set variable kernel.yama.ptrace_scope variable to 0 .
|
||||
amplxe-gui &
|
||||
|
||||
# GUI for Memory and Thread analyzer (race condition)
|
||||
inspector: ${PROGRAM}
|
||||
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
|
||||
inspxe-gui &
|
||||
|
||||
advisor:
|
||||
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
|
||||
advixe-gui &
|
||||
|
||||
icc-info:
|
||||
icpc -# main.cpp
|
||||
|
||||
|
||||
|
||||
|
||||
176
ex3/ONEAPI_default.mk
Normal file
176
ex3/ONEAPI_default.mk
Normal file
|
|
@ -0,0 +1,176 @@
|
|||
# Basic Defintions for using INTEL compiler suite sequentially
|
||||
# requires setting of COMPILER=ONEAPI_
|
||||
|
||||
# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html
|
||||
# requires
|
||||
# source /opt/intel/oneapi/setvars.sh
|
||||
# on AMD: export MKL_DEBUG_CPU_TYPE=5
|
||||
|
||||
#BINDIR = /opt/intel/oneapi/compiler/latest/linux/bin/
|
||||
#MKL_ROOT = /opt/intel/oneapi/mkl/latest/
|
||||
#export KMP_AFFINITY=verbose,compact
|
||||
|
||||
CC = ${BINDIR}icc
|
||||
CXX = ${BINDIR}dpcpp
|
||||
F77 = ${BINDIR}ifort
|
||||
LINKER = ${CXX}
|
||||
|
||||
## Compiler flags
|
||||
WARNINGS = -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -pedantic
|
||||
WARNINGS += -Wpessimizing-move -Wredundant-move
|
||||
#-wd2015,2012,2014 -wn3
|
||||
# -Winline -Wredundant-decls -Wunreachable-code
|
||||
# -qopt-subscript-in-range
|
||||
# -vec-threshold0
|
||||
|
||||
CXXFLAGS += -O3 -std=c++17 ${WARNINGS}
|
||||
#CXXFLAGS += -DMKL_ILP64 -I"${MKLROOT}/include"
|
||||
#CXXFLAGS += -DMKL_ILP32 -I"${MKLROOT}/include"
|
||||
LINKFLAGS += -O3
|
||||
|
||||
# interprocedural optimization
|
||||
CXXFLAGS += -ipo
|
||||
LINKFLAGS += -ipo
|
||||
LINKFLAGS += -flto
|
||||
|
||||
# annotated Assembler file
|
||||
ANNOTED = -fsource-asm -S
|
||||
|
||||
#architecture
|
||||
CPU = -march=core-avx2
|
||||
#CPU += -mtp=zen
|
||||
# -xCORE-AVX2
|
||||
# -axcode COMMON-AVX512 -axcode MIC-AVX512 -axcode CORE-AVX512 -axcode CORE-AVX2
|
||||
CXXFLAGS += ${CPU}
|
||||
LINKFLAGS += ${CPU}
|
||||
|
||||
# use MKL by INTEL
|
||||
# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html
|
||||
# sequential MKL
|
||||
# use the 32 bit interface (LP64) instead of 64 bit interface (ILP64)
|
||||
CXXFLAGS += -qmkl=sequential -UMKL_ILP64
|
||||
LINKFLAGS += -O3 -qmkl=sequential -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
|
||||
#LINKFLAGS += -O3 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
|
||||
|
||||
# shared libs: https://aur.archlinux.org/packages/intel-oneapi-compiler-static
|
||||
# install intel-oneapi-compiler-static
|
||||
# or
|
||||
LINKFLAGS += -shared-intel
|
||||
|
||||
|
||||
OPENMP = -qopenmp
|
||||
CXXFLAGS += ${OPENMP}
|
||||
LINKFLAGS += ${OPENMP}
|
||||
|
||||
|
||||
# profiling tools
|
||||
#CXXFLAGS += -pg
|
||||
#LINKFLAGS += -pg
|
||||
# -vec-report=3
|
||||
# -qopt-report=5 -qopt-report-phase=vec -qopt-report-phase=openmp
|
||||
# -guide -parallel
|
||||
# -guide-opts=string -guide-par[=n] -guide-vec[=n]
|
||||
# -auto-p32 -simd
|
||||
|
||||
# Reports: https://software.intel.com/en-us/articles/getting-the-most-out-of-your-intel-compiler-with-the-new-optimization-reports
|
||||
#CXXFLAGS += -qopt-report=5 -qopt-report-phase=vec,par
|
||||
#CXXFLAGS += -qopt-report=5 -qopt-report-phase=cg
|
||||
# Redirect report from *.optrpt to stderr
|
||||
# -qopt-report-file=stderr
|
||||
# Guided paralellization
|
||||
# -guide -parallel
|
||||
# -guide-opts=string -guide-par[=n] -guide-vec[=n]
|
||||
# -auto-p32 -simd
|
||||
|
||||
## run time checks
|
||||
# https://www.intel.com/content/www/us/en/develop/documentation/fortran-compiler-oneapi-dev-guide-and-reference/top/compiler-reference/compiler-options/offload-openmp-and-parallel-processing-options/par-runtime-control-qpar-runtime-control.html
|
||||
|
||||
|
||||
default: ${PROGRAM}
|
||||
|
||||
${PROGRAM}: ${OBJECTS}
|
||||
$(LINKER) $^ ${LINKFLAGS} -o $@
|
||||
|
||||
clean:
|
||||
rm -f ${PROGRAM} ${OBJECTS} *.optrpt
|
||||
|
||||
clean_all:: clean
|
||||
@rm -f *_ *~ *.bak *.log *.out *.tar
|
||||
|
||||
run: clean ${PROGRAM}
|
||||
./${PROGRAM}
|
||||
|
||||
# tar the current directory
|
||||
MY_DIR = `basename ${PWD}`
|
||||
tar: clean_all
|
||||
@echo "Tar the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
# tar cf `basename ${PWD}`.tar *
|
||||
|
||||
doc:
|
||||
doxygen Doxyfile
|
||||
|
||||
#########################################################################
|
||||
|
||||
.cpp.o:
|
||||
$(CXX) -c $(CXXFLAGS) -o $@ $<
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) -o $@ $<
|
||||
|
||||
.f.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
##################################################################################################
|
||||
# some tools
|
||||
# Cache behaviour (CXXFLAGS += -g tracks down to source lines)
|
||||
# https://software.intel.com/content/www/us/en/develop/documentation/vtune-help/top/analyze-performance/microarchitecture-analysis-group/memory-access-analysis.html
|
||||
|
||||
mem: inspector
|
||||
prof: vtune
|
||||
cache: inspector
|
||||
|
||||
gap_par_report:
|
||||
${CXX} -c -guide -parallel $(SOURCES) 2> gap.txt
|
||||
|
||||
# GUI for performance report
|
||||
amplifier: ${PROGRAM}
|
||||
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
|
||||
echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid
|
||||
amplxe-gui &
|
||||
|
||||
# GUI for Memory and Thread analyzer (race condition)
|
||||
inspector: ${PROGRAM}
|
||||
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
|
||||
# inspxe-gui &
|
||||
vtune-gui ./${PROGRAM} &
|
||||
|
||||
advisor:
|
||||
source /opt/intel/oneapi/advisor/2021.2.0/advixe-vars.sh
|
||||
# /opt/intel/oneapi/advisor/latest/bin64/advixe-gui &
|
||||
advisor --collect=survey ./${PROGRAM}
|
||||
# advisor --collect=roofline ./${PROGRAM}
|
||||
advisor --report=survey --project-dir=./ src:r=./ --format=csv --report-output=./out/survey.csv
|
||||
|
||||
vtune:
|
||||
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
|
||||
# https://software.intel.com/en-us/articles/intel-advisor-2017-update-1-what-s-new
|
||||
export ADVIXE_EXPERIMENTAL=roofline
|
||||
vtune -collect hotspots ./${PROGRAM}
|
||||
vtune -report hotspots -r r000hs > vtune.out
|
||||
# vtune-gui ./${PROGRAM} &
|
||||
|
||||
icc-info:
|
||||
icpc -# main.cpp
|
||||
|
||||
# MKL on AMD
|
||||
# https://www.computerbase.de/2019-11/mkl-workaround-erhoeht-leistung-auf-amd-ryzen/
|
||||
#
|
||||
# https://sites.google.com/a/uci.edu/mingru-yang/programming/mkl-has-bad-performance-on-an-amd-cpu
|
||||
# export MKL_DEBUG_CPU_TYPE=5
|
||||
# export MKL_NUM_THRAEDS=1
|
||||
# export MKL_DYNAMIC=false
|
||||
# on Intel compiler
|
||||
# http://publicclu2.blogspot.com/2013/05/intel-complier-suite-reference-card.html
|
||||
93
ex3/PGI_default.mk
Normal file
93
ex3/PGI_default.mk
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
# Basic Defintions for using PGI-compiler suite sequentially
|
||||
# requires setting of COMPILER=PGI_
|
||||
# OPTIRUN = optirun
|
||||
|
||||
|
||||
CC = pgcc
|
||||
CXX = pgc++
|
||||
F77 = pgfortran
|
||||
LINKER = ${CXX}
|
||||
|
||||
# on mephisto:
|
||||
#CXXFLAGS += -I/share/apps/atlas/include
|
||||
#LINKFLAGS += -L/share/apps/atlas/lib
|
||||
#LINKFLAGS += -lcblas -latlas
|
||||
|
||||
#LINKFLAGS += -lblas
|
||||
# Der <cblas.h> Header muss mit extern "C" versehen werden, damit g++ alles findet.
|
||||
|
||||
WARNINGS = -Minform=warn
|
||||
# -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -W -Wfloat-equal -Wshadow -Wredundant-decls
|
||||
# -pedantic -Wunreachable-code -Wextra -Winline
|
||||
# -Wunreachable-code
|
||||
|
||||
#PGI_PROFILING = -Minfo=ccff,loop,vect,opt,intensity,mp,accel
|
||||
PGI_PROFILING = -Minfo=ccff,accel,ipa,loop,lre,mp,opt,par,unified,vect,intensity
|
||||
# -Minfo
|
||||
# -Mprof=time
|
||||
# -Mprof=lines
|
||||
# take care with option -Msafeptr
|
||||
CXXFLAGS += -O3 -std=c++17 ${WARNINGS}
|
||||
#CXXFLAGS += -O3 -std=c++11 -DNDEBUG ${PGI_PROFILING} ${WARNINGS}
|
||||
# -fastsse -fargument-noalias ${WARNINGS} -msse3 -vec-report=3
|
||||
|
||||
default: ${PROGRAM}
|
||||
|
||||
${PROGRAM}: ${OBJECTS}
|
||||
$(LINKER) $^ ${LINKFLAGS} -o $@
|
||||
|
||||
clean:
|
||||
@rm -f ${PROGRAM} ${OBJECTS}
|
||||
|
||||
clean_all:: clean
|
||||
@rm -f *_ *~ *.bak *.log *.out *.tar
|
||||
|
||||
run: clean ${PROGRAM}
|
||||
./${PROGRAM}
|
||||
|
||||
# tar the current directory
|
||||
MY_DIR = `basename ${PWD}`
|
||||
tar: clean_all
|
||||
@echo "Tar the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
# tar cf `basename ${PWD}`.tar *
|
||||
|
||||
doc:
|
||||
doxygen Doxyfile
|
||||
|
||||
#########################################################################
|
||||
|
||||
.cpp.o:
|
||||
$(CXX) -c $(CXXFLAGS) -o $@ $<
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) -o $@ $<
|
||||
|
||||
.f.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
##################################################################################################
|
||||
# # some tools
|
||||
# # Simple run time profiling of your code
|
||||
# # CXXFLAGS += -g -pg
|
||||
# # LINKFLAGS += -pg
|
||||
|
||||
|
||||
# Profiling options PGI, see: pgcollect -help
|
||||
# CPU_PROF = -allcache
|
||||
CPU_PROF = -time
|
||||
# GPU_PROF = -cuda=gmem,branch,cc13 -cudainit
|
||||
#GPU_PROF = -cuda=branch:cc20
|
||||
#
|
||||
PROF_FILE = pgprof.out
|
||||
|
||||
cache: prof
|
||||
|
||||
prof: ${PROGRAM}
|
||||
${OPTIRUN} ${BINDIR}pgcollect $(CPU_PROF) ./$^
|
||||
${OPTIRUN} ${BINDIR}pgprof -exe ./$^ $(PROF_FILE) &
|
||||
|
||||
info:
|
||||
pgaccelinfo -v
|
||||
30
ex3/ex3/Makefile
Normal file
30
ex3/ex3/Makefile
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
#
|
||||
# use GNU-Compiler tools
|
||||
COMPILER=GCC_
|
||||
# alternatively from the shell
|
||||
# export COMPILER=GCC_
|
||||
# or, alternatively from the shell
|
||||
# make COMPILER=GCC_
|
||||
|
||||
# use Intel compilers
|
||||
#COMPILER=ICC_
|
||||
|
||||
# use PGI compilers
|
||||
# COMPILER=PGI_
|
||||
|
||||
|
||||
SOURCES = main.cpp benchmarks.cpp benchmark_tests.cpp factorization_solve.cpp factorization_solve_tests.cpp vdop.cpp getmatrix.cpp
|
||||
OBJECTS = $(SOURCES:.cpp=.o)
|
||||
|
||||
PROGRAM = main.${COMPILER}
|
||||
|
||||
# uncomment the next to lines for debugging and detailed performance analysis
|
||||
CXXFLAGS += -g
|
||||
LINKFLAGS += -g
|
||||
# do not use -pg with PGI compilers
|
||||
|
||||
ifndef COMPILER
|
||||
COMPILER=GCC_
|
||||
endif
|
||||
|
||||
include ../${COMPILER}default.mk
|
||||
333
ex3/ex3/benchmark_tests.cpp
Normal file
333
ex3/ex3/benchmark_tests.cpp
Normal file
|
|
@ -0,0 +1,333 @@
|
|||
#include "benchmark_tests.h"
|
||||
#include "benchmarks.h"
|
||||
#include <chrono>
|
||||
#include <iostream>
|
||||
#include <math.h>
|
||||
using namespace std::chrono;
|
||||
|
||||
vector<double> test_A(const size_t &NLOOPS, const size_t &N, const function<double(const vector<double>&, const vector<double>&)>& scalar_function)
|
||||
{
|
||||
cout << "#################### (A) ####################" << endl;
|
||||
cout << "\nLOOPS = " << NLOOPS << endl;
|
||||
cout << "\nN = " << N << endl;
|
||||
|
||||
|
||||
// Memory allocation
|
||||
cout << "Memory allocation\n";
|
||||
|
||||
vector<double> x(N), y(N);
|
||||
|
||||
cout.precision(2);
|
||||
cout << 2.0*N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
|
||||
cout.precision(6);
|
||||
|
||||
|
||||
// Data initialization
|
||||
// Special: x_i = i+1; y_i = 1/x_i ==> <x,y> == N
|
||||
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
x[i] = i % 219 + 1;
|
||||
y[i] = 1.0/x[i];
|
||||
}
|
||||
|
||||
|
||||
cout << "\nStart Benchmarking scalar\n";
|
||||
|
||||
auto t1 = system_clock::now(); // start timer
|
||||
// Do calculation
|
||||
double check(0.0),ss(0.0);
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
check = scalar_function(x, y);
|
||||
ss += check; // prevents the optimizer from removing unused calculation results.
|
||||
}
|
||||
|
||||
auto t2 = system_clock::now(); // stop timer
|
||||
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
t_diff = t_diff/NLOOPS; // duration per loop seconds
|
||||
|
||||
|
||||
|
||||
// Check the correct result
|
||||
cout << "\n <x,y> = " << check << endl;
|
||||
if (static_cast<unsigned int>(check) != N)
|
||||
cout << " !! W R O N G result !!\n";
|
||||
cout << endl;
|
||||
|
||||
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
|
||||
|
||||
double Gflops = 2.0*N / t_diff / 1024 / 1024 / 1024;
|
||||
double MemBandwidth = 2.0*N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
|
||||
|
||||
cout << "Total duration : " << t_diff*NLOOPS << endl;
|
||||
cout << "Timing in sec. : " << t_diff << endl;
|
||||
cout << "GFLOPS : " << Gflops << endl;
|
||||
cout << "GiByte/s : " << MemBandwidth << endl;
|
||||
|
||||
//##########################################################################
|
||||
cout << "\nStart Benchmarking norm\n";
|
||||
|
||||
auto t3 = system_clock::now(); // start timer
|
||||
// Do calculation
|
||||
double ss2(0.0);
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
auto sk1 = sqrt(scalar(x, x));
|
||||
ss2 += sk1; // prevents the optimizer from removing unused calculation results.
|
||||
}
|
||||
|
||||
auto t4 = system_clock::now(); // stop timer
|
||||
auto duration2 = duration_cast<microseconds>(t4 - t3); // duration in microseconds
|
||||
double t_diff2 = static_cast<double>(duration2.count()) / 1e6; // overall duration in seconds
|
||||
t_diff2 = t_diff2/NLOOPS; // duration per loop seconds
|
||||
|
||||
|
||||
cout << "ss(norm): " << ss2 << endl;
|
||||
cout << "Timing in sec. : " << t_diff2 << endl;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
return vector<double>{t_diff, Gflops, MemBandwidth};
|
||||
}
|
||||
|
||||
|
||||
vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M, const function<vector<double>(const vector<double>&, const vector<double>&)>& MatVec_function)
|
||||
{
|
||||
cout << "#################### (B) ####################" << endl;
|
||||
|
||||
cout << "\nLOOPS = " << NLOOPS << endl;
|
||||
cout << "\nN = " << N << endl;
|
||||
cout << "\nM = " << M << endl;
|
||||
|
||||
// Memory allocation
|
||||
cout << "Memory allocation\n";
|
||||
|
||||
vector<double> A(M*N);
|
||||
vector<double> x(N);
|
||||
|
||||
cout.precision(2);
|
||||
cout << (1.0*M*N + N) * sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
|
||||
cout.precision(6);
|
||||
|
||||
// Data initialization
|
||||
|
||||
for (size_t i = 0; i < M; ++i)
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
A[N*i + j] = (i + j) % 219 + 1;
|
||||
|
||||
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
{
|
||||
x[j] = 1.0/A[N*17 + j];
|
||||
}
|
||||
|
||||
cout << "\nStart Benchmarking MatVec\n";
|
||||
|
||||
auto t1 = system_clock::now(); // start timer
|
||||
// Do calculation
|
||||
vector<double> b(M);
|
||||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
b = MatVec_function(A, x);
|
||||
}
|
||||
|
||||
auto t2 = system_clock::now(); // stop timer
|
||||
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
t_diff = t_diff/NLOOPS; // duration per loop seconds
|
||||
|
||||
|
||||
// Check the correct result
|
||||
cout << "\n <A[17,*],x> = " << b[17] << endl;
|
||||
if (static_cast<size_t>(b[17]) != N)
|
||||
{
|
||||
cout << " !! W R O N G result !!\n";
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
|
||||
double Gflops = (2.0*N*M) / t_diff / 1024 / 1024 / 1024;
|
||||
double MemBandwidth = (2.0*N*M + M)/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
|
||||
|
||||
cout << "Total duration : " << t_diff*NLOOPS << endl;
|
||||
cout << "Timing in sec. : " << t_diff << endl;
|
||||
cout << "GFLOPS : " << Gflops << endl;
|
||||
cout << "GiByte/s : " << MemBandwidth << endl;
|
||||
|
||||
|
||||
|
||||
return vector<double>{t_diff, Gflops, MemBandwidth};
|
||||
}
|
||||
|
||||
|
||||
vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N, const function<vector<double>(const vector<double>&, const vector<double>&, size_t const &shared_dim)>& MatMat_function)
|
||||
{
|
||||
cout << "#################### (C) ####################" << endl;
|
||||
cout << "\nLOOPS = " << NLOOPS << endl;
|
||||
cout << "\nL = " << L << endl;
|
||||
cout << "\nM = " << M << endl;
|
||||
cout << "\nN = " << N << endl;
|
||||
|
||||
|
||||
// Memory allocation
|
||||
cout << "Memory allocation\n";
|
||||
|
||||
vector<double> A(M*L);
|
||||
vector<double> B(L*N);
|
||||
|
||||
cout.precision(2);
|
||||
cout << (1.0*M*L + L*N) *sizeof(A[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
|
||||
cout.precision(6);
|
||||
|
||||
|
||||
// Data initialization
|
||||
|
||||
for (size_t i = 0; i < M; ++i)
|
||||
for (size_t k = 0; k < L; ++k)
|
||||
A[L*i + k] = (i + k) % 219 + 1;
|
||||
|
||||
for (size_t k = 0; k < L; ++k)
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
B[N*k + j] = 1.0/A[L*17 + k];
|
||||
|
||||
|
||||
cout << "\nStart Benchmarking MatMat\n";
|
||||
|
||||
auto t1 = system_clock::now(); // start timer
|
||||
// Do calculation
|
||||
vector<double> C(M*N);
|
||||
double check;
|
||||
double check_sum = 0;
|
||||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
C = MatMat_function(A, B, L);
|
||||
|
||||
check = C[N*17];
|
||||
check_sum += check; // prevents the optimizer from removing unused calculation results.
|
||||
}
|
||||
cout << check_sum;
|
||||
|
||||
auto t2 = system_clock::now(); // stop timer
|
||||
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
t_diff = t_diff/NLOOPS; // duration per loop seconds
|
||||
|
||||
|
||||
// Check the correct result
|
||||
cout << "\n C[17,0] = " << check << endl;
|
||||
if (static_cast<unsigned int>(check) != L)
|
||||
{
|
||||
cout << " !! W R O N G result !!, should be " << L <<"\n";
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
|
||||
|
||||
double Gflops = (2.0*L*N*M) / t_diff / 1024 / 1024 / 1024;
|
||||
double MemBandwidth = (2.0*L*N*M + M*N)/ t_diff / 1024 / 1024 / 1024 * sizeof(A[0]);
|
||||
|
||||
cout << "Total duration : " << t_diff*NLOOPS << endl;
|
||||
cout << "Timing in sec. : " << t_diff << endl;
|
||||
cout << "GFLOPS : " << Gflops << endl;
|
||||
cout << "GiByte/s : " << MemBandwidth << endl;
|
||||
|
||||
|
||||
|
||||
return vector<double>{t_diff, Gflops, MemBandwidth};
|
||||
}
|
||||
|
||||
|
||||
vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p)
|
||||
{
|
||||
cout << "#################### (D) ####################" << endl;
|
||||
cout << "\nLOOPS = " << NLOOPS << endl;
|
||||
cout << "\nN = " << N << endl;
|
||||
cout << "\np = " << p << endl;
|
||||
|
||||
// Memory allocation
|
||||
cout << "Memory allocation\n";
|
||||
|
||||
vector<double> a(p + 1, 0);
|
||||
vector<double> x(N);
|
||||
|
||||
cout.precision(2);
|
||||
cout << (1.0*(p + 1) + N) *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
|
||||
cout.precision(6);
|
||||
|
||||
// Data initialization
|
||||
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
x[j] = 1.0*j;
|
||||
|
||||
for (size_t k = 0; k < p + 1; ++k)
|
||||
a[k] = pow(-1.0, k); // poly(x) = 1 - x + x^2 - x^3 + x^4 - ...
|
||||
|
||||
|
||||
|
||||
cout << "\nStart Benchmarking poly\n";
|
||||
|
||||
auto t1 = system_clock::now(); // start timer
|
||||
// Do calculation
|
||||
vector<double> y(N);
|
||||
double check;
|
||||
double check_sum;
|
||||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
y = poly(a, x);
|
||||
check = y[0];
|
||||
|
||||
check_sum += check; // prevents the optimizer from removing unused calculation results.
|
||||
}
|
||||
|
||||
auto t2 = system_clock::now(); // stop timer
|
||||
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
t_diff = t_diff/NLOOPS; // duration per loop seconds
|
||||
|
||||
|
||||
|
||||
// Check the correct result
|
||||
cout << "\n poly(" << x[0] << ") = " << check << endl;
|
||||
if (abs(check - 1.0) > 1.0/1e6)
|
||||
{
|
||||
cout << " !! W R O N G result !!\n";
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
|
||||
|
||||
double Gflops = (N*(p + 1)*3.0) / t_diff / 1024 / 1024 / 1024;
|
||||
double MemBandwidth = (N*(2.0 + 3.0*(p + 1)))/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
|
||||
|
||||
cout << "Total duration : " << t_diff*NLOOPS << endl;
|
||||
cout << "Timing in sec. : " << t_diff << endl;
|
||||
cout << "GFLOPS : " << Gflops << endl;
|
||||
cout << "GiByte/s : " << MemBandwidth << endl;
|
||||
|
||||
|
||||
|
||||
return vector<double>{t_diff, Gflops, MemBandwidth};
|
||||
}
|
||||
15
ex3/ex3/benchmark_tests.h
Normal file
15
ex3/ex3/benchmark_tests.h
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
#pragma once
|
||||
#include <vector>
|
||||
#include <functional>
|
||||
using namespace std;
|
||||
|
||||
|
||||
|
||||
|
||||
vector<double> test_A(const size_t &NLOOPS, const size_t &N, const function<double(const vector<double>&, const vector<double>&)>& scalar_function);
|
||||
|
||||
vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M, const function<vector<double>(const vector<double>&, const vector<double>&)>& MatVec_function);
|
||||
|
||||
vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N, const function<vector<double>(const vector<double>&, const vector<double>&, size_t const &shared_dim)>& MatMat_function);
|
||||
|
||||
vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p);
|
||||
245
ex3/ex3/benchmarks.cpp
Normal file
245
ex3/ex3/benchmarks.cpp
Normal file
|
|
@ -0,0 +1,245 @@
|
|||
#include "benchmarks.h"
|
||||
#include "vdop.h"
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include <cassert> // assert()
|
||||
|
||||
|
||||
#ifdef __INTEL_CLANG_COMPILER
|
||||
#pragma message(" ########## Use of MKL ###############")
|
||||
#include <mkl.h>
|
||||
#else
|
||||
#pragma message(" ########## Use of CBLAS ###############")
|
||||
|
||||
#include <cblas.h> // cBLAS Library
|
||||
#include <lapacke.h> // Lapack
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
// (A) Inner product of two vectors (from skalar_stl)
|
||||
double scalar(vector<double> const &x, vector<double> const &y)
|
||||
{
|
||||
assert(x.size() == y.size());
|
||||
size_t const N = x.size();
|
||||
double sum = 0.0;
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
sum += x[i] * y[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
// (A) 5.(b) Kahan scalar product
|
||||
double Kahan_skalar(vector<double> const &x, vector<double> const &y)
|
||||
{
|
||||
double sum = 0.0;
|
||||
double c = 0.0;
|
||||
size_t n = x.size();
|
||||
for (size_t i = 0; i < n; ++i)
|
||||
{
|
||||
double z = x[i]*y[i] - c; // c is the part that got lost in the last iteration
|
||||
double t = sum + z; // when adding sum + z, the lower digits are lost if sum is large
|
||||
c = (t - sum) - z; // now we recover the lower digits to add in the next iteration
|
||||
sum = t;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
// (A) 6. cBLAS scalar product
|
||||
double scalar_cBLAS(vector<double> const &x, vector<double> const &y)
|
||||
{
|
||||
return cblas_ddot(x.size(), x.data(), 1, y.data(), 1); // x.data() = &x[0]
|
||||
}
|
||||
|
||||
|
||||
// (B) Matrix-vector product (from intro_vector_densematrix)
|
||||
vector<double> MatVec(vector<double> const &A, vector<double> const &x)
|
||||
{
|
||||
size_t const nelem = A.size();
|
||||
size_t const N = x.size();
|
||||
assert(nelem % N == 0); // make sure multiplication is possible
|
||||
size_t const M = nelem/N;
|
||||
|
||||
|
||||
vector<double> b(M);
|
||||
|
||||
for (size_t i = 0; i < M; ++i)
|
||||
{
|
||||
double tmp = 0.0;
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
tmp += A[N*i + j] * x[j];
|
||||
b[i] = tmp;
|
||||
}
|
||||
|
||||
return b;
|
||||
}
|
||||
|
||||
// (B) cBLAS Matrix-vector product
|
||||
vector<double> MatVec_cBLAS(vector<double> const &A, vector<double> const &x)
|
||||
{
|
||||
size_t const nelem = A.size();
|
||||
size_t const N = x.size();
|
||||
assert(nelem % N == 0); // make sure multiplication is possible
|
||||
size_t const M = nelem/N;
|
||||
|
||||
|
||||
vector<double> b(M);
|
||||
|
||||
cblas_dgemv(CblasRowMajor, CblasNoTrans, M, N, 1.0, A.data(), N, x.data(), 1, 0.0, b.data(), 1);
|
||||
|
||||
return b;
|
||||
}
|
||||
|
||||
|
||||
// (C) Matrix-matrix product
|
||||
vector<double> MatMat(vector<double> const &A, vector<double> const &B, size_t const &L)
|
||||
{
|
||||
size_t const nelem_A = A.size();
|
||||
size_t const nelem_B = B.size();
|
||||
|
||||
assert(nelem_A % L == 0 && nelem_B % L == 0);
|
||||
|
||||
size_t const M = nelem_A/L;
|
||||
size_t const N = nelem_B/L;
|
||||
|
||||
|
||||
vector<double> C(M*N);
|
||||
|
||||
|
||||
|
||||
for (size_t i = 0; i < M; ++i)
|
||||
{
|
||||
for (size_t k = 0; k < L; ++k)
|
||||
{
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
{
|
||||
C[N*i + j] += A[L*i + k]*B[N*k + j];
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return C;
|
||||
}
|
||||
|
||||
// (C) cBLAS matrix-matrix product
|
||||
vector<double> MatMat_cBLAS(vector<double> const &A, vector<double> const &B, size_t const &L)
|
||||
{
|
||||
size_t const nelem_A = A.size();
|
||||
size_t const nelem_B = B.size();
|
||||
|
||||
assert(nelem_A % L == 0 && nelem_B % L == 0);
|
||||
|
||||
size_t const M = nelem_A/L;
|
||||
size_t const N = nelem_B/L;
|
||||
|
||||
|
||||
vector<double> C(M*N);
|
||||
|
||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, L, 1.0, A.data(), L, B.data(), N, 0.0, C.data(), N);
|
||||
|
||||
return C;
|
||||
}
|
||||
|
||||
|
||||
// (D) Evaluation of a polynomial function
|
||||
vector<double> poly(vector<double> const &a, vector<double> const &x)
|
||||
{
|
||||
size_t const N = x.size();
|
||||
size_t const p = a.size() - 1;
|
||||
vector<double> y(N, 0);
|
||||
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
double x_temp = x[i];
|
||||
double y_temp = 0;
|
||||
for (size_t k = 0; k < p + 1; ++k)
|
||||
{
|
||||
y_temp += x_temp*y_temp + a[p - k];
|
||||
}
|
||||
y[i] = y_temp;
|
||||
}
|
||||
|
||||
return y;
|
||||
}
|
||||
|
||||
|
||||
// (E) Solves linear system of equations
|
||||
void JacobiSolve(CRS_Matrix const &SK, vector<double> const &f, vector<double> &u)
|
||||
{
|
||||
const double omega = 1.0;
|
||||
const int maxiter = 1000;
|
||||
const double tol = 1e-5, // tolerance
|
||||
tol2 = tol * tol; // tolerance^2
|
||||
|
||||
int nrows = SK.Nrows(); // number of rows == number of columns
|
||||
assert( nrows == static_cast<int>(f.size()) && f.size() == u.size() );
|
||||
|
||||
cout << endl << " Start Jacobi solver for " << nrows << " d.o.f.s" << endl;
|
||||
// Choose initial guess
|
||||
for (int k = 0; k < nrows; ++k)
|
||||
{
|
||||
u[k] = 0.0; // u := 0
|
||||
}
|
||||
|
||||
vector<double> dd(nrows); // matrix diagonal
|
||||
vector<double> r(nrows); // residual
|
||||
vector<double> w(nrows); // correction
|
||||
|
||||
SK.GetDiag(dd); // dd := diag(K)
|
||||
////DebugVector(dd);{int ijk; cin >> ijk;}
|
||||
|
||||
// Initial sweep
|
||||
SK.Defect(r, f, u); // r := f - K*u
|
||||
|
||||
vddiv(w, r, dd); // w := D^{-1}*r
|
||||
double sigma0 = dscapr(w, r); // s0 := <w,r>
|
||||
|
||||
// Iteration sweeps
|
||||
int iter = 0;
|
||||
double sigma = sigma0;
|
||||
while ( sigma > tol2 * sigma0 && maxiter > iter)
|
||||
{
|
||||
++iter;
|
||||
vdaxpy(u, u, omega, w ); // u := u + om*w
|
||||
SK.Defect(r, f, u); // r := f - K*u
|
||||
vddiv(w, r, dd); // w := D^{-1}*r
|
||||
sigma = dscapr(w, r); // s0 := <w,r>
|
||||
// cout << "Iteration " << iter << " : " << sqrt(sigma/sigma0) << endl;
|
||||
}
|
||||
cout << "aver. Jacobi rate : " << exp(log(sqrt(sigma / sigma0)) / iter) << " (" << iter << " iter)" << endl;
|
||||
cout << "final error: " << sqrt(sigma / sigma0) << " (rel) " << sqrt(sigma) << " (abs)\n";
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
89
ex3/ex3/benchmarks.h
Normal file
89
ex3/ex3/benchmarks.h
Normal file
|
|
@ -0,0 +1,89 @@
|
|||
#pragma once
|
||||
#include "getmatrix.h"
|
||||
#include <vector>
|
||||
|
||||
using namespace std;
|
||||
|
||||
/** (A) Inner product of two vectors (from skalar_stl)
|
||||
@param[in] x vector
|
||||
@param[in] y vector
|
||||
@return resulting Euclidian inner product <x,y>
|
||||
*/
|
||||
double scalar(vector<double> const &x, vector<double> const &y);
|
||||
|
||||
/** (A) 5.(b) Inner product of two vectors using the Kahan scalar product
|
||||
@param[in] x vector
|
||||
@param[in] y vector
|
||||
@return resulting Euclidian inner product <x,y>
|
||||
*/
|
||||
double Kahan_skalar(vector<double> const &x, vector<double> const &y);
|
||||
|
||||
/** (A) 6. cBLAS scalar product of two vectors
|
||||
@param[in] x vector
|
||||
@param[in] y vector
|
||||
@return resulting Euclidian inner product <x,y>
|
||||
*/
|
||||
double scalar_cBLAS(vector<double> const &x, vector<double> const &y);
|
||||
|
||||
|
||||
/** (B) Matrix-vector product (from intro_vector_densematrix)
|
||||
* @param[in] A dense matrix (1D access)
|
||||
* @param[in] u vector
|
||||
*
|
||||
* @return resulting vector
|
||||
*/
|
||||
vector<double> MatVec(vector<double> const &A, vector<double> const &x);
|
||||
|
||||
/** (B) 6. cBLAS Matrix-vector product
|
||||
* @param[in] A dense matrix (1D access)
|
||||
* @param[in] u vector
|
||||
*
|
||||
* @return resulting vector
|
||||
*/
|
||||
vector<double> MatVec_cBLAS(vector<double> const &A, vector<double> const &x);
|
||||
|
||||
|
||||
/** (C) Matrix-matrix product
|
||||
* @param[in] A MxL dense matrix (1D access)
|
||||
* @param[in] B LxN dense matrix (1D access)
|
||||
* @param[in] shared_dim shared dimension L
|
||||
*
|
||||
* @return resulting MxN matrix
|
||||
*/
|
||||
vector<double> MatMat(vector<double> const &A, vector<double> const &B, size_t const &shared_dim);
|
||||
|
||||
/** (C) 6. cBLAS Matrix-matrix product
|
||||
* @param[in] A MxL dense matrix (1D access)
|
||||
* @param[in] B LxN dense matrix (1D access)
|
||||
* @param[in] shared_dim shared dimension L
|
||||
*
|
||||
* @return resulting MxN matrix
|
||||
*/
|
||||
vector<double> MatMat_cBLAS(vector<double> const &A, vector<double> const &B, size_t const &shared_dim);
|
||||
|
||||
|
||||
/** (D) Evaluation of a polynomial function using Horner's scheme
|
||||
* @param[in] a coefficient vector
|
||||
* @param[in] x vector with input values
|
||||
*
|
||||
* @return vector with output values
|
||||
*/
|
||||
vector<double> poly(vector<double> const &a, vector<double> const &x);
|
||||
|
||||
|
||||
/** (E) Solves linear system of equations K @p u = @p f via the Jacobi iteration (from jaboci_oo_stl)
|
||||
* We use a distributed symmetric CSR matrix @p SK and initial guess of the
|
||||
* solution is set to 0.
|
||||
* @param[in] SK CSR matrix
|
||||
* @param[in] f distributed local vector storing the right hand side
|
||||
* @param[out] u accumulated local vector storing the solution.
|
||||
*/
|
||||
void JacobiSolve(CRS_Matrix const &SK, vector<double> const &f, vector<double> &u);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
39
ex3/ex3/factorization_solve.cpp
Normal file
39
ex3/ex3/factorization_solve.cpp
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
#include "factorization_solve.h"
|
||||
#include <vector>
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
#include <lapack.h>
|
||||
#include <iostream>
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
void factorization_solve(vector<double> &A, vector<double> &b, const int32_t &n_rhs)
|
||||
{
|
||||
int32_t nelem = A.size();
|
||||
int32_t N = sqrt(nelem);
|
||||
assert(N == static_cast<int32_t>(b.size())/n_rhs);
|
||||
|
||||
vector<int> IPIV(N); // pivot indices
|
||||
int info_factorization;
|
||||
|
||||
dgetrf_(&N, &N, A.data(), &N, IPIV.data(), &info_factorization);
|
||||
|
||||
const char transp = 'N';
|
||||
int info_solve;
|
||||
dgetrs_(&transp, &N, &n_rhs, A.data(), &N, IPIV.data(), b.data(), &N, &info_solve, 1); // 1 is length of parameter 'N'
|
||||
|
||||
}
|
||||
|
||||
void print_matrix(vector<double> &A, size_t M, size_t N)
|
||||
{
|
||||
for (size_t i = 0; i < M; ++i)
|
||||
{
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
{
|
||||
cout << A[N*i + j] << "\t";
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
21
ex3/ex3/factorization_solve.h
Normal file
21
ex3/ex3/factorization_solve.h
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
#pragma once
|
||||
#include <vector>
|
||||
#include <cstdint>
|
||||
using namespace std;
|
||||
|
||||
|
||||
/** Solve linear system of equations with multiple right hand sides using LU factorization
|
||||
* @param[inout] A NxN Matrix (1D access), gets modified to contain the LU decomposition of A
|
||||
* @param[inout] B N x n_rhs Matrix of right-hand-sides (1D access), gets modified to contain the solution vectors x
|
||||
* @param[in] n_rhs number of right-hand-sides b
|
||||
*
|
||||
*/
|
||||
void factorization_solve(vector<double> &A, vector<double> &b, const int32_t &n_rhs);
|
||||
|
||||
/** Print a matrix to console
|
||||
* @param[in] A MxN Matrix (1D access)
|
||||
* @param[in] M rows
|
||||
* @param[in] N columns
|
||||
*
|
||||
*/
|
||||
void print_matrix(vector<double> &A, size_t M, size_t N);
|
||||
89
ex3/ex3/factorization_solve_tests.cpp
Normal file
89
ex3/ex3/factorization_solve_tests.cpp
Normal file
|
|
@ -0,0 +1,89 @@
|
|||
#include "factorization_solve_tests.h"
|
||||
#include "factorization_solve.h"
|
||||
#include "benchmarks.h"
|
||||
#include <chrono>
|
||||
#include <iostream>
|
||||
|
||||
using namespace std::chrono;
|
||||
|
||||
void CheckCorrectness()
|
||||
{
|
||||
cout.precision(4);
|
||||
|
||||
size_t N = 5;
|
||||
size_t n_rhs = 5;
|
||||
vector<double> A(N*N);
|
||||
vector<double> b(N*n_rhs);
|
||||
|
||||
// initialize A
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
if (i == j)
|
||||
A[N*i + j] = 4.0;
|
||||
else
|
||||
A[N*i + j] = 1.0/((i - j)*(i - j));
|
||||
}
|
||||
const vector<double> A_copy = A;
|
||||
|
||||
// initialize b as identity matrix
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
{
|
||||
for (size_t l = 0; l < n_rhs; ++l)
|
||||
if (l == j)
|
||||
b[N*l + j] = 1.0;
|
||||
else
|
||||
b[N*l + j] = 0.0;
|
||||
}
|
||||
|
||||
cout << "A = " << endl;
|
||||
print_matrix(A, N, N);
|
||||
cout << "b = " << endl;
|
||||
print_matrix(b, N, n_rhs);
|
||||
|
||||
// solve system
|
||||
factorization_solve(A, b, n_rhs);
|
||||
|
||||
vector<double> check_matrix_identity = MatMat_cBLAS(A_copy, b, N);
|
||||
cout << "A*A^{-1} = " << endl;
|
||||
print_matrix(check_matrix_identity, N, n_rhs);
|
||||
}
|
||||
|
||||
|
||||
void CheckDuration(const size_t &N)
|
||||
{
|
||||
for (size_t n_rhs : {1, 2, 4, 8, 16, 32})
|
||||
{
|
||||
auto time_start = system_clock::now();
|
||||
|
||||
vector<double> A(N*N);
|
||||
vector<double> b(N*n_rhs);
|
||||
|
||||
// initialize A
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
if (i == j)
|
||||
A[N*i + j] = 4.0;
|
||||
else
|
||||
A[N*i + j] = 1.0/((i - j)*(i - j));
|
||||
}
|
||||
|
||||
// initialize b
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
for (size_t l = 0; l < n_rhs; ++l)
|
||||
b[N*l + j] = 1.0;
|
||||
|
||||
|
||||
// solve system
|
||||
factorization_solve(A, b, n_rhs);
|
||||
|
||||
|
||||
|
||||
auto time_end = system_clock::now();
|
||||
auto duration = duration_cast<microseconds>(time_end - time_start); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
|
||||
cout << "n_rhs = " << n_rhs << "\tTime per rhs: " << t_diff/n_rhs << endl;
|
||||
}
|
||||
}
|
||||
7
ex3/ex3/factorization_solve_tests.h
Normal file
7
ex3/ex3/factorization_solve_tests.h
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
#pragma once
|
||||
#include <cstddef>
|
||||
|
||||
|
||||
void CheckCorrectness();
|
||||
|
||||
void CheckDuration(const size_t &N);
|
||||
522
ex3/ex3/geom.cpp
Normal file
522
ex3/ex3/geom.cpp
Normal file
|
|
@ -0,0 +1,522 @@
|
|||
// see: http://llvm.org/docs/CodingStandards.html#include-style
|
||||
#include "geom.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
Mesh::Mesh(int ndim, int nvert_e, int ndof_e)
|
||||
: _nelem(0), _nvert_e(nvert_e), _ndof_e(ndof_e), _nnode(0), _ndim(ndim), _ia(0), _xc(0)
|
||||
{
|
||||
}
|
||||
|
||||
Mesh::~Mesh()
|
||||
{}
|
||||
|
||||
void Mesh::SetValues(std::vector<double> &v, const std::function<double(double, double)> &func) const
|
||||
{
|
||||
int const nnode = Nnodes(); // number of vertices in mesh
|
||||
assert( nnode == static_cast<int>(v.size()) );
|
||||
for (int k = 0; k < nnode; ++k)
|
||||
{
|
||||
v[k] = func( _xc[2 * k], _xc[2 * k + 1] );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void Mesh::Debug() const
|
||||
{
|
||||
cout << "\n ############### Debug M E S H ###################\n";
|
||||
cout << "\n ............... Coordinates ...................\n";
|
||||
for (int k = 0; k < _nnode; ++k)
|
||||
{
|
||||
cout << k << " : " ;
|
||||
for (int i = 0; i < _ndof_e; ++i )
|
||||
{
|
||||
cout << _xc[2*k+i] << " ";
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
cout << "\n ............... Elements ...................\n";
|
||||
for (int k = 0; k < _nelem; ++k)
|
||||
{
|
||||
cout << k << " : ";
|
||||
for (int i = 0; i < _ndof_e; ++i )
|
||||
cout << _ia[_ndof_e * k + i] << " ";
|
||||
cout << endl;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void Mesh::Write_ascii_matlab(std::string const &fname, std::vector<double> const &v) const
|
||||
{
|
||||
assert(Nnodes() == static_cast<int>(v.size())); // fits vector length to mesh information?
|
||||
|
||||
ofstream fout(fname); // open file ASCII mode
|
||||
if ( !fout.is_open() )
|
||||
{
|
||||
cout << "\nFile " << fname << " has not been opened.\n\n" ;
|
||||
assert( fout.is_open() && "File not opened." );
|
||||
}
|
||||
|
||||
string const DELIMETER(" "); // define the same delimeter as in matlab/ascii_read*.m
|
||||
int const OFFSET(1); // convert C-indexing to matlab
|
||||
|
||||
// Write data: #nodes, #space dimensions, #elements, #vertices per element
|
||||
fout << Nnodes() << DELIMETER << Ndims() << DELIMETER << Nelems() << DELIMETER << NverticesElements() << endl;
|
||||
|
||||
// Write cordinates: x_k, y_k in seperate lines
|
||||
assert( Nnodes()*Ndims() == static_cast<int>(_xc.size()));
|
||||
for (int k = 0, kj = 0; k < Nnodes(); ++k)
|
||||
{
|
||||
for (int j = 0; j < Ndims(); ++j, ++kj)
|
||||
{
|
||||
fout << _xc[kj] << DELIMETER;
|
||||
}
|
||||
fout << endl;
|
||||
}
|
||||
|
||||
// Write connectivity: ia_k,0, ia_k,1 etc in seperate lines
|
||||
assert( Nelems()*NverticesElements() == static_cast<int>(_ia.size()));
|
||||
for (int k = 0, kj = 0; k < Nelems(); ++k)
|
||||
{
|
||||
for (int j = 0; j < NverticesElements(); ++j, ++kj)
|
||||
{
|
||||
fout << _ia[kj] + OFFSET << DELIMETER; // C to matlab
|
||||
}
|
||||
fout << endl;
|
||||
}
|
||||
|
||||
// Write vector
|
||||
for (int k = 0; k < Nnodes(); ++k)
|
||||
{
|
||||
fout << v[k] << endl;
|
||||
}
|
||||
|
||||
fout.close();
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void Mesh::Visualize(std::vector<double> const &v) const
|
||||
{
|
||||
// define external command
|
||||
const string exec_m("matlab -nosplash < visualize_results.m"); // Matlab
|
||||
//const string exec_m("octave --no-window-system --no-gui visualize_results.m"); // Octave
|
||||
//const string exec_m("flatpak run org.octave.Octave visualize_results.m"); // Octave (flatpak): desktop GH
|
||||
|
||||
const string fname("uv.txt");
|
||||
Write_ascii_matlab(fname, v);
|
||||
|
||||
int ierror = system(exec_m.c_str()); // call external command
|
||||
|
||||
if (ierror != 0)
|
||||
{
|
||||
cout << endl << "Check path to Matlab/octave on your system" << endl;
|
||||
}
|
||||
cout << endl;
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// #####################################################################
|
||||
Mesh_2d_3_square::Mesh_2d_3_square(int nx, int ny, int myid, int procx, int procy)
|
||||
: Mesh(2, 3, 3), // two dimensions, 3 vertices, 3 dofs
|
||||
_myid(myid), _procx(procx), _procy(procy), _neigh{{-1, -1, -1, -1}}, _color(0),
|
||||
_xl(0.0), _xr(1.0), _yb(0.0), _yt(1.0), _nx(nx), _ny(ny)
|
||||
{
|
||||
//void IniGeom(int const myid, int const procx, int const procy, int neigh[], int &color)
|
||||
int const ky = _myid / _procx;
|
||||
int const kx = _myid % _procy; // MOD(myid,procx)
|
||||
// Determine the neighbors of domain/rank myid
|
||||
_neigh[0] = (ky == 0) ? -1 : _myid - _procx; // South
|
||||
_neigh[1] = (kx == _procx - 1) ? -1 : _myid + 1; // East
|
||||
_neigh[2] = (ky == _procy - 1) ? -1 : _myid + _procx; // North
|
||||
_neigh[3] = (kx == 0) ? -1 : _myid - 1; // West
|
||||
|
||||
_color = (kx + ky) & 1 ;
|
||||
|
||||
// subdomain is part of unit square
|
||||
double const hx = 1. / _procx;
|
||||
double const hy = 1. / _procy;
|
||||
_xl = kx * hx; // left
|
||||
_xr = (kx + 1) * hx; // right
|
||||
_yb = ky * hy; // bottom
|
||||
_yt = (ky + 1) * hy; // top
|
||||
|
||||
// Calculate coordinates
|
||||
int const nnode = (_nx + 1) * (_ny + 1); // number of nodes
|
||||
Resize_Coords(nnode, 2); // coordinates in 2D [nnode][ndim]
|
||||
GetCoordsInRectangle(_nx, _ny, _xl, _xr, _yb, _yt, GetCoords().data());
|
||||
|
||||
// Calculate element connectivity (linear triangles)
|
||||
int const nelem = 2 * _nx * _ny; // number of elements
|
||||
Resize_Connectivity(nelem, 3); // connectivity matrix [nelem][3]
|
||||
GetConnectivityInRectangle(_nx, _ny, GetConnectivity().data());
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void Mesh_2d_3_square::SetU(std::vector<double> &u) const
|
||||
{
|
||||
int dx = _nx + 1;
|
||||
for (int j = 0; j <= _ny; ++j)
|
||||
{
|
||||
int k = j * dx;
|
||||
for (int i = 0; i <= _nx; ++i, ++k)
|
||||
{
|
||||
u[k] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void Mesh_2d_3_square::SetF(std::vector<double> &f) const
|
||||
{
|
||||
int dx = _nx + 1;
|
||||
for (int j = 0; j <= _ny; ++j)
|
||||
{
|
||||
int k = j * dx;
|
||||
for (int i = 0; i <= _nx; ++i, ++k)
|
||||
{
|
||||
f[k] = 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
std::vector<int> Mesh_2d_3_square::Index_DirichletNodes() const
|
||||
{
|
||||
int const dx = 1,
|
||||
dy = _nx + 1,
|
||||
bl = 0,
|
||||
br = _nx,
|
||||
tl = _ny * (_nx + 1),
|
||||
tr = (_ny + 1) * (_nx + 1) - 1;
|
||||
int const start[4] = { bl, br, tl, bl},
|
||||
end[4] = { br, tr, tr, tl},
|
||||
step[4] = { dx, dy, dx, dy};
|
||||
|
||||
vector<int> idx(0);
|
||||
for (int j = 0; j < 4; j++)
|
||||
{
|
||||
if (_neigh[j] < 0)
|
||||
{
|
||||
for (int i = start[j]; i <= end[j]; i += step[j])
|
||||
{
|
||||
idx.push_back(i); // node i is Dirichlet node
|
||||
}
|
||||
}
|
||||
}
|
||||
// remove multiple elements
|
||||
sort(idx.begin(), idx.end()); // sort
|
||||
idx.erase( unique(idx.begin(), idx.end()), idx.end() ); // remove duplicate data
|
||||
|
||||
return idx;
|
||||
}
|
||||
|
||||
void Mesh_2d_3_square::SaveVectorP(std::string const &name, vector<double> const &u) const
|
||||
{
|
||||
// construct the file name for subdomain myid
|
||||
const string tmp( std::to_string(_myid / 100) + to_string((_myid % 100) / 10) + to_string(_myid % 10) );
|
||||
|
||||
const string namep = name + "." + tmp;
|
||||
ofstream ff(namep.c_str());
|
||||
ff.precision(6);
|
||||
ff.setf(ios::fixed, ios::floatfield);
|
||||
|
||||
// assumes tensor product grid in unit square; rowise numbered (as generated in class constructor)
|
||||
// output is provided for tensor product grid visualization ( similar to Matlab-surf() )
|
||||
auto const &xc = GetCoords();
|
||||
int k = 0;
|
||||
for (int j = 0; j <= _ny; ++j)
|
||||
{
|
||||
for (int i = 0; i <= _nx; ++i, ++k)
|
||||
ff << xc[2 * k + 0] << " " << xc[2 * k + 1] << " " << u[k] << endl;
|
||||
ff << endl;
|
||||
}
|
||||
|
||||
ff.close();
|
||||
return;
|
||||
}
|
||||
|
||||
void Mesh_2d_3_square::GetCoordsInRectangle(int const nx, int const ny,
|
||||
double const xl, double const xr, double const yb, double const yt,
|
||||
double xc[])
|
||||
{
|
||||
const double hx = (xr - xl) / nx,
|
||||
hy = (yt - yb) / ny;
|
||||
|
||||
int k = 0;
|
||||
for (int j = 0; j <= ny; ++j)
|
||||
{
|
||||
const double y0 = yb + j * hy;
|
||||
for (int i = 0; i <= nx; ++i, k += 2)
|
||||
{
|
||||
xc[k ] = xl + i * hx;
|
||||
xc[k + 1] = y0;
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void Mesh_2d_3_square::GetConnectivityInRectangle(int const nx, int const ny, int ia[])
|
||||
{
|
||||
const int dx = nx + 1;
|
||||
int k = 0;
|
||||
int l = 0;
|
||||
for (int j = 0; j < ny; ++j, ++k)
|
||||
{
|
||||
for (int i = 0; i < nx; ++i, ++k)
|
||||
{
|
||||
ia[l ] = k;
|
||||
ia[l + 1] = k + 1;
|
||||
ia[l + 2] = k + dx + 1;
|
||||
l += 3;
|
||||
ia[l ] = k;
|
||||
ia[l + 1] = k + dx;
|
||||
ia[l + 2] = k + dx + 1;
|
||||
l += 3;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// #################### still some old code (--> MPI) ############################
|
||||
|
||||
|
||||
// Copies the values of w corresponding to the boundary
|
||||
// South (ib==1), East (ib==2), North (ib==3), West (ib==4)
|
||||
|
||||
void GetBound(int const ib, int const nx, int const ny, double const w[], double s[])
|
||||
{
|
||||
const int //dx = 1,
|
||||
dy = nx + 1,
|
||||
bl = 0,
|
||||
br = nx,
|
||||
tl = ny * (nx + 1),
|
||||
tr = (ny + 1) * (nx + 1) - 1;
|
||||
switch (ib)
|
||||
{
|
||||
case 1:
|
||||
{
|
||||
for (int i = bl, j = 0; i <= br; ++i, ++j)
|
||||
s[j] = w[i];
|
||||
break;
|
||||
}
|
||||
case 3:
|
||||
{
|
||||
for (int i = tl, j = 0; i <= tr; ++i, ++j)
|
||||
s[j] = w[i];
|
||||
break;
|
||||
}
|
||||
case 4:
|
||||
{
|
||||
for (int i = bl, j = 0; i <= tl; i += dy, ++j)
|
||||
s[j] = w[i];
|
||||
break;
|
||||
}
|
||||
case 2:
|
||||
{
|
||||
for (int i = br, j = 0; i <= tr; i += dy, ++j)
|
||||
s[j] = w[i];
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
cout << endl << "Wrong parameter ib in " << __FILE__ << ":" << __LINE__ << endl;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------------------------------------
|
||||
// Computes w: = w + s at nodes on the boundary
|
||||
// South (ib == 1), East (ib == 2), North (ib == 3), West (ib == 4)
|
||||
|
||||
void AddBound(int const ib, int const nx, int const ny, double w[], double const s[])
|
||||
{
|
||||
int const dy = nx + 1,
|
||||
bl = 0,
|
||||
br = nx,
|
||||
tl = ny * (nx + 1),
|
||||
tr = (ny + 1) * (nx + 1) - 1;
|
||||
switch (ib)
|
||||
{
|
||||
case 1:
|
||||
{
|
||||
for (int i = bl, j = 0; i <= br; ++i, ++j)
|
||||
w[i] += s[j];
|
||||
break;
|
||||
}
|
||||
case 3:
|
||||
{
|
||||
for (int i = tl, j = 0; i <= tr; ++i, ++j)
|
||||
w[i] += s[j];
|
||||
break;
|
||||
}
|
||||
case 4:
|
||||
{
|
||||
for (int i = bl, j = 0; i <= tl; i += dy, ++j)
|
||||
w[i] += s[j];
|
||||
break;
|
||||
}
|
||||
case 2:
|
||||
{
|
||||
for (int i = br, j = 0; i <= tr; i += dy, ++j)
|
||||
w[i] += s[j];
|
||||
break;
|
||||
}
|
||||
default:
|
||||
{
|
||||
cout << endl << "Wrong parameter ib in " << __FILE__ << ":" << __LINE__ << endl;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// ####################################################################
|
||||
|
||||
Mesh_2d_3_matlab::Mesh_2d_3_matlab(string const &fname)
|
||||
: Mesh(2, 3, 3), // two dimensions, 3 vertices, 3 dofs
|
||||
bedges(0)
|
||||
{
|
||||
ifstream ifs(fname);
|
||||
if (!(ifs.is_open() && ifs.good()))
|
||||
{
|
||||
cerr << "Mesh_2d_3_matlab: Error cannot open file " << fname << endl;
|
||||
assert(ifs.is_open());
|
||||
}
|
||||
|
||||
int const OFFSET(1); // Matlab to C indexing
|
||||
cout << "ASCI file " << fname << " opened" << endl;
|
||||
|
||||
// Read some mesh constants
|
||||
int nnode, ndim, nelem, nvert_e;
|
||||
ifs >> nnode >> ndim >> nelem >> nvert_e;
|
||||
cout << nnode << " " << ndim << " " << nelem << " " << nvert_e << endl;
|
||||
assert(ndim == 2 && nvert_e == 3);
|
||||
|
||||
// Allocate memory
|
||||
Resize_Coords(nnode, ndim); // coordinates in 2D [nnode][ndim]
|
||||
Resize_Connectivity(nelem, nvert_e); // connectivity matrix [nelem][nvert]
|
||||
|
||||
// Read ccordinates
|
||||
auto &xc = GetCoords();
|
||||
for (int k = 0; k < nnode * ndim; ++k)
|
||||
{
|
||||
ifs >> xc[k];
|
||||
}
|
||||
|
||||
// Read connectivity
|
||||
auto &ia = GetConnectivity();
|
||||
for (int k = 0; k < nelem * nvert_e; ++k)
|
||||
{
|
||||
ifs >> ia[k];
|
||||
ia[k] -= OFFSET; // Matlab to C indexing
|
||||
}
|
||||
|
||||
// additional read of boundary information (only start/end point)
|
||||
int nbedges;
|
||||
ifs >> nbedges;
|
||||
|
||||
bedges.resize(nbedges * 2);
|
||||
for (int k = 0; k < nbedges * 2; ++k)
|
||||
{
|
||||
ifs >> bedges[k];
|
||||
bedges[k] -= OFFSET; // Matlab to C indexing
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// binary
|
||||
//{
|
||||
//ifstream ifs(fname, ios_base::in | ios_base::binary);
|
||||
//if(!(ifs.is_open() && ifs.good()))
|
||||
//{
|
||||
//cerr << "ReadBinMatrix: Error cannot open file " << file << endl;
|
||||
//assert(ifs.is_open());
|
||||
//}
|
||||
//cout << "ReadBinMatrix: file opened" << file << endl;
|
||||
|
||||
|
||||
//}
|
||||
|
||||
// binaryIO.cpp
|
||||
//void read_binMatrix(const string& file, vector<int> &cnt, vector<int> &col, vector<double> &ele)
|
||||
//{
|
||||
|
||||
//ifstream ifs(file, ios_base::in | ios_base::binary);
|
||||
|
||||
//if(!(ifs.is_open() && ifs.good()))
|
||||
//{
|
||||
//cerr << "ReadBinMatrix: Error cannot open file " << file << endl;
|
||||
//assert(ifs.is_open());
|
||||
//}
|
||||
//cout << "ReadBinMatrix: Opened file " << file << endl;
|
||||
|
||||
//int _size;
|
||||
|
||||
//ifs.read(reinterpret_cast<char*>(&_size), sizeof(int)); // old: ifs.read((char*)&_size, sizeof(int));
|
||||
//cnt.resize(_size);
|
||||
//cout << "ReadBinMatrix: cnt size: " << _size << endl;
|
||||
|
||||
//ifs.read((char*)&_size, sizeof(int));
|
||||
//col.resize(_size);
|
||||
//cout << "ReadBinMatrix: col size: " << _size << endl;
|
||||
|
||||
//ifs.read((char*)&_size, sizeof(int));
|
||||
//ele.resize(_size);
|
||||
//cout << "ReadBinMatrix: ele size: " << _size << endl;
|
||||
|
||||
|
||||
//ifs.read((char*)cnt.data(), cnt.size() * sizeof(int));
|
||||
//ifs.read((char*)col.data(), col.size() * sizeof(int));
|
||||
//ifs.read((char*)ele.data(), ele.size() * sizeof(double));
|
||||
|
||||
//ifs.close();
|
||||
//cout << "ReadBinMatrix: Finished reading matrix.." << endl;
|
||||
|
||||
//}
|
||||
|
||||
|
||||
std::vector<int> Mesh_2d_3_matlab::Index_DirichletNodes() const
|
||||
{
|
||||
vector<int> idx(bedges); // copy
|
||||
|
||||
sort(idx.begin(), idx.end()); // sort
|
||||
idx.erase( unique(idx.begin(), idx.end()), idx.end() ); // remove duplicate data
|
||||
|
||||
return idx;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
381
ex3/ex3/geom.h
Normal file
381
ex3/ex3/geom.h
Normal file
|
|
@ -0,0 +1,381 @@
|
|||
#ifndef GEOM_FILE
|
||||
#define GEOM_FILE
|
||||
#include <array>
|
||||
#include <functional> // function; C++11
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
/**
|
||||
* Basis class for finite element meshes.
|
||||
*/
|
||||
class Mesh
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Constructor initializing the members with default values.
|
||||
*
|
||||
* @param[in] ndim space dimensions (dimension for coordinates)
|
||||
* @param[in] nvert_e number of vertices per element (dimension for connectivity)
|
||||
* @param[in] ndof_e degrees of freedom per element (= @p nvert_e for linear elements)
|
||||
*/
|
||||
explicit Mesh(int ndim, int nvert_e = 0, int ndof_e = 0);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*
|
||||
* See clang warning on
|
||||
* <a href="https://stackoverflow.com/questions/28786473/clang-no-out-of-line-virtual-method-definitions-pure-abstract-c-class/40550578">weak-vtables</a>.
|
||||
*/
|
||||
virtual ~Mesh();
|
||||
|
||||
/**
|
||||
* Number of finite elements in (sub)domain.
|
||||
* @return number of elements.
|
||||
*/
|
||||
int Nelems() const
|
||||
{
|
||||
return _nelem;
|
||||
}
|
||||
|
||||
/**
|
||||
* Global number of vertices for each finite element.
|
||||
* @return number of vertices per element.
|
||||
*/
|
||||
int NverticesElements() const
|
||||
{
|
||||
return _nvert_e;
|
||||
}
|
||||
|
||||
/**
|
||||
* Global number of degrees of freedom (dof) for each finite element.
|
||||
* @return degrees of freedom per element.
|
||||
*/
|
||||
int NdofsElement() const
|
||||
{
|
||||
return _ndof_e;
|
||||
}
|
||||
|
||||
/**
|
||||
* Number of vertices in mesh.
|
||||
* @return number of vertices.
|
||||
*/
|
||||
int Nnodes() const
|
||||
{
|
||||
return _nnode;
|
||||
}
|
||||
|
||||
/**
|
||||
* Space dimension.
|
||||
* @return number of dimensions.
|
||||
*/
|
||||
int Ndims() const
|
||||
{
|
||||
return _ndim;
|
||||
}
|
||||
|
||||
/**
|
||||
* (Re-)Allocates memory for the element connectivity and redefines the appropriate dimensions.
|
||||
*
|
||||
* @param[in] nelem number of elements
|
||||
* @param[in] nvert_e number of vertices per element
|
||||
*/
|
||||
void Resize_Connectivity(int nelem, int nvert_e)
|
||||
{
|
||||
SetNelem(nelem); // number of elements
|
||||
SetNverticesElement(nvert_e); // vertices per element
|
||||
_ia.resize(nelem * nvert_e);
|
||||
}
|
||||
|
||||
/**
|
||||
* Read connectivity information (g1,g2,g3)_i.
|
||||
* @return convectivity vector [nelems*ndofs].
|
||||
*/
|
||||
const std::vector<int> &GetConnectivity() const
|
||||
{
|
||||
return _ia;
|
||||
}
|
||||
|
||||
/**
|
||||
* Access/Change connectivity information (g1,g2,g3)_i.
|
||||
* @return convectivity vector [nelems*ndofs].
|
||||
*/
|
||||
std::vector<int> &GetConnectivity()
|
||||
{
|
||||
return _ia;
|
||||
}
|
||||
|
||||
/**
|
||||
* (Re-)Allocates memory for the element connectivity and redefines the appropriate dimensions.
|
||||
*
|
||||
* @param[in] nnodes number of nodes
|
||||
* @param[in] ndim space dimension
|
||||
*/
|
||||
void Resize_Coords(int nnodes, int ndim)
|
||||
{
|
||||
SetNnode(nnodes); // number of nodes
|
||||
SetNdim(ndim); // space dimension
|
||||
_xc.resize(nnodes * ndim);
|
||||
}
|
||||
|
||||
/**
|
||||
* Read coordinates of vertices (x,y)_i.
|
||||
* @return coordinates vector [nnodes*2].
|
||||
*/
|
||||
const std::vector<double> &GetCoords() const
|
||||
{
|
||||
return _xc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Access/Change coordinates of vertices (x,y)_i.
|
||||
* @return coordinates vector [nnodes*2].
|
||||
*/
|
||||
std::vector<double> &GetCoords()
|
||||
{
|
||||
return _xc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate values in vector @p v via function @p func(x,y)
|
||||
* @param[in] v vector
|
||||
* @param[in] func function of (x,y) returning a double value.
|
||||
*/
|
||||
void SetValues(std::vector<double> &v, const std::function<double(double, double)> &func) const;
|
||||
|
||||
/**
|
||||
* Prints the information for a finite element mesh
|
||||
*/
|
||||
void Debug() const;
|
||||
|
||||
/**
|
||||
* Determines the indices of those vertices with Dirichlet boundary conditions
|
||||
* @return index vector.
|
||||
*/
|
||||
virtual std::vector<int> Index_DirichletNodes() const = 0;
|
||||
|
||||
/**
|
||||
* Write vector @p v toghether with its mesh information to an ASCii file @p fname.
|
||||
*
|
||||
* The data are written in C-style.
|
||||
*
|
||||
* @param[in] fname file name
|
||||
* @param[in] v vector
|
||||
*/
|
||||
void Write_ascii_matlab(std::string const &fname, std::vector<double> const &v) const;
|
||||
|
||||
/**
|
||||
* Visualize @p v together with its mesh information via matlab or octave.
|
||||
*
|
||||
* Comment/uncomment those code lines in method Mesh:Visualize (geom.cpp)
|
||||
* that are supported on your system.
|
||||
*
|
||||
* @param[in] v vector
|
||||
*
|
||||
* @warning matlab files ascii_read_meshvector.m visualize_results.m
|
||||
* must be in the executing directory.
|
||||
*/
|
||||
void Visualize(std::vector<double> const &v) const;
|
||||
|
||||
|
||||
protected:
|
||||
void SetNelem(int nelem)
|
||||
{
|
||||
_nelem = nelem;
|
||||
}
|
||||
|
||||
void SetNverticesElement(int nvert)
|
||||
{
|
||||
_nvert_e = nvert;
|
||||
}
|
||||
|
||||
void SetNdofsElement(int ndof)
|
||||
{
|
||||
_ndof_e = ndof;
|
||||
}
|
||||
|
||||
void SetNnode(int nnode)
|
||||
{
|
||||
_nnode = nnode;
|
||||
}
|
||||
|
||||
void SetNdim(int ndim)
|
||||
{
|
||||
_ndim = ndim;
|
||||
}
|
||||
|
||||
private:
|
||||
int _nelem; //!< number elements
|
||||
int _nvert_e; //!< number of vertices per element
|
||||
int _ndof_e; //!< degrees of freedom (d.o.f.) per element
|
||||
int _nnode; //!< number nodes/vertices
|
||||
int _ndim; //!< space dimension of the problem (1, 2, or 3)
|
||||
std::vector<int> _ia; //!< element connectivity
|
||||
std::vector<double> _xc; //!< coordinates
|
||||
};
|
||||
|
||||
/**
|
||||
* 2D finite element mesh of the square consiting of linear triangular elements.
|
||||
*/
|
||||
class Mesh_2d_3_square: public Mesh
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Generates the f.e. mesh for the unit square.
|
||||
*
|
||||
* @param[in] nx number of discretization intervals in x-direction
|
||||
* @param[in] ny number of discretization intervals in y-direction
|
||||
* @param[in] myid my MPI-rank / subdomain
|
||||
* @param[in] procx number of ranks/subdomains in x-direction
|
||||
* @param[in] procy number of processes in y-direction
|
||||
*/
|
||||
Mesh_2d_3_square(int nx, int ny, int myid = 0, int procx = 1, int procy = 1);
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
~Mesh_2d_3_square() override
|
||||
{}
|
||||
|
||||
/**
|
||||
* Set solution vector based on a tensor product grid in the rectangle.
|
||||
* @param[in] u solution vector
|
||||
*/
|
||||
void SetU(std::vector<double> &u) const;
|
||||
|
||||
/**
|
||||
* Set right hand side (rhs) vector on a tensor product grid in the rectangle.
|
||||
* @param[in] f rhs vector
|
||||
*/
|
||||
void SetF(std::vector<double> &f) const;
|
||||
|
||||
/**
|
||||
* Determines the indices of those vertices with Dirichlet boundary conditions
|
||||
* @return index vector.
|
||||
*/
|
||||
std::vector<int> Index_DirichletNodes() const override;
|
||||
|
||||
/**
|
||||
* Stores the values of vector @p u of (sub)domain into a file @p name for further processing in gnuplot.
|
||||
* The file stores rowise the x- and y- coordinates together with the value from @p u .
|
||||
* The domain [@p xl, @p xr] x [@p yb, @p yt] is discretized into @p nx x @p ny intervals.
|
||||
*
|
||||
* @param[in] name basename of file name (file name will be extended by the rank number)
|
||||
* @param[in] u local vector
|
||||
*
|
||||
* @warning Assumes tensor product grid in unit square; rowise numbered
|
||||
* (as generated in class constructor).
|
||||
* The output is provided for tensor product grid visualization
|
||||
* ( similar to Matlab-surf() ).
|
||||
*
|
||||
* @see Mesh_2d_3_square
|
||||
*/
|
||||
void SaveVectorP(std::string const &name, std::vector<double> const &u) const;
|
||||
|
||||
// here will still need to implement in the class
|
||||
// GetBound(), AddBound()
|
||||
// or better a generalized way with indices and their appropriate ranks for MPI communication
|
||||
|
||||
private:
|
||||
/**
|
||||
* Determines the coordinates of the dicretization nodes of the domain [@p xl, @p xr] x [@p yb, @p yt]
|
||||
* which is discretized into @p nx x @p ny intervals.
|
||||
*
|
||||
* @param[in] ny number of discretization intervals in y-direction
|
||||
* @param[in] xl x-coordinate of left boundary
|
||||
* @param[in] xr x-coordinate of right boundary
|
||||
* @param[in] yb y-coordinate of lower boundary
|
||||
* @param[in] yt y-coordinate of upper boundary
|
||||
* @param[out] xc coordinate vector of length 2n with x(2*k,2*k+1) as coodinates of node k
|
||||
*/
|
||||
|
||||
void GetCoordsInRectangle(int nx, int ny, double xl, double xr, double yb, double yt,
|
||||
double xc[]);
|
||||
/**
|
||||
* Determines the element connectivity of linear triangular elements of a FEM discretization
|
||||
* of a rectangle using @p nx x @p ny equidistant intervals for discretization.
|
||||
* @param[in] nx number of discretization intervals in x-direction
|
||||
* @param[in] ny number of discretization intervals in y-direction
|
||||
* @param[out] ia element connectivity matrix with ia(3*s,3*s+1,3*s+2) as node numbers od element s
|
||||
*/
|
||||
void GetConnectivityInRectangle(int nx, int ny, int ia[]);
|
||||
|
||||
private:
|
||||
int _myid; //!< my MPI rank
|
||||
int _procx; //!< number of MPI ranks in x-direction
|
||||
int _procy; //!< number of MPI ranks in y-direction
|
||||
std::array<int, 4> _neigh; //!< MPI ranks of neighbors (negative: no neighbor but b.c.)
|
||||
int _color; //!< red/black coloring (checker board) of subdomains
|
||||
|
||||
double _xl; //!< x coordinate of lower left corner of square
|
||||
double _xr; //!< x coordinate of lower right corner of square
|
||||
double _yb; //!< y coordinate or lower left corner of square
|
||||
double _yt; //!< y coordinate of upper right corner of square
|
||||
int _nx; //!< number of intervals in x-direction
|
||||
int _ny; //!< number of intervals in y-direction
|
||||
};
|
||||
|
||||
// #################### still some old code (--> MPI) ############################
|
||||
/**
|
||||
* Copies the values of @p w corresponding to boundary @p ib
|
||||
* onto vector s. South (ib==1), East (ib==2), North (ib==3), West (ib==4).
|
||||
* The vector @p s has to be long enough!!
|
||||
* @param[in] ib my local boundary
|
||||
* @param[in] nx number of discretization intervals in x-direction
|
||||
* @param[in] ny number of discretization intervals in y-direction
|
||||
* @param[in] w vector for all nodes of local discretization
|
||||
* @param[out] s short vector with values on boundary @p ib
|
||||
*/
|
||||
// GH_NOTE: Absicherung bei s !!
|
||||
void GetBound(int ib, int nx, int ny, double const w[], double s[]);
|
||||
|
||||
|
||||
/**
|
||||
* Computes @p w := @p w + @p s at the interface/boundary nodes on the
|
||||
* boundary @p ib . South (ib==1), East (ib==2), North (ib==3), West (ib==4)
|
||||
* @param[in] ib my local boundary
|
||||
* @param[in] nx number of discretization intervals in x-direction
|
||||
* @param[in] ny number of discretization intervals in y-direction
|
||||
* @param[in,out] w vector for all nodes of local discretization
|
||||
* @param[in] s short vector with values on boundary @p ib
|
||||
*/
|
||||
void AddBound(int ib, int nx, int ny, double w[], double const s[]);
|
||||
|
||||
// #################### Mesh from Matlab ############################
|
||||
/**
|
||||
* 2D finite element mesh of the square consiting of linear triangular elements.
|
||||
*/
|
||||
class Mesh_2d_3_matlab: public Mesh
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Reads mesh data from a binary file.
|
||||
*
|
||||
* File format, see ascii_write_mesh.m
|
||||
*
|
||||
* @param[in] fname file name
|
||||
*/
|
||||
explicit Mesh_2d_3_matlab(std::string const &fname);
|
||||
|
||||
/**
|
||||
* Determines the indices of those vertices with Dirichlet boundary conditions.
|
||||
* @return index vector.
|
||||
*
|
||||
* @warning All boundary nodes are considered as Dirchlet nodes.
|
||||
*/
|
||||
std::vector<int> Index_DirichletNodes() const override;
|
||||
|
||||
private:
|
||||
/**
|
||||
* Determines the indices of those vertices with Dirichlet boundary conditions
|
||||
* @return index vector.
|
||||
*/
|
||||
int Nnbedges() const
|
||||
{
|
||||
return static_cast<int>(bedges.size());
|
||||
}
|
||||
|
||||
std::vector<int> bedges; //!< boundary edges [nbedges][2] storing start/end vertex
|
||||
|
||||
};
|
||||
|
||||
#endif
|
||||
347
ex3/ex3/getmatrix.cpp
Normal file
347
ex3/ex3/getmatrix.cpp
Normal file
|
|
@ -0,0 +1,347 @@
|
|||
#include "getmatrix.h"
|
||||
#include "userset.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
|
||||
// general routine for lin. triangular elements
|
||||
|
||||
void CalcElem(int const ial[3], double const xc[], double ske[3][3], double fe[3])
|
||||
//void CalcElem(const int* __restrict__ ial, const double* __restrict__ xc, double* __restrict__ ske[3], double* __restrict__ fe)
|
||||
{
|
||||
const int i1 = 2 * ial[0], i2 = 2 * ial[1], i3 = 2 * ial[2];
|
||||
const double x13 = xc[i3 + 0] - xc[i1 + 0], y13 = xc[i3 + 1] - xc[i1 + 1],
|
||||
x21 = xc[i1 + 0] - xc[i2 + 0], y21 = xc[i1 + 1] - xc[i2 + 1],
|
||||
x32 = xc[i2 + 0] - xc[i3 + 0], y32 = xc[i2 + 1] - xc[i3 + 1];
|
||||
const double jac = fabs(x21 * y13 - x13 * y21);
|
||||
|
||||
ske[0][0] = 0.5 / jac * (y32 * y32 + x32 * x32);
|
||||
ske[0][1] = 0.5 / jac * (y13 * y32 + x13 * x32);
|
||||
ske[0][2] = 0.5 / jac * (y21 * y32 + x21 * x32);
|
||||
ske[1][0] = ske[0][1];
|
||||
ske[1][1] = 0.5 / jac * (y13 * y13 + x13 * x13);
|
||||
ske[1][2] = 0.5 / jac * (y21 * y13 + x21 * x13);
|
||||
ske[2][0] = ske[0][2];
|
||||
ske[2][1] = ske[1][2];
|
||||
ske[2][2] = 0.5 / jac * (y21 * y21 + x21 * x21);
|
||||
|
||||
const double xm = (xc[i1 + 0] + xc[i2 + 0] + xc[i3 + 0]) / 3.0,
|
||||
ym = (xc[i1 + 1] + xc[i2 + 1] + xc[i3 + 1]) / 3.0;
|
||||
//fe[0] = fe[1] = fe[2] = 0.5 * jac * FunctF(xm, ym) / 3.0;
|
||||
fe[0] = fe[1] = fe[2] = 0.5 * jac * fNice(xm, ym) / 3.0;
|
||||
}
|
||||
|
||||
|
||||
// general routine for lin. triangular elements,
|
||||
// non-symm. matrix
|
||||
// node numbering in element: a s c e n d i n g indices !!
|
||||
// GH: deprecated
|
||||
void AddElem(int const ial[3], double const ske[3][3], double const fe[3],
|
||||
int const id[], int const ik[], double sk[], double f[])
|
||||
{
|
||||
for (int i = 0; i < 3; ++i)
|
||||
{
|
||||
const int ii = ial[i], // row ii (global index)
|
||||
id1 = id[ii], // start and
|
||||
id2 = id[ii + 1]; // end of row ii in matrix
|
||||
int ip = id1;
|
||||
for (int j = 0; j < 3; ++j) // no symmetry assumed
|
||||
{
|
||||
const int jj = ial[j];
|
||||
bool not_found = true;
|
||||
do // find entry jj (global index) in row ii
|
||||
{
|
||||
not_found = (ik[ip] != jj);
|
||||
++ip;
|
||||
}
|
||||
while (not_found && ip < id2);
|
||||
|
||||
#ifndef NDEBUG // compiler option -DNDEBUG switches off the check
|
||||
if (not_found) // no entry found !!
|
||||
{
|
||||
cout << "Error in AddElem: (" << ii << "," << jj << ") ["
|
||||
<< ial[0] << "," << ial[1] << "," << ial[2] << "]\n";
|
||||
assert(!not_found);
|
||||
}
|
||||
#endif
|
||||
sk[ip - 1] += ske[i][j];
|
||||
}
|
||||
f[ii] += fe[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
// ####################################################################
|
||||
|
||||
CRS_Matrix::CRS_Matrix(Mesh const &mesh)
|
||||
: _mesh(mesh), _nrows(0), _nnz(0), _id(0), _ik(0), _sk(0)
|
||||
{
|
||||
Derive_Matrix_Pattern();
|
||||
return;
|
||||
}
|
||||
|
||||
void CRS_Matrix::Derive_Matrix_Pattern()
|
||||
{
|
||||
int const nelem(_mesh.Nelems());
|
||||
int const ndof_e(_mesh.NdofsElement());
|
||||
auto const &ia(_mesh.GetConnectivity());
|
||||
// Determine the number of matrix rows
|
||||
_nrows = *max_element(ia.cbegin(), ia.cbegin() + ndof_e * nelem);
|
||||
++_nrows; // node numberng: 0 ... nnode-1
|
||||
assert(*min_element(ia.cbegin(), ia.cbegin() + ndof_e * nelem) == 0); // numbering starts with 0 ?
|
||||
|
||||
// Collect for each node those nodes it is connected to (multiple entries)
|
||||
// Detect the neighboring nodes
|
||||
vector< list<int> > cc(_nrows); // cc[i] is the list of nodes a node i is connected to
|
||||
for (int i = 0; i < nelem; ++i)
|
||||
{
|
||||
int const idx = ndof_e * i;
|
||||
for (int k = 0; k < ndof_e; ++k)
|
||||
{
|
||||
list<int> &cck = cc.at(ia[idx + k]);
|
||||
cck.insert( cck.end(), ia.cbegin() + idx, ia.cbegin() + idx + ndof_e );
|
||||
}
|
||||
}
|
||||
// Delete the multiple entries
|
||||
_nnz = 0;
|
||||
for (auto &it : cc)
|
||||
{
|
||||
it.sort();
|
||||
it.unique();
|
||||
_nnz += static_cast<int>(it.size());
|
||||
// cout << it.size() << " :: "; copy(it->begin(),it->end(), ostream_iterator<int,char>(cout," ")); cout << endl;
|
||||
}
|
||||
|
||||
// CSR data allocation
|
||||
_id.resize(_nrows + 1); // Allocate memory for CSR row pointer
|
||||
_ik.resize(_nnz); // Allocate memory for CSR column index vector
|
||||
|
||||
// copy CSR data
|
||||
_id[0] = 0; // begin of first row
|
||||
for (size_t i = 0; i < cc.size(); ++i)
|
||||
{
|
||||
//cout << i << " " << nid.at(i) << endl;;
|
||||
const list<int> &ci = cc.at(i);
|
||||
const auto nci = static_cast<int>(ci.size());
|
||||
_id[i + 1] = _id[i] + nci; // begin of next line
|
||||
copy(ci.begin(), ci.end(), _ik.begin() + _id[i] );
|
||||
}
|
||||
|
||||
assert(_nnz == _id[_nrows]);
|
||||
_sk.resize(_nnz); // Allocate memory for CSR column index vector
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void CRS_Matrix::Debug() const
|
||||
{
|
||||
// ID points to first entry of row
|
||||
// no symmetry assumed
|
||||
cout << "\nMatrix (nnz = " << _id[_nrows] << ")\n";
|
||||
|
||||
for (int row = 0; row < _nrows; ++row)
|
||||
{
|
||||
cout << "Row " << row << " : ";
|
||||
int const id1 = _id[row];
|
||||
int const id2 = _id[row + 1];
|
||||
for (int j = id1; j < id2; ++j)
|
||||
{
|
||||
cout.setf(ios::right, ios::adjustfield);
|
||||
cout << "[" << setw(2) << _ik[j] << "] " << setw(4) << _sk[j] << " ";
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void CRS_Matrix::CalculateLaplace(vector<double> &f)
|
||||
{
|
||||
assert(_mesh.NdofsElement() == 3); // only for triangular, linear elements
|
||||
//cout << _nnz << " vs. " << _id[_nrows] << " " << _nrows<< endl;
|
||||
assert(_nnz == _id[_nrows]);
|
||||
|
||||
for (int k = 0; k < _nrows; ++k)
|
||||
{
|
||||
_sk[k] = 0.0;
|
||||
}
|
||||
for (int k = 0; k < _nrows; ++k)
|
||||
{
|
||||
f[k] = 0.0;
|
||||
}
|
||||
|
||||
double ske[3][3], fe[3];
|
||||
// Loop over all elements
|
||||
auto const nelem = _mesh.Nelems();
|
||||
auto const &ia = _mesh.GetConnectivity();
|
||||
auto const &xc = _mesh.GetCoords();
|
||||
|
||||
for (int i = 0; i < nelem; ++i)
|
||||
{
|
||||
CalcElem(ia.data() + 3 * i, xc.data(), ske, fe);
|
||||
//AddElem(ia.data()+3 * i, ske, fe, _id.data(), _ik.data(), _sk.data(), f.data()); // GH: deprecated
|
||||
AddElem_3(ia.data() + 3 * i, ske, fe, f);
|
||||
}
|
||||
|
||||
//Debug();
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void CRS_Matrix::ApplyDirichletBC(std::vector<double> const &u, std::vector<double> &f)
|
||||
{
|
||||
double const PENALTY = 1e6;
|
||||
auto const idx = _mesh.Index_DirichletNodes();
|
||||
int const nidx = static_cast<int>(idx.size());
|
||||
|
||||
for (int row = 0; row < nidx; ++row)
|
||||
{
|
||||
int const k = idx[row];
|
||||
int const id1 = fetch(k, k); // Find diagonal entry of row
|
||||
assert(id1 >= 0);
|
||||
_sk[id1] += PENALTY; // matrix weighted scaling feasible
|
||||
f[k] += PENALTY * u[k];
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void CRS_Matrix::GetDiag(vector<double> &d) const
|
||||
{
|
||||
assert( _nrows == static_cast<int>(d.size()) );
|
||||
|
||||
for (int row = 0; row < _nrows; ++row)
|
||||
{
|
||||
const int ia = fetch(row, row); // Find diagonal entry of row
|
||||
assert(ia >= 0);
|
||||
d[row] = _sk[ia];
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
bool CRS_Matrix::Compare2Old(int nnode, int const id[], int const ik[], double const sk[]) const
|
||||
{
|
||||
bool bn = (nnode == _nrows); // number of rows
|
||||
if (!bn)
|
||||
{
|
||||
cout << "######### Error: " << "number of rows" << endl;
|
||||
}
|
||||
|
||||
bool bz = (id[nnode] == _nnz); // number of non zero elements
|
||||
if (!bz)
|
||||
{
|
||||
cout << "######### Error: " << "number of non zero elements" << endl;
|
||||
}
|
||||
|
||||
bool bd = equal(id, id + nnode + 1, _id.cbegin()); // row starts
|
||||
if (!bd)
|
||||
{
|
||||
cout << "######### Error: " << "row starts" << endl;
|
||||
}
|
||||
|
||||
bool bk = equal(ik, ik + id[nnode], _ik.cbegin()); // column indices
|
||||
if (!bk)
|
||||
{
|
||||
cout << "######### Error: " << "column indices" << endl;
|
||||
}
|
||||
|
||||
bool bv = equal(sk, sk + id[nnode], _sk.cbegin()); // values
|
||||
if (!bv)
|
||||
{
|
||||
cout << "######### Error: " << "values" << endl;
|
||||
}
|
||||
|
||||
return bn && bz && bd && bk && bv;
|
||||
}
|
||||
|
||||
|
||||
void CRS_Matrix::Mult(vector<double> &w, vector<double> const &u) const
|
||||
{
|
||||
assert( _nrows == static_cast<int>(w.size()) );
|
||||
assert( w.size() == u.size() );
|
||||
|
||||
for (int row = 0; row < _nrows; ++row)
|
||||
{
|
||||
double wi = 0.0;
|
||||
for (int ij = _id[row]; ij < _id[row + 1]; ++ij)
|
||||
{
|
||||
wi += _sk[ij] * u[ _ik[ij] ];
|
||||
}
|
||||
w[row] = wi;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void CRS_Matrix::Defect(vector<double> &w,
|
||||
vector<double> const &f, vector<double> const &u) const
|
||||
{
|
||||
assert( _nrows == static_cast<int>(w.size()) );
|
||||
assert( w.size() == u.size() && u.size() == f.size() );
|
||||
|
||||
for (int row = 0; row < _nrows; ++row)
|
||||
{
|
||||
double wi = f[row];
|
||||
for (int ij = _id[row]; ij < _id[row + 1]; ++ij)
|
||||
{
|
||||
wi -= _sk[ij] * u[ _ik[ij] ];
|
||||
}
|
||||
w[row] = wi;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
int CRS_Matrix::fetch(int const row, int const col) const
|
||||
{
|
||||
int const id2 = _id[row + 1]; // end and
|
||||
int ip = _id[row]; // start of recent row (global index)
|
||||
|
||||
while (ip < id2 && _ik[ip] != col) // find index col (global index)
|
||||
{
|
||||
++ip;
|
||||
}
|
||||
if (ip >= id2)
|
||||
{
|
||||
ip = -1;
|
||||
#ifndef NDEBUG // compiler option -DNDEBUG switches off the check
|
||||
cout << "No column " << col << " in row " << row << endl;
|
||||
assert(ip >= id2);
|
||||
#endif
|
||||
}
|
||||
return ip;
|
||||
}
|
||||
|
||||
|
||||
void CRS_Matrix::AddElem_3(int const ial[3], double const ske[3][3], double const fe[3], vector<double> &f)
|
||||
{
|
||||
for (int i = 0; i < 3; ++i)
|
||||
{
|
||||
const int ii = ial[i]; // row ii (global index)
|
||||
for (int j = 0; j < 3; ++j) // no symmetry assumed
|
||||
{
|
||||
const int jj = ial[j]; // column jj (global index)
|
||||
int ip = fetch(ii, jj); // find column entry jj in row ii
|
||||
#ifndef NDEBUG // compiler option -DNDEBUG switches off the check
|
||||
if (ip < 0) // no entry found !!
|
||||
{
|
||||
cout << "Error in AddElem: (" << ii << "," << jj << ") ["
|
||||
<< ial[0] << "," << ial[1] << "," << ial[2] << "]\n";
|
||||
assert(ip >= 0);
|
||||
}
|
||||
#endif
|
||||
_sk[ip] += ske[i][j];
|
||||
}
|
||||
f[ii] += fe[i];
|
||||
}
|
||||
}
|
||||
|
||||
178
ex3/ex3/getmatrix.h
Normal file
178
ex3/ex3/getmatrix.h
Normal file
|
|
@ -0,0 +1,178 @@
|
|||
#ifndef GETMATRIX_FILE
|
||||
#define GETMATRIX_FILE
|
||||
|
||||
#include "geom.h"
|
||||
#include <vector>
|
||||
|
||||
/**
|
||||
* Calculates the element stiffness matrix @p ske and the element load vector @p fe
|
||||
* of one triangular element with linear shape functions.
|
||||
* @param[in] ial node indices of the three element vertices
|
||||
* @param[in] xc vector of node coordinates with x(2*k,2*k+1) as coodinates of node k
|
||||
* @param[out] ske element stiffness matrix
|
||||
* @param[out] fe element load vector
|
||||
*/
|
||||
void CalcElem(int const ial[3], double const xc[], double ske[3][3], double fe[3]);
|
||||
|
||||
/**
|
||||
* Adds the element stiffness matrix @p ske and the element load vector @p fe
|
||||
* of one triangular element with linear shape functions to the appropriate positions in
|
||||
* the symmetric stiffness matrix, stored as CSR matrix K(@p sk,@p id, @p ik)
|
||||
*
|
||||
* @param[in] ial node indices of the three element vertices
|
||||
* @param[in] ske element stiffness matrix
|
||||
* @param[in] fe element load vector
|
||||
* @param[out] sk vector non-zero entries of CSR matrix
|
||||
* @param[in] id index vector containing the first entry in a CSR row
|
||||
* @param[in] ik column index vector of CSR matrix
|
||||
* @param[out] f distributed local vector storing the right hand side
|
||||
*
|
||||
* @warning Algorithm requires indices in connectivity @p ial in ascending order.
|
||||
* Currently deprecated.
|
||||
*/
|
||||
void AddElem(int const ial[3], double const ske[3][3], double const fe[3],
|
||||
int const id[], int const ik[], double sk[], double f[]);
|
||||
|
||||
|
||||
// #####################################################################
|
||||
/**
|
||||
* Square matrix in CRS format (compressed row storage; also named CSR),
|
||||
* see an <a href="https://en.wikipedia.org/wiki/Sparse_matrix">introduction</a>.
|
||||
*/
|
||||
class CRS_Matrix
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Intializes the CRS matrix structure from the given discetization in @p mesh.
|
||||
*
|
||||
* The sparse matrix pattern is generated but the values are 0.
|
||||
*
|
||||
* @param[in] mesh given discretization
|
||||
*
|
||||
* @warning A reference to the discretization @p mesh is stored inside this class.
|
||||
* Therefore, changing @p mesh outside requires also
|
||||
* to call method @p Derive_Matrix_Pattern explicitely.
|
||||
*
|
||||
* @see Derive_Matrix_Pattern
|
||||
*/
|
||||
explicit CRS_Matrix(Mesh const & mesh);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
~CRS_Matrix()
|
||||
{}
|
||||
|
||||
/**
|
||||
* Generates the sparse matrix pattern and overwrites the existing pattern.
|
||||
*
|
||||
* The sparse matrix pattern is generated but the values are 0.
|
||||
*/
|
||||
void Derive_Matrix_Pattern();
|
||||
|
||||
/**
|
||||
* Calculates the entries of f.e. stiffness matrix and load/rhs vector @p f for the Laplace operator in 2D.
|
||||
* No memory is allocated.
|
||||
*
|
||||
* @param[in,out] f (preallocated) rhs/load vector
|
||||
*/
|
||||
void CalculateLaplace(std::vector<double> &f);
|
||||
|
||||
/**
|
||||
* Applies Dirichlet boundary conditions to stiffness matrix and to load vector @p f.
|
||||
* The <a href="https://www.jstor.org/stable/2005611?seq=1#metadata_info_tab_contents">penalty method</a>
|
||||
* is used for incorporating the given values @p u.
|
||||
*
|
||||
* @param[in] u (global) vector with Dirichlet data
|
||||
* @param[in,out] f load vector
|
||||
*/
|
||||
void ApplyDirichletBC(std::vector<double> const &u, std::vector<double> &f);
|
||||
|
||||
/**
|
||||
* Extracts the diagonal elemenst of the sparse matrix.
|
||||
*
|
||||
* @param[in,out] d (prellocated) vector of diagonal elements
|
||||
*/
|
||||
void GetDiag(std::vector<double> &d) const;
|
||||
|
||||
/**
|
||||
* Performs the matrix-vector product w := K*u.
|
||||
*
|
||||
* @param[in,out] w resulting vector (preallocated)
|
||||
* @param[in] u vector
|
||||
*/
|
||||
void Mult(std::vector<double> &w, std::vector<double> const &u) const;
|
||||
|
||||
/**
|
||||
* Calculates the defect/residuum w := f - K*u.
|
||||
*
|
||||
* @param[in,out] w resulting vector (preallocated)
|
||||
* @param[in] f load vector
|
||||
* @param[in] u vector
|
||||
*/
|
||||
void Defect(std::vector<double> &w,
|
||||
std::vector<double> const &f, std::vector<double> const &u) const;
|
||||
|
||||
/**
|
||||
* Number rows in matrix.
|
||||
* @return number of rows.
|
||||
*/
|
||||
int Nrows() const
|
||||
{return _nrows;}
|
||||
|
||||
/**
|
||||
* Show the matrix entries.
|
||||
*/
|
||||
void Debug() const;
|
||||
|
||||
/**
|
||||
* Finds in a CRS matrix the access index for an entry at row @p row and column @p col.
|
||||
*
|
||||
* @param[in] row row index
|
||||
* @param[in] col column index
|
||||
* @return index for element (@p row, @p col). If no appropriate entry exists then -1 will be returned.
|
||||
*
|
||||
* @warning assert() stops the function in case that matrix element (@p row, @p col) doesn't exist.
|
||||
*/
|
||||
int fetch(int row, int col) const;
|
||||
|
||||
/**
|
||||
* Adds the element stiffness matrix @p ske and the element load vector @p fe
|
||||
* of one triangular element with linear shape functions to the appropriate positions in
|
||||
* the stiffness matrix, stored as CSR matrix K(@p sk,@p id, @p ik).
|
||||
*
|
||||
* @param[in] ial node indices of the three element vertices
|
||||
* @param[in] ske element stiffness matrix
|
||||
* @param[in] fe element load vector
|
||||
* @param[in,out] f distributed local vector storing the right hand side
|
||||
*
|
||||
* @warning Algorithm assumes linear triangular elements (ndof_e==3).
|
||||
*/
|
||||
void AddElem_3(int const ial[3], double const ske[3][3], double const fe[3], std::vector<double> &f);
|
||||
|
||||
/**
|
||||
* Compare @p this CRS matrix with an external CRS matrix stored in C-Style.
|
||||
*
|
||||
* The method prints statements on differences found.
|
||||
*
|
||||
* @param[in] nnode row number of external matrix
|
||||
* @param[in] id start indices of matrix rows of external matrix
|
||||
* @param[in] ik column indices of external matrix
|
||||
* @param[in] sk non-zero values of external matrix
|
||||
*
|
||||
* @return true iff all data are identical.
|
||||
*/
|
||||
bool Compare2Old(int nnode, int const id[], int const ik[], double const sk[]) const;
|
||||
|
||||
private:
|
||||
Mesh const & _mesh; //!< reference to discretization
|
||||
int _nrows; //!< number of rows in matrix
|
||||
int _nnz; //!< number of non-zero entries
|
||||
std::vector<int> _id; //!< start indices of matrix rows
|
||||
std::vector<int> _ik; //!< column indices
|
||||
std::vector<double> _sk; //!< non-zero values
|
||||
|
||||
};
|
||||
|
||||
|
||||
#endif
|
||||
61
ex3/ex3/jacsolve.cpp
Normal file
61
ex3/ex3/jacsolve.cpp
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
#include "vdop.h"
|
||||
#include "getmatrix.h"
|
||||
#include "jacsolve.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
// #####################################################################
|
||||
// const int neigh[], const int color,
|
||||
// const MPI::Intracomm& icomm,
|
||||
void JacobiSolve(CRS_Matrix const &SK, vector<double> const &f, vector<double> &u)
|
||||
{
|
||||
const double omega = 1.0;
|
||||
const int maxiter = 1000;
|
||||
const double tol = 1e-5, // tolerance
|
||||
tol2 = tol * tol; // tolerance^2
|
||||
|
||||
int nrows = SK.Nrows(); // number of rows == number of columns
|
||||
assert( nrows == static_cast<int>(f.size()) && f.size() == u.size() );
|
||||
|
||||
cout << endl << " Start Jacobi solver for " << nrows << " d.o.f.s" << endl;
|
||||
// Choose initial guess
|
||||
for (int k = 0; k < nrows; ++k)
|
||||
{
|
||||
u[k] = 0.0; // u := 0
|
||||
}
|
||||
|
||||
vector<double> dd(nrows); // matrix diagonal
|
||||
vector<double> r(nrows); // residual
|
||||
vector<double> w(nrows); // correction
|
||||
|
||||
SK.GetDiag(dd); // dd := diag(K)
|
||||
////DebugVector(dd);{int ijk; cin >> ijk;}
|
||||
|
||||
// Initial sweep
|
||||
SK.Defect(r, f, u); // r := f - K*u
|
||||
|
||||
vddiv(w, r, dd); // w := D^{-1}*r
|
||||
double sigma0 = dscapr(w, r); // s0 := <w,r>
|
||||
|
||||
// Iteration sweeps
|
||||
int iter = 0;
|
||||
double sigma = sigma0;
|
||||
while ( sigma > tol2 * sigma0 && maxiter > iter)
|
||||
{
|
||||
++iter;
|
||||
vdaxpy(u, u, omega, w ); // u := u + om*w
|
||||
SK.Defect(r, f, u); // r := f - K*u
|
||||
vddiv(w, r, dd); // w := D^{-1}*r
|
||||
sigma = dscapr(w, r); // s0 := <w,r>
|
||||
// cout << "Iteration " << iter << " : " << sqrt(sigma/sigma0) << endl;
|
||||
}
|
||||
cout << "aver. Jacobi rate : " << exp(log(sqrt(sigma / sigma0)) / iter) << " (" << iter << " iter)" << endl;
|
||||
cout << "final error: " << sqrt(sigma / sigma0) << " (rel) " << sqrt(sigma) << " (abs)\n";
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
18
ex3/ex3/jacsolve.h
Normal file
18
ex3/ex3/jacsolve.h
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
#ifndef JACSOLVE_FILE
|
||||
#define JACSOLVE_FILE
|
||||
#include "getmatrix.h"
|
||||
#include <vector>
|
||||
|
||||
|
||||
/**
|
||||
* Solves linear system of equations K @p u = @p f via the Jacobi iteration.
|
||||
* We use a distributed symmetric CSR matrix @p SK and initial guess of the
|
||||
* solution is set to 0.
|
||||
* @param[in] SK CSR matrix
|
||||
* @param[in] f distributed local vector storing the right hand side
|
||||
* @param[out] u accumulated local vector storing the solution.
|
||||
*/
|
||||
void JacobiSolve(CRS_Matrix const &SK, std::vector<double> const &f, std::vector<double> &u);
|
||||
|
||||
|
||||
#endif
|
||||
101
ex3/ex3/main.cpp
Normal file
101
ex3/ex3/main.cpp
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
#include "benchmarks.h"
|
||||
#include "benchmark_tests.h"
|
||||
#include "factorization_solve_tests.h"
|
||||
#include <iostream>
|
||||
|
||||
int main()
|
||||
{
|
||||
// ---------------------------------- 1. ----------------------------------
|
||||
// results in file "test_system.txt"
|
||||
|
||||
|
||||
|
||||
// ---------------------------------- 2. ----------------------------------
|
||||
// Memory FLOPS Read-write operations
|
||||
// (A) 2N 2N 2N
|
||||
// (B) MN + N 2NM 2NM + M
|
||||
// (C) ML + LM 2LNM 2LNM + MN
|
||||
// (D) (p+1) + N 3N(p+1) N(2 + 3(p+1))
|
||||
|
||||
|
||||
|
||||
// ---------------------------------- 3. ----------------------------------
|
||||
// implementation in file "benchmarks.cpp"
|
||||
|
||||
|
||||
|
||||
// ---------------------------------- 4.& 6. ----------------------------------
|
||||
vector<vector<double>> results;
|
||||
|
||||
results.push_back(test_A(250, 50000000, scalar));
|
||||
results.push_back(test_A(250, 50000000, Kahan_skalar));
|
||||
results.push_back(test_A(250, 50000000, scalar_cBLAS));
|
||||
// Timing GFLOPS GiByte/s
|
||||
// ------------------------------------------
|
||||
// scalar 0.039 2.4 19
|
||||
// Kahan_skalar 0.037 2.5 20
|
||||
// scalar_cBLAS 0.032 2.9 23
|
||||
|
||||
|
||||
results.push_back(test_B(100, 20000, 10000, MatVec));
|
||||
results.push_back(test_B(100, 20000, 10000, MatVec_cBLAS));
|
||||
// Timing GFLOPS GiByte/s
|
||||
// ------------------------------------------
|
||||
// MatVec 0.1 3.6 29
|
||||
// MatVec_cBLAS 0.074 5 40
|
||||
|
||||
|
||||
results.push_back(test_C(25, 500, 1000, 1500, MatMat));
|
||||
results.push_back(test_C(25, 500, 1000, 1500, MatMat_cBLAS));
|
||||
// Timing GFLOPS GiByte/s
|
||||
// ------------------------------------------
|
||||
// MatMat 0.57 2.5 20
|
||||
// MatMat_cBLAS 0.019 75 6e+02 // unrealistic
|
||||
|
||||
|
||||
results.push_back(test_D(100, 100, 1000000));
|
||||
// Timing GFLOPS GiByte/s
|
||||
// ------------------------------------------
|
||||
// 0.11 2.5 20
|
||||
|
||||
|
||||
cout << endl << "Timing\tGFLOPS\tGiByte/s" << endl;
|
||||
cout << "------------------------------" << endl;
|
||||
for (size_t i = 0; i < results.size(); ++i)
|
||||
cout << results[i][0] << "\t" << results[i][1] << "\t" << results[i][2] << endl;
|
||||
cout << endl;
|
||||
|
||||
|
||||
|
||||
// ---------------------------------- 5. ----------------------------------
|
||||
// 5.(a) Observation: time to calculate norm is approximately half the time as for the scalar product.
|
||||
// Reason: only have to access entries of x, so less memory that has to be accessed
|
||||
//
|
||||
// 5.(b) Runtime for Kahan_scalar is roughly the same as for the normal scalar product
|
||||
|
||||
|
||||
|
||||
// ---------------------------------- 6. ----------------------------------
|
||||
// see 4.
|
||||
|
||||
|
||||
// ---------------------------------- 7. ----------------------------------
|
||||
CheckCorrectness();
|
||||
// Checked correctness by computing the inverse of A
|
||||
|
||||
CheckDuration(5000);
|
||||
// The solving time per RHS scales roughly with factor 1/n_rhs
|
||||
|
||||
|
||||
|
||||
// ---------------------------------- 8. ----------------------------------
|
||||
// done seperately
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
return 0;
|
||||
|
||||
}
|
||||
16
ex3/ex3/userset.cpp
Normal file
16
ex3/ex3/userset.cpp
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
#include "userset.h"
|
||||
#include <cmath>
|
||||
|
||||
|
||||
double FunctF(double const x, double const y)
|
||||
{
|
||||
// return std::sin(3.14159*1*x)*std::sin(3.14159*1*y);
|
||||
// return 16.0*1024. ;
|
||||
// return (double)1.0 ;
|
||||
return x * x * std::sin(2.5 * 3.14159 * y);
|
||||
}
|
||||
|
||||
double FunctU(const double /* x */, double const /* y */)
|
||||
{
|
||||
return 1.0 ;
|
||||
}
|
||||
44
ex3/ex3/userset.h
Normal file
44
ex3/ex3/userset.h
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
#ifndef USERSET_FILE
|
||||
#define USERSET_FILE
|
||||
#include <cmath>
|
||||
/**
|
||||
* User function: f(@p x,@p y)
|
||||
* @param[in] x x-coordinate of discretization point
|
||||
* @param[in] y y-coordinate of discretization point
|
||||
* @return value for right hand side f(@p x,@p y)
|
||||
*/
|
||||
double FunctF(double const x, double const y);
|
||||
|
||||
/**
|
||||
* User function: u(@p x,@p y)
|
||||
* @param[in] x x-coordinate of discretization point
|
||||
* @param[in] y y-coordinate of discretization point
|
||||
* @return value for solution vector u(@p x,@p y)
|
||||
*/
|
||||
double FunctU(double const x, double const y);
|
||||
|
||||
|
||||
/**
|
||||
* User function: f(@p x,@p y) = @f$ x^2 \sin(2.5\pi y)@f$.
|
||||
* @param[in] x x-coordinate of discretization point
|
||||
* @param[in] y y-coordinate of discretization point
|
||||
* @return value f(@p x,@p y)
|
||||
*/
|
||||
inline double fNice(double const x, double const y)
|
||||
{
|
||||
return x * x * std::sin(2.5 * 3.14159 * y);
|
||||
}
|
||||
|
||||
/**
|
||||
* User function: f(@p x,@p y) = 0$.
|
||||
* @param[in] x x-coordinate of discretization point
|
||||
* @param[in] y y-coordinate of discretization point
|
||||
* @return value 0
|
||||
*/
|
||||
inline double f_zero(double const x, double const y)
|
||||
//double f_zero(double const /*x*/, double const /*y*/)
|
||||
{
|
||||
return 0.0 + 0.0*(x+y);
|
||||
}
|
||||
|
||||
#endif
|
||||
84
ex3/ex3/vdop.cpp
Normal file
84
ex3/ex3/vdop.cpp
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
#include "vdop.h"
|
||||
#include <cassert> // assert()
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
|
||||
void vddiv(vector<double> &x, vector<double> const &y,
|
||||
vector<double> const &z)
|
||||
{
|
||||
assert( x.size() == y.size() && y.size() == z.size() );
|
||||
size_t n = x.size();
|
||||
|
||||
for (size_t k = 0; k < n; ++k)
|
||||
{
|
||||
x[k] = y[k] / z[k];
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
//******************************************************************************
|
||||
|
||||
void vdaxpy(std::vector<double> &x, std::vector<double> const &y,
|
||||
double alpha, std::vector<double> const &z )
|
||||
{
|
||||
assert( x.size() == y.size() && y.size() == z.size() );
|
||||
size_t n = x.size();
|
||||
|
||||
for (size_t k = 0; k < n; ++k)
|
||||
{
|
||||
x[k] = y[k] + alpha * z[k];
|
||||
}
|
||||
return;
|
||||
}
|
||||
//******************************************************************************
|
||||
|
||||
double dscapr(std::vector<double> const &x, std::vector<double> const &y)
|
||||
{
|
||||
assert( x.size() == y.size());
|
||||
size_t n = x.size();
|
||||
|
||||
double s = 0.0;
|
||||
for (size_t k = 0; k < n; ++k)
|
||||
{
|
||||
s += x[k] * y[k];
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
//******************************************************************************
|
||||
void DebugVector(vector<double> const &v)
|
||||
{
|
||||
cout << "\nVector (nnode = " << v.size() << ")\n";
|
||||
for (size_t j = 0; j < v.size(); ++j)
|
||||
{
|
||||
cout.setf(ios::right, ios::adjustfield);
|
||||
cout << v[j] << " ";
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
return;
|
||||
}
|
||||
//******************************************************************************
|
||||
bool CompareVectors(std::vector<double> const &x, int const n, double const y[], double const eps)
|
||||
{
|
||||
bool bn = (static_cast<int>(x.size()) == n);
|
||||
if (!bn)
|
||||
{
|
||||
cout << "######### Error: " << "number of elements" << endl;
|
||||
}
|
||||
//bool bv = equal(x.cbegin(),x.cend(),y);
|
||||
bool bv = equal(x.cbegin(), x.cend(), y,
|
||||
[eps](double a, double b) -> bool
|
||||
{ return std::abs(a - b) < eps * (1.0 + 0.5 * (std::abs(a) + std::abs(a))); }
|
||||
);
|
||||
if (!bv)
|
||||
{
|
||||
assert(static_cast<int>(x.size()) == n);
|
||||
cout << "######### Error: " << "values" << endl;
|
||||
}
|
||||
return bn && bv;
|
||||
}
|
||||
58
ex3/ex3/vdop.h
Normal file
58
ex3/ex3/vdop.h
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
#ifndef VDOP_FILE
|
||||
#define VDOP_FILE
|
||||
#include <vector>
|
||||
|
||||
/** @brief Element-wise vector divison x_k = y_k/z_k.
|
||||
*
|
||||
* @param[out] x target vector
|
||||
* @param[in] y source vector
|
||||
* @param[in] z source vector
|
||||
*
|
||||
*/
|
||||
void vddiv(std::vector<double> & x, std::vector<double> const& y,
|
||||
std::vector<double> const& z);
|
||||
|
||||
/** @brief Element-wise daxpy operation x(k) = y(k) + alpha*z(k).
|
||||
*
|
||||
* @param[out] x target vector
|
||||
* @param[in] y source vector
|
||||
* @param[in] alpha scalar
|
||||
* @param[in] z source vector
|
||||
*
|
||||
*/
|
||||
void vdaxpy(std::vector<double> & x, std::vector<double> const& y,
|
||||
double alpha, std::vector<double> const& z );
|
||||
|
||||
|
||||
/** @brief Calculates the Euclidian inner product of two vectors.
|
||||
*
|
||||
* @param[in] x vector
|
||||
* @param[in] y vector
|
||||
* @return Euclidian inner product @f$\langle x,y \rangle@f$
|
||||
*
|
||||
*/
|
||||
double dscapr(std::vector<double> const& x, std::vector<double> const& y);
|
||||
|
||||
|
||||
/**
|
||||
* Print entries of a vector.
|
||||
* @param[in] v vector values
|
||||
*/
|
||||
void DebugVector(std::vector<double> const &v);
|
||||
|
||||
/** @brief Compares an STL vector with POD vector.
|
||||
*
|
||||
* The accuracy criteria @f$ |x_k-y_k| < \varepsilon \left({1+0.5(|x_k|+|y_k|)}\right) @f$
|
||||
* follows the book by
|
||||
* <a href="https://www.springer.com/la/book/9783319446592">Stoyan/Baran</a>, p.8.
|
||||
*
|
||||
* @param[in] x STL vector
|
||||
* @param[in] n length of POD vector
|
||||
* @param[in] y POD vector
|
||||
* @param[in] eps relative accuracy criteria (default := 0.0).
|
||||
* @return true iff pairwise vector elements are relatively close to each other.
|
||||
*
|
||||
*/
|
||||
bool CompareVectors(std::vector<double> const& x, int n, double const y[], double const eps=0.0);
|
||||
|
||||
#endif
|
||||
76
ex4/ex_4A.py
Normal file
76
ex4/ex_4A.py
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
np.set_printoptions(precision=2)
|
||||
|
||||
|
||||
N = 10 # number of elements
|
||||
a = 1
|
||||
alpha = 1
|
||||
g_b = 1
|
||||
f_const = 1
|
||||
|
||||
x = np.linspace(0, 1, N + 1)
|
||||
|
||||
|
||||
A = np.zeros((N + 1, N + 1))
|
||||
f_vec = np.zeros(N + 1)
|
||||
|
||||
for i in range(1, N + 1):
|
||||
h = x[i] - x[i - 1]
|
||||
|
||||
|
||||
a_11 = 1./h + a*h/3.
|
||||
a_12 = -1./h + a*h/6.
|
||||
a_21 = -1./h + a*h/6.
|
||||
a_22 = 1./h + a*h/3
|
||||
|
||||
A[i - 1, i - 1] += a_11
|
||||
A[i - 1, i] += a_12
|
||||
A[i, i - 1] += a_21
|
||||
A[i, i] += a_22
|
||||
|
||||
|
||||
f_vec[i] = f_const*h
|
||||
|
||||
print("A =\n", A)
|
||||
|
||||
# take Neumann data into account
|
||||
A[N, N] += alpha
|
||||
f_vec[N] += alpha*g_b
|
||||
|
||||
|
||||
# take Dirichlet data into account
|
||||
u_g = np.zeros(N + 1)
|
||||
u_g[0] = 0
|
||||
print("u_g =\n", u_g)
|
||||
|
||||
# remove first row of A
|
||||
A_g = A[1:N+1, :]
|
||||
#print("A_g =\n", A_g)
|
||||
|
||||
# remove first row of f_vec
|
||||
f_g = f_vec[1:N+1]
|
||||
|
||||
# assemble RHS with dirichlet data
|
||||
f_g -= A_g.dot(u_g)
|
||||
#print(f_g)
|
||||
|
||||
# matrix for the inner nodes (excluding nodes with dirichlet bcs)
|
||||
A_0 = A[1:N+1, 1:N+1]
|
||||
#print(A_0)
|
||||
|
||||
# solve for u_0 (free dofs)
|
||||
u_0 = np.linalg.solve(A_0, f_g)
|
||||
|
||||
# assemble "u = u_0 + u_g"
|
||||
u = np.concatenate([[0], u_0])
|
||||
|
||||
|
||||
print("u =\n", u)
|
||||
|
||||
plt.plot(x, u, '-')
|
||||
plt.xlabel('x')
|
||||
plt.ylabel('u_h(x)')
|
||||
plt.grid()
|
||||
plt.show()
|
||||
83
ex4/ex_4B.py
Normal file
83
ex4/ex_4B.py
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
np.set_printoptions(precision=2)
|
||||
|
||||
|
||||
|
||||
N = 2 # number of elements
|
||||
split = np.sqrt(2)/2
|
||||
#split = 0.5
|
||||
|
||||
|
||||
N_lower = round(N*split) # number of elements in (0, split)
|
||||
print(N_lower, "elements below sqrt(2)/2")
|
||||
N_upper = N - N_lower # number of elements in (split, 1)
|
||||
print(N_upper, "elements above sqrt(2)/2")
|
||||
|
||||
assert(N == N_lower + N_upper)
|
||||
|
||||
|
||||
|
||||
x_lower = np.linspace(0, split, N_lower + 1)
|
||||
x_upper = np.linspace(split, 1, N_upper + 1)
|
||||
x = np.concatenate([x_lower, x_upper[1:]])
|
||||
print(x)
|
||||
|
||||
|
||||
A = np.zeros((N + 1, N + 1))
|
||||
|
||||
for i in range(1, N + 1):
|
||||
h = x[i] - x[i - 1]
|
||||
|
||||
lam = 1
|
||||
if(x[i] > split):
|
||||
lam = 10
|
||||
|
||||
|
||||
a_11 = lam/h
|
||||
a_12 = -lam/h
|
||||
a_21 = -lam/h
|
||||
a_22 = lam/h
|
||||
|
||||
A[i - 1, i - 1] += a_11
|
||||
A[i - 1, i] += a_12
|
||||
A[i, i - 1] += a_21
|
||||
A[i, i] += a_22
|
||||
|
||||
print("A =\n", A)
|
||||
|
||||
|
||||
|
||||
# take dirichlet data into account
|
||||
u_g = np.zeros(N + 1)
|
||||
u_g[0] = 0
|
||||
u_g[N] = 1
|
||||
print("u_g =\n", u_g)
|
||||
|
||||
# remove first and last row of A
|
||||
A_g = A[1:N, :]
|
||||
#print("A_g =\n", A_g)
|
||||
|
||||
# assemble RHS with dirichlet data
|
||||
f = -A_g.dot(u_g)
|
||||
#print(f)
|
||||
|
||||
# matrix for the inner nodes (excluding nodes with dirichlet bcs)
|
||||
A_0 = A[1:N, 1:N]
|
||||
#print(A_0)
|
||||
|
||||
# solve for u_0 (free dofs)
|
||||
u_0 = np.linalg.solve(A_0, f)
|
||||
|
||||
# assemble "u = u_0 + u_g"
|
||||
u = np.concatenate([[0], u_0, [1]])
|
||||
|
||||
|
||||
print("u =\n", u)
|
||||
|
||||
plt.plot(x, u, '-')
|
||||
plt.xlabel('x')
|
||||
plt.ylabel('u_h(x)')
|
||||
plt.grid()
|
||||
plt.show()
|
||||
76
ex4/ex_4C.py
Normal file
76
ex4/ex_4C.py
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
np.set_printoptions(precision=2)
|
||||
|
||||
|
||||
p = 70
|
||||
N_vec = [10, 20, 30, 40, 70]
|
||||
for N in N_vec:
|
||||
|
||||
x = np.linspace(0, 1, N + 1)
|
||||
|
||||
A = np.zeros((N + 1, N + 1))
|
||||
|
||||
for i in range(1, N + 1):
|
||||
h = x[i] - x[i - 1]
|
||||
|
||||
|
||||
a_11 = 1./h - p/2.
|
||||
a_12 = -1./h + p/2.
|
||||
a_21 = -1./h - p/2.
|
||||
a_22 = 1./h + p/2.
|
||||
|
||||
A[i - 1, i - 1] += a_11
|
||||
A[i - 1, i] += a_12
|
||||
A[i, i - 1] += a_21
|
||||
A[i, i] += a_22
|
||||
|
||||
print("A =\n", A)
|
||||
|
||||
|
||||
|
||||
# take dirichlet data into account
|
||||
u_g = np.zeros(N + 1)
|
||||
u_g[0] = 0
|
||||
u_g[N] = 1
|
||||
print("u_g =\n", u_g)
|
||||
|
||||
# remove first and last row of A
|
||||
A_g = A[1:N, :]
|
||||
#print("A_g =\n", A_g)
|
||||
|
||||
# assemble RHS with dirichlet data
|
||||
f = -A_g.dot(u_g)
|
||||
#print(f)
|
||||
|
||||
# matrix for the inner nodes (excluding nodes with dirichlet bcs)
|
||||
A_0 = A[1:N, 1:N]
|
||||
#print(A_0)
|
||||
|
||||
# solve for u_0 (free dofs)
|
||||
u_0 = np.linalg.solve(A_0, f)
|
||||
|
||||
# assemble "u = u_0 + u_g"
|
||||
u = np.concatenate([[0], u_0, [1]])
|
||||
|
||||
|
||||
print("u =\n", u)
|
||||
|
||||
plt.plot(x, u, '-')
|
||||
|
||||
|
||||
# plotting the exact solution
|
||||
plt.plot(x, (np.exp(p*x) - 1.)/(np.exp(p) - 1.))
|
||||
|
||||
plt.xlabel('x')
|
||||
plt.ylabel('u_h(x)')
|
||||
plt.legend(N_vec + ['exact'])
|
||||
plt.title("Comparing discrete solution for increasing number of elements")
|
||||
plt.grid()
|
||||
plt.show()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
BIN
ex4/ex_4_results.pdf
Normal file
BIN
ex4/ex_4_results.pdf
Normal file
Binary file not shown.
131
ex5/CLANG_default.mk
Normal file
131
ex5/CLANG_default.mk
Normal file
|
|
@ -0,0 +1,131 @@
|
|||
# Basic Defintions for using GNU-compiler suite sequentially
|
||||
# requires setting of COMPILER=CLANG_
|
||||
# https://llvm.org/docs/CompileCudaWithLLVM.html
|
||||
# https://llvm.org/docs/NVPTXUsage.html
|
||||
|
||||
#CLANGPATH=//usr/lib/llvm-10/bin/
|
||||
CC = ${CLANGPATH}clang
|
||||
CXX = ${CLANGPATH}clang++
|
||||
#CXX = ${CLANGPATH}clang++ -lomptarget -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=/opt/pgi/linux86-64/2017/cuda/8.0
|
||||
#F77 = gfortran
|
||||
LINKER = ${CXX}
|
||||
|
||||
#http://clang.llvm.org/docs/UsersManual.html#options-to-control-error-and-warning-messages
|
||||
WARNINGS += -pedantic -Weverything -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion
|
||||
WARNINGS += -Wno-c++98-compat -Wno-sign-conversion -Wno-date-time -Wno-shorten-64-to-32 -Wno-padded -ferror-limit=1
|
||||
WARNINGS += -Wno-unsafe-buffer-usage
|
||||
#-fsyntax-only -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic
|
||||
|
||||
CXXFLAGS += -O3 -std=c++17 -ferror-limit=1 ${WARNINGS}
|
||||
# don't use -Ofast
|
||||
# -ftrapv
|
||||
LINKFLAGS += -O3
|
||||
|
||||
# different libraries in Ubuntu or manajaró
|
||||
ifndef UBUNTU
|
||||
UBUNTU=1
|
||||
endif
|
||||
|
||||
# BLAS, LAPACK
|
||||
LINKFLAGS += -llapack -lblas
|
||||
# -lopenblas
|
||||
ifeq ($(UBUNTU),1)
|
||||
# ubuntu
|
||||
else
|
||||
# on archlinux
|
||||
LINKFLAGS += -lcblas
|
||||
endif
|
||||
|
||||
# interprocedural optimization
|
||||
CXXFLAGS += -flto
|
||||
LINKFLAGS += -flto
|
||||
|
||||
#sudo apt install libomp-dev
|
||||
# OpenMP
|
||||
CXXFLAGS += -fopenmp
|
||||
LINKFLAGS += -fopenmp
|
||||
|
||||
# very good check
|
||||
# http://clang.llvm.org/extra/clang-tidy/
|
||||
# good check, see: http://llvm.org/docs/CodingStandards.html#include-style
|
||||
SWITCH_OFF=,-readability-magic-numbers,-readability-redundant-control-flow,-readability-redundant-member-init
|
||||
SWITCH_OFF+=,-readability-redundant-member-init,-readability-isolate-declaration
|
||||
#READABILITY=,readability*${SWITCH_OFF}
|
||||
#TIDYFLAGS = -checks=llvm-*,-llvm-header-guard -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
|
||||
TIDYFLAGS = -checks=llvm-*,-llvm-header-guard${READABILITY} -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
|
||||
#TIDYFLAGS += -checks='modernize*
|
||||
# ???
|
||||
#TIDYFLAGS = -checks='cert*' -header-filter=.*
|
||||
# MPI checks ??
|
||||
#TIDYFLAGS = -checks='mpi*'
|
||||
# ??
|
||||
#TIDYFLAGS = -checks='performance*' -header-filter=.*
|
||||
#TIDYFLAGS = -checks='portability-*' -header-filter=.*
|
||||
#TIDYFLAGS = -checks='readability-*' -header-filter=.*
|
||||
|
||||
default: ${PROGRAM}
|
||||
|
||||
${PROGRAM}: ${OBJECTS}
|
||||
$(LINKER) $^ ${LINKFLAGS} -o $@
|
||||
|
||||
clean:
|
||||
@rm -f ${PROGRAM} ${OBJECTS}
|
||||
|
||||
clean_all:: clean
|
||||
@rm -f *_ *~ *.bak *.log *.out *.tar
|
||||
|
||||
codecheck: tidy_check
|
||||
tidy_check:
|
||||
clang-tidy ${SOURCES} ${TIDYFLAGS} -- ${SOURCES}
|
||||
# see also http://clang-developers.42468.n3.nabble.com/Error-while-trying-to-load-a-compilation-database-td4049722.html
|
||||
|
||||
run: clean ${PROGRAM}
|
||||
# time ./${PROGRAM} ${PARAMS}
|
||||
./${PROGRAM} ${PARAMS}
|
||||
|
||||
# tar the current directory
|
||||
MY_DIR = `basename ${PWD}`
|
||||
tar: clean_all
|
||||
@echo "Tar the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
# tar cf `basename ${PWD}`.tar *
|
||||
|
||||
doc:
|
||||
doxygen Doxyfile
|
||||
|
||||
#########################################################################
|
||||
|
||||
.cpp.o:
|
||||
$(CXX) -c $(CXXFLAGS) -o $@ $<
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) -o $@ $<
|
||||
|
||||
.f.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
##################################################################################################
|
||||
# some tools
|
||||
# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags)
|
||||
cache: ${PROGRAM}
|
||||
valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS}
|
||||
# kcachegrind callgrind.out.<pid> &
|
||||
kcachegrind `ls -1tr callgrind.out.* |tail -1`
|
||||
|
||||
# Check for wrong memory accesses, memory leaks, ...
|
||||
# use smaller data sets
|
||||
mem: ${PROGRAM}
|
||||
valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS}
|
||||
|
||||
# Simple run time profiling of your code
|
||||
# CXXFLAGS += -g -pg
|
||||
# LINKFLAGS += -pg
|
||||
prof: ${PROGRAM}
|
||||
perf record ./$^ ${PARAMS}
|
||||
perf report
|
||||
# gprof -b ./$^ > gp.out
|
||||
# kprof -f gp.out -p gprof &
|
||||
|
||||
codecheck: tidy_check
|
||||
212
ex5/GCCMKL_default.mk
Normal file
212
ex5/GCCMKL_default.mk
Normal file
|
|
@ -0,0 +1,212 @@
|
|||
# Basic Defintions for using GNU-compiler suite with OpenMP und MKL
|
||||
# requires setting of COMPILER=GCCMKL_
|
||||
|
||||
# install MKL in manjaro
|
||||
# https://linux-packages.com/manjaro-linux/package/intel-mkl
|
||||
# > sudo pacman -Sy
|
||||
# > sudo pacman -S intel-mkl
|
||||
|
||||
ifeq ($(ONEAPI),1)
|
||||
MKL_INCLUDE=/opt/intel/oneapi/mkl/2024.0/include
|
||||
MKL_LIB=/opt/intel/oneapi/2024.0/lib
|
||||
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/intel/oneapi/2024.0/lib
|
||||
else
|
||||
MKL_INCLUDE=/usr/include/mkl
|
||||
MKL_LIB=/usr/lib/x86_64-linux-gnu/mkl
|
||||
endif
|
||||
|
||||
CC = gcc
|
||||
CXX = g++
|
||||
F77 = gfortran
|
||||
LINKER = ${CXX}
|
||||
|
||||
WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
|
||||
-Wredundant-decls -Winline -fmax-errors=1
|
||||
# -Wunreachable-code
|
||||
CXXFLAGS += -ffast-math -O3 -march=native -std=c++17 ${WARNINGS}
|
||||
#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS}
|
||||
#-msse3
|
||||
# -ftree-vectorizer-verbose=2 -DNDEBUG
|
||||
# -ftree-vectorizer-verbose=5
|
||||
# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump -fdump-tree-pre=stderr
|
||||
|
||||
# CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details
|
||||
# CFLAGS = -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2
|
||||
# #CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
|
||||
# FFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
|
||||
# LFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
|
||||
LINKFLAGS += -O3
|
||||
|
||||
#architecture
|
||||
CPU = -march=znver2
|
||||
#CPU = -march=core-avx2
|
||||
CXXFLAGS += ${CPU}
|
||||
LINKFLAGS += ${CPU}
|
||||
|
||||
# MKL
|
||||
#CXXFLAGS += -I/usr/include/mkl -DUSE_MKL -Wno-redundant-decls
|
||||
CXXFLAGS += -I${MKL_INCLUDE} -DUSE_MKL -Wno-redundant-decls
|
||||
#LINKFLAGS += -lmkl_intel_lp64 -lmkl_tbb_thread -ltbb -lmkl_core
|
||||
#LINKFLAGS += -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -L/usr/lib/x86_64-linux-gnu/mkl
|
||||
LINKFLAGS += -L${MKL_LIB} -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core
|
||||
#LINKFLAGS += -lmkl_intel_lp64 -lmkl_sequential -lmkl_core
|
||||
|
||||
# workaround for MKL slow down on AMD hardware
|
||||
# https://danieldk.eu/Posts/2020-08-31-MKL-Zen.html
|
||||
default: run
|
||||
libfakeintel.so:
|
||||
gcc -shared -fPIC -o libfakeintel.so fakeintel.c
|
||||
echo "call: export LD_PRELOAD=./libfakeintel.so "
|
||||
|
||||
# different libraries in Ubuntu or manajaro
|
||||
#ifndef UBUNTU
|
||||
#UBUNTU=1
|
||||
#endif
|
||||
|
||||
## BLAS, LAPACK
|
||||
#ifeq ($(UBUNTU),1)
|
||||
#LINKFLAGS += -llapack -lblas
|
||||
## -lopenblas
|
||||
#else
|
||||
## on archlinux
|
||||
#LINKFLAGS += -llapack -lopenblas -lcblas
|
||||
#endif
|
||||
|
||||
# interprocedural optimization
|
||||
CXXFLAGS += -flto
|
||||
LINKFLAGS += -flto
|
||||
|
||||
# for debugging purpose (save code)
|
||||
# -fsanitize=leak # only one out the three can be used
|
||||
# -fsanitize=address
|
||||
# -fsanitize=thread
|
||||
SANITARY = -fsanitize=address -fsanitize=undefined -fsanitize=null -fsanitize=return \
|
||||
-fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \
|
||||
-fsanitize=bool -fsanitize=enum -fsanitize=vptr
|
||||
#CXXFLAGS += ${SANITARY}
|
||||
#LINKFLAGS += ${SANITARY}
|
||||
|
||||
# OpenMP
|
||||
CXXFLAGS += -fopenmp
|
||||
LINKFLAGS += -fopenmp
|
||||
|
||||
default: ${PROGRAM}
|
||||
|
||||
${PROGRAM}: ${OBJECTS}
|
||||
$(LINKER) $^ ${LINKFLAGS} -o $@
|
||||
|
||||
clean:
|
||||
@rm -f ${PROGRAM} ${OBJECTS}
|
||||
|
||||
clean_all:: clean
|
||||
-@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt
|
||||
-@rm -rf html
|
||||
|
||||
run: clean ${PROGRAM} libfakeintel.so
|
||||
#run: ${PROGRAM}
|
||||
# time ./${PROGRAM} ${PARAMS}
|
||||
./${PROGRAM} ${PARAMS}
|
||||
|
||||
# tar the current directory
|
||||
MY_DIR = `basename ${PWD}`
|
||||
tar: clean_all
|
||||
@echo "Tar the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
# tar cf `basename ${PWD}`.tar *
|
||||
#find . -size +10M > large_files
|
||||
#--exclude-from ${MY_DIR}/large_files
|
||||
|
||||
zip: clean
|
||||
@echo "Zip the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
zip -r ${MY_DIR}.zip ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
|
||||
doc:
|
||||
doxygen Doxyfile
|
||||
|
||||
#########################################################################
|
||||
.SUFFIXES: .f90
|
||||
|
||||
.cpp.o:
|
||||
$(CXX) -c $(CXXFLAGS) -o $@ $<
|
||||
# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $<.log
|
||||
# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $(<:.cpp=.log)
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) -o $@ $<
|
||||
|
||||
.f.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
.f90.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
##################################################################################################
|
||||
# some tools
|
||||
# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags)
|
||||
cache: ${PROGRAM}
|
||||
valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS}
|
||||
# kcachegrind callgrind.out.<pid> &
|
||||
kcachegrind `ls -1tr callgrind.out.* |tail -1`
|
||||
|
||||
# Check for wrong memory accesses, memory leaks, ...
|
||||
# use smaller data sets
|
||||
# no "-pg" in compile/link options
|
||||
mem: ${PROGRAM}
|
||||
valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS}
|
||||
# Graphical interface
|
||||
# valkyrie
|
||||
|
||||
# Simple run time profiling of your code
|
||||
# CXXFLAGS += -g -pg
|
||||
# LINKFLAGS += -pg
|
||||
prof: ${PROGRAM}
|
||||
perf record ./$^ ${PARAMS}
|
||||
perf report
|
||||
# gprof -b ./$^ > gp.out
|
||||
# kprof -f gp.out -p gprof &
|
||||
|
||||
# perf in Ubuntu 20.04: https://www.howtoforge.com/how-to-install-perf-performance-analysis-tool-on-ubuntu-20-04/
|
||||
# * install
|
||||
# * sudo vi /etc/sysctl.conf
|
||||
# add kernel.perf_event_paranoid = 0
|
||||
|
||||
#Trace your heap:
|
||||
#> heaptrack ./main.GCC_
|
||||
#> heaptrack_gui heaptrack.main.GCC_.<pid>.gz
|
||||
heap: ${PROGRAM}
|
||||
heaptrack ./$^ ${PARAMS} 11
|
||||
heaptrack_gui `ls -1tr heaptrack.$^.* |tail -1` &
|
||||
|
||||
codecheck: $(SOURCES)
|
||||
cppcheck --enable=all --inconclusive --std=c++17 --suppress=missingIncludeSystem $^
|
||||
|
||||
|
||||
########################################################################
|
||||
# get the detailed status of all optimization flags
|
||||
info:
|
||||
echo "detailed status of all optimization flags"
|
||||
$(CXX) --version
|
||||
$(CXX) -Q $(CXXFLAGS) --help=optimizers
|
||||
lscpu
|
||||
inxi -C
|
||||
lstopo
|
||||
|
||||
# Excellent hardware info
|
||||
# hardinfo
|
||||
# Life monitoring of CPU frequency etc.
|
||||
# sudo i7z
|
||||
|
||||
# Memory consumption
|
||||
# vmstat -at -SM 3
|
||||
# xfce4-taskmanager
|
||||
|
||||
|
||||
# https://www.tecmint.com/check-linux-cpu-information/
|
||||
#https://www.tecmint.com/monitor-cpu-and-gpu-temperature-in-ubuntu/
|
||||
|
||||
# Debugging:
|
||||
# https://wiki.archlinux.org/index.php/Debugging
|
||||
111
ex5/GCC_SINGLE_default.mk
Normal file
111
ex5/GCC_SINGLE_default.mk
Normal file
|
|
@ -0,0 +1,111 @@
|
|||
# Basic Defintions for using GNU-compiler suite sequentially
|
||||
# requires setting of COMPILER=GCC_
|
||||
|
||||
CC = gcc
|
||||
CXX = g++
|
||||
F77 = gfortran
|
||||
LINKER = ${CXX}
|
||||
|
||||
# on mephisto:
|
||||
#CXXFLAGS += -I/share/apps/atlas/include
|
||||
#LINKFLAGS += -L/share/apps/atlas/lib -L/usr/lib64/atlas
|
||||
#LINKFLAGS += -latlas -lcblas
|
||||
#LINKFLAGS += -lblas
|
||||
# Der <cblas.h> Header muss mit extern "C" versehen werden, damit g++ alles findet.
|
||||
|
||||
|
||||
WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
|
||||
-Wredundant-decls -Winline -fmax-errors=1
|
||||
# -Wunreachable-code
|
||||
#CXXFLAGS += -std=c++17 -ffast-math -O3 -march=native -DNDEBUG ${WARNINGS}
|
||||
CXXFLAGS += -std=c++17 -ffast-math -O3 -march=native ${WARNINGS}
|
||||
|
||||
# info on vectorization
|
||||
#VECTORIZE = -ftree-vectorize -fdump-tree-vect-blocks=foo.dump
|
||||
#-fdump-tree-pre=stderr
|
||||
VECTORIZE = -ftree-vectorize -fopt-info -ftree-vectorizer-verbose=5
|
||||
#CXXFLAGS += ${VECTORIZE}
|
||||
# -funroll-all-loops -msse3
|
||||
#GCC -march=knl -march=broadwell -march=haswell
|
||||
|
||||
# interprocedural optimization
|
||||
#CXXFLAGS += -flto
|
||||
LINKFLAGS += -flto
|
||||
|
||||
# for debugging purpose (save code)
|
||||
# -fsanitize=leak # only one out the trhee can be used
|
||||
# -fsanitize=address
|
||||
# -fsanitize=thread
|
||||
SANITARY = -fsanitize=address -fsanitize=undefined -fsanitize=null -fsanitize=return \
|
||||
-fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \
|
||||
-fsanitize=bool -fsanitize=enum -fsanitize=vptr
|
||||
#CXXFLAGS += ${SANITARY}
|
||||
#LINKFLAGS +=${SANITARY}
|
||||
|
||||
# OpenMP but no OpenMP in Single mode
|
||||
#CXXFLAGS += -fopenmp
|
||||
LINKFLAGS += -fopenmp
|
||||
|
||||
default: ${PROGRAM}
|
||||
|
||||
${PROGRAM}: ${OBJECTS}
|
||||
$(LINKER) $^ ${LINKFLAGS} -o $@
|
||||
|
||||
clean:
|
||||
@rm -f ${PROGRAM} ${OBJECTS}
|
||||
|
||||
clean_all:: clean
|
||||
@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt
|
||||
@rm -rf html
|
||||
|
||||
run: clean ${PROGRAM}
|
||||
# time ./${PROGRAM}
|
||||
./${PROGRAM}
|
||||
|
||||
# tar the current directory
|
||||
MY_DIR = `basename ${PWD}`
|
||||
tar:
|
||||
@echo "Tar the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
# tar cf `basename ${PWD}`.tar *
|
||||
|
||||
doc:
|
||||
doxygen Doxyfile
|
||||
|
||||
#########################################################################
|
||||
|
||||
.cpp.o:
|
||||
$(CXX) -c $(CXXFLAGS) -o $@ $<
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) -o $@ $<
|
||||
|
||||
.f.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
##################################################################################################
|
||||
# some tools
|
||||
# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags)
|
||||
cache: ${PROGRAM}
|
||||
valgrind --tool=callgrind --simulate-cache=yes ./$^
|
||||
# kcachegrind callgrind.out.<pid> &
|
||||
kcachegrind `ls -1tr callgrind.out.* |tail -1`
|
||||
|
||||
# Check for wrong memory accesses, memory leaks, ...
|
||||
# use smaller data sets
|
||||
mem: ${PROGRAM}
|
||||
valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^
|
||||
|
||||
|
||||
thread:${PROGRAM}
|
||||
valgrind -v --tool=helgrind --log-file=$^.thread.out ./$^
|
||||
|
||||
# Simple run time profiling of your code
|
||||
# CXXFLAGS += -g -pg
|
||||
# LINKFLAGS += -pg
|
||||
prof: ${PROGRAM}
|
||||
./$^
|
||||
gprof -b ./$^ > gp.out
|
||||
# kprof -f gp.out -p gprof &
|
||||
182
ex5/GCC_default.mk
Normal file
182
ex5/GCC_default.mk
Normal file
|
|
@ -0,0 +1,182 @@
|
|||
# Basic Defintions for using GNU-compiler suite sequentially
|
||||
# requires setting of COMPILER=GCC_
|
||||
|
||||
CC = gcc
|
||||
CXX = g++
|
||||
F77 = gfortran
|
||||
LINKER = ${CXX}
|
||||
|
||||
WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
|
||||
-Wredundant-decls -Winline -fmax-errors=1
|
||||
# -Wunreachable-code
|
||||
CXXFLAGS += -ffast-math -O3 -march=native -std=c++20 ${WARNINGS}
|
||||
#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS}
|
||||
#-msse3
|
||||
# -ftree-vectorizer-verbose=2 -DNDEBUG
|
||||
# -ftree-vectorizer-verbose=5
|
||||
# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump -fdump-tree-pre=stderr
|
||||
|
||||
# CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details
|
||||
# CFLAGS = -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2
|
||||
# #CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
|
||||
# FFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
|
||||
# LFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
|
||||
LINKFLAGS += -O3
|
||||
|
||||
#architecture
|
||||
#CPU = -march=znver2
|
||||
CXXFLAGS += ${CPU}
|
||||
LINKFLAGS += ${CPU} -ltbb
|
||||
|
||||
# different libraries in Ubuntu or manajaró
|
||||
ifndef UBUNTU
|
||||
UBUNTU=1
|
||||
endif
|
||||
|
||||
# BLAS, LAPACK
|
||||
ifeq ($(UBUNTU),1)
|
||||
LINKFLAGS += -llapack -lblas
|
||||
# -lopenblas
|
||||
else
|
||||
# on archlinux
|
||||
LINKFLAGS += -llapack -lopenblas -lcblas
|
||||
endif
|
||||
|
||||
# interprocedural optimization
|
||||
CXXFLAGS += -flto
|
||||
LINKFLAGS += -flto
|
||||
|
||||
# for debugging purpose (save code)
|
||||
# -fsanitize=leak # only one out the three can be used
|
||||
# -fsanitize=address
|
||||
# -fsanitize=thread
|
||||
SANITARY = -fsanitize=address -fsanitize=undefined -fsanitize=null -fsanitize=return \
|
||||
-fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \
|
||||
-fsanitize=bool -fsanitize=enum -fsanitize=vptr
|
||||
#CXXFLAGS += ${SANITARY}
|
||||
#LINKFLAGS += ${SANITARY}
|
||||
|
||||
# OpenMP
|
||||
CXXFLAGS += -fopenmp
|
||||
LINKFLAGS += -fopenmp
|
||||
|
||||
default: ${PROGRAM}
|
||||
|
||||
${PROGRAM}: ${OBJECTS}
|
||||
$(LINKER) $^ ${LINKFLAGS} -o $@
|
||||
|
||||
clean:
|
||||
@rm -f ${PROGRAM} ${OBJECTS}
|
||||
|
||||
clean_all:: clean
|
||||
-@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt
|
||||
-@rm -rf html
|
||||
|
||||
run: clean ${PROGRAM}
|
||||
#run: ${PROGRAM}
|
||||
# time ./${PROGRAM} ${PARAMS}
|
||||
./${PROGRAM} ${PARAMS}
|
||||
|
||||
# tar the current directory
|
||||
MY_DIR = `basename ${PWD}`
|
||||
tar: clean_all
|
||||
@echo "Tar the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
# tar cf `basename ${PWD}`.tar *
|
||||
#find . -size +10M > large_files
|
||||
#--exclude-from ${MY_DIR}/large_files
|
||||
|
||||
zip: clean
|
||||
@echo "Zip the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
zip -r ${MY_DIR}.zip ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
|
||||
doc:
|
||||
doxygen Doxyfile
|
||||
|
||||
#########################################################################
|
||||
.SUFFIXES: .f90
|
||||
|
||||
.cpp.o:
|
||||
$(CXX) -c $(CXXFLAGS) -o $@ $<
|
||||
# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $<.log
|
||||
# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $(<:.cpp=.log)
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) -o $@ $<
|
||||
|
||||
.f.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
.f90.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
##################################################################################################
|
||||
# some tools
|
||||
# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags)
|
||||
cache: ${PROGRAM}
|
||||
valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS}
|
||||
# kcachegrind callgrind.out.<pid> &
|
||||
kcachegrind `ls -1tr callgrind.out.* |tail -1`
|
||||
|
||||
# Check for wrong memory accesses, memory leaks, ...
|
||||
# use smaller data sets
|
||||
# no "-pg" in compile/link options
|
||||
mem: ${PROGRAM}
|
||||
valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS}
|
||||
# Graphical interface
|
||||
# valkyrie
|
||||
|
||||
# Simple run time profiling of your code
|
||||
# CXXFLAGS += -g -pg
|
||||
# LINKFLAGS += -pg
|
||||
prof: ${PROGRAM}
|
||||
perf record ./$^ ${PARAMS}
|
||||
perf report
|
||||
# gprof -b ./$^ > gp.out
|
||||
# kprof -f gp.out -p gprof &
|
||||
|
||||
# perf in Ubuntu 20.04: https://www.howtoforge.com/how-to-install-perf-performance-analysis-tool-on-ubuntu-20-04/
|
||||
# * install
|
||||
# * sudo vi /etc/sysctl.conf
|
||||
# add kernel.perf_event_paranoid = 0
|
||||
|
||||
#Trace your heap:
|
||||
#> heaptrack ./main.GCC_
|
||||
#> heaptrack_gui heaptrack.main.GCC_.<pid>.gz
|
||||
heap: ${PROGRAM}
|
||||
heaptrack ./$^ ${PARAMS} 11
|
||||
heaptrack_gui `ls -1tr heaptrack.$^.* |tail -1` &
|
||||
|
||||
codecheck: $(SOURCES)
|
||||
cppcheck --enable=all --inconclusive --std=c++17 -I${CUDA_INC} --suppress=missingIncludeSystem $^
|
||||
|
||||
|
||||
########################################################################
|
||||
# get the detailed status of all optimization flags
|
||||
info:
|
||||
echo "detailed status of all optimization flags"
|
||||
$(CXX) --version
|
||||
$(CXX) -Q $(CXXFLAGS) --help=optimizers
|
||||
lscpu
|
||||
inxi -C
|
||||
lstopo
|
||||
|
||||
# Excellent hardware info
|
||||
# hardinfo
|
||||
# Life monitoring of CPU frequency etc.
|
||||
# sudo i7z
|
||||
|
||||
# Memory consumption
|
||||
# vmstat -at -SM 3
|
||||
# xfce4-taskmanager
|
||||
|
||||
|
||||
# https://www.tecmint.com/check-linux-cpu-information/
|
||||
#https://www.tecmint.com/monitor-cpu-and-gpu-temperature-in-ubuntu/
|
||||
|
||||
# Debugging:
|
||||
# https://wiki.archlinux.org/index.php/Debugging
|
||||
151
ex5/ICC_default.mk
Normal file
151
ex5/ICC_default.mk
Normal file
|
|
@ -0,0 +1,151 @@
|
|||
# Basic Defintions for using INTEL compiler suite sequentially
|
||||
# requires setting of COMPILER=ICC_
|
||||
|
||||
# special on my sony [GH]
|
||||
#BINDIR = /opt/save.intel/bin/
|
||||
# very special on my sony [GH]
|
||||
# FIND_LIBS = -L /opt/save.intel/composer_xe_2013.1.117/mkl/lib/intel64/libmkl_intel_lp64.so
|
||||
|
||||
#export KMP_AFFINITY=verbose,compact
|
||||
|
||||
CC = ${BINDIR}icc
|
||||
CXX = ${BINDIR}icpc
|
||||
F77 = ${BINDIR}ifort
|
||||
LINKER = ${CXX}
|
||||
|
||||
WARNINGS = -pedantic -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -wd2015,2012
|
||||
#-Winline -Wunreachable-code -Wredundant-decls
|
||||
CXXFLAGS += -std=c++17 -O3 -fma -DNDEBUG ${WARNINGS} -mkl
|
||||
#CXXFLAGS += -std=c++17 -O3 -march=core-avx2 -fma -ftz -fomit-frame-pointer -DNDEBUG ${WARNINGS} -mkl
|
||||
# -fast # fast inludes also -ipo !
|
||||
CXXFLAGS += -fargument-noalias -fargument-noalias-global -ansi-alias
|
||||
CXXFLAGS += -align -qopt-dynamic-align
|
||||
#CXXFLAGS += -xCore-AVX2
|
||||
#CXXFLAGS += -tp=zen
|
||||
# -qopt-subscript-in-range
|
||||
# -vec-threshold0
|
||||
# -xCORE-AVX2
|
||||
# -axcode COMMON-AVX512 -axcode MIC-AVX512 -axcode CORE-AVX512 -axcode CORE-AVX2
|
||||
# -ipo
|
||||
|
||||
# Reports: https://software.intel.com/en-us/articles/getting-the-most-out-of-your-intel-compiler-with-the-new-optimization-reports
|
||||
#CXXFLAGS += -qopt-report=5 -qopt-report-phase=vec,par
|
||||
|
||||
#CXXFLAGS += -qopt-report=5 -qopt-report-phase=cg
|
||||
# Redirect report from *.optrpt to stderr
|
||||
# -qopt-report-file=stderr
|
||||
# Guided paralellization
|
||||
# -guide -parallel
|
||||
# -guide-opts=string -guide-par[=n] -guide-vec[=n]
|
||||
# -auto-p32 -simd
|
||||
|
||||
# interprocedural optimization
|
||||
#CXXFLAGS += -ipo
|
||||
#LINKFLAGS += -ipo
|
||||
|
||||
# annotated Assembler file
|
||||
ANNOTED = -fsource-asm -S
|
||||
|
||||
# OpenMP
|
||||
CXXFLAGS += -qopenmp
|
||||
# -qopt-report-phase=openmp
|
||||
# -diag-enable=sc-full -diag-file=filename -diag-file-append[=filename]
|
||||
LINKFLAGS += -qopenmp
|
||||
|
||||
# use MKL by INTEL
|
||||
# LINKFLAGS += -L${BINDIR}../composer_xe_2013.1.117/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
|
||||
LINKFLAGS += -O2 -mkl
|
||||
# -ipo
|
||||
|
||||
|
||||
|
||||
default: ${PROGRAM}
|
||||
|
||||
${PROGRAM}: ${OBJECTS}
|
||||
$(LINKER) $^ ${LINKFLAGS} -o $@
|
||||
|
||||
clean:
|
||||
rm -f ${PROGRAM} ${OBJECTS} *.optrpt
|
||||
|
||||
clean_all:: clean
|
||||
@rm -f *_ *~ *.bak *.log *.out *.tar
|
||||
|
||||
run: clean ${PROGRAM}
|
||||
./${PROGRAM}
|
||||
|
||||
# tar the current directory
|
||||
MY_DIR = `basename ${PWD}`
|
||||
tar: clean_all
|
||||
@echo "Tar the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
# tar cf `basename ${PWD}`.tar *
|
||||
|
||||
doc:
|
||||
doxygen Doxyfile
|
||||
|
||||
#########################################################################
|
||||
|
||||
.cpp.o:
|
||||
$(CXX) -c $(CXXFLAGS) -o $@ $<
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) -o $@ $<
|
||||
|
||||
.f.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
##################################################################################################
|
||||
# # some tools
|
||||
# # Cache behaviour (CXXFLAGS += -g tracks down to source lines)
|
||||
# cache: ${PROGRAM}
|
||||
# valgrind --tool=callgrind --simulate-cache=yes ./$^
|
||||
# # kcachegrind callgrind.out.<pid> &
|
||||
#
|
||||
# # Check for wrong memory accesses, memory leaks, ...
|
||||
# # use smaller data sets
|
||||
# mem: ${PROGRAM}
|
||||
# valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^
|
||||
#
|
||||
# # Simple run time profiling of your code
|
||||
# # CXXFLAGS += -g -pg
|
||||
# # LINKFLAGS += -pg
|
||||
# prof: ${PROGRAM}
|
||||
# ./$^
|
||||
# gprof -b ./$^ > gp.out
|
||||
# # kprof -f gp.out -p gprof &
|
||||
#
|
||||
|
||||
|
||||
mem: inspector
|
||||
prof: amplifier
|
||||
cache: amplifier
|
||||
|
||||
gap_par_report:
|
||||
${CXX} -c -guide -parallel $(SOURCES) 2> gap.txt
|
||||
|
||||
# GUI for performance report
|
||||
amplifier: ${PROGRAM}
|
||||
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
|
||||
echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid
|
||||
amplxe-gui &
|
||||
|
||||
# GUI for Memory and Thread analyzer (race condition)
|
||||
inspector: ${PROGRAM}
|
||||
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
|
||||
#${BINDIR}../inspector_xe_2013/bin64/inspxe-gui &
|
||||
inspxe-gui &
|
||||
|
||||
advisor:
|
||||
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
|
||||
# https://software.intel.com/en-us/articles/intel-advisor-2017-update-1-what-s-new
|
||||
export ADVIXE_EXPERIMENTAL=roofline
|
||||
advixe-gui &
|
||||
|
||||
icc-info:
|
||||
icpc -# main.cpp
|
||||
|
||||
|
||||
|
||||
|
||||
181
ex5/ONEAPI_default.mk
Normal file
181
ex5/ONEAPI_default.mk
Normal file
|
|
@ -0,0 +1,181 @@
|
|||
# Basic Defintions for using INTEL compiler suite sequentially
|
||||
# requires setting of COMPILER=ONEAPI_
|
||||
|
||||
# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html
|
||||
# requires
|
||||
# source /opt/intel/oneapi/setvars.sh
|
||||
# on AMD: export MKL_DEBUG_CPU_TYPE=5
|
||||
|
||||
#BINDIR = /opt/intel/oneapi/compiler/latest/linux/bin/
|
||||
#MKL_ROOT = /opt/intel/oneapi/mkl/latest/
|
||||
#export KMP_AFFINITY=verbose,compact
|
||||
|
||||
CC = ${BINDIR}icc
|
||||
CXX = ${BINDIR}dpcpp
|
||||
F77 = ${BINDIR}ifort
|
||||
LINKER = ${CXX}
|
||||
|
||||
## Compiler flags
|
||||
WARNINGS = -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -pedantic
|
||||
WARNINGS += -Wpessimizing-move -Wredundant-move
|
||||
#-wd2015,2012,2014 -wn3
|
||||
# -Winline -Wredundant-decls -Wunreachable-code
|
||||
# -qopt-subscript-in-range
|
||||
# -vec-threshold0
|
||||
|
||||
CXXFLAGS += -O3 -std=c++17 -tbb ${WARNINGS}
|
||||
# https://stackoverflow.com/questions/67923287/how-to-resolve-no-member-named-task-in-namespace-tbb-error-when-using-oned
|
||||
# needed on Desktop-PC Haase (not needed on mephisto)
|
||||
CXXFLAGS += -D_GLIBCXX_USE_TBB_PAR_BACKEND=0
|
||||
#CXXFLAGS += -DMKL_ILP64 -I"${MKLROOT}/include"
|
||||
#CXXFLAGS += -DMKL_ILP32 -I"${MKLROOT}/include"
|
||||
LINKFLAGS += -O3 -tbb
|
||||
#LINKFLAGS += -no-prec-div
|
||||
|
||||
# interprocedural optimization
|
||||
CXXFLAGS += -ipo
|
||||
LINKFLAGS += -ipo
|
||||
LINKFLAGS += -flto
|
||||
|
||||
# annotated Assembler file
|
||||
ANNOTED = -fsource-asm -S
|
||||
|
||||
#architecture
|
||||
CPU = -march=core-avx2
|
||||
#CPU += -mtp=zen
|
||||
# -xCORE-AVX2
|
||||
# -axcode COMMON-AVX512 -axcode MIC-AVX512 -axcode CORE-AVX512 -axcode CORE-AVX2
|
||||
CXXFLAGS += ${CPU}
|
||||
LINKFLAGS += ${CPU}
|
||||
|
||||
# use MKL by INTEL
|
||||
# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html
|
||||
# sequential MKL
|
||||
# use the 32 bit interface (LP64) instead of 64 bit interface (ILP64)
|
||||
CXXFLAGS += -qmkl=sequential -UMKL_ILP64
|
||||
LINKFLAGS += -O3 -qmkl=sequential -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
|
||||
#LINKFLAGS += -O3 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
|
||||
|
||||
# shared libs: https://aur.archlinux.org/packages/intel-oneapi-compiler-static
|
||||
# install intel-oneapi-compiler-static
|
||||
# or
|
||||
LINKFLAGS += -shared-intel
|
||||
|
||||
|
||||
OPENMP = -qopenmp
|
||||
CXXFLAGS += ${OPENMP}
|
||||
LINKFLAGS += ${OPENMP}
|
||||
|
||||
|
||||
# profiling tools
|
||||
#CXXFLAGS += -pg
|
||||
#LINKFLAGS += -pg
|
||||
# -vec-report=3
|
||||
# -qopt-report=5 -qopt-report-phase=vec -qopt-report-phase=openmp
|
||||
# -guide -parallel
|
||||
# -guide-opts=string -guide-par[=n] -guide-vec[=n]
|
||||
# -auto-p32 -simd
|
||||
|
||||
# Reports: https://software.intel.com/en-us/articles/getting-the-most-out-of-your-intel-compiler-with-the-new-optimization-reports
|
||||
#CXXFLAGS += -qopt-report=5 -qopt-report-phase=vec,par
|
||||
#CXXFLAGS += -qopt-report=5 -qopt-report-phase=cg
|
||||
# Redirect report from *.optrpt to stderr
|
||||
# -qopt-report-file=stderr
|
||||
# Guided paralellization
|
||||
# -guide -parallel
|
||||
# -guide-opts=string -guide-par[=n] -guide-vec[=n]
|
||||
# -auto-p32 -simd
|
||||
|
||||
## run time checks
|
||||
# https://www.intel.com/content/www/us/en/develop/documentation/fortran-compiler-oneapi-dev-guide-and-reference/top/compiler-reference/compiler-options/offload-openmp-and-parallel-processing-options/par-runtime-control-qpar-runtime-control.html
|
||||
|
||||
|
||||
default: ${PROGRAM}
|
||||
|
||||
${PROGRAM}: ${OBJECTS}
|
||||
$(LINKER) $^ ${LINKFLAGS} -o $@
|
||||
|
||||
clean:
|
||||
rm -f ${PROGRAM} ${OBJECTS} *.optrpt
|
||||
|
||||
clean_all:: clean
|
||||
@rm -f *_ *~ *.bak *.log *.out *.tar
|
||||
|
||||
run: clean ${PROGRAM}
|
||||
./${PROGRAM} ${PARAMS}
|
||||
|
||||
# tar the current directory
|
||||
MY_DIR = `basename ${PWD}`
|
||||
tar: clean_all
|
||||
@echo "Tar the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
# tar cf `basename ${PWD}`.tar *
|
||||
|
||||
doc:
|
||||
doxygen Doxyfile
|
||||
|
||||
#########################################################################
|
||||
|
||||
.cpp.o:
|
||||
$(CXX) -c $(CXXFLAGS) -o $@ $<
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) -o $@ $<
|
||||
|
||||
.f.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
##################################################################################################
|
||||
# some tools
|
||||
# Cache behaviour (CXXFLAGS += -g tracks down to source lines)
|
||||
# https://software.intel.com/content/www/us/en/develop/documentation/vtune-help/top/analyze-performance/microarchitecture-analysis-group/memory-access-analysis.html
|
||||
|
||||
mem: inspector
|
||||
prof: vtune
|
||||
cache: inspector
|
||||
|
||||
gap_par_report:
|
||||
${CXX} -c -guide -parallel $(SOURCES) 2> gap.txt
|
||||
|
||||
# GUI for performance report
|
||||
amplifier: ${PROGRAM}
|
||||
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
|
||||
echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid
|
||||
amplxe-gui &
|
||||
|
||||
# GUI for Memory and Thread analyzer (race condition)
|
||||
inspector: ${PROGRAM}
|
||||
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
|
||||
# inspxe-gui &
|
||||
vtune-gui ./${PROGRAM} &
|
||||
|
||||
advisor:
|
||||
# source /opt/intel/oneapi/advisor/2021.2.0/advixe-vars.sh
|
||||
# /opt/intel/oneapi/advisor/latest/bin64/advixe-gui &
|
||||
advisor --collect=survey ./${PROGRAM} ${PARAMS}
|
||||
# advisor --collect=roofline ./${PROGRAM} ${PARAMS}
|
||||
advisor --report=survey --project-dir=./ src:r=./ --format=csv --report-output=./out/survey.csv
|
||||
# advisor-gui
|
||||
|
||||
vtune:
|
||||
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
|
||||
# https://software.intel.com/en-us/articles/intel-advisor-2017-update-1-what-s-new
|
||||
export ADVIXE_EXPERIMENTAL=roofline
|
||||
vtune -collect hotspots ./${PROGRAM} ${PARAMS}
|
||||
vtune -report hotspots -r r000hs > vtune.out
|
||||
# vtune-gui ./${PROGRAM} &
|
||||
|
||||
icc-info:
|
||||
icpc -# main.cpp
|
||||
|
||||
# MKL on AMD
|
||||
# https://www.computerbase.de/2019-11/mkl-workaround-erhoeht-leistung-auf-amd-ryzen/
|
||||
#
|
||||
# https://sites.google.com/a/uci.edu/mingru-yang/programming/mkl-has-bad-performance-on-an-amd-cpu
|
||||
# export MKL_DEBUG_CPU_TYPE=5
|
||||
# export MKL_NUM_THRAEDS=1
|
||||
# export MKL_DYNAMIC=false
|
||||
# on Intel compiler
|
||||
# http://publicclu2.blogspot.com/2013/05/intel-complier-suite-reference-card.html
|
||||
96
ex5/PGI_default.mk
Normal file
96
ex5/PGI_default.mk
Normal file
|
|
@ -0,0 +1,96 @@
|
|||
# Basic Defintions for using PGI-compiler suite sequentially
|
||||
# requires setting of COMPILER=PGI_
|
||||
# OPTIRUN = optirun
|
||||
# on mephisto:
|
||||
#CXXFLAGS += -I/share/apps/atlas/include
|
||||
#LINKFLAGS += -L/share/apps/atlas/lib
|
||||
#LINKFLAGS += -lcblas -latlas
|
||||
|
||||
LINKFLAGS += -lblas
|
||||
|
||||
CC = pgcc
|
||||
CXX = pgc++
|
||||
F77 = pgfortran
|
||||
LINKER = ${CXX}
|
||||
|
||||
|
||||
WARNINGS = -Minform=warn
|
||||
|
||||
#PGI_PROFILING = -Minfo=loop,vect,opt,intensity,mp,accel
|
||||
PGI_PROFILING = -Minfo=ccff,accel,ipa,loop,lre,mp,opt,par,unified,vect,intensity
|
||||
|
||||
# -Minfo
|
||||
# -Mprof=lines
|
||||
|
||||
CXXFLAGS += -std=c++14 -O3 -fast -DNDEBUG ${PGI_PROFILING} ${WARNINGS}
|
||||
CXXFLAGS += -Mvect -Mcache_align -Msafeptr -Mprefetch -Mlre -Mdepchk
|
||||
#-Msmart
|
||||
|
||||
LINKFLAGS += ${PGI_PROFILING}
|
||||
#-lcblas
|
||||
# OpenMP
|
||||
CXXFLAGS += -mp=align,bind,numa -Mneginfo=mp
|
||||
LINKFLAGS += -mp=allcores,bind,numa
|
||||
|
||||
default: ${PROGRAM}
|
||||
|
||||
${PROGRAM}: ${OBJECTS}
|
||||
$(LINKER) $^ ${LINKFLAGS} -o $@
|
||||
|
||||
clean:
|
||||
@rm -f ${PROGRAM} ${OBJECTS}
|
||||
|
||||
clean_all:: clean
|
||||
@rm -f *_ *~ *.bak *.log *.out *.tar
|
||||
|
||||
run: clean ${PROGRAM}
|
||||
./${PROGRAM}
|
||||
|
||||
# tar the current directory
|
||||
MY_DIR = `basename ${PWD}`
|
||||
tar: clean_all
|
||||
@echo "Tar the directory: " ${MY_DIR}
|
||||
@cd .. ;\
|
||||
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
|
||||
cd ${MY_DIR}
|
||||
# tar cf `basename ${PWD}`.tar *
|
||||
|
||||
doc:
|
||||
doxygen Doxyfile
|
||||
|
||||
#########################################################################
|
||||
|
||||
.cpp.o:
|
||||
$(CXX) -c $(CXXFLAGS) -o $@ $<
|
||||
|
||||
.c.o:
|
||||
$(CC) -c $(CFLAGS) -o $@ $<
|
||||
|
||||
.f.o:
|
||||
$(F77) -c $(FFLAGS) -o $@ $<
|
||||
|
||||
##################################################################################################
|
||||
# # some tools
|
||||
# # Simple run time profiling of your code
|
||||
# # CXXFLAGS += -g
|
||||
# # LINKFLAGS +=
|
||||
|
||||
|
||||
# Profiling options PGI, see: pgprof -h
|
||||
PROF_FILE = jac.pgprof
|
||||
# CPU_PROF = -allcache
|
||||
CPU_PROF = --cpu-profiling on --analysis-metrics
|
||||
# GPU_PROF = -cuda=gmem,branch,cc13 -cudainit
|
||||
#GPU_PROF = -cuda=branch:cc20
|
||||
#
|
||||
|
||||
cache: prof
|
||||
|
||||
prof: ${PROGRAM}
|
||||
# ./$^
|
||||
# $(CUDA_HOME)/bin/nvvp &
|
||||
# more /opt/pgi/linux86-64/16.10/bin/pgcollectrc
|
||||
${OPTIRUN} ${BINDIR}pgprof ${CPU_PROF} -o $(PROF_FILE) ./$^
|
||||
${OPTIRUN} ${BINDIR}pgprof -i $(PROF_FILE) 2> prof.out
|
||||
|
||||
|
||||
30
ex5/ex5_1/Makefile
Normal file
30
ex5/ex5_1/Makefile
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
#
|
||||
# use GNU-Compiler tools
|
||||
COMPILER=GCC_
|
||||
# alternatively from the shell
|
||||
# export COMPILER=GCC_
|
||||
# or, alternatively from the shell
|
||||
# make COMPILER=GCC_
|
||||
|
||||
# use Intel compilers
|
||||
#COMPILER=ICC_
|
||||
|
||||
# use PGI compilers
|
||||
# COMPILER=PGI_
|
||||
|
||||
|
||||
SOURCES = main.cpp mylib.cpp
|
||||
OBJECTS = $(SOURCES:.cpp=.o)
|
||||
|
||||
PROGRAM = main.${COMPILER}
|
||||
|
||||
# uncomment the next to lines for debugging and detailed performance analysis
|
||||
CXXFLAGS += -g
|
||||
LINKFLAGS += -g
|
||||
# do not use -pg with PGI compilers
|
||||
|
||||
ifndef COMPILER
|
||||
COMPILER=GCC_
|
||||
endif
|
||||
|
||||
include ../${COMPILER}default.mk
|
||||
99
ex5/ex5_1/check_env.h
Normal file
99
ex5/ex5_1/check_env.h
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#ifdef _OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
#include <unordered_map>
|
||||
|
||||
//#####################################
|
||||
// G.Haase
|
||||
// See https://sourceforge.net/p/predef/wiki/Compilers/
|
||||
// http://www.cplusplus.com/doc/tutorial/preprocessor/
|
||||
// also: export OMP_DISPLAY_ENV=VERBOSE
|
||||
//#####################################
|
||||
/** Checks for compilers, its versions, threads etc.
|
||||
*
|
||||
@param[in] argc number of command line arguemnts
|
||||
@param[in] argv command line arguments as array of C-strings
|
||||
*/
|
||||
template <class T>
|
||||
void check_env(T argc, char const *argv[])
|
||||
{
|
||||
std::cout << "\n#######################################################################\n";
|
||||
std::cout << "Code :";
|
||||
for (T k = 0; k < argc; ++k) std::cout << " " << argv[k];
|
||||
std::cout << std::endl;
|
||||
|
||||
// compiler: https://sourceforge.net/p/predef/wiki/Compilers/
|
||||
std::cout << "Compiler: ";
|
||||
#if defined __INTEL_COMPILER
|
||||
#pragma message(" ########## INTEL ###############")
|
||||
std::cout << "INTEL " << __INTEL_COMPILER;
|
||||
// Ignore warnings for #pragma acc unrecognice
|
||||
#pragma warning disable 161
|
||||
// Ignore warnings for #pragma omp unrecognice
|
||||
#pragma warning disable 3180
|
||||
|
||||
#elif defined __PGI
|
||||
#pragma message(" ########## PGI ###############")
|
||||
std::cout << "PGI " << __PGIC__ << "." << __PGIC_MINOR__ << "." << __PGIC_PATCHLEVEL__;
|
||||
#elif defined __clang__
|
||||
#pragma message(" ########## CLANG ###############")
|
||||
std::cout << "CLANG " << __clang_major__ << "." << __clang_minor__ << "."; // << __clang_patchlevel__;
|
||||
#elif defined __GNUC__
|
||||
#pragma message(" ########## Gnu ###############")
|
||||
std::cout << "Gnu " << __GNUC__ << "." << __GNUC_MINOR__ << "." << __GNUC_PATCHLEVEL__;
|
||||
#else
|
||||
#pragma message(" ########## unknown Compiler ###############")
|
||||
std::cout << "unknown";
|
||||
#endif
|
||||
std::cout << " C++ standard: " << __cplusplus << std::endl;
|
||||
|
||||
// Parallel environments
|
||||
std::cout << "Parallel: ";
|
||||
#if defined MPI_VERSION
|
||||
#pragma message(" ########## MPI ###############")
|
||||
#ifdef OPEN_MPI
|
||||
std::cout << "OpenMPI ";
|
||||
#else
|
||||
std::cout << "MPI ";
|
||||
#endif
|
||||
std::cout << MPI_VERSION << "." << MPI_SUBVERSION << " ";
|
||||
#endif
|
||||
|
||||
#ifdef _OPENMP
|
||||
//https://www.openmp.org/specifications/
|
||||
//https://stackoverflow.com/questions/1304363/how-to-check-the-version-of-openmp-on-linux
|
||||
std::unordered_map<unsigned, std::string> const map{
|
||||
{200505, "2.5"}, {200805, "3.0"}, {201107, "3.1"}, {201307, "4.0"}, {201511, "4.5"}, {201611, "5.0"}, {201811, "5.0"}};
|
||||
#pragma message(" ########## OPENMP ###############")
|
||||
//std::cout << _OPENMP;
|
||||
std::cout << "OpenMP ";
|
||||
try {
|
||||
std::cout << map.at(_OPENMP);
|
||||
}
|
||||
catch (...) {
|
||||
std::cout << _OPENMP;
|
||||
}
|
||||
#pragma omp parallel
|
||||
{
|
||||
#pragma omp master
|
||||
{
|
||||
const int nn = omp_get_num_threads(); // OpenMP
|
||||
std::cout << " ---> " << nn << " Threads ";
|
||||
}
|
||||
#pragma omp barrier
|
||||
}
|
||||
|
||||
#endif
|
||||
#ifdef _OPENACC
|
||||
#pragma message(" ########## OPENACC ###############")
|
||||
std::cout << "OpenACC ";
|
||||
#endif
|
||||
std::cout << std::endl;
|
||||
std::cout << "Date : " << __DATE__ << " " << __TIME__;
|
||||
std::cout << "\n#######################################################################\n";
|
||||
}
|
||||
// HG
|
||||
|
||||
142
ex5/ex5_1/main.cpp
Normal file
142
ex5/ex5_1/main.cpp
Normal file
|
|
@ -0,0 +1,142 @@
|
|||
#include "check_env.h"
|
||||
#include "mylib.h"
|
||||
#include <cstdlib> // atoi()
|
||||
#include <cstring> // strncmp()
|
||||
#include <ctime>
|
||||
#include <iostream>
|
||||
#include <omp.h> // OpenMP
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char const *argv[])
|
||||
{
|
||||
omp_set_schedule(omp_sched_static, 2000000);
|
||||
//omp_set_schedule(omp_sched_dynamic, 1000000);
|
||||
//omp_set_schedule(omp_sched_guided, 1000000);
|
||||
//omp_set_schedule(omp_sched_auto, 1); // chunk size does not matter for auto
|
||||
|
||||
|
||||
// Speedup for different number of cores (incl. hyperthreading)
|
||||
omp_set_num_threads(8);
|
||||
|
||||
// Print number of available processors
|
||||
cout << "Number of available processors: " << omp_get_num_procs() << endl;
|
||||
|
||||
// Currently executing parallel code? -> no
|
||||
cout << "Currently in parallel? " << omp_in_parallel() << endl;
|
||||
|
||||
|
||||
int const NLOOPS = 10; // chose a value such that the benchmark runs at least 10 sec.
|
||||
unsigned int N = 500000001;
|
||||
|
||||
|
||||
//##########################################################################
|
||||
// Read Parameter from command line (C++ style)
|
||||
cout << "Checking command line parameters for: -n <number> " << endl;
|
||||
for (int i = 1; i < argc; i++)
|
||||
{
|
||||
cout << " arg[" << i << "] = " << argv[i] << endl;
|
||||
string ss(argv[i]);
|
||||
if ("-n"==ss && i + 1 < argc) // found "-n" followed by another parameter
|
||||
{
|
||||
N = static_cast<unsigned int>(atoi(argv[i + 1]));
|
||||
}
|
||||
else
|
||||
{
|
||||
cout << "Corect call: " << argv[0] << " -n <number>\n";
|
||||
}
|
||||
}
|
||||
|
||||
cout << "\nN = " << N << endl;
|
||||
|
||||
check_env(argc, argv);
|
||||
//########################################################################
|
||||
int nthreads; // OpenMP
|
||||
#pragma omp parallel default(none) shared(cout,nthreads)
|
||||
{
|
||||
stringstream inparallel;
|
||||
inparallel << "Currently in parallel? " << omp_in_parallel() << endl;
|
||||
|
||||
|
||||
int const th_id = omp_get_thread_num(); // OpenMP
|
||||
int const nthrds = omp_get_num_threads(); // OpenMP
|
||||
stringstream ss;
|
||||
ss << "C++: Hello World from thread " << th_id << " / " << nthrds << endl;
|
||||
#pragma omp critical
|
||||
{
|
||||
cout << ss.str(); // output to a shared ressource
|
||||
cout << inparallel.str() << endl;
|
||||
}
|
||||
#pragma omp master
|
||||
nthreads = nthrds; // transfer nn to to master thread
|
||||
}
|
||||
cout << " " << nthreads << " threads have been started." << endl;
|
||||
|
||||
//##########################################################################
|
||||
// Memory allocation
|
||||
cout << "Memory allocation\n";
|
||||
|
||||
vector<double> x(N), y(N);
|
||||
|
||||
cout.precision(2);
|
||||
cout << 2.0 * N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
|
||||
cout.precision(6);
|
||||
|
||||
//##########################################################################
|
||||
// Data initialization
|
||||
// Special: x_i = i+1; y_i = 1/x_i ==> <x,y> == N
|
||||
for (unsigned int i = 0; i < N; ++i)
|
||||
{
|
||||
x[i] = i + 1;
|
||||
y[i] = 1.0 / x[i];
|
||||
}
|
||||
|
||||
//##########################################################################
|
||||
cout << "\nStart Benchmarking\n";
|
||||
|
||||
// Do calculation
|
||||
double tstart = omp_get_wtime(); // OpenMP
|
||||
|
||||
double sk(0.0);
|
||||
for (int i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
//sk = scalar(x, y);
|
||||
sk = scalar_parallel(x, y);
|
||||
//sk = scalar_trans(x, y);
|
||||
//sk = norm(x);
|
||||
}
|
||||
|
||||
double t1 = omp_get_wtime() - tstart; // OpenMP
|
||||
|
||||
t1 /= NLOOPS; // divide by number of function calls
|
||||
|
||||
//##########################################################################
|
||||
// Check the correct result
|
||||
cout << "\n <x,y> = " << sk << endl;
|
||||
if (static_cast<unsigned int>(sk) != N)
|
||||
{
|
||||
cout << " !! W R O N G result !!\n";
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
//##########################################################################
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
cout << "Total benchmarking time: " << t1*NLOOPS << endl;
|
||||
cout << "Timing in sec. : " << t1 << endl;
|
||||
cout << "GFLOPS : " << 2.0 * N / t1 / 1024 / 1024 / 1024 << endl;
|
||||
cout << "GiByte/s : " << 2.0 * N / t1 / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
|
||||
|
||||
//#########################################################################
|
||||
|
||||
cout << "\n Try the reduction with an STL-vektor \n";
|
||||
|
||||
auto vr = reduction_vec_append(5);
|
||||
cout << "done\n";
|
||||
cout << vr << endl;
|
||||
|
||||
|
||||
return 0;
|
||||
} // memory for x and y will be deallocated their destructors
|
||||
137
ex5/ex5_1/mylib.cpp
Normal file
137
ex5/ex5_1/mylib.cpp
Normal file
|
|
@ -0,0 +1,137 @@
|
|||
#include "mylib.h"
|
||||
#include <cassert> // assert()
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <functional> // multiplies<>{}
|
||||
#include <list>
|
||||
#include <numeric> // iota()
|
||||
#ifdef _OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
|
||||
double scalar_parallel(vector<double> const &x, vector<double> const &y)
|
||||
{
|
||||
assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
|
||||
size_t const N = x.size();
|
||||
double sum = 0.0;
|
||||
#pragma omp parallel default(none) shared(x,y,N, cout) reduction(+:sum)
|
||||
{
|
||||
const size_t nthreads = omp_get_num_threads();
|
||||
const size_t threadnum = omp_get_thread_num();
|
||||
const size_t chunksize = N/nthreads;
|
||||
|
||||
|
||||
size_t start = threadnum*chunksize;
|
||||
size_t end = start + chunksize;
|
||||
if (threadnum == nthreads - 1)
|
||||
end = N;
|
||||
|
||||
|
||||
for (size_t i = start; i < end; ++i)
|
||||
{
|
||||
sum += x[i] * y[i];
|
||||
}
|
||||
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
vector<int> reduction_vec_append(int n)
|
||||
{
|
||||
vector<int> vec(n);
|
||||
#pragma omp parallel default(none) shared(cout) reduction(VecAppend:vec)
|
||||
{
|
||||
#pragma omp barrier
|
||||
#pragma omp critical
|
||||
cout << omp_get_thread_num() << " : " << vec.size() << endl;
|
||||
#pragma omp barrier
|
||||
iota( vec.begin(),vec.end(), omp_get_thread_num() );
|
||||
#pragma omp barrier
|
||||
|
||||
}
|
||||
return vec;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
double scalar(vector<double> const &x, vector<double> const &y)
|
||||
{
|
||||
assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
|
||||
size_t const N = x.size();
|
||||
double sum = 0.0;
|
||||
#pragma omp parallel for default(none) shared(x,y,N) reduction(+:sum) schedule(runtime) // added schedule(runtime)
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
sum += x[i] * y[i];
|
||||
//sum += exp(x[i])*log(y[i]);
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
double norm(vector<double> const &x)
|
||||
{
|
||||
size_t const N = x.size();
|
||||
double sum = 0.0;
|
||||
#pragma omp parallel for default(none) shared(x,N) reduction(+:sum) schedule(runtime) // added schedule(runtime)
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
sum += x[i]*x[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
vector<int> reduction_vec(int n)
|
||||
{
|
||||
vector<int> vec(n);
|
||||
#pragma omp parallel default(none) shared(cout) reduction(VecAdd:vec)
|
||||
{
|
||||
#pragma omp barrier
|
||||
#pragma omp critical
|
||||
cout << omp_get_thread_num() << " : " << vec.size() << endl;
|
||||
#pragma omp barrier
|
||||
iota( vec.begin(),vec.end(), omp_get_thread_num() );
|
||||
#pragma omp barrier
|
||||
|
||||
}
|
||||
return vec;
|
||||
}
|
||||
|
||||
|
||||
double scalar_trans(vector<double> const &x, vector<double> const &y)
|
||||
{
|
||||
assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
|
||||
vector<double> z(x.size());
|
||||
//list<double> z(x.size()); // parallel for-loop on iterators not possible (missing 'operator-')
|
||||
// c++-20 CLANG_, ONEAPI_:condition of OpenMP for loop must be a relational comparison
|
||||
|
||||
transform(cbegin(x),cend(x),cbegin(y),begin(z),std::multiplies<>{});
|
||||
|
||||
double sum = 0.0;
|
||||
#pragma omp parallel for default(none) shared(z) reduction(+:sum) schedule(runtime) // added schedule(runtime)
|
||||
for (auto pi = cbegin(z); pi!=cend(z); ++pi)
|
||||
{
|
||||
sum += *pi;
|
||||
}
|
||||
//for (auto val: z)
|
||||
//{
|
||||
//sum += val;
|
||||
//}
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
|
||||
88
ex5/ex5_1/mylib.h
Normal file
88
ex5/ex5_1/mylib.h
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
#pragma once
|
||||
#include <cassert>
|
||||
#include <iomanip> // setw()
|
||||
#include <iostream>
|
||||
#include <omp.h>
|
||||
#include <vector>
|
||||
|
||||
/** Inner product
|
||||
@param[in] x vector
|
||||
@param[in] y vector
|
||||
@return resulting Euclidian inner product <x,y>
|
||||
*/
|
||||
double scalar_parallel(std::vector<double> const &x, std::vector<double> const &y);
|
||||
double scalar(std::vector<double> const &x, std::vector<double> const &y);
|
||||
double scalar_trans(std::vector<double> const &x, std::vector<double> const &y);
|
||||
|
||||
|
||||
|
||||
// Declare additional reduction operation in OpenMP for STL-vector
|
||||
#pragma omp declare reduction(VecAppend : std::vector<int> : omp_out.insert(omp_out.end(), omp_in.begin(), omp_in.end())) \
|
||||
initializer (omp_priv=omp_orig)
|
||||
|
||||
std::vector<int> reduction_vec_append(int n);
|
||||
|
||||
|
||||
/** l2-norm
|
||||
@param[in] x vector
|
||||
@return resulting Euclidian norm
|
||||
*/
|
||||
double norm(std::vector<double> const &x);
|
||||
|
||||
/** Vector @p b adds its elements to vector @p a .
|
||||
@param[in] a vector
|
||||
@param[in] b vector
|
||||
@return a+=b componentwise
|
||||
*/
|
||||
template<class T>
|
||||
std::vector<T> &operator+=(std::vector<T> &a, std::vector<T> const &b)
|
||||
{
|
||||
assert(a.size()==b.size());
|
||||
for (size_t k = 0; k < a.size(); ++k) {
|
||||
a[k] += b[k];
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
// Declare the reduction operation in OpenMP for an STL-vector
|
||||
// omp_out += omp_in requires operator+=(vector<int> &, vector<int> const &) from above
|
||||
// ------------------------------------------------------------
|
||||
// https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm
|
||||
// https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c
|
||||
// https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size
|
||||
// see also p.74ff in https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf
|
||||
#pragma omp declare reduction(VecAdd : std::vector<int> : omp_out += omp_in) \
|
||||
initializer (omp_priv=omp_orig)
|
||||
|
||||
// Templates are n o t possible, i.e. the reduction has to be declared fore a specified type.
|
||||
//template <class T>
|
||||
//#pragma omp declare reduction(VecAdd : std::vector<T> : omp_out += omp_in) initializer (omp_priv(omp_orig))
|
||||
// MS: template nach #pragma !?
|
||||
|
||||
// ------------------------------------------------------------
|
||||
|
||||
/** Test for vector reduction.
|
||||
*
|
||||
* The thread-private vectors of size @p n are initialized via @f$v_k^{tID}=tID+k@f$.
|
||||
* Afterwards these vectors are accumulated, i.e.,
|
||||
* @f$v_k= \sum_{tID=0}^{numThreads} v_k^{tID}@f$.
|
||||
*
|
||||
* @param[in] n size of global/private vector
|
||||
* @return resulting global vector.
|
||||
*/
|
||||
std::vector<int> reduction_vec(int n);
|
||||
|
||||
|
||||
|
||||
/** Output of a vector.
|
||||
@param[in,out] s output stream
|
||||
@param[in] x vector
|
||||
@return modified output stream
|
||||
*/
|
||||
template <class T>
|
||||
std::ostream &operator<<(std::ostream &s, std::vector<T> const &x)
|
||||
{
|
||||
for (auto const &v : x) s << std::setw(4) << v << " ";
|
||||
return s;
|
||||
}
|
||||
|
||||
70
ex5/ex5_1/timing.h
Normal file
70
ex5/ex5_1/timing.h
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
#pragma once
|
||||
#include <chrono> // timing
|
||||
#include <stack>
|
||||
|
||||
using Clock = std::chrono::system_clock; //!< The wall clock timer chosen
|
||||
//using Clock = std::chrono::high_resolution_clock;
|
||||
using TPoint= std::chrono::time_point<Clock>;
|
||||
|
||||
// [Galowicz, C++17 STL Cookbook, p. 29]
|
||||
inline
|
||||
std::stack<TPoint> MyStopWatch; //!< starting time of stopwatch
|
||||
|
||||
/** Starts stopwatch timer.
|
||||
* Use as @code tic(); myfunction(...) ; double tsec = toc(); @endcode
|
||||
*
|
||||
* The timining is allowed to be nested and the recent time is stored on top of the stack.
|
||||
*
|
||||
* @return recent time
|
||||
* @see toc
|
||||
*/
|
||||
inline auto tic()
|
||||
{
|
||||
MyStopWatch.push(Clock::now());
|
||||
return MyStopWatch.top();
|
||||
}
|
||||
|
||||
/** Returns the elapsed time from stopwatch.
|
||||
*
|
||||
* The time from top of the stack is used
|
||||
* if time point @p t_b is not passed as input parameter.
|
||||
* Use as @code tic(); myfunction(...) ; double tsec = toc(); @endcode
|
||||
* or as @code auto t_b = tic(); myfunction(...) ; double tsec = toc(t_b); @endcode
|
||||
* The last option is to be used in the case of
|
||||
* non-nested but overlapping time measurements.
|
||||
*
|
||||
* @param[in] t_b start time of some stop watch
|
||||
* @return elapsed time in seconds.
|
||||
*
|
||||
*/
|
||||
inline double toc(TPoint const &t_b = MyStopWatch.top())
|
||||
{
|
||||
// https://en.cppreference.com/w/cpp/chrono/treat_as_floating_point
|
||||
using Unit = std::chrono::seconds;
|
||||
using FpSeconds = std::chrono::duration<double, Unit::period>;
|
||||
auto t_e = Clock::now();
|
||||
MyStopWatch.pop();
|
||||
return FpSeconds(t_e-t_b).count();
|
||||
}
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
/** Executes function @p f and measures/prints elapsed wall clock time in seconds
|
||||
*
|
||||
* Call as
|
||||
* @code measure("Time for (b = b + 1)", [&]() {
|
||||
thrust::transform(b.begin(), b.end(), b.begin(), increment());
|
||||
}); @endcode
|
||||
*
|
||||
* @param[in] label additional string to be printed with the measurement.
|
||||
* @param[in] f function to execute.
|
||||
* @author Therese Bösmüller, 2025
|
||||
*
|
||||
*/
|
||||
auto measure = [](const std::string& label, auto&& f) {
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
f();
|
||||
auto stop = std::chrono::high_resolution_clock::now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
|
||||
std::cout << label << ": " << duration << " microseconds" << std::endl;
|
||||
}; // ';' is needed for a visible documentation of this lambda-function
|
||||
31
ex5/ex5_2/Makefile
Normal file
31
ex5/ex5_2/Makefile
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
#
|
||||
# use GNU-Compiler tools
|
||||
COMPILER=GCC_
|
||||
# alternatively from the shell
|
||||
# export COMPILER=GCC_
|
||||
# or, alternatively from the shell
|
||||
# make COMPILER=GCC_
|
||||
|
||||
# use Intel compilers
|
||||
#COMPILER=ICC_
|
||||
|
||||
# use PGI compilers
|
||||
# COMPILER=PGI_
|
||||
|
||||
|
||||
SOURCES = main.cpp mylib.cpp
|
||||
OBJECTS = $(SOURCES:.cpp=.o)
|
||||
|
||||
PROGRAM = main.${COMPILER}
|
||||
|
||||
# uncomment the next to lines for debugging and detailed performance analysis
|
||||
CXXFLAGS += -g
|
||||
LINKFLAGS += -g
|
||||
# do not use -pg with PGI compilers
|
||||
|
||||
ifndef COMPILER
|
||||
COMPILER=GCC_
|
||||
endif
|
||||
|
||||
|
||||
include ../${COMPILER}default.mk
|
||||
501
ex5/ex5_2/data_1.txt
Normal file
501
ex5/ex5_2/data_1.txt
Normal file
|
|
@ -0,0 +1,501 @@
|
|||
141
|
||||
261
|
||||
87
|
||||
430
|
||||
258
|
||||
298
|
||||
425
|
||||
120
|
||||
496
|
||||
707
|
||||
244
|
||||
786
|
||||
75
|
||||
394
|
||||
4
|
||||
221
|
||||
2
|
||||
190
|
||||
143
|
||||
269
|
||||
175
|
||||
139
|
||||
599
|
||||
902
|
||||
940
|
||||
222
|
||||
483
|
||||
377
|
||||
524
|
||||
265
|
||||
69
|
||||
437
|
||||
174
|
||||
27
|
||||
955
|
||||
431
|
||||
962
|
||||
763
|
||||
8
|
||||
681
|
||||
706
|
||||
646
|
||||
553
|
||||
219
|
||||
773
|
||||
229
|
||||
371
|
||||
891
|
||||
857
|
||||
403
|
||||
319
|
||||
609
|
||||
911
|
||||
910
|
||||
592
|
||||
333
|
||||
854
|
||||
443
|
||||
905
|
||||
34
|
||||
533
|
||||
717
|
||||
180
|
||||
337
|
||||
188
|
||||
322
|
||||
404
|
||||
549
|
||||
49
|
||||
553
|
||||
275
|
||||
242
|
||||
244
|
||||
155
|
||||
957
|
||||
936
|
||||
819
|
||||
729
|
||||
176
|
||||
361
|
||||
189
|
||||
2
|
||||
317
|
||||
700
|
||||
626
|
||||
544
|
||||
440
|
||||
288
|
||||
502
|
||||
762
|
||||
763
|
||||
577
|
||||
748
|
||||
646
|
||||
124
|
||||
505
|
||||
348
|
||||
93
|
||||
148
|
||||
199
|
||||
673
|
||||
432
|
||||
695
|
||||
257
|
||||
10
|
||||
533
|
||||
280
|
||||
947
|
||||
907
|
||||
393
|
||||
25
|
||||
672
|
||||
838
|
||||
972
|
||||
57
|
||||
451
|
||||
583
|
||||
687
|
||||
720
|
||||
651
|
||||
727
|
||||
374
|
||||
582
|
||||
117
|
||||
58
|
||||
980
|
||||
285
|
||||
595
|
||||
963
|
||||
186
|
||||
194
|
||||
342
|
||||
933
|
||||
391
|
||||
274
|
||||
152
|
||||
398
|
||||
375
|
||||
132
|
||||
436
|
||||
92
|
||||
615
|
||||
11
|
||||
574
|
||||
790
|
||||
236
|
||||
449
|
||||
570
|
||||
62
|
||||
497
|
||||
643
|
||||
222
|
||||
838
|
||||
972
|
||||
847
|
||||
506
|
||||
279
|
||||
747
|
||||
237
|
||||
958
|
||||
621
|
||||
601
|
||||
173
|
||||
91
|
||||
256
|
||||
859
|
||||
912
|
||||
700
|
||||
726
|
||||
230
|
||||
577
|
||||
811
|
||||
404
|
||||
989
|
||||
90
|
||||
321
|
||||
512
|
||||
61
|
||||
726
|
||||
557
|
||||
530
|
||||
830
|
||||
859
|
||||
790
|
||||
318
|
||||
453
|
||||
753
|
||||
110
|
||||
110
|
||||
270
|
||||
525
|
||||
973
|
||||
711
|
||||
312
|
||||
292
|
||||
851
|
||||
912
|
||||
640
|
||||
256
|
||||
89
|
||||
839
|
||||
585
|
||||
949
|
||||
62
|
||||
585
|
||||
286
|
||||
828
|
||||
191
|
||||
443
|
||||
394
|
||||
827
|
||||
677
|
||||
208
|
||||
319
|
||||
134
|
||||
672
|
||||
571
|
||||
170
|
||||
148
|
||||
477
|
||||
909
|
||||
553
|
||||
33
|
||||
54
|
||||
806
|
||||
452
|
||||
383
|
||||
790
|
||||
365
|
||||
533
|
||||
712
|
||||
872
|
||||
329
|
||||
651
|
||||
975
|
||||
76
|
||||
588
|
||||
414
|
||||
310
|
||||
264
|
||||
759
|
||||
996
|
||||
187
|
||||
782
|
||||
196
|
||||
993
|
||||
803
|
||||
425
|
||||
729
|
||||
499
|
||||
809
|
||||
357
|
||||
74
|
||||
591
|
||||
911
|
||||
194
|
||||
433
|
||||
750
|
||||
40
|
||||
947
|
||||
764
|
||||
559
|
||||
184
|
||||
498
|
||||
518
|
||||
995
|
||||
855
|
||||
963
|
||||
679
|
||||
404
|
||||
935
|
||||
480
|
||||
232
|
||||
397
|
||||
706
|
||||
559
|
||||
757
|
||||
996
|
||||
963
|
||||
536
|
||||
964
|
||||
116
|
||||
52
|
||||
305
|
||||
581
|
||||
531
|
||||
902
|
||||
541
|
||||
432
|
||||
543
|
||||
713
|
||||
17
|
||||
801
|
||||
143
|
||||
479
|
||||
257
|
||||
370
|
||||
662
|
||||
170
|
||||
279
|
||||
199
|
||||
196
|
||||
327
|
||||
881
|
||||
472
|
||||
404
|
||||
180
|
||||
969
|
||||
408
|
||||
845
|
||||
616
|
||||
377
|
||||
878
|
||||
785
|
||||
465
|
||||
814
|
||||
899
|
||||
430
|
||||
335
|
||||
597
|
||||
902
|
||||
703
|
||||
378
|
||||
735
|
||||
955
|
||||
543
|
||||
541
|
||||
312
|
||||
72
|
||||
182
|
||||
93
|
||||
464
|
||||
10
|
||||
916
|
||||
643
|
||||
2
|
||||
31
|
||||
209
|
||||
455
|
||||
128
|
||||
9
|
||||
728
|
||||
355
|
||||
781
|
||||
437
|
||||
437
|
||||
50
|
||||
50
|
||||
92
|
||||
595
|
||||
242
|
||||
842
|
||||
858
|
||||
964
|
||||
489
|
||||
221
|
||||
227
|
||||
537
|
||||
763
|
||||
348
|
||||
462
|
||||
640
|
||||
918
|
||||
162
|
||||
716
|
||||
578
|
||||
434
|
||||
885
|
||||
394
|
||||
179
|
||||
634
|
||||
625
|
||||
328
|
||||
803
|
||||
1000
|
||||
981
|
||||
128
|
||||
233
|
||||
24
|
||||
608
|
||||
111
|
||||
408
|
||||
885
|
||||
549
|
||||
370
|
||||
209
|
||||
441
|
||||
957
|
||||
125
|
||||
471
|
||||
857
|
||||
44
|
||||
692
|
||||
979
|
||||
284
|
||||
134
|
||||
686
|
||||
910
|
||||
611
|
||||
900
|
||||
194
|
||||
755
|
||||
347
|
||||
419
|
||||
156
|
||||
820
|
||||
625
|
||||
739
|
||||
806
|
||||
68
|
||||
951
|
||||
498
|
||||
756
|
||||
743
|
||||
832
|
||||
157
|
||||
458
|
||||
619
|
||||
933
|
||||
836
|
||||
896
|
||||
583
|
||||
583
|
||||
855
|
||||
35
|
||||
886
|
||||
408
|
||||
37
|
||||
747
|
||||
155
|
||||
144
|
||||
606
|
||||
255
|
||||
325
|
||||
402
|
||||
407
|
||||
387
|
||||
610
|
||||
167
|
||||
189
|
||||
95
|
||||
324
|
||||
770
|
||||
235
|
||||
741
|
||||
693
|
||||
825
|
||||
828
|
||||
294
|
||||
310
|
||||
524
|
||||
326
|
||||
832
|
||||
811
|
||||
557
|
||||
263
|
||||
681
|
||||
234
|
||||
457
|
||||
385
|
||||
539
|
||||
992
|
||||
756
|
||||
981
|
||||
235
|
||||
529
|
||||
52
|
||||
757
|
||||
602
|
||||
858
|
||||
989
|
||||
930
|
||||
410
|
||||
1
|
||||
541
|
||||
208
|
||||
220
|
||||
326
|
||||
96
|
||||
748
|
||||
749
|
||||
544
|
||||
339
|
||||
833
|
||||
553
|
||||
958
|
||||
893
|
||||
357
|
||||
547
|
||||
347
|
||||
623
|
||||
797
|
||||
746
|
||||
126
|
||||
823
|
||||
26
|
||||
415
|
||||
732
|
||||
782
|
||||
368
|
||||
|
||||
130
ex5/ex5_2/main.cpp
Normal file
130
ex5/ex5_2/main.cpp
Normal file
|
|
@ -0,0 +1,130 @@
|
|||
#include "mylib.h"
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <omp.h>
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
// read vector from file
|
||||
vector<size_t> data_vector = {};
|
||||
|
||||
ifstream input_stream("data_1.txt");
|
||||
|
||||
size_t line;
|
||||
while(input_stream >> line)
|
||||
{
|
||||
data_vector.push_back(line);
|
||||
}
|
||||
data_vector.shrink_to_fit();
|
||||
|
||||
|
||||
|
||||
|
||||
// specify loops
|
||||
size_t NLOOPS = 10000;
|
||||
|
||||
|
||||
|
||||
// ############# Parallelization with openMP #############
|
||||
// calculate arithmetic mean, geometric mean and harmonic mean
|
||||
double am_omp, gm_omp, hm_omp;
|
||||
|
||||
double tstart = omp_get_wtime();
|
||||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
means_omp(data_vector, am_omp, gm_omp, hm_omp);
|
||||
|
||||
double t_means_omp = (omp_get_wtime() - tstart)/NLOOPS;
|
||||
|
||||
// calculate minimum and maximum
|
||||
size_t min, max;
|
||||
|
||||
tstart = omp_get_wtime();
|
||||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
minmax_omp(data_vector, min, max);
|
||||
|
||||
double t_minmax_omp = (omp_get_wtime() - tstart)/NLOOPS;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// ############# Parallelization with C++ algorithms #############
|
||||
// calculate arithmetic mean, geometric mean and harmonic mean
|
||||
double am_cpp, gm_cpp, hm_cpp;
|
||||
|
||||
tstart = omp_get_wtime();
|
||||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
means_cpp(data_vector, am_cpp, gm_cpp, hm_cpp);
|
||||
|
||||
double t_means_cpp = (omp_get_wtime() - tstart)/NLOOPS;
|
||||
|
||||
// calculate minimum and maximum
|
||||
size_t min_cpp, max_cpp;
|
||||
|
||||
tstart = omp_get_wtime();
|
||||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
minmax_cpp(data_vector, min_cpp, max_cpp);
|
||||
|
||||
double t_minmax_cpp = (omp_get_wtime() - tstart)/NLOOPS;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// print results
|
||||
cout << "####### OpenMP #######" << endl;
|
||||
cout << "minimum: " << min << endl;
|
||||
cout << "maximum: " << max << endl;
|
||||
cout << "duration: " << t_minmax_omp << endl << endl;
|
||||
|
||||
cout << "arithmetic mean: " << am_omp << endl;
|
||||
cout << "geometric mean: " << gm_omp << endl;
|
||||
cout << "harmonic mean: " << hm_omp << endl;
|
||||
cout << "duration: " << t_means_omp << endl << endl;
|
||||
|
||||
|
||||
cout << "####### C++ #######" << endl;
|
||||
cout << "minimum: " << min_cpp << endl;
|
||||
cout << "maximum: " << max_cpp << endl;
|
||||
cout << "duration: " << t_minmax_cpp << endl << endl;
|
||||
|
||||
cout << "arithmetic mean: " << am_cpp << endl;
|
||||
cout << "geometric mean: " << gm_cpp << endl;
|
||||
cout << "harmonic mean: " << hm_cpp << endl;
|
||||
cout << "duration: " << t_means_cpp << endl << endl;
|
||||
|
||||
|
||||
|
||||
// ####### OpenMP #######
|
||||
// minimum: 1
|
||||
// maximum: 1000
|
||||
// duration: 3.52086e-06
|
||||
|
||||
// arithmetic mean: 498.184
|
||||
// geometric mean: 364.412
|
||||
// harmonic mean: 95.6857
|
||||
// duration: 5.90171e-06
|
||||
|
||||
// ####### C++ #######
|
||||
// minimum: 1
|
||||
// maximum: 1000
|
||||
// duration: 1.76816e-05
|
||||
|
||||
// arithmetic mean: 498.184
|
||||
// geometric mean: 364.412
|
||||
// harmonic mean: 95.6857
|
||||
// duration: 2.35728e-05
|
||||
|
||||
// --> the openMP variant is faster in both cases
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
103
ex5/ex5_2/mylib.cpp
Normal file
103
ex5/ex5_2/mylib.cpp
Normal file
|
|
@ -0,0 +1,103 @@
|
|||
#include "mylib.h"
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <execution>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <omp.h>
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
void means_omp(const std::vector<size_t> numbers, double &am, double &gm, double &hm)
|
||||
{
|
||||
size_t const n = numbers.size();
|
||||
|
||||
am = 0.;
|
||||
gm = 0.;
|
||||
hm = 0.;
|
||||
|
||||
#pragma omp parallel for shared(numbers, n, cout) reduction(+:am, gm, hm)
|
||||
for (size_t i = 0; i < n; ++i)
|
||||
{
|
||||
am += numbers[i];
|
||||
gm += log(numbers[i]);
|
||||
hm += 1.0/numbers[i];
|
||||
|
||||
// #pragma omp critical
|
||||
// {
|
||||
// cout << "Thread number " << omp_get_thread_num() << " processes value " << numbers[i] << endl;
|
||||
// }
|
||||
}
|
||||
|
||||
am /= n;
|
||||
gm = exp(gm/n);
|
||||
hm = n/hm;
|
||||
}
|
||||
|
||||
|
||||
void minmax_omp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max)
|
||||
{
|
||||
size_t const n = numbers.size();
|
||||
|
||||
global_min = -1; // gives the maximum size_t value
|
||||
global_max = 0;
|
||||
|
||||
#pragma omp parallel shared(numbers, n, global_min, global_max)
|
||||
{
|
||||
const size_t nthreads = omp_get_num_threads();
|
||||
const size_t threadnum = omp_get_thread_num();
|
||||
const size_t chunksize = n/nthreads;
|
||||
|
||||
|
||||
size_t start = threadnum*chunksize;
|
||||
size_t end = start + chunksize;
|
||||
if (threadnum == nthreads - 1)
|
||||
end = n;
|
||||
|
||||
|
||||
size_t local_min = -1;
|
||||
size_t local_max = 0;
|
||||
for (size_t i = start; i < end ; ++i)
|
||||
{
|
||||
if (numbers[i] < local_min)
|
||||
local_min = numbers[i];
|
||||
|
||||
if (numbers[i] > local_max)
|
||||
local_max = numbers[i];
|
||||
}
|
||||
|
||||
#pragma omp critical
|
||||
{
|
||||
if (local_min < global_min)
|
||||
global_min = local_min;
|
||||
|
||||
if (local_max > global_max)
|
||||
global_max = local_max;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void means_cpp(const std::vector<size_t> numbers, double &am, double &gm, double &hm)
|
||||
{
|
||||
size_t const n = numbers.size();
|
||||
|
||||
am = reduce(std::execution::par, numbers.begin(), numbers.end());
|
||||
gm = transform_reduce(std::execution::par, numbers.begin(), numbers.end(), 0.0, plus{}, [] (size_t x) -> double { return log(x); } );
|
||||
hm = transform_reduce(std::execution::par, numbers.begin(), numbers.end(), 0.0, plus{}, [] (size_t x) -> double { return 1.0/x; });
|
||||
|
||||
am /= n;
|
||||
gm = exp(gm/n);
|
||||
hm = n/hm;
|
||||
}
|
||||
|
||||
|
||||
void minmax_cpp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max)
|
||||
{
|
||||
auto min_it = min_element(std::execution::par, numbers.begin(), numbers.end());
|
||||
auto max_it = max_element(std::execution::par, numbers.begin(), numbers.end());
|
||||
|
||||
global_min = *min_it;
|
||||
global_max = *max_it;
|
||||
}
|
||||
42
ex5/ex5_2/mylib.h
Normal file
42
ex5/ex5_2/mylib.h
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
#include <vector>
|
||||
|
||||
/**
|
||||
This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector.
|
||||
Uses openMP parallelization.
|
||||
@param[in] numbers vector containing integers
|
||||
@param[out] am arithmetic mean
|
||||
@param[out] gm geometric mean
|
||||
@param[out] hm harmonic mean
|
||||
*/
|
||||
void means_omp(const std::vector<size_t> numbers, double &am, double &gm, double &hm);
|
||||
|
||||
|
||||
/**
|
||||
This function calculates the minimum and maximum of a vector.
|
||||
Uses openMP parallelization.
|
||||
@param[in] numbers vector containing integers
|
||||
@param[out] global_min minimum
|
||||
@param[out] global_max maximum
|
||||
*/
|
||||
void minmax_omp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max);
|
||||
|
||||
|
||||
/**
|
||||
This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector.
|
||||
Uses C++ parallelization.
|
||||
@param[in] numbers vector containing integers
|
||||
@param[out] am arithmetic mean
|
||||
@param[out] gm geometric mean
|
||||
@param[out] hm harmonic mean
|
||||
*/
|
||||
void means_cpp(const std::vector<size_t> numbers, double &am, double &gm, double &hm);
|
||||
|
||||
|
||||
/**
|
||||
This function calculates the minimum and maximum of a vector.
|
||||
Uses C++ parallelization.
|
||||
@param[in] numbers vector containing integers
|
||||
@param[out] global_min minimum
|
||||
@param[out] global_max maximum
|
||||
*/
|
||||
void minmax_cpp(const std::vector<size_t> numbers, size_t &global_min, size_t &global_max);
|
||||
30
ex5/ex5_3/Makefile
Normal file
30
ex5/ex5_3/Makefile
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
#
|
||||
# use GNU-Compiler tools
|
||||
COMPILER=GCC_
|
||||
# alternatively from the shell
|
||||
# export COMPILER=GCC_
|
||||
# or, alternatively from the shell
|
||||
# make COMPILER=GCC_
|
||||
|
||||
# use Intel compilers
|
||||
#COMPILER=ICC_
|
||||
|
||||
# use PGI compilers
|
||||
# COMPILER=PGI_
|
||||
|
||||
|
||||
SOURCES = main.cpp goldbach.cpp
|
||||
OBJECTS = $(SOURCES:.cpp=.o)
|
||||
|
||||
PROGRAM = main.${COMPILER}
|
||||
|
||||
# uncomment the next to lines for debugging and detailed performance analysis
|
||||
CXXFLAGS += -g
|
||||
LINKFLAGS += -g
|
||||
# do not use -pg with PGI compilers
|
||||
|
||||
ifndef COMPILER
|
||||
COMPILER=GCC_
|
||||
endif
|
||||
|
||||
include ../${COMPILER}default.mk
|
||||
46
ex5/ex5_3/goldbach.cpp
Normal file
46
ex5/ex5_3/goldbach.cpp
Normal file
|
|
@ -0,0 +1,46 @@
|
|||
#include "goldbach.h"
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <omp.h>
|
||||
|
||||
size_t single_goldbach(size_t k)
|
||||
{
|
||||
const std::vector<size_t> relevant_primes = get_primes(k);
|
||||
size_t m = relevant_primes.size();
|
||||
|
||||
size_t counter = 0;
|
||||
|
||||
#pragma omp parallel for shared(relevant_primes, m, k) reduction(+:counter)
|
||||
for(size_t i = 0; i < m; ++i)
|
||||
{
|
||||
for(size_t j = i; j < m; ++j)
|
||||
{
|
||||
if(relevant_primes[i] + relevant_primes[j] == k)
|
||||
++counter;
|
||||
}
|
||||
}
|
||||
|
||||
return counter;
|
||||
}
|
||||
|
||||
|
||||
std::vector<size_t> count_goldbach(size_t n)
|
||||
{
|
||||
const std::vector<size_t> relevant_primes = get_primes(n);
|
||||
size_t m = relevant_primes.size();
|
||||
|
||||
std::vector<size_t> counter_vector(n + 1, 0);
|
||||
|
||||
#pragma omp parallel for shared(relevant_primes, m, n) reduction(VecAdd:counter_vector)
|
||||
for(size_t i = 0; i < m; ++i)
|
||||
{
|
||||
for(size_t j = i; j < m; ++j)
|
||||
{
|
||||
size_t sum = relevant_primes[i] + relevant_primes[j];
|
||||
if(sum <= n)
|
||||
++counter_vector[relevant_primes[i] + relevant_primes[j]];
|
||||
}
|
||||
}
|
||||
|
||||
return counter_vector;
|
||||
}
|
||||
45
ex5/ex5_3/goldbach.h
Normal file
45
ex5/ex5_3/goldbach.h
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
#pragma once
|
||||
#include "mayer_primes.h"
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
|
||||
|
||||
/**
|
||||
This function returns the number of possible decompositions of an integer into a sum of two prime numbers.
|
||||
@param[in] k first integer
|
||||
@param[out] count number of decompositions
|
||||
*/
|
||||
size_t single_goldbach(size_t k);
|
||||
|
||||
|
||||
/**
|
||||
This function returns the number of possible decompositions into a sum of two prime numbers of all even integers in the interval [4,n].
|
||||
@param[in] n upper integer bound
|
||||
@param[out] count_vector vector containing the number of decompositions for a natural number the corresponding index
|
||||
*/
|
||||
std::vector<size_t> count_goldbach(size_t n);
|
||||
|
||||
|
||||
/** Vector @p b adds its elements to vector @p a .
|
||||
@param[in] a vector
|
||||
@param[in] b vector
|
||||
@return a+=b componentwise
|
||||
*/
|
||||
template<class T>
|
||||
std::vector<T> &operator+=(std::vector<T> &a, std::vector<T> const &b)
|
||||
{
|
||||
assert(a.size()==b.size());
|
||||
for (size_t k = 0; k < a.size(); ++k) {
|
||||
a[k] += b[k];
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
// Declare the reduction operation in OpenMP for an STL-vector
|
||||
// omp_out += omp_in requires operator+=(vector<int> &, vector<int> const &) from above
|
||||
// ------------------------------------------------------------
|
||||
// https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm
|
||||
// https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c
|
||||
// https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size
|
||||
// see also p.74ff in https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf
|
||||
#pragma omp declare reduction(VecAdd : std::vector<size_t> : omp_out += omp_in) initializer (omp_priv=omp_orig)
|
||||
45
ex5/ex5_3/main.cpp
Normal file
45
ex5/ex5_3/main.cpp
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
#include "goldbach.h"
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <omp.h>
|
||||
using namespace std;
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
cout << "Check: 694 has "<< single_goldbach(694) << " decompositions." << endl << "----------------------------------------" << endl;
|
||||
|
||||
for(size_t n : {10000, 100000, 400000, 1000000, 2000000})
|
||||
{
|
||||
double t_start = omp_get_wtime();
|
||||
|
||||
auto goldbach_vector = count_goldbach(n);
|
||||
|
||||
auto max_it = max_element(goldbach_vector.begin(), goldbach_vector.end());
|
||||
size_t max_number = distance(goldbach_vector.begin(), max_it);
|
||||
|
||||
double t_end = omp_get_wtime() - t_start;
|
||||
|
||||
cout << "The number " << max_number << " has " << *max_it << " decompositions. Duration: " << t_end << endl;
|
||||
}
|
||||
|
||||
/*
|
||||
###### WITHOUT PARALLELIZATION ######
|
||||
The number 9240 has 329 decompositions. Duration: 0.00307696
|
||||
The number 99330 has 2168 decompositions. Duration: 0.189839
|
||||
The number 390390 has 7094 decompositions. Duration: 1.3042
|
||||
The number 990990 has 15594 decompositions. Duration: 5.45034
|
||||
The number 1981980 has 27988 decompositions. Duration: 47.1807
|
||||
|
||||
|
||||
###### WITH PARALLELIZATION ######
|
||||
The number 9240 has 329 decompositions. Duration: 0.000734854
|
||||
The number 99330 has 2168 decompositions. Duration: 0.0251322
|
||||
The number 390390 has 7094 decompositions. Duration: 0.487375
|
||||
The number 990990 has 15594 decompositions. Duration: 6.16972
|
||||
The number 1981980 has 27988 decompositions. Duration: 31.5699
|
||||
*/
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
73
ex5/ex5_3/mayer_primes.h
Normal file
73
ex5/ex5_3/mayer_primes.h
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstring> //memset
|
||||
#include <vector>
|
||||
//using namespace std;
|
||||
|
||||
/** \brief Determines all prime numbers in interval [2, @p max].
|
||||
*
|
||||
* The sieve of Eratosthenes is used.
|
||||
*
|
||||
* The implementation originates from <a href="http://code.activestate.com/recipes/576559-fast-prime-generator/">Florian Mayer</a>.
|
||||
*
|
||||
* \param[in] max end of interval for the prime number search.
|
||||
* \return vector of prime numbers @f$2,3,5, ..., p<=max @f$.
|
||||
*
|
||||
* \copyright
|
||||
* Copyright (c) 2008 Florian Mayer (adapted by Gundolf Haase 2018)
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
|
||||
* INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
template <class T>
|
||||
std::vector<T> get_primes(T max)
|
||||
{
|
||||
std::vector<T> primes;
|
||||
char *sieve;
|
||||
sieve = new char[max / 8 + 1];
|
||||
// Fill sieve with 1
|
||||
memset(sieve, 0xFF, (max / 8 + 1) * sizeof(char));
|
||||
for (T x = 2; x <= max; x++)
|
||||
{
|
||||
if (sieve[x / 8] & (0x01 << (x % 8))) {
|
||||
primes.push_back(x);
|
||||
// Is prime. Mark multiplicates.
|
||||
for (T j = 2 * x; j <= max; j += x)
|
||||
{
|
||||
sieve[j / 8] &= ~(0x01 << (j % 8));
|
||||
}
|
||||
}
|
||||
}
|
||||
delete[] sieve;
|
||||
return primes;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------
|
||||
//int main() // by Florian Mayer
|
||||
//{g++ -O3 -std=c++14 -fopenmp main.cpp && ./a.out
|
||||
// vector<unsigned long> primes;
|
||||
// primes = get_primes(10000000);
|
||||
// // return 0;
|
||||
// // Print out result.
|
||||
// vector<unsigned long>::iterator it;
|
||||
// for(it=primes.begin(); it < primes.end(); it++)
|
||||
// cout << *it << " ";
|
||||
//
|
||||
// cout << endl;
|
||||
// return 0;
|
||||
//}
|
||||
30
ex5/ex5_4/Makefile
Normal file
30
ex5/ex5_4/Makefile
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
#
|
||||
# use GNU-Compiler tools
|
||||
COMPILER=GCC_
|
||||
# alternatively from the shell
|
||||
# export COMPILER=GCC_
|
||||
# or, alternatively from the shell
|
||||
# make COMPILER=GCC_
|
||||
|
||||
# use Intel compilers
|
||||
#COMPILER=ICC_
|
||||
|
||||
# use PGI compilers
|
||||
# COMPILER=PGI_
|
||||
|
||||
|
||||
SOURCES = main.cpp benchmarks.cpp benchmark_tests.cpp
|
||||
OBJECTS = $(SOURCES:.cpp=.o)
|
||||
|
||||
PROGRAM = main.${COMPILER}
|
||||
|
||||
# uncomment the next to lines for debugging and detailed performance analysis
|
||||
CXXFLAGS += -g
|
||||
LINKFLAGS += -g
|
||||
# do not use -pg with PGI compilers
|
||||
|
||||
ifndef COMPILER
|
||||
COMPILER=GCC_
|
||||
endif
|
||||
|
||||
include ../${COMPILER}default.mk
|
||||
375
ex5/ex5_4/benchmark_tests.cpp
Normal file
375
ex5/ex5_4/benchmark_tests.cpp
Normal file
|
|
@ -0,0 +1,375 @@
|
|||
#include "benchmark_tests.h"
|
||||
#include "benchmarks.h"
|
||||
#include <chrono>
|
||||
#include <iostream>
|
||||
#include <math.h>
|
||||
using namespace std::chrono;
|
||||
|
||||
vector<double> test_A(const size_t &NLOOPS, const size_t &N)
|
||||
{
|
||||
cout << "#################### (A) ####################" << endl;
|
||||
cout << "\nLOOPS = " << NLOOPS << endl;
|
||||
cout << "\nN = " << N << endl;
|
||||
|
||||
|
||||
// Memory allocation
|
||||
cout << "Memory allocation\n";
|
||||
|
||||
vector<double> x(N), y(N);
|
||||
|
||||
cout.precision(2);
|
||||
cout << 2.0*N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
|
||||
cout.precision(6);
|
||||
|
||||
|
||||
// Data initialization
|
||||
// Special: x_i = i+1; y_i = 1/x_i ==> <x,y> == N
|
||||
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
x[i] = i % 219 + 1;
|
||||
y[i] = 1.0/x[i];
|
||||
}
|
||||
|
||||
|
||||
cout << "\nStart Benchmarking scalar\n";
|
||||
|
||||
auto t1 = system_clock::now(); // start timer
|
||||
// Do calculation
|
||||
double check(0.0),ss(0.0);
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
check = scalar_parallel(x, y);
|
||||
ss += check; // prevents the optimizer from removing unused calculation results.
|
||||
}
|
||||
|
||||
auto t2 = system_clock::now(); // stop timer
|
||||
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
t_diff = t_diff/NLOOPS; // duration per loop seconds
|
||||
|
||||
|
||||
|
||||
// Check the correct result
|
||||
cout << "\n <x,y> = " << check << endl;
|
||||
if (static_cast<unsigned int>(check) != N)
|
||||
cout << " !! W R O N G result !!\n";
|
||||
cout << endl;
|
||||
|
||||
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
|
||||
|
||||
double Gflops = 2.0*N / t_diff / 1024 / 1024 / 1024;
|
||||
double MemBandwidth = 2.0*N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
|
||||
|
||||
cout << "Total duration : " << t_diff*NLOOPS << endl;
|
||||
cout << "Timing in sec. : " << t_diff << endl;
|
||||
cout << "GFLOPS : " << Gflops << endl;
|
||||
cout << "GiByte/s : " << MemBandwidth << endl;
|
||||
|
||||
|
||||
return vector<double>{t_diff, Gflops, MemBandwidth};
|
||||
}
|
||||
|
||||
vector<double> test_A_sum(const size_t &NLOOPS, const size_t &N)
|
||||
{
|
||||
cout << "#################### (A) sum ####################" << endl;
|
||||
cout << "\nLOOPS = " << NLOOPS << endl;
|
||||
cout << "\nN = " << N << endl;
|
||||
|
||||
|
||||
// Memory allocation
|
||||
cout << "Memory allocation\n";
|
||||
|
||||
vector<double> x(N);
|
||||
|
||||
cout.precision(2);
|
||||
cout << 1.0*N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
|
||||
cout.precision(6);
|
||||
|
||||
|
||||
// Data initialization
|
||||
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
x[i] = 1;
|
||||
}
|
||||
|
||||
|
||||
cout << "\nStart Benchmarking sum\n";
|
||||
|
||||
auto t1 = system_clock::now(); // start timer
|
||||
// Do calculation
|
||||
double check(0.0),ss(0.0);
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
check = sum(x);
|
||||
ss += check; // prevents the optimizer from removing unused calculation results.
|
||||
}
|
||||
|
||||
auto t2 = system_clock::now(); // stop timer
|
||||
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
t_diff = t_diff/NLOOPS; // duration per loop seconds
|
||||
|
||||
|
||||
|
||||
// Check the correct result
|
||||
cout << "\n <x,y> = " << check << endl;
|
||||
if (static_cast<unsigned int>(check) != N)
|
||||
cout << " !! W R O N G result !!\n";
|
||||
cout << endl;
|
||||
|
||||
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
|
||||
|
||||
double Gflops = 1.0*N / t_diff / 1024 / 1024 / 1024;
|
||||
double MemBandwidth = 1.0*N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
|
||||
|
||||
cout << "Total duration : " << t_diff*NLOOPS << endl;
|
||||
cout << "Timing in sec. : " << t_diff << endl;
|
||||
cout << "GFLOPS : " << Gflops << endl;
|
||||
cout << "GiByte/s : " << MemBandwidth << endl;
|
||||
|
||||
|
||||
return vector<double>{t_diff, Gflops, MemBandwidth};
|
||||
}
|
||||
|
||||
|
||||
vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M)
|
||||
{
|
||||
cout << "#################### (B) ####################" << endl;
|
||||
|
||||
cout << "\nLOOPS = " << NLOOPS << endl;
|
||||
cout << "\nN = " << N << endl;
|
||||
cout << "\nM = " << M << endl;
|
||||
|
||||
// Memory allocation
|
||||
cout << "Memory allocation\n";
|
||||
|
||||
vector<double> A(M*N);
|
||||
vector<double> x(N);
|
||||
|
||||
cout.precision(2);
|
||||
cout << (1.0*M*N + N) * sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
|
||||
cout.precision(6);
|
||||
|
||||
// Data initialization
|
||||
|
||||
for (size_t i = 0; i < M; ++i)
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
A[N*i + j] = (i + j) % 219 + 1;
|
||||
|
||||
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
{
|
||||
x[j] = 1.0/A[N*17 + j];
|
||||
}
|
||||
|
||||
cout << "\nStart Benchmarking MatVec\n";
|
||||
|
||||
auto t1 = system_clock::now(); // start timer
|
||||
// Do calculation
|
||||
vector<double> b(M);
|
||||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
b = MatVec_parallel(A, x);
|
||||
}
|
||||
|
||||
auto t2 = system_clock::now(); // stop timer
|
||||
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
t_diff = t_diff/NLOOPS; // duration per loop seconds
|
||||
|
||||
|
||||
// Check the correct result
|
||||
cout << "\n <A[17,*],x> = " << b[17] << endl;
|
||||
if (static_cast<size_t>(b[17]) != N)
|
||||
{
|
||||
cout << " !! W R O N G result !!\n";
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
|
||||
double Gflops = (2.0*N*M) / t_diff / 1024 / 1024 / 1024;
|
||||
double MemBandwidth = (2.0*N*M + M)/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
|
||||
|
||||
cout << "Total duration : " << t_diff*NLOOPS << endl;
|
||||
cout << "Timing in sec. : " << t_diff << endl;
|
||||
cout << "GFLOPS : " << Gflops << endl;
|
||||
cout << "GiByte/s : " << MemBandwidth << endl;
|
||||
|
||||
|
||||
|
||||
return vector<double>{t_diff, Gflops, MemBandwidth};
|
||||
}
|
||||
|
||||
|
||||
vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N)
|
||||
{
|
||||
cout << "#################### (C) ####################" << endl;
|
||||
cout << "\nLOOPS = " << NLOOPS << endl;
|
||||
cout << "\nL = " << L << endl;
|
||||
cout << "\nM = " << M << endl;
|
||||
cout << "\nN = " << N << endl;
|
||||
|
||||
|
||||
// Memory allocation
|
||||
cout << "Memory allocation\n";
|
||||
|
||||
vector<double> A(M*L);
|
||||
vector<double> B(L*N);
|
||||
|
||||
cout.precision(2);
|
||||
cout << (1.0*M*L + L*N) *sizeof(A[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
|
||||
cout.precision(6);
|
||||
|
||||
|
||||
// Data initialization
|
||||
|
||||
for (size_t i = 0; i < M; ++i)
|
||||
for (size_t k = 0; k < L; ++k)
|
||||
A[L*i + k] = (i + k) % 219 + 1;
|
||||
|
||||
for (size_t k = 0; k < L; ++k)
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
B[N*k + j] = 1.0/A[L*17 + k];
|
||||
|
||||
|
||||
cout << "\nStart Benchmarking MatMat\n";
|
||||
|
||||
auto t1 = system_clock::now(); // start timer
|
||||
// Do calculation
|
||||
vector<double> C(M*N);
|
||||
double check;
|
||||
double check_sum = 0;
|
||||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
C = MatMat_parallel(A, B, L);
|
||||
|
||||
check = C[N*17];
|
||||
check_sum += check; // prevents the optimizer from removing unused calculation results.
|
||||
}
|
||||
cout << check_sum;
|
||||
|
||||
auto t2 = system_clock::now(); // stop timer
|
||||
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
t_diff = t_diff/NLOOPS; // duration per loop seconds
|
||||
|
||||
|
||||
// Check the correct result
|
||||
cout << "\n C[17,0] = " << check << endl;
|
||||
if (static_cast<unsigned int>(check) != L)
|
||||
{
|
||||
cout << " !! W R O N G result !!, should be " << L <<"\n";
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
|
||||
|
||||
double Gflops = (2.0*L*N*M) / t_diff / 1024 / 1024 / 1024;
|
||||
double MemBandwidth = (2.0*L*N*M + M*N)/ t_diff / 1024 / 1024 / 1024 * sizeof(A[0]);
|
||||
|
||||
cout << "Total duration : " << t_diff*NLOOPS << endl;
|
||||
cout << "Timing in sec. : " << t_diff << endl;
|
||||
cout << "GFLOPS : " << Gflops << endl;
|
||||
cout << "GiByte/s : " << MemBandwidth << endl;
|
||||
|
||||
|
||||
|
||||
return vector<double>{t_diff, Gflops, MemBandwidth};
|
||||
}
|
||||
|
||||
|
||||
vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p)
|
||||
{
|
||||
cout << "#################### (D) ####################" << endl;
|
||||
cout << "\nLOOPS = " << NLOOPS << endl;
|
||||
cout << "\nN = " << N << endl;
|
||||
cout << "\np = " << p << endl;
|
||||
|
||||
// Memory allocation
|
||||
cout << "Memory allocation\n";
|
||||
|
||||
vector<double> a(p + 1, 0);
|
||||
vector<double> x(N);
|
||||
|
||||
cout.precision(2);
|
||||
cout << (1.0*(p + 1) + N) *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
|
||||
cout.precision(6);
|
||||
|
||||
// Data initialization
|
||||
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
x[j] = 1.0*j;
|
||||
|
||||
for (size_t k = 0; k < p + 1; ++k)
|
||||
a[k] = pow(-1.0, k); // poly(x) = 1 - x + x^2 - x^3 + x^4 - ...
|
||||
|
||||
|
||||
|
||||
cout << "\nStart Benchmarking poly\n";
|
||||
|
||||
auto t1 = system_clock::now(); // start timer
|
||||
// Do calculation
|
||||
vector<double> y(N);
|
||||
double check;
|
||||
double check_sum;
|
||||
|
||||
for (size_t i = 0; i < NLOOPS; ++i)
|
||||
{
|
||||
y = poly_parallel(a, x);
|
||||
check = y[0];
|
||||
|
||||
check_sum += check; // prevents the optimizer from removing unused calculation results.
|
||||
}
|
||||
|
||||
auto t2 = system_clock::now(); // stop timer
|
||||
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
|
||||
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
|
||||
t_diff = t_diff/NLOOPS; // duration per loop seconds
|
||||
|
||||
|
||||
|
||||
// Check the correct result
|
||||
cout << "\n poly(" << x[0] << ") = " << check << endl;
|
||||
if (abs(check - 1.0) > 1.0/1e6)
|
||||
{
|
||||
cout << " !! W R O N G result !!\n";
|
||||
}
|
||||
cout << endl;
|
||||
|
||||
|
||||
// Timings and Performance
|
||||
cout << endl;
|
||||
cout.precision(2);
|
||||
|
||||
|
||||
double Gflops = (N*(p + 1)*3.0) / t_diff / 1024 / 1024 / 1024;
|
||||
double MemBandwidth = (N*(2.0 + 3.0*(p + 1)))/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
|
||||
|
||||
cout << "Total duration : " << t_diff*NLOOPS << endl;
|
||||
cout << "Timing in sec. : " << t_diff << endl;
|
||||
cout << "GFLOPS : " << Gflops << endl;
|
||||
cout << "GiByte/s : " << MemBandwidth << endl;
|
||||
|
||||
|
||||
|
||||
return vector<double>{t_diff, Gflops, MemBandwidth};
|
||||
}
|
||||
13
ex5/ex5_4/benchmark_tests.h
Normal file
13
ex5/ex5_4/benchmark_tests.h
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
#pragma once
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
vector<double> test_A(const size_t &NLOOPS, const size_t &N);
|
||||
|
||||
vector<double> test_A_sum(const size_t &NLOOPS, const size_t &N);
|
||||
|
||||
vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M);
|
||||
|
||||
vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N);
|
||||
|
||||
vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p);
|
||||
141
ex5/ex5_4/benchmarks.cpp
Normal file
141
ex5/ex5_4/benchmarks.cpp
Normal file
|
|
@ -0,0 +1,141 @@
|
|||
#include "benchmarks.h"
|
||||
#include <cassert> // assert()
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <omp.h>
|
||||
|
||||
// (A) Inner product of two vectors (from skalar_stl)
|
||||
double scalar_parallel(vector<double> const &x, vector<double> const &y)
|
||||
{
|
||||
assert(x.size() == y.size());
|
||||
size_t const N = x.size();
|
||||
double sum = 0.0;
|
||||
//#pragma omp parallel for default(none) shared(x, y, N) reduction(+:sum) schedule(runtime)
|
||||
#pragma omp parallel for shared(x, y, N) reduction(+:sum)
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
sum += x[i] * y[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
// (A) Vector entry sum
|
||||
double sum(vector<double> const &x)
|
||||
{
|
||||
double sum = 0.0;
|
||||
#pragma omp parallel for shared(x) reduction(+:sum)
|
||||
for (size_t i = 0; i < x.size(); ++i)
|
||||
{
|
||||
sum += x[i];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
// (B) Matrix-vector product (from intro_vector_densematrix)
|
||||
vector<double> MatVec_parallel(vector<double> const &A, vector<double> const &x)
|
||||
{
|
||||
size_t const nelem = A.size();
|
||||
size_t const N = x.size();
|
||||
assert(nelem % N == 0); // make sure multiplication is possible
|
||||
size_t const M = nelem/N;
|
||||
|
||||
vector<double> b(M);
|
||||
|
||||
#pragma omp parallel for shared(A, x, N, M, b)
|
||||
for (size_t i = 0; i < M; ++i)
|
||||
{
|
||||
double tmp = 0.0;
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
tmp += A[N*i + j] * x[j];
|
||||
b[i] = tmp;
|
||||
}
|
||||
|
||||
return b;
|
||||
}
|
||||
|
||||
|
||||
// (C) Matrix-matrix product
|
||||
vector<double> MatMat_parallel(vector<double> const &A, vector<double> const &B, size_t const &L)
|
||||
{
|
||||
size_t const nelem_A = A.size();
|
||||
size_t const nelem_B = B.size();
|
||||
|
||||
assert(nelem_A % L == 0 && nelem_B % L == 0);
|
||||
|
||||
size_t const M = nelem_A/L;
|
||||
size_t const N = nelem_B/L;
|
||||
|
||||
|
||||
vector<double> C(M*N);
|
||||
|
||||
|
||||
#pragma omp parallel for shared(A, B, M, N, L, C)
|
||||
for (size_t i = 0; i < M; ++i)
|
||||
{
|
||||
for (size_t k = 0; k < L; ++k)
|
||||
{
|
||||
for (size_t j = 0; j < N; ++j)
|
||||
{
|
||||
C[N*i + j] += A[L*i + k]*B[N*k + j];
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return C;
|
||||
}
|
||||
|
||||
|
||||
// (D) Evaluation of a polynomial function
|
||||
vector<double> poly_parallel(vector<double> const &a, vector<double> const &x)
|
||||
{
|
||||
size_t const N = x.size();
|
||||
size_t const p = a.size() - 1;
|
||||
vector<double> y(N, 0);
|
||||
|
||||
#pragma omp parallel for shared(a, x, N, p, y)
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
{
|
||||
double x_temp = x[i];
|
||||
double y_temp = 0;
|
||||
for (size_t k = 0; k < p + 1; ++k)
|
||||
{
|
||||
y_temp += x_temp*y_temp + a[p - k];
|
||||
}
|
||||
y[i] = y_temp;
|
||||
}
|
||||
|
||||
return y;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
55
ex5/ex5_4/benchmarks.h
Normal file
55
ex5/ex5_4/benchmarks.h
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
#pragma once
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
/** (A) Inner product of two vectors (from skalar_stl)
|
||||
@param[in] x vector
|
||||
@param[in] y vector
|
||||
@return resulting Euclidian inner product <x,y>
|
||||
*/
|
||||
double scalar_parallel(vector<double> const &x, vector<double> const &y);
|
||||
|
||||
|
||||
/** (A) Sum entries of vector
|
||||
@param[in] x vector
|
||||
@return sum
|
||||
*/
|
||||
double sum(vector<double> const &x);
|
||||
|
||||
|
||||
/** (B) Matrix-vector product (from intro_vector_densematrix)
|
||||
* @param[in] A dense matrix (1D access)
|
||||
* @param[in] u vector
|
||||
*
|
||||
* @return resulting vector
|
||||
*/
|
||||
vector<double> MatVec_parallel(vector<double> const &A, vector<double> const &x);
|
||||
|
||||
|
||||
/** (C) Matrix-matrix product
|
||||
* @param[in] A MxL dense matrix (1D access)
|
||||
* @param[in] B LxN dense matrix (1D access)
|
||||
* @param[in] shared_dim shared dimension L
|
||||
*
|
||||
* @return resulting MxN matrix
|
||||
*/
|
||||
vector<double> MatMat_parallel(vector<double> const &A, vector<double> const &B, size_t const &shared_dim);
|
||||
|
||||
|
||||
/** (D) Evaluation of a polynomial function using Horner's scheme
|
||||
* @param[in] a coefficient vector
|
||||
* @param[in] x vector with input values
|
||||
*
|
||||
* @return vector with output values
|
||||
*/
|
||||
vector<double> poly_parallel(vector<double> const &a, vector<double> const &x);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue