Pushing everything again, accidentally deleted my remote repository

jakob.schratter 2025-12-09 22:06:13 +01:00
commit 1bee3e8e5b
101 changed files with 9428 additions and 0 deletions

ex5/CLANG_default.mk Normal file

@@ -0,0 +1,131 @@
# Basic definitions for using the Clang/LLVM compiler suite sequentially
# requires setting of COMPILER=CLANG_
# https://llvm.org/docs/CompileCudaWithLLVM.html
# https://llvm.org/docs/NVPTXUsage.html
#CLANGPATH=/usr/lib/llvm-10/bin/
CC = ${CLANGPATH}clang
CXX = ${CLANGPATH}clang++
#CXX = ${CLANGPATH}clang++ -lomptarget -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=/opt/pgi/linux86-64/2017/cuda/8.0
#F77 = gfortran
LINKER = ${CXX}
#http://clang.llvm.org/docs/UsersManual.html#options-to-control-error-and-warning-messages
WARNINGS += -pedantic -Weverything -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion
WARNINGS += -Wno-c++98-compat -Wno-sign-conversion -Wno-date-time -Wno-shorten-64-to-32 -Wno-padded -ferror-limit=1
WARNINGS += -Wno-unsafe-buffer-usage
#-fsyntax-only -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic
CXXFLAGS += -O3 -std=c++17 -ferror-limit=1 ${WARNINGS}
# don't use -Ofast
# -ftrapv
LINKFLAGS += -O3
# different libraries on Ubuntu or Manjaro
ifndef UBUNTU
UBUNTU=1
endif
# BLAS, LAPACK
LINKFLAGS += -llapack -lblas
# -lopenblas
ifeq ($(UBUNTU),1)
# ubuntu
else
# on archlinux
LINKFLAGS += -lcblas
endif
# interprocedural optimization
CXXFLAGS += -flto
LINKFLAGS += -flto
#sudo apt install libomp-dev
# OpenMP
CXXFLAGS += -fopenmp
LINKFLAGS += -fopenmp
# very good check
# http://clang.llvm.org/extra/clang-tidy/
# good check, see: http://llvm.org/docs/CodingStandards.html#include-style
SWITCH_OFF=,-readability-magic-numbers,-readability-redundant-control-flow,-readability-redundant-member-init
SWITCH_OFF+=,-readability-isolate-declaration
#READABILITY=,readability*${SWITCH_OFF}
#TIDYFLAGS = -checks=llvm-*,-llvm-header-guard -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
TIDYFLAGS = -checks=llvm-*,-llvm-header-guard${READABILITY} -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
#TIDYFLAGS += -checks='modernize*
# ???
#TIDYFLAGS = -checks='cert*' -header-filter=.*
# MPI checks ??
#TIDYFLAGS = -checks='mpi*'
# ??
#TIDYFLAGS = -checks='performance*' -header-filter=.*
#TIDYFLAGS = -checks='portability-*' -header-filter=.*
#TIDYFLAGS = -checks='readability-*' -header-filter=.*
default: ${PROGRAM}
${PROGRAM}: ${OBJECTS}
$(LINKER) $^ ${LINKFLAGS} -o $@
clean:
@rm -f ${PROGRAM} ${OBJECTS}
clean_all:: clean
@rm -f *_ *~ *.bak *.log *.out *.tar
codecheck: tidy_check
tidy_check:
clang-tidy ${SOURCES} ${TIDYFLAGS} -- ${SOURCES}
# see also http://clang-developers.42468.n3.nabble.com/Error-while-trying-to-load-a-compilation-database-td4049722.html
run: clean ${PROGRAM}
# time ./${PROGRAM} ${PARAMS}
./${PROGRAM} ${PARAMS}
# tar the current directory
MY_DIR = `basename ${PWD}`
tar: clean_all
@echo "Tar the directory: " ${MY_DIR}
@cd .. ;\
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
# tar cf `basename ${PWD}`.tar *
doc:
doxygen Doxyfile
#########################################################################
.cpp.o:
$(CXX) -c $(CXXFLAGS) -o $@ $<
.c.o:
$(CC) -c $(CFLAGS) -o $@ $<
.f.o:
$(F77) -c $(FFLAGS) -o $@ $<
##################################################################################################
# some tools
# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags)
cache: ${PROGRAM}
valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS}
# kcachegrind callgrind.out.<pid> &
kcachegrind `ls -1tr callgrind.out.* |tail -1`
# Check for wrong memory accesses, memory leaks, ...
# use smaller data sets
mem: ${PROGRAM}
valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS}
# Simple run time profiling of your code
# CXXFLAGS += -g -pg
# LINKFLAGS += -pg
prof: ${PROGRAM}
perf record ./$^ ${PARAMS}
perf report
# gprof -b ./$^ > gp.out
# kprof -f gp.out -p gprof &

ex5/GCCMKL_default.mk Normal file

@@ -0,0 +1,212 @@
# Basic definitions for using the GNU compiler suite with OpenMP and MKL
# requires setting of COMPILER=GCCMKL_
# install MKL on Manjaro
# https://linux-packages.com/manjaro-linux/package/intel-mkl
# > sudo pacman -Sy
# > sudo pacman -S intel-mkl
ifeq ($(ONEAPI),1)
MKL_INCLUDE=/opt/intel/oneapi/mkl/2024.0/include
MKL_LIB=/opt/intel/oneapi/2024.0/lib
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/intel/oneapi/2024.0/lib
else
MKL_INCLUDE=/usr/include/mkl
MKL_LIB=/usr/lib/x86_64-linux-gnu/mkl
endif
CC = gcc
CXX = g++
F77 = gfortran
LINKER = ${CXX}
WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
-Wredundant-decls -Winline -fmax-errors=1
# -Wunreachable-code
CXXFLAGS += -ffast-math -O3 -march=native -std=c++17 ${WARNINGS}
#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS}
#-msse3
# -ftree-vectorizer-verbose=2 -DNDEBUG
# -ftree-vectorizer-verbose=5
# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump -fdump-tree-pre=stderr
# CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details
# CFLAGS = -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2
# #CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
# FFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
# LFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
LINKFLAGS += -O3
#architecture
CPU = -march=znver2
#CPU = -march=core-avx2
CXXFLAGS += ${CPU}
LINKFLAGS += ${CPU}
# MKL
#CXXFLAGS += -I/usr/include/mkl -DUSE_MKL -Wno-redundant-decls
CXXFLAGS += -I${MKL_INCLUDE} -DUSE_MKL -Wno-redundant-decls
#LINKFLAGS += -lmkl_intel_lp64 -lmkl_tbb_thread -ltbb -lmkl_core
#LINKFLAGS += -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -L/usr/lib/x86_64-linux-gnu/mkl
LINKFLAGS += -L${MKL_LIB} -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core
#LINKFLAGS += -lmkl_intel_lp64 -lmkl_sequential -lmkl_core
# workaround for MKL slow down on AMD hardware
# https://danieldk.eu/Posts/2020-08-31-MKL-Zen.html
default: run
libfakeintel.so:
gcc -shared -fPIC -o libfakeintel.so fakeintel.c
echo "call: export LD_PRELOAD=./libfakeintel.so "
# different libraries on Ubuntu or Manjaro
#ifndef UBUNTU
#UBUNTU=1
#endif
## BLAS, LAPACK
#ifeq ($(UBUNTU),1)
#LINKFLAGS += -llapack -lblas
## -lopenblas
#else
## on archlinux
#LINKFLAGS += -llapack -lopenblas -lcblas
#endif
# interprocedural optimization
CXXFLAGS += -flto
LINKFLAGS += -flto
# for debugging purpose (save code)
# -fsanitize=leak # only one of the three can be used
# -fsanitize=address
# -fsanitize=thread
SANITARY = -fsanitize=address -fsanitize=undefined -fsanitize=null -fsanitize=return \
-fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \
-fsanitize=bool -fsanitize=enum -fsanitize=vptr
#CXXFLAGS += ${SANITARY}
#LINKFLAGS += ${SANITARY}
# OpenMP
CXXFLAGS += -fopenmp
LINKFLAGS += -fopenmp
default: ${PROGRAM}
${PROGRAM}: ${OBJECTS}
$(LINKER) $^ ${LINKFLAGS} -o $@
clean:
@rm -f ${PROGRAM} ${OBJECTS}
clean_all:: clean
-@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt
-@rm -rf html
run: clean ${PROGRAM} libfakeintel.so
#run: ${PROGRAM}
# time ./${PROGRAM} ${PARAMS}
./${PROGRAM} ${PARAMS}
# tar the current directory
MY_DIR = `basename ${PWD}`
tar: clean_all
@echo "Tar the directory: " ${MY_DIR}
@cd .. ;\
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
# tar cf `basename ${PWD}`.tar *
#find . -size +10M > large_files
#--exclude-from ${MY_DIR}/large_files
zip: clean
@echo "Zip the directory: " ${MY_DIR}
@cd .. ;\
zip -r ${MY_DIR}.zip ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
doc:
doxygen Doxyfile
#########################################################################
.SUFFIXES: .f90
.cpp.o:
$(CXX) -c $(CXXFLAGS) -o $@ $<
# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $<.log
# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $(<:.cpp=.log)
.c.o:
$(CC) -c $(CFLAGS) -o $@ $<
.f.o:
$(F77) -c $(FFLAGS) -o $@ $<
.f90.o:
$(F77) -c $(FFLAGS) -o $@ $<
##################################################################################################
# some tools
# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags)
cache: ${PROGRAM}
valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS}
# kcachegrind callgrind.out.<pid> &
kcachegrind `ls -1tr callgrind.out.* |tail -1`
# Check for wrong memory accesses, memory leaks, ...
# use smaller data sets
# no "-pg" in compile/link options
mem: ${PROGRAM}
valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS}
# Graphical interface
# valkyrie
# Simple run time profiling of your code
# CXXFLAGS += -g -pg
# LINKFLAGS += -pg
prof: ${PROGRAM}
perf record ./$^ ${PARAMS}
perf report
# gprof -b ./$^ > gp.out
# kprof -f gp.out -p gprof &
# perf in Ubuntu 20.04: https://www.howtoforge.com/how-to-install-perf-performance-analysis-tool-on-ubuntu-20-04/
# * install
# * sudo vi /etc/sysctl.conf
# add kernel.perf_event_paranoid = 0
#Trace your heap:
#> heaptrack ./main.GCC_
#> heaptrack_gui heaptrack.main.GCC_.<pid>.gz
heap: ${PROGRAM}
heaptrack ./$^ ${PARAMS} 11
heaptrack_gui `ls -1tr heaptrack.$^.* |tail -1` &
codecheck: $(SOURCES)
cppcheck --enable=all --inconclusive --std=c++17 --suppress=missingIncludeSystem $^
########################################################################
# get the detailed status of all optimization flags
info:
echo "detailed status of all optimization flags"
$(CXX) --version
$(CXX) -Q $(CXXFLAGS) --help=optimizers
lscpu
inxi -C
lstopo
# Excellent hardware info
# hardinfo
# Life monitoring of CPU frequency etc.
# sudo i7z
# Memory consumption
# vmstat -at -SM 3
# xfce4-taskmanager
# https://www.tecmint.com/check-linux-cpu-information/
#https://www.tecmint.com/monitor-cpu-and-gpu-temperature-in-ubuntu/
# Debugging:
# https://wiki.archlinux.org/index.php/Debugging

ex5/GCC_SINGLE_default.mk Normal file

@@ -0,0 +1,111 @@
# Basic definitions for using the GNU compiler suite sequentially
# requires setting of COMPILER=GCC_
CC = gcc
CXX = g++
F77 = gfortran
LINKER = ${CXX}
# on mephisto:
#CXXFLAGS += -I/share/apps/atlas/include
#LINKFLAGS += -L/share/apps/atlas/lib -L/usr/lib64/atlas
#LINKFLAGS += -latlas -lcblas
#LINKFLAGS += -lblas
# The <cblas.h> header must be wrapped in extern "C" so that g++ finds all symbols.
WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
-Wredundant-decls -Winline -fmax-errors=1
# -Wunreachable-code
#CXXFLAGS += -std=c++17 -ffast-math -O3 -march=native -DNDEBUG ${WARNINGS}
CXXFLAGS += -std=c++17 -ffast-math -O3 -march=native ${WARNINGS}
# info on vectorization
#VECTORIZE = -ftree-vectorize -fdump-tree-vect-blocks=foo.dump
#-fdump-tree-pre=stderr
VECTORIZE = -ftree-vectorize -fopt-info -ftree-vectorizer-verbose=5
#CXXFLAGS += ${VECTORIZE}
# -funroll-all-loops -msse3
#GCC -march=knl -march=broadwell -march=haswell
# interprocedural optimization
#CXXFLAGS += -flto
LINKFLAGS += -flto
# for debugging purpose (save code)
# -fsanitize=leak # only one of the three can be used
# -fsanitize=address
# -fsanitize=thread
SANITARY = -fsanitize=address -fsanitize=undefined -fsanitize=null -fsanitize=return \
-fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \
-fsanitize=bool -fsanitize=enum -fsanitize=vptr
#CXXFLAGS += ${SANITARY}
#LINKFLAGS +=${SANITARY}
# Single mode: no OpenMP at compile time; the OpenMP runtime is still linked (e.g., for omp_get_wtime())
#CXXFLAGS += -fopenmp
LINKFLAGS += -fopenmp
default: ${PROGRAM}
${PROGRAM}: ${OBJECTS}
$(LINKER) $^ ${LINKFLAGS} -o $@
clean:
@rm -f ${PROGRAM} ${OBJECTS}
clean_all:: clean
@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt
@rm -rf html
run: clean ${PROGRAM}
# time ./${PROGRAM}
./${PROGRAM}
# tar the current directory
MY_DIR = `basename ${PWD}`
tar:
@echo "Tar the directory: " ${MY_DIR}
@cd .. ;\
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
# tar cf `basename ${PWD}`.tar *
doc:
doxygen Doxyfile
#########################################################################
.cpp.o:
$(CXX) -c $(CXXFLAGS) -o $@ $<
.c.o:
$(CC) -c $(CFLAGS) -o $@ $<
.f.o:
$(F77) -c $(FFLAGS) -o $@ $<
##################################################################################################
# some tools
# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags)
cache: ${PROGRAM}
valgrind --tool=callgrind --simulate-cache=yes ./$^
# kcachegrind callgrind.out.<pid> &
kcachegrind `ls -1tr callgrind.out.* |tail -1`
# Check for wrong memory accesses, memory leaks, ...
# use smaller data sets
mem: ${PROGRAM}
valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^
thread:${PROGRAM}
valgrind -v --tool=helgrind --log-file=$^.thread.out ./$^
# Simple run time profiling of your code
# CXXFLAGS += -g -pg
# LINKFLAGS += -pg
prof: ${PROGRAM}
./$^
gprof -b ./$^ > gp.out
# kprof -f gp.out -p gprof &

ex5/GCC_default.mk Normal file

@@ -0,0 +1,182 @@
# Basic definitions for using the GNU compiler suite sequentially
# requires setting of COMPILER=GCC_
CC = gcc
CXX = g++
F77 = gfortran
LINKER = ${CXX}
WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
-Wredundant-decls -Winline -fmax-errors=1
# -Wunreachable-code
CXXFLAGS += -ffast-math -O3 -march=native -std=c++20 ${WARNINGS}
#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS}
#-msse3
# -ftree-vectorizer-verbose=2 -DNDEBUG
# -ftree-vectorizer-verbose=5
# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump -fdump-tree-pre=stderr
# CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details
# CFLAGS = -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2
# #CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
# FFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
# LFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
LINKFLAGS += -O3
#architecture
#CPU = -march=znver2
CXXFLAGS += ${CPU}
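# -ltbb: GCC's parallel <execution> algorithms (std::execution::par) dispatch to the TBB backend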
LINKFLAGS += ${CPU} -ltbb
# different libraries on Ubuntu or Manjaro
ifndef UBUNTU
UBUNTU=1
endif
# BLAS, LAPACK
ifeq ($(UBUNTU),1)
LINKFLAGS += -llapack -lblas
# -lopenblas
else
# on archlinux
LINKFLAGS += -llapack -lopenblas -lcblas
endif
# interprocedural optimization
CXXFLAGS += -flto
LINKFLAGS += -flto
# for debugging purpose (save code)
# -fsanitize=leak # only one of the three can be used
# -fsanitize=address
# -fsanitize=thread
SANITARY = -fsanitize=address -fsanitize=undefined -fsanitize=null -fsanitize=return \
-fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \
-fsanitize=bool -fsanitize=enum -fsanitize=vptr
#CXXFLAGS += ${SANITARY}
#LINKFLAGS += ${SANITARY}
# OpenMP
CXXFLAGS += -fopenmp
LINKFLAGS += -fopenmp
default: ${PROGRAM}
${PROGRAM}: ${OBJECTS}
$(LINKER) $^ ${LINKFLAGS} -o $@
clean:
@rm -f ${PROGRAM} ${OBJECTS}
clean_all:: clean
-@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt
-@rm -rf html
run: clean ${PROGRAM}
#run: ${PROGRAM}
# time ./${PROGRAM} ${PARAMS}
./${PROGRAM} ${PARAMS}
# tar the current directory
MY_DIR = `basename ${PWD}`
tar: clean_all
@echo "Tar the directory: " ${MY_DIR}
@cd .. ;\
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
# tar cf `basename ${PWD}`.tar *
#find . -size +10M > large_files
#--exclude-from ${MY_DIR}/large_files
zip: clean
@echo "Zip the directory: " ${MY_DIR}
@cd .. ;\
zip -r ${MY_DIR}.zip ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
doc:
doxygen Doxyfile
#########################################################################
.SUFFIXES: .f90
.cpp.o:
$(CXX) -c $(CXXFLAGS) -o $@ $<
# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $<.log
# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $(<:.cpp=.log)
.c.o:
$(CC) -c $(CFLAGS) -o $@ $<
.f.o:
$(F77) -c $(FFLAGS) -o $@ $<
.f90.o:
$(F77) -c $(FFLAGS) -o $@ $<
##################################################################################################
# some tools
# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags)
cache: ${PROGRAM}
valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS}
# kcachegrind callgrind.out.<pid> &
kcachegrind `ls -1tr callgrind.out.* |tail -1`
# Check for wrong memory accesses, memory leaks, ...
# use smaller data sets
# no "-pg" in compile/link options
mem: ${PROGRAM}
valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS}
# Graphical interface
# valkyrie
# Simple run time profiling of your code
# CXXFLAGS += -g -pg
# LINKFLAGS += -pg
prof: ${PROGRAM}
perf record ./$^ ${PARAMS}
perf report
# gprof -b ./$^ > gp.out
# kprof -f gp.out -p gprof &
# perf in Ubuntu 20.04: https://www.howtoforge.com/how-to-install-perf-performance-analysis-tool-on-ubuntu-20-04/
# * install
# * sudo vi /etc/sysctl.conf
# add kernel.perf_event_paranoid = 0
#Trace your heap:
#> heaptrack ./main.GCC_
#> heaptrack_gui heaptrack.main.GCC_.<pid>.gz
heap: ${PROGRAM}
heaptrack ./$^ ${PARAMS} 11
heaptrack_gui `ls -1tr heaptrack.$^.* |tail -1` &
codecheck: $(SOURCES)
cppcheck --enable=all --inconclusive --std=c++17 -I${CUDA_INC} --suppress=missingIncludeSystem $^
########################################################################
# get the detailed status of all optimization flags
info:
echo "detailed status of all optimization flags"
$(CXX) --version
$(CXX) -Q $(CXXFLAGS) --help=optimizers
lscpu
inxi -C
lstopo
# Excellent hardware info
# hardinfo
# Life monitoring of CPU frequency etc.
# sudo i7z
# Memory consumption
# vmstat -at -SM 3
# xfce4-taskmanager
# https://www.tecmint.com/check-linux-cpu-information/
#https://www.tecmint.com/monitor-cpu-and-gpu-temperature-in-ubuntu/
# Debugging:
# https://wiki.archlinux.org/index.php/Debugging

ex5/ICC_default.mk Normal file

@@ -0,0 +1,151 @@
# Basic definitions for using the Intel compiler suite sequentially
# requires setting of COMPILER=ICC_
# special on my sony [GH]
#BINDIR = /opt/save.intel/bin/
# very special on my sony [GH]
# FIND_LIBS = -L /opt/save.intel/composer_xe_2013.1.117/mkl/lib/intel64/libmkl_intel_lp64.so
#export KMP_AFFINITY=verbose,compact
CC = ${BINDIR}icc
CXX = ${BINDIR}icpc
F77 = ${BINDIR}ifort
LINKER = ${CXX}
WARNINGS = -pedantic -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -wd2015,2012
#-Winline -Wunreachable-code -Wredundant-decls
CXXFLAGS += -std=c++17 -O3 -fma -DNDEBUG ${WARNINGS} -mkl
#CXXFLAGS += -std=c++17 -O3 -march=core-avx2 -fma -ftz -fomit-frame-pointer -DNDEBUG ${WARNINGS} -mkl
# -fast # -fast also includes -ipo !
CXXFLAGS += -fargument-noalias -fargument-noalias-global -ansi-alias
CXXFLAGS += -align -qopt-dynamic-align
#CXXFLAGS += -xCore-AVX2
#CXXFLAGS += -tp=zen
# -qopt-subscript-in-range
# -vec-threshold0
# -xCORE-AVX2
# -axcode COMMON-AVX512 -axcode MIC-AVX512 -axcode CORE-AVX512 -axcode CORE-AVX2
# -ipo
# Reports: https://software.intel.com/en-us/articles/getting-the-most-out-of-your-intel-compiler-with-the-new-optimization-reports
#CXXFLAGS += -qopt-report=5 -qopt-report-phase=vec,par
#CXXFLAGS += -qopt-report=5 -qopt-report-phase=cg
# Redirect report from *.optrpt to stderr
# -qopt-report-file=stderr
# Guided parallelization
# -guide -parallel
# -guide-opts=string -guide-par[=n] -guide-vec[=n]
# -auto-p32 -simd
# interprocedural optimization
#CXXFLAGS += -ipo
#LINKFLAGS += -ipo
# annotated Assembler file
ANNOTED = -fsource-asm -S
# OpenMP
CXXFLAGS += -qopenmp
# -qopt-report-phase=openmp
# -diag-enable=sc-full -diag-file=filename -diag-file-append[=filename]
LINKFLAGS += -qopenmp
# use MKL by INTEL
# LINKFLAGS += -L${BINDIR}../composer_xe_2013.1.117/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
LINKFLAGS += -O2 -mkl
# -ipo
default: ${PROGRAM}
${PROGRAM}: ${OBJECTS}
$(LINKER) $^ ${LINKFLAGS} -o $@
clean:
rm -f ${PROGRAM} ${OBJECTS} *.optrpt
clean_all:: clean
@rm -f *_ *~ *.bak *.log *.out *.tar
run: clean ${PROGRAM}
./${PROGRAM}
# tar the current directory
MY_DIR = `basename ${PWD}`
tar: clean_all
@echo "Tar the directory: " ${MY_DIR}
@cd .. ;\
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
# tar cf `basename ${PWD}`.tar *
doc:
doxygen Doxyfile
#########################################################################
.cpp.o:
$(CXX) -c $(CXXFLAGS) -o $@ $<
.c.o:
$(CC) -c $(CFLAGS) -o $@ $<
.f.o:
$(F77) -c $(FFLAGS) -o $@ $<
##################################################################################################
# # some tools
# # Cache behaviour (CXXFLAGS += -g tracks down to source lines)
# cache: ${PROGRAM}
# valgrind --tool=callgrind --simulate-cache=yes ./$^
# # kcachegrind callgrind.out.<pid> &
#
# # Check for wrong memory accesses, memory leaks, ...
# # use smaller data sets
# mem: ${PROGRAM}
# valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^
#
# # Simple run time profiling of your code
# # CXXFLAGS += -g -pg
# # LINKFLAGS += -pg
# prof: ${PROGRAM}
# ./$^
# gprof -b ./$^ > gp.out
# # kprof -f gp.out -p gprof &
#
mem: inspector
prof: amplifier
cache: amplifier
gap_par_report:
${CXX} -c -guide -parallel $(SOURCES) 2> gap.txt
# GUI for performance report
amplifier: ${PROGRAM}
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid
amplxe-gui &
# GUI for Memory and Thread analyzer (race condition)
inspector: ${PROGRAM}
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
#${BINDIR}../inspector_xe_2013/bin64/inspxe-gui &
inspxe-gui &
advisor:
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
# https://software.intel.com/en-us/articles/intel-advisor-2017-update-1-what-s-new
export ADVIXE_EXPERIMENTAL=roofline
advixe-gui &
icc-info:
icpc -# main.cpp

ex5/ONEAPI_default.mk Normal file

@@ -0,0 +1,181 @@
# Basic definitions for using the Intel oneAPI compiler suite sequentially
# requires setting of COMPILER=ONEAPI_
# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html
# requires
# source /opt/intel/oneapi/setvars.sh
# on AMD: export MKL_DEBUG_CPU_TYPE=5
#BINDIR = /opt/intel/oneapi/compiler/latest/linux/bin/
#MKL_ROOT = /opt/intel/oneapi/mkl/latest/
#export KMP_AFFINITY=verbose,compact
CC = ${BINDIR}icc
CXX = ${BINDIR}dpcpp
F77 = ${BINDIR}ifort
LINKER = ${CXX}
## Compiler flags
WARNINGS = -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -pedantic
WARNINGS += -Wpessimizing-move -Wredundant-move
#-wd2015,2012,2014 -wn3
# -Winline -Wredundant-decls -Wunreachable-code
# -qopt-subscript-in-range
# -vec-threshold0
CXXFLAGS += -O3 -std=c++17 -tbb ${WARNINGS}
# https://stackoverflow.com/questions/67923287/how-to-resolve-no-member-named-task-in-namespace-tbb-error-when-using-oned
# needed on Desktop-PC Haase (not needed on mephisto)
CXXFLAGS += -D_GLIBCXX_USE_TBB_PAR_BACKEND=0
#CXXFLAGS += -DMKL_ILP64 -I"${MKLROOT}/include"
#CXXFLAGS += -DMKL_ILP32 -I"${MKLROOT}/include"
LINKFLAGS += -O3 -tbb
#LINKFLAGS += -no-prec-div
# interprocedural optimization
CXXFLAGS += -ipo
LINKFLAGS += -ipo
LINKFLAGS += -flto
# annotated Assembler file
ANNOTED = -fsource-asm -S
#architecture
CPU = -march=core-avx2
#CPU += -mtp=zen
# -xCORE-AVX2
# -axcode COMMON-AVX512 -axcode MIC-AVX512 -axcode CORE-AVX512 -axcode CORE-AVX2
CXXFLAGS += ${CPU}
LINKFLAGS += ${CPU}
# use MKL by INTEL
# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html
# sequential MKL
# use the 32 bit interface (LP64) instead of 64 bit interface (ILP64)
CXXFLAGS += -qmkl=sequential -UMKL_ILP64
LINKFLAGS += -O3 -qmkl=sequential -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
#LINKFLAGS += -O3 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
# shared libs: https://aur.archlinux.org/packages/intel-oneapi-compiler-static
# install intel-oneapi-compiler-static
# or
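# -shared-intel: link the Intel-provided runtime libraries dynamically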
LINKFLAGS += -shared-intel
OPENMP = -qopenmp
CXXFLAGS += ${OPENMP}
LINKFLAGS += ${OPENMP}
# profiling tools
#CXXFLAGS += -pg
#LINKFLAGS += -pg
# -vec-report=3
# -qopt-report=5 -qopt-report-phase=vec -qopt-report-phase=openmp
# -guide -parallel
# -guide-opts=string -guide-par[=n] -guide-vec[=n]
# -auto-p32 -simd
# Reports: https://software.intel.com/en-us/articles/getting-the-most-out-of-your-intel-compiler-with-the-new-optimization-reports
#CXXFLAGS += -qopt-report=5 -qopt-report-phase=vec,par
#CXXFLAGS += -qopt-report=5 -qopt-report-phase=cg
# Redirect report from *.optrpt to stderr
# -qopt-report-file=stderr
# Guided parallelization
# -guide -parallel
# -guide-opts=string -guide-par[=n] -guide-vec[=n]
# -auto-p32 -simd
## run time checks
# https://www.intel.com/content/www/us/en/develop/documentation/fortran-compiler-oneapi-dev-guide-and-reference/top/compiler-reference/compiler-options/offload-openmp-and-parallel-processing-options/par-runtime-control-qpar-runtime-control.html
default: ${PROGRAM}
${PROGRAM}: ${OBJECTS}
$(LINKER) $^ ${LINKFLAGS} -o $@
clean:
rm -f ${PROGRAM} ${OBJECTS} *.optrpt
clean_all:: clean
@rm -f *_ *~ *.bak *.log *.out *.tar
run: clean ${PROGRAM}
./${PROGRAM} ${PARAMS}
# tar the current directory
MY_DIR = `basename ${PWD}`
tar: clean_all
@echo "Tar the directory: " ${MY_DIR}
@cd .. ;\
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
# tar cf `basename ${PWD}`.tar *
doc:
doxygen Doxyfile
#########################################################################
.cpp.o:
$(CXX) -c $(CXXFLAGS) -o $@ $<
.c.o:
$(CC) -c $(CFLAGS) -o $@ $<
.f.o:
$(F77) -c $(FFLAGS) -o $@ $<
##################################################################################################
# some tools
# Cache behaviour (CXXFLAGS += -g tracks down to source lines)
# https://software.intel.com/content/www/us/en/develop/documentation/vtune-help/top/analyze-performance/microarchitecture-analysis-group/memory-access-analysis.html
mem: inspector
prof: vtune
cache: inspector
gap_par_report:
${CXX} -c -guide -parallel $(SOURCES) 2> gap.txt
# GUI for performance report
amplifier: ${PROGRAM}
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid
amplxe-gui &
# GUI for Memory and Thread analyzer (race condition)
inspector: ${PROGRAM}
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
# inspxe-gui &
vtune-gui ./${PROGRAM} &
advisor:
# source /opt/intel/oneapi/advisor/2021.2.0/advixe-vars.sh
# /opt/intel/oneapi/advisor/latest/bin64/advixe-gui &
advisor --collect=survey ./${PROGRAM} ${PARAMS}
# advisor --collect=roofline ./${PROGRAM} ${PARAMS}
advisor --report=survey --project-dir=./ src:r=./ --format=csv --report-output=./out/survey.csv
# advisor-gui
vtune:
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
# https://software.intel.com/en-us/articles/intel-advisor-2017-update-1-what-s-new
export ADVIXE_EXPERIMENTAL=roofline
vtune -collect hotspots ./${PROGRAM} ${PARAMS}
vtune -report hotspots -r r000hs > vtune.out
# vtune-gui ./${PROGRAM} &
icc-info:
icpc -# main.cpp
# MKL on AMD
# https://www.computerbase.de/2019-11/mkl-workaround-erhoeht-leistung-auf-amd-ryzen/
#
# https://sites.google.com/a/uci.edu/mingru-yang/programming/mkl-has-bad-performance-on-an-amd-cpu
# export MKL_DEBUG_CPU_TYPE=5
# export MKL_NUM_THREADS=1
# export MKL_DYNAMIC=false
# on Intel compiler
# http://publicclu2.blogspot.com/2013/05/intel-complier-suite-reference-card.html

ex5/PGI_default.mk Normal file

@@ -0,0 +1,96 @@
# Basic definitions for using the PGI compiler suite sequentially
# requires setting of COMPILER=PGI_
# OPTIRUN = optirun
# on mephisto:
#CXXFLAGS += -I/share/apps/atlas/include
#LINKFLAGS += -L/share/apps/atlas/lib
#LINKFLAGS += -lcblas -latlas
LINKFLAGS += -lblas
CC = pgcc
CXX = pgc++
F77 = pgfortran
LINKER = ${CXX}
WARNINGS = -Minform=warn
#PGI_PROFILING = -Minfo=loop,vect,opt,intensity,mp,accel
PGI_PROFILING = -Minfo=ccff,accel,ipa,loop,lre,mp,opt,par,unified,vect,intensity
# -Minfo
# -Mprof=lines
CXXFLAGS += -std=c++14 -O3 -fast -DNDEBUG ${PGI_PROFILING} ${WARNINGS}
CXXFLAGS += -Mvect -Mcache_align -Msafeptr -Mprefetch -Mlre -Mdepchk
#-Msmart
LINKFLAGS += ${PGI_PROFILING}
#-lcblas
# OpenMP
CXXFLAGS += -mp=align,bind,numa -Mneginfo=mp
LINKFLAGS += -mp=allcores,bind,numa
default: ${PROGRAM}
${PROGRAM}: ${OBJECTS}
$(LINKER) $^ ${LINKFLAGS} -o $@
clean:
@rm -f ${PROGRAM} ${OBJECTS}
clean_all:: clean
@rm -f *_ *~ *.bak *.log *.out *.tar
run: clean ${PROGRAM}
./${PROGRAM}
# tar the current directory
MY_DIR = `basename ${PWD}`
tar: clean_all
@echo "Tar the directory: " ${MY_DIR}
@cd .. ;\
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
# tar cf `basename ${PWD}`.tar *
doc:
doxygen Doxyfile
#########################################################################
.cpp.o:
$(CXX) -c $(CXXFLAGS) -o $@ $<
.c.o:
$(CC) -c $(CFLAGS) -o $@ $<
.f.o:
$(F77) -c $(FFLAGS) -o $@ $<
##################################################################################################
# # some tools
# # Simple run time profiling of your code
# # CXXFLAGS += -g
# # LINKFLAGS +=
# Profiling options PGI, see: pgprof -h
PROF_FILE = jac.pgprof
# CPU_PROF = -allcache
CPU_PROF = --cpu-profiling on --analysis-metrics
# GPU_PROF = -cuda=gmem,branch,cc13 -cudainit
#GPU_PROF = -cuda=branch:cc20
#
cache: prof
prof: ${PROGRAM}
# ./$^
# $(CUDA_HOME)/bin/nvvp &
# more /opt/pgi/linux86-64/16.10/bin/pgcollectrc
${OPTIRUN} ${BINDIR}pgprof ${CPU_PROF} -o $(PROF_FILE) ./$^
${OPTIRUN} ${BINDIR}pgprof -i $(PROF_FILE) 2> prof.out

ex5/ex5_1/Makefile Normal file

@@ -0,0 +1,30 @@
#
# use GNU-Compiler tools
COMPILER=GCC_
# alternatively from the shell
# export COMPILER=GCC_
# or, alternatively from the shell
# make COMPILER=GCC_
# use Intel compilers
#COMPILER=ICC_
# use PGI compilers
# COMPILER=PGI_
SOURCES = main.cpp mylib.cpp
OBJECTS = $(SOURCES:.cpp=.o)
PROGRAM = main.${COMPILER}
# the next two lines enable debug info for debugging and detailed performance analysis
CXXFLAGS += -g
LINKFLAGS += -g
# do not use -pg with PGI compilers
ifndef COMPILER
COMPILER=GCC_
endif
include ../${COMPILER}default.mk

ex5/ex5_1/check_env.h Normal file

@@ -0,0 +1,99 @@
#pragma once
#include <iostream>
#ifdef _OPENMP
#include <omp.h>
#endif
#include <unordered_map>
//#####################################
// G.Haase
// See https://sourceforge.net/p/predef/wiki/Compilers/
// http://www.cplusplus.com/doc/tutorial/preprocessor/
// also: export OMP_DISPLAY_ENV=VERBOSE
//#####################################
/** Checks for the compiler, its version, parallel environments, threads etc.
*
@param[in] argc number of command line arguments
@param[in] argv command line arguments as array of C-strings
*/
template <class T>
void check_env(T argc, char const *argv[])
{
std::cout << "\n#######################################################################\n";
std::cout << "Code :";
for (T k = 0; k < argc; ++k) std::cout << " " << argv[k];
std::cout << std::endl;
// compiler: https://sourceforge.net/p/predef/wiki/Compilers/
std::cout << "Compiler: ";
#if defined __INTEL_COMPILER
#pragma message(" ########## INTEL ###############")
std::cout << "INTEL " << __INTEL_COMPILER;
// Ignore warnings about unrecognized #pragma acc
#pragma warning disable 161
// Ignore warnings about unrecognized #pragma omp
#pragma warning disable 3180
#elif defined __PGI
#pragma message(" ########## PGI ###############")
std::cout << "PGI " << __PGIC__ << "." << __PGIC_MINOR__ << "." << __PGIC_PATCHLEVEL__;
#elif defined __clang__
#pragma message(" ########## CLANG ###############")
std::cout << "CLANG " << __clang_major__ << "." << __clang_minor__ << "."; // << __clang_patchlevel__;
#elif defined __GNUC__
#pragma message(" ########## Gnu ###############")
std::cout << "Gnu " << __GNUC__ << "." << __GNUC_MINOR__ << "." << __GNUC_PATCHLEVEL__;
#else
#pragma message(" ########## unknown Compiler ###############")
std::cout << "unknown";
#endif
std::cout << " C++ standard: " << __cplusplus << std::endl;
// Parallel environments
std::cout << "Parallel: ";
#if defined MPI_VERSION
#pragma message(" ########## MPI ###############")
#ifdef OPEN_MPI
std::cout << "OpenMPI ";
#else
std::cout << "MPI ";
#endif
std::cout << MPI_VERSION << "." << MPI_SUBVERSION << " ";
#endif
#ifdef _OPENMP
//https://www.openmp.org/specifications/
//https://stackoverflow.com/questions/1304363/how-to-check-the-version-of-openmp-on-linux
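// _OPENMP expands to the release date (yyyymm) of the OpenMP specification the compiler implements: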
std::unordered_map<unsigned, std::string> const map{
{200505, "2.5"}, {200805, "3.0"}, {201107, "3.1"}, {201307, "4.0"}, {201511, "4.5"}, {201611, "5.0"}, {201811, "5.0"}};
#pragma message(" ########## OPENMP ###############")
//std::cout << _OPENMP;
std::cout << "OpenMP ";
try {
std::cout << map.at(_OPENMP);
}
catch (...) {
std::cout << _OPENMP;
}
#pragma omp parallel
{
#pragma omp master
{
const int nn = omp_get_num_threads(); // OpenMP
std::cout << " ---> " << nn << " Threads ";
}
#pragma omp barrier
}
#endif
#ifdef _OPENACC
#pragma message(" ########## OPENACC ###############")
std::cout << "OpenACC ";
#endif
std::cout << std::endl;
std::cout << "Date : " << __DATE__ << " " << __TIME__;
std::cout << "\n#######################################################################\n";
}
// HG

ex5/ex5_1/main.cpp Normal file

@@ -0,0 +1,142 @@
#include "check_env.h"
#include "mylib.h"
#include <cstdlib> // atoi()
#include <cstring> // strncmp()
#include <ctime>
#include <iostream>
#include <omp.h> // OpenMP
#include <sstream>
#include <string>
using namespace std;
int main(int argc, char const *argv[])
{
omp_set_schedule(omp_sched_static, 2000000);
//omp_set_schedule(omp_sched_dynamic, 1000000);
//omp_set_schedule(omp_sched_guided, 1000000);
//omp_set_schedule(omp_sched_auto, 1); // chunk size does not matter for auto
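// The schedule chosen here is picked up by every loop in mylib.cpp that is
// declared with schedule(runtime); it overrides the OMP_SCHEDULE environment variable.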
// Speedup for different number of cores (incl. hyperthreading)
omp_set_num_threads(8);
// Print number of available processors
cout << "Number of available processors: " << omp_get_num_procs() << endl;
// Currently executing parallel code? -> no
cout << "Currently in parallel? " << omp_in_parallel() << endl;
int const NLOOPS = 10; // choose a value such that the benchmark runs at least 10 sec.
unsigned int N = 500000001;
//##########################################################################
// Read Parameter from command line (C++ style)
cout << "Checking command line parameters for: -n <number> " << endl;
for (int i = 1; i < argc; i++)
{
cout << " arg[" << i << "] = " << argv[i] << endl;
string ss(argv[i]);
if ("-n"==ss && i + 1 < argc) // found "-n" followed by another parameter
{
N = static_cast<unsigned int>(atoi(argv[i + 1]));
}
else
{
cout << "Corect call: " << argv[0] << " -n <number>\n";
}
}
cout << "\nN = " << N << endl;
check_env(argc, argv);
//########################################################################
int nthreads; // OpenMP
#pragma omp parallel default(none) shared(cout,nthreads)
{
stringstream inparallel;
inparallel << "Currently in parallel? " << omp_in_parallel() << endl;
int const th_id = omp_get_thread_num(); // OpenMP
int const nthrds = omp_get_num_threads(); // OpenMP
stringstream ss;
ss << "C++: Hello World from thread " << th_id << " / " << nthrds << endl;
#pragma omp critical
{
cout << ss.str(); // output to a shared ressource
cout << inparallel.str() << endl;
}
#pragma omp master
nthreads = nthrds; // transfer nthrds to the master thread
}
cout << " " << nthreads << " threads have been started." << endl;
//##########################################################################
// Memory allocation
cout << "Memory allocation\n";
vector<double> x(N), y(N);
cout.precision(2);
cout << 2.0 * N * sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
cout.precision(6);
//##########################################################################
// Data initialization
// Special: x_i = i+1; y_i = 1/x_i ==> <x,y> == N
for (unsigned int i = 0; i < N; ++i)
{
x[i] = i + 1;
y[i] = 1.0 / x[i];
}
//##########################################################################
cout << "\nStart Benchmarking\n";
// Do calculation
double tstart = omp_get_wtime(); // OpenMP
double sk(0.0);
for (int i = 0; i < NLOOPS; ++i)
{
//sk = scalar(x, y);
sk = scalar_parallel(x, y);
//sk = scalar_trans(x, y);
//sk = norm(x);
}
double t1 = omp_get_wtime() - tstart; // OpenMP
t1 /= NLOOPS; // divide by number of function calls
//##########################################################################
// Check the correct result
cout << "\n <x,y> = " << sk << endl;
if (static_cast<unsigned int>(sk) != N)
{
cout << " !! W R O N G result !!\n";
}
cout << endl;
//##########################################################################
// Timings and Performance
cout << endl;
cout.precision(2);
cout << "Total benchmarking time: " << t1*NLOOPS << endl;
cout << "Timing in sec. : " << t1 << endl;
cout << "GFLOPS : " << 2.0 * N / t1 / 1024 / 1024 / 1024 << endl;
cout << "GiByte/s : " << 2.0 * N / t1 / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
//#########################################################################
cout << "\n Try the reduction with an STL-vektor \n";
auto vr = reduction_vec_append(5);
cout << "done\n";
cout << vr << endl;
return 0;
} // memory for x and y will be deallocated by their destructors

ex5/ex5_1/mylib.cpp Normal file

@@ -0,0 +1,137 @@
#include "mylib.h"
#include <cassert> // assert()
#include <cmath>
#include <iostream>
#include <functional> // multiplies<>{}
#include <list>
#include <numeric> // iota()
#ifdef _OPENMP
#include <omp.h>
#endif
#include <vector>
using namespace std;
double scalar_parallel(vector<double> const &x, vector<double> const &y)
{
assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
size_t const N = x.size();
double sum = 0.0;
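// Manual work sharing: each thread handles the contiguous chunk [start, end),
// the last thread additionally takes the remainder; the per-thread partial
// sums are combined at the end of the region via reduction(+:sum).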
#pragma omp parallel default(none) shared(x,y,N, cout) reduction(+:sum)
{
const size_t nthreads = omp_get_num_threads();
const size_t threadnum = omp_get_thread_num();
const size_t chunksize = N/nthreads;
size_t start = threadnum*chunksize;
size_t end = start + chunksize;
if (threadnum == nthreads - 1)
end = N;
for (size_t i = start; i < end; ++i)
{
sum += x[i] * y[i];
}
}
return sum;
}
vector<int> reduction_vec_append(int n)
{
vector<int> vec(n);
#pragma omp parallel default(none) shared(cout) reduction(VecAppend:vec)
{
#pragma omp barrier
#pragma omp critical
cout << omp_get_thread_num() << " : " << vec.size() << endl;
#pragma omp barrier
iota( vec.begin(),vec.end(), omp_get_thread_num() );
#pragma omp barrier
}
return vec;
}
double scalar(vector<double> const &x, vector<double> const &y)
{
assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
size_t const N = x.size();
double sum = 0.0;
#pragma omp parallel for default(none) shared(x,y,N) reduction(+:sum) schedule(runtime) // added schedule(runtime)
for (size_t i = 0; i < N; ++i)
{
sum += x[i] * y[i];
//sum += exp(x[i])*log(y[i]);
}
return sum;
}
double norm(vector<double> const &x)
{
size_t const N = x.size();
double sum = 0.0;
#pragma omp parallel for default(none) shared(x,N) reduction(+:sum) schedule(runtime) // added schedule(runtime)
for (size_t i = 0; i < N; ++i)
{
sum += x[i]*x[i];
}
return sum;
}
vector<int> reduction_vec(int n)
{
vector<int> vec(n);
#pragma omp parallel default(none) shared(cout) reduction(VecAdd:vec)
{
#pragma omp barrier
#pragma omp critical
cout << omp_get_thread_num() << " : " << vec.size() << endl;
#pragma omp barrier
iota( vec.begin(),vec.end(), omp_get_thread_num() );
#pragma omp barrier
}
return vec;
}
double scalar_trans(vector<double> const &x, vector<double> const &y)
{
assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
vector<double> z(x.size());
//list<double> z(x.size()); // parallel for-loop on iterators not possible (missing 'operator-')
// c++-20 CLANG_, ONEAPI_:condition of OpenMP for loop must be a relational comparison
transform(cbegin(x),cend(x),cbegin(y),begin(z),std::multiplies<>{});
double sum = 0.0;
#pragma omp parallel for default(none) shared(z) reduction(+:sum) schedule(runtime) // added schedule(runtime)
for (auto pi = cbegin(z); pi!=cend(z); ++pi)
{
sum += *pi;
}
//for (auto val: z)
//{
//sum += val;
//}
return sum;
}

ex5/ex5_1/mylib.h Normal file

@@ -0,0 +1,88 @@
#pragma once
#include <cassert>
#include <iomanip> // setw()
#include <iostream>
#include <omp.h>
#include <vector>
/** Inner product
@param[in] x vector
@param[in] y vector
@return resulting Euclidean inner product <x,y>
*/
double scalar_parallel(std::vector<double> const &x, std::vector<double> const &y);
double scalar(std::vector<double> const &x, std::vector<double> const &y);
double scalar_trans(std::vector<double> const &x, std::vector<double> const &y);
// Declare additional reduction operation in OpenMP for STL-vector
#pragma omp declare reduction(VecAppend : std::vector<int> : omp_out.insert(omp_out.end(), omp_in.begin(), omp_in.end())) \
initializer (omp_priv=omp_orig)
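// Each private copy starts as a copy of the original vector (omp_priv=omp_orig);
// on exit the private copies are concatenated onto the shared result.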
std::vector<int> reduction_vec_append(int n);
/** l2-norm
@param[in] x vector
@return resulting Euclidean norm
*/
double norm(std::vector<double> const &x);
/** Vector @p b adds its elements to vector @p a .
@param[in] a vector
@param[in] b vector
@return a+=b componentwise
*/
template<class T>
std::vector<T> &operator+=(std::vector<T> &a, std::vector<T> const &b)
{
assert(a.size()==b.size());
for (size_t k = 0; k < a.size(); ++k) {
a[k] += b[k];
}
return a;
}
// Declare the reduction operation in OpenMP for an STL-vector
// omp_out += omp_in requires operator+=(vector<int> &, vector<int> const &) from above
// ------------------------------------------------------------
// https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm
// https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c
// https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size
// see also p.74ff in https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf
#pragma omp declare reduction(VecAdd : std::vector<int> : omp_out += omp_in) \
initializer (omp_priv=omp_orig)
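// Minimal usage sketch (assumption; reduction_vec() below works along these lines):
//   std::vector<int> v(n, 0);
//   #pragma omp parallel reduction(VecAdd : v)
//   { v[omp_get_thread_num() % v.size()] += 1; } // each thread fills its private copy
//   // on exit all private copies are combined element-wise via operator+= above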
// Templates are NOT possible, i.e., the reduction has to be declared for a specific type.
//template <class T>
//#pragma omp declare reduction(VecAdd : std::vector<T> : omp_out += omp_in) initializer (omp_priv(omp_orig))
// MS: template after the #pragma!?
// ------------------------------------------------------------
/** Test for vector reduction.
*
* The thread-private vectors of size @p n are initialized via @f$v_k^{tID}=tID+k@f$.
* Afterwards these vectors are accumulated, i.e.,
* @f$v_k= \sum_{tID=0}^{numThreads} v_k^{tID}@f$.
*
* @param[in] n size of global/private vector
* @return resulting global vector.
*/
std::vector<int> reduction_vec(int n);
/** Output of a vector.
@param[in,out] s output stream
@param[in] x vector
@return modified output stream
*/
template <class T>
std::ostream &operator<<(std::ostream &s, std::vector<T> const &x)
{
for (auto const &v : x) s << std::setw(4) << v << " ";
return s;
}

ex5/ex5_1/timing.h Normal file

@@ -0,0 +1,70 @@
#pragma once
#include <chrono> // timing
#include <stack>
using Clock = std::chrono::system_clock; //!< The wall clock timer chosen
//using Clock = std::chrono::high_resolution_clock;
using TPoint= std::chrono::time_point<Clock>;
// [Galowicz, C++17 STL Cookbook, p. 29]
inline
std::stack<TPoint> MyStopWatch; //!< starting time of stopwatch
/** Starts stopwatch timer.
* Use as @code tic(); myfunction(...) ; double tsec = toc(); @endcode
*
* The timing may be nested; the most recent start time is stored on top of the stack.
*
* @return recent time
* @see toc
*/
inline auto tic()
{
MyStopWatch.push(Clock::now());
return MyStopWatch.top();
}
/** Returns the elapsed time from stopwatch.
*
* The time from top of the stack is used
* if time point @p t_b is not passed as input parameter.
* Use as @code tic(); myfunction(...) ; double tsec = toc(); @endcode
* or as @code auto t_b = tic(); myfunction(...) ; double tsec = toc(t_b); @endcode
* The last option is to be used in the case of
* non-nested but overlapping time measurements.
*
* @param[in] t_b start time of some stop watch
* @return elapsed time in seconds.
*
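* @warning The top of the stopwatch stack is popped in any case, even when @p t_b is passed explicitly.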
*/
inline double toc(TPoint const &t_b = MyStopWatch.top())
{
// https://en.cppreference.com/w/cpp/chrono/treat_as_floating_point
using Unit = std::chrono::seconds;
using FpSeconds = std::chrono::duration<double, Unit::period>;
auto t_e = Clock::now();
MyStopWatch.pop();
return FpSeconds(t_e-t_b).count();
}
#include <iostream>
#include <string>
/** Executes function @p f and measures/prints elapsed wall clock time in seconds
*
* Call as
* @code measure("Time for (b = b + 1)", [&]() {
thrust::transform(b.begin(), b.end(), b.begin(), increment());
}); @endcode
*
* @param[in] label additional string to be printed with the measurement.
* @param[in] f function to execute.
* @author Therese Bösmüller, 2025
*
*/
auto measure = [](const std::string& label, auto&& f) {
auto start = std::chrono::high_resolution_clock::now();
f();
auto stop = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
std::cout << label << ": " << duration << " microseconds" << std::endl;
}; // ';' is needed for a visible documentation of this lambda-function

ex5/ex5_2/Makefile Normal file

@@ -0,0 +1,31 @@
#
# use GNU-Compiler tools
COMPILER=GCC_
# alternatively from the shell
# export COMPILER=GCC_
# or, alternatively from the shell
# make COMPILER=GCC_
# use Intel compilers
#COMPILER=ICC_
# use PGI compilers
# COMPILER=PGI_
SOURCES = main.cpp mylib.cpp
OBJECTS = $(SOURCES:.cpp=.o)
PROGRAM = main.${COMPILER}
# the next two lines enable debug info for debugging and detailed performance analysis
CXXFLAGS += -g
LINKFLAGS += -g
# do not use -pg with PGI compilers
ifndef COMPILER
COMPILER=GCC_
endif
include ../${COMPILER}default.mk

ex5/ex5_2/data_1.txt Normal file

@@ -0,0 +1,501 @@
141
261
87
430
258
298
425
120
496
707
244
786
75
394
4
221
2
190
143
269
175
139
599
902
940
222
483
377
524
265
69
437
174
27
955
431
962
763
8
681
706
646
553
219
773
229
371
891
857
403
319
609
911
910
592
333
854
443
905
34
533
717
180
337
188
322
404
549
49
553
275
242
244
155
957
936
819
729
176
361
189
2
317
700
626
544
440
288
502
762
763
577
748
646
124
505
348
93
148
199
673
432
695
257
10
533
280
947
907
393
25
672
838
972
57
451
583
687
720
651
727
374
582
117
58
980
285
595
963
186
194
342
933
391
274
152
398
375
132
436
92
615
11
574
790
236
449
570
62
497
643
222
838
972
847
506
279
747
237
958
621
601
173
91
256
859
912
700
726
230
577
811
404
989
90
321
512
61
726
557
530
830
859
790
318
453
753
110
110
270
525
973
711
312
292
851
912
640
256
89
839
585
949
62
585
286
828
191
443
394
827
677
208
319
134
672
571
170
148
477
909
553
33
54
806
452
383
790
365
533
712
872
329
651
975
76
588
414
310
264
759
996
187
782
196
993
803
425
729
499
809
357
74
591
911
194
433
750
40
947
764
559
184
498
518
995
855
963
679
404
935
480
232
397
706
559
757
996
963
536
964
116
52
305
581
531
902
541
432
543
713
17
801
143
479
257
370
662
170
279
199
196
327
881
472
404
180
969
408
845
616
377
878
785
465
814
899
430
335
597
902
703
378
735
955
543
541
312
72
182
93
464
10
916
643
2
31
209
455
128
9
728
355
781
437
437
50
50
92
595
242
842
858
964
489
221
227
537
763
348
462
640
918
162
716
578
434
885
394
179
634
625
328
803
1000
981
128
233
24
608
111
408
885
549
370
209
441
957
125
471
857
44
692
979
284
134
686
910
611
900
194
755
347
419
156
820
625
739
806
68
951
498
756
743
832
157
458
619
933
836
896
583
583
855
35
886
408
37
747
155
144
606
255
325
402
407
387
610
167
189
95
324
770
235
741
693
825
828
294
310
524
326
832
811
557
263
681
234
457
385
539
992
756
981
235
529
52
757
602
858
989
930
410
1
541
208
220
326
96
748
749
544
339
833
553
958
893
357
547
347
623
797
746
126
823
26
415
732
782
368

ex5/ex5_2/main.cpp Normal file

@@ -0,0 +1,130 @@
#include "mylib.h"
#include <fstream>
#include <iostream>
#include <omp.h>
#include <vector>
using namespace std;
int main()
{
// read vector from file
vector<size_t> data_vector = {};
ifstream input_stream("data_1.txt");
size_t value; // one number per line
while(input_stream >> value)
{
data_vector.push_back(value);
}
data_vector.shrink_to_fit();
// number of repetitions for the benchmark
size_t NLOOPS = 10000;
// ############# Parallelization with openMP #############
// calculate arithmetic mean, geometric mean and harmonic mean
double am_omp, gm_omp, hm_omp;
double tstart = omp_get_wtime();
for (size_t i = 0; i < NLOOPS; ++i)
means_omp(data_vector, am_omp, gm_omp, hm_omp);
double t_means_omp = (omp_get_wtime() - tstart)/NLOOPS;
// calculate minimum and maximum
size_t min_omp, max_omp;
tstart = omp_get_wtime();
for (size_t i = 0; i < NLOOPS; ++i)
minmax_omp(data_vector, min_omp, max_omp);
double t_minmax_omp = (omp_get_wtime() - tstart)/NLOOPS;
// ############# Parallelization with C++ algorithms #############
// calculate arithmetic mean, geometric mean and harmonic mean
double am_cpp, gm_cpp, hm_cpp;
tstart = omp_get_wtime();
for (size_t i = 0; i < NLOOPS; ++i)
means_cpp(data_vector, am_cpp, gm_cpp, hm_cpp);
double t_means_cpp = (omp_get_wtime() - tstart)/NLOOPS;
// calculate minimum and maximum
size_t min_cpp, max_cpp;
tstart = omp_get_wtime();
for (size_t i = 0; i < NLOOPS; ++i)
minmax_cpp(data_vector, min_cpp, max_cpp);
double t_minmax_cpp = (omp_get_wtime() - tstart)/NLOOPS;
// print results
cout << "####### OpenMP #######" << endl;
cout << "minimum: " << min << endl;
cout << "maximum: " << max << endl;
cout << "duration: " << t_minmax_omp << endl << endl;
cout << "arithmetic mean: " << am_omp << endl;
cout << "geometric mean: " << gm_omp << endl;
cout << "harmonic mean: " << hm_omp << endl;
cout << "duration: " << t_means_omp << endl << endl;
cout << "####### C++ #######" << endl;
cout << "minimum: " << min_cpp << endl;
cout << "maximum: " << max_cpp << endl;
cout << "duration: " << t_minmax_cpp << endl << endl;
cout << "arithmetic mean: " << am_cpp << endl;
cout << "geometric mean: " << gm_cpp << endl;
cout << "harmonic mean: " << hm_cpp << endl;
cout << "duration: " << t_means_cpp << endl << endl;
// ####### OpenMP #######
// minimum: 1
// maximum: 1000
// duration: 3.52086e-06
// arithmetic mean: 498.184
// geometric mean: 364.412
// harmonic mean: 95.6857
// duration: 5.90171e-06
// ####### C++ #######
// minimum: 1
// maximum: 1000
// duration: 1.76816e-05
// arithmetic mean: 498.184
// geometric mean: 364.412
// harmonic mean: 95.6857
// duration: 2.35728e-05
// --> the OpenMP variant is faster in both cases
return 0;
}

ex5/ex5_2/mylib.cpp Normal file

@@ -0,0 +1,103 @@
#include "mylib.h"
#include <algorithm>
#include <cmath>
#include <execution>
#include <iostream>
#include <numeric>
#include <omp.h>
using namespace std;
void means_omp(const std::vector<size_t> &numbers, double &am, double &gm, double &hm) // by reference: avoid copying the vector on every call
{
size_t const n = numbers.size();
am = 0.;
gm = 0.;
hm = 0.;
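// The reduction accumulates the three sums; afterwards
//   am = (1/n)*sum(x_i),  gm = exp((1/n)*sum(log x_i)) = (prod x_i)^(1/n),  hm = n / sum(1/x_i)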
#pragma omp parallel for shared(numbers, n, cout) reduction(+:am, gm, hm)
for (size_t i = 0; i < n; ++i)
{
am += numbers[i];
gm += log(numbers[i]);
hm += 1.0/numbers[i];
// #pragma omp critical
// {
// cout << "Thread number " << omp_get_thread_num() << " processes value " << numbers[i] << endl;
// }
}
am /= n;
gm = exp(gm/n);
hm = n/hm;
}
void minmax_omp(const std::vector<size_t> &numbers, size_t &global_min, size_t &global_max)
{
size_t const n = numbers.size();
global_min = -1; // gives the maximum size_t value
global_max = 0;
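// Manual reduction for demonstration; with OpenMP >= 3.1 this could simply use
// reduction(min:global_min) reduction(max:global_max) on a parallel for loop.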
#pragma omp parallel shared(numbers, n, global_min, global_max)
{
const size_t nthreads = omp_get_num_threads();
const size_t threadnum = omp_get_thread_num();
const size_t chunksize = n/nthreads;
size_t start = threadnum*chunksize;
size_t end = start + chunksize;
if (threadnum == nthreads - 1)
end = n;
size_t local_min = -1;
size_t local_max = 0;
for (size_t i = start; i < end ; ++i)
{
if (numbers[i] < local_min)
local_min = numbers[i];
if (numbers[i] > local_max)
local_max = numbers[i];
}
#pragma omp critical
{
if (local_min < global_min)
global_min = local_min;
if (local_max > global_max)
global_max = local_max;
}
}
}
void means_cpp(const std::vector<size_t> &numbers, double &am, double &gm, double &hm)
{
size_t const n = numbers.size();
am = reduce(std::execution::par, numbers.begin(), numbers.end(), 0.0); // 0.0: accumulate in double to avoid size_t overflow
gm = transform_reduce(std::execution::par, numbers.begin(), numbers.end(), 0.0, plus{}, [] (size_t x) -> double { return log(x); } );
hm = transform_reduce(std::execution::par, numbers.begin(), numbers.end(), 0.0, plus{}, [] (size_t x) -> double { return 1.0/x; });
am /= n;
gm = exp(gm/n);
hm = n/hm;
}
void minmax_cpp(const std::vector<size_t> &numbers, size_t &global_min, size_t &global_max)
{
auto min_it = min_element(std::execution::par, numbers.begin(), numbers.end());
auto max_it = max_element(std::execution::par, numbers.begin(), numbers.end());
global_min = *min_it;
global_max = *max_it;
}

ex5/ex5_2/mylib.h Normal file

@@ -0,0 +1,42 @@
#pragma once
#include <cstddef> // size_t
#include <vector>
/**
This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector.
Uses OpenMP parallelization.
@param[in] numbers vector containing integers
@param[out] am arithmetic mean
@param[out] gm geometric mean
@param[out] hm harmonic mean
*/
void means_omp(const std::vector<size_t> &numbers, double &am, double &gm, double &hm);
/**
This function calculates the minimum and maximum of a vector.
Uses OpenMP parallelization.
@param[in] numbers vector containing integers
@param[out] global_min minimum
@param[out] global_max maximum
*/
void minmax_omp(const std::vector<size_t> &numbers, size_t &global_min, size_t &global_max);
/**
This function calculates arithmetic mean, geometric mean and harmonic mean of an integer vector.
Uses C++ parallelization.
@param[in] numbers vector containing integers
@param[out] am arithmetic mean
@param[out] gm geometric mean
@param[out] hm harmonic mean
*/
void means_cpp(const std::vector<size_t> &numbers, double &am, double &gm, double &hm);
/**
This function calculates the minimum and maximum of a vector.
Uses C++ parallelization.
@param[in] numbers vector containing integers
@param[out] global_min minimum
@param[out] global_max maximum
*/
void minmax_cpp(const std::vector<size_t> &numbers, size_t &global_min, size_t &global_max);

ex5/ex5_3/Makefile Normal file

@@ -0,0 +1,30 @@
#
# use GNU-Compiler tools
COMPILER=GCC_
# alternatively from the shell
# export COMPILER=GCC_
# or, alternatively from the shell
# make COMPILER=GCC_
# use Intel compilers
#COMPILER=ICC_
# use PGI compilers
# COMPILER=PGI_
SOURCES = main.cpp goldbach.cpp
OBJECTS = $(SOURCES:.cpp=.o)
PROGRAM = main.${COMPILER}
# the next two lines enable debug info for debugging and detailed performance analysis
CXXFLAGS += -g
LINKFLAGS += -g
# do not use -pg with PGI compilers
ifndef COMPILER
COMPILER=GCC_
endif
include ../${COMPILER}default.mk

ex5/ex5_3/goldbach.cpp Normal file

@@ -0,0 +1,46 @@
#include "goldbach.h"
#include <iostream>
#include <iterator>
#include <omp.h>
size_t single_goldbach(size_t k)
{
const std::vector<size_t> relevant_primes = get_primes(k);
size_t m = relevant_primes.size();
size_t counter = 0;
#pragma omp parallel for shared(relevant_primes, m, k) reduction(+:counter)
for(size_t i = 0; i < m; ++i)
{
for(size_t j = i; j < m; ++j)
{
if(relevant_primes[i] + relevant_primes[j] == k)
++counter;
}
}
return counter;
}
std::vector<size_t> count_goldbach(size_t n)
{
const std::vector<size_t> relevant_primes = get_primes(n);
size_t m = relevant_primes.size();
std::vector<size_t> counter_vector(n + 1, 0);
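// counter_vector is combined element-wise across threads via the user-defined
// VecAdd reduction declared in goldbach.h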
#pragma omp parallel for shared(relevant_primes, m, n) reduction(VecAdd:counter_vector)
for(size_t i = 0; i < m; ++i)
{
for(size_t j = i; j < m; ++j)
{
size_t sum = relevant_primes[i] + relevant_primes[j];
if(sum <= n)
++counter_vector[sum];
}
}
return counter_vector;
}

ex5/ex5_3/goldbach.h Normal file

@@ -0,0 +1,45 @@
#pragma once
#include "mayer_primes.h"
#include <cassert>
#include <vector>
/**
This function returns the number of possible decompositions of an integer into a sum of two prime numbers.
@param[in] k the integer to decompose
@return number of decompositions
*/
size_t single_goldbach(size_t k);
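// Example: 10 = 3+7 = 5+5, hence single_goldbach(10) == 2 (pairs counted with i <= j)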
/**
This function counts, for every even integer in the interval [4,n], the number of possible decompositions into a sum of two prime numbers.
@param[in] n upper integer bound
@return vector whose entry at index k contains the number of decompositions of k
*/
std::vector<size_t> count_goldbach(size_t n);
/** Vector @p b adds its elements to vector @p a .
@param[in] a vector
@param[in] b vector
@return a+=b componentwise
*/
template<class T>
std::vector<T> &operator+=(std::vector<T> &a, std::vector<T> const &b)
{
assert(a.size()==b.size());
for (size_t k = 0; k < a.size(); ++k) {
a[k] += b[k];
}
return a;
}
// Declare the reduction operation in OpenMP for an STL-vector
// omp_out += omp_in requires operator+=(vector<int> &, vector<int> const &) from above
// ------------------------------------------------------------
// https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm
// https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c
// https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size
// see also p.74ff in https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf
#pragma omp declare reduction(VecAdd : std::vector<size_t> : omp_out += omp_in) initializer (omp_priv=omp_orig)
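// Minimal usage sketch for the VecAdd reduction above (assumes -fopenmp):
// each thread works on a private copy initialized from omp_orig, and the
// copies are merged with operator+= when the threads join.
//
// std::vector<size_t> hist(8, 0);
// #pragma omp parallel for reduction(VecAdd : hist)
// for (size_t i = 0; i < 1000; ++i)
// {
// ++hist[i % hist.size()];
// }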

45
ex5/ex5_3/main.cpp Normal file
View file

@ -0,0 +1,45 @@
#include "goldbach.h"
#include <algorithm>
#include <iostream>
#include <omp.h>
using namespace std;
int main()
{
cout << "Check: 694 has "<< single_goldbach(694) << " decompositions." << endl << "----------------------------------------" << endl;
for(size_t n : {10000, 100000, 400000, 1000000, 2000000})
{
double t_start = omp_get_wtime();
auto goldbach_vector = count_goldbach(n);
auto max_it = max_element(goldbach_vector.begin(), goldbach_vector.end());
size_t max_number = distance(goldbach_vector.begin(), max_it);
double t_end = omp_get_wtime() - t_start;
cout << "The number " << max_number << " has " << *max_it << " decompositions. Duration: " << t_end << endl;
}
/*
###### WITHOUT PARALLELIZATION ######
The number 9240 has 329 decompositions. Duration: 0.00307696
The number 99330 has 2168 decompositions. Duration: 0.189839
The number 390390 has 7094 decompositions. Duration: 1.3042
The number 990990 has 15594 decompositions. Duration: 5.45034
The number 1981980 has 27988 decompositions. Duration: 47.1807
###### WITH PARALLELIZATION ######
The number 9240 has 329 decompositions. Duration: 0.000734854
The number 99330 has 2168 decompositions. Duration: 0.0251322
The number 390390 has 7094 decompositions. Duration: 0.487375
The number 990990 has 15594 decompositions. Duration: 6.16972
The number 1981980 has 27988 decompositions. Duration: 31.5699
*/
return 0;
}

73
ex5/ex5_3/mayer_primes.h Normal file
View file

@ -0,0 +1,73 @@
#pragma once
#include <cstring> //memset
#include <vector>
//using namespace std;
/** \brief Determines all prime numbers in interval [2, @p max].
*
* The sieve of Eratosthenes is used.
*
* The implementation originates from <a href="http://code.activestate.com/recipes/576559-fast-prime-generator/">Florian Mayer</a>.
*
* \param[in] max end of interval for the prime number search.
* \return vector of prime numbers @f$2,3,5, ..., p<=max @f$.
*
* \copyright
* Copyright (c) 2008 Florian Mayer (adapted by Gundolf Haase 2018)
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
*/
template <class T>
std::vector<T> get_primes(T max)
{
std::vector<T> primes;
char *sieve;
sieve = new char[max / 8 + 1];
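// One bit per number: bit (x % 8) of sieve[x / 8] stays set while x is
// still considered prime.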
// Fill sieve with 1-bits: initially every number counts as potentially prime
memset(sieve, 0xFF, (max / 8 + 1) * sizeof(char));
for (T x = 2; x <= max; x++)
{
if (sieve[x / 8] & (0x01 << (x % 8))) {
primes.push_back(x);
// x is prime; mark its multiples as composite.
for (T j = 2 * x; j <= max; j += x)
{
sieve[j / 8] &= ~(0x01 << (j % 8));
}
}
}
delete[] sieve;
return primes;
}
//---------------------------------------------------------------
// Original driver by Florian Mayer; compile with e.g.
//   g++ -O3 -std=c++14 -fopenmp main.cpp && ./a.out
//int main()
//{
// vector<unsigned long> primes = get_primes(10000000);
// // Print out result.
// for (auto it = primes.begin(); it != primes.end(); ++it)
// cout << *it << " ";
// cout << endl;
// return 0;
//}

30
ex5/ex5_4/Makefile Normal file
View file

@ -0,0 +1,30 @@
#
# use GNU-Compiler tools
COMPILER=GCC_
# alternatively from the shell
# export COMPILER=GCC_
# or, alternatively from the shell
# make COMPILER=GCC_
# use Intel compilers
#COMPILER=ICC_
# use PGI compilers
# COMPILER=PGI_
SOURCES = main.cpp benchmarks.cpp benchmark_tests.cpp
OBJECTS = $(SOURCES:.cpp=.o)
PROGRAM = main.${COMPILER}
# uncomment the next two lines for debugging and detailed performance analysis
CXXFLAGS += -g
LINKFLAGS += -g
# do not use -pg with PGI compilers
ifndef COMPILER
COMPILER=GCC_
endif
include ../${COMPILER}default.mk

375
ex5/ex5_4/benchmark_tests.cpp Normal file
View file

@ -0,0 +1,375 @@
#include "benchmark_tests.h"
#include "benchmarks.h"
#include <chrono>
#include <iostream>
#include <cmath>
using namespace std::chrono;
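// Timing scheme shared by all tests below: wall-clock time via
// system_clock, the kernel repeated NLOOPS times, and the reported
// duration averaged per loop.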
vector<double> test_A(const size_t &NLOOPS, const size_t &N)
{
cout << "#################### (A) ####################" << endl;
cout << "\nLOOPS = " << NLOOPS << endl;
cout << "\nN = " << N << endl;
// Memory allocation
cout << "Memory allocation\n";
vector<double> x(N), y(N);
cout.precision(2);
cout << 2.0*N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GiByte Memory allocated\n";
cout.precision(6);
// Data initialization
// Special: x_i = (i % 219) + 1; y_i = 1/x_i ==> <x,y> == N
for (size_t i = 0; i < N; ++i)
{
x[i] = i % 219 + 1;
y[i] = 1.0/x[i];
}
cout << "\nStart Benchmarking scalar\n";
auto t1 = system_clock::now(); // start timer
// Do calculation
double check(0.0),ss(0.0);
for (size_t i = 0; i < NLOOPS; ++i)
{
check = scalar_parallel(x, y);
ss += check; // prevents the optimizer from removing unused calculation results.
}
auto t2 = system_clock::now(); // stop timer
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
t_diff = t_diff/NLOOPS; // duration per loop seconds
// Check the correct result
cout << "\n <x,y> = " << check << endl;
if (static_cast<unsigned int>(check) != N)
cout << " !! W R O N G result !!\n";
cout << endl;
// Timings and Performance
cout << endl;
cout.precision(2);
double Gflops = 2.0*N / t_diff / 1024 / 1024 / 1024;
double MemBandwidth = 2.0*N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
cout << "Total duration : " << t_diff*NLOOPS << endl;
cout << "Timing in sec. : " << t_diff << endl;
cout << "GFLOPS : " << Gflops << endl;
cout << "GiByte/s : " << MemBandwidth << endl;
return vector<double>{t_diff, Gflops, MemBandwidth};
}
vector<double> test_A_sum(const size_t &NLOOPS, const size_t &N)
{
cout << "#################### (A) sum ####################" << endl;
cout << "\nLOOPS = " << NLOOPS << endl;
cout << "\nN = " << N << endl;
// Memory allocation
cout << "Memory allocation\n";
vector<double> x(N);
cout.precision(2);
cout << 1.0*N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GiByte Memory allocated\n";
cout.precision(6);
// Data initialization
for (size_t i = 0; i < N; ++i)
{
x[i] = 1;
}
cout << "\nStart Benchmarking sum\n";
auto t1 = system_clock::now(); // start timer
// Do calculation
double check(0.0),ss(0.0);
for (size_t i = 0; i < NLOOPS; ++i)
{
check = sum(x);
ss += check; // prevents the optimizer from removing unused calculation results.
}
auto t2 = system_clock::now(); // stop timer
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
t_diff = t_diff/NLOOPS; // duration per loop seconds
// Check the correct result
cout << "\n <x,y> = " << check << endl;
if (static_cast<unsigned int>(check) != N)
cout << " !! W R O N G result !!\n";
cout << endl;
// Timings and Performance
cout << endl;
cout.precision(2);
double Gflops = 1.0*N / t_diff / 1024 / 1024 / 1024;
double MemBandwidth = 1.0*N / t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
cout << "Total duration : " << t_diff*NLOOPS << endl;
cout << "Timing in sec. : " << t_diff << endl;
cout << "GFLOPS : " << Gflops << endl;
cout << "GiByte/s : " << MemBandwidth << endl;
return vector<double>{t_diff, Gflops, MemBandwidth};
}
vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M)
{
cout << "#################### (B) ####################" << endl;
cout << "\nLOOPS = " << NLOOPS << endl;
cout << "\nN = " << N << endl;
cout << "\nM = " << M << endl;
// Memory allocation
cout << "Memory allocation\n";
vector<double> A(M*N);
vector<double> x(N);
cout.precision(2);
cout << (1.0*M*N + N) * sizeof(x[0]) / 1024 / 1024 / 1024 << " GiByte Memory allocated\n";
cout.precision(6);
// Data initialization
for (size_t i = 0; i < M; ++i)
for (size_t j = 0; j < N; ++j)
A[N*i + j] = (i + j) % 219 + 1;
for (size_t j = 0; j < N; ++j)
{
x[j] = 1.0/A[N*17 + j];
}
cout << "\nStart Benchmarking MatVec\n";
auto t1 = system_clock::now(); // start timer
// Do calculation
vector<double> b(M);
for (size_t i = 0; i < NLOOPS; ++i)
{
b = MatVec_parallel(A, x);
}
auto t2 = system_clock::now(); // stop timer
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
t_diff = t_diff/NLOOPS; // duration per loop seconds
// Check the correct result
cout << "\n <A[17,*],x> = " << b[17] << endl;
if (static_cast<size_t>(b[17]) != N)
{
cout << " !! W R O N G result !!\n";
}
cout << endl;
// Timings and Performance
cout << endl;
cout.precision(2);
double Gflops = (2.0*N*M) / t_diff / 1024 / 1024 / 1024;
double MemBandwidth = (2.0*N*M + M)/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
cout << "Total duration : " << t_diff*NLOOPS << endl;
cout << "Timing in sec. : " << t_diff << endl;
cout << "GFLOPS : " << Gflops << endl;
cout << "GiByte/s : " << MemBandwidth << endl;
return vector<double>{t_diff, Gflops, MemBandwidth};
}
vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N)
{
cout << "#################### (C) ####################" << endl;
cout << "\nLOOPS = " << NLOOPS << endl;
cout << "\nL = " << L << endl;
cout << "\nM = " << M << endl;
cout << "\nN = " << N << endl;
// Memory allocation
cout << "Memory allocation\n";
vector<double> A(M*L);
vector<double> B(L*N);
cout.precision(2);
cout << (1.0*M*L + L*N) *sizeof(A[0]) / 1024 / 1024 / 1024 << " GiByte Memory allocated\n";
cout.precision(6);
// Data initialization
for (size_t i = 0; i < M; ++i)
for (size_t k = 0; k < L; ++k)
A[L*i + k] = (i + k) % 219 + 1;
for (size_t k = 0; k < L; ++k)
for (size_t j = 0; j < N; ++j)
B[N*k + j] = 1.0/A[L*17 + k];
cout << "\nStart Benchmarking MatMat\n";
auto t1 = system_clock::now(); // start timer
// Do calculation
vector<double> C(M*N);
double check;
double check_sum = 0;
for (size_t i = 0; i < NLOOPS; ++i)
{
C = MatMat_parallel(A, B, L);
check = C[N*17];
check_sum += check; // prevents the optimizer from removing unused calculation results.
}
cout << check_sum;
auto t2 = system_clock::now(); // stop timer
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
t_diff = t_diff/NLOOPS; // duration per loop seconds
// Check the correct result
cout << "\n C[17,0] = " << check << endl;
if (static_cast<unsigned int>(check) != L)
{
cout << " !! W R O N G result !!, should be " << L <<"\n";
}
cout << endl;
// Timings and Performance
cout << endl;
cout.precision(2);
double Gflops = (2.0*L*N*M) / t_diff / 1024 / 1024 / 1024;
double MemBandwidth = (2.0*L*N*M + M*N)/ t_diff / 1024 / 1024 / 1024 * sizeof(A[0]);
cout << "Total duration : " << t_diff*NLOOPS << endl;
cout << "Timing in sec. : " << t_diff << endl;
cout << "GFLOPS : " << Gflops << endl;
cout << "GiByte/s : " << MemBandwidth << endl;
return vector<double>{t_diff, Gflops, MemBandwidth};
}
vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p)
{
cout << "#################### (D) ####################" << endl;
cout << "\nLOOPS = " << NLOOPS << endl;
cout << "\nN = " << N << endl;
cout << "\np = " << p << endl;
// Memory allocation
cout << "Memory allocation\n";
vector<double> a(p + 1, 0);
vector<double> x(N);
cout.precision(2);
cout << (1.0*(p + 1) + N) *sizeof(x[0]) / 1024 / 1024 / 1024 << " GiByte Memory allocated\n";
cout.precision(6);
// Data initialization
for (size_t j = 0; j < N; ++j)
x[j] = 1.0*j;
for (size_t k = 0; k < p + 1; ++k)
a[k] = pow(-1.0, k); // poly(x) = 1 - x + x^2 - x^3 + x^4 - ...
cout << "\nStart Benchmarking poly\n";
auto t1 = system_clock::now(); // start timer
// Do calculation
vector<double> y(N);
double check;
double check_sum = 0.0;
for (size_t i = 0; i < NLOOPS; ++i)
{
y = poly_parallel(a, x);
check = y[0];
check_sum += check; // prevents the optimizer from removing unused calculation results.
}
auto t2 = system_clock::now(); // stop timer
auto duration = duration_cast<microseconds>(t2 - t1); // duration in microseconds
double t_diff = static_cast<double>(duration.count()) / 1e6; // overall duration in seconds
t_diff = t_diff/NLOOPS; // duration per loop seconds
// Check the correct result
cout << "\n poly(" << x[0] << ") = " << check << endl;
if (std::abs(check - 1.0) > 1e-6)
{
cout << " !! W R O N G result !!\n";
}
cout << endl;
// Timings and Performance
cout << endl;
cout.precision(2);
double Gflops = (N*(p + 1)*2.0) / t_diff / 1024 / 1024 / 1024; // Horner: one mul + one add per coefficient
double MemBandwidth = (N*(2.0 + 3.0*(p + 1)))/ t_diff / 1024 / 1024 / 1024 * sizeof(x[0]);
cout << "Total duration : " << t_diff*NLOOPS << endl;
cout << "Timing in sec. : " << t_diff << endl;
cout << "GFLOPS : " << Gflops << endl;
cout << "GiByte/s : " << MemBandwidth << endl;
return vector<double>{t_diff, Gflops, MemBandwidth};
}

13
ex5/ex5_4/benchmark_tests.h Normal file
View file

@ -0,0 +1,13 @@
#pragma once
#include <vector>
using namespace std;
vector<double> test_A(const size_t &NLOOPS, const size_t &N);
vector<double> test_A_sum(const size_t &NLOOPS, const size_t &N);
vector<double> test_B(const size_t &NLOOPS, const size_t &N, const size_t &M);
vector<double> test_C(const size_t &NLOOPS, const size_t &L, const size_t &M, const size_t &N);
vector<double> test_D(const size_t &NLOOPS, const size_t &N, const size_t &p);

141
ex5/ex5_4/benchmarks.cpp Normal file
View file

@ -0,0 +1,141 @@
#include "benchmarks.h"
#include <cassert> // assert()
#include <cmath>
#include <iostream>
#include <vector>
#include <omp.h>
// (A) Inner product of two vectors (from skalar_stl)
double scalar_parallel(vector<double> const &x, vector<double> const &y)
{
assert(x.size() == y.size());
size_t const N = x.size();
double sum = 0.0;
//#pragma omp parallel for default(none) shared(x, y, N) reduction(+:sum) schedule(runtime)
#pragma omp parallel for shared(x, y, N) reduction(+:sum)
for (size_t i = 0; i < N; ++i)
{
sum += x[i] * y[i];
}
return sum;
}
// (A) Vector entry sum
double sum(vector<double> const &x)
{
double sum = 0.0;
#pragma omp parallel for shared(x) reduction(+:sum)
for (size_t i = 0; i < x.size(); ++i)
{
sum += x[i];
}
return sum;
}
// (B) Matrix-vector product (from intro_vector_densematrix)
vector<double> MatVec_parallel(vector<double> const &A, vector<double> const &x)
{
size_t const nelem = A.size();
size_t const N = x.size();
assert(nelem % N == 0); // make sure multiplication is possible
size_t const M = nelem/N;
vector<double> b(M);
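// Parallelize over rows: each thread accumulates its own b[i] in a local
// tmp, so no two threads write to the same element.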
#pragma omp parallel for shared(A, x, N, M, b)
for (size_t i = 0; i < M; ++i)
{
double tmp = 0.0;
for (size_t j = 0; j < N; ++j)
tmp += A[N*i + j] * x[j];
b[i] = tmp;
}
return b;
}
// (C) Matrix-matrix product
vector<double> MatMat_parallel(vector<double> const &A, vector<double> const &B, size_t const &L)
{
size_t const nelem_A = A.size();
size_t const nelem_B = B.size();
assert(nelem_A % L == 0 && nelem_B % L == 0);
size_t const M = nelem_A/L;
size_t const N = nelem_B/L;
vector<double> C(M*N);
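// Loop order i-k-j: the innermost loop walks B and C contiguously, and
// parallelizing over i gives each thread exclusive rows of C, so no
// reduction is needed.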
#pragma omp parallel for shared(A, B, M, N, L, C)
for (size_t i = 0; i < M; ++i)
{
for (size_t k = 0; k < L; ++k)
{
for (size_t j = 0; j < N; ++j)
{
C[N*i + j] += A[L*i + k]*B[N*k + j];
}
}
}
return C;
}
// (D) Evaluation of a polynomial function
vector<double> poly_parallel(vector<double> const &a, vector<double> const &x)
{
size_t const N = x.size();
size_t const p = a.size() - 1;
vector<double> y(N, 0);
#pragma omp parallel for shared(a, x, N, p, y)
for (size_t i = 0; i < N; ++i)
{
double x_temp = x[i];
double y_temp = 0;
for (size_t k = 0; k < p + 1; ++k)
{
y_temp = x_temp*y_temp + a[p - k]; // Horner's scheme: y <- y*x + a[p-k]
}
y[i] = y_temp;
}
return y;
}

55
ex5/ex5_4/benchmarks.h Normal file
View file

@ -0,0 +1,55 @@
#pragma once
#include <vector>
using namespace std;
/** (A) Inner product of two vectors (from skalar_stl)
@param[in] x vector
@param[in] y vector
@return resulting Euclidean inner product <x,y>
*/
double scalar_parallel(vector<double> const &x, vector<double> const &y);
/** (A) Sum entries of vector
@param[in] x vector
@return sum
*/
double sum(vector<double> const &x);
/** (B) Matrix-vector product (from intro_vector_densematrix)
* @param[in] A dense matrix (1D access)
* @param[in] x vector
*
* @return resulting vector
*/
vector<double> MatVec_parallel(vector<double> const &A, vector<double> const &x);
/** (C) Matrix-matrix product
* @param[in] A MxL dense matrix (1D access)
* @param[in] B LxN dense matrix (1D access)
* @param[in] shared_dim shared dimension L
*
* @return resulting MxN matrix
*/
vector<double> MatMat_parallel(vector<double> const &A, vector<double> const &B, size_t const &shared_dim);
/** (D) Evaluation of a polynomial function using Horner's scheme
* @param[in] a coefficient vector
* @param[in] x vector with input values
*
* @return vector with output values
*/
vector<double> poly_parallel(vector<double> const &a, vector<double> const &x);
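// Horner sketch: a(x) = a_0 + a_1*x + ... + a_p*x^p is evaluated as
// y = 0; then y = y*x + a[p-k] for k = 0,...,p,
// i.e. one multiplication and one addition per coefficient.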

84
ex5/ex5_4/main.cpp Normal file
View file

@ -0,0 +1,84 @@
#include "benchmark_tests.h"
#include <iostream>
#include <cmath>
int main()
{
vector<vector<double>> results_scalar;
results_scalar.push_back(test_A(2000000, pow(10,3)));
results_scalar.push_back(test_A(1000000, pow(10,4)));
results_scalar.push_back(test_A(100000, pow(10,5)));
results_scalar.push_back(test_A(10000, pow(10,6)));
results_scalar.push_back(test_A(750, pow(10,7)));
results_scalar.push_back(test_A(125, pow(10,8)));
vector<vector<double>> results_sum;
results_sum.push_back(test_A_sum(3000000, pow(10,3)));
results_sum.push_back(test_A_sum(2000000, pow(10,4)));
results_sum.push_back(test_A_sum(1000000, pow(10,5)));
results_sum.push_back(test_A_sum(50000, pow(10,6)));
results_sum.push_back(test_A_sum(2000, pow(10,7)));
results_sum.push_back(test_A_sum(250, pow(10,8)));
test_B(100, 20000, 10000);
test_C(25, 500, 1000, 1500);
test_D(100, 100, 1000000);
cout << endl << "###### Scalar ######" << endl;
cout << "Timing\tGFLOPS\tGiByte/s" << endl;
cout << "------------------------------" << endl;
for (size_t i = 0; i < results_scalar.size(); ++i)
cout << results_scalar[i][0] << "\t" << results_scalar[i][1] << "\t" << results_scalar[i][2] << endl;
cout << endl << "###### Sum ######" << endl;
cout << "Timing\tGFLOPS\tGiByte/s" << endl;
cout << "------------------------------" << endl;
for (size_t i = 0; i < results_sum.size(); ++i)
cout << results_sum[i][0] << "\t" << results_sum[i][1] << "\t" << results_sum[i][2] << endl;
// ###### Scalar ######
// Timing GFLOPS GiByte/s
// ------------------------------
// 3.4e-06 0.54 4.3
// 4.6e-06 4 32
// 1.6e-05 12 95
// 0.0011 1.7 13
// 0.0097 1.9 15
// 0.075 2.5 20
// ###### Sum ######
// Timing GFLOPS GiByte/s
// ------------------------------
// 5.5e-06 0.17 1.3
// 5.4e-06 1.7 14
// 1.5e-05 6.1 49
// 0.00013 7.2 57
// 0.0033 2.8 23
// 0.032 2.9 23
// ######### NOT PARALLEL (from exercise sheet 2) #########
// Timing GFLOPS GiByte/s
// ----------------------------------
// (A) 0.038 2.5 20
// (B) 0.13 2.9 23
// (C) 0.44 3.2 25
// (D) 0.19 1.5 12
return 0;
}