This commit is contained in:
dino.celebic 2025-12-05 23:13:00 +01:00
commit 95b3017475
28 changed files with 5800 additions and 0 deletions

View file

@ -0,0 +1,131 @@
# Basic Defintions for using GNU-compiler suite sequentially
# requires setting of COMPILER=CLANG_
# https://llvm.org/docs/CompileCudaWithLLVM.html
# https://llvm.org/docs/NVPTXUsage.html
#CLANGPATH=//usr/lib/llvm-10/bin/
CC = ${CLANGPATH}clang
CXX = ${CLANGPATH}clang++
#CXX = ${CLANGPATH}clang++ -lomptarget -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=/opt/pgi/linux86-64/2017/cuda/8.0
#F77 = gfortran
LINKER = ${CXX}
#http://clang.llvm.org/docs/UsersManual.html#options-to-control-error-and-warning-messages
WARNINGS += -pedantic -Weverything -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion
WARNINGS += -Wno-c++98-compat -Wno-sign-conversion -Wno-date-time -Wno-shorten-64-to-32 -Wno-padded -ferror-limit=1
WARNINGS += -Wno-unsafe-buffer-usage
#-fsyntax-only -Wdocumentation -Wconversion -Wshadow -Wfloat-conversion -pedantic
CXXFLAGS += -O3 -std=c++17 -ferror-limit=1 ${WARNINGS}
# don't use -Ofast
# -ftrapv
LINKFLAGS += -O3
# different libraries in Ubuntu or manajaró
ifndef UBUNTU
UBUNTU=1
endif
# BLAS, LAPACK
LINKFLAGS += -llapack -lblas
# -lopenblas
ifeq ($(UBUNTU),1)
# ubuntu
else
# on archlinux
LINKFLAGS += -lcblas
endif
# interprocedural optimization
CXXFLAGS += -flto
LINKFLAGS += -flto
#sudo apt install libomp-dev
# OpenMP
CXXFLAGS += -fopenmp
LINKFLAGS += -fopenmp
# very good check
# http://clang.llvm.org/extra/clang-tidy/
# good check, see: http://llvm.org/docs/CodingStandards.html#include-style
SWITCH_OFF=,-readability-magic-numbers,-readability-redundant-control-flow,-readability-redundant-member-init
SWITCH_OFF+=,-readability-redundant-member-init,-readability-isolate-declaration
#READABILITY=,readability*${SWITCH_OFF}
#TIDYFLAGS = -checks=llvm-*,-llvm-header-guard -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
TIDYFLAGS = -checks=llvm-*,-llvm-header-guard${READABILITY} -header-filter=.* -enable-check-profile -extra-arg="-std=c++17" -extra-arg="-fopenmp"
#TIDYFLAGS += -checks='modernize*
# ???
#TIDYFLAGS = -checks='cert*' -header-filter=.*
# MPI checks ??
#TIDYFLAGS = -checks='mpi*'
# ??
#TIDYFLAGS = -checks='performance*' -header-filter=.*
#TIDYFLAGS = -checks='portability-*' -header-filter=.*
#TIDYFLAGS = -checks='readability-*' -header-filter=.*
default: ${PROGRAM}
${PROGRAM}: ${OBJECTS}
$(LINKER) $^ ${LINKFLAGS} -o $@
clean:
@rm -f ${PROGRAM} ${OBJECTS}
clean_all:: clean
@rm -f *_ *~ *.bak *.log *.out *.tar
codecheck: tidy_check
tidy_check:
clang-tidy ${SOURCES} ${TIDYFLAGS} -- ${SOURCES}
# see also http://clang-developers.42468.n3.nabble.com/Error-while-trying-to-load-a-compilation-database-td4049722.html
run: clean ${PROGRAM}
# time ./${PROGRAM} ${PARAMS}
./${PROGRAM} ${PARAMS}
# tar the current directory
MY_DIR = `basename ${PWD}`
tar: clean_all
@echo "Tar the directory: " ${MY_DIR}
@cd .. ;\
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
# tar cf `basename ${PWD}`.tar *
doc:
doxygen Doxyfile
#########################################################################
.cpp.o:
$(CXX) -c $(CXXFLAGS) -o $@ $<
.c.o:
$(CC) -c $(CFLAGS) -o $@ $<
.f.o:
$(F77) -c $(FFLAGS) -o $@ $<
##################################################################################################
# some tools
# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags)
cache: ${PROGRAM}
valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS}
# kcachegrind callgrind.out.<pid> &
kcachegrind `ls -1tr callgrind.out.* |tail -1`
# Check for wrong memory accesses, memory leaks, ...
# use smaller data sets
mem: ${PROGRAM}
valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS}
# Simple run time profiling of your code
# CXXFLAGS += -g -pg
# LINKFLAGS += -pg
prof: ${PROGRAM}
perf record ./$^ ${PARAMS}
perf report
# gprof -b ./$^ > gp.out
# kprof -f gp.out -p gprof &
codecheck: tidy_check

View file

@ -0,0 +1,212 @@
# Basic Defintions for using GNU-compiler suite with OpenMP und MKL
# requires setting of COMPILER=GCCMKL_
# install MKL in manjaro
# https://linux-packages.com/manjaro-linux/package/intel-mkl
# > sudo pacman -Sy
# > sudo pacman -S intel-mkl
ifeq ($(ONEAPI),1)
MKL_INCLUDE=/opt/intel/oneapi/mkl/2024.0/include
MKL_LIB=/opt/intel/oneapi/2024.0/lib
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/opt/intel/oneapi/2024.0/lib
else
MKL_INCLUDE=/usr/include/mkl
MKL_LIB=/usr/lib/x86_64-linux-gnu/mkl
endif
CC = gcc
CXX = g++
F77 = gfortran
LINKER = ${CXX}
WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
-Wredundant-decls -Winline -fmax-errors=1
# -Wunreachable-code
CXXFLAGS += -ffast-math -O3 -march=native -std=c++17 ${WARNINGS}
#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS}
#-msse3
# -ftree-vectorizer-verbose=2 -DNDEBUG
# -ftree-vectorizer-verbose=5
# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump -fdump-tree-pre=stderr
# CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details
# CFLAGS = -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2
# #CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
# FFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
# LFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
LINKFLAGS += -O3
#architecture
CPU = -march=znver2
#CPU = -march=core-avx2
CXXFLAGS += ${CPU}
LINKFLAGS += ${CPU}
# MKL
#CXXFLAGS += -I/usr/include/mkl -DUSE_MKL -Wno-redundant-decls
CXXFLAGS += -I${MKL_INCLUDE} -DUSE_MKL -Wno-redundant-decls
#LINKFLAGS += -lmkl_intel_lp64 -lmkl_tbb_thread -ltbb -lmkl_core
#LINKFLAGS += -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -L/usr/lib/x86_64-linux-gnu/mkl
LINKFLAGS += -L${MKL_LIB} -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core
#LINKFLAGS += -lmkl_intel_lp64 -lmkl_sequential -lmkl_core
# workaround for MKL slow down on AMD hardware
# https://danieldk.eu/Posts/2020-08-31-MKL-Zen.html
default: run
libfakeintel.so:
gcc -shared -fPIC -o libfakeintel.so fakeintel.c
echo "call: export LD_PRELOAD=./libfakeintel.so "
# different libraries in Ubuntu or manajaro
#ifndef UBUNTU
#UBUNTU=1
#endif
## BLAS, LAPACK
#ifeq ($(UBUNTU),1)
#LINKFLAGS += -llapack -lblas
## -lopenblas
#else
## on archlinux
#LINKFLAGS += -llapack -lopenblas -lcblas
#endif
# interprocedural optimization
CXXFLAGS += -flto
LINKFLAGS += -flto
# for debugging purpose (save code)
# -fsanitize=leak # only one out the three can be used
# -fsanitize=address
# -fsanitize=thread
SANITARY = -fsanitize=address -fsanitize=undefined -fsanitize=null -fsanitize=return \
-fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \
-fsanitize=bool -fsanitize=enum -fsanitize=vptr
#CXXFLAGS += ${SANITARY}
#LINKFLAGS += ${SANITARY}
# OpenMP
CXXFLAGS += -fopenmp
LINKFLAGS += -fopenmp
default: ${PROGRAM}
${PROGRAM}: ${OBJECTS}
$(LINKER) $^ ${LINKFLAGS} -o $@
clean:
@rm -f ${PROGRAM} ${OBJECTS}
clean_all:: clean
-@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt
-@rm -rf html
run: clean ${PROGRAM} libfakeintel.so
#run: ${PROGRAM}
# time ./${PROGRAM} ${PARAMS}
./${PROGRAM} ${PARAMS}
# tar the current directory
MY_DIR = `basename ${PWD}`
tar: clean_all
@echo "Tar the directory: " ${MY_DIR}
@cd .. ;\
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
# tar cf `basename ${PWD}`.tar *
#find . -size +10M > large_files
#--exclude-from ${MY_DIR}/large_files
zip: clean
@echo "Zip the directory: " ${MY_DIR}
@cd .. ;\
zip -r ${MY_DIR}.zip ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
doc:
doxygen Doxyfile
#########################################################################
.SUFFIXES: .f90
.cpp.o:
$(CXX) -c $(CXXFLAGS) -o $@ $<
# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $<.log
# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $(<:.cpp=.log)
.c.o:
$(CC) -c $(CFLAGS) -o $@ $<
.f.o:
$(F77) -c $(FFLAGS) -o $@ $<
.f90.o:
$(F77) -c $(FFLAGS) -o $@ $<
##################################################################################################
# some tools
# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags)
cache: ${PROGRAM}
valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS}
# kcachegrind callgrind.out.<pid> &
kcachegrind `ls -1tr callgrind.out.* |tail -1`
# Check for wrong memory accesses, memory leaks, ...
# use smaller data sets
# no "-pg" in compile/link options
mem: ${PROGRAM}
valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS}
# Graphical interface
# valkyrie
# Simple run time profiling of your code
# CXXFLAGS += -g -pg
# LINKFLAGS += -pg
prof: ${PROGRAM}
perf record ./$^ ${PARAMS}
perf report
# gprof -b ./$^ > gp.out
# kprof -f gp.out -p gprof &
# perf in Ubuntu 20.04: https://www.howtoforge.com/how-to-install-perf-performance-analysis-tool-on-ubuntu-20-04/
# * install
# * sudo vi /etc/sysctl.conf
# add kernel.perf_event_paranoid = 0
#Trace your heap:
#> heaptrack ./main.GCC_
#> heaptrack_gui heaptrack.main.GCC_.<pid>.gz
heap: ${PROGRAM}
heaptrack ./$^ ${PARAMS} 11
heaptrack_gui `ls -1tr heaptrack.$^.* |tail -1` &
codecheck: $(SOURCES)
cppcheck --enable=all --inconclusive --std=c++17 --suppress=missingIncludeSystem $^
########################################################################
# get the detailed status of all optimization flags
info:
echo "detailed status of all optimization flags"
$(CXX) --version
$(CXX) -Q $(CXXFLAGS) --help=optimizers
lscpu
inxi -C
lstopo
# Excellent hardware info
# hardinfo
# Life monitoring of CPU frequency etc.
# sudo i7z
# Memory consumption
# vmstat -at -SM 3
# xfce4-taskmanager
# https://www.tecmint.com/check-linux-cpu-information/
#https://www.tecmint.com/monitor-cpu-and-gpu-temperature-in-ubuntu/
# Debugging:
# https://wiki.archlinux.org/index.php/Debugging

View file

@ -0,0 +1,111 @@
# Basic Defintions for using GNU-compiler suite sequentially
# requires setting of COMPILER=GCC_
CC = gcc
CXX = g++
F77 = gfortran
LINKER = ${CXX}
# on mephisto:
#CXXFLAGS += -I/share/apps/atlas/include
#LINKFLAGS += -L/share/apps/atlas/lib -L/usr/lib64/atlas
#LINKFLAGS += -latlas -lcblas
#LINKFLAGS += -lblas
# Der <cblas.h> Header muss mit extern "C" versehen werden, damit g++ alles findet.
WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
-Wredundant-decls -Winline -fmax-errors=1
# -Wunreachable-code
#CXXFLAGS += -std=c++17 -ffast-math -O3 -march=native -DNDEBUG ${WARNINGS}
CXXFLAGS += -std=c++17 -ffast-math -O3 -march=native ${WARNINGS}
# info on vectorization
#VECTORIZE = -ftree-vectorize -fdump-tree-vect-blocks=foo.dump
#-fdump-tree-pre=stderr
VECTORIZE = -ftree-vectorize -fopt-info -ftree-vectorizer-verbose=5
#CXXFLAGS += ${VECTORIZE}
# -funroll-all-loops -msse3
#GCC -march=knl -march=broadwell -march=haswell
# interprocedural optimization
#CXXFLAGS += -flto
LINKFLAGS += -flto
# for debugging purpose (save code)
# -fsanitize=leak # only one out the trhee can be used
# -fsanitize=address
# -fsanitize=thread
SANITARY = -fsanitize=address -fsanitize=undefined -fsanitize=null -fsanitize=return \
-fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \
-fsanitize=bool -fsanitize=enum -fsanitize=vptr
#CXXFLAGS += ${SANITARY}
#LINKFLAGS +=${SANITARY}
# OpenMP but no OpenMP in Single mode
#CXXFLAGS += -fopenmp
LINKFLAGS += -fopenmp
default: ${PROGRAM}
${PROGRAM}: ${OBJECTS}
$(LINKER) $^ ${LINKFLAGS} -o $@
clean:
@rm -f ${PROGRAM} ${OBJECTS}
clean_all:: clean
@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt
@rm -rf html
run: clean ${PROGRAM}
# time ./${PROGRAM}
./${PROGRAM}
# tar the current directory
MY_DIR = `basename ${PWD}`
tar:
@echo "Tar the directory: " ${MY_DIR}
@cd .. ;\
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
# tar cf `basename ${PWD}`.tar *
doc:
doxygen Doxyfile
#########################################################################
.cpp.o:
$(CXX) -c $(CXXFLAGS) -o $@ $<
.c.o:
$(CC) -c $(CFLAGS) -o $@ $<
.f.o:
$(F77) -c $(FFLAGS) -o $@ $<
##################################################################################################
# some tools
# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags)
cache: ${PROGRAM}
valgrind --tool=callgrind --simulate-cache=yes ./$^
# kcachegrind callgrind.out.<pid> &
kcachegrind `ls -1tr callgrind.out.* |tail -1`
# Check for wrong memory accesses, memory leaks, ...
# use smaller data sets
mem: ${PROGRAM}
valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^
thread:${PROGRAM}
valgrind -v --tool=helgrind --log-file=$^.thread.out ./$^
# Simple run time profiling of your code
# CXXFLAGS += -g -pg
# LINKFLAGS += -pg
prof: ${PROGRAM}
./$^
gprof -b ./$^ > gp.out
# kprof -f gp.out -p gprof &

View file

@ -0,0 +1,182 @@
# Basic Defintions for using GNU-compiler suite sequentially
# requires setting of COMPILER=GCC_
CC = gcc
CXX = g++
F77 = gfortran
LINKER = ${CXX}
WARNINGS = -Wall -pedantic -Wextra -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow \
-Wredundant-decls -Winline -fmax-errors=1
# -Wunreachable-code
CXXFLAGS += -ffast-math -O3 -march=native -std=c++20 ${WARNINGS}
#CXXFLAGS += -Ofast -funroll-all-loops -std=c++17 ${WARNINGS}
#-msse3
# -ftree-vectorizer-verbose=2 -DNDEBUG
# -ftree-vectorizer-verbose=5
# -ftree-vectorize -fdump-tree-vect-blocks=foo.dump -fdump-tree-pre=stderr
# CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details
# CFLAGS = -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2
# #CFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
# FFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
# LFLAGS = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
LINKFLAGS += -O3
#architecture
#CPU = -march=znver2
CXXFLAGS += ${CPU}
LINKFLAGS += ${CPU}
# different libraries in Ubuntu or manajaró
ifndef UBUNTU
UBUNTU=1
endif
# BLAS, LAPACK
ifeq ($(UBUNTU),1)
LINKFLAGS += -llapack -lblas
# -lopenblas
else
# on archlinux
LINKFLAGS += -llapack -lopenblas -lcblas
endif
# interprocedural optimization
CXXFLAGS += -flto
LINKFLAGS += -flto
# for debugging purpose (save code)
# -fsanitize=leak # only one out the three can be used
# -fsanitize=address
# -fsanitize=thread
SANITARY = -fsanitize=address -fsanitize=undefined -fsanitize=null -fsanitize=return \
-fsanitize=bounds -fsanitize=alignment -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow \
-fsanitize=bool -fsanitize=enum -fsanitize=vptr
#CXXFLAGS += ${SANITARY}
#LINKFLAGS += ${SANITARY}
# OpenMP
CXXFLAGS += -fopenmp
LINKFLAGS += -fopenmp
default: ${PROGRAM}
${PROGRAM}: ${OBJECTS}
$(LINKER) $^ ${LINKFLAGS} -o $@
clean:
@rm -f ${PROGRAM} ${OBJECTS}
clean_all:: clean
-@rm -f *_ *~ *.bak *.log *.out *.tar *.orig *.optrpt
-@rm -rf html
run: clean ${PROGRAM}
#run: ${PROGRAM}
# time ./${PROGRAM} ${PARAMS}
./${PROGRAM} ${PARAMS}
# tar the current directory
MY_DIR = `basename ${PWD}`
tar: clean_all
@echo "Tar the directory: " ${MY_DIR}
@cd .. ;\
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
# tar cf `basename ${PWD}`.tar *
#find . -size +10M > large_files
#--exclude-from ${MY_DIR}/large_files
zip: clean
@echo "Zip the directory: " ${MY_DIR}
@cd .. ;\
zip -r ${MY_DIR}.zip ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
doc:
doxygen Doxyfile
#########################################################################
.SUFFIXES: .f90
.cpp.o:
$(CXX) -c $(CXXFLAGS) -o $@ $<
# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $<.log
# $(CXX) -c $(CXXFLAGS) -o $@ $< 2>&1 | tee -a $(<:.cpp=.log)
.c.o:
$(CC) -c $(CFLAGS) -o $@ $<
.f.o:
$(F77) -c $(FFLAGS) -o $@ $<
.f90.o:
$(F77) -c $(FFLAGS) -o $@ $<
##################################################################################################
# some tools
# Cache behaviour (CXXFLAGS += -g tracks down to source lines; no -pg in linkflags)
cache: ${PROGRAM}
valgrind --tool=callgrind --simulate-cache=yes ./$^ ${PARAMS}
# kcachegrind callgrind.out.<pid> &
kcachegrind `ls -1tr callgrind.out.* |tail -1`
# Check for wrong memory accesses, memory leaks, ...
# use smaller data sets
# no "-pg" in compile/link options
mem: ${PROGRAM}
valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^ ${PARAMS}
# Graphical interface
# valkyrie
# Simple run time profiling of your code
# CXXFLAGS += -g -pg
# LINKFLAGS += -pg
prof: ${PROGRAM}
perf record ./$^ ${PARAMS}
perf report
# gprof -b ./$^ > gp.out
# kprof -f gp.out -p gprof &
# perf in Ubuntu 20.04: https://www.howtoforge.com/how-to-install-perf-performance-analysis-tool-on-ubuntu-20-04/
# * install
# * sudo vi /etc/sysctl.conf
# add kernel.perf_event_paranoid = 0
#Trace your heap:
#> heaptrack ./main.GCC_
#> heaptrack_gui heaptrack.main.GCC_.<pid>.gz
heap: ${PROGRAM}
heaptrack ./$^ ${PARAMS} 11
heaptrack_gui `ls -1tr heaptrack.$^.* |tail -1` &
codecheck: $(SOURCES)
cppcheck --enable=all --inconclusive --std=c++17 -I${CUDA_INC} --suppress=missingIncludeSystem $^
########################################################################
# get the detailed status of all optimization flags
info:
echo "detailed status of all optimization flags"
$(CXX) --version
$(CXX) -Q $(CXXFLAGS) --help=optimizers
lscpu
inxi -C
lstopo
# Excellent hardware info
# hardinfo
# Life monitoring of CPU frequency etc.
# sudo i7z
# Memory consumption
# vmstat -at -SM 3
# xfce4-taskmanager
# https://www.tecmint.com/check-linux-cpu-information/
#https://www.tecmint.com/monitor-cpu-and-gpu-temperature-in-ubuntu/
# Debugging:
# https://wiki.archlinux.org/index.php/Debugging

View file

@ -0,0 +1,151 @@
# Basic Defintions for using INTEL compiler suite sequentially
# requires setting of COMPILER=ICC_
# special on my sony [GH]
#BINDIR = /opt/save.intel/bin/
# very special on my sony [GH]
# FIND_LIBS = -L /opt/save.intel/composer_xe_2013.1.117/mkl/lib/intel64/libmkl_intel_lp64.so
#export KMP_AFFINITY=verbose,compact
CC = ${BINDIR}icc
CXX = ${BINDIR}icpc
F77 = ${BINDIR}ifort
LINKER = ${CXX}
WARNINGS = -pedantic -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -wd2015,2012
#-Winline -Wunreachable-code -Wredundant-decls
CXXFLAGS += -std=c++17 -O3 -fma -DNDEBUG ${WARNINGS} -mkl
#CXXFLAGS += -std=c++17 -O3 -march=core-avx2 -fma -ftz -fomit-frame-pointer -DNDEBUG ${WARNINGS} -mkl
# -fast # fast inludes also -ipo !
CXXFLAGS += -fargument-noalias -fargument-noalias-global -ansi-alias
CXXFLAGS += -align -qopt-dynamic-align
#CXXFLAGS += -xCore-AVX2
#CXXFLAGS += -tp=zen
# -qopt-subscript-in-range
# -vec-threshold0
# -xCORE-AVX2
# -axcode COMMON-AVX512 -axcode MIC-AVX512 -axcode CORE-AVX512 -axcode CORE-AVX2
# -ipo
# Reports: https://software.intel.com/en-us/articles/getting-the-most-out-of-your-intel-compiler-with-the-new-optimization-reports
#CXXFLAGS += -qopt-report=5 -qopt-report-phase=vec,par
#CXXFLAGS += -qopt-report=5 -qopt-report-phase=cg
# Redirect report from *.optrpt to stderr
# -qopt-report-file=stderr
# Guided paralellization
# -guide -parallel
# -guide-opts=string -guide-par[=n] -guide-vec[=n]
# -auto-p32 -simd
# interprocedural optimization
#CXXFLAGS += -ipo
#LINKFLAGS += -ipo
# annotated Assembler file
ANNOTED = -fsource-asm -S
# OpenMP
CXXFLAGS += -qopenmp
# -qopt-report-phase=openmp
# -diag-enable=sc-full -diag-file=filename -diag-file-append[=filename]
LINKFLAGS += -qopenmp
# use MKL by INTEL
# LINKFLAGS += -L${BINDIR}../composer_xe_2013.1.117/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
LINKFLAGS += -O2 -mkl
# -ipo
default: ${PROGRAM}
${PROGRAM}: ${OBJECTS}
$(LINKER) $^ ${LINKFLAGS} -o $@
clean:
rm -f ${PROGRAM} ${OBJECTS} *.optrpt
clean_all:: clean
@rm -f *_ *~ *.bak *.log *.out *.tar
run: clean ${PROGRAM}
./${PROGRAM}
# tar the current directory
MY_DIR = `basename ${PWD}`
tar: clean_all
@echo "Tar the directory: " ${MY_DIR}
@cd .. ;\
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
# tar cf `basename ${PWD}`.tar *
doc:
doxygen Doxyfile
#########################################################################
.cpp.o:
$(CXX) -c $(CXXFLAGS) -o $@ $<
.c.o:
$(CC) -c $(CFLAGS) -o $@ $<
.f.o:
$(F77) -c $(FFLAGS) -o $@ $<
##################################################################################################
# # some tools
# # Cache behaviour (CXXFLAGS += -g tracks down to source lines)
# cache: ${PROGRAM}
# valgrind --tool=callgrind --simulate-cache=yes ./$^
# # kcachegrind callgrind.out.<pid> &
#
# # Check for wrong memory accesses, memory leaks, ...
# # use smaller data sets
# mem: ${PROGRAM}
# valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^
#
# # Simple run time profiling of your code
# # CXXFLAGS += -g -pg
# # LINKFLAGS += -pg
# prof: ${PROGRAM}
# ./$^
# gprof -b ./$^ > gp.out
# # kprof -f gp.out -p gprof &
#
mem: inspector
prof: amplifier
cache: amplifier
gap_par_report:
${CXX} -c -guide -parallel $(SOURCES) 2> gap.txt
# GUI for performance report
amplifier: ${PROGRAM}
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid
amplxe-gui &
# GUI for Memory and Thread analyzer (race condition)
inspector: ${PROGRAM}
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
#${BINDIR}../inspector_xe_2013/bin64/inspxe-gui &
inspxe-gui &
advisor:
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
# https://software.intel.com/en-us/articles/intel-advisor-2017-update-1-what-s-new
export ADVIXE_EXPERIMENTAL=roofline
advixe-gui &
icc-info:
icpc -# main.cpp

View file

@ -0,0 +1,181 @@
# Basic Defintions for using INTEL compiler suite sequentially
# requires setting of COMPILER=ONEAPI_
# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html
# requires
# source /opt/intel/oneapi/setvars.sh
# on AMD: export MKL_DEBUG_CPU_TYPE=5
#BINDIR = /opt/intel/oneapi/compiler/latest/linux/bin/
#MKL_ROOT = /opt/intel/oneapi/mkl/latest/
#export KMP_AFFINITY=verbose,compact
CC = ${BINDIR}icc
CXX = ${BINDIR}dpcpp
F77 = ${BINDIR}ifort
LINKER = ${CXX}
## Compiler flags
WARNINGS = -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -pedantic
WARNINGS += -Wpessimizing-move -Wredundant-move
#-wd2015,2012,2014 -wn3
# -Winline -Wredundant-decls -Wunreachable-code
# -qopt-subscript-in-range
# -vec-threshold0
CXXFLAGS += -O3 -std=c++17 -tbb ${WARNINGS}
# https://stackoverflow.com/questions/67923287/how-to-resolve-no-member-named-task-in-namespace-tbb-error-when-using-oned
# needed on Desktop-PC Haase (not needed on mephisto)
CXXFLAGS += -D_GLIBCXX_USE_TBB_PAR_BACKEND=0
#CXXFLAGS += -DMKL_ILP64 -I"${MKLROOT}/include"
#CXXFLAGS += -DMKL_ILP32 -I"${MKLROOT}/include"
LINKFLAGS += -O3 -tbb
#LINKFLAGS += -no-prec-div
# interprocedural optimization
CXXFLAGS += -ipo
LINKFLAGS += -ipo
LINKFLAGS += -flto
# annotated Assembler file
ANNOTED = -fsource-asm -S
#architecture
CPU = -march=core-avx2
#CPU += -mtp=zen
# -xCORE-AVX2
# -axcode COMMON-AVX512 -axcode MIC-AVX512 -axcode CORE-AVX512 -axcode CORE-AVX2
CXXFLAGS += ${CPU}
LINKFLAGS += ${CPU}
# use MKL by INTEL
# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html
# sequential MKL
# use the 32 bit interface (LP64) instead of 64 bit interface (ILP64)
CXXFLAGS += -qmkl=sequential -UMKL_ILP64
LINKFLAGS += -O3 -qmkl=sequential -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
#LINKFLAGS += -O3 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
# shared libs: https://aur.archlinux.org/packages/intel-oneapi-compiler-static
# install intel-oneapi-compiler-static
# or
LINKFLAGS += -shared-intel
OPENMP = -qopenmp
CXXFLAGS += ${OPENMP}
LINKFLAGS += ${OPENMP}
# profiling tools
#CXXFLAGS += -pg
#LINKFLAGS += -pg
# -vec-report=3
# -qopt-report=5 -qopt-report-phase=vec -qopt-report-phase=openmp
# -guide -parallel
# -guide-opts=string -guide-par[=n] -guide-vec[=n]
# -auto-p32 -simd
# Reports: https://software.intel.com/en-us/articles/getting-the-most-out-of-your-intel-compiler-with-the-new-optimization-reports
#CXXFLAGS += -qopt-report=5 -qopt-report-phase=vec,par
#CXXFLAGS += -qopt-report=5 -qopt-report-phase=cg
# Redirect report from *.optrpt to stderr
# -qopt-report-file=stderr
# Guided paralellization
# -guide -parallel
# -guide-opts=string -guide-par[=n] -guide-vec[=n]
# -auto-p32 -simd
## run time checks
# https://www.intel.com/content/www/us/en/develop/documentation/fortran-compiler-oneapi-dev-guide-and-reference/top/compiler-reference/compiler-options/offload-openmp-and-parallel-processing-options/par-runtime-control-qpar-runtime-control.html
default: ${PROGRAM}
${PROGRAM}: ${OBJECTS}
$(LINKER) $^ ${LINKFLAGS} -o $@
clean:
rm -f ${PROGRAM} ${OBJECTS} *.optrpt
clean_all:: clean
@rm -f *_ *~ *.bak *.log *.out *.tar
run: clean ${PROGRAM}
./${PROGRAM} ${PARAMS}
# tar the current directory
MY_DIR = `basename ${PWD}`
tar: clean_all
@echo "Tar the directory: " ${MY_DIR}
@cd .. ;\
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
# tar cf `basename ${PWD}`.tar *
doc:
doxygen Doxyfile
#########################################################################
.cpp.o:
$(CXX) -c $(CXXFLAGS) -o $@ $<
.c.o:
$(CC) -c $(CFLAGS) -o $@ $<
.f.o:
$(F77) -c $(FFLAGS) -o $@ $<
##################################################################################################
# some tools
# Cache behaviour (CXXFLAGS += -g tracks down to source lines)
# https://software.intel.com/content/www/us/en/develop/documentation/vtune-help/top/analyze-performance/microarchitecture-analysis-group/memory-access-analysis.html
mem: inspector
prof: vtune
cache: inspector
gap_par_report:
${CXX} -c -guide -parallel $(SOURCES) 2> gap.txt
# GUI for performance report
amplifier: ${PROGRAM}
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid
amplxe-gui &
# GUI for Memory and Thread analyzer (race condition)
inspector: ${PROGRAM}
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
# inspxe-gui &
vtune-gui ./${PROGRAM} &
advisor:
# source /opt/intel/oneapi/advisor/2021.2.0/advixe-vars.sh
# /opt/intel/oneapi/advisor/latest/bin64/advixe-gui &
advisor --collect=survey ./${PROGRAM} ${PARAMS}
# advisor --collect=roofline ./${PROGRAM} ${PARAMS}
advisor --report=survey --project-dir=./ src:r=./ --format=csv --report-output=./out/survey.csv
# advisor-gui
vtune:
echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
# https://software.intel.com/en-us/articles/intel-advisor-2017-update-1-what-s-new
export ADVIXE_EXPERIMENTAL=roofline
vtune -collect hotspots ./${PROGRAM} ${PARAMS}
vtune -report hotspots -r r000hs > vtune.out
# vtune-gui ./${PROGRAM} &
icc-info:
icpc -# main.cpp
# MKL on AMD
# https://www.computerbase.de/2019-11/mkl-workaround-erhoeht-leistung-auf-amd-ryzen/
#
# https://sites.google.com/a/uci.edu/mingru-yang/programming/mkl-has-bad-performance-on-an-amd-cpu
# export MKL_DEBUG_CPU_TYPE=5
# export MKL_NUM_THRAEDS=1
# export MKL_DYNAMIC=false
# on Intel compiler
# http://publicclu2.blogspot.com/2013/05/intel-complier-suite-reference-card.html

View file

@ -0,0 +1,96 @@
# Basic Defintions for using PGI-compiler suite sequentially
# requires setting of COMPILER=PGI_
# OPTIRUN = optirun
# on mephisto:
#CXXFLAGS += -I/share/apps/atlas/include
#LINKFLAGS += -L/share/apps/atlas/lib
#LINKFLAGS += -lcblas -latlas
LINKFLAGS += -lblas
CC = pgcc
CXX = pgc++
F77 = pgfortran
LINKER = ${CXX}
WARNINGS = -Minform=warn
#PGI_PROFILING = -Minfo=loop,vect,opt,intensity,mp,accel
PGI_PROFILING = -Minfo=ccff,accel,ipa,loop,lre,mp,opt,par,unified,vect,intensity
# -Minfo
# -Mprof=lines
CXXFLAGS += -std=c++14 -O3 -fast -DNDEBUG ${PGI_PROFILING} ${WARNINGS}
CXXFLAGS += -Mvect -Mcache_align -Msafeptr -Mprefetch -Mlre -Mdepchk
#-Msmart
LINKFLAGS += ${PGI_PROFILING}
#-lcblas
# OpenMP
CXXFLAGS += -mp=align,bind,numa -Mneginfo=mp
LINKFLAGS += -mp=allcores,bind,numa
default: ${PROGRAM}
${PROGRAM}: ${OBJECTS}
$(LINKER) $^ ${LINKFLAGS} -o $@
clean:
@rm -f ${PROGRAM} ${OBJECTS}
clean_all:: clean
@rm -f *_ *~ *.bak *.log *.out *.tar
run: clean ${PROGRAM}
./${PROGRAM}
# tar the current directory
MY_DIR = `basename ${PWD}`
tar: clean_all
@echo "Tar the directory: " ${MY_DIR}
@cd .. ;\
tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
cd ${MY_DIR}
# tar cf `basename ${PWD}`.tar *
doc:
doxygen Doxyfile
#########################################################################
.cpp.o:
$(CXX) -c $(CXXFLAGS) -o $@ $<
.c.o:
$(CC) -c $(CFLAGS) -o $@ $<
.f.o:
$(F77) -c $(FFLAGS) -o $@ $<
##################################################################################################
# # some tools
# # Simple run time profiling of your code
# # CXXFLAGS += -g
# # LINKFLAGS +=
# Profiling options PGI, see: pgprof -h
PROF_FILE = jac.pgprof
# CPU_PROF = -allcache
CPU_PROF = --cpu-profiling on --analysis-metrics
# GPU_PROF = -cuda=gmem,branch,cc13 -cudainit
#GPU_PROF = -cuda=branch:cc20
#
cache: prof
prof: ${PROGRAM}
# ./$^
# $(CUDA_HOME)/bin/nvvp &
# more /opt/pgi/linux86-64/16.10/bin/pgcollectrc
${OPTIRUN} ${BINDIR}pgprof ${CPU_PROF} -o $(PROF_FILE) ./$^
${OPTIRUN} ${BINDIR}pgprof -i $(PROF_FILE) 2> prof.out

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,30 @@
#
# use GNU-Compiler tools
COMPILER=GCC_
# alternatively from the shell
# export COMPILER=GCC_
# or, alternatively from the shell
# make COMPILER=GCC_
# use Intel compilers
#COMPILER=ICC_
# use PGI compilers
# COMPILER=PGI_
SOURCES = main.cpp mylib.cpp
OBJECTS = $(SOURCES:.cpp=.o)
PROGRAM = main.${COMPILER}
# uncomment the next to lines for debugging and detailed performance analysis
CXXFLAGS += -g
LINKFLAGS += -g
# do not use -pg with PGI compilers
ifndef COMPILER
COMPILER=GCC_
endif
include ../${COMPILER}default.mk

View file

@ -0,0 +1,99 @@
#pragma once
#include <iostream>
#ifdef _OPENMP
#include <omp.h>
#endif
#include <unordered_map>
//#####################################
// G.Haase
// See https://sourceforge.net/p/predef/wiki/Compilers/
// http://www.cplusplus.com/doc/tutorial/preprocessor/
// also: export OMP_DISPLAY_ENV=VERBOSE
//#####################################
/** Checks for compilers, its versions, threads etc.
*
@param[in] argc number of command line arguemnts
@param[in] argv command line arguments as array of C-strings
*/
template <class T>
void check_env(T argc, char const *argv[])
{
std::cout << "\n#######################################################################\n";
std::cout << "Code :";
for (T k = 0; k < argc; ++k) std::cout << " " << argv[k];
std::cout << std::endl;
// compiler: https://sourceforge.net/p/predef/wiki/Compilers/
std::cout << "Compiler: ";
#if defined __INTEL_COMPILER
#pragma message(" ########## INTEL ###############")
std::cout << "INTEL " << __INTEL_COMPILER;
// Ignore warnings for #pragma acc unrecognice
#pragma warning disable 161
// Ignore warnings for #pragma omp unrecognice
#pragma warning disable 3180
#elif defined __PGI
#pragma message(" ########## PGI ###############")
std::cout << "PGI " << __PGIC__ << "." << __PGIC_MINOR__ << "." << __PGIC_PATCHLEVEL__;
#elif defined __clang__
#pragma message(" ########## CLANG ###############")
std::cout << "CLANG " << __clang_major__ << "." << __clang_minor__ << "."; // << __clang_patchlevel__;
#elif defined __GNUC__
#pragma message(" ########## Gnu ###############")
std::cout << "Gnu " << __GNUC__ << "." << __GNUC_MINOR__ << "." << __GNUC_PATCHLEVEL__;
#else
#pragma message(" ########## unknown Compiler ###############")
std::cout << "unknown";
#endif
std::cout << " C++ standard: " << __cplusplus << std::endl;
// Parallel environments
std::cout << "Parallel: ";
#if defined MPI_VERSION
#pragma message(" ########## MPI ###############")
#ifdef OPEN_MPI
std::cout << "OpenMPI ";
#else
std::cout << "MPI ";
#endif
std::cout << MPI_VERSION << "." << MPI_SUBVERSION << " ";
#endif
#ifdef _OPENMP
//https://www.openmp.org/specifications/
//https://stackoverflow.com/questions/1304363/how-to-check-the-version-of-openmp-on-linux
std::unordered_map<unsigned, std::string> const map{
{200505, "2.5"}, {200805, "3.0"}, {201107, "3.1"}, {201307, "4.0"}, {201511, "4.5"}, {201611, "5.0"}, {201811, "5.0"}};
#pragma message(" ########## OPENMP ###############")
//std::cout << _OPENMP;
std::cout << "OpenMP ";
try {
std::cout << map.at(_OPENMP);
}
catch (...) {
std::cout << _OPENMP;
}
#pragma omp parallel
{
#pragma omp master
{
const int nn = omp_get_num_threads(); // OpenMP
std::cout << " ---> " << nn << " Threads ";
}
#pragma omp barrier
}
#endif
#ifdef _OPENACC
#pragma message(" ########## OPENACC ###############")
std::cout << "OpenACC ";
#endif
std::cout << std::endl;
std::cout << "Date : " << __DATE__ << " " << __TIME__;
std::cout << "\n#######################################################################\n";
}
// HG

View file

@ -0,0 +1,132 @@
#include "check_env.h"
#include "mylib.h"
#include <cstdlib> // atoi()
#include <cstring> // strncmp()
#include <ctime>
#include <iostream>
#include <omp.h> // OpenMP
#include <sstream>
#include <string>
using namespace std;
int main(int argc, char const *argv[])
{
omp_set_num_threads(4);
// #################################################################
unsigned int N = 50000001;
int const NLOOPS = 50; // chose a value such that the benchmark runs at least 10 sec.
// unsigned int N = 11;
// int const NLOOPS = 1;
//##########################################################################
// Read Parameter from command line (C++ style)
cout << "Checking command line parameters for: -n <number> " << endl;
for (int i = 1; i < argc; i++)
{
cout << " arg[" << i << "] = " << argv[i] << endl;
string ss(argv[i]);
if ("-n"==ss && i + 1 < argc) // found "-n" followed by another parameter
{
N = static_cast<unsigned int>(atoi(argv[i + 1]));
}
else
{
cout << "Corect call: " << argv[0] << " -n <number>\n";
}
}
cout << "\nN = " << N << endl;
check_env(argc, argv);
//########################################################################
int nthreads; // OpenMP
#pragma omp parallel default(none) shared(cout,nthreads)
{
int const th_id = omp_get_thread_num(); // OpenMP
int const nthrds = omp_get_num_threads(); // OpenMP
stringstream ss;
ss << "C++: Hello World from thread " << th_id << " / " << nthrds << endl;
#pragma omp critical
{
cout << ss.str(); // output to a shared ressource
}
#pragma omp master
nthreads = nthrds; // transfer nn to to master thread
}
cout << " " << nthreads << " threads have been started." << endl;
//##########################################################################
// Memory allocation
cout << "Memory allocation\n";
vector<double> x(N), y(N);
cout.precision(2);
cout << 2.0 * N *sizeof(x[0]) / 1024 / 1024 / 1024 << " GByte Memory allocated\n";
cout.precision(6);
//##########################################################################
// Data initialization
// Special: x_i = i+1; y_i = 1/x_i ==> <x,y> == N
for (unsigned int i = 0; i < N; ++i)
{
x[i] = i + 1.0;
y[i] = 1.0 / x[i];
}
//##########################################################################
cout << "\nStart Benchmarking\n";
// Do calculation
double tstart = omp_get_wtime(); // OpenMP
double sk(0.0);
for (int i = 0; i < NLOOPS; ++i)
{
sk = scalar2(x, y);
}
double t1 = omp_get_wtime() - tstart; // OpenMP
t1 /= NLOOPS; // divide by number of function calls
//##########################################################################
// Check the correct result
cout << "\n <x,y> = " << sk << endl;
if (static_cast<unsigned int>(sk) != N)
{
cout << " !! W R O N G result !!\n";
}
cout << endl;
//##########################################################################
// Timings and Performance
cout.precision(2);
cout << "Timing in sec. : " << t1 << endl;
cout << "GFLOPS : " << 2.0 * N / t1 / 1024 / 1024 / 1024 << endl;
cout << "GiByte/s : " << 2.0 * N / t1 / 1024 / 1024 / 1024 * sizeof(x[0]) << endl;
//#########################################################################
// ---- Speedup for scalar ----
// Threads | Time | Speedup
// 1 | 0.032s | 1
// 2 | 0.022s | 1.45
// 4 | 0.021s | 1.52
// 8 | 0.022s | 1.45
// 16 | 0.022s | 1.45
//#########################################################################
cout << "\n Try the reduction with an STL-vektor \n";
auto vr = reduction_vec_append(10);
cout << "done\n";
cout << vr << endl;
return 0;
} // memory for x and y will be deallocated their destructors

View file

@ -0,0 +1,84 @@
#include "mylib.h"
#include <cassert> // assert()
#include <cmath>
#include <iostream>
#include <functional> // multiplies<>{}
#include <list>
#include <numeric> // iota()
#ifdef _OPENMP
#include <omp.h>
#endif
#include <vector>
using namespace std;
double scalar(vector<double> const &x, vector<double> const &y)
{
assert(x.size() == y.size()); // switch off via compile flag: -DNDEBUG
size_t const N = x.size();
double sum = 0.0;
#pragma omp parallel for default(none) shared(x,y,N) reduction(+:sum) schedule(static)
for (size_t i = 0; i < N; ++i)
{
sum += x[i] * y[i];
//sum += exp(x[i])*log(y[i]);
}
return sum;
}
double scalar2(vector<double> const &x, vector<double> const &y)
{
assert(x.size() == y.size());
size_t const N = x.size();
double sum = 0.0;
#pragma omp parallel shared(x,y,N,sum)
{
double local_sum = 0.0;
int const th_id = omp_get_thread_num();
int const nthrds = omp_get_num_threads();
for (size_t i = th_id; i < N; i += nthrds) {
local_sum += x[i] * y[i];
}
#pragma omp atomic
sum += local_sum;
}
return sum;
}
vector<int> reduction_vec(int n)
{
vector<int> vec(n);
#pragma omp parallel default(none) shared(cout) reduction(VecAdd:vec)
{
#pragma omp barrier
#pragma omp critical
cout << omp_get_thread_num() << " : " << vec.size() << endl;
#pragma omp barrier
iota( vec.begin(),vec.end(), omp_get_thread_num() );
#pragma omp barrier
#pragma omp critical
cout << vec << endl;
}
return vec;
}
vector<int> reduction_vec_append(int n)
{
vector<int> vec;
#pragma omp parallel default(none) shared(cout,n) reduction(VecAppend:vec)
{
vector<int> local(n);
#pragma omp barrier
#pragma omp critical
cout << omp_get_thread_num() << " : " << local.size() << endl;
#pragma omp barrier
iota( local.begin(),local.end(), omp_get_thread_num() );
#pragma omp critical
vec.insert(vec.end(), local.begin(), local.end());
}
return vec;
}

View file

@ -0,0 +1,77 @@
#pragma once
#include <cassert>
#include <iomanip> // setw()
#include <iostream>
#include <omp.h>
#include <vector>
/** Inner product
@param[in] x vector
@param[in] y vector
@return resulting Euclidian inner product <x,y>
*/
double scalar(std::vector<double> const &x, std::vector<double> const &y);
double scalar2(std::vector<double> const &x, std::vector<double> const &y);
/** Vector @p b adds its elements to vector @p a .
@param[in] a vector
@param[in] b vector
@return a+=b componentwise
*/
template<class T>
std::vector<T> &operator+=(std::vector<T> &a, std::vector<T> const &b)
{
assert(a.size()==b.size());
for (size_t k = 0; k < a.size(); ++k) {
a[k] += b[k];
}
return a;
}
// Declare the reduction operation in OpenMP for an STL-vector
// omp_out += omp_in requires operator+=(vector<int> &, vector<int> const &) from above
// ------------------------------------------------------------
// https://scc.ustc.edu.cn/zlsc/tc4600/intel/2016.0.109/compiler_c/common/core/GUID-7312910C-D175-4544-99C5-29C12D980744.htm
// https://gist.github.com/eruffaldi/7180bdec4c8c9a11f019dd0ba9a2d68c
// https://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size
// see also p.74ff in https://www.fz-juelich.de/ias/jsc/EN/AboutUs/Staff/Hagemeier_A/docs-parallel-programming/OpenMP-Slides.pdf
#pragma omp declare reduction(VecAdd : std::vector<int> : omp_out += omp_in) \
initializer (omp_priv=omp_orig)
// Templates are n o t possible, i.e. the reduction has to be declared fore a specified type.
//template <class T>
//#pragma omp declare reduction(VecAdd : std::vector<T> : omp_out += omp_in) initializer (omp_priv(omp_orig))
// MS: template nach #pragma !?
// ------------------------------------------------------------
/** Test for vector reduction.
*
* The thread-private vectors of size @p n are initialized via @f$v_k^{tID}=tID+k@f$.
* Afterwards these vectors are accumulated, i.e.,
* @f$v_k= \sum_{tID=0}^{numThreads} v_k^{tID}@f$.
*
* @param[in] n size of global/private vector
* @return resulting global vector.
*/
std::vector<int> reduction_vec(int n);
/** Output of a vector.
@param[in,out] s output stream
@param[in] x vector
@return modified output stream
*/
template <class T>
std::ostream &operator<<(std::ostream &s, std::vector<T> const &x)
{
for (auto const &v : x) s << std::setw(4) << v << " ";
return s;
}
#pragma omp declare reduction(VecAppend : std::vector<int> : omp_out.insert(omp_out.end(), omp_in.begin(), omp_in.end()) ) initializer( omp_priv = std::vector<int>() )
std::vector<int> reduction_vec_append(int n);

View file

@ -0,0 +1,70 @@
#pragma once
#include <chrono> // timing
#include <stack>
using Clock = std::chrono::system_clock; //!< The wall clock timer chosen
//using Clock = std::chrono::high_resolution_clock;
using TPoint= std::chrono::time_point<Clock>;
// [Galowicz, C++17 STL Cookbook, p. 29]
inline
std::stack<TPoint> MyStopWatch; //!< starting time of stopwatch
/** Starts stopwatch timer.
* Use as @code tic(); myfunction(...) ; double tsec = toc(); @endcode
*
* The timining is allowed to be nested and the recent time is stored on top of the stack.
*
* @return recent time
* @see toc
*/
inline auto tic()
{
MyStopWatch.push(Clock::now());
return MyStopWatch.top();
}
/** Returns the elapsed time from stopwatch.
*
* The time from top of the stack is used
* if time point @p t_b is not passed as input parameter.
* Use as @code tic(); myfunction(...) ; double tsec = toc(); @endcode
* or as @code auto t_b = tic(); myfunction(...) ; double tsec = toc(t_b); @endcode
* The last option is to be used in the case of
* non-nested but overlapping time measurements.
*
* @param[in] t_b start time of some stop watch
* @return elapsed time in seconds.
*
*/
inline double toc(TPoint const &t_b = MyStopWatch.top())
{
// https://en.cppreference.com/w/cpp/chrono/treat_as_floating_point
using Unit = std::chrono::seconds;
using FpSeconds = std::chrono::duration<double, Unit::period>;
auto t_e = Clock::now();
MyStopWatch.pop();
return FpSeconds(t_e-t_b).count();
}
#include <iostream>
#include <string>
/** Executes function @p f and measures/prints elapsed wall clock time in seconds
*
* Call as
* @code measure("Time for (b = b + 1)", [&]() {
thrust::transform(b.begin(), b.end(), b.begin(), increment());
}); @endcode
*
* @param[in] label additional string to be printed with the measurement.
* @param[in] f function to execute.
* @author Therese Bösmüller, 2025
*
*/
auto measure = [](const std::string& label, auto&& f) {
auto start = std::chrono::high_resolution_clock::now();
f();
auto stop = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
std::cout << label << ": " << duration << " microseconds" << std::endl;
}; // ';' is needed for a visible documentation of this lambda-function