# Basic Definitions for using the INTEL compiler suite sequentially
# requires COMPILER=ONEAPI_ to be set
# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html
# requires
#   source /opt/intel/oneapi/setvars.sh
# on AMD: export MKL_DEBUG_CPU_TYPE=5

#BINDIR = /opt/intel/oneapi/compiler/latest/linux/bin/
#MKL_ROOT = /opt/intel/oneapi/mkl/latest/
#export KMP_AFFINITY=verbose,compact

CC     = ${BINDIR}icc
CXX    = ${BINDIR}dpcpp
F77    = ${BINDIR}ifort
LINKER = ${CXX}

## Compiler flags
WARNINGS  = -Wall -Weffc++ -Woverloaded-virtual -Wfloat-equal -Wshadow -pedantic
WARNINGS += -Wpessimizing-move -Wredundant-move
#-wd2015,2012,2014 -wn3
# -Winline -Wredundant-decls -Wunreachable-code
# -qopt-subscript-in-range
# -vec-threshold0
CXXFLAGS += -O3 -std=c++17 ${WARNINGS}
#CXXFLAGS += -DMKL_ILP64 -I"${MKLROOT}/include"
#CXXFLAGS += -DMKL_ILP32 -I"${MKLROOT}/include"
LINKFLAGS += -O3

# interprocedural optimization
CXXFLAGS  += -ipo
LINKFLAGS += -ipo
LINKFLAGS += -flto

# annotated assembler file
ANNOTED = -fsource-asm -S

# architecture
CPU = -march=core-avx2
#CPU += -mtp=zen
# -xCORE-AVX2
# -axcode COMMON-AVX512 -axcode MIC-AVX512 -axcode CORE-AVX512 -axcode CORE-AVX2
CXXFLAGS  += ${CPU}
LINKFLAGS += ${CPU}

# use MKL by INTEL
# https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl/link-line-advisor.html
# sequential MKL
# use the 32 bit integer interface (LP64) instead of the 64 bit interface (ILP64)
CXXFLAGS  += -qmkl=sequential -UMKL_ILP64
LINKFLAGS += -O3 -qmkl=sequential -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
#LINKFLAGS += -O3 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread
# shared libs: https://aur.archlinux.org/packages/intel-oneapi-compiler-static
#   install intel-oneapi-compiler-static
#   or LINKFLAGS += -shared-intel

OPENMP = -qopenmp
CXXFLAGS  += ${OPENMP}
LINKFLAGS += ${OPENMP}

# profiling tools
#CXXFLAGS  += -pg
#LINKFLAGS += -pg

# -vec-report=3
# -qopt-report=5 -qopt-report-phase=vec -qopt-report-phase=openmp
# -guide -parallel
# -guide-opts=string -guide-par[=n] -guide-vec[=n]
# -auto-p32 -simd
# Reports: https://software.intel.com/en-us/articles/getting-the-most-out-of-your-intel-compiler-with-the-new-optimization-reports
#CXXFLAGS += -qopt-report=5 -qopt-report-phase=vec,par
#CXXFLAGS += -qopt-report=5 -qopt-report-phase=cg
# Redirect report from *.optrpt to stderr
#   -qopt-report-file=stderr

# Guided parallelization
# -guide -parallel
# -guide-opts=string -guide-par[=n] -guide-vec[=n]
# -auto-p32 -simd

## run time checks
# https://www.intel.com/content/www/us/en/develop/documentation/fortran-compiler-oneapi-dev-guide-and-reference/top/compiler-reference/compiler-options/offload-openmp-and-parallel-processing-options/par-runtime-control-qpar-runtime-control.html

default: ${PROGRAM}

${PROGRAM}: ${OBJECTS}
	$(LINKER) $^ ${LINKFLAGS} -o $@

clean:
	rm -f ${PROGRAM} ${OBJECTS} *.optrpt

clean_all:: clean
	@rm -f *_ *~ *.bak *.log *.out *.tar

run: clean ${PROGRAM}
	./${PROGRAM}

# tar the current directory
MY_DIR = `basename ${PWD}`
tar: clean_all
	@echo "Tar the directory: " ${MY_DIR}
	@cd .. ;\
	tar cf ${MY_DIR}.tar ${MY_DIR} *default.mk ;\
	cd ${MY_DIR}
#	tar cf `basename ${PWD}`.tar *
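# Example (directory name "jacobi" is only an illustration): calling "make tar"
# inside a project directory "jacobi" creates ../jacobi.tar in the parent
# directory, containing the project directory plus the *default.mk files found there.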
doc:
	doxygen Doxyfile

#########################################################################
.cpp.o:
	$(CXX) -c $(CXXFLAGS) -o $@ $<

.c.o:
	$(CC) -c $(CFLAGS) -o $@ $<

.f.o:
	$(F77) -c $(FFLAGS) -o $@ $<

##################################################################################################
# some tools
# Cache behaviour (CXXFLAGS += -g tracks down to source lines)
# https://software.intel.com/content/www/us/en/develop/documentation/vtune-help/top/analyze-performance/microarchitecture-analysis-group/memory-access-analysis.html

mem: inspector

prof: vtune

cache: inspector

gap_par_report:
	${CXX} -c -guide -parallel $(SOURCES) 2> gap.txt

# GUI for performance report
amplifier: ${PROGRAM}
	echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
	echo 0 | sudo tee /proc/sys/kernel/perf_event_paranoid
	amplxe-gui &

# GUI for Memory and Thread analyzer (race conditions)
inspector: ${PROGRAM}
	echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
#	inspxe-gui &
	vtune-gui ./${PROGRAM} &

advisor:
	source /opt/intel/oneapi/advisor/2021.2.0/advixe-vars.sh
#	/opt/intel/oneapi/advisor/latest/bin64/advixe-gui &
	advisor --collect=survey ./${PROGRAM}
#	advisor --collect=roofline ./${PROGRAM}
	advisor --report=survey --project-dir=./ src:r=./ --format=csv --report-output=./out/survey.csv

vtune:
	echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
# https://software.intel.com/en-us/articles/intel-advisor-2017-update-1-what-s-new
	export ADVIXE_EXPERIMENTAL=roofline
	vtune -collect hotspots ./${PROGRAM}
	vtune -report hotspots -r r000hs > vtune.out
#	vtune-gui ./${PROGRAM} &

icc-info:
	icpc -# main.cpp

# MKL on AMD
# https://www.computerbase.de/2019-11/mkl-workaround-erhoeht-leistung-auf-amd-ryzen/
# https://sites.google.com/a/uci.edu/mingru-yang/programming/mkl-has-bad-performance-on-an-amd-cpu
#   export MKL_DEBUG_CPU_TYPE=5
#   export MKL_NUM_THREADS=1
#   export MKL_DYNAMIC=false

# on Intel compiler
# http://publicclu2.blogspot.com/2013/05/intel-complier-suite-reference-card.html
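# Minimal usage sketch (assumption: this file is included from a project
# Makefile; the file and program names below are only illustrations):
#
#   COMPILER = ONEAPI_
#   PROGRAM  = main.out
#   SOURCES  = main.cpp
#   OBJECTS  = $(SOURCES:.cpp=.o)
#   include ../oneapi_default.mk
#
# With such a wrapper, "make", "make run", "make clean", or "make vtune"
# use the rules defined above.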