##################################################################################################
# makefile for a simple project
# available targets:
#      make
#      make clean
##################################################################################################
#  Edit the following lines according to your needs
##################################################################################################
# Name of the executable to build.
PROGRAM := main
# C++ translation units and the headers they share.
SOURCES := main.cpp mylib.cpp
HEADERS := mylib.h
# One object file per source file (foo.cpp -> foo.o).
OBJECTS = $(patsubst %.cpp,%.o,$(SOURCES))

# Installation roots of the PGI compiler suite and of the CUDA toolkit.
ROOT_PGI = /state/partition1/apps/pgi/linux86-64/12.9
CUDA_HOME = /usr/local/cuda
# Compiler drivers; the commented-out lines are alternative choices.
CC	= $(ROOT_PGI)/bin/pgcc
# CC      = gcc -std=c99
# CXX     = $(ROOT_PGI)/bin/pgCC
CXX     = $(ROOT_PGI)/bin/pgcpp
F90     = $(ROOT_PGI)/bin/pgfortran
# Link with the C++ driver: SOURCES lists .cpp files, and the C driver is not
# expected to add the C++ runtime libraries to the link line.
LINKER  = $(CXX)

# Warning options, assembled in two steps.
# Not referenced by any active flag variable in the visible part of this file.
WARNINGS  = -Wall -Weffc++ -Woverloaded-virtual -W -Wfloat-equal -Wshadow
WARNINGS += -Wredundant-decls -Winline
#  -Wunreachable-code
# CXXFLAGS = -ffast-math -O3 -funroll-all-loops -DNDEBUG ${WARNINGS} -ftree-vectorizer-verbose=2
# CXXFLAGS = -ffast-math -O3 -funroll-all-loops -DNDEBUG ${WARNINGS} -ftree-vectorizer-verbose=2 \
#             -fopenmp
#
# PGI profiling options; expanded into both CFLAGS and LDFLAGS.
# Every option must begin with an ASCII hyphen ('-'); a typographic dash
# copied from a document is not recognized as an option prefix.
PGI_PROFILING = -Mprof=lines -Minfo=ccff
#
#  for OpenMP
# CXXFLAGS = -O3 -mp
# LDFLAGS  = -lpthread

#
#  for OpenACC
# Target architecture (nvidia,host)
# TA_ARCH = nvidia,host
# Accelerator target passed to -ta= at compile time and at link time.
TA_ARCH = nvidia:cc20
# CFLAGS = -O3 -ta=$(TA_ARCH)
# C compile flags: debug info, -fast, OpenACC for $(TA_ARCH), -Minfo and the
# profiling options. Options must begin with an ASCII hyphen ('-').
CFLAGS = -g -fast -acc -ta=$(TA_ARCH) -Minfo $(PGI_PROFILING)
# The .cpp.o rule compiles with CXXFLAGS, not CFLAGS. Reuse the C flags so the
# OpenACC and profiling options also reach the C++ sources.
CXXFLAGS = $(CFLAGS)

# Link flags: same accelerator target and profiling options as the compile step.
LDFLAGS  = -g -ta=$(TA_ARCH) -L$(ROOT_PGI)/lib $(PGI_PROFILING)

# CFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp -fdump-tree-vect-details
# CFLAGS	= -ffast-math -O3 -funroll-loops -DNDEBUG -msse3 -fopenmp -ftree-vectorizer-verbose=2
# #CFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
# FFLAGS	= -ffast-math -O3 -DNDEBUG -msse3 -fopenmp
# LFLAGS  = -ffast-math -O3 -DNDEBUG -msse3 -fopenmp

##################################################################################################
#    Don't change anything below this line
##################################################################################################
# Default goal: build the program. "all" names an action, not a file.
.PHONY: all
all:	${PROGRAM}

# First double-colon rule for the program: when the Makefile or a header is
# newer than the program, remove all build products so that everything is
# recompiled. $(MAKE) is used instead of a literal "make" so the sub-make is
# the same make binary and inherits the command-line options (-n, -j, ...).
${PROGRAM}:: Makefile $(HEADERS)
	$(MAKE) clean

# Second double-colon rule: link all object files into the program.
${PROGRAM}::	${OBJECTS}
	$(LINKER) $^ -o $@  ${LDFLAGS}

# Remove the program, its object files, editor backup files (*~) and a.out.
# Declared phony so that a file named "clean" in this directory cannot make
# the target look up to date and silently skip the recipe.
.PHONY: clean
clean:
	rm -f ${PROGRAM} ${OBJECTS} *~ a.out

# Compile a C++ source file into the object file with the same base name.
%.o: %.cpp
	$(CXX) $(CXXFLAGS) -c $<

# Compile a C source file into the object file with the same base name.
%.o: %.c
	$(CC) $(CFLAGS) -c $<

# Compile a Fortran source file into the object file with the same base name.
# Uses F90, the only Fortran compiler variable this Makefile defines; F77 is
# never set here, so it would not select the PGI toolchain.
.f.o:
	$(F90) -c $(FFLAGS) $<

###################################################################################################
# #    some tools
# # Cache behaviour (CXXFLAGS += -g  tracks down to source lines)
# cache: ${PROGRAM}
# 	valgrind --tool=callgrind --simulate-cache=yes ./$^
# #	kcachegrind callgrind.out.<pid> &
# 
# # Check for wrong memory accesses, memory leaks, ...
# # use smaller data sets
# mem: ${PROGRAM}
# 	valgrind -v --leak-check=yes --tool=memcheck --undef-value-errors=yes --track-origins=yes --log-file=$^.addr.out --show-reachable=yes ./$^

# Options for the PGI profile collector (see: pgcollect -help).
# CPU_PROF is defined for convenience; the prof recipe passes only GPU_PROF.
CPU_PROF := -dcache
GPU_PROF := -cuda=gmem
#GPU_PROF = -cuda=branch:cc20
#
# File handed to pgprof by the prof recipe.
PROF_FILE := pgprof.out

# Profile the program: run it under pgcollect with $(GPU_PROF), then start
# pgprof in the background on $(PROF_FILE).
# Declared phony because "prof" names an action, not a file to be built.
.PHONY: prof
prof: ${PROGRAM}
#	./$^
#	$(CUDA_HOME)/bin/nvvp &
#	export LD_LIBRARY_PATH=/state/partition1/apps/pgi/linux86-64/12.9/lib:$LD_LIBRARY_PATH
	$(ROOT_PGI)/bin/pgcollect $(GPU_PROF) ./$^
	$(ROOT_PGI)/bin/pgprof -exe ./$^  $(PROF_FILE) &


# Memory checker (slooooow!!!): 
# see doc at /usr/local/cuda/doc/cuda-memcheck.pdf
# Run the program under the cuda-memcheck tool from the CUDA toolkit.
# Declared phony because "mem" names an action, not a file to be built.
.PHONY: mem
mem: ${PROGRAM}
	$(CUDA_HOME)/bin/cuda-memcheck ./$^
