From 9b09c6c0a32a383278eeafafd94380c4a299fe52 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Wed, 4 Aug 2021 18:54:25 -0400 Subject: [PATCH 1/5] Cleaning up build as we port to spock Need to clean up installation location of dependent packages to allow for re-building without deleting the install directory. --- cmake/Modules/FindACTIVEHARMONY.cmake | 10 +++++----- cmake/Modules/FindBFD.cmake | 20 +++++++++++++------- cmake/Modules/FindOMPT.cmake | 9 ++++----- cmake/Modules/FindOTF2.cmake | 12 ++++++------ src/apex/apex_ompt.cpp | 2 ++ src/apex/policy_handler.hpp | 1 - 6 files changed, 30 insertions(+), 24 deletions(-) diff --git a/cmake/Modules/FindACTIVEHARMONY.cmake b/cmake/Modules/FindACTIVEHARMONY.cmake index 1c59f1d0..c8dfde67 100644 --- a/cmake/Modules/FindACTIVEHARMONY.cmake +++ b/cmake/Modules/FindACTIVEHARMONY.cmake @@ -31,7 +31,7 @@ mark_as_advanced(ACTIVEHARMONY_INCLUDE_DIR ACTIVEHARMONY_LIBRARY) # --------- DOWNLOAD AND BUILD THE EXTERNAL PROJECT! ------------ # if((APEX_BUILD_ACTIVEHARMONY OR (NOT ACTIVEHARMONY_FOUND)) AND NOT APPLE) - set(CACHE ACTIVEHARMONY_ROOT ${CMAKE_INSTALL_PREFIX} STRING "Active Harmony Root directory") + set(CACHE ACTIVEHARMONY_ROOT ${CMAKE_INSTALL_PREFIX}/ah STRING "Active Harmony Root directory") message("Attention: Downloading and Building ActiveHarmony as external project!") message(INFO " A working internet connection is required!") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") @@ -41,17 +41,17 @@ if((APEX_BUILD_ACTIVEHARMONY OR (NOT ACTIVEHARMONY_FOUND)) AND NOT APPLE) PREFIX ${CMAKE_CURRENT_BINARY_DIR}/activeharmony-4.6.0 CONFIGURE_COMMAND "" BUILD_COMMAND cd ${CMAKE_CURRENT_BINARY_DIR}/activeharmony-4.6.0/src/project_activeharmony && make MPICC=mpicc_disabled CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} CFLAGS=${CMAKE_C_FLAGS} CXXFLAGS=${CMAKE_CXX_FLAGS} LDFLAGS=${CMAKE_C_FLAGS} - INSTALL_COMMAND cd ${CMAKE_CURRENT_BINARY_DIR}/activeharmony-4.6.0/src/project_activeharmony && make MPICC=mpicc_disabled 
CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} CFLAGS=${CMAKE_C_FLAGS} CXXFLAGS=${CMAKE_CXX_FLAGS} LDFLAGS=${CMAKE_C_FLAGS} install prefix=${CMAKE_INSTALL_PREFIX} - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} + INSTALL_COMMAND cd ${CMAKE_CURRENT_BINARY_DIR}/activeharmony-4.6.0/src/project_activeharmony && make MPICC=mpicc_disabled CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} CFLAGS=${CMAKE_C_FLAGS} CXXFLAGS=${CMAKE_CXX_FLAGS} LDFLAGS=${CMAKE_C_FLAGS} install prefix=${CMAKE_INSTALL_PREFIX}/ah + INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/ah LOG_DOWNLOAD 1 # LOG_CONFIGURE 1 # LOG_BUILD 1 # LOG_INSTALL 1 ) - set(ACTIVEHARMONY_ROOT ${CMAKE_INSTALL_PREFIX}) + set(ACTIVEHARMONY_ROOT ${CMAKE_INSTALL_PREFIX}/ah) #ExternalProject_Get_Property(project_activeharmony install_dir) add_library(harmony STATIC IMPORTED) - set_property(TARGET harmony PROPERTY IMPORTED_LOCATION ${CMAKE_INSTALL_PREFIX}/lib/libharmony.a) + set_property(TARGET harmony PROPERTY IMPORTED_LOCATION ${CMAKE_INSTALL_PREFIX}/ah/lib/libharmony.a) set(ACTIVEHARMONY_INCLUDE_DIR "${ACTIVEHARMONY_ROOT}/include") set(ACTIVEHARMONY_LIBRARY "${ACTIVEHARMONY_ROOT}/lib/libharmony.a") # handle the QUIETLY and REQUIRED arguments and set ACTIVEHARMONY_FOUND to TRUE diff --git a/cmake/Modules/FindBFD.cmake b/cmake/Modules/FindBFD.cmake index 34766ab1..1f62a4ae 100644 --- a/cmake/Modules/FindBFD.cmake +++ b/cmake/Modules/FindBFD.cmake @@ -18,8 +18,8 @@ pkg_check_modules(PC_BFD QUIET BFD) set(BFD_DEFINITIONS ${PC_BFD_CFLAGS_OTHER}) find_path(BFD_INCLUDE_DIR bfd.h - HINTS ${BFD_ROOT}/include - ${PC_BFD_INCLUDEDIR} + HINTS ${BFD_ROOT}/include + ${PC_BFD_INCLUDEDIR} ${PC_BFD_INCLUDE_DIRS} PATH_SUFFIXES BFD ) @@ -29,8 +29,8 @@ if ($TMP_PATH) endif() find_library(BFD_LIBRARY NAMES bfd HINTS ${BFD_ROOT}/lib ${BFD_ROOT}/lib64 - ${PC_BFD_LIBDIR} - ${PC_BFD_LIBRARY_DIRS} + ${PC_BFD_LIBDIR} + ${PC_BFD_LIBRARY_DIRS} ${LD_LIBRARY_PATH_STR}) include(FindPackageHandleStandardArgs) @@ -49,7 +49,7 @@ if((APEX_BUILD_BFD OR (NOT BFD_FOUND)) AND NOT 
APPLE) ExternalProject_Add(project_binutils URL "http://ftp.gnu.org/gnu/binutils/binutils-2.25.tar.bz2" URL_HASH SHA256=22defc65cfa3ef2a3395faaea75d6331c6e62ea5dfacfed3e2ec17b08c882923 - CONFIGURE_COMMAND /configure CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} CFLAGS=${CMAKE_C_FLAGS} CXXFLAGS=${CMAKE_CXX_FLAGS} LDFLAGS=${CMAKE_EXE_LINKER_FLAGS} --prefix=${CMAKE_INSTALL_PREFIX} --disable-dependency-tracking --enable-interwork --disable-multilib --enable-shared --enable-64-bit-bfd --target=${TARGET_ARCH} --enable-install-libiberty + CONFIGURE_COMMAND /configure CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} CFLAGS=${CMAKE_C_FLAGS} CXXFLAGS=${CMAKE_CXX_FLAGS} LDFLAGS=${CMAKE_EXE_LINKER_FLAGS} --prefix=${CMAKE_INSTALL_PREFIX}/binutils --disable-dependency-tracking --enable-interwork --disable-multilib --enable-shared --enable-64-bit-bfd --target=${TARGET_ARCH} --enable-install-libiberty BUILD_COMMAND make MAKEINFO=true -j${MAKEJOBS} INSTALL_COMMAND make MAKEINFO=true install LOG_DOWNLOAD 1 @@ -59,11 +59,17 @@ if((APEX_BUILD_BFD OR (NOT BFD_FOUND)) AND NOT APPLE) ) ExternalProject_Add_Step(project_binutils basedirs DEPENDEES install - COMMAND cp /include/demangle.h ${CMAKE_INSTALL_PREFIX}/include/. + COMMAND cp /include/demangle.h ${CMAKE_INSTALL_PREFIX}/binutils/include/. COMMENT "Copying additional headers" ) + ExternalProject_Add_Step(project_binutils basedirs2 + DEPENDEES install + COMMAND cp /include/demangle.h ${CMAKE_INSTALL_PREFIX}/binutils/include/. 
+ COMMAND ${CMAKE_COMMAND} -E create_symlink ${CMAKE_INSTALL_PREFIX}/binutils/lib ${CMAKE_INSTALL_PREFIX}/binutils/lib64 + COMMENT "Adding lib64 simlink" + ) - set(BFD_ROOT ${CMAKE_INSTALL_PREFIX}) + set(BFD_ROOT ${CMAKE_INSTALL_PREFIX}/binutils) ExternalProject_Get_Property(project_binutils install_dir) add_library(bfd STATIC IMPORTED) set_property(TARGET bfd PROPERTY IMPORTED_LOCATION ${install_dir}/lib/libbfd.so) diff --git a/cmake/Modules/FindOMPT.cmake b/cmake/Modules/FindOMPT.cmake index 732ab046..26cbf4ec 100644 --- a/cmake/Modules/FindOMPT.cmake +++ b/cmake/Modules/FindOMPT.cmake @@ -66,7 +66,7 @@ endif() # --------- DOWNLOAD AND BUILD THE EXTERNAL PROJECT! ------------ # if(APEX_BUILD_OMPT OR (NOT OMPT_FOUND)) - set(CACHE OMPT_ROOT ${CMAKE_INSTALL_PREFIX} STRING "OMPT Root directory") + set(CACHE OMPT_ROOT ${CMAKE_INSTALL_PREFIX}/ompt STRING "OMPT Root directory") message("Attention: Downloading and Building OMPT as external project!") message(INFO " A working internet connection is required!") include(ExternalProject) @@ -75,19 +75,18 @@ if(APEX_BUILD_OMPT OR (NOT OMPT_FOUND)) #URL http://www.cs.uoregon.edu/research/paracomp/tau/tauprofile/dist/LLVM-openmp-2021-05-14.tar.gz URL http://tau.uoregon.edu/LLVM-openmp-2021-05-14.tar.gz PREFIX ${CMAKE_CURRENT_BINARY_DIR}/LLVM-ompt-5.0 - CONFIGURE_COMMAND cmake -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX} -DCMAKE_BUILD_TYPE=Release ${APEX_OMPT_EXTRA_CONFIG} ../project_ompt + CONFIGURE_COMMAND cmake -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_INSTALL_PREFIX=${OMPT_ROOT} -DCMAKE_BUILD_TYPE=Release ${APEX_OMPT_EXTRA_CONFIG} ../project_ompt BUILD_COMMAND make libomp-needed-headers all INSTALL_COMMAND make install - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} + INSTALL_DIR ${OMPT_ROOT} LOG_DOWNLOAD 1 LOG_CONFIGURE 1 LOG_BUILD 1 LOG_INSTALL 1 ) - set(OMPT_ROOT ${CMAKE_INSTALL_PREFIX}) 
#ExternalProject_Get_Property(project_ompt install_dir) add_library(omp SHARED IMPORTED) - set_property(TARGET omp PROPERTY IMPORTED_LOCATION ${CMAKE_INSTALL_PREFIX}/lib/libomp.so) + set_property(TARGET omp PROPERTY IMPORTED_LOCATION ${OMPT_ROOT}/lib/libomp.so) set(OMPT_INCLUDE_DIR "${OMPT_ROOT}/include") set(OMPT_LIBRARY "${OMPT_ROOT}/lib/libomp.so") # handle the QUIETLY and REQUIRED arguments and set OMPT_FOUND to TRUE diff --git a/cmake/Modules/FindOTF2.cmake b/cmake/Modules/FindOTF2.cmake index 9b405eed..e6f75a18 100644 --- a/cmake/Modules/FindOTF2.cmake +++ b/cmake/Modules/FindOTF2.cmake @@ -33,28 +33,28 @@ mark_as_advanced(OTF2_INCLUDE_DIR OTF2_LIBRARY) # --------- DOWNLOAD AND BUILD THE EXTERNAL PROJECT! ------------ # if(APEX_BUILD_OTF2 OR (NOT OTF2_FOUND)) - set(CACHE OTF2_ROOT ${CMAKE_INSTALL_PREFIX} STRING "OTF2 Root directory") + set(CACHE OTF2_ROOT ${CMAKE_INSTALL_PREFIX}/otf2 STRING "OTF2 Root directory") message("Attention: Downloading and Building OTF2 as external project!") message(INFO " A working internet connection is required!") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") include(ExternalProject) ExternalProject_Add(project_otf2 - URL http://www.vi-hps.org/upload/packages/otf2/otf2-2.0.tar.gz + URL https://www.vi-hps.org/cms/upload/packages/otf2/otf2-2.0.tar.gz PREFIX ${CMAKE_CURRENT_BINARY_DIR}/otf2-2.0 - CONFIGURE_COMMAND cd ${CMAKE_CURRENT_BINARY_DIR}/otf2-2.0/src/project_otf2 && ./configure CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} CFLAGS=${CMAKE_C_FLAGS} CXXFLAGS=${CMAKE_CXX_FLAGS} LDFLAGS=${CMAKE_EXE_LINKER_FLAGS} --prefix=${CMAKE_INSTALL_PREFIX} --enable-shared + CONFIGURE_COMMAND cd ${CMAKE_CURRENT_BINARY_DIR}/otf2-2.0/src/project_otf2 && ./configure CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} CFLAGS=${CMAKE_C_FLAGS} CXXFLAGS=${CMAKE_CXX_FLAGS} LDFLAGS=${CMAKE_EXE_LINKER_FLAGS} --prefix=${CMAKE_INSTALL_PREFIX}/otf2 --enable-shared BUILD_COMMAND cd 
${CMAKE_CURRENT_BINARY_DIR}/otf2-2.0/src/project_otf2 && make INSTALL_COMMAND cd ${CMAKE_CURRENT_BINARY_DIR}/otf2-2.0/src/project_otf2 && make install - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} + INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/otf2 LOG_DOWNLOAD 1 LOG_CONFIGURE 1 LOG_BUILD 1 LOG_INSTALL 1 ) - set(OTF2_ROOT ${CMAKE_INSTALL_PREFIX}) + set(OTF2_ROOT ${CMAKE_INSTALL_PREFIX}/otf2) #ExternalProject_Get_Property(project_otf2 install_dir) add_library(otf2 STATIC IMPORTED) - set_property(TARGET otf2 PROPERTY IMPORTED_LOCATION ${CMAKE_INSTALL_PREFIX}/lib/libotf2.a) + set_property(TARGET otf2 PROPERTY IMPORTED_LOCATION ${CMAKE_INSTALL_PREFIX}/otf2/lib/libotf2.a) set(OTF2_INCLUDE_DIR "${OTF2_ROOT}/include") set(OTF2_LIBRARY "${OTF2_ROOT}/lib/libotf2.a") # handle the QUIETLY and REQUIRED arguments and set OTF2_FOUND to TRUE diff --git a/src/apex/apex_ompt.cpp b/src/apex/apex_ompt.cpp index 24618cba..6dd44808 100644 --- a/src/apex/apex_ompt.cpp +++ b/src/apex/apex_ompt.cpp @@ -1061,6 +1061,8 @@ ompt_start_tool_result_t * ompt_start_tool( DEBUG_PRINT("APEX: WARNING! 
%d != %d (OpenMP Version used to compile APEX)\n", omp_version, _OPENMP); } +#else + APEX_UNUSED(omp_version); // in case we aren't printing debug messages #endif static ompt_start_tool_result_t result; result.initialize = &ompt_initialize; diff --git a/src/apex/policy_handler.hpp b/src/apex/policy_handler.hpp index fee3776f..442a5489 100644 --- a/src/apex/policy_handler.hpp +++ b/src/apex/policy_handler.hpp @@ -84,7 +84,6 @@ class policy_handler : public handler, public event_listener #ifdef APEX_HAVE_HPX hpx::util::interval_timer hpx_timer; #endif - std::atomic active_policies; public: policy_handler (void); /* From 45da7a5166fcd5bbd269c24f0b80424a100955b9 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Thu, 5 Aug 2021 08:56:06 -0700 Subject: [PATCH 2/5] updating documentation links --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 51ee684f..dc51ab94 100644 --- a/README.md +++ b/README.md @@ -51,18 +51,18 @@ APEX provides a mechanism for dynamic runtime behavior, either for autotuning or Documentation ============= -Full user documentation is available here: http://khuck.github.io/xpress-apex. +Full user documentation is available here: http://uo-oaciss.github.io/apex. The source code is instrumented with Doxygen comments, and the API reference manual can be generated by executing `make doc` in the build directory, after CMake configuration. [A fairly recent version of the API reference documentation is also available here] (http://www.nic.uoregon.edu/~khuck/apex_docs/doc/html/index.html). Installation ============ -[Full installation documentation is available here] (http://khuck.github.io/xpress-apex). Below is a quickstart for the impatient... +[Full installation documentation is available here](http://uo-oaciss.github.io/apex). Below is a quickstart for the impatient... Please Note: ------------ -*These instructions are for building the stand-alone APEX library. 
For instructions on building APEX with HPX, please see [http://khuck.github.io/xpress-apex/usage](http://khuck.github.io/xpress-apex/usage)* +*These instructions are for building the stand-alone APEX library. For instructions on building APEX with HPX, please see [http://uo-oaciss.github.io/apex/usage](http://uo-oaciss.github.io/apex/usage)* To build APEX stand-alone (to use with OpenMP, OpenACC, CUDA, Kokkos, TBB, C++ threads, etc.) do the following: From 3c5aadc27cf4642e2f275eeac2dc625ca9fc51c3 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Thu, 5 Aug 2021 09:58:27 -0700 Subject: [PATCH 3/5] Updating the readme with hip and pthread info, as well as reference links --- README.md | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index dc51ab94..2ffc9d25 100644 --- a/README.md +++ b/README.md @@ -141,6 +141,11 @@ HPX5 (Indiana University) HPX-5 (High Performance ParalleX) is a second implementation of the ParalleX model. Developed and maintained by the CREST Group at Indiana University, HPX-5 is implemented in C. For more information, see [https://hpx.crest.iu.edu](https://hpx.crest.iu.edu). +Pthreads / C++ Threads +---------------------- + +POSIX.1 specifies a set of interfaces (functions, header files) for threaded programming commonly known as POSIX threads, or Pthreads. A single process can contain multiple threads, all of which are executing the same program. These threads share the same global memory (data and heap segments), but each thread has its own stack (automatic variables). C++ threads are a portable, language-level abstraction on top of native threading implementations. APEX supports Pthreads by wrapping and capturing the `pthread_create` function call. For more information, see [https://man7.org/linux/man-pages/man7/pthreads.7.html](https://man7.org/linux/man-pages/man7/pthreads.7.html) and [https://www.cplusplus.com/reference/thread/thread/](https://www.cplusplus.com/reference/thread/thread/). 
+ OpenMP ------ @@ -149,17 +154,22 @@ The OpenMP API supports multi-platform shared-memory parallel programming in C/C OpenACC ------- -OpenACC is a user-driven directive-based performance-portable parallel programming model. It is designed for scientists and engineers interested in porting their codes to a wide-variety of heterogeneous HPC hardware platforms and architectures with significantly less programming effort than required with a low-level model. The OpenACC specification supports C, C++, Fortran programming languages and multiple hardware architectures including X86 & POWER CPUs, and NVIDIA GPUs. +OpenACC is a user-driven directive-based performance-portable parallel programming model. It is designed for scientists and engineers interested in porting their codes to a wide-variety of heterogeneous HPC hardware platforms and architectures with significantly less programming effort than required with a low-level model. The OpenACC specification supports C, C++, Fortran programming languages and multiple hardware architectures including X86 & POWER CPUs, and NVIDIA GPUs. For more information, see [https://www.openacc.org](https://www.openacc.org). Kokkos ------ -Kokkos Core implements a programming model in C++ for writing performance portable applications targeting all major HPC platforms. For that purpose it provides abstractions for both parallel execution of code and data management. Kokkos is designed to target complex node architectures with N-level memory hierarchies and multiple types of execution resources. It currently can use CUDA, HPX, OpenMP and Pthreads as backend programming models with several other backends in development. +Kokkos Core implements a programming model in C++ for writing performance portable applications targeting all major HPC platforms. For that purpose it provides abstractions for both parallel execution of code and data management. 
Kokkos is designed to target complex node architectures with N-level memory hierarchies and multiple types of execution resources. It currently can use CUDA, HIP, HPX, OpenMP and Pthreads as backend programming models with several other backends in development. For more information, see [https://kokkos.org](https://kokkos.org). CUDA ---- -CUDA® is a parallel computing platform and programming model developed by NVIDIA for general computing on graphical processing units (GPUs). With CUDA, developers are able to dramatically speed up computing applications by harnessing the power of GPUs. +CUDA® is a parallel computing platform and programming model developed by NVIDIA for general computing on graphical processing units (GPUs). With CUDA, developers are able to dramatically speed up computing applications by harnessing the power of GPUs. APEX uses the CUPTI and NVML libraries provided by NVIDIA to gather performance information from the GPUs. For more information, see [https://developer.nvidia.com/cupti](https://developer.nvidia.com/cupti) and [https://developer.nvidia.com/nvidia-management-library-nvml](https://developer.nvidia.com/nvidia-management-library-nvml). + +HIP/ROCm +-------- + +Heterogeneous-Computing Interface for Portability (HIP) is a C++ dialect from AMD designed to ease conversion of CUDA applications to portable C++ code. It provides a C-style API and a C++ kernel language. The C++ interface can use templates and classes across the host/kernel boundary. APEX uses the roctracer library to gather performance information from the GPUs. For more information, see [https://github.com/ROCm-Developer-Tools/roctracer](https://github.com/ROCm-Developer-Tools/roctracer). 
References ========== From 2d1a92ac11b25cf7bafe8866ba691c73cb63c469 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Thu, 5 Aug 2021 11:31:32 -0700 Subject: [PATCH 4/5] adding implementation of `kokkosp_request_tool_settings` so we can disable fencing when profiling --- src/apex/Kokkos_Profiling_C_Interface.h | 2 +- src/apex/apex_kokkos.cpp | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/apex/Kokkos_Profiling_C_Interface.h b/src/apex/Kokkos_Profiling_C_Interface.h index ed8751c5..2c8d1428 100644 --- a/src/apex/Kokkos_Profiling_C_Interface.h +++ b/src/apex/Kokkos_Profiling_C_Interface.h @@ -54,7 +54,7 @@ #include #endif -#define KOKKOSP_INTERFACE_VERSION 20210225 +#define KOKKOSP_INTERFACE_VERSION 20210623 // Profiling diff --git a/src/apex/apex_kokkos.cpp b/src/apex/apex_kokkos.cpp index be42cb2a..66e924aa 100644 --- a/src/apex/apex_kokkos.cpp +++ b/src/apex/apex_kokkos.cpp @@ -27,6 +27,7 @@ #include #include #include "apex.hpp" +#include "Kokkos_Profiling_C_Interface.h" /* static std::mutex memory_mtx; @@ -74,6 +75,14 @@ void kokkosp_finalize_library() { apex::finalize(); } +/* This is a new function to tell Kokkos to not fence */ +void kokkosp_request_tool_settings(int num_actions, + struct Kokkos_Tools_ToolSettings *settings) { + if ((num_actions > 0) && (settings != nullptr)) { + settings->requires_global_fencing = apex::apex_options::use_kokkos_profiling_fences(); + } +} + /* These functions are called before their respective parallel constructs * execute (Kokkos::parallel_for, Kokkos::parallel_reduce, * Kokkos::parallel_scan). The name argument is the name given by the user From 21c908d00400540cfdc33486bdebfb8f55016775 Mon Sep 17 00:00:00 2001 From: Kevin Huck Date: Thu, 5 Aug 2021 11:34:05 -0700 Subject: [PATCH 5/5] Updating Kokkos tuning to support multiple sessions at the same time, of unlimited number. Also tweaked the simulated annealing search to converge in a reasonable time frame. 
--- src/apex/apex_kokkos_tuning.cpp | 199 ++++++++++++++++++++++--------- src/apex/apex_policies.cpp | 2 +- src/apex/apex_types.h | 4 +- src/apex/policy_handler.cpp | 4 +- src/apex/policy_handler.hpp | 2 +- src/apex/simulated_annealing.cpp | 9 +- src/apex/simulated_annealing.hpp | 5 +- 7 files changed, 161 insertions(+), 64 deletions(-) diff --git a/src/apex/apex_kokkos_tuning.cpp b/src/apex/apex_kokkos_tuning.cpp index 0173e743..7cc513a6 100644 --- a/src/apex/apex_kokkos_tuning.cpp +++ b/src/apex/apex_kokkos_tuning.cpp @@ -124,7 +124,7 @@ class Variable { ss << " info.type: " << pVT(info.type) << std::endl; ss << " info.category: " << pCat(info.category) << std::endl; ss << " info.valueQuantity: " << pCVT(info.valueQuantity) << std::endl; - ss << " info.candidates: " << pCan(info) << std::endl; + ss << " info.candidates: " << pCan(info); std::string tmp{ss.str()}; return tmp; } @@ -142,41 +142,110 @@ class Variable { }; class KokkosSession { -public: +private: // EXHAUSTIVE, RANDOM, NELDER_MEAD, PARALLEL_RANK_ORDER KokkosSession() : - window(3), + window(5), strategy(apex_ah_tuning_strategy::SIMULATED_ANNEALING), + //strategy(apex_ah_tuning_strategy::NELDER_MEAD), verbose(false), - use_history(false), + use_history(checkForCache()), running(false), - history_file("") { + cacheFilename("./apex_converged_tuning.yaml") { verbose = apex::apex_options::use_kokkos_verbose(); } +public: + ~KokkosSession() { + writeCache(); + } + static KokkosSession& getSession(); + KokkosSession(const KokkosSession&) =delete; + KokkosSession& operator=(const KokkosSession&) =delete; int window; apex_ah_tuning_strategy strategy; std::unordered_map> requests; + std::unordered_map> var_ids; bool verbose; bool use_history; bool running; - std::string history_file; std::unordered_map inputs; std::unordered_map outputs; apex_policy_handle * start_policy_handle; apex_policy_handle * stop_policy_handle; std::unordered_map active_requests; std::unordered_map context_starts; + void writeCache(); 
+ bool checkForCache(); + void saveInputVar(size_t id, Variable * var); + void saveOutputVar(size_t id, Variable * var); + std::stringstream cachedResults; + std::string cacheFilename; }; -KokkosSession& getSession() { +/* If we've cached values, we can bypass a lot. */ +bool KokkosSession::checkForCache() { + // did the user specify a file? + if (strlen(apex::apex_options::kokkos_tuning_cache()) > 0) { + cacheFilename = std::string(apex::apex_options::kokkos_tuning_cache()); + } + std::ifstream f(cacheFilename); + if (f.good()) { + use_history = true; + std::cout << "Cache found" << std::endl; + } else { + std::cout << "Cache not found" << std::endl; + } + return use_history; +} + +void KokkosSession::saveInputVar(size_t id, Variable * var) { + inputs.insert(std::make_pair(id, var)); + cachedResults << "Input_" << id << ":" << std::endl; + cachedResults << var->toString(); +} + +void KokkosSession::saveOutputVar(size_t id, Variable * var) { + outputs.insert(std::make_pair(id, var)); + cachedResults << "Output_" << id << ":" << std::endl; + cachedResults << var->toString(); +} + +void KokkosSession::writeCache(void) { + if(use_history) { return; } + std::ofstream results(cacheFilename); + std::cout << "Writing cache of Kokkos tuning results to: '" << cacheFilename << "'" << std::endl; + results << cachedResults.rdbuf(); + size_t count = 0; + for (const auto &req : requests) { + results << "Context_" << count++ << ":" << std::endl; + results << " Name: \"" << req.first << "\"" << std::endl; + std::shared_ptr request = req.second; + results << " Converged: " << + (request->has_converged() ? 
"true" : "false") << std::endl; + if (request->has_converged()) { + results << " Results:" << std::endl; + for (const auto &id : var_ids[req.first]) { + Variable* var{KokkosSession::getSession().outputs[id]}; + auto param = std::static_pointer_cast( + request->get_param(var->name)); + results << " id: " << id << std::endl; + results << " value: \"" << param->get_value() << "\"" << std::endl; + } + } + // if not converged, need to get the "best so far" values for the parameters. + } + results.close(); +} + +KokkosSession& KokkosSession::getSession() { static KokkosSession session; return session; } Variable::Variable(size_t _id, std::string _name, Kokkos_Tools_VariableInfo& _info) : id(_id), name(_name), info(_info) { - if (getSession().verbose) { + if (KokkosSession::getSession().verbose) { std::cout << toString(); } } @@ -297,47 +366,46 @@ std::string hashContext(size_t numVars, void printContext(size_t numVars, const Kokkos_Tools_VariableValue* values) { std::cout << ", cv: " << numVars; - std::cout << hashContext(numVars, values, getSession().inputs); + std::cout << hashContext(numVars, values, KokkosSession::getSession().inputs); } void printTuning(const size_t numVars, Kokkos_Tools_VariableValue* values) { std::cout << "tv: " << numVars; - std::cout << hashContext(numVars, values, getSession().outputs); + std::cout << hashContext(numVars, values, KokkosSession::getSession().outputs); std::cout << std::endl; } void set_params(std::shared_ptr request, const size_t vars, Kokkos_Tools_VariableValue* values) { - APEX_UNUSED(request); for (size_t i = 0 ; i < vars ; i++) { auto id = values[i].type_id; - Variable* var{getSession().outputs[id]}; + Variable* var{KokkosSession::getSession().outputs[id]}; if (var->info.valueQuantity == kokkos_value_set) { - auto thread_param = std::static_pointer_cast( + auto param = std::static_pointer_cast( request->get_param(var->name)); if (var->info.type == kokkos_value_double) { - values[i].value.double_value = 
std::stod(thread_param->get_value()); + values[i].value.double_value = std::stod(param->get_value()); std::string tmp(request->get_name()+":"+var->name); apex::sample_value(tmp, values[i].value.double_value); } else if (var->info.type == kokkos_value_int64) { - values[i].value.int_value = std::stol(thread_param->get_value()); + values[i].value.int_value = std::stol(param->get_value()); std::string tmp(request->get_name()+":"+var->name); apex::sample_value(tmp, values[i].value.int_value); } else if (var->info.type == kokkos_value_string) { - strncpy(values[i].value.string_value, thread_param->get_value().c_str(), 64); + strncpy(values[i].value.string_value, param->get_value().c_str(), 64); } } else { // range if (var->info.type == kokkos_value_double) { - auto thread_param = std::static_pointer_cast( + auto param = std::static_pointer_cast( request->get_param(var->name)); - values[i].value.double_value = thread_param->get_value(); + values[i].value.double_value = param->get_value(); std::string tmp(request->get_name()+":"+var->name); apex::sample_value(tmp, values[i].value.double_value); } else if (var->info.type == kokkos_value_int64) { - auto thread_param = std::static_pointer_cast( + auto param = std::static_pointer_cast( request->get_param(var->name)); - values[i].value.int_value = thread_param->get_value(); + values[i].value.int_value = param->get_value(); std::string tmp(request->get_name()+":"+var->name); apex::sample_value(tmp, values[i].value.int_value); } @@ -345,36 +413,48 @@ void set_params(std::shared_ptr request, } } -void handle_start(const std::string & name, const size_t vars, - Kokkos_Tools_VariableValue* values) { - KokkosSession& session = getSession(); +bool handle_start(const std::string & name, const size_t vars, + Kokkos_Tools_VariableValue* values, uint64_t * delta) { + KokkosSession& session = KokkosSession::getSession(); auto search = session.requests.find(name); + bool newSearch = false; if(search == session.requests.end()) { + *delta = 
apex::profiler::now_ns(); // Start a new tuning session. if(session.verbose) { fprintf(stderr, "Starting tuning session for %s\n", name.c_str()); } std::shared_ptr request{std::make_shared(name)}; session.requests.insert(std::make_pair(name, request)); + // save the variable ids associated with this session + std::vector var_ids; + for (size_t i = 0 ; i < vars ; i++) { + var_ids.push_back(values[i].type_id); + } + session.var_ids.insert(std::make_pair(name, var_ids)); // Create an event to trigger this tuning session. apex_event_type trigger = apex::register_custom_event(name); request->set_trigger(trigger); + // need this in the lambda + bool verbose = session.verbose; // Create a metric std::function metric = [=]()->double{ apex_profile * profile = apex::get_profile(name); if(profile == nullptr) { std::cerr << "ERROR: no profile for " << name << std::endl; + //abort(); return 0.0; } if(profile->calls == 0.0) { std::cerr << "ERROR: calls = 0 for " << name << std::endl; + //abort(); return 0.0; } double result = profile->accumulated/profile->calls; - if(session.verbose) { - fprintf(stdout, "time per call: %fs\n", (double)(result)/1000000000.0); + if(verbose) { + std::cout << "querying time per call: " << (double)(result)/1000000000.0 << "s" << std::endl; } return result; }; @@ -389,7 +469,7 @@ void handle_start(const std::string & name, const size_t vars, for (size_t i = 0 ; i < vars ; i++) { auto id = values[i].type_id; - Variable* var{getSession().outputs[id]}; + Variable* var{session.outputs[id]}; /* If it's a set, the initial value can be a double, int or string * because we store all interval sets as enumerations of strings */ if (var->info.valueQuantity == kokkos_value_set) { @@ -429,22 +509,27 @@ void handle_start(const std::string & name, const size_t vars, // Start the tuning session. 
apex::setup_custom_tuning(*request); + newSearch = true; + // measure how long it took us to set this up + *delta = apex::profiler::now_ns() - *delta; } else { // We've seen this region before. std::shared_ptr request = search->second; set_params(request, vars, values); } + return newSearch; } void handle_stop(const std::string & name) { - auto search = getSession().requests.find(name); - if(search == getSession().requests.end()) { + KokkosSession& session = KokkosSession::getSession(); + auto search = session.requests.find(name); + if(search == session.requests.end()) { std::cerr << "ERROR: No data for \"" << name << std::endl; } else { apex_profile * profile = apex::get_profile(name); - if(getSession().window == 1 || + if(session.window == 1 || (profile != nullptr && - profile->calls >= getSession().window)) { + profile->calls >= session.window)) { //std::cout << "Num calls: " << profile->calls << std::endl; std::shared_ptr request = search->second; // Evaluate the results @@ -474,15 +559,15 @@ void kokkosp_declare_output_type(const char* name, const size_t id, Kokkos_Tools_VariableInfo& info) { // don't track memory in this function. apex::in_apex prevent_memory_tracking; + KokkosSession& session = KokkosSession::getSession(); //if (!apex::apex_options::use_kokkos_tuning()) { return; } - if(getSession().verbose) { + if(session.verbose) { std::cout << std::string(getDepth(), ' '); std::cout << __func__ << std::endl; } Variable * output = new Variable(id, name, info); output->makeSpace(); - getSession().outputs.insert(std::make_pair(id, output)); - getSession().inputs.insert(std::make_pair(id, output)); + session.saveOutputVar(id, output); return; } @@ -496,14 +581,14 @@ void kokkosp_declare_input_type(const char* name, const size_t id, Kokkos_Tools_VariableInfo& info) { // don't track memory in this function. 
apex::in_apex prevent_memory_tracking; + KokkosSession& session = KokkosSession::getSession(); //if (!apex::apex_options::use_kokkos_tuning()) { return; } - if(getSession().verbose) { + if(session.verbose) { std::cout << std::string(getDepth(), ' '); std::cout << __func__ << std::endl; } Variable * input = new Variable(id, name, info); - getSession().inputs.insert(std::make_pair(id, input)); - getSession().outputs.insert(std::make_pair(id, input)); + session.saveInputVar(id, input); } /* Here Kokkos is requesting the values of tuning variables, and most @@ -534,24 +619,27 @@ void kokkosp_request_values( Kokkos_Tools_VariableValue* tuningVariableValues) { // don't track memory in this function. apex::in_apex prevent_memory_tracking; - if (getSession().verbose) { + KokkosSession& session = KokkosSession::getSession(); + if (session.verbose) { std::cout << std::string(getDepth(), ' '); std::cout << __func__ << " ctx: " << contextId; printContext(numContextVariables, contextVariableValues); } std::string name{hashContext(numContextVariables, contextVariableValues, - getSession().inputs)}; - getSession().active_requests.insert( + session.inputs)}; + session.active_requests.insert( std::pair(contextId, name)); if (apex::apex_options::use_kokkos_tuning()) { - handle_start(name, numTuningVariables, tuningVariableValues); + uint64_t delta = 0; + if (handle_start(name, numTuningVariables, tuningVariableValues, &delta)) { + // throw away the time spent setting up tuning + session.context_starts[contextId] = session.context_starts[contextId] + delta; + } } - if (getSession().verbose) { + if (session.verbose) { std::cout << std::endl << std::string(getDepth(), ' '); printTuning(numTuningVariables, tuningVariableValues); } - // throw away the time spent in this step! - getSession().context_starts[contextId] = apex::profiler::now_ns(); } /* This starts the context pointed at by contextId. 
If tools use @@ -561,13 +649,14 @@ void kokkosp_request_values( void kokkosp_begin_context(size_t contextId) { // don't track memory in this function. apex::in_apex prevent_memory_tracking; + KokkosSession& session = KokkosSession::getSession(); //if (!apex::apex_options::use_kokkos_tuning()) { return; } - if (getSession().verbose) { + if (session.verbose) { std::cout << std::string(getDepth()++, ' '); std::cout << __func__ << "\t" << contextId << std::endl; } std::stringstream ss; - getSession().context_starts.insert( + session.context_starts.insert( std::pair(contextId, apex::profiler::now_ns())); } @@ -578,21 +667,23 @@ void kokkosp_begin_context(size_t contextId) { void kokkosp_end_context(const size_t contextId) { // don't track memory in this function. apex::in_apex prevent_memory_tracking; - if (getSession().verbose) { + KokkosSession& session = KokkosSession::getSession(); + uint64_t end = apex::profiler::now_ns(); + auto start = session.context_starts.find(contextId); + auto name = session.active_requests.find(contextId); + if (session.verbose) { std::cout << std::string(--getDepth(), ' '); std::cout << __func__ << "\t" << contextId << std::endl; + std::cout << name->second << "\t" << (end-(start->second)) << std::endl; } - uint64_t end = apex::profiler::now_ns(); - auto start = getSession().context_starts.find(contextId); - auto name = getSession().active_requests.find(contextId); - if (name != getSession().active_requests.end() && - start != getSession().context_starts.end()) { - apex::sample_value(name->second, (double)(end-start->second)); + if (name != session.active_requests.end() && + start != session.context_starts.end()) { + apex::sample_value(name->second, (double)(end-(start->second))); if (apex::apex_options::use_kokkos_tuning()) { handle_stop(name->second); } - getSession().active_requests.erase(contextId); - getSession().context_starts.erase(contextId); + session.active_requests.erase(contextId); + session.context_starts.erase(contextId); } } 
diff --git a/src/apex/apex_policies.cpp b/src/apex/apex_policies.cpp index b7075b2f..2d032b4f 100644 --- a/src/apex/apex_policies.cpp +++ b/src/apex/apex_policies.cpp @@ -884,7 +884,7 @@ int apex_sa_policy(shared_ptr tuning_session, if (apex_final) return APEX_NOERROR; // we terminated std::unique_lock l{shutdown_mutex}; if (tuning_session->sa_session.converged()) { - if (!tuning_session->converged_message) { + if (!tuning_session->converged_message && apex::apex_options::use_verbose()) { tuning_session->converged_message = true; cout << "Tuning has converged for session " << tuning_session->id << "." << endl; diff --git a/src/apex/apex_types.h b/src/apex/apex_types.h index 941988f1..8c0c3b7d 100644 --- a/src/apex/apex_types.h +++ b/src/apex/apex_types.h @@ -329,6 +329,7 @@ inline unsigned int sc_nprocessors_onln() macro (APEX_JUPYTER_SUPPORT, use_jupyter_support, int, false) \ macro (APEX_KOKKOS_VERBOSE, use_kokkos_verbose, bool, false) \ macro (APEX_KOKKOS_TUNING, use_kokkos_tuning, bool, true) \ + macro (APEX_KOKKOS_PROFILING_FENCES, use_kokkos_profiling_fences, bool, false) \ #define FOREACH_APEX_FLOAT_OPTION(macro) \ macro (APEX_SCATTERPLOT_FRACTION, scatterplot_fraction, double, 0.01) \ @@ -342,7 +343,8 @@ inline unsigned int sc_nprocessors_onln() APEX_DEFAULT_OTF2_ARCHIVE_PATH) \ macro (APEX_OTF2_ARCHIVE_NAME, otf2_archive_name, char*, \ APEX_DEFAULT_OTF2_ARCHIVE_NAME) \ - macro (APEX_EVENT_FILTER_FILE, task_event_filter_file, char*, "") + macro (APEX_EVENT_FILTER_FILE, task_event_filter_file, char*, "") \ + macro (APEX_KOKKOS_TUNING_CACHE, kokkos_tuning_cache, char*, "") // Do the clang check first #if defined(__APPLE__) || defined(__clang__) diff --git a/src/apex/policy_handler.cpp b/src/apex/policy_handler.cpp index 118bfd1a..15640e2e 100644 --- a/src/apex/policy_handler.cpp +++ b/src/apex/policy_handler.cpp @@ -217,9 +217,9 @@ namespace apex { //write_lock_type l(custom_event_mutex); static std::mutex foo; foo.lock(); - while(custom_event_policies.size() 
< (size_t)(when+1)) { + if(custom_event_policies.find(when) == custom_event_policies.end()) { std::list > new_list; - custom_event_policies.push_back(std::move(new_list)); + custom_event_policies.insert(std::make_pair(when, std::move(new_list))); } foo.unlock(); custom_event_policies[when].push_back(instance); diff --git a/src/apex/policy_handler.hpp b/src/apex/policy_handler.hpp index fee3776f..e21807fd 100644 --- a/src/apex/policy_handler.hpp +++ b/src/apex/policy_handler.hpp @@ -62,7 +62,7 @@ class policy_handler : public handler, public event_listener std::list > send_policies; std::list > recv_policies; std::list > periodic_policies; - std::vector > > + std::map > > custom_event_policies; shared_mutex_type startup_mutex; shared_mutex_type shutdown_mutex; diff --git a/src/apex/simulated_annealing.cpp b/src/apex/simulated_annealing.cpp index c4ae859f..e5528ff6 100644 --- a/src/apex/simulated_annealing.cpp +++ b/src/apex/simulated_annealing.cpp @@ -1,4 +1,5 @@ #include "simulated_annealing.hpp" +#include namespace apex { @@ -13,15 +14,15 @@ size_t SimulatedAnnealing::get_max_iterations() { for (auto& v : vars) { switch (v.second.vtype) { case VariableType::doubletype: { - max_iter = max_iter + v.second.dvalues.size(); + max_iter = max_iter * v.second.dvalues.size(); break; } case VariableType::longtype: { - max_iter = max_iter + v.second.lvalues.size(); + max_iter = max_iter * v.second.lvalues.size(); break; } case VariableType::stringtype: { - max_iter = max_iter + v.second.svalues.size(); + max_iter = max_iter * v.second.svalues.size(); break; } default: { @@ -31,7 +32,7 @@ size_t SimulatedAnnealing::get_max_iterations() { } //return max_iter / vars.size(); //return max_iter * vars.size() *vars.size(); - return max_iter / 3; // the window + return std::min(max_iterations, (std::max(min_iterations, max_iter))); } double SimulatedAnnealing::acceptance_probability(double new_cost) { diff --git a/src/apex/simulated_annealing.hpp b/src/apex/simulated_annealing.hpp 
index f1a0e4d3..b2c6153b 100644 --- a/src/apex/simulated_annealing.hpp +++ b/src/apex/simulated_annealing.hpp @@ -134,12 +134,15 @@ class SimulatedAnnealing { size_t kmax; size_t k; std::map vars; + const size_t max_iterations{200}; + const size_t min_iterations{100}; public: void evaluate(double new_cost); SimulatedAnnealing() : restart(0), since_restart(0), temp(0), kmax(0), k(1) { cost = std::numeric_limits::max(); best_cost = cost; + std::cout << "New Session!" << std::endl; } double getEnergy() { return best_cost; } bool converged() { @@ -167,7 +170,7 @@ class SimulatedAnnealing { vars.insert(std::make_pair(name, var)); kmax = get_max_iterations(); /* get max iterations */ - //std::cout << "Max iterations : " << kmax << std::endl; + std::cout << "Max iterations : " << kmax << std::endl; restart = kmax / 10; } };