diff --git a/CMakeLists.txt b/CMakeLists.txt
index 56378a0b579..31f5a1cbe59 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -132,6 +132,9 @@ option(LBANN_WITH_HWLOC
 option(LBANN_WITH_NVPROF
   "Enable NVTX-based instrumentation for nvprof" OFF)
 
+option(LBANN_WITH_ROCTRACER
+  "Enable roctx-based instrumentation for rocprof" OFF)
+
 option(LBANN_WITH_PYTHON_FRONTEND
   "Install Python frontend and enable embedded Python" ON)
 
@@ -410,6 +413,18 @@ if (LBANN_HAS_ROCM)
     list(APPEND LBANN_ROCM_LIBS roc::rocfft)
   endif ()
 
+  # Optional roctracer/roctx profiling support (LBANN_WITH_ROCTRACER).
+  if (LBANN_WITH_ROCTRACER)
+    find_package(Roctracer MODULE COMPONENTS roctracer roctx)
+    if (Roctracer_FOUND)
+      set(LBANN_HAS_ROCTRACER TRUE)
+      list(APPEND LBANN_ROCM_LIBS roctracer::roctracer)
+    else ()
+      message(WARNING
+        "LBANN_WITH_ROCTRACER is ON but Roctracer was not found; "
+        "roctx instrumentation will be disabled.")
+    endif ()
+  endif ()
 endif (LBANN_HAS_ROCM)
 
 # This is used in the sample list implementation
diff --git a/cmake/configure_files/lbann_config.hpp.in b/cmake/configure_files/lbann_config.hpp.in
index b39eb5c4a06..0f15e036671 100644
--- a/cmake/configure_files/lbann_config.hpp.in
+++ b/cmake/configure_files/lbann_config.hpp.in
@@ -80,6 +80,7 @@
 
 #cmakedefine LBANN_HAS_ROCM
 #cmakedefine LBANN_HAS_MIOPEN
+#cmakedefine LBANN_HAS_ROCTRACER
 
 #cmakedefine LBANN_HAS_ONEDNN
 #cmakedefine LBANN_HAS_ONEDNN_CPU
diff --git a/cmake/modules/FindRoctracer.cmake b/cmake/modules/FindRoctracer.cmake
new file mode 100644
index 00000000000..409c309916d
--- /dev/null
+++ b/cmake/modules/FindRoctracer.cmake
@@ -0,0 +1,124 @@
+################################################################################
+## Copyright (c) 2014-2022, Lawrence Livermore National Security, LLC.
+## Produced at the Lawrence Livermore National Laboratory.
+## Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+## the CONTRIBUTORS file.
+##
+## LLNL-CODE-697807.
+## All rights reserved.
+##
+## This file is part of LBANN: Livermore Big Artificial Neural Network
+## Toolkit. For details, see http://software.llnl.gov/LBANN or
+## https://github.com/LLNL/LBANN.
+##
+## Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+## may not use this file except in compliance with the License. You may
+## obtain a copy of the License at:
+##
+## http://www.apache.org/licenses/LICENSE-2.0
+##
+## Unless required by applicable law or agreed to in writing, software
+## distributed under the License is distributed on an "AS IS" BASIS,
+## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+## implied. See the License for the specific language governing
+## permissions and limitations under the license.
+################################################################################
+# Finds the roctracer and roctx libraries and headers from a ROCm
+# installation.
+#
+# Supported components (all of them are searched when none is requested):
+#
+#   roctracer
+#   roctx
+#
+# The ROCM_PATH CMake variable and the ROCM_PATH environment variable
+# are used as search hints before the default search paths are tried.
+#
+# Sets the following variables
+#
+#   Roctracer_FOUND
+#   Roctracer_LIBRARIES
+#   Roctracer_<component>_FOUND
+#
+# Defines the following imported targets (per-component targets exist
+# only for components that were found):
+#
+#   roctracer::roctracer
+#   roctracer::roctracer_api
+#   roctracer::roctx_api
+#
+
+set(_supported_components roctracer roctx)
+if (NOT Roctracer_FIND_COMPONENTS)
+  set(Roctracer_FIND_COMPONENTS ${_supported_components})
+endif ()
+
+# Find modules run in the caller's scope; start from an empty list so
+# stale values from the caller or an earlier find_package(Roctracer)
+# call are not linked into the umbrella target.
+set(_imported_libraries)
+
+foreach (comp IN LISTS Roctracer_FIND_COMPONENTS)
+  if (NOT ${comp} IN_LIST _supported_components)
+    message(FATAL_ERROR
+      "Cannot specify component \"${comp}\" for package Roctracer. "
+      "Supported components are: ${_supported_components}.")
+  endif ()
+
+  set(_header_name "${comp}.h")
+  set(_lib_name "${comp}64")
+
+  # Try the ROCm-hinted locations first. Each later call is a no-op
+  # once the cache entry has been set by an earlier one.
+  find_path(${comp}_INCLUDE_PATH ${_header_name}
+    HINTS ${ROCM_PATH}/roctracer $ENV{ROCM_PATH}/roctracer
+    PATH_SUFFIXES include
+    DOC "The ${comp} include directory for roctracer."
+    NO_DEFAULT_PATH)
+  find_path(${comp}_INCLUDE_PATH ${_header_name}
+    HINTS ${ROCM_PATH}/include/roctracer $ENV{ROCM_PATH}/include/roctracer
+    DOC "The ${comp} include directory for roctracer."
+    NO_DEFAULT_PATH)
+  find_path(${comp}_INCLUDE_PATH ${_header_name})
+
+  find_library(${comp}_LIBRARY ${_lib_name}
+    HINTS ${ROCM_PATH}/roctracer $ENV{ROCM_PATH}/roctracer
+    HINTS ${ROCM_PATH} $ENV{ROCM_PATH}
+    PATH_SUFFIXES lib64 lib
+    DOC "The ${comp} library for roctracer."
+    NO_DEFAULT_PATH)
+  find_library(${comp}_LIBRARY ${_lib_name})
+
+  if (${comp}_LIBRARY AND ${comp}_INCLUDE_PATH)
+    set(Roctracer_${comp}_FOUND TRUE)
+
+    if (NOT TARGET roctracer::${comp}_api)
+      add_library(roctracer::${comp}_api INTERFACE IMPORTED)
+    endif ()
+    target_link_libraries(roctracer::${comp}_api INTERFACE
+      "${${comp}_LIBRARY}")
+    target_include_directories(roctracer::${comp}_api INTERFACE
+      "${${comp}_INCLUDE_PATH}")
+
+    mark_as_advanced(${comp}_LIBRARY)
+    mark_as_advanced(${comp}_INCLUDE_PATH)
+
+    list(APPEND _imported_libraries roctracer::${comp}_api)
+  else ()
+    set(Roctracer_${comp}_FOUND FALSE)
+  endif ()
+endforeach ()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Roctracer HANDLE_COMPONENTS)
+
+if (Roctracer_FOUND)
+  # Umbrella target that forwards to every component target found above.
+  if (NOT TARGET roctracer::roctracer)
+    add_library(roctracer::roctracer INTERFACE IMPORTED)
+  endif ()
+  foreach (lib IN LISTS _imported_libraries)
+    target_link_libraries(roctracer::roctracer INTERFACE ${lib})
+  endforeach ()
+  set(Roctracer_LIBRARIES roctracer::roctracer)
+endif ()
diff --git a/src/utils/profiling.cpp b/src/utils/profiling.cpp
index 03859e93442..875fc80e3e0 100644
--- a/src/utils/profiling.cpp
+++ b/src/utils/profiling.cpp
@@ -28,6 +28,8 @@
 #include "lbann/base.hpp"
 #include "lbann/utils/profiling.hpp"
 
+#include "lbann/utils/exception.hpp"
+
 #if defined(LBANN_SCOREP)
 #include
 #elif defined(LBANN_NVPROF)
@@ -39,6 +41,11 @@
 #include "lbann/utils/gpu/helpers.hpp"
 #endif
 
+#if defined(LBANN_HAS_ROCTRACER)
+#include
+#include
+#endif
+
 namespace {
 bool profiling_started = false;
 }
@@ -94,12 +101,37 @@ void prof_region_end(const char *, bool sync) {
   }
   nvtxRangePop();
 }
+#elif defined(LBANN_HAS_ROCTRACER)
+// roctx ranges are pushed/popped only between prof_start() and prof_stop().
+void prof_start() {
+  roctracer_start();
+  profiling_started = true;
+}
+void prof_stop() {
+  roctracer_stop();
+  profiling_started = false;
+}
+void prof_region_begin(const char *s, int, bool sync) {
+  if (!profiling_started) return;
+  if (sync) {
+    hydrogen::gpu::SynchronizeDevice();
+  }
+  LBANN_ASSERT(0 <= roctxRangePush(s));
+}
+void prof_region_end(const char *, bool sync) {
+  if (!profiling_started) return;
+  if (sync) {
+    hydrogen::gpu::SynchronizeDevice();
+  }
+  LBANN_ASSERT(0 <= roctxRangePop());
+}
 #else
 void prof_start() {
   profiling_started = true;
   return;
 }
 void prof_stop() {
+  profiling_started = false;
   return;
 }
 void prof_region_begin(const char *, int, bool) {