Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add NVTX support via DrHook #23

Open
wants to merge 8 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,33 @@ ecbuild_add_option( FEATURE WARNINGS
DEFAULT ON
DESCRIPTION "Add warnings to compiler" )

# NVTX instrumentation defaults to ON only when the C compiler identifies
# itself as PGI or NVHPC; with any other compiler it defaults to OFF.
if( CMAKE_C_COMPILER_ID STREQUAL "PGI" OR CMAKE_C_COMPILER_ID STREQUAL "NVHPC" )
  set( DEFAULT_DR_NVTX ON )
else()
  set( DEFAULT_DR_NVTX OFF )
endif()

# Feature switch for the NVTX instrumentation. The rest of the build tests
# HAVE_DR_NVTX to decide whether the NVTX libraries are looked up and linked.
ecbuild_add_option( FEATURE DR_NVTX
                    DEFAULT ${DEFAULT_DR_NVTX}
                    DESCRIPTION "Add nvtx instrumentation" )

# Look up the Realtime package. REQUIRED is not passed, so configuration
# presumably continues when it is absent (TODO confirm against ecbuild docs).
ecbuild_find_package( NAME Realtime QUIET )

####

# External dependencies of the DR_NVTX feature. Every lookup is REQUIRED, so
# configuration stops when one of them cannot be found.
if( HAVE_DR_NVTX )

  find_package( CUDAToolkit REQUIRED COMPONENTS nvtx3 )
  find_package( NVHPC REQUIRED COMPONENTS HOSTUTILS )

  # NVTOOLSEXT_LIB: path to the nvToolsExt library, with the CUDA toolkit
  # library directory given as a search hint.
  find_library( NVTOOLSEXT_LIB
    NAMES nvToolsExt
    REQUIRED
    HINTS ${CUDAToolkit_LIBRARY_DIR} )

  # NVHPCWRAPNVTX_LIB: path to the nvhpcwrapnvtx library, with the NVHPC
  # host-utilities library directory given as a search hint.
  find_library( NVHPCWRAPNVTX_LIB
    NAMES nvhpcwrapnvtx
    REQUIRED
    HINTS ${NVHPC_HOSTUTILS_LIBRARY_DIR} )

endif()




### Sources

include( fiat_compiler_warnings )
Expand Down
6 changes: 6 additions & 0 deletions cmake/project_summary.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,11 @@ ecbuild_info( "MPI (export MPI_HOME to correct MPI implementation)" )
ecbuild_info( " MPI_Fortran_INCLUDE_DIRS : [${MPI_Fortran_INCLUDE_DIRS}]" )
ecbuild_info( " MPI_Fortran_LIBRARIES : [${MPI_Fortran_LIBRARIES}]" )
ecbuild_info( " MPIEXEC : [${MPIEXEC}]" )

# Report the NVTX libraries only when the DR_NVTX feature is enabled.
# NVTOOLSEXT_LIB and NVHPCWRAPNVTX_LIB are looked up only in that case, so
# keying this on the compiler ID could print empty entries when the feature
# is switched off with a PGI/NVHPC compiler.
if( HAVE_DR_NVTX )
  ecbuild_info( " nvToolsExt library from CUDAToolkit : [${NVTOOLSEXT_LIB}]" )
  ecbuild_info( " nvhpcwrapnvtx library from NVHPC : [${NVHPCWRAPNVTX_LIB}]" )
endif()

ecbuild_info( "---------------------------------------------------------" )

16 changes: 16 additions & 0 deletions src/fiat/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,13 @@ endif()
configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/library/version.c.in ${CMAKE_CURRENT_BINARY_DIR}/version.c @ONLY )

ecbuild_list_add_pattern( LIST fiat_src GLOB *.c *.F* *.cc )

if( NOT HAVE_DR_NVTX )
  # The files in the drnvtx directory are only intended to work with NVHPC,
  # so don't try to compile them unless the DR_NVTX feature is enabled.
  # The argument is a regular expression, not a glob: "drnvtx/" matches every
  # path that goes through a drnvtx directory ("drnvtx/*" meant "drnvtx"
  # followed by zero or more slashes).
  ecbuild_list_exclude_pattern( LIST fiat_src REGEX drnvtx/ )
endif()

set( fiat_src ${fiat_src} PARENT_SCOPE )

ecbuild_add_library( TARGET fiat
Expand All @@ -66,6 +73,15 @@ ecbuild_add_library( TARGET fiat
)


## When the DR_NVTX feature is enabled, link the two NVTX libraries into fiat
## and compile fiat with HAVE_DR_NVTX defined

if( HAVE_DR_NVTX )
  # PUBLIC, as before: targets linking against fiat also link these libraries.
  target_link_libraries( fiat PUBLIC ${NVTOOLSEXT_LIB} ${NVHPCWRAPNVTX_LIB} )

  # Use the header directories reported by FindCUDAToolkit instead of guessing
  # them relative to the library directory.
  target_include_directories( fiat PRIVATE ${CUDAToolkit_INCLUDE_DIRS} )

  # Scoped to the fiat target so the definition does not leak to any other
  # target declared in this directory.
  target_compile_definitions( fiat PRIVATE HAVE_DR_NVTX )
endif()

if( ${CMAKE_SYSTEM_NAME} MATCHES "Darwin" )
# Following should not be necessary;
# Probably a bug in the M1 prerelease of gfortran 10.2.0.4
Expand Down
29 changes: 29 additions & 0 deletions src/fiat/drhook/internal/dr_hook_util.F90
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ SUBROUTINE DR_HOOK_UTIL(LDHOOK,CDNAME,KCASE,PKEY,CDFILENAME,KSIZEINFO)
USE OML_MOD , ONLY : OML_MY_THREAD
USE YOMHOOK , ONLY : LHOOK
USE DR_HACK_MOD, ONLY : LL_DRHACK, DR_HACK_INIT, DR_HACK
#ifdef HAVE_DR_NVTX
USE NVTX
USE DR_NVTX
#endif

IMPLICIT NONE

Expand All @@ -33,6 +37,31 @@ SUBROUTINE DR_HOOK_UTIL(LDHOOK,CDNAME,KCASE,PKEY,CDFILENAME,KSIZEINFO)

#include "dr_hook_init.intfb.h"

#ifdef HAVE_DR_NVTX
INTEGER, SAVE :: II_DRNVTX = 0 ! 0=no initialized, -1=nvtx off, +1=nvtx on
CHARACTER*32 :: CL_NVTX
#endif


#ifdef HAVE_DR_NVTX
IF (II_DRNVTX == 0) THEN
CALL GETENV ('DR_NVTX', CL_NVTX)
IF (CL_NVTX == '1') THEN
II_DRNVTX = +1
ELSE
II_DRNVTX = -1
ENDIF
ENDIF

IF (II_DRNVTX == 1) THEN
IF (KCASE == 0) THEN
CALL DR_NVTX_PUSH_RANGE (CDNAME)
ELSEIF (KCASE==1) THEN
CALL DR_NVTX_POP_RANGE (CDNAME)
ENDIF
ENDIF
#endif

IF (.NOT.LDHOOK) RETURN

IMYTID = OML_MY_THREAD()
Expand Down
52 changes: 52 additions & 0 deletions src/fiat/drnvtx/dr_nvtx.F90
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
module dr_nvtx

! Fortran entry points used to open and close NVTX ranges.
!
! dr_nvtx_push_range / dr_nvtx_pop_range convert a Fortran character string
! into a NUL-terminated buffer and forward it to the external routines
! dr_nvtx_start / dr_nvtx_end.

use iso_c_binding
implicit none

interface

  ! External routines. No bind(C) is given, so the linker symbol and the way
  ! the string length is passed are chosen by the compiler.
  ! NOTE(review): presumably these resolve to dr_nvtx_start_ / dr_nvtx_end_
  ! in dr_nvtx.c -- confirm for every supported compiler.

  subroutine dr_nvtx_start (name)
    use iso_c_binding
    character(kind=c_char,len=*) :: name
  end subroutine

  subroutine dr_nvtx_end (name)
    use iso_c_binding
    character(kind=c_char,len=*) :: name
  end subroutine

end interface

public :: dr_nvtx_push_range
public :: dr_nvtx_pop_range
private :: dr_nvtx_c_string

contains

! Open a range named fstr (trailing blanks are ignored).
! Only the master thread of the current OpenMP team issues the call.
subroutine dr_nvtx_push_range (fstr)
  character(kind=c_char,len=*), intent(in) :: fstr
  character(kind=c_char,len=1024) :: cstr

  !$omp master

  call dr_nvtx_c_string (fstr, cstr)
  call dr_nvtx_start (cstr)

  !$omp end master

end subroutine

! Close the range named fstr (trailing blanks are ignored).
! Only the master thread of the current OpenMP team issues the call.
subroutine dr_nvtx_pop_range (fstr)
  character(kind=c_char,len=*), intent(in) :: fstr
  character(kind=c_char,len=1024) :: cstr

  !$omp master

  call dr_nvtx_c_string (fstr, cstr)
  call dr_nvtx_end (cstr)

  !$omp end master

end subroutine

! Copy fstr without its trailing blanks into cstr and append a NUL character.
! The text is cut to len(cstr)-1 characters so that the terminator always
! fits; a plain assignment of trim(fstr)//c_null_char would silently drop the
! terminator for names that fill the buffer.
subroutine dr_nvtx_c_string (fstr, cstr)
  character(kind=c_char,len=*), intent(in)  :: fstr
  character(kind=c_char,len=*), intent(out) :: cstr
  integer :: n

  n = min (len_trim (fstr), len (cstr) - 1)
  cstr = fstr(1:n)//c_null_char

end subroutine

end module dr_nvtx

116 changes: 116 additions & 0 deletions src/fiat/drnvtx/dr_nvtx.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#include <nvToolsExt.h>
#include <string.h>
#include <stdlib.h>

#include "dr_nvtx_map.h"

#include <stdio.h> /* printf, used by INDENT and by the NVTX_VERYVERBOSE traces */

/*
 * INDENT(n): print one " " string n times on stdout.
 *
 * Wrapped in do { ... } while (0) so that it expands to a single statement
 * and runs exactly once; the loop variable is declared once, with a name
 * that is not reserved for the implementation.
 */
#define INDENT(n)                                             \
do {                                                          \
  int indent_i_;                                              \
  for (indent_i_ = 0; indent_i_ < (n); indent_i_++)           \
    printf (" ");                                             \
} while (0)

/*
 * Adler-32-like checksum of a NUL-terminated byte string.
 *
 * Each byte is added with a weight of 2 to the low sum, and the low sum is
 * accumulated into the high sum; both are reduced modulo 65521.
 * Returns the high sum in the upper 16 bits and the low sum in the lower 16.
 */
static uint32_t myadler32 (const unsigned char *data)
{
  const uint32_t MOD_ADLER = 65521;
  uint32_t low = 1;
  uint32_t high = 0;
  const unsigned char *cursor;

  for (cursor = data; *cursor != 0; cursor++)
    {
      low = (low + *cursor * 2) % MOD_ADLER;
      high = (high + low) % MOD_ADLER;
    }

  return (high << 16) | low;
}

#ifdef NVTX_VERYVERBOSE
/*
 * Debug-only record of the ranges currently open: namestack[i] holds the
 * name pushed at depth i, istack is the current depth.
 * Not const: dr_nvtx_start_ writes the names into it.
 */
static char namestack[256][256];
static int istack=0;
#endif

/*
 * Open an NVTX range labelled with `name` (a NUL-terminated string).
 *
 * dr_nvtx_map_start decides whether the range is emitted; when it returns 0
 * nothing is pushed. Otherwise the range colour is derived from a checksum
 * of the name, so a given name always gets the same colour.
 */
void dr_nvtx_start_ (const char * name)
{
  if (! dr_nvtx_map_start (name))
    {
#ifdef NVTX_VERYVERBOSE
      INDENT (istack);
      printf ("Skipped open --- %s\n", name);
#endif
      return;
    }

  /* Unsigned arithmetic throughout: the checksum can have its top bit set. */
  uint32_t color_id = myadler32 ((const unsigned char*)name);
  uint32_t r,g,b;

  r=color_id & 0x000000ff;
  g=(color_id & 0x000ff000) >> 12;
  b=(color_id & 0x0ff00000) >> 20;

  /* Rescale the channels when all three are below 64. */
  if (r<64 && g<64 && b<64)
    {
      r=r*3;
      g=g*3+64;
      b=b*4;
    }

  /* ARGB with the alpha byte set to 0xff. */
  color_id = 0xff000000u | (r << 16) | (g << 8) | (b);

  nvtxEventAttributes_t eventAttrib = {0};
  eventAttrib.version = NVTX_VERSION;
  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
  eventAttrib.colorType = NVTX_COLOR_ARGB;
  eventAttrib.color = color_id;
  eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
  eventAttrib.message.ascii = name;

#ifdef NVTX_VERYVERBOSE
  INDENT (istack);
  printf ("Opening %s\n", name);

  /* Refuse to write past the end of the debug stack. */
  if (istack >= (int) (sizeof (namestack) / sizeof (namestack[0])))
    {
      printf ("NVTX error stack overflow\n");
      abort ();
    }
#endif

  nvtxRangePushEx (&eventAttrib);

#ifdef NVTX_VERYVERBOSE
  /* Keep as much of the name as the slot can hold, always NUL-terminated. */
  strncpy (namestack[istack], name, sizeof (namestack[istack]) - 1);
  namestack[istack][sizeof (namestack[istack]) - 1] = '\0';
  istack++;
#endif

}

/*
 * Close the innermost NVTX range.
 *
 * dr_nvtx_map_stop decides whether a range is popped; when it returns 0
 * nothing is popped. `name` is only used by the NVTX_VERYVERBOSE traces,
 * which also check that the name matches the one recorded when the range
 * was opened.
 */
void dr_nvtx_end_ (const char * name)
{
  const int emit = dr_nvtx_map_stop ();

  if (emit == 0)
    {
#ifdef NVTX_VERYVERBOSE
      INDENT (istack);
      printf ("Skipped end --- %s\n",name);
#endif
      return;
    }

#ifdef NVTX_VERYVERBOSE
  --istack;
  if (istack < 0)
    {
      printf ("NVTX error negative stack\n");
      abort ();
    }

  INDENT (istack);

  printf ("Closing %s\n",name);

  if (strcmp (name,namestack[istack]) != 0)
    {
      printf ("Error just closed the wrong marker: %s expected: %s\n",name, namestack[istack]);
      abort ();
    }
#endif

  nvtxRangePop ();
}
78 changes: 78 additions & 0 deletions src/fiat/drnvtx/dr_nvtx_map.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#include <cstring>
#include <iostream>
#include <string>
#include <unordered_map>

#include "dr_nvtx_map.h"

using namespace std;

// MPI_Wtime is declared by hand (no MPI header is included in this file) and
// marked as a weak symbol.
// NOTE(review): presumably this lets the library link when no MPI
// implementation provides the symbol; the callers in this file invoke it
// without testing it for null first -- confirm that is safe.
extern "C" double MPI_Wtime ();
#pragma weak MPI_Wtime

namespace
{
  // Per-region bookkeeping, one entry per distinct region name.
  struct counter
  {
    int calls = 0;       // number of times the region has been started
    double elapsed = 0;  // time accumulated between timed start/stop pairs
    double t0 = 0;       // MPI_Wtime value taken at the last timed start
  };

  // Counters of the regions currently open; slot ilast is the innermost one.
  counter * stack[128];

  // Region name -> counter.
  // The key is an owning std::string: callers pass a pointer into a
  // temporary buffer that is reused for other names, so storing the raw
  // pointer as the key would leave the table with stale keys. std::string
  // also brings the standard hash and equality, which removes the need for
  // std::binary_function (gone since C++17) and for a hand-written hash.
  // Note that each lookup now builds a temporary std::string from the name.
  std::unordered_map<std::string, counter> map;

  // Index of the innermost open region in stack; 0 means none is open.
  int ilast = 0;
}

extern "C" int dr_nvtx_map_start (const char * str)
{
counter * elem = &(map[str]);
ilast++;
stack[ilast] = elem;
elem->calls ++;
if (elem->calls >= 11 && elem->elapsed < 0.0001)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @pmarguinaud, I've been doing some work on this patch to integrate it into the core of drhook so that it's faster, works with all supported drhook languages, and is easier for us to maintain.

This condition is giving me a bit of confusion though. My understanding is that it's meant to eliminate high call but low cumulative runtime drhook regions from the nvtx trace, so as to reduce noise in the trace. However, the way it's implemented (particularly lines 72-75) means that once you go over 10 calls, it will always be skipped. This also adds a fun situation where the following recursive code won't close any nvtx regions it opens:

void foo(void) {
    static int calls = 0;
    calls++;
    
    int skipstart = dr_hook_nvtx_map_start("foo");
    std::cout<<"In  call "<<calls<<" Skipped? "<<!skipstart<<std::endl;
    
    
    if (calls < 11) {
        foo();
    }
    
    int skipend = dr_hook_nvtx_map_stop();
    std::cout<<"Out call "<<calls<<" Skipped? "<<!skipend<<std::endl;
    
    calls--;
    return;
}

This gave the following output:

In  call 1 Skipped? 0
In  call 2 Skipped? 0
In  call 3 Skipped? 0
In  call 4 Skipped? 0
In  call 5 Skipped? 0
In  call 6 Skipped? 0
In  call 7 Skipped? 0
In  call 8 Skipped? 0
In  call 9 Skipped? 0
In  call 10 Skipped? 0
In  call 11 Skipped? 1
Out call 11 Skipped? 1
Out call 10 Skipped? 1
Out call 9 Skipped? 1
Out call 8 Skipped? 1
Out call 7 Skipped? 1
Out call 6 Skipped? 1
Out call 5 Skipped? 1
Out call 4 Skipped? 1
Out call 3 Skipped? 1
Out call 2 Skipped? 1
Out call 1 Skipped? 1

Am I correct in thinking this is not the intended behaviour and I should attempt to fix it in my patch?

return 0;
if (elem->calls > 1)
elem->t0 = MPI_Wtime();
return 1;
}

// Close the innermost region recorded by dr_nvtx_map_start.
//
// Returns 1 when the caller should pop the matching NVTX range and 0 when it
// should not: either no region is open, or the region has been started at
// least 11 times while accumulating less than 0.0001 of elapsed time.
//
// NOTE(review): the skip test is evaluated on the counter's current state,
// which can differ from the state seen by the matching dr_nvtx_map_start
// (for instance when the same region is re-entered in between), so the two
// decisions are not guaranteed to agree.
extern "C" int dr_nvtx_map_stop ()
{
  // dr_nvtx_map_start increments ilast before storing, so stack[0] is never
  // written: a stop without a matching start must not dereference it.
  if (ilast < 1)
    return 0;

  counter * last = stack[ilast];
  ilast--;

  if (last->calls >= 11 && last->elapsed < 0.0001)
    return 0;

  // The first call of a region is not timed.
  if (last->calls > 1)
    last->elapsed += MPI_Wtime() - last->t0;

  return 1;
}

18 changes: 18 additions & 0 deletions src/fiat/drnvtx/dr_nvtx_map.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#ifndef DR_NVTX_MAP_H
#define DR_NVTX_MAP_H

/*
 * Bookkeeping used to decide which regions are turned into NVTX ranges.
 * The functions have C linkage so that they can be called from C code.
 *
 * The include guard avoids a leading underscore followed by a capital
 * letter, a form reserved for the implementation.
 */

#ifdef __cplusplus
extern "C"
{
#endif

/* Record the start of region `str` (NUL-terminated).
 * Returns non-zero when a range should be opened, 0 when it is skipped. */
int dr_nvtx_map_start (const char * str);

/* Record the end of the innermost region.
 * Returns non-zero when a range should be closed, 0 when it is skipped.
 * Declared with (void) so that it is a real prototype in C as well. */
int dr_nvtx_map_stop (void);

#ifdef __cplusplus
}
#endif


#endif

3 changes: 3 additions & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ endif()
add_subdirectory( drhook )


# The drnvtx tests are only added when the DR_NVTX feature is enabled.
if( HAVE_DR_NVTX )
  add_subdirectory( drnvtx )
endif()


# ----------------------------------------------------------------------------------------
Expand Down
Loading