diff --git a/validation_tests/papi-rocm/Makefile b/validation_tests/papi-rocm/Makefile index 8c88fdb7..9657c108 100644 --- a/validation_tests/papi-rocm/Makefile +++ b/validation_tests/papi-rocm/Makefile @@ -1,10 +1,10 @@ PAPI_ROCM_ROOT ?= $(ROCM_ROOT) #/path/to/rocm/ -CC = $(PAPI_ROCM_ROOT)/bin/hipcc -CXX = $(PAPI_ROCM_ROOT)/bin/hipcc -CXXFLAGS += -g -O2 -fopenmp +CC = $(HIP_ROOT)/bin/hipcc +CXX = $(HIP_ROOT)/bin/hipcc +CXXFLAGS += -g -O2 CPPFLAGS += -I$(PAPI_ROCM_ROOT)/include -I$(PAPI_ROOT)/include -LDFLAGS += -L$(PAPI_ROOT)/lib -lpapi -fopenmp +LDFLAGS += -L$(PAPI_ROOT)/lib -lpapi ALL: single_monitor multi_monitor overflow diff --git a/validation_tests/papi-rocm/compile.sh b/validation_tests/papi-rocm/compile.sh index 7f9114e2..d61017b2 100755 --- a/validation_tests/papi-rocm/compile.sh +++ b/validation_tests/papi-rocm/compile.sh @@ -3,4 +3,4 @@ set -e set -x -make PAPIROOT=$PAPI_ROOT +make PAPIROOT=$PAPI_ROOT diff --git a/validation_tests/papi-rocm/multi_monitor.cpp b/validation_tests/papi-rocm/multi_monitor.cpp index f4c25fe7..2847776c 100644 --- a/validation_tests/papi-rocm/multi_monitor.cpp +++ b/validation_tests/papi-rocm/multi_monitor.cpp @@ -2,10 +2,119 @@ #include #include #include -#include +#include #include "papi.h" #include "matmul.h" +#define NUM_EVENTS 2 +const char *events[NUM_EVENTS] = { + "rocm:::SQ_WAVES", + "rocm:::SQ_WAVES_RESTORED", +}; + +typedef struct { + int num_thread; +} thread_arg_t; + +void *run(void *arg) +{ + int eventset = PAPI_NULL; + int papi_errno = PAPI_create_eventset(&eventset); + if (papi_errno != PAPI_OK) { + fprintf(stderr, "ERROR: PAPI_create_eventset: %d: %s\n", papi_errno, PAPI_strerror(papi_errno)); + exit(EXIT_FAILURE); + } + + int thread_num = ((thread_arg_t *) arg)->num_thread; + for (int j = 0; j < NUM_EVENTS; ++j) { + char named_event[PAPI_MAX_STR_LEN] = { 0 }; + sprintf(named_event, "%s:device=%d", events[j], thread_num); + papi_errno = PAPI_add_named_event(eventset, (const char *) named_event); + if (papi_errno != PAPI_OK && papi_errno != PAPI_ENOEVNT) { + fprintf(stderr, "ERROR: PAPI_add_named_event: %d: %s\n", papi_errno, PAPI_strerror(papi_errno)); + exit(EXIT_FAILURE); + } + } + + papi_errno = PAPI_start(eventset); + if (papi_errno != PAPI_OK) { + fprintf(stderr, "ERROR: PAPI_start: %d: %s\n", papi_errno, PAPI_strerror(papi_errno)); + exit(EXIT_FAILURE); + } + + hipError_t hip_errno = hipSetDevice(thread_num); + if (hip_errno != hipSuccess) { + fprintf(stderr, "ERROR: hipSetDevice: %d: %s\n", PAPI_EMISC, PAPI_strerror(PAPI_EMISC)); + exit(EXIT_FAILURE); + } + + hipStream_t stream; + hip_errno = hipStreamCreate(&stream); + if (hip_errno != hipSuccess) { + fprintf(stderr, "ERROR: hipStreamCreate: %d: %s\n", PAPI_EMISC, PAPI_strerror(PAPI_EMISC)); + exit(EXIT_FAILURE); + } + + void *handle; + int matmul_errno; + matmul_errno = matmul_init(&handle); + if (matmul_errno != MATMUL_SUCCESS) { + fprintf(stderr, "ERROR: matmul_init: %d: %s\n", PAPI_EMISC, PAPI_strerror(PAPI_EMISC)); + exit(EXIT_FAILURE); + } + + matmul_errno = matmul_run(handle, stream); + if (matmul_errno != MATMUL_SUCCESS) { + fprintf(stderr, "ERROR: matmul_run: %d: %s\n", PAPI_EMISC, PAPI_strerror(PAPI_EMISC)); + exit(EXIT_FAILURE); + } + + hip_errno = hipStreamSynchronize(stream); + if (hip_errno != hipSuccess) { + fprintf(stderr, "ERROR: hipStreamSynchronize: %d: %s\n", PAPI_EMISC, PAPI_strerror(PAPI_EMISC)); + exit(EXIT_FAILURE); + } + + hip_errno = hipStreamDestroy(stream); + if (hip_errno != hipSuccess) { + fprintf(stderr, "ERROR: hipStreamDestroy: %d: %s\n", PAPI_EMISC, PAPI_strerror(PAPI_EMISC)); + exit(EXIT_FAILURE); + } + + matmul_errno = matmul_finalize(&handle); + if (matmul_errno != MATMUL_SUCCESS) { + fprintf(stderr, "ERROR: matmul_finalize: %d: %s\n", PAPI_EMISC, PAPI_strerror(PAPI_EMISC)); + exit(EXIT_FAILURE); + } + + long long counters[NUM_EVENTS] = { 0 }; + papi_errno = PAPI_stop(eventset, counters); + if (papi_errno != PAPI_OK) { + fprintf(stderr, "ERROR: PAPI_stop: %d: %s\n", papi_errno, PAPI_strerror(papi_errno)); + exit(EXIT_FAILURE); + } + + for (int i = 0; i < NUM_EVENTS; ++i) { + fprintf(stdout, "[tid:%d] %s:device=%d : %lld\n", + thread_num, events[i], thread_num, + counters[i]); + } + + papi_errno = PAPI_cleanup_eventset(eventset); + if (papi_errno != PAPI_OK) { + fprintf(stderr, "ERROR: PAPI_cleanup_eventset: %d: %s\n", papi_errno, PAPI_strerror(papi_errno)); + exit(EXIT_FAILURE); + } + + papi_errno = PAPI_destroy_eventset(&eventset); + if (papi_errno != PAPI_OK) { + fprintf(stderr, "ERROR: PAPI_destroy_eventset: %d: %s\n", papi_errno, PAPI_strerror(papi_errno)); + exit(EXIT_FAILURE); + } + + pthread_exit(NULL); +} + int main(int argc, char *argv[]) { int papi_errno; @@ -30,7 +139,7 @@ int main(int argc, char *argv[]) exit(EXIT_FAILURE); } - papi_errno = PAPI_thread_init((unsigned long (*)(void)) omp_get_thread_num); + papi_errno = PAPI_thread_init((unsigned long (*)(void)) pthread_self); if (papi_errno != PAPI_OK) { fprintf(stderr, "ERROR: PAPI_thread_init: %d: %s\n", papi_errno, PAPI_strerror(papi_errno)); exit(EXIT_FAILURE); @@ -44,112 +153,33 @@ int main(int argc, char *argv[]) } num_threads = (num_threads < num_devices) ? num_threads : num_devices; - omp_set_num_threads(num_threads); fprintf(stdout, "Run rocm test with %d threads\n", num_threads); -#define NUM_EVENTS 2 - const char *events[NUM_EVENTS] = { - "rocm:::SQ_WAVES", - "rocm:::SQ_WAVES_RESTORED", - }; - -#pragma omp parallel - { - int eventset = PAPI_NULL; - papi_errno = PAPI_create_eventset(&eventset); - if (papi_errno != PAPI_OK) { - fprintf(stderr, "ERROR: PAPI_create_eventset: %d: %s\n", papi_errno, PAPI_strerror(papi_errno)); - exit(EXIT_FAILURE); - } - - int thread_num = omp_get_thread_num(); - for (int j = 0; j < NUM_EVENTS; ++j) { - char named_event[PAPI_MAX_STR_LEN] = { 0 }; - sprintf(named_event, "%s:device=%d", events[j], thread_num); - papi_errno = PAPI_add_named_event(eventset, (const char *) named_event); - if (papi_errno != PAPI_OK && papi_errno != PAPI_ENOEVNT) { - fprintf(stderr, "ERROR: PAPI_add_named_event: %d: %s\n", papi_errno, PAPI_strerror(papi_errno)); - exit(EXIT_FAILURE); - } - } - - papi_errno = PAPI_start(eventset); - if (papi_errno != PAPI_OK) { - fprintf(stderr, "ERROR: PAPI_start: %d: %s\n", papi_errno, PAPI_strerror(papi_errno)); - exit(EXIT_FAILURE); - } - - hip_errno = hipSetDevice(thread_num); - if (hip_errno != hipSuccess) { - fprintf(stderr, "ERROR: hipSetDevice: %d: %s\n", PAPI_EMISC, PAPI_strerror(PAPI_EMISC)); - exit(EXIT_FAILURE); - } - - hipStream_t stream; - hip_errno = hipStreamCreate(&stream); - if (hip_errno != hipSuccess) { - fprintf(stderr, "ERROR: hipStreamCreate: %d: %s\n", PAPI_EMISC, PAPI_strerror(PAPI_EMISC)); - exit(EXIT_FAILURE); - } - - void *handle; - int matmul_errno; - matmul_errno = matmul_init(&handle); - if (matmul_errno != MATMUL_SUCCESS) { - fprintf(stderr, "ERROR: matmul_init: %d: %s\n", PAPI_EMISC, PAPI_strerror(PAPI_EMISC)); - exit(EXIT_FAILURE); - } - - matmul_errno = matmul_run(handle, stream); - if (matmul_errno != MATMUL_SUCCESS) { - fprintf(stderr, "ERROR: matmul_run: %d: %s\n", PAPI_EMISC, PAPI_strerror(PAPI_EMISC)); - exit(EXIT_FAILURE); - } - - hip_errno = hipStreamSynchronize(stream); - if (hip_errno != hipSuccess) { - fprintf(stderr, "ERROR: hipStreamSynchronize: %d: %s\n", PAPI_EMISC, PAPI_strerror(PAPI_EMISC)); - exit(EXIT_FAILURE); - } - - hip_errno = hipStreamDestroy(stream); - if (hip_errno != hipSuccess) { - fprintf(stderr, "ERROR: hipStreamDestroy: %d: %s\n", PAPI_EMISC, PAPI_strerror(PAPI_EMISC)); - exit(EXIT_FAILURE); - } - - matmul_errno = matmul_finalize(&handle); - if (matmul_errno != MATMUL_SUCCESS) { - fprintf(stderr, "ERROR: matmul_finalize: %d: %s\n", PAPI_EMISC, PAPI_strerror(PAPI_EMISC)); - exit(EXIT_FAILURE); - } + pthread_t *thread = (pthread_t *)malloc(num_threads * sizeof(*thread)); + if (thread == NULL) { + return EXIT_FAILURE; + } - long long counters[NUM_EVENTS] = { 0 }; - papi_errno = PAPI_stop(eventset, counters); - if (papi_errno != PAPI_OK) { - fprintf(stderr, "ERROR: PAPI_stop: %d: %s\n", papi_errno, PAPI_strerror(papi_errno)); - exit(EXIT_FAILURE); - } + thread_arg_t *arg = (thread_arg_t *)malloc(num_threads * sizeof(*arg)); + if (arg == NULL) { + return EXIT_FAILURE; + } - for (int i = 0; i < NUM_EVENTS; ++i) { - fprintf(stdout, "[tid:%d] %s:device=%d : %lld\n", - omp_get_thread_num(), events[i], thread_num, - counters[i]); - } + pthread_attr_t attr; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - papi_errno = PAPI_cleanup_eventset(eventset); - if (papi_errno != PAPI_OK) { - fprintf(stderr, "ERROR: PAPI_cleanup_eventset: %d: %s\n", papi_errno, PAPI_strerror(papi_errno)); - exit(EXIT_FAILURE); - } + for (int i = 0; i < num_threads; ++i) { + arg[i].num_thread = i; + pthread_create(&thread[i], &attr, run, &arg[i]); + } - papi_errno = PAPI_destroy_eventset(&eventset); - if (papi_errno != PAPI_OK) { - fprintf(stderr, "ERROR: PAPI_destroy_eventset: %d: %s\n", papi_errno, PAPI_strerror(papi_errno)); - exit(EXIT_FAILURE); - } + for (int i = 0; i < num_threads; ++i) { + pthread_join(thread[i], NULL); } + free(thread); + free(arg); PAPI_shutdown(); return EXIT_SUCCESS; diff --git a/validation_tests/papi-rocm/run_rocm_tests.sh b/validation_tests/papi-rocm/run_rocm_tests.sh index 534ab551..bafd0245 100755 --- a/validation_tests/papi-rocm/run_rocm_tests.sh +++ b/validation_tests/papi-rocm/run_rocm_tests.sh @@ -1,12 +1,12 @@ #!/bin/bash echo "=== Run rocm test in sampling mode" -ROCP_HSA_INTERCEPT=0 LD_LIBRARY_PATH=$PAPI_ROOT/lib ./single_monitor -ROCP_HSA_INTERCEPT=0 LD_LIBRARY_PATH=$PAPI_ROOT/lib ./multi_monitor --threads=1 -ROCP_HSA_INTERCEPT=0 LD_LIBRARY_PATH=$PAPI_ROOT/lib ./multi_monitor --threads=2 -ROCP_HSA_INTERCEPT=0 LD_LIBRARY_PATH=$PAPI_ROOT/lib ./overflow +ROCP_HSA_INTERCEPT=0 LD_LIBRARY_PATH=$PAPI_ROOT/lib:$LD_LIBRARY_PATH echo "single_monitor" && ./single_monitor +ROCP_HSA_INTERCEPT=0 LD_LIBRARY_PATH=$PAPI_ROOT/lib:$LD_LIBRARY_PATH echo "multi_monitor" && ./multi_monitor --threads=1 +ROCP_HSA_INTERCEPT=0 LD_LIBRARY_PATH=$PAPI_ROOT/lib:$LD_LIBRARY_PATH echo "multi_monitor" &&./multi_monitor --threads=2 +ROCP_HSA_INTERCEPT=0 LD_LIBRARY_PATH=$PAPI_ROOT/lib:$LD_LIBRARY_PATH echo "overflow" && ./overflow echo "=== Run rocm test in intercept mode" -ROCP_HSA_INTERCEPT=1 LD_LIBRARY_PATH=$PAPI_ROOT/lib ./single_monitor -ROCP_HSA_INTERCEPT=1 LD_LIBRARY_PATH=$PAPI_ROOT/lib ./multi_monitor --threads=1 -ROCP_HSA_INTERCEPT=1 LD_LIBRARY_PATH=$PAPI_ROOT/lib ./multi_monitor --threads=2 +ROCP_HSA_INTERCEPT=1 LD_LIBRARY_PATH=$PAPI_ROOT/lib:$LD_LIBRARY_PATH echo "single_monitor" && ./single_monitor +ROCP_HSA_INTERCEPT=1 LD_LIBRARY_PATH=$PAPI_ROOT/lib:$LD_LIBRARY_PATH echo "multi_monitor" && ./multi_monitor --threads=1 +ROCP_HSA_INTERCEPT=1 LD_LIBRARY_PATH=$PAPI_ROOT/lib:$LD_LIBRARY_PATH echo "multi_monitor" && ./multi_monitor --threads=2 diff --git a/validation_tests/papi-rocm/setup.sh b/validation_tests/papi-rocm/setup.sh index 50ef2bff..59294a2b 100644 --- a/validation_tests/papi-rocm/setup.sh +++ b/validation_tests/papi-rocm/setup.sh @@ -1,3 +1,3 @@ #!/bin/bash . ../../setup.sh -spackLoadUnique papi+rocm@6.0.0.2: +spackLoadUnique papi+rocm@master: