Skip to content

Commit

Permalink
Debugging OMPT target offload with latest Intel OneAPI
Browse files — browse the repository at this point in the history
  • Loading branch information
khuck committed May 9, 2022
1 parent 8c4a961 commit 43b3d43
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 27 deletions.
42 changes: 29 additions & 13 deletions src/apex/apex_ompt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,10 @@ static void print_record_ompt(ompt_record_ompt_t *rec) {
target_data_op_rec.end_time, node, tt);
store_counter_data("OpenMP Target DataOp", "Bytes", target_data_op_rec.end_time,
target_data_op_rec.bytes, node);
// converting from B/us to MB/s
double bw = (target_data_op_rec.bytes) / (target_data_op_rec.end_time - rec->time);
store_counter_data("OpenMP Target DataOp", "BW (MB/s)", target_data_op_rec.end_time,
bw, node);
break;
}
case ompt_callback_target_submit:
Expand Down Expand Up @@ -807,12 +811,18 @@ extern "C" void apex_target (
APEX_UNUSED(device_num);
APEX_UNUSED(target_id);
if (!enabled) { return; }
static std::unordered_map<ompt_id_t, ompt_data_t*> target_map;
DEBUG_PRINT("Callback Target:\n"
"\ttarget_id=%lu kind=%d endpoint=%d device_num=%d code=%p,\n"
"\ttask_data->value=%" PRId64 ", task_data->ptr=%p\n",
target_id, kind, endpoint, device_num, codeptr_ra,
task_data->value, task_data->ptr);
task_data == nullptr ? 0 : task_data->value, task_data == nullptr ? nullptr : task_data->ptr);
if (endpoint == ompt_scope_begin) {
if (task_data == nullptr) {
task_data = new ompt_data_t;
task_data->value = 0;
task_data->ptr = nullptr;
}
char regionIDstr[128] = {0};
if (codeptr_ra != nullptr) {
sprintf(regionIDstr, "OpenMP Target: UNRESOLVED ADDR %p",
Expand All @@ -822,7 +832,12 @@ extern "C" void apex_target (
sprintf(regionIDstr, "OpenMP Target");
apex_ompt_start(regionIDstr, task_data, nullptr, true);
}
target_map[target_id] = task_data;
} else {
if (task_data == nullptr) {
task_data = target_map[target_id];
target_map.erase(target_id);
}
// save a copy of the task wrapper
std::shared_ptr<apex::task_wrapper> tw = ((linked_timer*)(task_data->ptr))->tw;
Globals::insert_timer(target_id, tw);
Expand Down Expand Up @@ -1136,18 +1151,18 @@ extern "C" void apex_sync_region_wait (
* barrier, waiting for the next parallel region where they
* are needed. So... should that barrier be associated with
* the previous parallel region, or just be anonymous? */
#if 0

/* If OpenMP doesn't give us a codeptr, use the one from the
* parent, if it has one */
if (codeptr_ra == nullptr) {
if (codeptr_ra == nullptr &&
apex::apex_options::ompt_high_overhead_events()) {
if (parallel_data != nullptr && parallel_data->ptr != nullptr) {
linked_timer* parent = (linked_timer*)(parallel_data->ptr);
if (parent->codeptr != nullptr) {
local_codeptr = (void*)parent->codeptr;
}
}
}
#endif
char regionIDstr[128] = {0};
if (local_codeptr != nullptr) {
sprintf(regionIDstr, "OpenMP %s: UNRESOLVED ADDR %p", tmp_str,
Expand Down Expand Up @@ -1327,10 +1342,11 @@ extern "C" void apex_ompt_sync_region (
* barrier, waiting for the next parallel region where they
* are needed. So... should that barrier be associated with
* the previous parallel region, or just be anonymous? */
#if 0
#if 1
/* If OpenMP doesn't give us a codeptr, use the one from the
* parent, if it has one */
if (codeptr_ra == nullptr) {
if (codeptr_ra == nullptr &&
apex::apex_options::ompt_high_overhead_events()) {
if (parallel_data != nullptr && parallel_data->ptr != nullptr) {
linked_timer* parent = (linked_timer*)(parallel_data->ptr);
if (parent->codeptr != nullptr) {
Expand Down Expand Up @@ -1543,32 +1559,32 @@ static int apex_ompt_stop_trace() {
// This function is for checking that the function registration worked.
// NOTE(review): this span is a scraped commit diff with the +/- markers
// stripped — each fprintf line is the pre-commit version and the
// DEBUG_PRINT line immediately after it is the post-commit replacement.
// The commit demotes all registration chatter from unconditional stderr
// output to DEBUG_PRINT (presumably compiled out / gated when debugging
// is disabled — confirm the macro's definition). The trailing
// fflush(stderr) calls were kept even though the fprintf they flushed
// was replaced — harmless, but dead weight once DEBUG_PRINT is a no-op.
int apex_ompt_register(ompt_callbacks_t e, ompt_callback_t c ,
const char * name) {
fprintf(stderr,"Registering OMPT callback %s...",name); fflush(stderr);
DEBUG_PRINT("Registering OMPT callback %s...",name); fflush(stderr);
// Ask the OpenMP runtime to install the callback; the result code
// reports how completely this runtime supports event `e`.
ompt_set_result_t rc = ompt_set_callback(e, c);
switch (rc) {
case ompt_set_error:
// Registration failed outright.
fprintf(stderr,"\n\tFailed to register OMPT callback %s!\n",name);
DEBUG_PRINT("\n\tFailed to register OMPT callback %s!\n",name);
fflush(stderr);
break;
case ompt_set_never:
// Runtime will never dispatch this event.
fprintf(stderr,"\n\tOMPT callback %s never supported by this runtime.\n",name);
DEBUG_PRINT("\n\tOMPT callback %s never supported by this runtime.\n",name);
fflush(stderr);
break;
case ompt_set_impossible:
fprintf(stderr,"\n\tOMPT callback %s impossible from this runtime.\n",name);
DEBUG_PRINT("\n\tOMPT callback %s impossible from this runtime.\n",name);
fflush(stderr);
break;
case ompt_set_sometimes:
// Event may be dispatched only in some circumstances.
fprintf(stderr,"\n\tOMPT callback %s sometimes supported by this runtime.\n",name);
DEBUG_PRINT("\n\tOMPT callback %s sometimes supported by this runtime.\n",name);
fflush(stderr);
break;
case ompt_set_sometimes_paired:
fprintf(stderr,"\n\tOMPT callback %s sometimes paired by this runtime.\n",name);
DEBUG_PRINT("\n\tOMPT callback %s sometimes paired by this runtime.\n",name);
fflush(stderr);
break;
case ompt_set_always:
default:
fprintf(stderr,"success.\n");
DEBUG_PRINT("success.\n");
}
// Always returns 0; callers do not appear to branch on the result here.
return 0;
}
Expand Down
2 changes: 0 additions & 2 deletions src/apex/async_thread_node.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,11 @@ namespace apex {
std::stringstream ss;
ss << "GPU [" << _device << ":" << _stream << "]";
std::string tmp{ss.str()};
printf("Device: %u, Thread: %u, string: %s\n", _device, _stream, tmp.c_str());
return tmp;
}
virtual uint32_t sortable_tid () {
uint32_t tid = ((_device+1) << 28);
tid = tid + _stream;
printf("Device: %u, Thread: %u, sort_index: %u\n", _device, _stream, tid);
return tid;
}
};
Expand Down
2 changes: 1 addition & 1 deletion src/apex/dependency_tree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ double Node::writeNodeJSON(std::ofstream& outfile, double total, size_t indent)
sort(sorted.begin(), sorted.end(), cmp);

// do all the children
double children_total;
double children_total = 0.0;
bool first = true;
for (auto c : sorted) {
if (!first) { outfile << ",\n"; }
Expand Down
2 changes: 1 addition & 1 deletion src/apex/profiler_listener.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ class profiler_listener_globals {
std::vector<int> event_sets; // PAPI event sets
std::vector<size_t> event_set_sizes; // PAPI event set sizes
papi_state thread_papi_state;
profiler_listener_globals() : my_tid(-1), thread_papi_state(papi_suspended) { }
profiler_listener_globals() : my_tid(0), thread_papi_state(papi_suspended) { }
~profiler_listener_globals() { if (my_tid == 0) finalize(); }
};

Expand Down
20 changes: 10 additions & 10 deletions src/openmp/ompt_target_matmult.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,14 @@

#define elem(_m,_i,_j) (_m[((_i)*NRA) + (_j)])

// Allocate a rows*cols matrix on the host and map (alloc-only) a matching
// uninitialized buffer onto the OpenMP target device.
// NOTE(review): scraped diff — the double* lines are pre-commit, the float*
// lines are the post-commit replacement (commit narrows the test matrices
// from double to float).
double* allocateMatrix(int rows, int cols) {
float* allocateMatrix(int rows, int cols) {
// NOTE(review): `i` is never used in this function — dead local.
int i;
// NOTE(review): sizeof(double*)/sizeof(float*) takes the size of a POINTER,
// not the element; almost certainly sizeof(double)/sizeof(float) was
// intended. On LP64 this over-allocates (8 bytes per float element) rather
// than under-allocates, so it is wasteful but not out-of-bounds — still
// worth fixing upstream.
double *matrix = (double*)malloc((sizeof(double*)) * rows * cols);
float *matrix = (float*)malloc((sizeof(float*)) * rows * cols);
#pragma omp target enter data map(alloc:matrix[0:rows*cols])
return matrix;
}

void initialize(double *matrix, int rows, int cols) {
void initialize(float *matrix, int rows, int cols) {
int i,j;
#pragma omp parallel private(i,j) shared(matrix)
{
Expand All @@ -45,7 +45,7 @@ void initialize(double *matrix, int rows, int cols) {
}
}

// Release the device mapping for the matrix, then free the host buffer.
// Must mirror allocateMatrix: the map(delete:...) extent uses the same
// rows*cols the enter-data clause used.
// NOTE(review): scraped diff — double* line is pre-commit, float* line is
// the post-commit replacement.
void freeMatrix(double* matrix, int rows, int cols) {
void freeMatrix(float* matrix, int rows, int cols) {
#pragma omp target exit data map(delete:matrix[0:rows*cols])
free(matrix);
}
Expand All @@ -54,7 +54,7 @@ void freeMatrix(double* matrix, int rows, int cols) {
// compute multiplies a and b and returns the result in c using ijk.
// cols_a and rows_b are the same value
/////////////////////////////////////////////////////////////////////
void compute(double *a, double *b, double *c, int rows_a, int cols_a, int cols_b) {
void compute(float *a, float *b, float *c, int rows_a, int cols_a, int cols_b) {
int i,j,k;
printf("%s\n", __func__);
#pragma omp parallel private(i,j,k) shared(a,b,c)
Expand All @@ -76,7 +76,7 @@ void compute(double *a, double *b, double *c, int rows_a, int cols_a, int cols_b
// compute_interchange multiplies a and b and returns the result in c
// using ikj loop. cols_a and rows_b are the same value
///////////////////////////////////////////////////////////////////////
void compute_interchange(double *a, double *b, double *c, int rows_a, int cols_a, int cols_b) {
void compute_interchange(float *a, float *b, float *c, int rows_a, int cols_a, int cols_b) {
int i,j,k;
printf("%s\n", __func__);
#pragma omp parallel private(i,j,k) shared(a,b,c)
Expand All @@ -98,7 +98,7 @@ void compute_interchange(double *a, double *b, double *c, int rows_a, int cols_a
// compute_interchange multiplies a and b and returns the result in c
// using ikj loop. cols_a and rows_b are the same value
///////////////////////////////////////////////////////////////////////
void compute_target(double *a, double *b, double *c, int rows_a, int cols_a, int cols_b) {
void compute_target(float *a, float *b, float *c, int rows_a, int cols_a, int cols_b) {
printf("%s\n", __func__);
int i, j, k;
#pragma omp target data map (to: a[0:rows_a*cols_a],b[0:cols_a*cols_b]) map (tofrom: c[0:rows_a*cols_b])
Expand All @@ -120,8 +120,8 @@ void compute_target(double *a, double *b, double *c, int rows_a, int cols_a, int
#endif
}

double do_work(void) {
double *a, /* matrix A to be multiplied */
float do_work(void) {
float *a, /* matrix A to be multiplied */
*b, /* matrix B to be multiplied */
*c; /* result matrix C */
a = allocateMatrix(NRA, NCA);
Expand All @@ -138,7 +138,7 @@ double do_work(void) {
// compute_interchange(a, b, c, NRA, NCA, NCB);
compute_target(a, b, c, NRA, NCA, NCB);

double result = elem(c,0,1);
float result = elem(c,0,1);

freeMatrix(a, NRA, NCA);
freeMatrix(b, NCA, NCB);
Expand Down

0 comments on commit 43b3d43

Please sign in to comment.