From cb0537fcf318ad053883860234f9206a36aaa9da Mon Sep 17 00:00:00 2001 From: Cody Tapscott Date: Fri, 19 May 2023 03:46:07 -0400 Subject: [PATCH 1/4] timing: Allocate all timing events dynamically This paves the way for a Julia-side API that can create new events on-the-fly without having to modify timing.h The core of the change is to introduce two different structs: - An "event" stores all of the statically-determined attributes of a profiler event (typically, zone name and source location info) - A "timing block" stores the dynamic information relevant to a particular span/measurement in the timing run Events and timing blocks have a one-to-many relationship. The intended pattern for a Julia-side API is to construct an event once at parse-time using `jl_timing_event_create` (since this is relatively expensive due to profiler traffic and allocations) and then to create its own timing block on-the-fly for each block entry/exit. This also re-factors the API a bit to hopefully be more consistently named --- src/julia.expmap | 1 + src/julia.h | 1 + src/task.c | 12 +-- src/timing.c | 235 +++++++++++++++++++++++++++++++++-------- src/timing.h | 264 +++++++++++++++++------------------------------ 5 files changed, 297 insertions(+), 216 deletions(-) diff --git a/src/julia.expmap b/src/julia.expmap index 447c3c4d8a5f5..ee35997827221 100644 --- a/src/julia.expmap +++ b/src/julia.expmap @@ -30,6 +30,7 @@ _Z24jl_coverage_data_pointerN4llvm9StringRefEi; _Z22jl_coverage_alloc_lineN4llvm9StringRefEi; _Z22jl_malloc_data_pointerN4llvm9StringRefEi; + _jl_timing_*; LLVMExtra*; JLJIT*; llvmGetPassPluginInfo; diff --git a/src/julia.h b/src/julia.h index d214509c7d0b6..2140b0ad0ab90 100644 --- a/src/julia.h +++ b/src/julia.h @@ -1982,6 +1982,7 @@ JL_DLLEXPORT void jl_sigatomic_end(void); // tasks and exceptions ------------------------------------------------------- typedef struct _jl_timing_block_t jl_timing_block_t; +typedef struct _jl_timing_event_t jl_timing_event_t; typedef struct _jl_excstack_t jl_excstack_t; // info describing an exception handler diff --git a/src/task.c b/src/task.c index 477ae481071a0..1dab8688cb079 100644 --- a/src/task.c +++ b/src/task.c @@ -646,7 +646,7 @@ JL_DLLEXPORT void jl_switch(void) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER int finalizers_inhibited = ptls->finalizers_inhibited; ptls->finalizers_inhibited = 0; - jl_timing_block_t *blk = jl_timing_block_exit_task(ct, ptls); + jl_timing_block_t *blk = jl_timing_block_task_exit(ct, ptls); ctx_switch(ct); #ifdef MIGRATE_TASKS @@ -666,7 +666,7 @@ JL_DLLEXPORT void jl_switch(void) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER 0 != ct->ptls && 0 == ptls->finalizers_inhibited); ptls->finalizers_inhibited = finalizers_inhibited; - jl_timing_block_enter_task(ct, ptls, blk); (void)blk; + jl_timing_block_task_enter(ct, ptls, blk); (void)blk; sig_atomic_t other_defer_signal = ptls->defer_signal; ptls->defer_signal = defer_signal; @@ -705,7 +705,7 @@ JL_DLLEXPORT JL_NORETURN void jl_no_exc_handler(jl_value_t *e, jl_task_t *ct) #define pop_timings_stack() \ jl_timing_block_t *cur_block = ptls->timing_stack; \ while (cur_block && eh->timing_stack != cur_block) { \ - cur_block = jl_pop_timing_block(cur_block); \ + cur_block = jl_timing_block_pop(cur_block); \ } \ assert(cur_block == eh->timing_stack); #else @@ -1084,7 +1084,7 @@ JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, jl_value_t *completion t->ptls = NULL; t->world_age = ct->world_age; t->reentrant_timing = 0; - jl_timing_init_task(t); + jl_timing_task_init(t); #ifdef COPY_STACKS if (!t->copy_stack) { @@ -1221,7 +1221,7 @@ CFI_NORETURN ct->started = 1; JL_PROBE_RT_START_TASK(ct); - jl_timing_block_enter_task(ct, ptls, NULL); + jl_timing_block_task_enter(ct, ptls, NULL); if (jl_atomic_load_relaxed(&ct->_isexception)) { record_backtrace(ptls, 0); jl_push_excstack(&ct->excstack, ct->result, @@ -1693,7 +1693,7 @@ jl_task_t *jl_init_root_task(jl_ptls_t ptls, void *stack_lo, void *stack_hi) ct->ctx.asan_fake_stack = NULL; #endif - jl_timing_block_enter_task(ct, ptls, NULL); + jl_timing_block_task_enter(ct, ptls, NULL); #ifdef COPY_STACKS // initialize the base_ctx from which all future copy_stacks will be copies diff --git a/src/timing.c b/src/timing.c index 9a1af1bdf0ac1..8af101c0f8026 100644 --- a/src/timing.c +++ b/src/timing.c @@ -24,24 +24,21 @@ extern "C" { static uint64_t t0; -JL_DLLEXPORT _Atomic(uint64_t) jl_timing_disable_mask[(JL_TIMING_LAST + sizeof(uint64_t) * CHAR_BIT - 1) / (sizeof(uint64_t) * CHAR_BIT)]; +JL_DLLEXPORT _Atomic(uint64_t) jl_timing_disable_mask[(JL_TIMING_SUBSYSTEM_LAST + sizeof(uint64_t) * CHAR_BIT - 1) / (sizeof(uint64_t) * CHAR_BIT)]; -JL_DLLEXPORT _Atomic(uint64_t) jl_timing_self_counts[(int)JL_TIMING_EVENT_LAST]; -JL_DLLEXPORT _Atomic(uint64_t) jl_timing_full_counts[(int)JL_TIMING_EVENT_LAST]; +static arraylist_t jl_timing_counts_events; // Used to as an item limit when several strings of metadata can // potentially be associated with a single timing zone. JL_DLLEXPORT uint32_t jl_timing_print_limit = 10; -static const char *jl_timing_names[(int)JL_TIMING_EVENT_LAST] = +const char *jl_timing_subsystems[(int)JL_TIMING_SUBSYSTEM_LAST] = { #define X(name) #name, - JL_TIMING_EVENTS + JL_TIMING_SUBSYSTEMS #undef X }; -static int jl_timing_names_sorted[(int)JL_TIMING_EVENT_LAST]; - JL_DLLEXPORT jl_timing_counter_t jl_timing_counters[JL_TIMING_COUNTER_LAST]; void jl_print_timings(void) @@ -49,19 +46,25 @@ void jl_print_timings(void) #ifdef USE_TIMING_COUNTS uint64_t total_time = cycleclock() - t0; uint64_t root_time = total_time; - for (int i = 0; i < JL_TIMING_EVENT_LAST; i++) { - root_time -= jl_atomic_load_relaxed(jl_timing_self_counts + i); + for (int i = 0; i < jl_timing_counts_events.len; i++) { + jl_timing_counts_event_t *other_event = (jl_timing_counts_event_t *)jl_timing_counts_events.items[i]; + root_time -= jl_atomic_load_relaxed(&other_event->self); } - jl_atomic_store_relaxed(jl_timing_self_counts + JL_TIMING_ROOT, root_time); - jl_atomic_store_relaxed(jl_timing_full_counts + JL_TIMING_ROOT, total_time); + jl_timing_counts_event_t *root_event = (jl_timing_counts_event_t *)jl_timing_counts_events.items[0]; + jl_atomic_store_relaxed(&root_event->self, root_time); + jl_atomic_store_relaxed(&root_event->total, total_time); + fprintf(stderr, "\nJULIA TIMINGS\n"); fprintf(stderr, "%-25s, %-30s, %-30s\n", "Event", "Self Cycles (% of Total)", "Total Cycles (% of Total)"); - for (int i = 0; i < JL_TIMING_EVENT_LAST; i++) { - int j = jl_timing_names_sorted[i]; - uint64_t self = jl_atomic_load_relaxed(jl_timing_self_counts + j); - uint64_t total = jl_atomic_load_relaxed(jl_timing_full_counts + j); + for (int i = 0; i < jl_timing_counts_events.len; i++) { + jl_timing_counts_event_t *event = (jl_timing_counts_event_t *)jl_timing_counts_events.items[i]; + uint64_t self = jl_atomic_load_relaxed(&event->self); + uint64_t total = jl_atomic_load_relaxed(&event->total); if (total != 0) - fprintf(stderr, "%-25s, %20" PRIu64 " (%5.2f %%), %20" PRIu64 " (%5.2f %%)\n", jl_timing_names[j], self, 100 * (((double)self) / total_time), total, 100 * (((double)total) / total_time)); + fprintf(stderr, "%-25s, %20" PRIu64 " (%5.2f %%), %20" PRIu64 " (%5.2f %%)\n", + event->name, + self, 100 * (((double)self) / total_time), + total, 100 * (((double)total) / total_time)); } fprintf(stderr, "\nJULIA COUNTERS\n"); @@ -77,22 +80,32 @@ void jl_print_timings(void) #endif } -int cmp_names(const void *a, const void *b) { - int ia = *(const int*)a; - int ib = *(const int*)b; - return strcmp(jl_timing_names[ia], jl_timing_names[ib]); +static const int indirect_strcmp(const void *a, const void *b) { + return strcmp(*(const char **)a, *(const char **)b); } void jl_init_timing(void) { t0 = cycleclock(); - _Static_assert((int)JL_TIMING_LAST <= (int)JL_TIMING_EVENT_LAST, "More owners than events!"); + _Static_assert(JL_TIMING_SUBSYSTEM_LAST < sizeof(uint64_t) * CHAR_BIT, "Too many timing subsystems!"); - for (int i = 0; i < JL_TIMING_EVENT_LAST; i++) { - jl_timing_names_sorted[i] = i; - } - qsort(jl_timing_names_sorted, JL_TIMING_EVENT_LAST, sizeof(int), cmp_names); +#ifdef USE_TIMING_COUNTS + // Create events list for counts backend + arraylist_new(&jl_timing_counts_events, 1); + + jl_timing_counts_event_t *new_event = (jl_timing_counts_event_t *)malloc(sizeof(jl_timing_counts_event_t)); + arraylist_push(&jl_timing_counts_events, (void *)new_event); + + new_event->name = "ROOT"; + jl_atomic_store_relaxed(&new_event->total, 0); +#endif + + // Sort the subsystem names for quick enable/disable lookups + qsort( + jl_timing_subsystems, JL_TIMING_SUBSYSTEM_LAST, + sizeof(const char *), indirect_strcmp + ); int i __attribute__((unused)) = 0; #ifdef USE_ITTAPI @@ -142,18 +155,157 @@ void jl_destroy_timing(void) jl_ptls_t ptls = jl_current_task->ptls; jl_timing_block_t *stack = ptls->timing_stack; while (stack) { - _jl_timing_block_destroy(stack); + jl_timing_block_end(stack); stack = stack->prev; } } -jl_timing_block_t *jl_pop_timing_block(jl_timing_block_t *cur_block) +static const int get_timing_subsystem(const char *subsystem) { + const char **match = (const char **)bsearch( + &subsystem, jl_timing_subsystems, JL_TIMING_SUBSYSTEM_LAST, + sizeof(const char *), indirect_strcmp + ); + if (!match) + return JL_TIMING_SUBSYSTEM_LAST; + + return (int)(match - &jl_timing_subsystems[0]); +} + +#ifdef USE_TIMING_COUNTS + +// This function is analogous to __itt_event_create but for the counts backend +// +// `event` is required to live forever +static jl_timing_counts_event_t *_jl_timing_counts_event_create(const char *event) { + const size_t n = jl_timing_counts_events.len; + for (size_t i = 0; i < n; i++) { + jl_timing_counts_event_t *other_event = (jl_timing_counts_event_t *)jl_timing_counts_events.items[i]; + if (strcmp(event, other_event->name) == 0) + return other_event; + } + + // No matching event found - create a new one + jl_timing_counts_event_t *new_event = (jl_timing_counts_event_t *)malloc(sizeof(jl_timing_counts_event_t)); + arraylist_push(&jl_timing_counts_events, (void *)new_event); + new_event->name = event; + jl_atomic_store_relaxed(&new_event->total, 0); + return new_event; +} + +STATIC_INLINE void _jl_timing_counts_pause(jl_timing_counts_t *block, uint64_t t) JL_NOTSAFEPOINT { +#ifdef JL_DEBUG_BUILD + assert(block->running); + block->running = 0; +#endif + block->total += t - block->start; +} + +STATIC_INLINE void _jl_timing_counts_resume(jl_timing_counts_t *block, uint64_t t) JL_NOTSAFEPOINT { +#ifdef JL_DEBUG_BUILD + assert(!block->running); + block->running = 1; +#endif + block->start = t; +} + +STATIC_INLINE void _jl_timing_counts_start(jl_timing_counts_t *block, uint64_t t) JL_NOTSAFEPOINT { + block->total = 0; + block->start = t; + block->t0 = t; +#ifdef JL_DEBUG_BUILD + block->running = 1; +#endif +} + +STATIC_INLINE void _jl_timing_counts_stop(jl_timing_block_t *block, uint64_t t) JL_NOTSAFEPOINT { +#ifdef JL_DEBUG_BUILD + assert(block->counts_ctx.running); + block->counts_ctx.running = 0; +#endif + jl_timing_counts_event_t *event = block->event->counts_event; + block->counts_ctx.total += t - block->counts_ctx.start; + jl_atomic_fetch_add_relaxed(&event->self, block->counts_ctx.total); + jl_atomic_fetch_add_relaxed(&event->total, t - block->counts_ctx.t0); +} + +#endif // USE_TIMING_COUNTS + +JL_DLLEXPORT jl_timing_event_t *_jl_timing_event_create(const char *subsystem, const char *name, const char *function, const char *file, int line, int color) { + int maybe_subsystem = get_timing_subsystem(subsystem); + if (maybe_subsystem >= JL_TIMING_SUBSYSTEM_LAST) { + jl_errorf("invalid timing subsystem name: %s", subsystem); + return NULL; + } + + jl_timing_event_t *event = (jl_timing_event_t *) malloc(sizeof(jl_timing_event_t)); + event->subsystem = maybe_subsystem; + +#ifdef USE_TIMING_COUNTS + event->counts_event = _jl_timing_counts_event_create(name); +#endif // USE_TIMING_COUNTS + +#ifdef USE_TRACY + event->tracy_srcloc.name = name; + event->tracy_srcloc.function = function; + event->tracy_srcloc.file = file; + event->tracy_srcloc.line = line; + event->tracy_srcloc.color = color; +#endif // USE_TRACY + +#ifdef USE_ITTAPI + event->ittapi_event = __itt_event_create(name, strlen(name)); +#endif // USE_ITTAPI + + return event; +} + +JL_DLLEXPORT void _jl_timing_block_init(jl_timing_block_t *block, jl_timing_event_t *event) { + memset(block, 0, sizeof(*block)); + block->event = event; +} + +JL_DLLEXPORT void _jl_timing_block_start(jl_timing_block_t *block) { + assert(!block->is_running); + if (!_jl_timing_enabled(block->event->subsystem)) return; + + uint64_t t = cycleclock(); (void)t; + _COUNTS_START(&block->counts_ctx, t); + _ITTAPI_START(block); + _TRACY_START(block); + + jl_timing_block_t **prevp = &jl_current_task->ptls->timing_stack; + block->prev = *prevp; + block->is_running = 1; + if (block->prev) { + _COUNTS_PAUSE(&block->prev->counts_ctx, t); + } + *prevp = block; +} + +JL_DLLEXPORT void _jl_timing_block_end(jl_timing_block_t *block) { + if (block->is_running) { + uint64_t t = cycleclock(); (void)t; + _ITTAPI_STOP(block); + _TRACY_STOP(block->tracy_ctx); + _COUNTS_STOP(block, t); + + jl_task_t *ct = jl_current_task; + jl_timing_block_t **pcur = &ct->ptls->timing_stack; + assert(*pcur == block); + *pcur = block->prev; + if (block->prev) { + _COUNTS_RESUME(&block->prev->counts_ctx, t); + } + } +} + +jl_timing_block_t *jl_timing_block_pop(jl_timing_block_t *cur_block) { - _jl_timing_block_destroy(cur_block); + jl_timing_block_end(cur_block); return cur_block->prev; } -void jl_timing_block_enter_task(jl_task_t *ct, jl_ptls_t ptls, jl_timing_block_t *prev_blk) +void jl_timing_block_task_enter(jl_task_t *ct, jl_ptls_t ptls, jl_timing_block_t *prev_blk) { if (prev_blk != NULL) { assert(ptls->timing_stack == NULL); @@ -171,7 +323,7 @@ void jl_timing_block_enter_task(jl_task_t *ct, jl_ptls_t ptls, jl_timing_block_t #endif } -jl_timing_block_t *jl_timing_block_exit_task(jl_task_t *ct, jl_ptls_t ptls) +jl_timing_block_t *jl_timing_block_task_exit(jl_task_t *ct, jl_ptls_t ptls) { #ifdef USE_TRACY // Tracy is fairly strict about not leaving a fiber that hasn't @@ -323,7 +475,7 @@ JL_DLLEXPORT void jl_timing_puts(jl_timing_block_t *cur_block, const char *str) #endif } -void jl_timing_init_task(jl_task_t *t) +void jl_timing_task_init(jl_task_t *t) { #ifdef USE_TRACY jl_value_t *start_type = jl_typeof(t->start); @@ -359,20 +511,12 @@ void jl_timing_init_task(jl_task_t *t) #endif } -int cmp_name_idx(const void *name, const void *idx) { - return strcmp((const char *)name, jl_timing_names[*(const int *)idx]); -} - JL_DLLEXPORT int jl_timing_set_enable(const char *subsystem, uint8_t enabled) { - const int *idx = (const int *)bsearch(subsystem, jl_timing_names_sorted, JL_TIMING_EVENT_LAST, sizeof(int), cmp_name_idx); - if (idx == NULL) - return -1; - int i = *idx; - // sorted names include events, so skip if we're looking at an event instead of a subsystem - // events are always at least JL_TIMING_LAST - if (i >= JL_TIMING_LAST) + int i = get_timing_subsystem(subsystem); + if (i >= JL_TIMING_SUBSYSTEM_LAST) return -1; + uint64_t subsystem_bit = 1ul << (i % (sizeof(uint64_t) * CHAR_BIT)); if (enabled) { jl_atomic_fetch_and_relaxed(jl_timing_disable_mask + (i / (sizeof(uint64_t) * CHAR_BIT)), ~subsystem_bit); @@ -445,6 +589,13 @@ void jl_timing_apply_env(void) void jl_init_timing(void) { } void jl_destroy_timing(void) { } + +JL_DLLEXPORT jl_timing_event_t *_jl_timing_event_create(const char *subsystem, const char *name, const char *function, const char *file, int line, int color) { return NULL; } + +JL_DLLEXPORT void _jl_timing_block_init(jl_timing_block_t *block, jl_timing_event_t *event) { } +JL_DLLEXPORT void _jl_timing_block_start(jl_timing_block_t *block) { } +JL_DLLEXPORT void _jl_timing_block_end(jl_timing_block_t *block) { } + JL_DLLEXPORT int jl_timing_set_enable(const char *subsystem, uint8_t enabled) { return -1; } JL_DLLEXPORT uint32_t jl_timing_print_limit = 0; diff --git a/src/timing.h b/src/timing.h index 4f94bbc3050cf..5ec6ea1fc33f8 100644 --- a/src/timing.h +++ b/src/timing.h @@ -31,7 +31,7 @@ void jl_init_timing(void); void jl_destroy_timing(void) JL_NOTSAFEPOINT; // Update the enable bit-mask to enable/disable tracing events for -// the subsystem in `jl_timing_names` matching the provided string. +// the subsystem in `jl_timing_subsystems` matching the provided string. // // Returns -1 if no matching sub-system was found. JL_DLLEXPORT int jl_timing_set_enable(const char *subsystem, uint8_t enabled); @@ -51,6 +51,11 @@ void jl_timing_apply_env(void); // when adding potentially many items of metadata to a single timing zone. extern JL_DLLEXPORT uint32_t jl_timing_print_limit; +JL_DLLEXPORT jl_timing_event_t *_jl_timing_event_create(const char *subsystem, const char *name, const char *function, const char *file, int line, int color); +JL_DLLEXPORT void _jl_timing_block_init(jl_timing_block_t *block, jl_timing_event_t *event); +JL_DLLEXPORT void _jl_timing_block_start(jl_timing_block_t *cur_block); +JL_DLLEXPORT void _jl_timing_block_end(jl_timing_block_t *cur_block); + #ifdef __cplusplus } #endif @@ -82,11 +87,13 @@ extern JL_DLLEXPORT uint32_t jl_timing_print_limit; #define jl_timing_show_macro(macro, lno, mod, b) #define jl_timing_printf(b, f, ...) #define jl_timing_puts(b, s) -#define jl_timing_init_task(t) +#define jl_timing_task_init(t) +#define jl_timing_event_create(blk) +#define jl_timing_block_init(blk) #define jl_timing_block_start(blk) -#define jl_timing_block_enter_task(ct, ptls, blk) -#define jl_timing_block_exit_task(ct, ptls) ((jl_timing_block_t *)NULL) -#define jl_pop_timing_block(blk) +#define jl_timing_block_task_enter(ct, ptls, blk) +#define jl_timing_block_task_exit(ct, ptls) ((jl_timing_block_t *)NULL) +#define jl_timing_block_pop(blk) #define jl_timing_counter_inc(counter, value) #define jl_timing_counter_dec(counter, value) @@ -102,6 +109,7 @@ extern JL_DLLEXPORT uint32_t jl_timing_print_limit; #include "julia_assert.h" #ifdef USE_TRACY #include "tracy/TracyC.h" +typedef struct ___tracy_source_location_data TracySrcLocData; #endif #ifdef USE_ITTAPI @@ -112,11 +120,11 @@ extern JL_DLLEXPORT uint32_t jl_timing_print_limit; extern "C" { #endif void jl_print_timings(void); -jl_timing_block_t *jl_pop_timing_block(jl_timing_block_t *cur_block); -void jl_timing_init_task(jl_task_t *t); -void jl_timing_block_enter_task(jl_task_t *ct, jl_ptls_t ptls, jl_timing_block_t *prev_blk); -jl_timing_block_t *jl_timing_block_exit_task(jl_task_t *ct, jl_ptls_t ptls); +void jl_timing_task_init(jl_task_t *t); +void jl_timing_block_task_enter(jl_task_t *ct, jl_ptls_t ptls, jl_timing_block_t *prev_blk); +jl_timing_block_t *jl_timing_block_task_exit(jl_task_t *ct, jl_ptls_t ptls); +jl_timing_block_t *jl_timing_block_pop(jl_timing_block_t *cur_block); // Add the output of `jl_static_show(x)` as a text annotation to the // profiling region corresponding to `cur_block`. @@ -133,6 +141,11 @@ JL_DLLEXPORT void jl_timing_show_macro(jl_method_instance_t *macro, jl_value_t* JL_DLLEXPORT void jl_timing_printf(jl_timing_block_t *cur_block, const char *format, ...); JL_DLLEXPORT void jl_timing_puts(jl_timing_block_t *cur_block, const char *str); +#define jl_timing_event_create(subsystem, name, function, file, line, color) _jl_timing_event_create(subsystem, name, function, file, line, color) +#define jl_timing_block_init(blk, evt) _jl_timing_block_start(blk, evt) +#define jl_timing_block_start(blk) _jl_timing_block_start(blk) +#define jl_timing_block_end(blk) _jl_timing_block_end(blk) + #ifdef __cplusplus } #endif @@ -172,29 +185,6 @@ JL_DLLEXPORT void jl_timing_puts(jl_timing_block_t *cur_block, const char *str); X(JULIA_INIT) \ -#define JL_TIMING_EVENTS \ - JL_TIMING_SUBSYSTEMS \ - X(GC_Stop) \ - X(GC_Mark) \ - X(GC_FullSweep) \ - X(GC_IncrementalSweep) \ - X(GC_Finalizers) \ - X(CODEGEN_LLVM) \ - X(CODEGEN_Codeinst) \ - X(CODEGEN_Workqueue) \ - X(LOAD_Require) \ - X(LOAD_Sysimg) \ - X(LOAD_Pkgimg) \ - X(LOAD_Processor) \ - X(VERIFY_Edges) \ - X(VERIFY_Methods) \ - X(VERIFY_Graph) \ - X(STACKWALK_Backtrace) \ - X(STACKWALK_Excstack) \ - X(NATIVE_Dump) \ - X(NATIVE_Create) \ - - #define JL_TIMING_COUNTERS \ X(Invalidations) \ X(HeapSize) \ @@ -204,18 +194,11 @@ JL_DLLEXPORT void jl_timing_puts(jl_timing_block_t *cur_block, const char *str); X(ImageSize) \ -enum jl_timing_subsystems { +enum jl_timing_subsystem { #define X(name) JL_TIMING_ ## name, JL_TIMING_SUBSYSTEMS #undef X - JL_TIMING_LAST -}; - -enum jl_timing_events { -#define X(name) JL_TIMING_EVENT_ ## name, - JL_TIMING_EVENTS -#undef X - JL_TIMING_EVENT_LAST + JL_TIMING_SUBSYSTEM_LAST }; enum jl_timing_counter_types { @@ -232,16 +215,33 @@ enum jl_timing_counter_types { * Timing Backend: Aggregated timing counts (implemented in timing.c) **/ +typedef struct jl_timing_counts_event_t { + const char *name; + _Atomic(uint64_t) self; + _Atomic(uint64_t) total; +} jl_timing_counts_event_t; + +typedef struct _jl_timing_counts_t { + uint64_t total; + uint64_t start; + uint64_t t0; +#ifdef JL_DEBUG_BUILD + uint8_t running; +#endif +} jl_timing_counts_t; + #ifdef USE_TIMING_COUNTS -#define _COUNTS_CTX_MEMBER jl_timing_counts_t counts_ctx; -#define _COUNTS_START(block, t) _jl_timing_counts_start(block, t) -#define _COUNTS_STOP(block, event, t) _jl_timing_counts_stop(block, event, t) -#define _COUNTS_PAUSE(block, t) _jl_timing_counts_pause(block, t) -#define _COUNTS_RESUME(block, t) _jl_timing_counts_resume(block, t) +#define _COUNTS_EVENT_MEMBER jl_timing_counts_event_t *counts_event; +#define _COUNTS_BLOCK_MEMBER jl_timing_counts_t counts_ctx; +#define _COUNTS_START(block, t) _jl_timing_counts_start(block, t) +#define _COUNTS_STOP(block, t) _jl_timing_counts_stop(block, t) +#define _COUNTS_PAUSE(block, t) _jl_timing_counts_pause(block, t) +#define _COUNTS_RESUME(block, t) _jl_timing_counts_resume(block, t) #else -#define _COUNTS_CTX_MEMBER +#define _COUNTS_EVENT_MEMBER +#define _COUNTS_BLOCK_MEMBER #define _COUNTS_START(block, t) -#define _COUNTS_STOP(block, event, t) +#define _COUNTS_STOP(block, t) #define _COUNTS_PAUSE(block, t) #define _COUNTS_RESUME(block, t) #endif @@ -251,15 +251,13 @@ enum jl_timing_counter_types { **/ #ifdef USE_TRACY -#define _TRACY_CTX_MEMBER TracyCZoneCtx tracy_ctx; const struct ___tracy_source_location_data *tracy_srcloc; -#define _TRACY_CTOR(block, name) static const struct ___tracy_source_location_data TIMING_CONCAT(__tracy_source_location,__LINE__) = { name, __func__, TracyFile, (uint32_t)__LINE__, 0 }; \ - (block)->tracy_srcloc = &TIMING_CONCAT(__tracy_source_location,__LINE__); \ - (block)->tracy_ctx.active = 0 -#define _TRACY_START(block) (block)->tracy_ctx = ___tracy_emit_zone_begin( (block)->tracy_srcloc, 1 ); -#define _TRACY_STOP(ctx) TracyCZoneEnd(ctx) +#define _TRACY_EVENT_MEMBER TracySrcLocData tracy_srcloc; +#define _TRACY_BLOCK_MEMBER TracyCZoneCtx tracy_ctx; +#define _TRACY_START(block) (block)->tracy_ctx = ___tracy_emit_zone_begin( &(block)->event->tracy_srcloc, 1 ); +#define _TRACY_STOP(ctx) TracyCZoneEnd(ctx) #else -#define _TRACY_CTX_MEMBER -#define _TRACY_CTOR(block, name) +#define _TRACY_EVENT_MEMBER +#define _TRACY_BLOCK_MEMBER #define _TRACY_START(block) #define _TRACY_STOP(ctx) #endif @@ -269,133 +267,60 @@ enum jl_timing_counter_types { **/ #ifdef USE_ITTAPI -#define _ITTAPI_CTX_MEMBER __itt_event ittapi_event; -#define _ITTAPI_CTOR(block, name) static __itt_event TIMING_CONCAT(__itt_event,__LINE__) = INT_MAX; \ - if (TIMING_CONCAT(__itt_event,__LINE__) == INT_MAX) \ - TIMING_CONCAT(__itt_event,__LINE__) = __itt_event_create(name, strlen(name)); \ - (block)->ittapi_event = TIMING_CONCAT(__itt_event,__LINE__) -#define _ITTAPI_START(block) __itt_event_start((block)->ittapi_event) -#define _ITTAPI_STOP(block) __itt_event_end((block)->ittapi_event) +#define _ITTAPI_EVENT_MEMBER __itt_event ittapi_event; +#define _ITTAPI_BLOCK_MEMBER +#define _ITTAPI_START(block) __itt_event_start((block)->event->ittapi_event) +#define _ITTAPI_STOP(block) __itt_event_end((block)->event->ittapi_event) #else -#define _ITTAPI_CTX_MEMBER -#define _ITTAPI_CTOR(block, name) +#define _ITTAPI_EVENT_MEMBER +#define _ITTAPI_BLOCK_MEMBER #define _ITTAPI_START(block) #define _ITTAPI_STOP(block) #endif /** - * Implementation: Aggregated counts back-end + * Top-level jl_timing implementation **/ -extern JL_DLLEXPORT _Atomic(uint64_t) jl_timing_self_counts[(int)JL_TIMING_EVENT_LAST]; -extern JL_DLLEXPORT _Atomic(uint64_t) jl_timing_full_counts[(int)JL_TIMING_EVENT_LAST]; -typedef struct _jl_timing_counts_t { - uint64_t total; - uint64_t start; - uint64_t t0; -#ifdef JL_DEBUG_BUILD - uint8_t running; -#endif -} jl_timing_counts_t; - -STATIC_INLINE void _jl_timing_counts_pause(jl_timing_counts_t *block, uint64_t t) JL_NOTSAFEPOINT { -#ifdef JL_DEBUG_BUILD - assert(block->running); - block->running = 0; -#endif - block->total += t - block->start; -} - -STATIC_INLINE void _jl_timing_counts_resume(jl_timing_counts_t *block, uint64_t t) JL_NOTSAFEPOINT { -#ifdef JL_DEBUG_BUILD - assert(!block->running); - block->running = 1; -#endif - block->start = t; -} - -STATIC_INLINE void _jl_timing_counts_start(jl_timing_counts_t *block, uint64_t t) JL_NOTSAFEPOINT { - block->total = 0; - block->start = t; - block->t0 = t; -#ifdef JL_DEBUG_BUILD - block->running = 1; -#endif -} - -STATIC_INLINE void _jl_timing_counts_stop(jl_timing_counts_t *block, int event, uint64_t t) JL_NOTSAFEPOINT { -#ifdef JL_DEBUG_BUILD - assert(block->running); - block->running = 0; -#endif - block->total += t - block->start; - jl_atomic_fetch_add_relaxed(jl_timing_self_counts + event, block->total); - jl_atomic_fetch_add_relaxed(jl_timing_full_counts + event, t - block->t0); -} +extern JL_DLLEXPORT _Atomic(uint64_t) jl_timing_disable_mask[(JL_TIMING_SUBSYSTEM_LAST + sizeof(uint64_t) * CHAR_BIT - 1) / (sizeof(uint64_t) * CHAR_BIT)]; +extern const char *jl_timing_subsystems[(int)JL_TIMING_SUBSYSTEM_LAST]; /** - * Top-level jl_timing implementation + * Stores all static attributes associated with a profiling event. + * + * A single event can be used to create many timing blocks with + * the same name/source information. **/ +struct _jl_timing_event_t { // typedef in julia.h + _TRACY_EVENT_MEMBER + _ITTAPI_EVENT_MEMBER + _COUNTS_EVENT_MEMBER -extern JL_DLLEXPORT _Atomic(uint64_t) jl_timing_disable_mask[(JL_TIMING_LAST + sizeof(uint64_t) * CHAR_BIT - 1) / (sizeof(uint64_t) * CHAR_BIT)]; + int subsystem; +}; +/** + * Stores all dynamic attributes associated with a timing block. + * + * Every time the application enters an instrumented block of code, + * a new timing block is created. A timing block corresponds to one + * "span" of time in the profiler. + **/ struct _jl_timing_block_t { // typedef in julia.h struct _jl_timing_block_t *prev; + jl_timing_event_t *event; - _TRACY_CTX_MEMBER - _ITTAPI_CTX_MEMBER - _COUNTS_CTX_MEMBER + _TRACY_BLOCK_MEMBER + _ITTAPI_BLOCK_MEMBER + _COUNTS_BLOCK_MEMBER - int subsystem; - int event; - int8_t is_running; + uint8_t is_running; }; STATIC_INLINE int _jl_timing_enabled(int subsystem) JL_NOTSAFEPOINT { return (jl_atomic_load_relaxed(jl_timing_disable_mask + subsystem / (sizeof(uint64_t) * CHAR_BIT)) & (1 << (subsystem % (sizeof(uint64_t) * CHAR_BIT)))) == 0; } -STATIC_INLINE void jl_timing_block_start(jl_timing_block_t *block) { - assert(!block->is_running); - if (!_jl_timing_enabled(block->subsystem)) return; - - uint64_t t = cycleclock(); (void)t; - _COUNTS_START(&block->counts_ctx, t); - _ITTAPI_START(block); - _TRACY_START(block); - - jl_timing_block_t **prevp = &jl_current_task->ptls->timing_stack; - block->prev = *prevp; - block->is_running = 1; - if (block->prev) { - _COUNTS_PAUSE(&block->prev->counts_ctx, t); - } - *prevp = block; -} - -STATIC_INLINE void _jl_timing_block_ctor(jl_timing_block_t *block, int subsystem, int event) JL_NOTSAFEPOINT { - block->subsystem = subsystem; - block->event = event; - block->is_running = 0; -} - -STATIC_INLINE void _jl_timing_block_destroy(jl_timing_block_t *block) JL_NOTSAFEPOINT { - if (block->is_running) { - uint64_t t = cycleclock(); (void)t; - _ITTAPI_STOP(block); - _TRACY_STOP(block->tracy_ctx); - _COUNTS_STOP(&block->counts_ctx, block->event, cycleclock()); - - jl_task_t *ct = jl_current_task; - jl_timing_block_t **pcur = &ct->ptls->timing_stack; - assert(*pcur == block); - *pcur = block->prev; - if (block->prev) { - _COUNTS_RESUME(&block->prev->counts_ctx, t); - } - } -} - typedef struct _jl_timing_suspend_t { jl_task_t *ct; } jl_timing_suspend_t; @@ -413,16 +338,19 @@ STATIC_INLINE void _jl_timing_suspend_destroy(jl_timing_suspend_t *suspend) JL_N #endif } -#define JL_TIMING(subsystem, event) \ - JL_TIMING_CREATE_BLOCK(__timing_block, subsystem, event); \ +#define JL_TIMING(subsystem, event) \ + JL_TIMING_CREATE_BLOCK(__timing_block, subsystem, event); \ jl_timing_block_start(&__timing_block) -#define JL_TIMING_CREATE_BLOCK(block, subsystem, event) \ - __attribute__((cleanup(_jl_timing_block_destroy))) \ - jl_timing_block_t block; \ - _jl_timing_block_ctor(&block, JL_TIMING_ ## subsystem, JL_TIMING_EVENT_ ## event); \ - _TRACY_CTOR(&block, #event); \ - _ITTAPI_CTOR(&block, #event) +#define JL_TIMING_CREATE_BLOCK(block, subsystem_name, event_name) \ + static jl_timing_event_t *TIMING_CONCAT(__timing_event, __LINE__) = 0; \ + if (!TIMING_CONCAT(__timing_event, __LINE__)) \ + TIMING_CONCAT(__timing_event, __LINE__) = jl_timing_event_create( \ + #subsystem_name, #event_name, __func__, __FILE__, __LINE__, 0 \ + ); \ + __attribute__((cleanup(_jl_timing_block_end))) \ + jl_timing_block_t block = { 0 }; \ + block.event = TIMING_CONCAT(__timing_event, __LINE__) #define JL_TIMING_SUSPEND_TASK(subsystem, ct) \ __attribute__((cleanup(_jl_timing_suspend_destroy))) \ From 5850227a3bcad04649a918a793e8205414cb6664 Mon Sep 17 00:00:00 2001 From: Cody Tapscott Date: Thu, 25 May 2023 12:11:45 -0400 Subject: [PATCH 2/4] Add timing event locks + de-duplicate ITTAPI events --- src/timing.c | 83 +++++++++++++++++++++++++++++++++++++++++++++------- src/timing.h | 4 +-- 2 files changed, 73 insertions(+), 14 deletions(-) diff --git a/src/timing.c b/src/timing.c index 8af101c0f8026..7e4fe7fb5cccc 100644 --- a/src/timing.c +++ b/src/timing.c @@ -26,8 +26,6 @@ static uint64_t t0; JL_DLLEXPORT _Atomic(uint64_t) jl_timing_disable_mask[(JL_TIMING_SUBSYSTEM_LAST + sizeof(uint64_t) * CHAR_BIT - 1) / (sizeof(uint64_t) * CHAR_BIT)]; -static arraylist_t jl_timing_counts_events; - // Used to as an item limit when several strings of metadata can // potentially be associated with a single timing zone. JL_DLLEXPORT uint32_t jl_timing_print_limit = 10; @@ -41,12 +39,23 @@ const char *jl_timing_subsystems[(int)JL_TIMING_SUBSYSTEM_LAST] = JL_DLLEXPORT jl_timing_counter_t jl_timing_counters[JL_TIMING_COUNTER_LAST]; +#ifdef USE_TIMING_COUNTS +static arraylist_t jl_timing_counts_events; +static jl_mutex_t jl_timing_counts_events_lock; +#endif //USE_TIMING_COUNTS + +#ifdef USE_ITTAPI +static arraylist_t jl_timing_ittapi_events; +static jl_mutex_t jl_timing_ittapi_events_lock; +#endif //USE_ITTAPI + void jl_print_timings(void) { #ifdef USE_TIMING_COUNTS + JL_LOCK_NOGC(&jl_timing_counts_events_lock); uint64_t total_time = cycleclock() - t0; uint64_t root_time = total_time; - for (int i = 0; i < jl_timing_counts_events.len; i++) { + for (int i = 1; i < jl_timing_counts_events.len; i++) { jl_timing_counts_event_t *other_event = (jl_timing_counts_event_t *)jl_timing_counts_events.items[i]; root_time -= jl_atomic_load_relaxed(&other_event->self); } @@ -66,6 +75,7 @@ void jl_print_timings(void) self, 100 * (((double)self) / total_time), total, 100 * (((double)total) / total_time)); } + JL_UNLOCK_NOGC(&jl_timing_counts_events_lock); fprintf(stderr, "\nJULIA COUNTERS\n"); fprintf(stderr, "%-25s, %-20s\n", "Counter", "Value"); @@ -91,6 +101,8 @@ void jl_init_timing(void) _Static_assert(JL_TIMING_SUBSYSTEM_LAST < sizeof(uint64_t) * CHAR_BIT, "Too many timing subsystems!"); #ifdef USE_TIMING_COUNTS + JL_MUTEX_INIT(&jl_timing_counts_events_lock, "jl_timing_counts_events_lock"); + // Create events list for counts backend arraylist_new(&jl_timing_counts_events, 1); @@ -98,9 +110,16 @@ void jl_init_timing(void) arraylist_push(&jl_timing_counts_events, (void *)new_event); new_event->name = "ROOT"; + jl_atomic_store_relaxed(&new_event->self, 0); jl_atomic_store_relaxed(&new_event->total, 0); #endif +#ifdef USE_ITTAPI + // Create events list for ITTAPI backend + JL_MUTEX_INIT(&jl_timing_ittapi_events_lock, "jl_timing_ittapi_events_lock"); + arraylist_new(&jl_timing_ittapi_events, 0); +#endif + // Sort the subsystem names for quick enable/disable lookups qsort( jl_timing_subsystems, JL_TIMING_SUBSYSTEM_LAST, @@ -171,24 +190,60 @@ static const int get_timing_subsystem(const char *subsystem) { return (int)(match - &jl_timing_subsystems[0]); } +#ifdef USE_ITTAPI + +typedef struct { + __itt_event event; + const char *name; +} cached_ittapi_event_t; + +static __itt_event _jl_timing_ittapi_event_create(const char *event) { + JL_LOCK_NOGC(&jl_timing_ittapi_events_lock); + const size_t n = jl_timing_ittapi_events.len; + for (size_t i = 0; i < n; i++) { + cached_ittapi_event_t *other_event = (cached_ittapi_event_t *)jl_timing_ittapi_events.items[i]; + if (strcmp(event, other_event->name) == 0) { + JL_UNLOCK_NOGC(&jl_timing_ittapi_events_lock); + return other_event->event; + } + } + + // No matching event found - create a new one + cached_ittapi_event_t *new_event = (cached_ittapi_event_t *)malloc(sizeof(cached_ittapi_event_t)); + arraylist_push(&jl_timing_ittapi_events, (void *)new_event); + new_event->name = event; + new_event->event = __itt_event_create(event, strlen(event)); + JL_UNLOCK_NOGC(&jl_timing_ittapi_events_lock); + + return new_event->event; +} + +#endif // USE_ITTAPI + #ifdef USE_TIMING_COUNTS // This function is analogous to __itt_event_create but for the counts backend // // `event` is required to live forever static jl_timing_counts_event_t *_jl_timing_counts_event_create(const char *event) { + JL_LOCK_NOGC(&jl_timing_counts_events_lock); const size_t n = jl_timing_counts_events.len; for (size_t i = 0; i < n; i++) { jl_timing_counts_event_t *other_event = (jl_timing_counts_event_t *)jl_timing_counts_events.items[i]; - if (strcmp(event, other_event->name) == 0) + if (strcmp(event, other_event->name) == 0) { + JL_UNLOCK_NOGC(&jl_timing_counts_events_lock); return other_event; + } } // No matching event found - create a new one jl_timing_counts_event_t *new_event = (jl_timing_counts_event_t *)malloc(sizeof(jl_timing_counts_event_t)); arraylist_push(&jl_timing_counts_events, (void *)new_event); new_event->name = event; + jl_atomic_store_relaxed(&new_event->self, 0); jl_atomic_store_relaxed(&new_event->total, 0); + JL_UNLOCK_NOGC(&jl_timing_counts_events_lock); + return new_event; } @@ -244,6 +299,10 @@ JL_DLLEXPORT jl_timing_event_t *_jl_timing_event_create(const char *subsystem, c event->counts_event = _jl_timing_counts_event_create(name); #endif // USE_TIMING_COUNTS +#ifdef USE_ITTAPI + event->ittapi_event = _jl_timing_ittapi_event_create(name); +#endif // USE_ITTAPI + #ifdef USE_TRACY event->tracy_srcloc.name = name; event->tracy_srcloc.function = function; @@ -252,15 +311,17 @@ JL_DLLEXPORT jl_timing_event_t *_jl_timing_event_create(const char *subsystem, c event->tracy_srcloc.color = color; #endif // USE_TRACY -#ifdef USE_ITTAPI - event->ittapi_event = __itt_event_create(name, strlen(name)); -#endif // USE_ITTAPI - return event; } -JL_DLLEXPORT void _jl_timing_block_init(jl_timing_block_t *block, jl_timing_event_t *event) { - memset(block, 0, sizeof(*block)); +JL_DLLEXPORT void _jl_timing_block_init(char *buf, size_t size, jl_timing_event_t *event) { + if (size < sizeof(jl_timing_block_t)) { + jl_errorf("jl_timing_block_t buffer must be at least %d bytes", sizeof(jl_timing_block_t)); + return; + } + + jl_timing_block_t *block = (jl_timing_block_t *)buf; + memset(block, 0, sizeof(jl_timing_block_t)); block->event = event; } @@ -592,7 +653,7 @@ void jl_destroy_timing(void) { } JL_DLLEXPORT jl_timing_event_t *_jl_timing_event_create(const char *subsystem, const char *name, const char *function, const char *file, int line, int color) { return NULL; } -JL_DLLEXPORT void _jl_timing_block_init(jl_timing_block_t *block, jl_timing_event_t *event) { } +JL_DLLEXPORT void _jl_timing_block_init(char *buf, size_t size, jl_timing_event_t *event) { } JL_DLLEXPORT void _jl_timing_block_start(jl_timing_block_t *block) { } JL_DLLEXPORT void _jl_timing_block_end(jl_timing_block_t *block) { } diff --git a/src/timing.h b/src/timing.h index 5ec6ea1fc33f8..30f6ad0ab3b5c 100644 --- a/src/timing.h +++ b/src/timing.h @@ -52,7 +52,7 @@ void jl_timing_apply_env(void); extern JL_DLLEXPORT uint32_t jl_timing_print_limit; JL_DLLEXPORT jl_timing_event_t *_jl_timing_event_create(const char *subsystem, const char *name, const char *function, const char *file, int line, int color); -JL_DLLEXPORT void _jl_timing_block_init(jl_timing_block_t *block, jl_timing_event_t *event); +JL_DLLEXPORT void _jl_timing_block_init(char *buf, size_t size, jl_timing_event_t *event); JL_DLLEXPORT void _jl_timing_block_start(jl_timing_block_t *cur_block); JL_DLLEXPORT void _jl_timing_block_end(jl_timing_block_t *cur_block); @@ -89,7 +89,6 @@ JL_DLLEXPORT void _jl_timing_block_end(jl_timing_block_t *cur_block); #define jl_timing_puts(b, s) #define jl_timing_task_init(t) #define jl_timing_event_create(blk) -#define jl_timing_block_init(blk) #define jl_timing_block_start(blk) #define jl_timing_block_task_enter(ct, ptls, blk) #define jl_timing_block_task_exit(ct, ptls) ((jl_timing_block_t *)NULL) @@ -142,7 +141,6 @@ JL_DLLEXPORT void jl_timing_printf(jl_timing_block_t *cur_block, const char *for JL_DLLEXPORT void jl_timing_puts(jl_timing_block_t *cur_block, const char *str); #define jl_timing_event_create(subsystem, name, function, file, line, color) _jl_timing_event_create(subsystem, name, function, file, line, color) -#define jl_timing_block_init(blk, evt) _jl_timing_block_start(blk, evt) #define jl_timing_block_start(blk) _jl_timing_block_start(blk) #define jl_timing_block_end(blk) _jl_timing_block_end(blk) From 3b90d5edd83630c64e2b037efb1c425d7d974f4b Mon Sep 17 00:00:00 2001 From: Cody Tapscott Date: Thu, 25 May 2023 20:49:51 -0400 Subject: [PATCH 3/4] Use `jl_timing_set_enable` instead of `DISABLE_SUBSYSTEM` --- src/timing.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/timing.c b/src/timing.c index 7e4fe7fb5cccc..19357c017cfa1 100644 --- a/src/timing.c +++ b/src/timing.c @@ -154,14 +154,15 @@ void jl_init_timing(void) * files, so we disable them by default. **/ #ifdef DISABLE_FREQUENT_EVENTS -#define DISABLE_SUBSYSTEM(subsystem) jl_atomic_fetch_or_relaxed(jl_timing_disable_mask + (JL_TIMING_##subsystem / (sizeof(uint64_t) * CHAR_BIT)), 1 << (JL_TIMING_##subsystem % (sizeof(uint64_t) * CHAR_BIT))) - DISABLE_SUBSYSTEM(ROOT); - DISABLE_SUBSYSTEM(TYPE_CACHE_LOOKUP); - DISABLE_SUBSYSTEM(TYPE_CACHE_INSERT); - DISABLE_SUBSYSTEM(METHOD_MATCH); - DISABLE_SUBSYSTEM(METHOD_LOOKUP_FAST); - DISABLE_SUBSYSTEM(AST_COMPRESS); - DISABLE_SUBSYSTEM(AST_UNCOMPRESS); + uint8_t error = 0; + error |= jl_timing_set_enable("ROOT", 0); + error |= jl_timing_set_enable("TYPE_CACHE_LOOKUP", 0); + error |= jl_timing_set_enable("METHOD_MATCH", 0); + error |= jl_timing_set_enable("METHOD_LOOKUP_FAST", 0); + error |= jl_timing_set_enable("AST_COMPRESS", 0); + error |= jl_timing_set_enable("AST_UNCOMPRESS", 0); + if (error) + jl_error("invalid timing subsystem encountered in jl_init_timing"); #endif // Apply e.g. JULIA_TIMING_SUBSYSTEMS="+GC,-INFERENCE" and From 3c915f304d186cf82fa561f96346280ecf14d38f Mon Sep 17 00:00:00 2001 From: Cody Tapscott Date: Thu, 25 May 2023 22:30:52 -0400 Subject: [PATCH 4/4] Sort jl_timing counts events before printing --- src/timing.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/src/timing.c b/src/timing.c index 19357c017cfa1..d933f082c816e 100644 --- a/src/timing.c +++ b/src/timing.c @@ -49,17 +49,32 @@ static arraylist_t jl_timing_ittapi_events; static jl_mutex_t jl_timing_ittapi_events_lock; #endif //USE_ITTAPI +#ifdef USE_TIMING_COUNTS +static int cmp_counts_events(const void *a, const void *b) { + jl_timing_counts_event_t *event_a = *(jl_timing_counts_event_t **)a; + jl_timing_counts_event_t *event_b = *(jl_timing_counts_event_t **)b; + return strcmp(event_a->name, event_b->name); +} +#endif + void jl_print_timings(void) { #ifdef USE_TIMING_COUNTS + qsort(jl_timing_counts_events.items, jl_timing_counts_events.len, + sizeof(jl_timing_counts_event_t *), cmp_counts_events); + JL_LOCK_NOGC(&jl_timing_counts_events_lock); uint64_t total_time = cycleclock() - t0; uint64_t root_time = total_time; - for (int i = 1; i < jl_timing_counts_events.len; i++) { + jl_timing_counts_event_t *root_event; + for (int i = 0; i < jl_timing_counts_events.len; i++) { jl_timing_counts_event_t *other_event = (jl_timing_counts_event_t *)jl_timing_counts_events.items[i]; - root_time -= jl_atomic_load_relaxed(&other_event->self); + if (strcmp(other_event->name, "ROOT") == 0) { + root_event = other_event; + } else { + root_time -= jl_atomic_load_relaxed(&other_event->self); + } } - jl_timing_counts_event_t *root_event = (jl_timing_counts_event_t *)jl_timing_counts_events.items[0]; jl_atomic_store_relaxed(&root_event->self, root_time); jl_atomic_store_relaxed(&root_event->total, total_time); @@ -90,7 +105,7 @@ void jl_print_timings(void) #endif } -static const int indirect_strcmp(const void *a, const void *b) { +static int indirect_strcmp(const void *a, const void *b) { return strcmp(*(const char **)a, *(const char **)b); } @@ -106,12 +121,12 @@ void jl_init_timing(void) // Create events list for counts backend arraylist_new(&jl_timing_counts_events, 1); - jl_timing_counts_event_t *new_event = (jl_timing_counts_event_t *)malloc(sizeof(jl_timing_counts_event_t)); - arraylist_push(&jl_timing_counts_events, (void *)new_event); + jl_timing_counts_event_t *root_event = (jl_timing_counts_event_t *)malloc(sizeof(jl_timing_counts_event_t)); + arraylist_push(&jl_timing_counts_events, (void *)root_event); - new_event->name = "ROOT"; - jl_atomic_store_relaxed(&new_event->self, 0); - jl_atomic_store_relaxed(&new_event->total, 0); + root_event->name = "ROOT"; + jl_atomic_store_relaxed(&root_event->self, 0); + jl_atomic_store_relaxed(&root_event->total, 0); #endif #ifdef USE_ITTAPI @@ -121,10 +136,8 @@ void jl_init_timing(void) #endif // Sort the subsystem names for quick enable/disable lookups - qsort( - jl_timing_subsystems, JL_TIMING_SUBSYSTEM_LAST, - sizeof(const char *), indirect_strcmp - ); + qsort(jl_timing_subsystems, JL_TIMING_SUBSYSTEM_LAST, + sizeof(const char *), indirect_strcmp); int i __attribute__((unused)) = 0; #ifdef USE_ITTAPI