diff --git a/NEWS.md b/NEWS.md index 9e5ab91b1749b..7077aa8e3a8dc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -133,6 +133,8 @@ Deprecated or removed * The `@pure` macro is now deprecated. Use `Base.@assume_effects :foldable` instead ([#48682]). +* A wall-time profiler is now available for users who need a sampling profiler that captures tasks regardless of their scheduling or running state. This type of profiler enables profiling of I/O-heavy tasks and helps detect areas of heavy contention in the system ([#55889]). + [#31836]: https://github.com/JuliaLang/julia/issues/31836 [#40105]: https://github.com/JuliaLang/julia/issues/40105 diff --git a/doc/src/manual/img/cpu-profile.png b/doc/src/manual/img/cpu-profile.png new file mode 100644 index 0000000000000..ec48b41f6e78b Binary files /dev/null and b/doc/src/manual/img/cpu-profile.png differ diff --git a/doc/src/manual/img/task-sampling-failure.png b/doc/src/manual/img/task-sampling-failure.png new file mode 100644 index 0000000000000..46bbd9b33b6ce Binary files /dev/null and b/doc/src/manual/img/task-sampling-failure.png differ diff --git a/doc/src/manual/img/wall-time-profiler-channel-example.png b/doc/src/manual/img/wall-time-profiler-channel-example.png new file mode 100644 index 0000000000000..26cb4a4522621 Binary files /dev/null and b/doc/src/manual/img/wall-time-profiler-channel-example.png differ diff --git a/doc/src/manual/img/wall-time-profiler-compute-bound-example.png b/doc/src/manual/img/wall-time-profiler-compute-bound-example.png new file mode 100644 index 0000000000000..983b01bcc2dea Binary files /dev/null and b/doc/src/manual/img/wall-time-profiler-compute-bound-example.png differ diff --git a/doc/src/manual/profile.md b/doc/src/manual/profile.md index 6e9f98d3f1780..f15dec5d36d43 100644 --- a/doc/src/manual/profile.md +++ b/doc/src/manual/profile.md @@ -297,6 +297,220 @@ Of course, you can decrease the delay as well as increase it; however, the overh grows once the delay becomes similar to the amount of time needed to take a backtrace (~30 microseconds on the author's laptop). +## Wall-time Profiler + +### Introduction & Problem Motivation + +The profiler described in the previous section is a sampling CPU profiler. At a high level, the profiler periodically stops all Julia compute threads to collect their backtraces and estimates the time spent in each function based on the number of backtrace samples that include a frame from that function. However, note that only tasks currently running on system threads just before the profiler stops them will have their backtraces collected. + +While this profiler is typically well-suited for workloads where the majority of tasks are compute-bound, it is less helpful for systems where most tasks are I/O-heavy or for diagnosing contention on synchronization primitives in your code.
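+Before walking through the examples, here is a minimal sketch of how the two profiling modes covered in this section are invoked. `my_workload` is a placeholder for your own entry point, and `PProf.jl` (used for the flame graphs below) is an optional external visualizer; the built-in `Profile.print()` report works for either mode:
+
+```Julia
+using Profile
+
+# CPU sampling profiler (previous section): only samples tasks that are
+# running on an OS thread at the moment a sample is taken.
+Profile.@profile my_workload()
+Profile.print()
+
+Profile.clear()
+
+# Wall-time profiler (this section): samples live tasks regardless of
+# whether they are currently scheduled on an OS thread.
+Profile.@profile_walltime my_workload()
+Profile.print()
+```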
+ +Let's consider this simple workload: + +```Julia +using Base.Threads +using Profile +using PProf + +ch = Channel(1) + +const N_SPAWNED_TASKS = (1 << 10) +const WAIT_TIME_NS = 10_000_000 + +function spawn_a_bunch_of_tasks_waiting_on_channel() + for i in 1:N_SPAWNED_TASKS + Threads.@spawn begin + take!(ch) + end + end +end + +function busywait() + t0 = time_ns() + while true + if time_ns() - t0 > WAIT_TIME_NS + break + end + end +end + +function main() + spawn_a_bunch_of_tasks_waiting_on_channel() + for i in 1:N_SPAWNED_TASKS + put!(ch, i) + busywait() + end +end + +Profile.@profile main() +``` + +Our goal is to detect whether there is contention on the `ch` channel—i.e., whether the number of waiters is excessive given the rate at which work items are being produced in the channel. + +If we run this, we obtain the following [PProf](https://github.com/JuliaPerf/PProf.jl) flame graph: + +![CPU Profile](./img/cpu-profile.png) + +This profile provides no information to help determine where contention occurs in the system’s synchronization primitives. Waiters on a channel will be blocked and descheduled, meaning no system thread will be running the tasks assigned to those waiters, and as a result, they won't be sampled by the profiler. + +### Wall-time Profiler + +Instead of sampling threads—and thus only sampling tasks that are running—a wall-time task profiler samples tasks independently of their scheduling state. For example, tasks that are sleeping on a synchronization primitive at the time the profiler is running will be sampled with the same probability as tasks that were actively running when the profiler attempted to capture backtraces. + +This approach allows us to construct a profile where backtraces from tasks blocked on the `ch` channel, as in the example above, are actually represented. + +Let's run the same example, but now with a wall-time profiler: + + +```Julia +using Base.Threads +using Profile +using PProf + +ch = Channel(1) + +const N_SPAWNED_TASKS = (1 << 10) +const WAIT_TIME_NS = 10_000_000 + +function spawn_a_bunch_of_tasks_waiting_on_channel() + for i in 1:N_SPAWNED_TASKS + Threads.@spawn begin + take!(ch) + end + end +end + +function busywait() + t0 = time_ns() + while true + if time_ns() - t0 > WAIT_TIME_NS + break + end + end +end + +function main() + spawn_a_bunch_of_tasks_waiting_on_channel() + for i in 1:N_SPAWNED_TASKS + put!(ch, i) + busywait() + end +end + +Profile.@profile_walltime main() +``` + +We obtain the following flame graph: + +![Wall-time Profile Channel](./img/wall-time-profiler-channel-example.png) + +We see that a large number of samples come from channel-related `take!` functions, which allows us to determine that there is indeed an excessive number of waiters in `ch`. + +### A Compute-Bound Workload + +Despite the wall-time profiler sampling all live tasks in the system and not just the currently running ones, it can still be helpful for identifying performance hotspots, even if your code is compute-bound. 
Let’s consider a simple example: + +```Julia +using Base.Threads +using Profile +using PProf + +ch = Channel(1) + +const MAX_ITERS = (1 << 22) +const N_TASKS = (1 << 12) + +function spawn_a_task_waiting_on_channel() + Threads.@spawn begin + take!(ch) + end +end + +function sum_of_sqrt() + sum_of_sqrt = 0.0 + for i in 1:MAX_ITERS + sum_of_sqrt += sqrt(i) + end + return sum_of_sqrt +end + +function spawn_a_bunch_of_compute_heavy_tasks() + Threads.@sync begin + for i in 1:N_TASKS + Threads.@spawn begin + sum_of_sqrt() + end + end + end +end + +function main() + spawn_a_task_waiting_on_channel() + spawn_a_bunch_of_compute_heavy_tasks() +end + +Profile.@profile_walltime main() +``` + +After collecting a wall-time profile, we get the following flame graph: + +![Wall-time Profile Compute-Bound](./img/wall-time-profiler-compute-bound-example.png) + +Notice how many of the samples contain `sum_of_sqrt`, which is the expensive compute function in our example. + +### Identifying Task Sampling Failures in your Profile + +In the current implementation, the wall-time profiler attempts to sample from tasks that have been alive since the last garbage collection, along with those created afterward. However, if most tasks are extremely short-lived, you may end up sampling tasks that have already completed, resulting in missed backtrace captures. + +If you encounter samples containing `failed_to_sample_task_fun` or `failed_to_stop_thread_fun`, this likely indicates a high volume of short-lived tasks, which prevented their backtraces from being collected. + +Let's consider this simple example: + +```Julia +using Base.Threads +using Profile +using PProf + +const N_SPAWNED_TASKS = (1 << 16) +const WAIT_TIME_NS = 100_000 + +function spawn_a_bunch_of_short_lived_tasks() + for i in 1:N_SPAWNED_TASKS + Threads.@spawn begin + # Do nothing + end + end +end + +function busywait() + t0 = time_ns() + while true + if time_ns() - t0 > WAIT_TIME_NS + break + end + end +end + +function main() + GC.enable(false) + spawn_a_bunch_of_short_lived_tasks() + for i in 1:N_SPAWNED_TASKS + busywait() + end + GC.enable(true) +end + +Profile.@profile_walltime main() +``` + +Notice that the tasks spawned in `spawn_a_bunch_of_short_lived_tasks` are extremely short-lived. Since these tasks constitute the majority in the system, we will likely miss capturing a backtrace for most sampled tasks. + +After collecting a wall-time profile, we obtain the following flame graph: + +![Task Sampling Failure](./img/task-sampling-failure.png) + +The large number of samples from `failed_to_stop_thread_fun` confirms that we have a significant number of short-lived tasks in the system. + ## Memory allocation analysis One of the most common techniques to improve performance is to reduce memory allocation. Julia diff --git a/src/gc-stacks.c b/src/gc-stacks.c index 1c1e286960762..8e4297b0a4b8a 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -266,6 +266,39 @@ void sweep_stack_pools(void) } } +// Builds a list of the live tasks. Racy: `live_tasks` can expand at any time. +arraylist_t *jl_get_all_tasks_arraylist(void) JL_NOTSAFEPOINT +{ + arraylist_t *tasks = (arraylist_t*)malloc_s(sizeof(arraylist_t)); + arraylist_new(tasks, 0); + size_t nthreads = jl_atomic_load_acquire(&jl_n_threads); + jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states); + for (size_t i = 0; i < nthreads; i++) { + // skip GC threads... 
+ if (gc_is_parallel_collector_thread(i) || gc_is_concurrent_collector_thread(i)) { + continue; + } + jl_ptls_t ptls2 = allstates[i]; + if (ptls2 == NULL) { + continue; + } + jl_task_t *t = ptls2->root_task; + if (t->stkbuf != NULL) { + arraylist_push(tasks, t); + } + small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks; + size_t n = mtarraylist_length(live_tasks); + for (size_t i = 0; i < n; i++) { + jl_task_t *t = (jl_task_t*)mtarraylist_get(live_tasks, i); + assert(t != NULL); + if (t->stkbuf != NULL) { + arraylist_push(tasks, t); + } + } + } + return tasks; +} + JL_DLLEXPORT jl_array_t *jl_live_tasks(void) { size_t nthreads = jl_atomic_load_acquire(&jl_n_threads); diff --git a/src/gc.c b/src/gc.c index 334bd8f83ada7..12d312dea1fb2 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1527,6 +1527,31 @@ static void gc_sweep_other(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT gc_num.total_sweep_free_mallocd_memory_time += t_free_mallocd_memory_end - t_free_mallocd_memory_start; } +void sweep_mtarraylist_buffers(void) JL_NOTSAFEPOINT +{ + for (int i = 0; i < gc_n_threads; i++) { + jl_ptls_t ptls = gc_all_tls_states[i]; + if (ptls == NULL) { + continue; + } + small_arraylist_t *buffers = &ptls->lazily_freed_mtarraylist_buffers; + void *buf; + while ((buf = small_arraylist_pop(buffers)) != NULL) { + free(buf); + } + } +} + +void sweep_stack_pools_and_mtarraylist_buffers(jl_ptls_t ptls) JL_NOTSAFEPOINT +{ + // initialize ptls index for parallel sweeping of stack pools + assert(gc_n_threads); + uv_mutex_lock(&live_tasks_lock); + sweep_stack_pools(); + sweep_mtarraylist_buffers(); + uv_mutex_unlock(&live_tasks_lock); +} + static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_NOTSAFEPOINT { assert(pg->fl_begin_offset != UINT16_MAX); @@ -3611,7 +3636,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) #endif current_sweep_full = sweep_full; sweep_weak_refs(); - sweep_stack_pools(); + sweep_stack_pools_and_mtarraylist_buffers(ptls); gc_sweep_foreign_objs(); gc_sweep_other(ptls, sweep_full); gc_scrub(); @@ -3903,6 +3928,8 @@ void jl_init_thread_heap(jl_ptls_t ptls) jl_atomic_store_relaxed(&q->bottom, 0); jl_atomic_store_relaxed(&q->array, wsa2); arraylist_new(&mq->reclaim_set, 32); + // Initialize `lazily_freed_mtarraylist_buffers` + small_arraylist_new(&ptls->lazily_freed_mtarraylist_buffers, 0); memset(&ptls->gc_tls.gc_num, 0, sizeof(ptls->gc_tls.gc_num)); jl_atomic_store_relaxed(&ptls->gc_tls.gc_num.allocd, -(int64_t)gc_num.interval); diff --git a/src/gc.h b/src/gc.h index 61c7c835db34a..be80e4bcbbd71 100644 --- a/src/gc.h +++ b/src/gc.h @@ -471,6 +471,16 @@ STATIC_INLINE int gc_is_parallel_collector_thread(int tid) JL_NOTSAFEPOINT return tid >= gc_first_tid && tid <= gc_last_parallel_collector_thread_id(); } +STATIC_INLINE int gc_is_concurrent_collector_thread(int tid) JL_NOTSAFEPOINT +{ + if (jl_n_sweepthreads == 0) { + return 0; + } + int last_parallel_collector_thread_id = gc_last_parallel_collector_thread_id(); + int concurrent_collector_thread_id = last_parallel_collector_thread_id + 1; + return tid == concurrent_collector_thread_id; +} + STATIC_INLINE int gc_random_parallel_collector_thread_id(jl_ptls_t ptls) JL_NOTSAFEPOINT { assert(jl_n_markthreads > 0); diff --git a/src/init.c b/src/init.c index e482f8b77ee9b..9a49ad60c02f4 100644 --- a/src/init.c +++ b/src/init.c @@ -747,6 +747,11 @@ JL_DLLEXPORT void julia_init(JL_IMAGE_SEARCH rel) // Make sure we finalize the tls callback before starting any threads. 
(void)jl_get_pgcstack(); + // initialize the live tasks lock + uv_mutex_init(&live_tasks_lock); + // initialize the profiler buffer lock + uv_mutex_init(&bt_data_prof_lock); + // initialize backtraces jl_init_profile_lock(); #ifdef _OS_WINDOWS_ diff --git a/src/julia_internal.h b/src/julia_internal.h index 94b1f85112d7d..fc719c45c5753 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -206,6 +206,35 @@ JL_DLLEXPORT void jl_unlock_profile_wr(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEA int jl_lock_stackwalk(void) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER; void jl_unlock_stackwalk(int lockret) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_LEAVE; +arraylist_t *jl_get_all_tasks_arraylist(void) JL_NOTSAFEPOINT; +typedef struct { + size_t bt_size; + int tid; +} jl_record_backtrace_result_t; +JL_DLLEXPORT jl_record_backtrace_result_t jl_record_backtrace(jl_task_t *t, struct _jl_bt_element_t *bt_data, + size_t max_bt_size, int all_tasks_profiler) JL_NOTSAFEPOINT; +extern volatile struct _jl_bt_element_t *profile_bt_data_prof; +extern volatile size_t profile_bt_size_max; +extern volatile size_t profile_bt_size_cur; +extern volatile int profile_running; +extern volatile int profile_all_tasks; +// Ensures that we can safely read the `live_tasks`field of every TLS when profiling. +// We want to avoid the case that a GC gets interleaved with `jl_profile_task` and shrinks +// the `live_tasks` array while we are reading it or frees tasks that are being profiled. +// Because of that, this lock must be held in `jl_profile_task` and `sweep_stack_pools_and_mtarraylist_buffers`. +extern uv_mutex_t live_tasks_lock; +// Ensures that we can safely write to `profile_bt_data_prof` and `profile_bt_size_cur`. +// We want to avoid the case that: +// - We start to profile a task very close to the profiling time window end. +// - The profiling time window ends and we start to read the profile data in a compute thread. +// - We write to the profile in a profiler thread while the compute thread is reading it. +// Locking discipline: `bt_data_prof_lock` must be held inside the scope of `live_tasks_lock`. 
+extern uv_mutex_t bt_data_prof_lock; +#define PROFILE_STATE_THREAD_NOT_SLEEPING (1) +#define PROFILE_STATE_THREAD_SLEEPING (2) +#define PROFILE_STATE_WALL_TIME_PROFILING (3) +void jl_profile_task(void); + // number of cycles since power-on static inline uint64_t cycleclock(void) JL_NOTSAFEPOINT { diff --git a/src/julia_threads.h b/src/julia_threads.h index 3d06d380d10c2..141098f7dce49 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -155,6 +155,7 @@ typedef struct _jl_tls_states_t { // Counter to disable finalizer **on the current thread** int finalizers_inhibited; jl_gc_tls_states_t gc_tls; // this is very large, and the offset of the first member is baked into codegen + small_arraylist_t lazily_freed_mtarraylist_buffers; volatile sig_atomic_t defer_signal; _Atomic(struct _jl_task_t*) current_task; struct _jl_task_t *next_task; diff --git a/src/mtarraylist.c b/src/mtarraylist.c index 8bad44797dab4..7af265a86ab63 100644 --- a/src/mtarraylist.c +++ b/src/mtarraylist.c @@ -37,7 +37,7 @@ static void mtarraylist_resizeto(small_mtarraylist_t *a, size_t len, size_t newl a->max = nm; if (olditems != (void*)&a->_space[0]) { jl_task_t *ct = jl_current_task; - jl_gc_add_quiescent(ct->ptls, (void**)olditems, free); + small_arraylist_push(&ct->ptls->lazily_freed_mtarraylist_buffers, olditems); } } } diff --git a/src/signal-handling.c b/src/signal-handling.c index c2a344a752547..98c0b0896f347 100644 --- a/src/signal-handling.c +++ b/src/signal-handling.c @@ -18,16 +18,18 @@ extern "C" { #include // Profiler control variables -// Note: these "static" variables are also used in "signals-*.c" -static volatile jl_bt_element_t *bt_data_prof = NULL; -static volatile size_t bt_size_max = 0; -static volatile size_t bt_size_cur = 0; +uv_mutex_t live_tasks_lock; +uv_mutex_t bt_data_prof_lock; +volatile jl_bt_element_t *profile_bt_data_prof = NULL; +volatile size_t profile_bt_size_max = 0; +volatile size_t profile_bt_size_cur = 0; static volatile uint64_t nsecprof = 0; -static volatile int running = 0; -static const uint64_t GIGA = 1000000000ULL; +volatile int profile_running = 0; +volatile int profile_all_tasks = 0; +static const uint64_t GIGA = 1000000000ULL; // Timers to take samples at intervals JL_DLLEXPORT void jl_profile_stop_timer(void); -JL_DLLEXPORT int jl_profile_start_timer(void); +JL_DLLEXPORT int jl_profile_start_timer(uint8_t); // File-descriptor for safe logging on signal handling int jl_sig_fd; @@ -36,30 +38,30 @@ int jl_sig_fd; /////////////////////// JL_DLLEXPORT int jl_profile_init(size_t maxsize, uint64_t delay_nsec) { - bt_size_max = maxsize; + profile_bt_size_max = maxsize; nsecprof = delay_nsec; - if (bt_data_prof != NULL) - free((void*)bt_data_prof); - bt_data_prof = (jl_bt_element_t*) calloc(maxsize, sizeof(jl_bt_element_t)); - if (bt_data_prof == NULL && maxsize > 0) + if (profile_bt_data_prof != NULL) + free((void*)profile_bt_data_prof); + profile_bt_data_prof = (jl_bt_element_t*) calloc(maxsize, sizeof(jl_bt_element_t)); + if (profile_bt_data_prof == NULL && maxsize > 0) return -1; - bt_size_cur = 0; + profile_bt_size_cur = 0; return 0; } JL_DLLEXPORT uint8_t *jl_profile_get_data(void) { - return (uint8_t*) bt_data_prof; + return (uint8_t*) profile_bt_data_prof; } JL_DLLEXPORT size_t jl_profile_len_data(void) { - return bt_size_cur; + return profile_bt_size_cur; } JL_DLLEXPORT size_t jl_profile_maxlen_data(void) { - return bt_size_max; + return profile_bt_size_max; } JL_DLLEXPORT uint64_t jl_profile_delay_nsec(void) @@ -69,12 +71,12 @@ JL_DLLEXPORT uint64_t 
jl_profile_delay_nsec(void) JL_DLLEXPORT void jl_profile_clear_data(void) { - bt_size_cur = 0; + profile_bt_size_cur = 0; } JL_DLLEXPORT int jl_profile_is_running(void) { - return running; + return profile_running; } // Any function that acquires this lock must be either a unmanaged thread @@ -187,7 +189,102 @@ JL_DLLEXPORT int jl_profile_is_buffer_full(void) // Declare buffer full if there isn't enough room to sample even just the // thread metadata and one max-sized frame. The `+ 6` is for the two block // terminator `0`'s plus the 4 metadata entries. - return bt_size_cur + ((JL_BT_MAX_ENTRY_SIZE + 1) + 6) > bt_size_max; + return profile_bt_size_cur + ((JL_BT_MAX_ENTRY_SIZE + 1) + 6) > profile_bt_size_max; +} + +NOINLINE int failed_to_sample_task_fun(jl_bt_element_t *bt_data, size_t maxsize, int skip) JL_NOTSAFEPOINT; +NOINLINE int failed_to_stop_thread_fun(jl_bt_element_t *bt_data, size_t maxsize, int skip) JL_NOTSAFEPOINT; + +#define PROFILE_TASK_DEBUG_FORCE_SAMPLING_FAILURE (0) +#define PROFILE_TASK_DEBUG_FORCE_STOP_THREAD_FAILURE (0) + +void jl_profile_task(void) +{ + if (jl_profile_is_buffer_full()) { + // Buffer full: Delete the timer + jl_profile_stop_timer(); + return; + } + + jl_task_t *t = NULL; + int got_mutex = 0; + if (uv_mutex_trylock(&live_tasks_lock) != 0) { + goto collect_backtrace; + } + got_mutex = 1; + + arraylist_t *tasks = jl_get_all_tasks_arraylist(); + uint64_t seed = jl_rand(); + const int n_max_random_attempts = 4; + // randomly select a task that is not done + for (int i = 0; i < n_max_random_attempts; i++) { + t = (jl_task_t*)tasks->items[cong(tasks->len, UINT64_MAX, &seed)]; + assert(t == NULL || jl_is_task(t)); + if (t == NULL) { + continue; + } + int t_state = jl_atomic_load_relaxed(&t->_state); + if (t_state == JL_TASK_STATE_DONE) { + continue; + } + break; + } + arraylist_free(tasks); + free(tasks); + +collect_backtrace: + + uv_mutex_lock(&bt_data_prof_lock); + if (profile_running == 0) { + uv_mutex_unlock(&bt_data_prof_lock); + if (got_mutex) { + uv_mutex_unlock(&live_tasks_lock); + } + return; + } + + jl_record_backtrace_result_t r = {0, INT16_MAX}; + jl_bt_element_t *bt_data_prof = (jl_bt_element_t*)(profile_bt_data_prof + profile_bt_size_cur); + size_t bt_size_max = profile_bt_size_max - profile_bt_size_cur - 1; + if (t == NULL || PROFILE_TASK_DEBUG_FORCE_SAMPLING_FAILURE) { + // failed to find a task + r.bt_size = failed_to_sample_task_fun(bt_data_prof, bt_size_max, 0); + } + else { + if (!PROFILE_TASK_DEBUG_FORCE_STOP_THREAD_FAILURE) { + r = jl_record_backtrace(t, bt_data_prof, bt_size_max, 1); + } + // we failed to get a backtrace + if (r.bt_size == 0) { + r.bt_size = failed_to_stop_thread_fun(bt_data_prof, bt_size_max, 0); + } + } + + // update the profile buffer size + profile_bt_size_cur += r.bt_size; + + // store threadid but add 1 as 0 is preserved to indicate end of block + profile_bt_data_prof[profile_bt_size_cur++].uintptr = (uintptr_t)r.tid + 1; + + // store task id (never null) + profile_bt_data_prof[profile_bt_size_cur++].jlvalue = (jl_value_t*)t; + + // store cpu cycle clock + profile_bt_data_prof[profile_bt_size_cur++].uintptr = cycleclock(); + + // the thread profiler uses this block to record whether the thread is not sleeping (1) or sleeping (2) + // let's use a dummy value which is not 1 or 2 to + // indicate that we are profiling a task, and therefore, this block is not about the thread state + profile_bt_data_prof[profile_bt_size_cur++].uintptr = 3; + + // Mark the end of this block with two 0's + 
profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0; + profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0; + + uv_mutex_unlock(&bt_data_prof_lock); + if (got_mutex) { + uv_mutex_unlock(&live_tasks_lock); + } } static uint64_t jl_last_sigint_trigger = 0; diff --git a/src/signals-mach.c b/src/signals-mach.c index 6ec8f95570f17..9b5d56eaff36c 100644 --- a/src/signals-mach.c +++ b/src/signals-mach.c @@ -603,6 +603,84 @@ void jl_unlock_stackwalk(int lockret) jl_unlock_profile_mach(1, lockret); } +// assumes holding `jl_lock_profile_mach` +void jl_profile_thread_mach(int tid) +{ + // if there is no space left, return early + if (jl_profile_is_buffer_full()) { + jl_profile_stop_timer(); + return; + } + if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL) + _dyld_dlopen_atfork_prepare(); + if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) + _dyld_atfork_prepare(); // briefly acquire the dlsym lock + host_thread_state_t state; + int valid_thread = jl_thread_suspend_and_get_state2(tid, &state); + unw_context_t *uc = (unw_context_t*)&state; + if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) + _dyld_atfork_parent(); // quickly release the dlsym lock + if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL) + _dyld_dlopen_atfork_parent(); + if (!valid_thread) + return; + if (profile_running) { +#ifdef LLVMLIBUNWIND + /* + * Unfortunately compact unwind info is incorrectly generated for quite a number of + * libraries by quite a large number of compilers. We can fall back to DWARF unwind info + * in some cases, but in quite a number of cases (especially libraries not compiled in debug + * mode, only the compact unwind info may be available). Even more unfortunately, there is no + * way to detect such bogus compact unwind info (other than noticing the resulting segfault). + * What we do here is ugly, but necessary until the compact unwind info situation improves. + * We try to use the compact unwind info and if that results in a segfault, we retry with DWARF info. + * Note that in a small number of cases this may result in bogus stack traces, but at least the topmost + * entry will always be correct, and the number of cases in which this is an issue is rather small. + * Other than that, this implementation is not incorrect as the other thread is paused while we are profiling + * and during stack unwinding we only ever read memory, but never write it. 
+ */ + + forceDwarf = 0; + unw_getcontext(&profiler_uc); // will resume from this point if the next lines segfault at any point + + if (forceDwarf == 0) { + // Save the backtrace + profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur, profile_bt_size_max - profile_bt_size_cur - 1, uc, NULL); + } + else if (forceDwarf == 1) { + profile_bt_size_cur += rec_backtrace_ctx_dwarf((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur, profile_bt_size_max - profile_bt_size_cur - 1, uc, NULL); + } + else if (forceDwarf == -1) { + jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n"); + } + + forceDwarf = -2; +#else + profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur, profile_bt_size_max - profile_bt_size_cur - 1, uc, NULL); +#endif + jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; + + // store threadid but add 1 as 0 is preserved to indicate end of block + profile_bt_data_prof[profile_bt_size_cur++].uintptr = ptls->tid + 1; + + // store task id (never null) + profile_bt_data_prof[profile_bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task); + + // store cpu cycle clock + profile_bt_data_prof[profile_bt_size_cur++].uintptr = cycleclock(); + + // store whether thread is sleeping (don't ever encode a state as `0` since is preserved to indicate end of block) + int state = jl_atomic_load_relaxed(&ptls->sleep_check_state) == 0 ? PROFILE_STATE_THREAD_NOT_SLEEPING : PROFILE_STATE_THREAD_SLEEPING; + profile_bt_data_prof[profile_bt_size_cur++].uintptr = state; + + // Mark the end of this block with two 0's + profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0; + profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0; + } + // We're done! Resume the thread. + jl_thread_resume(tid); +} + void *mach_profile_listener(void *arg) { (void)arg; @@ -620,88 +698,21 @@ void *mach_profile_listener(void *arg) // sample each thread, round-robin style in reverse order // (so that thread zero gets notified last) int keymgr_locked = jl_lock_profile_mach(0); - int nthreads = jl_atomic_load_acquire(&jl_n_threads); - int *randperm = profile_get_randperm(nthreads); - for (int idx = nthreads; idx-- > 0; ) { - // Stop the threads in the random or reverse round-robin order. - int i = randperm[idx]; - // if there is no space left, break early - if (jl_profile_is_buffer_full()) { - jl_profile_stop_timer(); - break; - } - - if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL) - _dyld_dlopen_atfork_prepare(); - if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) - _dyld_atfork_prepare(); // briefly acquire the dlsym lock - host_thread_state_t state; - int valid_thread = jl_thread_suspend_and_get_state2(i, &state); - unw_context_t *uc = (unw_context_t*)&state; - if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL) - _dyld_atfork_parent(); // quickly release the dlsym lock - if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL) - _dyld_dlopen_atfork_parent(); - if (!valid_thread) - continue; - if (running) { -#ifdef LLVMLIBUNWIND - /* - * Unfortunately compact unwind info is incorrectly generated for quite a number of - * libraries by quite a large number of compilers. We can fall back to DWARF unwind info - * in some cases, but in quite a number of cases (especially libraries not compiled in debug - * mode, only the compact unwind info may be available). 
Even more unfortunately, there is no - * way to detect such bogus compact unwind info (other than noticing the resulting segfault). - * What we do here is ugly, but necessary until the compact unwind info situation improves. - * We try to use the compact unwind info and if that results in a segfault, we retry with DWARF info. - * Note that in a small number of cases this may result in bogus stack traces, but at least the topmost - * entry will always be correct, and the number of cases in which this is an issue is rather small. - * Other than that, this implementation is not incorrect as the other thread is paused while we are profiling - * and during stack unwinding we only ever read memory, but never write it. - */ - - forceDwarf = 0; - unw_getcontext(&profiler_uc); // will resume from this point if the next lines segfault at any point - - if (forceDwarf == 0) { - // Save the backtrace - bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL); - } - else if (forceDwarf == 1) { - bt_size_cur += rec_backtrace_ctx_dwarf((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL); - } - else if (forceDwarf == -1) { - jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n"); - } - - forceDwarf = -2; -#else - bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL); -#endif - jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[i]; - - // store threadid but add 1 as 0 is preserved to indicate end of block - bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1; - - // store task id (never null) - bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task); - - // store cpu cycle clock - bt_data_prof[bt_size_cur++].uintptr = cycleclock(); - - // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block - bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1; - - // Mark the end of this block with two 0's - bt_data_prof[bt_size_cur++].uintptr = 0; - bt_data_prof[bt_size_cur++].uintptr = 0; + if (profile_all_tasks) { + // Don't take the stackwalk lock here since it's already taken in `jl_record_backtrace` + jl_profile_task(); + } + else { + int *randperm = profile_get_randperm(nthreads); + for (int idx = nthreads; idx-- > 0; ) { + // Stop the threads in random order. + int i = randperm[idx]; + jl_profile_thread_mach(i); } - // We're done! Resume the thread. 
- jl_thread_resume(i); } jl_unlock_profile_mach(0, keymgr_locked); - if (running) { + if (profile_running) { jl_check_profile_autostop(); // Reset the alarm kern_return_t ret = clock_alarm(clk, TIME_RELATIVE, timerprof, profile_port); @@ -710,7 +721,8 @@ void *mach_profile_listener(void *arg) } } -JL_DLLEXPORT int jl_profile_start_timer(void) + +JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks) { kern_return_t ret; if (!profile_started) { @@ -739,7 +751,8 @@ JL_DLLEXPORT int jl_profile_start_timer(void) timerprof.tv_sec = nsecprof/GIGA; timerprof.tv_nsec = nsecprof%GIGA; - running = 1; + profile_running = 1; + profile_all_tasks = all_tasks; // ensure the alarm is running ret = clock_alarm(clk, TIME_RELATIVE, timerprof, profile_port); HANDLE_MACH_ERROR("clock_alarm", ret); @@ -749,5 +762,8 @@ JL_DLLEXPORT int jl_profile_start_timer(void) JL_DLLEXPORT void jl_profile_stop_timer(void) { - running = 0; + uv_mutex_lock(&bt_data_prof_lock); + profile_running = 0; + profile_all_tasks = 0; + uv_mutex_unlock(&bt_data_prof_lock); } diff --git a/src/signals-unix.c b/src/signals-unix.c index 3ebf7954dccfc..1a4071a1e79be 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -9,6 +9,10 @@ #include #include #include + +#include "julia.h" +#include "julia_internal.h" + #if defined(_OS_DARWIN_) && !defined(MAP_ANONYMOUS) #define MAP_ANONYMOUS MAP_ANON #endif @@ -562,7 +566,7 @@ int timer_graceperiod_elapsed(void) static timer_t timerprof; static struct itimerspec itsprof; -JL_DLLEXPORT int jl_profile_start_timer(void) +JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks) { struct sigevent sigprof; @@ -571,10 +575,12 @@ JL_DLLEXPORT int jl_profile_start_timer(void) sigprof.sigev_notify = SIGEV_SIGNAL; sigprof.sigev_signo = SIGUSR1; sigprof.sigev_value.sival_ptr = &timerprof; - // Because SIGUSR1 is multipurpose, set `running` before so that we know that the first SIGUSR1 came from the timer - running = 1; + // Because SIGUSR1 is multipurpose, set `profile_running` before so that we know that the first SIGUSR1 came from the timer + profile_running = 1; + profile_all_tasks = all_tasks; if (timer_create(CLOCK_REALTIME, &sigprof, &timerprof) == -1) { - running = 0; + profile_running = 0; + profile_all_tasks = 0; return -2; } @@ -584,7 +590,8 @@ JL_DLLEXPORT int jl_profile_start_timer(void) itsprof.it_value.tv_sec = nsecprof / GIGA; itsprof.it_value.tv_nsec = nsecprof % GIGA; if (timer_settime(timerprof, 0, &itsprof, NULL) == -1) { - running = 0; + profile_running = 0; + profile_all_tasks = 0; return -3; } return 0; @@ -592,11 +599,13 @@ JL_DLLEXPORT int jl_profile_start_timer(void) JL_DLLEXPORT void jl_profile_stop_timer(void) { - if (running) { + uv_mutex_lock(&bt_data_prof_lock); + if (profile_running) { timer_delete(timerprof); last_timer_delete_time = jl_hrtime(); - running = 0; + profile_running = 0; } + uv_mutex_unlock(&bt_data_prof_lock); } #else @@ -691,7 +700,7 @@ void trigger_profile_peek(void) jl_safe_printf("\n======================================================================================\n"); jl_safe_printf("Information request received. 
A stacktrace will print followed by a %.1f second profile\n", profile_peek_duration); jl_safe_printf("======================================================================================\n"); - if (bt_size_max == 0){ + if (profile_bt_size_max == 0){ // If the buffer hasn't been initialized, initialize with default size // Keep these values synchronized with Profile.default_init() if (jl_profile_init(10000000, 1000000) == -1) { @@ -699,13 +708,62 @@ void trigger_profile_peek(void) return; } } - bt_size_cur = 0; // clear profile buffer - if (jl_profile_start_timer() < 0) + profile_bt_size_cur = 0; // clear profile buffer + if (jl_profile_start_timer(0) < 0) jl_safe_printf("ERROR: Could not start profile timer\n"); else profile_autostop_time = jl_hrtime() + (profile_peek_duration * 1e9); } +// assumes holding `jl_lock_stackwalk` +void jl_profile_thread_unix(int tid, bt_context_t *signal_context) +{ + if (jl_profile_is_buffer_full()) { + // Buffer full: Delete the timer + jl_profile_stop_timer(); + return; + } + // notify thread to stop + if (!jl_thread_suspend_and_get_state(tid, 1, signal_context)) + return; + // unwinding can fail, so keep track of the current state + // and restore from the SEGV handler if anything happens. + jl_jmp_buf *old_buf = jl_get_safe_restore(); + jl_jmp_buf buf; + + jl_set_safe_restore(&buf); + if (jl_setjmp(buf, 0)) { + jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n"); + } else { + // Get backtrace data + profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur, + profile_bt_size_max - profile_bt_size_cur - 1, signal_context, NULL); + } + jl_set_safe_restore(old_buf); + + jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; + + // store threadid but add 1 as 0 is preserved to indicate end of block + profile_bt_data_prof[profile_bt_size_cur++].uintptr = ptls2->tid + 1; + + // store task id (never null) + profile_bt_data_prof[profile_bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls2->current_task); + + // store cpu cycle clock + profile_bt_data_prof[profile_bt_size_cur++].uintptr = cycleclock(); + + // store whether thread is sleeping (don't ever encode a state as `0` since is preserved to indicate end of block) + int state = jl_atomic_load_relaxed(&ptls2->sleep_check_state) == 0 ? 
PROFILE_STATE_THREAD_NOT_SLEEPING : PROFILE_STATE_THREAD_SLEEPING; + profile_bt_data_prof[profile_bt_size_cur++].uintptr = state; + + // Mark the end of this block with two 0's + profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0; + profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0; + + // notify thread to resume + jl_thread_resume(tid); +} + static void *signal_listener(void *arg) { static jl_bt_element_t bt_data[JL_MAX_BT_SIZE + 1]; @@ -805,13 +863,13 @@ static void *signal_listener(void *arg) int doexit = critical; #ifdef SIGINFO if (sig == SIGINFO) { - if (running != 1) + if (profile_running != 1) trigger_profile_peek(); doexit = 0; } #else if (sig == SIGUSR1) { - if (running != 1 && timer_graceperiod_elapsed()) + if (profile_running != 1 && timer_graceperiod_elapsed()) trigger_profile_peek(); doexit = 0; } @@ -845,78 +903,46 @@ static void *signal_listener(void *arg) bt_size = 0; #if !defined(JL_DISABLE_LIBUNWIND) bt_context_t signal_context; - // sample each thread, round-robin style in reverse order - // (so that thread zero gets notified last) - if (critical || profile) { + if (critical) { int lockret = jl_lock_stackwalk(); - int *randperm; - if (profile) - randperm = profile_get_randperm(nthreads); - for (int idx = nthreads; idx-- > 0; ) { - // Stop the threads in the random or reverse round-robin order. - int i = profile ? randperm[idx] : idx; + // sample each thread, round-robin style in reverse order + // (so that thread zero gets notified last) + for (int i = nthreads; i-- > 0; ) { // notify thread to stop if (!jl_thread_suspend_and_get_state(i, 1, &signal_context)) continue; // do backtrace on thread contexts for critical signals // this part must be signal-handler safe - if (critical) { - bt_size += rec_backtrace_ctx(bt_data + bt_size, - JL_MAX_BT_SIZE / nthreads - 1, - &signal_context, NULL); - bt_data[bt_size++].uintptr = 0; - } - - // do backtrace for profiler - if (profile && running) { - if (jl_profile_is_buffer_full()) { - // Buffer full: Delete the timer - jl_profile_stop_timer(); - } - else { - // unwinding can fail, so keep track of the current state - // and restore from the SEGV handler if anything happens. 
- jl_jmp_buf *old_buf = jl_get_safe_restore(); - jl_jmp_buf buf; - - jl_set_safe_restore(&buf); - if (jl_setjmp(buf, 0)) { - jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n"); - } else { - // Get backtrace data - bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, - bt_size_max - bt_size_cur - 1, &signal_context, NULL); - } - jl_set_safe_restore(old_buf); - - jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[i]; - - // store threadid but add 1 as 0 is preserved to indicate end of block - bt_data_prof[bt_size_cur++].uintptr = ptls2->tid + 1; - - // store task id (never null) - bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls2->current_task); - - // store cpu cycle clock - bt_data_prof[bt_size_cur++].uintptr = cycleclock(); - - // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block - bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls2->sleep_check_state) + 1; - - // Mark the end of this block with two 0's - bt_data_prof[bt_size_cur++].uintptr = 0; - bt_data_prof[bt_size_cur++].uintptr = 0; - } - } - - // notify thread to resume + bt_size += rec_backtrace_ctx(bt_data + bt_size, + JL_MAX_BT_SIZE / nthreads - 1, + &signal_context, NULL); + bt_data[bt_size++].uintptr = 0; jl_thread_resume(i); } jl_unlock_stackwalk(lockret); } + else if (profile) { + if (profile_all_tasks) { + // Don't take the stackwalk lock here since it's already taken in `jl_record_backtrace` + jl_profile_task(); + } + else { + int lockret = jl_lock_stackwalk(); + int *randperm = profile_get_randperm(nthreads); + for (int idx = nthreads; idx-- > 0; ) { + // Stop the threads in the random order. + int i = randperm[idx]; + // do backtrace for profiler + if (profile_running) { + jl_profile_thread_unix(i, &signal_context); + } + } + jl_unlock_stackwalk(lockret); + } + } #ifndef HAVE_MACH - if (profile && running) { + if (profile_running) { jl_check_profile_autostop(); #if defined(HAVE_TIMER) timer_settime(timerprof, 0, &itsprof, NULL); diff --git a/src/signals-win.c b/src/signals-win.c index bcb3a1fd246f0..a2293d2d7a2f8 100644 --- a/src/signals-win.c +++ b/src/signals-win.c @@ -398,12 +398,16 @@ static DWORD WINAPI profile_bt( LPVOID lparam ) while (1) { DWORD timeout_ms = nsecprof / (GIGA / 1000); Sleep(timeout_ms > 0 ? 
timeout_ms : 1); - if (running) { + if (profile_running) { if (jl_profile_is_buffer_full()) { jl_profile_stop_timer(); // does not change the thread state SuspendThread(GetCurrentThread()); continue; } + else if (profile_all_tasks) { + // Don't take the stackwalk lock here since it's already taken in `jl_record_backtrace` + jl_profile_task(); + } else { // TODO: bring this up to parity with other OS by adding loop over tid here int lockret = jl_lock_stackwalk(); @@ -415,26 +419,27 @@ static DWORD WINAPI profile_bt( LPVOID lparam ) break; } // Get backtrace data - bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, - bt_size_max - bt_size_cur - 1, &ctxThread, NULL); + profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur, + profile_bt_size_max - profile_bt_size_cur - 1, &ctxThread, NULL); jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[0]; // given only profiling hMainThread - // store threadid but add 1 as 0 is preserved to indicate end of block - bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1; + // META_OFFSET_THREADID store threadid but add 1 as 0 is preserved to indicate end of block + profile_bt_data_prof[profile_bt_size_cur++].uintptr = ptls->tid + 1; - // store task id (never null) - bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task); + // META_OFFSET_TASKID store task id (never null) + profile_bt_data_prof[profile_bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task); - // store cpu cycle clock - bt_data_prof[bt_size_cur++].uintptr = cycleclock(); + // META_OFFSET_CPUCYCLECLOCK store cpu cycle clock + profile_bt_data_prof[profile_bt_size_cur++].uintptr = cycleclock(); - // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block - bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1; + // store whether thread is sleeping (don't ever encode a state as `0` since is preserved to indicate end of block) + int state = jl_atomic_load_relaxed(&ptls->sleep_check_state) == 0 ? PROFILE_STATE_THREAD_NOT_SLEEPING : PROFILE_STATE_THREAD_SLEEPING; + profile_bt_data_prof[profile_bt_size_cur++].uintptr = state; // Mark the end of this block with two 0's - bt_data_prof[bt_size_cur++].uintptr = 0; - bt_data_prof[bt_size_cur++].uintptr = 0; + profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0; + profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0; jl_unlock_stackwalk(lockret); jl_thread_resume(0); jl_check_profile_autostop(); @@ -449,7 +454,7 @@ static DWORD WINAPI profile_bt( LPVOID lparam ) static volatile TIMECAPS timecaps; -JL_DLLEXPORT int jl_profile_start_timer(void) +JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks) { if (hBtThread == NULL) { @@ -477,20 +482,24 @@ JL_DLLEXPORT int jl_profile_start_timer(void) return -2; } } - if (running == 0) { + if (profile_running == 0) { // Failure to change the timer resolution is not fatal. However, it is important to // ensure that the timeBeginPeriod/timeEndPeriod is paired. 
if (TIMERR_NOERROR != timeBeginPeriod(timecaps.wPeriodMin)) timecaps.wPeriodMin = 0; } - running = 1; // set `running` finally + profile_all_tasks = all_tasks; + profile_running = 1; // set `profile_running` finally return 0; } JL_DLLEXPORT void jl_profile_stop_timer(void) { - if (running && timecaps.wPeriodMin) + uv_mutex_lock(&bt_data_prof_lock); + if (profile_running && timecaps.wPeriodMin) timeEndPeriod(timecaps.wPeriodMin); - running = 0; + profile_running = 0; + profile_all_tasks = 0; + uv_mutex_unlock(&bt_data_prof_lock); } void jl_install_default_signal_handlers(void) diff --git a/src/stackwalk.c b/src/stackwalk.c index def7dd9fd6b9f..ccf73d5b5cdd7 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -7,6 +7,7 @@ #include #include "julia.h" #include "julia_internal.h" +#include "gc.h" #include "threading.h" #include "julia_assert.h" @@ -207,7 +208,7 @@ NOINLINE size_t rec_backtrace_ctx(jl_bt_element_t *bt_data, size_t maxsize, // // The first `skip` frames are omitted, in addition to omitting the frame from // `rec_backtrace` itself. -NOINLINE size_t rec_backtrace(jl_bt_element_t *bt_data, size_t maxsize, int skip) +NOINLINE size_t rec_backtrace(jl_bt_element_t *bt_data, size_t maxsize, int skip) JL_NOTSAFEPOINT { bt_context_t context; memset(&context, 0, sizeof(context)); @@ -223,6 +224,24 @@ NOINLINE size_t rec_backtrace(jl_bt_element_t *bt_data, size_t maxsize, int skip return bt_size; } +NOINLINE int failed_to_sample_task_fun(jl_bt_element_t *bt_data, size_t maxsize, int skip) JL_NOTSAFEPOINT +{ + if (maxsize < 1) { + return 0; + } + bt_data[0].uintptr = (uintptr_t) &failed_to_sample_task_fun; + return 1; +} + +NOINLINE int failed_to_stop_thread_fun(jl_bt_element_t *bt_data, size_t maxsize, int skip) JL_NOTSAFEPOINT +{ + if (maxsize < 1) { + return 0; + } + bt_data[0].uintptr = (uintptr_t) &failed_to_stop_thread_fun; + return 1; +} + static jl_value_t *array_ptr_void_type JL_ALWAYS_LEAFTYPE = NULL; // Return backtrace information as an svec of (bt1, bt2, [sp]) // @@ -870,24 +889,32 @@ _os_ptr_munge(uintptr_t ptr) extern bt_context_t *jl_to_bt_context(void *sigctx); -static void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT +JL_DLLEXPORT jl_record_backtrace_result_t jl_record_backtrace(jl_task_t *t, jl_bt_element_t *bt_data, size_t max_bt_size, int all_tasks_profiler) JL_NOTSAFEPOINT { - jl_task_t *ct = jl_current_task; - jl_ptls_t ptls = ct->ptls; - ptls->bt_size = 0; + int16_t tid = INT16_MAX; + jl_record_backtrace_result_t result = {0, tid}; + jl_task_t *ct = NULL; + jl_ptls_t ptls = NULL; + if (!all_tasks_profiler) { + ct = jl_current_task; + ptls = ct->ptls; + ptls->bt_size = 0; + tid = ptls->tid; + } if (t == ct) { - ptls->bt_size = rec_backtrace(ptls->bt_data, JL_MAX_BT_SIZE, 0); - return; + result.bt_size = rec_backtrace(bt_data, max_bt_size, 0); + result.tid = tid; + return result; } bt_context_t *context = NULL; bt_context_t c; int16_t old; - for (old = -1; !jl_atomic_cmpswap(&t->tid, &old, ptls->tid) && old != ptls->tid; old = -1) { + for (old = -1; !jl_atomic_cmpswap(&t->tid, &old, tid) && old != tid; old = -1) { int lockret = jl_lock_stackwalk(); // if this task is already running somewhere, we need to stop the thread it is running on and query its state if (!jl_thread_suspend_and_get_state(old, 0, &c)) { jl_unlock_stackwalk(lockret); - return; + return result; } jl_unlock_stackwalk(lockret); if (jl_atomic_load_relaxed(&t->tid) == old) { @@ -1018,7 +1045,7 @@ static void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT mc->pc = mc->regs[30]; context = &c; #else - 
#pragma message("jl_rec_backtrace not defined for ASM/SETJMP on unknown linux") + #pragma message("jl_record_backtrace not defined for ASM/SETJMP on unknown linux") (void)mc; (void)c; (void)mctx; @@ -1080,7 +1107,7 @@ static void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT mc->__pad = 0; // aka __ra_sign_state = not signed context = &c; #else - #pragma message("jl_rec_backtrace not defined for ASM/SETJMP on unknown darwin") + #pragma message("jl_record_backtrace not defined for ASM/SETJMP on unknown darwin") (void)mctx; (void)c; #endif @@ -1098,23 +1125,28 @@ static void jl_rec_backtrace(jl_task_t *t) JL_NOTSAFEPOINT mc->mc_r15 = ((long*)mctx)[7]; context = &c; #else - #pragma message("jl_rec_backtrace not defined for ASM/SETJMP on unknown system") + #pragma message("jl_record_backtrace not defined for ASM/SETJMP on unknown system") (void)c; #endif #elif defined(JL_HAVE_ASYNCIFY) - #pragma message("jl_rec_backtrace not defined for ASYNCIFY") + #pragma message("jl_record_backtrace not defined for ASYNCIFY") #elif defined(JL_HAVE_SIGALTSTACK) - #pragma message("jl_rec_backtrace not defined for SIGALTSTACK") + #pragma message("jl_record_backtrace not defined for SIGALTSTACK") #else - #pragma message("jl_rec_backtrace not defined for unknown task system") + #pragma message("jl_record_backtrace not defined for unknown task system") #endif } - if (context) - ptls->bt_size = rec_backtrace_ctx(ptls->bt_data, JL_MAX_BT_SIZE, context, t->gcstack); + size_t bt_size = 0; + if (context) { + bt_size = rec_backtrace_ctx(bt_data, max_bt_size, context, all_tasks_profiler ? NULL : t->gcstack); + } if (old == -1) jl_atomic_store_relaxed(&t->tid, old); - else if (old != ptls->tid) + else if (old != tid) jl_thread_resume(old); + result.bt_size = bt_size; + result.tid = old; + return result; } //-------------------------------------------------- @@ -1152,9 +1184,11 @@ JL_DLLEXPORT void jlbacktracet(jl_task_t *t) JL_NOTSAFEPOINT { jl_task_t *ct = jl_current_task; jl_ptls_t ptls = ct->ptls; - jl_rec_backtrace(t); - size_t i, bt_size = ptls->bt_size; + ptls->bt_size = 0; jl_bt_element_t *bt_data = ptls->bt_data; + jl_record_backtrace_result_t r = jl_record_backtrace(t, bt_data, JL_MAX_BT_SIZE, 0); + size_t bt_size = r.bt_size; + size_t i; for (i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) { jl_print_bt_entry_codeloc(-1, bt_data + i); } @@ -1165,7 +1199,6 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT jlbacktrace(); } -extern int gc_first_tid; extern int jl_inside_heartbeat_thread(void); extern int jl_heartbeat_pause(void); extern int jl_heartbeat_resume(void); @@ -1192,7 +1225,7 @@ JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT jl_safe_printf("thread (%d) ++++ Task backtraces\n", ctid); for (size_t i = 0; i < nthreads; i++) { // skip GC threads since they don't have tasks - if (gc_first_tid <= i && i < gc_first_tid + jl_n_gcthreads) { + if (gc_is_parallel_collector_thread(i) || gc_is_concurrent_collector_thread(i)) { continue; } jl_ptls_t ptls2 = allstates[i]; diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl index a0a1ecd1964ed..b8fa3682acc66 100644 --- a/stdlib/Profile/src/Profile.jl +++ b/stdlib/Profile/src/Profile.jl @@ -12,7 +12,7 @@ const nmeta = 4 # number of metadata fields per block (threadid, taskid, cpu_cyc # deprecated functions: use `getdict` instead lookup(ip::UInt) = lookup(convert(Ptr{Cvoid}, ip)) -export @profile +export @profile, @profile_walltime """ @profile @@ -31,6 +31,28 @@ macro profile(ex) end end +""" + 
@profile_walltime + +`@profile_walltime <expression>` runs your expression while taking periodic backtraces of a sample of all live tasks (both running and not running). +These are appended to an internal buffer of backtraces. + +It can be configured via `Profile.init`, the same as `Profile.@profile`; note that you cannot use `@profile` simultaneously with `@profile_walltime`. + +Since this tool samples not only running tasks, but also sleeping tasks and tasks performing I/O, +it can be used to diagnose performance issues such as lock contention, I/O bottlenecks, and other issues that are not visible in the CPU profile. +""" +macro profile_walltime(ex) + return quote + try + start_timer(true) + $(esc(ex)) + finally + stop_timer() + end + end +end + # An internal function called to show the report after an information request (SIGINFO or SIGUSR1). function _peek_report() iob = IOBuffer() @@ -369,9 +391,10 @@ end function has_meta(data) for i in 6:length(data) - data[i] == 0 || continue # first block end null - data[i - 1] == 0 || continue # second block end null - data[i - META_OFFSET_SLEEPSTATE] in 1:2 || continue + data[i] == 0 || continue # first block end null + data[i - 1] == 0 || continue # second block end null + data[i - META_OFFSET_SLEEPSTATE] in 1:3 || continue # 1 for not sleeping, 2 for sleeping, 3 for task profiler fake state + # See definition in `src/julia_internal.h` data[i - META_OFFSET_CPUCYCLECLOCK] != 0 || continue data[i - META_OFFSET_TASKID] != 0 || continue data[i - META_OFFSET_THREADID] != 0 || continue @@ -562,9 +585,9 @@ Julia, and examine the resulting `*.mem` files. clear_malloc_data() = ccall(:jl_clear_malloc_data, Cvoid, ()) # C wrappers -function start_timer() +function start_timer(all_tasks::Bool=false) check_init() # if the profile buffer hasn't been initialized, initialize with default size - status = ccall(:jl_profile_start_timer, Cint, ()) + status = ccall(:jl_profile_start_timer, Cint, (Bool,), all_tasks) if status < 0 error(error_codes[status]) end @@ -676,12 +699,16 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict, startframe = length(data) skip = false nsleeping = 0 + is_task_profile = false for i in startframe:-1:1 (startframe - 1) >= i >= (startframe - (nmeta + 1)) && continue # skip metadata (it's read ahead below) and extra block end NULL IP ip = data[i] if is_block_end(data, i) # read metadata - thread_sleeping = data[i - META_OFFSET_SLEEPSTATE] - 1 # subtract 1 as state is incremented to avoid being equal to 0 + thread_sleeping_state = data[i - META_OFFSET_SLEEPSTATE] - 1 # subtract 1 as state is incremented to avoid being equal to 0 + if thread_sleeping_state == 2 + is_task_profile = true + end # cpu_cycle_clock = data[i - META_OFFSET_CPUCYCLECLOCK] taskid = data[i - META_OFFSET_TASKID] threadid = data[i - META_OFFSET_THREADID] @@ -689,7 +716,7 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict, skip = true continue end - if thread_sleeping == 1 + if thread_sleeping_state == 1 nsleeping += 1 end skip = false @@ -723,12 +750,12 @@ function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict, end end @assert length(lilist) == length(n) == length(m) == length(lilist_idx) - return (lilist, n, m, totalshots, nsleeping) + return (lilist, n, m, totalshots, nsleeping, is_task_profile) end function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfoFlatDict}, cols::Int, fmt::ProfileFormat, threads::Union{Int,AbstractVector{Int}},
tasks::Union{UInt,AbstractVector{UInt}}, is_subsection::Bool) - lilist, n, m, totalshots, nsleeping = parse_flat(fmt.combine ? StackFrame : UInt64, data, lidict, fmt.C, threads, tasks) + lilist, n, m, totalshots, nsleeping, is_task_profile = parse_flat(fmt.combine ? StackFrame : UInt64, data, lidict, fmt.C, threads, tasks) if false # optional: drop the "non-interpretable" ones keep = map(frame -> frame != UNKNOWN && frame.line != 0, lilist) lilist = lilist[keep] @@ -748,11 +775,15 @@ function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfo return true end is_subsection || print_flat(io, lilist, n, m, cols, filenamemap, fmt) - Base.print(io, "Total snapshots: ", totalshots, ". Utilization: ", round(Int, util_perc), "%") + if is_task_profile + Base.print(io, "Total snapshots: ", totalshots, "\n") + else + Base.print(io, "Total snapshots: ", totalshots, ". Utilization: ", round(Int, util_perc), "%") + end if is_subsection println(io) print_flat(io, lilist, n, m, cols, filenamemap, fmt) - else + elseif !is_task_profile Base.print(io, " across all threads and tasks. Use the `groupby` kwarg to break down by thread and/or task.\n") end return false @@ -927,12 +958,16 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI startframe = length(all) skip = false nsleeping = 0 + is_task_profile = false for i in startframe:-1:1 (startframe - 1) >= i >= (startframe - (nmeta + 1)) && continue # skip metadata (it's read ahead below) and extra block end NULL IP ip = all[i] if is_block_end(all, i) # read metadata - thread_sleeping = all[i - META_OFFSET_SLEEPSTATE] - 1 # subtract 1 as state is incremented to avoid being equal to 0 + thread_sleeping_state = all[i - META_OFFSET_SLEEPSTATE] - 1 # subtract 1 as state is incremented to avoid being equal to 0 + if thread_sleeping_state == 2 + is_task_profile = true + end # cpu_cycle_clock = all[i - META_OFFSET_CPUCYCLECLOCK] taskid = all[i - META_OFFSET_TASKID] threadid = all[i - META_OFFSET_THREADID] @@ -941,7 +976,7 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI skip = true continue end - if thread_sleeping == 1 + if thread_sleeping_state == 1 nsleeping += 1 end skip = false @@ -1047,7 +1082,7 @@ function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineI nothing end cleanup!(root) - return root, nsleeping + return root, nsleeping, is_task_profile end function maxstats(root::StackFrameTree) @@ -1116,9 +1151,9 @@ end function tree(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoFlatDict, LineInfoDict}, cols::Int, fmt::ProfileFormat, threads::Union{Int,AbstractVector{Int}}, tasks::Union{UInt,AbstractVector{UInt}}, is_subsection::Bool) if fmt.combine - root, nsleeping = tree!(StackFrameTree{StackFrame}(), data, lidict, fmt.C, fmt.recur, threads, tasks) + root, nsleeping, is_task_profile = tree!(StackFrameTree{StackFrame}(), data, lidict, fmt.C, fmt.recur, threads, tasks) else - root, nsleeping = tree!(StackFrameTree{UInt64}(), data, lidict, fmt.C, fmt.recur, threads, tasks) + root, nsleeping, is_task_profile = tree!(StackFrameTree{UInt64}(), data, lidict, fmt.C, fmt.recur, threads, tasks) end util_perc = (1 - (nsleeping / root.count)) * 100 is_subsection || print_tree(io, root, cols, fmt, is_subsection) @@ -1132,11 +1167,15 @@ function tree(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoFlatDict, Line end return true end - Base.print(io, "Total snapshots: ", root.count, ". 
Utilization: ", round(Int, util_perc), "%") + if is_task_profile + Base.print(io, "Total snapshots: ", root.count, "\n") + else + Base.print(io, "Total snapshots: ", root.count, ". Utilization: ", round(Int, util_perc), "%") + end if is_subsection Base.println(io) print_tree(io, root, cols, fmt, is_subsection) - else + elseif !is_task_profile Base.print(io, " across all threads and tasks. Use the `groupby` kwarg to break down by thread and/or task.\n") end return false diff --git a/stdlib/Profile/test/runtests.jl b/stdlib/Profile/test/runtests.jl index cb40d80d9b756..9537791118e11 100644 --- a/stdlib/Profile/test/runtests.jl +++ b/stdlib/Profile/test/runtests.jl @@ -25,19 +25,64 @@ end end end -busywait(0, 0) # compile -@profile busywait(1, 20) +@noinline function sleeping_tasks(ch::Channel) + for _ in 1:100 + Threads.@spawn take!(ch) + end + sleep(10) +end -let r = Profile.retrieve() - mktemp() do path, io - serialize(io, r) - close(io) - open(path) do io - @test isa(deserialize(io), Tuple{Vector{UInt},Dict{UInt64,Vector{Base.StackTraces.StackFrame}}}) +function test_profile() + let r = Profile.retrieve() + mktemp() do path, io + serialize(io, r) + close(io) + open(path) do io + @test isa(deserialize(io), Tuple{Vector{UInt},Dict{UInt64,Vector{Base.StackTraces.StackFrame}}}) + end + end + end +end + +function test_has_task_profiler_sample_in_buffer() + let r = Profile.retrieve() + mktemp() do path, io + serialize(io, r) + close(io) + open(path) do io + all = deserialize(io) + data = all[1] + startframe = length(data) + for i in startframe:-1:1 + (startframe - 1) >= i >= (startframe - (Profile.nmeta + 1)) && continue # skip metadata (its read ahead below) and extra block end NULL IP + if Profile.is_block_end(data, i) + thread_sleeping_state = data[i - Profile.META_OFFSET_SLEEPSTATE] + @test thread_sleeping_state == 0x3 + end + end + end end end end +busywait(0, 0) # compile + +@profile_walltime busywait(1, 20) +test_profile() + +Profile.clear() + +ch = Channel(1) +@profile_walltime sleeping_tasks(ch) +test_profile() +close(ch) +test_has_task_profiler_sample_in_buffer() + +Profile.clear() + +@profile busywait(1, 20) +test_profile() + let iobuf = IOBuffer() Profile.print(iobuf, format=:tree, C=true) str = String(take!(iobuf))