From 3b2d97c6709e0e74547addef0c9dba32645c38b2 Mon Sep 17 00:00:00 2001
From: Mikhail Bautin <552936+mbautin@users.noreply.github.com>
Date: Thu, 22 Jun 2023 15:05:38 -0700
Subject: [PATCH] [#17875] Do not collect stack traces of threads doing memory allocation

Summary:
When trying to capture a stack trace with a signal handler, if a memory allocation/deallocation is happening in the thread receiving the signal, the process could crash. Google TCMalloc issue: https://github.com/google/tcmalloc/issues/189.

In this diff, we are using the IsCurThreadInAllocDealloc malloc extension API we added in https://github.com/yugabyte/tcmalloc/commit/677ba2d19253354afd0cd8c2181765d589f1b1e4 to skip capturing the stack trace in case the signal interrupted a thread that is currently allocating or deallocating memory. In such cases, we produce an empty stack trace which is later omitted from the overall threads dump. #17889 is a follow-up issue for retrying stack trace collection in such cases.

Another change contained in the TCMalloc version that we are upgrading to is https://github.com/yugabyte/tcmalloc/commit/d1b0e6996a16a6e9e1c726f40785026a4992a9da (adding an option to not seed lifetime profiler with live allocations). We are now setting seed_with_live_allocs to false when capturing an allocation profile.

Test Plan: Jenkins Reviewers: asrivastava Reviewed By: asrivastava Subscribers: ybase, bogdan Differential Revision: https://phorge.dev.yugabyte.com/D26349 --- build-support/thirdparty_archives.yml | 86 +++++++------- requirements_frozen.txt | 2 +- src/yb/server/pprof-path-handler_util-test.cc | 2 +- src/yb/server/pprof-path-handlers_util.cc | 2 +- src/yb/util/debug-util-test.cc | 105 ++++++++++++++++++ src/yb/util/stack_trace.cc | 15 ++- 6 files changed, 165 insertions(+), 47 deletions(-) diff --git a/build-support/thirdparty_archives.yml b/build-support/thirdparty_archives.yml index bf8930fd7db7..0f4c25c4e4bb 100644 --- a/build-support/thirdparty_archives.yml +++ b/build-support/thirdparty_archives.yml @@ -1,149 +1,149 @@ -sha_for_local_checkout: 04b5c61ec3a73ffabdd2faa1f44bebda25193963 +sha_for_local_checkout: 6777477baaa5727cb3eb0d1b8256c1bb9ab4f33e archives: - os_type: almalinux8 architecture: x86_64 compiler_type: clang15 is_linuxbrew: false - sha: 04b5c61ec3a73ffabdd2faa1f44bebda25193963 + sha: 6777477baaa5727cb3eb0d1b8256c1bb9ab4f33e lto_type: - tag: v20230519215502-04b5c61ec3-almalinux8-x86_64-clang15 + tag: v20230621185546-6777477baa-almalinux8-x86_64-clang15 - os_type: almalinux8 architecture: x86_64 compiler_type: clang15 is_linuxbrew: true - sha: 04b5c61ec3a73ffabdd2faa1f44bebda25193963 + sha: 6777477baaa5727cb3eb0d1b8256c1bb9ab4f33e lto_type: - tag: v20230519215522-04b5c61ec3-almalinux8-x86_64-clang15-linuxbrew + tag: v20230621185609-6777477baa-almalinux8-x86_64-clang15-linuxbrew - os_type: almalinux8 architecture: x86_64 compiler_type: clang15 is_linuxbrew: true - sha: 04b5c61ec3a73ffabdd2faa1f44bebda25193963 + sha: 6777477baaa5727cb3eb0d1b8256c1bb9ab4f33e lto_type: full - tag: v20230519215509-04b5c61ec3-almalinux8-x86_64-clang15-linuxbrew-full-lto + tag: v20230621185521-6777477baa-almalinux8-x86_64-clang15-linuxbrew-full-lto - os_type: almalinux8 architecture: x86_64 compiler_type: clang16 is_linuxbrew: false - sha: 
04b5c61ec3a73ffabdd2faa1f44bebda25193963 + sha: 6777477baaa5727cb3eb0d1b8256c1bb9ab4f33e lto_type: - tag: v20230519215509-04b5c61ec3-almalinux8-x86_64-clang16 + tag: v20230621185529-6777477baa-almalinux8-x86_64-clang16 - os_type: almalinux8 architecture: x86_64 compiler_type: clang16 is_linuxbrew: true - sha: 04b5c61ec3a73ffabdd2faa1f44bebda25193963 + sha: 6777477baaa5727cb3eb0d1b8256c1bb9ab4f33e lto_type: - tag: v20230519215503-04b5c61ec3-almalinux8-x86_64-clang16-linuxbrew + tag: v20230621185625-6777477baa-almalinux8-x86_64-clang16-linuxbrew - os_type: almalinux8 architecture: x86_64 compiler_type: clang16 is_linuxbrew: true - sha: 04b5c61ec3a73ffabdd2faa1f44bebda25193963 + sha: 6777477baaa5727cb3eb0d1b8256c1bb9ab4f33e lto_type: full - tag: v20230519215507-04b5c61ec3-almalinux8-x86_64-clang16-linuxbrew-full-lto + tag: v20230621185605-6777477baa-almalinux8-x86_64-clang16-linuxbrew-full-lto - os_type: almalinux8 architecture: x86_64 compiler_type: gcc11 is_linuxbrew: false - sha: 04b5c61ec3a73ffabdd2faa1f44bebda25193963 + sha: 6777477baaa5727cb3eb0d1b8256c1bb9ab4f33e lto_type: - tag: v20230519215506-04b5c61ec3-almalinux8-x86_64-gcc11 + tag: v20230621185524-6777477baa-almalinux8-x86_64-gcc11 - os_type: centos7 architecture: aarch64 compiler_type: clang15 is_linuxbrew: false - sha: 04b5c61ec3a73ffabdd2faa1f44bebda25193963 + sha: 6777477baaa5727cb3eb0d1b8256c1bb9ab4f33e lto_type: - tag: v20230519215639-04b5c61ec3-centos7-aarch64-clang15 + tag: v20230621185659-6777477baa-centos7-aarch64-clang15 - os_type: centos7 architecture: aarch64 compiler_type: clang15 is_linuxbrew: false - sha: 04b5c61ec3a73ffabdd2faa1f44bebda25193963 + sha: 6777477baaa5727cb3eb0d1b8256c1bb9ab4f33e lto_type: full - tag: v20230519215621-04b5c61ec3-centos7-aarch64-clang15-full-lto + tag: v20230621185620-6777477baa-centos7-aarch64-clang15-full-lto - os_type: centos7 architecture: aarch64 compiler_type: clang16 is_linuxbrew: false - sha: 04b5c61ec3a73ffabdd2faa1f44bebda25193963 + sha: 
6777477baaa5727cb3eb0d1b8256c1bb9ab4f33e lto_type: - tag: v20230519215641-04b5c61ec3-centos7-aarch64-clang16 + tag: v20230621185700-6777477baa-centos7-aarch64-clang16 - os_type: centos7 architecture: aarch64 compiler_type: clang16 is_linuxbrew: false - sha: 04b5c61ec3a73ffabdd2faa1f44bebda25193963 + sha: 6777477baaa5727cb3eb0d1b8256c1bb9ab4f33e lto_type: full - tag: v20230519215609-04b5c61ec3-centos7-aarch64-clang16-full-lto + tag: v20230621185616-6777477baa-centos7-aarch64-clang16-full-lto - os_type: centos7 architecture: x86_64 compiler_type: clang15 is_linuxbrew: false - sha: 04b5c61ec3a73ffabdd2faa1f44bebda25193963 + sha: 6777477baaa5727cb3eb0d1b8256c1bb9ab4f33e lto_type: - tag: v20230519215552-04b5c61ec3-centos7-x86_64-clang15 + tag: v20230621185651-6777477baa-centos7-x86_64-clang15 - os_type: centos7 architecture: x86_64 compiler_type: clang15 is_linuxbrew: false - sha: 04b5c61ec3a73ffabdd2faa1f44bebda25193963 + sha: 6777477baaa5727cb3eb0d1b8256c1bb9ab4f33e lto_type: full - tag: v20230519215632-04b5c61ec3-centos7-x86_64-clang15-full-lto + tag: v20230621185543-6777477baa-centos7-x86_64-clang15-full-lto - os_type: centos7 architecture: x86_64 compiler_type: clang16 is_linuxbrew: false - sha: 04b5c61ec3a73ffabdd2faa1f44bebda25193963 + sha: 6777477baaa5727cb3eb0d1b8256c1bb9ab4f33e lto_type: - tag: v20230519215625-04b5c61ec3-centos7-x86_64-clang16 + tag: v20230621185537-6777477baa-centos7-x86_64-clang16 - os_type: centos7 architecture: x86_64 compiler_type: clang16 is_linuxbrew: false - sha: 04b5c61ec3a73ffabdd2faa1f44bebda25193963 + sha: 6777477baaa5727cb3eb0d1b8256c1bb9ab4f33e lto_type: full - tag: v20230519215522-04b5c61ec3-centos7-x86_64-clang16-full-lto + tag: v20230622043321-6777477baa-centos7-x86_64-clang16-full-lto - os_type: centos7 architecture: x86_64 compiler_type: gcc11 is_linuxbrew: false - sha: 04b5c61ec3a73ffabdd2faa1f44bebda25193963 + sha: 6777477baaa5727cb3eb0d1b8256c1bb9ab4f33e lto_type: - tag: v20230519215529-04b5c61ec3-centos7-x86_64-gcc11 + 
tag: v20230621185536-6777477baa-centos7-x86_64-gcc11 - os_type: macos architecture: arm64 compiler_type: clang is_linuxbrew: false - sha: 04b5c61ec3a73ffabdd2faa1f44bebda25193963 + sha: 6777477baaa5727cb3eb0d1b8256c1bb9ab4f33e lto_type: - tag: v20230523091634-04b5c61ec3-macos-arm64 + tag: v20230621193812-6777477baa-macos-arm64 - os_type: macos architecture: x86_64 compiler_type: clang is_linuxbrew: false - sha: 04b5c61ec3a73ffabdd2faa1f44bebda25193963 + sha: 6777477baaa5727cb3eb0d1b8256c1bb9ab4f33e lto_type: - tag: v20230519215557-04b5c61ec3-macos-x86_64 + tag: v20230621185613-6777477baa-macos-x86_64 - os_type: ubuntu20.04 architecture: x86_64 compiler_type: clang15 is_linuxbrew: false - sha: 04b5c61ec3a73ffabdd2faa1f44bebda25193963 + sha: 6777477baaa5727cb3eb0d1b8256c1bb9ab4f33e lto_type: - tag: v20230519215505-04b5c61ec3-ubuntu2004-x86_64-clang15 + tag: v20230621185516-6777477baa-ubuntu2004-x86_64-clang15 - os_type: ubuntu22.04 architecture: x86_64 compiler_type: clang15 is_linuxbrew: false - sha: 04b5c61ec3a73ffabdd2faa1f44bebda25193963 + sha: 6777477baaa5727cb3eb0d1b8256c1bb9ab4f33e lto_type: - tag: v20230519215507-04b5c61ec3-ubuntu2204-x86_64-clang15 + tag: v20230621185524-6777477baa-ubuntu2204-x86_64-clang15 - os_type: ubuntu22.04 architecture: x86_64 compiler_type: gcc11 is_linuxbrew: false - sha: 04b5c61ec3a73ffabdd2faa1f44bebda25193963 + sha: 6777477baaa5727cb3eb0d1b8256c1bb9ab4f33e lto_type: - tag: v20230519215510-04b5c61ec3-ubuntu2204-x86_64-gcc11 + tag: v20230621185622-6777477baa-ubuntu2204-x86_64-gcc11 diff --git a/requirements_frozen.txt b/requirements_frozen.txt index e618f9560a82..6fe14f82ea24 100644 --- a/requirements_frozen.txt +++ b/requirements_frozen.txt @@ -19,7 +19,7 @@ downloadutil==1.0.2 idna==3.4 iniconfig==2.0.0 jmespath==1.0.1 -llvm-installer==1.3.2 +llvm-installer==1.3.4 mypy-extensions==1.0.0 mypy==1.3.0 overrides==7.3.1 diff --git a/src/yb/server/pprof-path-handler_util-test.cc b/src/yb/server/pprof-path-handler_util-test.cc index 
5ace66de9a9a..7ba35c24e296 100644 --- a/src/yb/server/pprof-path-handler_util-test.cc +++ b/src/yb/server/pprof-path-handler_util-test.cc @@ -96,7 +96,7 @@ TEST_F(SamplingProfilerTest, AllocationProfile) { const int64_t alloc_size = 30_MB; tcmalloc::MallocExtension::AllocationProfilingToken token; - token = tcmalloc::MallocExtension::StartLifetimeProfiling(); + token = tcmalloc::MallocExtension::StartLifetimeProfiling(/* seed_with_live_allocs= */ false); // We expect to find this allocation in the profile if and only if only_growth is false, since // it is not deallocated before we stop profiling. diff --git a/src/yb/server/pprof-path-handlers_util.cc b/src/yb/server/pprof-path-handlers_util.cc index 980e7d242f42..0c0cddfec0cc 100644 --- a/src/yb/server/pprof-path-handlers_util.cc +++ b/src/yb/server/pprof-path-handlers_util.cc @@ -49,7 +49,7 @@ tcmalloc::Profile GetAllocationProfile(int seconds, int64_t sample_freq_bytes) { auto prev_sample_rate = tcmalloc::MallocExtension::GetProfileSamplingRate(); tcmalloc::MallocExtension::SetProfileSamplingRate(sample_freq_bytes); tcmalloc::MallocExtension::AllocationProfilingToken token; - token = tcmalloc::MallocExtension::StartLifetimeProfiling(); + token = tcmalloc::MallocExtension::StartLifetimeProfiling(/* seed_with_live_allocs= */ false); LOG(INFO) << Format("Sleeping for $0 seconds while profile is collected.", seconds); SleepFor(MonoDelta::FromSeconds(seconds)); diff --git a/src/yb/util/debug-util-test.cc b/src/yb/util/debug-util-test.cc index 6cd436955573..ab04a80af64a 100644 --- a/src/yb/util/debug-util-test.cc +++ b/src/yb/util/debug-util-test.cc @@ -37,6 +37,7 @@ #include #include #include +#include #include @@ -50,6 +51,10 @@ #include "yb/util/test_util.h" #include "yb/util/thread.h" #include "yb/util/tsan_util.h" +#include "yb/util/test_thread_holder.h" +#include "yb/util/lockfree.h" +#include "yb/util/random_util.h" +#include "yb/util/tostring.h" using std::string; using std::vector; @@ -329,6 +334,106 @@ 
TEST_F(DebugUtilTest, TestConcurrentStackTrace) { } } +TEST_F(DebugUtilTest, TestStackTraceSignalDuringAllocation) { + constexpr size_t kNumThreads = 10; + TestThreadHolder thread_holder; + // Each thread has a queue from which it consumes entries. Each thread will add entries to + // a random thread's queue. + + struct Entry : public MPSCQueueEntry { + char* bytes = nullptr; + + explicit Entry(char* bytes_) : bytes(bytes_) {} + ~Entry() { + if (bytes) { + free(bytes); + bytes = nullptr; + } + } + }; + + std::vector>> queues; + for (size_t i = 0; i < kNumThreads; ++i) { + queues.push_back(std::make_unique>()); + } + + std::mutex thread_ids_mutex; + std::vector thread_ids; + + CountDownLatch start_latch(kNumThreads); + + for (size_t i = 0; i < kNumThreads; ++i) { + thread_holder.AddThreadFunctor([ + &thread_ids_mutex, + &start_latch, + &thread_ids, + &queues, + thread_index = i, + &stop = thread_holder.stop_flag() + ]() { + { + std::lock_guard lock(thread_ids_mutex); + thread_ids.push_back(Thread::CurrentThreadIdForStack()); + } + start_latch.CountDown(); + while (!stop.load(std::memory_order_acquire)) { + if (RandomUniformBool()) { + // Allocate between 1 and 16 KB, with some random jitter. + size_t allocation_size = + (1L << RandomUniformInt(0, 10)) * RandomUniformInt(1, 16) + RandomUniformInt(1, 128); + char* bytes = pointer_cast(malloc(allocation_size)); + size_t target_thread = RandomUniformInt(0, kNumThreads - 1); + Entry* entry = new Entry(bytes); + queues[target_thread]->Push(entry); + } else { + Entry* entry = queues[thread_index]->Pop(); + delete entry; + } + } + }); + } + // Wait until all threads start running. + start_latch.Wait(); + auto deadline = MonoTime::Now() + 10s; + + // Keep dumping thread stacks. 
+ while (MonoTime::Now() < deadline) { + for (size_t i = 0; i < 100; ++i) { + auto stacks = ThreadStacks(thread_ids); + int num_ok = 0; + int num_errors = 0; + int num_empty_stacks = 0; + std::set error_statuses; + for (const auto& stack : stacks) { + if (stack.ok()) { + if (*stack) { + num_ok++; + } else { + num_empty_stacks++; + } + } else { + error_statuses.insert(stack.status().ToString()); + num_errors++; + } + } + if (num_errors || num_empty_stacks) { + LOG(WARNING) << "OK stacks: " << num_ok << ", error stacks: " << num_errors + << ", empty stacks: " << num_empty_stacks + << ", errors statuses: " << ToString(error_statuses); + } + } + } + thread_holder.Stop(); + thread_holder.JoinAll(); + + for (size_t i = 0; i < kNumThreads; ++i) { + auto& queue = queues[i]; + while (auto* entry = queue->Pop()) { + delete entry; + } + } +} + TEST_F(DebugUtilTest, LongOperationTracker) { class TestLogSink : public google::LogSink { public: diff --git a/src/yb/util/stack_trace.cc b/src/yb/util/stack_trace.cc index 6b2417483d52..0f3eb553a572 100644 --- a/src/yb/util/stack_trace.cc +++ b/src/yb/util/stack_trace.cc @@ -30,6 +30,10 @@ #include "yb/util/result.h" #include "yb/util/thread.h" +#if YB_GOOGLE_TCMALLOC +#include +#endif + using namespace std::literals; #if defined(__APPLE__) @@ -159,8 +163,9 @@ struct ThreadStackHelper { void RecordStackTrace(const StackTrace& stack_trace) { auto* entry = allocated.Pop(); + // If entry is nullptr, that means there are not enough allocated entries. In that case, don't + // write a log message since we are in a signal handler. if (entry) { - // Not enough allocated entries, don't write log since we are in signal handler. entry->tid = Thread::CurrentThreadIdForStack(); entry->stack = stack_trace; collected.Push(entry); @@ -180,6 +185,14 @@ ThreadStackHelper thread_stack_helper; void HandleStackTraceSignal(int signum) { int old_errno = errno; StackTrace stack_trace; +#if YB_GOOGLE_TCMALLOC + // TODO(#17889): retry in this case. 
For now, just produce an empty stack trace. + if (tcmalloc::MallocExtension::IsCurThreadInAllocDealloc()) { + thread_stack_helper.RecordStackTrace(stack_trace); + errno = old_errno; + return; + } +#endif stack_trace.Collect(2); thread_stack_helper.RecordStackTrace(stack_trace);