From c0899f29075a919c1a44433979a4ac1da6bac84f Mon Sep 17 00:00:00 2001
From: Noah Watkins
Date: Wed, 28 Jun 2023 16:42:17 -0700
Subject: [PATCH 1/3] rmgmt: improve space management logging

Info level is likely too high for some of these. But this code only
runs in low-disk situations, and the extra logging is useful to have in
the short term for debugging.

Signed-off-by: Noah Watkins
---
 src/v/resource_mgmt/storage.cc | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/src/v/resource_mgmt/storage.cc b/src/v/resource_mgmt/storage.cc
index 1b0c2ccc83c8..fde73e17cf12 100644
--- a/src/v/resource_mgmt/storage.cc
+++ b/src/v/resource_mgmt/storage.cc
@@ -126,6 +126,13 @@ set_partition_retention_offsets(cluster::partition_manager& pm, size_t target) {
         partitions.push_back(p.second);
     }
 
+    vlog(
+      rlog.info,
+      "Attempting to recover {} from {} remote partitions on core {}",
+      human::bytes(target),
+      partitions.size(),
+      ss::this_shard_id());
+
     size_t partitions_total = 0;
     for (const auto& p : partitions) {
         if (partitions_total >= target) {
@@ -137,6 +144,13 @@ set_partition_retention_offsets(cluster::partition_manager& pm, size_t target) {
         auto gate = log->gate().hold();
 
         auto segments = log->cloud_gc_eligible_segments();
+
+        vlog(
+          rlog.info,
+          "Remote partition {} reports {} reclaimable segments",
+          p->ntp(),
+          segments.size());
+
         if (segments.empty()) {
             continue;
         }
@@ -146,6 +160,13 @@ set_partition_retention_offsets(cluster::partition_manager& pm, size_t target) {
         for (const auto& seg : segments) {
             auto usage = co_await seg->persistent_size();
             log_total += usage.total();
+            vlog(
+              rlog.info,
+              "Collecting segment {}:{}-{} estimated to recover {}",
+              p->ntp(),
+              seg->offsets().base_offset(),
+              seg->offsets().committed_offset(),
+              human::bytes(usage.total()));
             if (log_total >= target) {
                 break;
             }
@@ -154,10 +175,12 @@ set_partition_retention_offsets(cluster::partition_manager& pm, size_t target) {
         vlog(
           rlog.info,
           "Setting retention offset override {} estimated reclaim of {} for "
-          "cloud topic {}",
+          "cloud topic {}. Total reclaim {} of target {}.",
           offset,
           log_total,
-          p->ntp());
+          p->ntp(),
+          partitions_total,
+          target);
 
         log->set_cloud_gc_offset(offset);
         partitions_total += log_total;

From 830ebf712a2fff938ecf5e2bc545939e63cefc15 Mon Sep 17 00:00:00 2001
From: Noah Watkins
Date: Wed, 28 Jun 2023 16:42:58 -0700
Subject: [PATCH 2/3] rmgmt: fix bug resulting in slow retention overrides

1. Was only choosing the offset of the first eligible segment
2. Was only reporting one segment from each partition as eligible

Taken together, the reclaim rate depended on how often the control loop
ran rather than on how fast data was uploaded into the cloud.
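As a standalone illustration of bug (2) — a minimal compilable sketch
with invented names and budget, not the code in the tree — the old exit
test `--remaining > 0` is already true after the first decrement, so
the scan breaks after collecting a single segment, while
`--remaining <= 0` breaks only once the segment budget is exhausted:

    // Hypothetical repro, compilable on its own (not Redpanda code).
    #include <cstdio>
    #include <vector>

    int main() {
        const std::vector<int> all_segments = {10, 20, 30, 40, 50};
        std::vector<int> collected;

        int remaining = 3; // invented per-partition segment budget
        for (int seg : all_segments) {
            collected.push_back(seg);
            if (--remaining > 0) { // bug: true on the very first pass
                break;             // so only one segment is returned
            }
        }
        std::printf("buggy: %zu segment(s)\n", collected.size()); // 1

        collected.clear();
        remaining = 3;
        for (int seg : all_segments) {
            collected.push_back(seg);
            if (--remaining <= 0) { // fixed: stop when budget is spent
                break;
            }
        }
        std::printf("fixed: %zu segment(s)\n", collected.size()); // 3
        return 0;
    }

With the fix, each pass can hand the retention machinery as many
segments as the budget allows, which is what makes the slower control
loop in the next patch viable.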
Signed-off-by: Noah Watkins
---
 src/v/resource_mgmt/storage.cc | 1 +
 src/v/storage/disk_log_impl.cc | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/v/resource_mgmt/storage.cc b/src/v/resource_mgmt/storage.cc
index fde73e17cf12..39ffb9013795 100644
--- a/src/v/resource_mgmt/storage.cc
+++ b/src/v/resource_mgmt/storage.cc
@@ -160,6 +160,7 @@ set_partition_retention_offsets(cluster::partition_manager& pm, size_t target) {
         for (const auto& seg : segments) {
             auto usage = co_await seg->persistent_size();
             log_total += usage.total();
+            offset = seg->offsets().committed_offset;
             vlog(
               rlog.info,
               "Collecting segment {}:{}-{} estimated to recover {}",
diff --git a/src/v/storage/disk_log_impl.cc b/src/v/storage/disk_log_impl.cc
index a15596646782..83106634aaa0 100644
--- a/src/v/storage/disk_log_impl.cc
+++ b/src/v/storage/disk_log_impl.cc
@@ -2231,7 +2231,7 @@ disk_log_impl::cloud_gc_eligible_segments() {
         if (seg->offsets().committed_offset <= max_collectible) {
             segments.push_back(seg);
         }
-        if (--remaining > 0) {
+        if (--remaining <= 0) {
             break;
         }
     }

From 9f60e731e16023612d4617cb9e6a9bdad9495680 Mon Sep 17 00:00:00 2001
From: Noah Watkins
Date: Wed, 28 Jun 2023 16:44:30 -0700
Subject: [PATCH 3/3] rmgmt: decrease control loop frequency

This is now possible thanks to the bug fix in the previous commit.

Also reduce the timeout on the test, since it completes much faster
without that bug!

Signed-off-by: Noah Watkins
---
 src/v/resource_mgmt/storage.cc       | 2 +-
 tests/rptest/tests/full_disk_test.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/v/resource_mgmt/storage.cc b/src/v/resource_mgmt/storage.cc
index 39ffb9013795..828c54a5141e 100644
--- a/src/v/resource_mgmt/storage.cc
+++ b/src/v/resource_mgmt/storage.cc
@@ -84,7 +84,7 @@ ss::future<> disk_space_manager::run_loop() {
      * upcall to start the monitor loop when it appears that we are getting
      * close to an important threshold.
      */
-    constexpr auto frequency = std::chrono::seconds(5);
+    constexpr auto frequency = std::chrono::seconds(20);
 
     while (!_gate.is_closed()) {
         try {
diff --git a/tests/rptest/tests/full_disk_test.py b/tests/rptest/tests/full_disk_test.py
index 796f96d09f04..bec4d392bb55 100644
--- a/tests/rptest/tests/full_disk_test.py
+++ b/tests/rptest/tests/full_disk_test.py
@@ -593,4 +593,4 @@ def target_size_reached():
             return total < (15 * 2**20 + 2 * self.log_segment_size)
 
         # give it plenty of time. on debug it is hella slow
-        wait_until(target_size_reached, timeout_sec=240, backoff_sec=5)
+        wait_until(target_size_reached, timeout_sec=120, backoff_sec=5)
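Reviewer note on bug (1) from the second patch: before the one-line
`offset = seg->offsets().committed_offset;` addition, the override kept
the committed offset of the first eligible segment, so each control-loop
pass could reclaim at most one segment regardless of how many had been
collected. A minimal sketch of the fixed accumulation — hypothetical
types, offsets, and sizes, not the Redpanda code:

    // Hypothetical sketch of the fixed offset accumulation.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct segment {
        int64_t committed_offset; // last offset covered by this segment
        uint64_t size_bytes;      // estimated on-disk footprint
    };

    int main() {
        // Three invented 1 MiB segments, oldest first.
        const std::vector<segment> eligible = {
          {99, 1 << 20}, {199, 1 << 20}, {299, 1 << 20}};

        const uint64_t target = 2u << 20; // reclaim ~2 MiB
        uint64_t log_total = 0;
        int64_t offset = -1;

        for (const auto& seg : eligible) {
            log_total += seg.size_bytes;
            // The fix: advance the override past every collected
            // segment, not just the first one.
            offset = seg.committed_offset;
            if (log_total >= target) {
                break;
            }
        }

        // Prints "override offset 199 reclaims ~2097152 bytes": both
        // collected segments. Before the fix, offset stayed at 99.
        std::printf(
          "override offset %lld reclaims ~%llu bytes\n",
          static_cast<long long>(offset),
          static_cast<unsigned long long>(log_total));
        return 0;
    }

With both fixes in place a single pass can satisfy the whole
per-partition target, which is why the loop period can grow from 5s to
20s while the test timeout shrinks from 240s to 120s.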