From c0899f29075a919c1a44433979a4ac1da6bac84f Mon Sep 17 00:00:00 2001
From: Noah Watkins
Date: Wed, 28 Jun 2023 16:42:17 -0700
Subject: [PATCH 1/3] rmgmt: improve space management logging

Info level is likely too high for some of these. But this code only
runs in low-disk situations, and the extra logging is useful to have in
the short term for debugging.

Signed-off-by: Noah Watkins
---
 src/v/resource_mgmt/storage.cc | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/src/v/resource_mgmt/storage.cc b/src/v/resource_mgmt/storage.cc
index 1b0c2ccc83c8..fde73e17cf12 100644
--- a/src/v/resource_mgmt/storage.cc
+++ b/src/v/resource_mgmt/storage.cc
@@ -126,6 +126,13 @@ set_partition_retention_offsets(cluster::partition_manager& pm, size_t target) {
         partitions.push_back(p.second);
     }
 
+    vlog(
+      rlog.info,
+      "Attempting to recover {} from {} remote partitions on core {}",
+      human::bytes(target),
+      partitions.size(),
+      ss::this_shard_id());
+
     size_t partitions_total = 0;
     for (const auto& p : partitions) {
         if (partitions_total >= target) {
@@ -137,6 +144,13 @@ set_partition_retention_offsets(cluster::partition_manager& pm, size_t target) {
         auto gate = log->gate().hold();
 
         auto segments = log->cloud_gc_eligible_segments();
+
+        vlog(
+          rlog.info,
+          "Remote partition {} reports {} reclaimable segments",
+          p->ntp(),
+          segments.size());
+
         if (segments.empty()) {
             continue;
         }
@@ -146,6 +160,13 @@ set_partition_retention_offsets(cluster::partition_manager& pm, size_t target) {
         for (const auto& seg : segments) {
             auto usage = co_await seg->persistent_size();
             log_total += usage.total();
+            vlog(
+              rlog.info,
+              "Collecting segment {}:{}-{} estimated to recover {}",
+              p->ntp(),
+              seg->offsets().base_offset(),
+              seg->offsets().committed_offset(),
+              human::bytes(usage.total()));
             if (log_total >= target) {
                 break;
             }
@@ -154,10 +175,12 @@ set_partition_retention_offsets(cluster::partition_manager& pm, size_t target) {
         vlog(
           rlog.info,
           "Setting retention offset override {} estimated reclaim of {} for "
-          "cloud topic {}",
+          "cloud topic {}. Total reclaim {} of target {}.",
           offset,
           log_total,
-          p->ntp());
+          p->ntp(),
+          partitions_total,
+          target);
 
         log->set_cloud_gc_offset(offset);
         partitions_total += log_total;

From 830ebf712a2fff938ecf5e2bc545939e63cefc15 Mon Sep 17 00:00:00 2001
From: Noah Watkins
Date: Wed, 28 Jun 2023 16:42:58 -0700
Subject: [PATCH 2/3] rmgmt: fix bug resulting in slow retention overrides

1. Was only choosing the offset of the first eligible segment
2. Was only reporting one segment from each partition as eligible

Taken together, the reclaim rate depended on how often the control loop
ran rather than on how fast data was uploaded into the cloud.
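As a standalone illustration of bug (2) — a minimal compilable sketch
with invented names and budget, not the code in the tree — the old exit
test `--remaining > 0` is already true after the first decrement, so
the scan breaks after collecting a single segment, while
`--remaining <= 0` breaks only once the segment budget is exhausted:

    // Hypothetical repro, compilable on its own (not Redpanda code).
    #include <cstdio>
    #include <vector>

    int main() {
        const std::vector<int> all_segments = {10, 20, 30, 40, 50};
        std::vector<int> collected;

        int remaining = 3; // invented per-partition segment budget
        for (int seg : all_segments) {
            collected.push_back(seg);
            if (--remaining > 0) { // bug: true on the very first pass
                break;             // so only one segment is returned
            }
        }
        std::printf("buggy: %zu segment(s)\n", collected.size()); // 1

        collected.clear();
        remaining = 3;
        for (int seg : all_segments) {
            collected.push_back(seg);
            if (--remaining <= 0) { // fixed: stop when budget is spent
                break;
            }
        }
        std::printf("fixed: %zu segment(s)\n", collected.size()); // 3
        return 0;
    }

With the fix, each pass can hand the retention machinery as many
segments as the budget allows, which is what makes the slower control
loop in the next patch viable.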
Signed-off-by: Noah Watkins
---
 src/v/resource_mgmt/storage.cc | 1 +
 src/v/storage/disk_log_impl.cc | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/v/resource_mgmt/storage.cc b/src/v/resource_mgmt/storage.cc
index fde73e17cf12..39ffb9013795 100644
--- a/src/v/resource_mgmt/storage.cc
+++ b/src/v/resource_mgmt/storage.cc
@@ -160,6 +160,7 @@ set_partition_retention_offsets(cluster::partition_manager& pm, size_t target) {
         for (const auto& seg : segments) {
             auto usage = co_await seg->persistent_size();
             log_total += usage.total();
+            offset = seg->offsets().committed_offset;
             vlog(
               rlog.info,
               "Collecting segment {}:{}-{} estimated to recover {}",
diff --git a/src/v/storage/disk_log_impl.cc b/src/v/storage/disk_log_impl.cc
index a15596646782..83106634aaa0 100644
--- a/src/v/storage/disk_log_impl.cc
+++ b/src/v/storage/disk_log_impl.cc
@@ -2231,7 +2231,7 @@ disk_log_impl::cloud_gc_eligible_segments() {
         if (seg->offsets().committed_offset <= max_collectible) {
             segments.push_back(seg);
         }
-        if (--remaining > 0) {
+        if (--remaining <= 0) {
             break;
         }
     }

From 9f60e731e16023612d4617cb9e6a9bdad9495680 Mon Sep 17 00:00:00 2001
From: Noah Watkins
Date: Wed, 28 Jun 2023 16:44:30 -0700
Subject: [PATCH 3/3] rmgmt: decrease control loop frequency

This is now possible thanks to the bug fix in the previous commit.

Also reduce the timeout on the test, since it completes much faster
without that bug!

Signed-off-by: Noah Watkins
---
 src/v/resource_mgmt/storage.cc       | 2 +-
 tests/rptest/tests/full_disk_test.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/v/resource_mgmt/storage.cc b/src/v/resource_mgmt/storage.cc
index 39ffb9013795..828c54a5141e 100644
--- a/src/v/resource_mgmt/storage.cc
+++ b/src/v/resource_mgmt/storage.cc
@@ -84,7 +84,7 @@ ss::future<> disk_space_manager::run_loop() {
      * upcall to start the monitor loop when it appears that we are getting
      * close to an important threshold.
      */
-    constexpr auto frequency = std::chrono::seconds(5);
+    constexpr auto frequency = std::chrono::seconds(20);
 
     while (!_gate.is_closed()) {
         try {
diff --git a/tests/rptest/tests/full_disk_test.py b/tests/rptest/tests/full_disk_test.py
index 796f96d09f04..bec4d392bb55 100644
--- a/tests/rptest/tests/full_disk_test.py
+++ b/tests/rptest/tests/full_disk_test.py
@@ -593,4 +593,4 @@ def target_size_reached():
             return total < (15 * 2**20 + 2 * self.log_segment_size)
 
         # give it plenty of time. on debug it is hella slow
-        wait_until(target_size_reached, timeout_sec=240, backoff_sec=5)
+        wait_until(target_size_reached, timeout_sec=120, backoff_sec=5)
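Reviewer note on bug (1) from the second patch: before the one-line
`offset = seg->offsets().committed_offset;` addition, the override kept
the committed offset of the first eligible segment, so each control-loop
pass could reclaim at most one segment regardless of how many had been
collected. A minimal sketch of the fixed accumulation — hypothetical
types, offsets, and sizes, not the Redpanda code:

    // Hypothetical sketch of the fixed offset accumulation.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct segment {
        int64_t committed_offset; // last offset covered by this segment
        uint64_t size_bytes;      // estimated on-disk footprint
    };

    int main() {
        // Three invented 1 MiB segments, oldest first.
        const std::vector<segment> eligible = {
          {99, 1 << 20}, {199, 1 << 20}, {299, 1 << 20}};

        const uint64_t target = 2u << 20; // reclaim ~2 MiB
        uint64_t log_total = 0;
        int64_t offset = -1;

        for (const auto& seg : eligible) {
            log_total += seg.size_bytes;
            // The fix: advance the override past every collected
            // segment, not just the first one.
            offset = seg.committed_offset;
            if (log_total >= target) {
                break;
            }
        }

        // Prints "override offset 199 reclaims ~2097152 bytes": both
        // collected segments. Before the fix, offset stayed at 99.
        std::printf(
          "override offset %lld reclaims ~%llu bytes\n",
          static_cast<long long>(offset),
          static_cast<unsigned long long>(log_total));
        return 0;
    }

With both fixes in place a single pass can satisfy the whole
per-partition target, which is why the loop period can grow from 5s to
20s while the test timeout shrinks from 240s to 120s.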