Skip to content

Commit 8222377

Browse files
authored
Merge pull request #29794 from redpanda-data/ct/core-15649/gc-auto-pause
2 parents c69e015 + 7fe2133 commit 8222377

File tree

14 files changed

+497
-64
lines changed

14 files changed

+497
-64
lines changed

proto/redpanda/core/admin/internal/cloud_topics/v1/level_zero.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ enum Status {
174174
L0_GC_STATUS_STOPPING = 3;
175175
L0_GC_STATUS_STOPPED = 4;
176176
L0_GC_STATUS_RESETTING = 5;
177+
L0_GC_STATUS_SAFETY_BLOCKED = 6;
177178
}
178179

179180
// Request the L0 size estimate for a single partition.

src/v/cloud_topics/level_zero/gc/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ redpanda_cc_library(
3030
"//src/v/cloud_topics:logger",
3131
"//src/v/cloud_topics:object_utils",
3232
"//src/v/cluster",
33+
"//src/v/config",
3334
"//src/v/ssx:semaphore",
3435
"//src/v/ssx:work_queue",
3536
],

src/v/cloud_topics/level_zero/gc/level_zero_gc.cc

Lines changed: 100 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "cluster/health_monitor_frontend.h"
2020
#include "cluster/members_table.h"
2121
#include "cluster/topic_table.h"
22+
#include "config/configuration.h"
2223
#include "ssx/semaphore.h"
2324
#include "ssx/work_queue.h"
2425

@@ -32,7 +33,8 @@
3233

3334
namespace {
3435
constexpr ss::lowres_clock::duration control_timeout = 5s;
35-
}
36+
constexpr ss::lowres_clock::duration health_report_query_timeout = 10s;
37+
} // namespace
3638

3739
namespace cloud_topics {
3840

@@ -484,7 +486,6 @@ class epoch_source_impl : public level_zero_gc::epoch_source {
484486
* Get a recent health report. Partitions use the health reporting
485487
* mechanism to self-report their max GC eligible epoch.
486488
*/
487-
constexpr auto health_report_query_timeout = 10s;
488489

489490
auto health_report
490491
= co_await health_monitor_->local().get_cluster_health(
@@ -615,13 +616,82 @@ class node_info_impl : public level_zero_gc::node_info {
615616
seastar::sharded<cluster::members_table>* members_table_;
616617
};
617618

619+
class cluster_safety_monitor : public level_zero_gc::safety_monitor {
620+
public:
621+
explicit cluster_safety_monitor(
622+
seastar::sharded<cluster::health_monitor_frontend>* health_monitor,
623+
config::binding<std::chrono::milliseconds> check_interval)
624+
: health_monitor_(health_monitor)
625+
, check_interval_(std::move(check_interval))
626+
, cached_result_{.ok = false, .reason = "awaiting first health check"}
627+
, poll_loop_(do_poll_loop()) {}
628+
629+
result can_proceed() const override { return cached_result_; }
630+
631+
void start() override { started_ = true; }
632+
633+
seastar::future<> stop() override {
634+
started_ = false;
635+
as_.request_abort();
636+
co_await std::exchange(poll_loop_, seastar::make_ready_future<>());
637+
}
638+
639+
private:
640+
seastar::future<> do_poll_loop() noexcept {
641+
while (!as_.abort_requested()) {
642+
if (started_) {
643+
auto poll_fut = co_await ss::coroutine::as_future(
644+
poll_health());
645+
if (poll_fut.failed()) {
646+
auto ex = poll_fut.get_exception();
647+
cached_result_ = result{
648+
.ok = false,
649+
.reason = fmt::format("health check failed: {}", ex)};
650+
}
651+
}
652+
653+
auto sleep_fut = co_await seastar::coroutine::as_future(
654+
seastar::sleep_abortable(check_interval_(), as_));
655+
if (sleep_fut.failed()) {
656+
sleep_fut.ignore_ready_future();
657+
break;
658+
}
659+
}
660+
}
661+
662+
seastar::future<> poll_health() {
663+
auto overview
664+
= co_await health_monitor_->local().get_cluster_health_overview(
665+
model::timeout_clock::now() + health_report_query_timeout);
666+
667+
if (overview.is_healthy()) {
668+
cached_result_ = result{.ok = true, .reason = std::nullopt};
669+
} else {
670+
cached_result_ = result{
671+
.ok = false,
672+
.reason = overview.unhealthy_reasons.empty()
673+
? "cluster unhealthy"
674+
: overview.unhealthy_reasons.front()};
675+
}
676+
}
677+
678+
seastar::sharded<cluster::health_monitor_frontend>* health_monitor_;
679+
config::binding<std::chrono::milliseconds> check_interval_;
680+
result cached_result_;
681+
bool started_{false};
682+
seastar::abort_source as_;
683+
seastar::future<> poll_loop_;
684+
};
685+
618686
level_zero_gc::level_zero_gc(
619687
level_zero_gc_config config,
620688
std::unique_ptr<object_storage> storage,
621689
std::unique_ptr<epoch_source> epoch_source,
622-
std::unique_ptr<node_info> node_info)
690+
std::unique_ptr<node_info> node_info,
691+
std::unique_ptr<safety_monitor> safety_monitor)
623692
: config_(std::move(config))
624693
, epoch_source_(std::move(epoch_source))
694+
, safety_monitor_(std::move(safety_monitor))
625695
, should_run_(false) // begin in a stopped state
626696
, should_shutdown_(false)
627697
, worker_(worker())
@@ -654,7 +724,11 @@ level_zero_gc::level_zero_gc(
654724
std::make_unique<object_storage_remote_impl>(remote, std::move(bucket)),
655725
std::make_unique<epoch_source_impl>(
656726
health_monitor, controller_stm, topic_table),
657-
std::make_unique<node_info_impl>(self, members_table)) {}
727+
std::make_unique<node_info_impl>(self, members_table),
728+
std::make_unique<cluster_safety_monitor>(
729+
health_monitor,
730+
config::shard_local_cfg()
731+
.cloud_topics_gc_health_check_interval.bind())) {}
658732

659733
level_zero_gc::~level_zero_gc() = default;
660734

@@ -665,6 +739,7 @@ seastar::future<> level_zero_gc::start() {
665739
}
666740
vlog(cd_log.info, "Starting cloud topics L0 GC worker");
667741
delete_worker_->start();
742+
safety_monitor_->start();
668743
should_run_ = true;
669744
worker_cv_.signal();
670745
}
@@ -687,6 +762,7 @@ seastar::future<> level_zero_gc::stop() {
687762
worker_cv_.signal();
688763
co_await delete_worker_->stop();
689764
co_await std::exchange(worker_, seastar::make_ready_future<>());
765+
co_await safety_monitor_->stop();
690766
vlog(cd_log.info, "Stopped cloud_topics L0 GC worker");
691767
}
692768

@@ -732,6 +808,8 @@ std::string_view to_string_view(level_zero_gc::state s) {
732808
return "level_zero_gc::state::stopping";
733809
case stopped:
734810
return "level_zero_gc::state::stopped";
811+
case safety_blocked:
812+
return "level_zero_gc::state::safety_blocked";
735813
}
736814
vunreachable("Unrecognized GC state: {}", s);
737815
}
@@ -746,7 +824,11 @@ auto level_zero_gc::get_state() const -> state {
746824
if (resetting_) {
747825
return state::resetting;
748826
}
749-
return should_run_ ? state::running : state::paused;
827+
if (!should_run_) {
828+
return state::paused;
829+
}
830+
return safety_monitor_->can_proceed().ok ? state::running
831+
: state::safety_blocked;
750832
}();
751833
vlog(cd_log.debug, "cloud_topics L0 GC worker state: {}", st);
752834
return st;
@@ -779,6 +861,19 @@ seastar::future<> level_zero_gc::worker() {
779861
// ensure that the abort source is unreferenced at this time.
780862
asrc_ = {};
781863

864+
if (auto safety = safety_monitor_->can_proceed(); !safety.ok) {
865+
vlog(
866+
cd_log.debug,
867+
"L0 GC blocked by safety monitor: {}",
868+
safety.reason.value_or("unknown"));
869+
probe_.safety_blocked();
870+
(co_await seastar::coroutine::as_future(
871+
seastar::sleep_abortable(
872+
config_.throttle_no_progress(), asrc_)))
873+
.ignore_ready_future();
874+
continue;
875+
}
876+
782877
if (backoff.count() > 0) {
783878
auto t0 = ss::lowres_clock::now();
784879
(co_await seastar::coroutine::as_future(

src/v/cloud_topics/level_zero/gc/level_zero_gc.h

Lines changed: 101 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,74 @@ struct level_zero_gc_config {
143143
config::binding<std::chrono::milliseconds> throttle_no_progress;
144144
};
145145

146+
/*
147+
* State Machine
148+
* =============
149+
*
150+
* Construction
151+
* |
152+
* v
153+
* +--------+
154+
* | paused |<-----------------------------+
155+
* +---+----+ |
156+
* | start() |
157+
* v |
158+
* +----------------+ |
159+
* | worker loop top|<-----------+ |
160+
* +-------+--------+ | |
161+
* | | |
162+
* | check safety | |
163+
* | | |
164+
* ok +-+-+ !ok | |
165+
* +---+ +----+ | |
166+
* v v | |
167+
* +---------+ +---------------+ | |
168+
* | running | |safety_blocked | | |
169+
* | | | | | |
170+
* | backoff | | increment | | |
171+
* | then | | probe, | | |
172+
* | collect | | sleep | | |
173+
* +----+----+ +-------+-------+ | |
174+
* | | | |
175+
* +------+-------+ | |
176+
* | | |
177+
* +-----------------+ |
178+
* loop back |
179+
* |
180+
* pause() from running or safety_blocked |
181+
* ----------------------------------------+
182+
*
183+
* reset() from any non-stopped state:
184+
*
185+
* +----------+
186+
* |resetting | drains delete worker, then
187+
* +----+-----+ restores prior run/pause state
188+
* |
189+
* v
190+
* was_running?
191+
* yes -> running
192+
* no -> paused
193+
*
194+
* stop() from any state:
195+
*
196+
* +---------+ worker done +---------+
197+
* |stopping |-------------->| stopped |
198+
* +---------+ +---------+
199+
* (terminal)
200+
*
201+
* The running/safety_blocked distinction is not an
202+
* explicit transition. Both are phases of the worker
203+
* loop -- each iteration checks can_proceed() and
204+
* takes the corresponding branch. The state returned
205+
* by get_state() reflects whichever branch would be
206+
* taken at query time.
207+
*
208+
* get_state() priority:
209+
* should_shutdown_ > resetting_ > !should_run_ >
210+
* safety_monitor_.can_proceed()
211+
*
212+
* start()/pause() block while resetting_ is true.
213+
*/
146214
class level_zero_gc {
147215
public:
148216
/*
@@ -253,6 +321,33 @@ class level_zero_gc {
253321
virtual size_t total_shards() const = 0;
254322
};
255323

324+
/// Interface for gating GC on system-level safety conditions.
325+
///
326+
/// Production implementations poll external signals (e.g. cluster health)
327+
/// in a background fiber and cache the result so that can_proceed() is
328+
/// synchronous and cheap. This is orthogonal to admin pause/start — even
329+
/// if an operator calls start(), GC will not proceed while the safety
330+
/// monitor reports not-ok.
331+
class safety_monitor {
332+
public:
333+
safety_monitor() = default;
334+
safety_monitor(const safety_monitor&) = delete;
335+
safety_monitor(safety_monitor&&) = delete;
336+
safety_monitor& operator=(const safety_monitor&) = delete;
337+
safety_monitor& operator=(safety_monitor&&) = delete;
338+
virtual ~safety_monitor() = default;
339+
340+
struct result {
341+
bool ok;
342+
std::optional<ss::sstring> reason;
343+
};
344+
345+
virtual result can_proceed() const = 0;
346+
347+
virtual void start() {}
348+
virtual seastar::future<> stop() { return seastar::now(); }
349+
};
350+
256351
public:
257352
/*
258353
* Construct with the given storage and epoch providers. This interface is
@@ -262,7 +357,8 @@ class level_zero_gc {
262357
level_zero_gc_config,
263358
std::unique_ptr<object_storage>,
264359
std::unique_ptr<epoch_source>,
265-
std::unique_ptr<node_info>);
360+
std::unique_ptr<node_info>,
361+
std::unique_ptr<safety_monitor>);
266362

267363
/*
268364
* Construct with default implementations of storage and epoch providers.
@@ -308,13 +404,16 @@ class level_zero_gc {
308404
* - resetting: reset() is draining in-flight work
309405
* - stopping: stop() requested but there may be work still in flight
310406
* - stopped: Permanently stopped.
407+
* - safety_blocked: GC is started but the safety monitor is preventing
408+
* collection (e.g. cluster unhealthy)
311409
*/
312410
enum class state : uint8_t {
313411
paused,
314412
running,
315413
resetting,
316414
stopping,
317415
stopped,
416+
safety_blocked,
318417
};
319418

320419
/**
@@ -325,6 +424,7 @@ class level_zero_gc {
325424
private:
326425
level_zero_gc_config config_;
327426
std::unique_ptr<epoch_source> epoch_source_;
427+
std::unique_ptr<safety_monitor> safety_monitor_;
328428

329429
bool should_run_;
330430
bool should_shutdown_;

src/v/cloud_topics/level_zero/gc/level_zero_gc_probe.cc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,13 @@ void level_zero_gc_probe::setup_internal_metrics(bool disable) {
9595
"Cumulative time in seconds spent in backoff between L0 "
9696
"garbage collection rounds."),
9797
labels),
98+
sm::make_counter(
99+
"safety_blocked_rounds_total",
100+
[this] { return safety_blocked_rounds_; },
101+
sm::description(
102+
"Number of L0 GC rounds skipped because the safety monitor "
103+
"reported an unsafe condition."),
104+
labels),
98105
sm::make_gauge(
99106
"min_partition_gc_epoch",
100107
[this] {

src/v/cloud_topics/level_zero/gc/level_zero_gc_probe.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ class level_zero_gc_probe {
3333
void list_error() { list_errors_++; }
3434
void delete_error() { delete_errors_++; }
3535
void add_backpressure(double seconds) { backpressure_seconds_ += seconds; }
36+
void safety_blocked() { safety_blocked_rounds_++; }
3637
void set_max_gc_eligible_epoch(cluster_epoch epoch) {
3738
max_gc_eligible_epoch_ = epoch;
3839
}
@@ -59,6 +60,7 @@ class level_zero_gc_probe {
5960
uint64_t list_errors_{0};
6061
uint64_t delete_errors_{0};
6162
double backpressure_seconds_{0};
63+
uint64_t safety_blocked_rounds_{0};
6264

6365
std::optional<cluster_epoch> min_partition_gc_epoch_;
6466
std::optional<cluster_epoch> max_gc_eligible_epoch_;

src/v/cloud_topics/level_zero/gc/tests/level_zero_gc_mt_test.cc

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,14 @@ class mt_node_info : public level_zero_gc::node_info {
171171
size_t total_shards() const override { return ss::smp::count; }
172172
};
173173

174+
class safety_monitor_test_impl
175+
: public cloud_topics::level_zero_gc::safety_monitor {
176+
public:
177+
result can_proceed() const override {
178+
return {.ok = true, .reason = std::nullopt};
179+
}
180+
};
181+
174182
/*
175183
* Test fixture for multithreaded GC tests.
176184
*/
@@ -200,7 +208,9 @@ struct level_zero_gc_mt_test : public seastar_test {
200208
return std::make_unique<mt_epoch_source>(g_bucket_state.get());
201209
}),
202210
ss::sharded_parameter(
203-
[] { return std::make_unique<mt_node_info>(); }));
211+
[] { return std::make_unique<mt_node_info>(); }),
212+
ss::sharded_parameter(
213+
[] { return std::make_unique<safety_monitor_test_impl>(); }));
204214
}
205215

206216
ss::future<> TearDownAsync() override {

0 commit comments

Comments
 (0)