Skip to content

Commit fe7a4e1

Browse files
authored
fix(test): wait compaction in timeline offload test (#12673)
## Problem Closes LKB-753. `test_pageserver_metrics_removed_after_offload` is unstable: it sometimes leaves the metrics behind after tenant offloading. It turns out that we triggered an image compaction before the offload, and the job was only stopped after the offload request had completed. ## Summary of changes Wait for all background tasks to finish before checking the metrics. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>
1 parent 40cae8c commit fe7a4e1

File tree

1 file changed

+31
-3
lines changed

1 file changed

+31
-3
lines changed

test_runner/regress/test_tenants.py

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -298,15 +298,26 @@ def get_ps_metric_samples_for_tenant(tenant_id: TenantId) -> list[Sample]:
298298
assert post_detach_samples == set()
299299

300300

301-
def test_pageserver_metrics_removed_after_offload(neon_env_builder: NeonEnvBuilder):
301+
@pytest.mark.parametrize("compaction", ["compaction_enabled", "compaction_disabled"])
302+
def test_pageserver_metrics_removed_after_offload(
303+
neon_env_builder: NeonEnvBuilder, compaction: str
304+
):
302305
"""Tests that when a timeline is offloaded, the tenant specific metrics are not left behind"""
303306

304307
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
305-
306308
neon_env_builder.num_safekeepers = 3
307309

308310
env = neon_env_builder.init_start()
309-
tenant_1, _ = env.create_tenant()
311+
tenant_1, _ = env.create_tenant(
312+
conf={
313+
# disable background compaction and GC so that we don't have leftover tasks
314+
# after offloading.
315+
"gc_period": "0s",
316+
"compaction_period": "0s",
317+
}
318+
if compaction == "compaction_disabled"
319+
else None
320+
)
310321

311322
timeline_1 = env.create_timeline("test_metrics_removed_after_offload_1", tenant_id=tenant_1)
312323
timeline_2 = env.create_timeline("test_metrics_removed_after_offload_2", tenant_id=tenant_1)
@@ -351,6 +362,23 @@ def get_ps_metric_samples_for_timeline(
351362
state=TimelineArchivalState.ARCHIVED,
352363
)
353364
env.pageserver.http_client().timeline_offload(tenant_1, timeline)
365+
# We need to wait until all background jobs are finished before we can check the metrics.
366+
# There are many of them: compaction, GC, etc.
367+
wait_until(
368+
lambda: all(
369+
sample.value == 0
370+
for sample in env.pageserver.http_client()
371+
.get_metrics()
372+
.query_all("pageserver_background_loop_semaphore_waiting_tasks")
373+
)
374+
and all(
375+
sample.value == 0
376+
for sample in env.pageserver.http_client()
377+
.get_metrics()
378+
.query_all("pageserver_background_loop_semaphore_running_tasks")
379+
)
380+
)
381+
354382
post_offload_samples = set(
355383
[x.name for x in get_ps_metric_samples_for_timeline(tenant_1, timeline)]
356384
)

0 commit comments

Comments
 (0)