From 7a37d83beda8f1e2c53180236494b590c63b9d49 Mon Sep 17 00:00:00 2001
From: Paul Banks
Date: Fri, 15 Aug 2025 16:19:52 +0100
Subject: [PATCH] Filter obsolete layer files from an older generation from
 heatmap

---
 pageserver/src/tenant.rs                      |   2 +-
 pageserver/src/tenant/timeline.rs             | 186 +++++++++++++++++-
 pageserver/src/tenant/timeline/init.rs        |  10 +
 .../src/tenant/timeline/layer_manager.rs      |   7 +-
 4 files changed, 202 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 91b717a2e948..62f0ef114e70 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1580,7 +1580,7 @@ impl TenantShard {
     }
 
     #[instrument(skip_all)]
-    pub(crate) async fn preload(
+    pub(crate) async fn preload(
         self: &Arc<Self>,
         remote_storage: &GenericRemoteStorage,
         cancel: CancellationToken,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index ff66b0ecc8aa..274ac9e37dea 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -88,7 +88,7 @@ use self::eviction_task::EvictionTaskTimelineState;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};
 use super::remote_timeline_client::RemoteTimelineClient;
-use super::remote_timeline_client::index::{GcCompactionState, IndexPart};
+use super::remote_timeline_client::index::{GcCompactionState, IndexPart, LayerFileMetadata};
 use super::secondary::heatmap::HeatMapLayer;
 use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer};
 use super::tasks::log_compaction_error;
@@ -4205,6 +4205,12 @@ impl Timeline {
                 let desc: PersistentLayerDesc = hl.name.clone().into();
                 let layer = guard.try_get_from_key(&desc.key())?;
 
+                // Make sure the layer in the old heatmap is the same generation as the one in the
+                // layer map; otherwise an edge case can keep obsolete layers in the heatmap.
+                if layer.metadata().generation != hl.metadata.generation {
+                    return None;
+                }
+
                 if layer.visibility() == LayerVisibilityHint::Covered {
                     return None;
                 }
@@ -6474,6 +6480,7 @@ pub struct DeltaLayerTestDesc {
     pub lsn_range: Range<Lsn>,
     pub key_range: Range<Key>,
     pub data: Vec<(Key, Lsn, Value)>,
+    pub resident: bool,
 }
 
 #[cfg(test)]
@@ -6491,12 +6498,22 @@ impl DeltaLayerTestDesc {
             lsn_range,
             key_range,
             data,
+            // Default test code creates resident layers.
+            resident: true,
         }
     }
 
     pub fn new_with_inferred_key_range(
         lsn_range: Range<Lsn>,
         data: Vec<(Key, Lsn, Value)>,
+    ) -> Self {
+        Self::new_with_inferred_key_range_and_resident_state(lsn_range, data, true)
+    }
+
+    pub fn new_with_inferred_key_range_and_resident_state(
+        lsn_range: Range<Lsn>,
+        data: Vec<(Key, Lsn, Value)>,
+        resident: bool,
     ) -> Self {
         let key_min = data.iter().map(|(key, _, _)| key).min().unwrap();
         let key_max = data.iter().map(|(key, _, _)| key).max().unwrap();
@@ -6504,6 +6521,7 @@ impl DeltaLayerTestDesc {
             key_range: (*key_min)..(key_max.next()),
             lsn_range,
             data,
+            resident,
         }
     }
 
@@ -7505,6 +7523,30 @@ impl Timeline {
         check_start_lsn: Option<Lsn>,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
+
+        if !deltas.resident {
+            // No need to create an on-disk file; we only want the metadata for a non-resident layer.
+            let delta_layer = Layer::for_evicted(
+                self.conf,
+                self,
+                deltas.layer_name(),
+                LayerFileMetadata {
+                    generation: self.generation,
+                    shard: self.get_shard_index(),
+                    file_size: 1024,
+                },
+            );
+            info!("force created non-resident delta layer {}", deltas.layer_name());
+            {
+                let mut guard = self.layers.write(LayerManagerLockHolder::Testing).await;
+                guard
+                    .open_mut()
+                    .unwrap()
+                    .force_insert_optionally_resident_layer(delta_layer);
+            }
+            return Ok(());
+        }
+
         let last_record_lsn = self.get_last_record_lsn();
         deltas
             .data
@@ -8263,6 +8305,148 @@ mod tests {
         ));
     }
 
+    #[tokio::test]
+    async fn test_heatmap_generation_removes_layers_from_old_generation() {
+        use std::time::SystemTime;
+        use utils::generation::Generation;
+        use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+        use crate::tenant::secondary::heatmap::{HeatMapLayer, HeatMapTimeline as HeatMapTimelineStruct};
+
+        let harness = TenantHarness::create("heatmap_generation_removes_layers_from_old_generation").await.unwrap();
+
+        // Create the delta layer descriptions
+        let covered_delta = DeltaLayerTestDesc::new_with_inferred_key_range(
+            Lsn(0x10)..Lsn(0x20),
+            vec![(
+                Key::from_hex("620000000033333333444444445500000000").unwrap(),
+                Lsn(0x11),
+                Value::Image(test_img("foo")),
+            )],
+        );
+        // This visible layer is non-resident on disk. This is important to reproduce the failure as
+        // a resident file will take priority over the previous heatmap even without the fix for
+        // this issue.
+        let visible_delta = DeltaLayerTestDesc::new_with_inferred_key_range_and_resident_state(
+            Lsn(0x10)..Lsn(0x20),
+            vec![(
+                Key::from_hex("720000000033333333444444445500000000").unwrap(),
+                Lsn(0x11),
+                Value::Image(test_img("foo")),
+            )],
+            false, // Non-resident
+        );
+        let l0_delta = DeltaLayerTestDesc::new(
+            Lsn(0x20)..Lsn(0x30),
+            Key::from_hex("000000000000000000000000000000000000").unwrap()
+                ..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(),
+            vec![(
+                Key::from_hex("720000000033333333444444445500000000").unwrap(),
+                Lsn(0x25),
+                Value::Image(test_img("foo")),
+            )],
+        );
+        let delta_layers = vec![
+            covered_delta.clone(),
+            visible_delta.clone(),
+            l0_delta.clone(),
+        ];
+
+        let image_layer = (
+            Lsn(0x40),
+            vec![(
+                Key::from_hex("620000000033333333444444445500000000").unwrap(),
+                test_img("bar"),
+            )],
+        );
+        let image_layers = vec![image_layer];
+
+        let (tenant, ctx) = harness.load().await;
+
+        // Create the timeline with the current generation (0xdeadbeef by default)
+        let timeline = tenant
+            .create_test_timeline_with_layers(
+                TimelineId::generate(),
+                Lsn(0x10),
+                PgMajorVersion::PG14,
+                &ctx,
+                Vec::new(), // in-memory layers
+                delta_layers,
+                image_layers,
+                Lsn(0x100),
+            )
+            .await
+            .unwrap();
+
+        // Now create a previous heatmap with the visible_delta layer from an older generation
+        let current_layer_metadata = LayerFileMetadata {
+            generation: timeline.generation,
+            shard: timeline.get_shard_index(),
+            file_size: 1024,
+        };
+        let old_generation = Generation::new(0x12345678); // Older than 0xdeadbeef
+        let old_layer_metadata = LayerFileMetadata {
+            generation: old_generation,
+            shard: timeline.get_shard_index(),
+            file_size: 1024,
+        };
+
+        // Create heatmap layers that reference the same layer names but with an old generation
+        let prev_heatmap_layers = vec![
+            HeatMapLayer::new(
+                covered_delta.layer_name(),
+                current_layer_metadata.clone(),
+                SystemTime::now(),
+                false, // not cold
+            ),
+            HeatMapLayer::new(
+                visible_delta.layer_name(),
+                // The visible delta layer is from an older generation in the heatmap
+                old_layer_metadata.clone(),
+                SystemTime::now(),
+                false, // not cold
+            ),
+            HeatMapLayer::new(
+                l0_delta.layer_name(),
+                current_layer_metadata.clone(),
+                SystemTime::now(),
+                false, // not cold
+            ),
+        ];
+
+        // Create the previous heatmap with old generation layers
+        let prev_heatmap = HeatMapTimelineStruct::new(timeline.timeline_id, prev_heatmap_layers);
+
+        // Set the previous heatmap on the timeline
+        timeline
+            .previous_heatmap
+            .store(Some(Arc::new(PreviousHeatmap::Active {
+                heatmap: prev_heatmap,
+                read_at: std::time::Instant::now(),
+                end_lsn: None,
+            })));
+
+        // Layer visibility is an input to heatmap generation, so refresh it first
+        timeline.update_layer_visibility().await.unwrap();
+
+        // Generate a new heatmap - this should filter out the old generation layers
+        let heatmap = timeline
+            .generate_heatmap()
+            .await
+            .expect("Infallible while timeline is not shut down");
+
+        assert_eq!(heatmap.timeline_id, timeline.timeline_id);
+
+        // Verify that the heatmap's layers are the current generation ones,
+        // not the old generation ones from previous_heatmap
+        for layer in heatmap.all_layers() {
+            assert_eq!(
+                layer.metadata.generation,
+                timeline.generation,
+                "Heatmap should only contain current generation layers, not old ones"
+            );
+        }
+    }
+
     #[tokio::test]
     async fn two_layer_eviction_attempts_at_the_same_time() {
         let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time")
diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs
index e952df0845ad..90c935071af7 100644
--- a/pageserver/src/tenant/timeline/init.rs
+++ b/pageserver/src/tenant/timeline/init.rs
@@ -129,6 +129,16 @@ pub(super) fn reconcile(
     // Construct Decisions for layers that are found locally, if they're in remote metadata. Otherwise
    // construct DismissedLayers to get rid of them.
     for (layer_name, local_metadata) in local_layers {
+        // FIXME: this should probably take the generation into account. Currently it is possible
+        // to have an old-generation file on disk while a newer one with the same name is in the
+        // index (because the primary has just split a shard), and we miss that here. We are saved
+        // by the check below because the file size is very likely to differ (and if it doesn't,
+        // the contents are probably identical anyway in the shard-split case), but it is confusing
+        // that this logic ignores name collisions from older generations. Ideally we would treat a
+        // local file from an older generation than the one in the index as a different file and
+        // return `DismissedLayer::LocalOnly` when the generations don't match. Right now, though,
+        // layer_name has the generation part stripped, so we would need to re-parse the generation
+        // from the file name here or in scan_timeline_dir and add it to LocalLayerFileMetadata.
         let Some(remote_metadata) = index_part.layer_metadata.get(&layer_name) else {
             result.push((layer_name, Err(DismissedLayer::LocalOnly(local_metadata))));
             continue;
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index d8d81a6c91d5..27cf79092c35 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -645,8 +645,13 @@ impl OpenLayerManager {
 
     #[cfg(test)]
     pub(crate) fn force_insert_layer(&mut self, layer: ResidentLayer) {
+        self.force_insert_optionally_resident_layer(layer.as_ref().clone());
+    }
+
+    #[cfg(test)]
+    pub(crate) fn force_insert_optionally_resident_layer(&mut self, layer: Layer) {
         let mut updates = self.layer_map.batch_update();
-        Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
+        Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
         updates.flush()
     }
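
Note (not part of the patch): the behavioral core of the change is the generation check added by the timeline.rs hunk at line 4205, which drops previous-heatmap entries whose generation no longer matches the layer in the layer map. Below is a minimal, self-contained sketch of that filtering rule using simplified stand-in types; Generation, LayerEntry, and filter_stale_heatmap_entries are hypothetical names for illustration only, not the pageserver's real API.

// Illustrative sketch only: the stale-generation filter reduced to plain data types.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct Generation(u32);

#[derive(Debug, Clone)]
struct LayerEntry {
    name: String,
    generation: Generation,
}

/// Keep a previous-heatmap entry only if the layer map still holds a layer with the same
/// name and the same generation; otherwise the entry points at an obsolete file from an
/// older generation and must not be carried into the new heatmap.
fn filter_stale_heatmap_entries(
    previous_heatmap: Vec<LayerEntry>,
    layer_map: &[LayerEntry],
) -> Vec<LayerEntry> {
    previous_heatmap
        .into_iter()
        .filter(|hl| {
            layer_map
                .iter()
                .any(|l| l.name == hl.name && l.generation == hl.generation)
        })
        .collect()
}

fn main() {
    let current = Generation(0xdeadbeef);
    let old = Generation(0x12345678);
    let layer_map = vec![LayerEntry {
        name: "delta-A".to_string(),
        generation: current,
    }];
    let prev = vec![
        // Same name, older generation: filtered out.
        LayerEntry {
            name: "delta-A".to_string(),
            generation: old,
        },
        // Same name, current generation: kept.
        LayerEntry {
            name: "delta-A".to_string(),
            generation: current,
        },
    ];
    let kept = filter_stale_heatmap_entries(prev, &layer_map);
    assert_eq!(kept.len(), 1);
    assert_eq!(kept[0].generation, current);
}

The real code compares Layer::metadata().generation against the HeatMapLayer metadata instead of scanning a vector, but the decision rule is the same: a name match alone is not enough, the generation must match too.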