Skip to content

Commit 7a37d83

Browse files
committed
Filter obsolete layer files from an older generation from heatmap
1 parent d96cea1 commit 7a37d83

File tree

4 files changed

+202
-3
lines changed

4 files changed

+202
-3
lines changed

pageserver/src/tenant.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1580,7 +1580,7 @@ impl TenantShard {
15801580
}
15811581

15821582
#[instrument(skip_all)]
1583-
pub(crate) async fn preload(
1583+
pub(crate) async fn preload(
15841584
self: &Arc<Self>,
15851585
remote_storage: &GenericRemoteStorage,
15861586
cancel: CancellationToken,

pageserver/src/tenant/timeline.rs

Lines changed: 185 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ use self::eviction_task::EvictionTaskTimelineState;
8888
use self::logical_size::LogicalSize;
8989
use self::walreceiver::{WalReceiver, WalReceiverConf};
9090
use super::remote_timeline_client::RemoteTimelineClient;
91-
use super::remote_timeline_client::index::{GcCompactionState, IndexPart};
91+
use super::remote_timeline_client::index::{GcCompactionState, IndexPart, LayerFileMetadata};
9292
use super::secondary::heatmap::HeatMapLayer;
9393
use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer};
9494
use super::tasks::log_compaction_error;
@@ -4205,6 +4205,12 @@ impl Timeline {
42054205
let desc: PersistentLayerDesc = hl.name.clone().into();
42064206
let layer = guard.try_get_from_key(&desc.key())?;
42074207

4208+
// Make sure the layer in the old heatmap is the same generation one as in the layer
4209+
// map otherwise we can in some edge case keep old obsolete layers in the heatmap.
4210+
if layer.metadata().generation != hl.metadata.generation {
4211+
return None;
4212+
}
4213+
42084214
if layer.visibility() == LayerVisibilityHint::Covered {
42094215
return None;
42104216
}
@@ -6474,6 +6480,7 @@ pub struct DeltaLayerTestDesc {
64746480
pub lsn_range: Range<Lsn>,
64756481
pub key_range: Range<Key>,
64766482
pub data: Vec<(Key, Lsn, Value)>,
6483+
pub resident: bool,
64776484
}
64786485

64796486
#[cfg(test)]
@@ -6491,19 +6498,30 @@ impl DeltaLayerTestDesc {
64916498
lsn_range,
64926499
key_range,
64936500
data,
6501+
// Default test code creates resident layers.
6502+
resident: true,
64946503
}
64956504
}
64966505

64976506
pub fn new_with_inferred_key_range(
64986507
lsn_range: Range<Lsn>,
64996508
data: Vec<(Key, Lsn, Value)>,
6509+
) -> Self {
6510+
Self::new_with_inferred_key_range_and_resident_state(lsn_range, data, true)
6511+
}
6512+
6513+
pub fn new_with_inferred_key_range_and_resident_state(
6514+
lsn_range: Range<Lsn>,
6515+
data: Vec<(Key, Lsn, Value)>,
6516+
resident: bool,
65006517
) -> Self {
65016518
let key_min = data.iter().map(|(key, _, _)| key).min().unwrap();
65026519
let key_max = data.iter().map(|(key, _, _)| key).max().unwrap();
65036520
Self {
65046521
key_range: (*key_min)..(key_max.next()),
65056522
lsn_range,
65066523
data,
6524+
resident
65076525
}
65086526
}
65096527

@@ -7505,6 +7523,30 @@ impl Timeline {
75057523
check_start_lsn: Option<Lsn>,
75067524
ctx: &RequestContext,
75077525
) -> anyhow::Result<()> {
7526+
7527+
if !deltas.resident {
7528+
// Don't need to bother creating an on-disk file, we just want the metadata for a non-resident layer.
7529+
let delta_layer = Layer::for_evicted(
7530+
self.conf,
7531+
self,
7532+
deltas.layer_name(),
7533+
LayerFileMetadata {
7534+
generation: self.generation,
7535+
shard: self.get_shard_index(),
7536+
file_size: 1024,
7537+
},
7538+
);
7539+
info!("force created non-resident delta layer {}", deltas.layer_name());
7540+
{
7541+
let mut guard = self.layers.write(LayerManagerLockHolder::Testing).await;
7542+
guard
7543+
.open_mut()
7544+
.unwrap()
7545+
.force_insert_optionally_resident_layer(delta_layer);
7546+
}
7547+
return Ok(());
7548+
}
7549+
75087550
let last_record_lsn = self.get_last_record_lsn();
75097551
deltas
75107552
.data
@@ -8263,6 +8305,148 @@ mod tests {
82638305
));
82648306
}
82658307

8308+
#[tokio::test]
8309+
async fn test_heatmap_generation_removes_layers_from_old_generation() {
8310+
use std::time::SystemTime;
8311+
use utils::generation::Generation;
8312+
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
8313+
use crate::tenant::secondary::heatmap::{HeatMapLayer, HeatMapTimeline as HeatMapTimelineStruct};
8314+
8315+
let harness = TenantHarness::create("heatmaheatmap_genereation_removes_layers_from_old_generationp_generation").await.unwrap();
8316+
8317+
// Create your existing layer descriptions
8318+
let covered_delta = DeltaLayerTestDesc::new_with_inferred_key_range(
8319+
Lsn(0x10)..Lsn(0x20),
8320+
vec![(
8321+
Key::from_hex("620000000033333333444444445500000000").unwrap(),
8322+
Lsn(0x11),
8323+
Value::Image(test_img("foo")),
8324+
)],
8325+
);
8326+
// This visible layer is non-resident on disk. This is important to reproduce the failure as
8327+
// a resident file will take priority over the previous heatmap even without the fix for
8328+
// this issue.
8329+
let visible_delta = DeltaLayerTestDesc::new_with_inferred_key_range_and_resident_state(
8330+
Lsn(0x10)..Lsn(0x20),
8331+
vec![(
8332+
Key::from_hex("720000000033333333444444445500000000").unwrap(),
8333+
Lsn(0x11),
8334+
Value::Image(test_img("foo")),
8335+
)],
8336+
false, // Non-resident
8337+
);
8338+
let l0_delta = DeltaLayerTestDesc::new(
8339+
Lsn(0x20)..Lsn(0x30),
8340+
Key::from_hex("000000000000000000000000000000000000").unwrap()
8341+
..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(),
8342+
vec![(
8343+
Key::from_hex("720000000033333333444444445500000000").unwrap(),
8344+
Lsn(0x25),
8345+
Value::Image(test_img("foo")),
8346+
)],
8347+
);
8348+
let delta_layers = vec![
8349+
covered_delta.clone(),
8350+
visible_delta.clone(),
8351+
l0_delta.clone(),
8352+
];
8353+
8354+
let image_layer = (
8355+
Lsn(0x40),
8356+
vec![(
8357+
Key::from_hex("620000000033333333444444445500000000").unwrap(),
8358+
test_img("bar"),
8359+
)],
8360+
);
8361+
let image_layers = vec![image_layer];
8362+
8363+
let (tenant, ctx) = harness.load().await;
8364+
8365+
// Create timeline with current generation (0xdeadbeef by default)
8366+
let timeline = tenant
8367+
.create_test_timeline_with_layers(
8368+
TimelineId::generate(),
8369+
Lsn(0x10),
8370+
PgMajorVersion::PG14,
8371+
&ctx,
8372+
Vec::new(), // in-memory layers
8373+
delta_layers,
8374+
image_layers,
8375+
Lsn(0x100),
8376+
)
8377+
.await
8378+
.unwrap();
8379+
8380+
// Now create a previous heatmap with the visible_delta layer from an older generation
8381+
let current_layer_metadata = LayerFileMetadata {
8382+
generation: timeline.generation,
8383+
shard: timeline.get_shard_index(),
8384+
file_size: 1024,
8385+
};
8386+
let old_generation = Generation::new(0x12345678); // Older than 0xdeadbeef
8387+
let old_layer_metadata = LayerFileMetadata {
8388+
generation: old_generation,
8389+
shard: timeline.get_shard_index(),
8390+
file_size: 1024,
8391+
};
8392+
8393+
// Create heatmap layers that reference the same keys but with old generation
8394+
let prev_heatmap_layers = vec![
8395+
HeatMapLayer::new(
8396+
covered_delta.layer_name(),
8397+
current_layer_metadata.clone(),
8398+
SystemTime::now(),
8399+
false, // not cold
8400+
),
8401+
HeatMapLayer::new(
8402+
visible_delta.layer_name(),
8403+
// Visible delta layer is from an older generation in heatmap
8404+
old_layer_metadata.clone(),
8405+
SystemTime::now(),
8406+
false, // not cold
8407+
),
8408+
HeatMapLayer::new(
8409+
l0_delta.layer_name(),
8410+
current_layer_metadata.clone(),
8411+
SystemTime::now(),
8412+
false, // not cold
8413+
),
8414+
];
8415+
8416+
// Create the previous heatmap with old generation layers
8417+
let prev_heatmap = HeatMapTimelineStruct::new(timeline.timeline_id, prev_heatmap_layers);
8418+
8419+
// Set the previous heatmap on the timeline
8420+
timeline
8421+
.previous_heatmap
8422+
.store(Some(Arc::new(PreviousHeatmap::Active {
8423+
heatmap: prev_heatmap,
8424+
read_at: std::time::Instant::now(),
8425+
end_lsn: None,
8426+
})));
8427+
8428+
// Layer visibility is an input to heatmap generation, so refresh it first
8429+
timeline.update_layer_visibility().await.unwrap();
8430+
8431+
// Generate a new heatmap - this should filter out the old generation layers
8432+
let heatmap = timeline
8433+
.generate_heatmap()
8434+
.await
8435+
.expect("Infallible while timeline is not shut down");
8436+
8437+
assert_eq!(heatmap.timeline_id, timeline.timeline_id);
8438+
8439+
// Verify that layers exist but they should be the current generation ones,
8440+
// not the old generation ones from previous_heatmap
8441+
for layer in heatmap.all_layers() {
8442+
assert_eq!(
8443+
layer.metadata.generation,
8444+
timeline.generation,
8445+
"Heatmap should only contain current generation layers, not old ones"
8446+
);
8447+
}
8448+
}
8449+
82668450
#[tokio::test]
82678451
async fn two_layer_eviction_attempts_at_the_same_time() {
82688452
let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time")

pageserver/src/tenant/timeline/init.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,16 @@ pub(super) fn reconcile(
129129
// Construct Decisions for layers that are found locally, if they're in remote metadata. Otherwise
130130
// construct DismissedLayers to get rid of them.
131131
for (layer_name, local_metadata) in local_layers {
132+
// FIXME: This should probably take generation into account. Currently it's possible to have
133+
// an old generation file on disk while a newer one with same name is in index (because
134+
// primary just split shard) and we miss that here. We are saved by the check below because
135+
// the file size is very likely to be different (and if it isn't then the file contents will
136+
// probably be the same anyway in case of shard split), but it's confusing that this logic
137+
// doesn't account for name collisions from older generations. Ideally, we should consider a
138+
// local file from an older generation than the one in the index to be a different file and
139+
// return `DismissedLayer::LocalOnly` if generations don't match. Right now though,
140+
// layer_name has the generation part stripped so we'd need to re-parse the generation from
141+
// the file name here or back in scan_timeline_dir and add it to LocalLayerFileMetadata.
132142
let Some(remote_metadata) = index_part.layer_metadata.get(&layer_name) else {
133143
result.push((layer_name, Err(DismissedLayer::LocalOnly(local_metadata))));
134144
continue;

pageserver/src/tenant/timeline/layer_manager.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -645,8 +645,13 @@ impl OpenLayerManager {
645645

646646
#[cfg(test)]
647647
pub(crate) fn force_insert_layer(&mut self, layer: ResidentLayer) {
648+
self.force_insert_optionally_resident_layer(layer.as_ref().clone());
649+
}
650+
651+
#[cfg(test)]
652+
pub(crate) fn force_insert_optionally_resident_layer(&mut self, layer: Layer) {
648653
let mut updates = self.layer_map.batch_update();
649-
Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
654+
Self::insert_historic_layer(layer, &mut updates, &mut self.layer_fmgr);
650655
updates.flush()
651656
}
652657

0 commit comments

Comments
 (0)