Skip to content

Commit e389a09

Browse files
authored
Fix bug where retries within RemoteStoreRefreshListener cause infos/checkpoint mismatch (#10655)
* Fix bug where retries within RemoteStoreRefreshListener cause mismatch between ReplicationCheckpoint and uploaded SegmentInfos. Retries within RemoteStoreRefreshListener run outside of the refresh thread. This means that concurrent refreshes may occur during syncSegments execution updating the on-reader SegmentInfos. A shard's latest ReplicationCheckpoint is computed and set in a refresh listener, but it is not guaranteed the listener has run before the retry fetches the infos or checkpoint independently. This fix ensures the listener recomputes the checkpoint while fetching the SegmentInfos. This change also ensures that we only recompute the checkpoint when necessary because it comes with an IO cost to compute StoreFileMetadata. Signed-off-by: Marc Handalian <handalm@amazon.com> Update refresh listener to recompute checkpoint from latest infos snapshot. Signed-off-by: Marc Handalian <handalm@amazon.com> Fix broken test case by comparing segments gen Signed-off-by: Marc Handalian <handalm@amazon.com> spotless Signed-off-by: Marc Handalian <handalm@amazon.com> Fix RemoteStoreRefreshListener tests Signed-off-by: Marc Handalian <handalm@amazon.com> * add extra log Signed-off-by: Marc Handalian <handalm@amazon.com> --------- Signed-off-by: Marc Handalian <handalm@amazon.com>
1 parent da24ca7 commit e389a09

File tree

4 files changed

+70
-34
lines changed

4 files changed

+70
-34
lines changed

server/src/main/java/org/opensearch/index/shard/IndexShard.java

Lines changed: 40 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1608,8 +1608,11 @@ public GatedCloseable<IndexCommit> acquireSafeIndexCommit() throws EngineExcepti
16081608
}
16091609

16101610
/**
1611-
* Compute and return the latest ReplicationCheckpoint for a particular shard.
1612-
* @return EMPTY checkpoint before the engine is opened and null for non-segrep enabled indices
1611+
* return the most recently computed ReplicationCheckpoint for a particular shard.
1612+
* The checkpoint is updated inside a refresh listener and may lag behind the SegmentInfos on the reader.
1613+
* To guarantee the checkpoint is upto date with the latest on-reader infos, use `getLatestSegmentInfosAndCheckpoint` instead.
1614+
*
1615+
* @return {@link ReplicationCheckpoint} - The most recently computed ReplicationCheckpoint.
16131616
*/
16141617
public ReplicationCheckpoint getLatestReplicationCheckpoint() {
16151618
return replicationTracker.getLatestReplicationCheckpoint();
@@ -1628,34 +1631,12 @@ public ReplicationCheckpoint getLatestReplicationCheckpoint() {
16281631
public Tuple<GatedCloseable<SegmentInfos>, ReplicationCheckpoint> getLatestSegmentInfosAndCheckpoint() {
16291632
assert indexSettings.isSegRepEnabled();
16301633

1631-
Tuple<GatedCloseable<SegmentInfos>, ReplicationCheckpoint> nullSegmentInfosEmptyCheckpoint = new Tuple<>(
1632-
new GatedCloseable<>(null, () -> {}),
1633-
getLatestReplicationCheckpoint()
1634-
);
1635-
1636-
if (getEngineOrNull() == null) {
1637-
return nullSegmentInfosEmptyCheckpoint;
1638-
}
16391634
// do not close the snapshot - caller will close it.
16401635
GatedCloseable<SegmentInfos> snapshot = null;
16411636
try {
16421637
snapshot = getSegmentInfosSnapshot();
1643-
if (snapshot.get() != null) {
1644-
SegmentInfos segmentInfos = snapshot.get();
1645-
final Map<String, StoreFileMetadata> metadataMap = store.getSegmentMetadataMap(segmentInfos);
1646-
return new Tuple<>(
1647-
snapshot,
1648-
new ReplicationCheckpoint(
1649-
this.shardId,
1650-
getOperationPrimaryTerm(),
1651-
segmentInfos.getGeneration(),
1652-
segmentInfos.getVersion(),
1653-
metadataMap.values().stream().mapToLong(StoreFileMetadata::length).sum(),
1654-
getEngine().config().getCodec().getName(),
1655-
metadataMap
1656-
)
1657-
);
1658-
}
1638+
final SegmentInfos segmentInfos = snapshot.get();
1639+
return new Tuple<>(snapshot, computeReplicationCheckpoint(segmentInfos));
16591640
} catch (IOException | AlreadyClosedException e) {
16601641
logger.error("Error Fetching SegmentInfos and latest checkpoint", e);
16611642
if (snapshot != null) {
@@ -1666,7 +1647,39 @@ public Tuple<GatedCloseable<SegmentInfos>, ReplicationCheckpoint> getLatestSegme
16661647
}
16671648
}
16681649
}
1669-
return nullSegmentInfosEmptyCheckpoint;
1650+
return new Tuple<>(new GatedCloseable<>(null, () -> {}), getLatestReplicationCheckpoint());
1651+
}
1652+
1653+
/**
1654+
* Compute the latest {@link ReplicationCheckpoint} from a SegmentInfos.
1655+
* This function fetches a metadata snapshot from the store that comes with an IO cost.
1656+
* We will reuse the existing stored checkpoint if it is at the same SI version.
1657+
*
1658+
* @param segmentInfos {@link SegmentInfos} infos to use to compute.
1659+
* @return {@link ReplicationCheckpoint} Checkpoint computed from the infos.
1660+
* @throws IOException When there is an error computing segment metadata from the store.
1661+
*/
1662+
ReplicationCheckpoint computeReplicationCheckpoint(SegmentInfos segmentInfos) throws IOException {
1663+
if (segmentInfos == null) {
1664+
return ReplicationCheckpoint.empty(shardId);
1665+
}
1666+
final ReplicationCheckpoint latestReplicationCheckpoint = getLatestReplicationCheckpoint();
1667+
if (latestReplicationCheckpoint.getSegmentInfosVersion() == segmentInfos.getVersion()
1668+
&& latestReplicationCheckpoint.getSegmentsGen() == segmentInfos.getGeneration()) {
1669+
return latestReplicationCheckpoint;
1670+
}
1671+
final Map<String, StoreFileMetadata> metadataMap = store.getSegmentMetadataMap(segmentInfos);
1672+
final ReplicationCheckpoint checkpoint = new ReplicationCheckpoint(
1673+
this.shardId,
1674+
getOperationPrimaryTerm(),
1675+
segmentInfos.getGeneration(),
1676+
segmentInfos.getVersion(),
1677+
metadataMap.values().stream().mapToLong(StoreFileMetadata::length).sum(),
1678+
getEngine().config().getCodec().getName(),
1679+
metadataMap
1680+
);
1681+
logger.trace("Recomputed ReplicationCheckpoint for shard {}", checkpoint);
1682+
return checkpoint;
16701683
}
16711684

16721685
/**

server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,6 @@ private boolean syncSegments() {
181181
// in the remote store.
182182
return indexShard.state() != IndexShardState.STARTED || !(indexShard.getEngine() instanceof InternalEngine);
183183
}
184-
ReplicationCheckpoint checkpoint = indexShard.getLatestReplicationCheckpoint();
185184
beforeSegmentsSync();
186185
long refreshTimeMs = segmentTracker.getLocalRefreshTimeMs(), refreshClockTimeMs = segmentTracker.getLocalRefreshClockTimeMs();
187186
long refreshSeqNo = segmentTracker.getLocalRefreshSeqNo();
@@ -199,10 +198,7 @@ private boolean syncSegments() {
199198

200199
try (GatedCloseable<SegmentInfos> segmentInfosGatedCloseable = indexShard.getSegmentInfosSnapshot()) {
201200
SegmentInfos segmentInfos = segmentInfosGatedCloseable.get();
202-
assert segmentInfos.getGeneration() == checkpoint.getSegmentsGen() : "SegmentInfos generation: "
203-
+ segmentInfos.getGeneration()
204-
+ " does not match metadata generation: "
205-
+ checkpoint.getSegmentsGen();
201+
final ReplicationCheckpoint checkpoint = indexShard.computeReplicationCheckpoint(segmentInfos);
206202
// Capture replication checkpoint before uploading the segments as upload can take some time and checkpoint can
207203
// move.
208204
long lastRefreshedCheckpoint = ((InternalEngine) indexShard.getEngine()).lastRefreshedCheckpoint();

server/src/test/java/org/opensearch/index/shard/RemoteStoreRefreshListenerTests.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -520,8 +520,8 @@ private Tuple<RemoteStoreRefreshListener, RemoteStoreStatsTrackerFactory> mockIn
520520
if (counter.incrementAndGet() <= succeedOnAttempt) {
521521
throw new RuntimeException("Inducing failure in upload");
522522
}
523-
return indexShard.getLatestSegmentInfosAndCheckpoint();
524-
})).when(shard).getLatestSegmentInfosAndCheckpoint();
523+
return indexShard.getLatestReplicationCheckpoint();
524+
})).when(shard).computeReplicationCheckpoint(any());
525525

526526
doAnswer(invocation -> {
527527
if (Objects.nonNull(successLatch)) {

server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -925,6 +925,33 @@ public void testSnapshotWhileFailoverIncomplete() throws Exception {
925925
}
926926
}
927927

928+
public void testReuseReplicationCheckpointWhenLatestInfosIsUnChanged() throws Exception {
929+
try (ReplicationGroup shards = createGroup(1, settings, indexMapping, new NRTReplicationEngineFactory(), createTempDir())) {
930+
final IndexShard primaryShard = shards.getPrimary();
931+
shards.startAll();
932+
shards.indexDocs(10);
933+
shards.refresh("test");
934+
replicateSegments(primaryShard, shards.getReplicas());
935+
shards.assertAllEqual(10);
936+
final ReplicationCheckpoint latestReplicationCheckpoint = primaryShard.getLatestReplicationCheckpoint();
937+
try (GatedCloseable<SegmentInfos> segmentInfosSnapshot = primaryShard.getSegmentInfosSnapshot()) {
938+
assertEquals(latestReplicationCheckpoint, primaryShard.computeReplicationCheckpoint(segmentInfosSnapshot.get()));
939+
}
940+
final Tuple<GatedCloseable<SegmentInfos>, ReplicationCheckpoint> latestSegmentInfosAndCheckpoint = primaryShard
941+
.getLatestSegmentInfosAndCheckpoint();
942+
try (final GatedCloseable<SegmentInfos> closeable = latestSegmentInfosAndCheckpoint.v1()) {
943+
assertEquals(latestReplicationCheckpoint, primaryShard.computeReplicationCheckpoint(closeable.get()));
944+
}
945+
}
946+
}
947+
948+
public void testComputeReplicationCheckpointNullInfosReturnsEmptyCheckpoint() throws Exception {
949+
try (ReplicationGroup shards = createGroup(1, settings, indexMapping, new NRTReplicationEngineFactory(), createTempDir())) {
950+
final IndexShard primaryShard = shards.getPrimary();
951+
assertEquals(ReplicationCheckpoint.empty(primaryShard.shardId), primaryShard.computeReplicationCheckpoint(null));
952+
}
953+
}
954+
928955
private SnapshotShardsService getSnapshotShardsService(IndexShard replicaShard) {
929956
final TransportService transportService = mock(TransportService.class);
930957
when(transportService.getThreadPool()).thenReturn(threadPool);

0 commit comments

Comments
 (0)