Skip to content

Commit c8b1dcc

Browse files
sachinpkaleSachin Kale
authored andcommitted
Add support to skip pinned timestamp in remote segment garbage collector (opensearch-project#15017)
--------- Signed-off-by: Sachin Kale <kalsac@amazon.com> Co-authored-by: Sachin Kale <kalsac@amazon.com>
1 parent 8228f60 commit c8b1dcc

File tree

8 files changed

+430
-51
lines changed

8 files changed

+430
-51
lines changed

server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreIT.java

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -217,10 +217,15 @@ public void testStaleCommitDeletionWithInvokeFlush() throws Exception {
217217
} else {
218218
// As delete is async its possible that the file gets created before the deletion or after
219219
// deletion.
220-
MatcherAssert.assertThat(
221-
actualFileCount,
222-
is(oneOf(lastNMetadataFilesToKeep - 1, lastNMetadataFilesToKeep, lastNMetadataFilesToKeep + 1))
223-
);
220+
if (RemoteStoreSettings.isPinnedTimestampsEnabled()) {
221+
// With pinned timestamp, we also keep md files since last successful fetch
222+
assertTrue(actualFileCount >= lastNMetadataFilesToKeep);
223+
} else {
224+
MatcherAssert.assertThat(
225+
actualFileCount,
226+
is(oneOf(lastNMetadataFilesToKeep - 1, lastNMetadataFilesToKeep, lastNMetadataFilesToKeep + 1))
227+
);
228+
}
224229
}
225230
}, 30, TimeUnit.SECONDS);
226231
}
@@ -249,7 +254,12 @@ public void testStaleCommitDeletionWithMinSegmentFiles_3() throws Exception {
249254
Path indexPath = Path.of(segmentRepoPath + "/" + shardPath);
250255
int actualFileCount = getFileCount(indexPath);
251256
// We also allow (numberOfIterations + 1) as index creation also triggers refresh.
252-
MatcherAssert.assertThat(actualFileCount, is(oneOf(4)));
257+
if (RemoteStoreSettings.isPinnedTimestampsEnabled()) {
258+
// With pinned timestamp, we also keep md files since last successful fetch
259+
assertTrue(actualFileCount >= 4);
260+
} else {
261+
assertEquals(4, actualFileCount);
262+
}
253263
}
254264

255265
public void testStaleCommitDeletionWithMinSegmentFiles_Disabled() throws Exception {

server/src/internalClusterTest/java/org/opensearch/snapshots/DeleteSnapshotIT.java

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import org.opensearch.common.settings.Settings;
2020
import org.opensearch.common.unit.TimeValue;
2121
import org.opensearch.index.store.RemoteBufferedOutputDirectory;
22+
import org.opensearch.indices.RemoteStoreSettings;
2223
import org.opensearch.remotestore.RemoteStoreBaseIntegTestCase;
2324
import org.opensearch.repositories.RepositoriesService;
2425
import org.opensearch.repositories.blobstore.BlobStoreRepository;
@@ -287,8 +288,14 @@ public void testDeleteMultipleShallowCopySnapshotsCase3() throws Exception {
287288
public void testRemoteStoreCleanupForDeletedIndex() throws Exception {
288289
disableRepoConsistencyCheck("Remote store repository is being used in the test");
289290
final Path remoteStoreRepoPath = randomRepoPath();
290-
internalCluster().startClusterManagerOnlyNode(remoteStoreClusterSettings(REMOTE_REPO_NAME, remoteStoreRepoPath));
291-
internalCluster().startDataOnlyNode(remoteStoreClusterSettings(REMOTE_REPO_NAME, remoteStoreRepoPath));
291+
Settings settings = remoteStoreClusterSettings(REMOTE_REPO_NAME, remoteStoreRepoPath);
292+
// Disabling pinned timestamp as this test is specifically for shallow snapshot.
293+
settings = Settings.builder()
294+
.put(settings)
295+
.put(RemoteStoreSettings.CLUSTER_REMOTE_STORE_PINNED_TIMESTAMP_ENABLED.getKey(), false)
296+
.build();
297+
internalCluster().startClusterManagerOnlyNode(settings);
298+
internalCluster().startDataOnlyNode(settings);
292299
final Client clusterManagerClient = internalCluster().clusterManagerClient();
293300
ensureStableCluster(2);
294301

server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectory.java

Lines changed: 95 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
import org.opensearch.index.store.remote.metadata.RemoteSegmentMetadata;
4141
import org.opensearch.index.store.remote.metadata.RemoteSegmentMetadataHandler;
4242
import org.opensearch.indices.replication.checkpoint.ReplicationCheckpoint;
43+
import org.opensearch.node.remotestore.RemoteStorePinnedTimestampService;
4344
import org.opensearch.threadpool.ThreadPool;
4445

4546
import java.io.FileNotFoundException;
@@ -91,6 +92,8 @@ public final class RemoteSegmentStoreDirectory extends FilterDirectory implement
9192

9293
private final RemoteStoreLockManager mdLockManager;
9394

95+
private final Map<Long, String> metadataFilePinnedTimestampMap;
96+
9497
private final ThreadPool threadPool;
9598

9699
/**
@@ -132,6 +135,7 @@ public RemoteSegmentStoreDirectory(
132135
this.remoteMetadataDirectory = remoteMetadataDirectory;
133136
this.mdLockManager = mdLockManager;
134137
this.threadPool = threadPool;
138+
this.metadataFilePinnedTimestampMap = new HashMap<>();
135139
this.logger = Loggers.getLogger(getClass(), shardId);
136140
init();
137141
}
@@ -176,6 +180,42 @@ public RemoteSegmentMetadata initializeToSpecificCommit(long primaryTerm, long c
176180
return remoteSegmentMetadata;
177181
}
178182

183+
/**
184+
* Initializes the remote segment metadata to a specific timestamp.
185+
*
186+
* @param timestamp The timestamp to initialize the remote segment metadata to.
187+
* @return The RemoteSegmentMetadata object corresponding to the specified timestamp, or null if no metadata file is found for that timestamp.
188+
* @throws IOException If an I/O error occurs while reading the metadata file.
189+
*/
190+
public RemoteSegmentMetadata initializeToSpecificTimestamp(long timestamp) throws IOException {
191+
List<String> metadataFiles = remoteMetadataDirectory.listFilesByPrefixInLexicographicOrder(
192+
MetadataFilenameUtils.METADATA_PREFIX,
193+
Integer.MAX_VALUE
194+
);
195+
Set<String> lockedMetadataFiles = RemoteStoreUtils.getPinnedTimestampLockedFiles(
196+
metadataFiles,
197+
Set.of(timestamp),
198+
MetadataFilenameUtils::getTimestamp,
199+
MetadataFilenameUtils::getNodeIdByPrimaryTermAndGen
200+
);
201+
if (lockedMetadataFiles.isEmpty()) {
202+
return null;
203+
}
204+
if (lockedMetadataFiles.size() > 1) {
205+
throw new IOException(
206+
"Expected exactly one metadata file matching timestamp: " + timestamp + " but got " + lockedMetadataFiles
207+
);
208+
}
209+
String metadataFile = lockedMetadataFiles.iterator().next();
210+
RemoteSegmentMetadata remoteSegmentMetadata = readMetadataFile(metadataFile);
211+
if (remoteSegmentMetadata != null) {
212+
this.segmentsUploadedToRemoteStore = new ConcurrentHashMap<>(remoteSegmentMetadata.getMetadata());
213+
} else {
214+
this.segmentsUploadedToRemoteStore = new ConcurrentHashMap<>();
215+
}
216+
return remoteSegmentMetadata;
217+
}
218+
179219
/**
180220
* Read the latest metadata file to get the list of segments uploaded to the remote segment store.
181221
* We upload a metadata file per refresh, but it is not unique per refresh. Refresh metadata file is unique for a given commit.
@@ -324,7 +364,8 @@ public static String getMetadataFilename(
324364
long translogGeneration,
325365
long uploadCounter,
326366
int metadataVersion,
327-
String nodeId
367+
String nodeId,
368+
long creationTimestamp
328369
) {
329370
return String.join(
330371
SEPARATOR,
@@ -334,11 +375,30 @@ public static String getMetadataFilename(
334375
RemoteStoreUtils.invertLong(translogGeneration),
335376
RemoteStoreUtils.invertLong(uploadCounter),
336377
String.valueOf(Objects.hash(nodeId)),
337-
RemoteStoreUtils.invertLong(System.currentTimeMillis()),
378+
RemoteStoreUtils.invertLong(creationTimestamp),
338379
String.valueOf(metadataVersion)
339380
);
340381
}
341382

383+
public static String getMetadataFilename(
384+
long primaryTerm,
385+
long generation,
386+
long translogGeneration,
387+
long uploadCounter,
388+
int metadataVersion,
389+
String nodeId
390+
) {
391+
return getMetadataFilename(
392+
primaryTerm,
393+
generation,
394+
translogGeneration,
395+
uploadCounter,
396+
metadataVersion,
397+
nodeId,
398+
System.currentTimeMillis()
399+
);
400+
}
401+
342402
// Visible for testing
343403
static long getPrimaryTerm(String[] filenameTokens) {
344404
return RemoteStoreUtils.invertLong(filenameTokens[1]);
@@ -778,6 +838,7 @@ public void deleteStaleSegments(int lastNMetadataFilesToKeep) throws IOException
778838
);
779839
return;
780840
}
841+
781842
List<String> sortedMetadataFileList = remoteMetadataDirectory.listFilesByPrefixInLexicographicOrder(
782843
MetadataFilenameUtils.METADATA_PREFIX,
783844
Integer.MAX_VALUE
@@ -791,16 +852,44 @@ public void deleteStaleSegments(int lastNMetadataFilesToKeep) throws IOException
791852
return;
792853
}
793854

794-
List<String> metadataFilesEligibleToDelete = new ArrayList<>(
795-
sortedMetadataFileList.subList(lastNMetadataFilesToKeep, sortedMetadataFileList.size())
855+
// Check last fetch status of pinned timestamps. If stale, return.
856+
if (RemoteStoreUtils.isPinnedTimestampStateStale()) {
857+
logger.warn("Skipping remote segment store garbage collection as last fetch of pinned timestamp is stale");
858+
return;
859+
}
860+
861+
Tuple<Long, Set<Long>> pinnedTimestampsState = RemoteStorePinnedTimestampService.getPinnedTimestamps();
862+
863+
Set<String> implicitLockedFiles = RemoteStoreUtils.getPinnedTimestampLockedFiles(
864+
sortedMetadataFileList,
865+
pinnedTimestampsState.v2(),
866+
metadataFilePinnedTimestampMap,
867+
MetadataFilenameUtils::getTimestamp,
868+
MetadataFilenameUtils::getNodeIdByPrimaryTermAndGen
796869
);
797-
Set<String> allLockFiles;
870+
final Set<String> allLockFiles = new HashSet<>(implicitLockedFiles);
871+
798872
try {
799-
allLockFiles = ((RemoteStoreMetadataLockManager) mdLockManager).fetchLockedMetadataFiles(MetadataFilenameUtils.METADATA_PREFIX);
873+
allLockFiles.addAll(
874+
((RemoteStoreMetadataLockManager) mdLockManager).fetchLockedMetadataFiles(MetadataFilenameUtils.METADATA_PREFIX)
875+
);
800876
} catch (Exception e) {
801877
logger.error("Exception while fetching segment metadata lock files, skipping deleteStaleSegments", e);
802878
return;
803879
}
880+
881+
List<String> metadataFilesEligibleToDelete = new ArrayList<>(
882+
sortedMetadataFileList.subList(lastNMetadataFilesToKeep, sortedMetadataFileList.size())
883+
);
884+
885+
// Along with last N files, we need to keep files since last successful run of scheduler
886+
long lastSuccessfulFetchOfPinnedTimestamps = pinnedTimestampsState.v1();
887+
metadataFilesEligibleToDelete = RemoteStoreUtils.filterOutMetadataFilesBasedOnAge(
888+
metadataFilesEligibleToDelete,
889+
MetadataFilenameUtils::getTimestamp,
890+
lastSuccessfulFetchOfPinnedTimestamps
891+
);
892+
804893
List<String> metadataFilesToBeDeleted = metadataFilesEligibleToDelete.stream()
805894
.filter(metadataFile -> allLockFiles.contains(metadataFile) == false)
806895
.collect(Collectors.toList());

server/src/main/java/org/opensearch/node/remotestore/RemoteStorePinnedTimestampService.java

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -219,9 +219,9 @@ private ActionListener<Void> getListenerForWriteCallResponse(
219219

220220
private PinnedTimestamps readExistingPinnedTimestamps(String blobFilename, RemotePinnedTimestamps remotePinnedTimestamps) {
221221
remotePinnedTimestamps.setBlobFileName(blobFilename);
222-
remotePinnedTimestamps.setFullBlobName(pinnedTimestampsBlobStore.getBlobPathForUpload(remotePinnedTimestamps));
222+
remotePinnedTimestamps.setFullBlobName(pinnedTimestampsBlobStore().getBlobPathForUpload(remotePinnedTimestamps));
223223
try {
224-
return pinnedTimestampsBlobStore.read(remotePinnedTimestamps);
224+
return pinnedTimestampsBlobStore().read(remotePinnedTimestamps);
225225
} catch (IOException e) {
226226
throw new RuntimeException("Failed to read existing pinned timestamps", e);
227227
}
@@ -245,6 +245,14 @@ public static Tuple<Long, Set<Long>> getPinnedTimestamps() {
245245
return pinnedTimestampsSet;
246246
}
247247

248+
public RemoteStorePinnedTimestampsBlobStore pinnedTimestampsBlobStore() {
249+
return pinnedTimestampsBlobStore;
250+
}
251+
252+
public BlobStoreTransferService blobStoreTransferService() {
253+
return blobStoreTransferService;
254+
}
255+
248256
/**
249257
* Inner class for asynchronously updating the pinned timestamp set.
250258
*/
@@ -266,11 +274,12 @@ protected void runInternal() {
266274
clusterService.state().metadata().clusterUUID(),
267275
blobStoreRepository.getCompressor()
268276
);
269-
BlobPath path = pinnedTimestampsBlobStore.getBlobPathForUpload(remotePinnedTimestamps);
270-
blobStoreTransferService.listAllInSortedOrder(path, remotePinnedTimestamps.getType(), 1, new ActionListener<>() {
277+
BlobPath path = pinnedTimestampsBlobStore().getBlobPathForUpload(remotePinnedTimestamps);
278+
blobStoreTransferService().listAllInSortedOrder(path, remotePinnedTimestamps.getType(), 1, new ActionListener<>() {
271279
@Override
272280
public void onResponse(List<BlobMetadata> blobMetadata) {
273281
if (blobMetadata.isEmpty()) {
282+
pinnedTimestampsSet = new Tuple<>(triggerTimestamp, Set.of());
274283
return;
275284
}
276285
PinnedTimestamps pinnedTimestamps = readExistingPinnedTimestamps(blobMetadata.get(0).name(), remotePinnedTimestamps);

server/src/test/java/org/opensearch/index/remote/RemoteStoreUtilsTests.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1081,5 +1081,4 @@ public void testIsPinnedTimestampStateStaleFeatureEnabled() {
10811081
setupRemotePinnedTimestampFeature(true);
10821082
assertTrue(RemoteStoreUtils.isPinnedTimestampStateStale());
10831083
}
1084-
10851084
}

server/src/test/java/org/opensearch/index/store/BaseRemoteSegmentStoreDirectoryTests.java

Lines changed: 5 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -43,47 +43,19 @@ public class BaseRemoteSegmentStoreDirectoryTests extends IndexShardTestCase {
4343
protected SegmentInfos segmentInfos;
4444
protected ThreadPool threadPool;
4545

46-
protected final String metadataFilename = RemoteSegmentStoreDirectory.MetadataFilenameUtils.getMetadataFilename(
47-
12,
48-
23,
49-
34,
50-
1,
51-
1,
52-
"node-1"
53-
);
46+
protected String metadataFilename = RemoteSegmentStoreDirectory.MetadataFilenameUtils.getMetadataFilename(12, 23, 34, 1, 1, "node-1");
5447

55-
protected final String metadataFilenameDup = RemoteSegmentStoreDirectory.MetadataFilenameUtils.getMetadataFilename(
48+
protected String metadataFilenameDup = RemoteSegmentStoreDirectory.MetadataFilenameUtils.getMetadataFilename(
5649
12,
5750
23,
5851
34,
5952
2,
6053
1,
6154
"node-2"
6255
);
63-
protected final String metadataFilename2 = RemoteSegmentStoreDirectory.MetadataFilenameUtils.getMetadataFilename(
64-
12,
65-
13,
66-
34,
67-
1,
68-
1,
69-
"node-1"
70-
);
71-
protected final String metadataFilename3 = RemoteSegmentStoreDirectory.MetadataFilenameUtils.getMetadataFilename(
72-
10,
73-
38,
74-
34,
75-
1,
76-
1,
77-
"node-1"
78-
);
79-
protected final String metadataFilename4 = RemoteSegmentStoreDirectory.MetadataFilenameUtils.getMetadataFilename(
80-
10,
81-
36,
82-
34,
83-
1,
84-
1,
85-
"node-1"
86-
);
56+
protected String metadataFilename2 = RemoteSegmentStoreDirectory.MetadataFilenameUtils.getMetadataFilename(12, 13, 34, 1, 1, "node-1");
57+
protected String metadataFilename3 = RemoteSegmentStoreDirectory.MetadataFilenameUtils.getMetadataFilename(10, 38, 34, 1, 1, "node-1");
58+
protected String metadataFilename4 = RemoteSegmentStoreDirectory.MetadataFilenameUtils.getMetadataFilename(10, 36, 34, 1, 1, "node-1");
8759

8860
public void setupRemoteSegmentStoreDirectory() throws IOException {
8961
remoteDataDirectory = mock(RemoteDirectory.class);

0 commit comments

Comments
 (0)