Skip to content

Commit c5b64ee

Browse files
Fix bugs causing red indexes with remote indexes during translog upload & store recovery (#10449) (#10498)
--------- (cherry picked from commit 8bb11a6) Signed-off-by: Ashish Singh <ssashish@amazon.com> Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent 6c04459 commit c5b64ee

File tree

4 files changed

+79
-4
lines changed

4 files changed

+79
-4
lines changed

server/src/internalClusterTest/java/org/opensearch/remotestore/AbstractRemoteStoreMockRepositoryIntegTestCase.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ private String getLocalSegmentFilename(String remoteFilename) {
160160
return remoteFilename.split(RemoteSegmentStoreDirectory.SEGMENT_NAME_UUID_SEPARATOR)[0];
161161
}
162162

163-
private IndexResponse indexSingleDoc() {
163+
protected IndexResponse indexSingleDoc() {
164164
return client().prepareIndex(INDEX_NAME)
165165
.setId(UUIDs.randomBase64UUID())
166166
.setSource(randomAlphaOfLength(5), randomAlphaOfLength(5))

server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreBackpressureIT.java renamed to server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreBackpressureAndResiliencyIT.java

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,18 @@
1212
import org.opensearch.action.admin.cluster.remotestore.stats.RemoteStoreStatsResponse;
1313
import org.opensearch.action.admin.cluster.settings.ClusterUpdateSettingsResponse;
1414
import org.opensearch.common.settings.Settings;
15+
import org.opensearch.common.unit.TimeValue;
16+
import org.opensearch.common.util.concurrent.AbstractAsyncTask;
17+
import org.opensearch.common.util.concurrent.UncategorizedExecutionException;
1518
import org.opensearch.core.common.bytes.BytesArray;
1619
import org.opensearch.core.common.bytes.BytesReference;
1720
import org.opensearch.core.common.unit.ByteSizeUnit;
1821
import org.opensearch.core.concurrency.OpenSearchRejectedExecutionException;
1922
import org.opensearch.core.xcontent.MediaTypeRegistry;
23+
import org.opensearch.index.IndexService;
2024
import org.opensearch.index.remote.RemoteSegmentTransferTracker;
25+
import org.opensearch.index.shard.IndexShard;
26+
import org.opensearch.indices.IndicesService;
2127
import org.opensearch.repositories.RepositoriesService;
2228
import org.opensearch.snapshots.mockstore.MockRepository;
2329
import org.opensearch.test.OpenSearchIntegTestCase;
@@ -33,7 +39,7 @@
3339
import static org.opensearch.index.remote.RemoteStorePressureSettings.REMOTE_REFRESH_SEGMENT_PRESSURE_ENABLED;
3440

3541
@OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST, numDataNodes = 0)
36-
public class RemoteStoreBackpressureIT extends AbstractRemoteStoreMockRepositoryIntegTestCase {
42+
public class RemoteStoreBackpressureAndResiliencyIT extends AbstractRemoteStoreMockRepositoryIntegTestCase {
3743
public void testWritesRejectedDueToConsecutiveFailureBreach() throws Exception {
3844
// Here the doc size of the request remains same throughout the test. After initial indexing, all remote store interactions
3945
// fail leading to consecutive failure limit getting exceeded and leading to rejections.
@@ -156,4 +162,70 @@ private String generateString(int sizeInBytes) {
156162
sb.append("}");
157163
return sb.toString();
158164
}
165+
166+
/**
167+
* Fixes <a href="https://github.com/opensearch-project/OpenSearch/issues/10398">Github#10398</a>
168+
*/
169+
public void testAsyncTrimTaskSucceeds() {
170+
Path location = randomRepoPath().toAbsolutePath();
171+
String dataNodeName = setup(location, 0d, "metadata", Long.MAX_VALUE);
172+
173+
logger.info("Increasing the frequency of async trim task to ensure it runs in background while indexing");
174+
IndexService indexService = internalCluster().getInstance(IndicesService.class, dataNodeName).iterator().next();
175+
((AbstractAsyncTask) indexService.getTrimTranslogTask()).setInterval(TimeValue.timeValueMillis(100));
176+
177+
logger.info("--> Indexing data");
178+
indexData(randomIntBetween(2, 5), true);
179+
logger.info("--> Indexing succeeded");
180+
181+
MockRepository translogRepo = (MockRepository) internalCluster().getInstance(RepositoriesService.class, dataNodeName)
182+
.repository(TRANSLOG_REPOSITORY_NAME);
183+
logger.info("--> Failing all remote store interaction");
184+
translogRepo.setRandomControlIOExceptionRate(1d);
185+
186+
for (int i = 0; i < randomIntBetween(5, 10); i++) {
187+
UncategorizedExecutionException exception = assertThrows(UncategorizedExecutionException.class, this::indexSingleDoc);
188+
assertEquals("Failed execution", exception.getMessage());
189+
}
190+
191+
translogRepo.setRandomControlIOExceptionRate(0d);
192+
indexSingleDoc();
193+
logger.info("Indexed single doc successfully");
194+
}
195+
196+
/**
197+
* Fixes <a href="https://github.com/opensearch-project/OpenSearch/issues/10400">Github#10400</a>
198+
*/
199+
public void testSkipLoadGlobalCheckpointToReplicationTracker() {
200+
Path location = randomRepoPath().toAbsolutePath();
201+
String dataNodeName = setup(location, 0d, "metadata", Long.MAX_VALUE);
202+
203+
logger.info("--> Indexing data");
204+
indexData(randomIntBetween(1, 2), true);
205+
logger.info("--> Indexing succeeded");
206+
207+
IndexService indexService = internalCluster().getInstance(IndicesService.class, dataNodeName).iterator().next();
208+
IndexShard indexShard = indexService.getShard(0);
209+
indexShard.failShard("failing shard", null);
210+
211+
ensureRed(INDEX_NAME);
212+
213+
MockRepository translogRepo = (MockRepository) internalCluster().getInstance(RepositoriesService.class, dataNodeName)
214+
.repository(TRANSLOG_REPOSITORY_NAME);
215+
logger.info("--> Failing all remote store interaction");
216+
translogRepo.setRandomControlIOExceptionRate(1d);
217+
client().admin().cluster().prepareReroute().setRetryFailed(true).get();
218+
// Cluster stays red as the remote interactions are still failing
219+
ensureRed(INDEX_NAME);
220+
221+
logger.info("Retrying to allocate failed shards");
222+
client().admin().cluster().prepareReroute().setRetryFailed(true).get();
223+
// Cluster stays red as the remote interactions are still failing
224+
ensureRed(INDEX_NAME);
225+
226+
logger.info("Stop failing all remote store interactions");
227+
translogRepo.setRandomControlIOExceptionRate(0d);
228+
client().admin().cluster().prepareReroute().setRetryFailed(true).get();
229+
ensureGreen(INDEX_NAME);
230+
}
159231
}

server/src/main/java/org/opensearch/index/IndexService.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1286,7 +1286,7 @@ AsyncTranslogFSync getFsyncTask() { // for tests
12861286
return fsyncTask;
12871287
}
12881288

1289-
AsyncTrimTranslogTask getTrimTranslogTask() { // for tests
1289+
public AsyncTrimTranslogTask getTrimTranslogTask() { // for tests
12901290
return trimTranslogTask;
12911291
}
12921292

server/src/main/java/org/opensearch/index/shard/IndexShard.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1470,6 +1470,9 @@ public void flush(FlushRequest request) {
14701470
* {@link org.opensearch.index.translog.TranslogDeletionPolicy} for details
14711471
*/
14721472
public void trimTranslog() {
1473+
if (isRemoteTranslogEnabled()) {
1474+
return;
1475+
}
14731476
verifyNotClosed();
14741477
final Engine engine = getEngine();
14751478
engine.trimUnreferencedTranslogFiles();
@@ -2320,7 +2323,7 @@ public void openEngineAndRecoverFromTranslog() throws IOException {
23202323
};
23212324

23222325
// Do not load the global checkpoint if this is a remote snapshot index
2323-
if (indexSettings.isRemoteSnapshot() == false) {
2326+
if (indexSettings.isRemoteSnapshot() == false && indexSettings.isRemoteTranslogStoreEnabled() == false) {
23242327
loadGlobalCheckpointToReplicationTracker();
23252328
}
23262329

0 commit comments

Comments
 (0)