Skip to content

Commit 5ca675f

Browse files
zane-neo authored and akolarkunnu committed
fix cluster not able to spin up issue when disk usage exceeds threshold (opensearch-project#15258)
* fix cluster not able to spin up issue when disk usage exceeds threshold Signed-off-by: zane-neo <zaniu@amazon.com> * Add comment to changes Signed-off-by: zane-neo <zaniu@amazon.com> * Add UT to ensure the keepAliveThread starts before node starts Signed-off-by: zane-neo <zaniu@amazon.com> * remove unused imports Signed-off-by: zane-neo <zaniu@amazon.com> * Fix forbidden API calls check failed issue Signed-off-by: zane-neo <zaniu@amazon.com> * format code Signed-off-by: zane-neo <zaniu@amazon.com> * format code Signed-off-by: zane-neo <zaniu@amazon.com> * change setInstance method to static Signed-off-by: zane-neo <zaniu@amazon.com> * Add CountDownLatch in test to coordinate the threads and avoid test failures caused by concurrency issues Signed-off-by: zane-neo <zaniu@amazon.com> --------- Signed-off-by: zane-neo <zaniu@amazon.com>
1 parent 5dfdba4 commit 5ca675f

File tree

3 files changed

+63
-2
lines changed

3 files changed

+63
-2
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
8383
- Fix protobuf-java leak through client library dependencies ([#16254](https://github.com/opensearch-project/OpenSearch/pull/16254))
8484
- Fix multi-search with template doesn't return status code ([#16265](https://github.com/opensearch-project/OpenSearch/pull/16265))
8585
- Fix wrong default value when setting `index.number_of_routing_shards` to null on index creation ([#16331](https://github.com/opensearch-project/OpenSearch/pull/16331))
86+
- Fix disk usage exceeds threshold cluster can't spin up issue ([#15258](https://github.com/opensearch-project/OpenSearch/pull/15258))
87+
8688

8789
### Security
8890

distribution/tools/keystore-cli/src/test/java/org/opensearch/bootstrap/BootstrapTests.java

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,15 @@
3131

3232
package org.opensearch.bootstrap;
3333

34+
import org.opensearch.common.logging.LogConfigurator;
3435
import org.opensearch.common.settings.KeyStoreCommandTestCase;
3536
import org.opensearch.common.settings.KeyStoreWrapper;
3637
import org.opensearch.common.settings.SecureSettings;
3738
import org.opensearch.common.settings.Settings;
3839
import org.opensearch.common.util.io.IOUtils;
3940
import org.opensearch.core.common.settings.SecureString;
4041
import org.opensearch.env.Environment;
42+
import org.opensearch.node.Node;
4143
import org.opensearch.test.OpenSearchTestCase;
4244
import org.junit.After;
4345
import org.junit.Before;
@@ -51,8 +53,14 @@
5153
import java.nio.file.Path;
5254
import java.util.ArrayList;
5355
import java.util.List;
56+
import java.util.concurrent.CountDownLatch;
57+
import java.util.concurrent.TimeUnit;
58+
import java.util.concurrent.atomic.AtomicInteger;
5459

5560
import static org.hamcrest.Matchers.equalTo;
61+
import static org.mockito.Mockito.doAnswer;
62+
import static org.mockito.Mockito.mock;
63+
import static org.mockito.Mockito.verify;
5664

5765
public class BootstrapTests extends OpenSearchTestCase {
5866
Environment env;
@@ -131,4 +139,38 @@ private void assertPassphraseRead(String source, String expected) {
131139
}
132140
}
133141

142+
public void testInitExecutionOrder() throws Exception {
143+
AtomicInteger order = new AtomicInteger(0);
144+
CountDownLatch countDownLatch = new CountDownLatch(1);
145+
Thread mockThread = new Thread(() -> {
146+
assertEquals(0, order.getAndIncrement());
147+
countDownLatch.countDown();
148+
});
149+
150+
Node mockNode = mock(Node.class);
151+
doAnswer(invocation -> {
152+
try {
153+
boolean threadStarted = countDownLatch.await(1000, TimeUnit.MILLISECONDS);
154+
assertTrue(
155+
"Waited for one second but the keepAliveThread isn't started, please check the execution order of"
156+
+ "keepAliveThread.start and node.start",
157+
threadStarted
158+
);
159+
} catch (InterruptedException e) {
160+
fail("Thread interrupted");
161+
}
162+
assertEquals(1, order.getAndIncrement());
163+
return null;
164+
}).when(mockNode).start();
165+
166+
LogConfigurator.registerErrorListener();
167+
Bootstrap testBootstrap = new Bootstrap(mockThread, mockNode);
168+
Bootstrap.setInstance(testBootstrap);
169+
170+
Bootstrap.startInstance(testBootstrap);
171+
172+
verify(mockNode).start();
173+
assertEquals(2, order.get());
174+
}
175+
134176
}

server/src/main/java/org/opensearch/bootstrap/Bootstrap.java

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,17 @@ final class Bootstrap {
9393
private final Thread keepAliveThread;
9494
private final Spawner spawner = new Spawner();
9595

96+
/**
 * Replaces the static {@code INSTANCE} with the given bootstrap. For testing purposes only.
 *
 * @param bootstrap the bootstrap to install as {@code INSTANCE}
 */
97+
static void setInstance(Bootstrap bootstrap) {
98+
INSTANCE = bootstrap;
99+
}
100+
101+
/**
 * Creates a bootstrap from a caller-supplied keep-alive thread and node. For testing purposes only.
 *
 * @param keepAliveThread the thread stored in the {@code keepAliveThread} field
 * @param node the node stored in the {@code node} field
 */
102+
Bootstrap(Thread keepAliveThread, Node node) {
103+
this.keepAliveThread = keepAliveThread;
104+
this.node = node;
105+
}
106+
96107
/** creates a new instance */
97108
Bootstrap() {
98109
keepAliveThread = new Thread(new Runnable() {
@@ -336,8 +347,10 @@ private static Environment createEnvironment(
336347
}
337348

338349
/**
 * Starts the keep-alive thread and then the node.
 *
 * @throws NodeValidationException declared by this method; presumably raised by {@code node.start()} — confirm
 */
private void start() throws NodeValidationException {
339-
node.start();
350+
// Start keepAliveThread before the node so that the cluster can still spin up in edge cases; see
351+
// https://github.com/opensearch-project/OpenSearch/issues/14791
340352
keepAliveThread.start();
353+
node.start();
341354
}
342355

343356
static void stop() throws IOException {
@@ -410,7 +423,7 @@ static void init(final boolean foreground, final Path pidFile, final boolean qui
410423
throw new BootstrapException(e);
411424
}
412425

413-
INSTANCE.start();
426+
startInstance(INSTANCE);
414427

415428
// We don't close stderr if `--quiet` is passed, because that
416429
// hides fatal startup errors. For example, if OpenSearch is
@@ -462,6 +475,10 @@ static void init(final boolean foreground, final Path pidFile, final boolean qui
462475
}
463476
}
464477

478+
/**
 * Starts the given bootstrap instance by delegating to its private {@code start()} method.
 *
 * @param instance the bootstrap instance to start
 * @throws NodeValidationException if {@code instance.start()} throws it
 */
static void startInstance(Bootstrap instance) throws NodeValidationException {
479+
instance.start();
480+
}
481+
465482
@SuppressForbidden(reason = "System#out")
466483
private static void closeSystOut() {
467484
System.out.close();

0 commit comments

Comments
 (0)