Skip to content

Commit fc170ee

Browse files
author
Rahul Karajgikar
committed
changes to tests based on comments
Signed-off-by: Rahul Karajgikar <karajgik@amazon.com>
1 parent 83b16de commit fc170ee

File tree

3 files changed

+179
-59
lines changed

3 files changed

+179
-59
lines changed

server/src/internalClusterTest/java/org/opensearch/cluster/coordination/NodeJoinLeftIT.java

Lines changed: 31 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,9 @@
3333
package org.opensearch.cluster.coordination;
3434

3535
import org.apache.logging.log4j.LogManager;
36-
import org.apache.logging.log4j.core.LogEvent;
3736
import org.apache.logging.log4j.core.LoggerContext;
38-
import org.apache.logging.log4j.core.appender.AbstractAppender;
3937
import org.apache.logging.log4j.core.config.Configuration;
4038
import org.apache.logging.log4j.core.config.LoggerConfig;
41-
import org.apache.logging.log4j.core.config.Property;
42-
import org.apache.logging.log4j.core.layout.PatternLayout;
4339
import org.opensearch.action.admin.cluster.health.ClusterHealthResponse;
4440
import org.opensearch.cluster.NodeConnectionsService;
4541
import org.opensearch.cluster.metadata.IndexMetadata;
@@ -53,6 +49,7 @@
5349
import org.opensearch.test.OpenSearchIntegTestCase;
5450
import org.opensearch.test.OpenSearchIntegTestCase.ClusterScope;
5551
import org.opensearch.test.OpenSearchIntegTestCase.Scope;
52+
import org.opensearch.test.TestLogsAppender;
5653
import org.opensearch.test.store.MockFSIndexStore;
5754
import org.opensearch.test.transport.MockTransportService;
5855
import org.opensearch.test.transport.StubbableTransport;
@@ -64,12 +61,11 @@
6461
import org.junit.After;
6562
import org.junit.Before;
6663

67-
import java.util.ArrayList;
6864
import java.util.Arrays;
6965
import java.util.Collection;
7066
import java.util.List;
67+
import java.util.concurrent.TimeUnit;
7168
import java.util.concurrent.atomic.AtomicBoolean;
72-
import java.util.regex.Pattern;
7369

7470
import static org.opensearch.cluster.coordination.FollowersChecker.FOLLOWER_CHECK_ACTION_NAME;
7571
import static org.hamcrest.Matchers.is;
@@ -81,7 +77,7 @@
8177
@ClusterScope(scope = Scope.TEST, numDataNodes = 0)
8278
public class NodeJoinLeftIT extends OpenSearchIntegTestCase {
8379

84-
private TestAppender testAppender;
80+
private TestLogsAppender testLogsAppender;
8581
private String clusterManager;
8682
private String redNodeName;
8783
private LoggerContext loggerContext;
@@ -108,11 +104,13 @@ protected void beforeIndexDeletion() throws Exception {
108104
@Before
109105
public void setUp() throws Exception {
110106
super.setUp();
111-
testAppender = new TestAppender();
107+
// Add any other specific messages you want to capture
108+
List<String> messagesToCapture = Arrays.asList("failed to join", "IllegalStateException");
109+
testLogsAppender = new TestLogsAppender(messagesToCapture);
112110
loggerContext = (LoggerContext) LogManager.getContext(false);
113111
Configuration config = loggerContext.getConfiguration();
114112
LoggerConfig loggerConfig = config.getLoggerConfig(ClusterConnectionManager.class.getName());
115-
loggerConfig.addAppender(testAppender, null, null);
113+
loggerConfig.addAppender(testLogsAppender, null, null);
116114
loggerContext.updateLoggers();
117115

118116
String indexName = "test";
@@ -148,10 +146,11 @@ public void setUp() throws Exception {
148146

149147
@After
150148
public void tearDown() throws Exception {
149+
testLogsAppender.clearCapturedLogs();
151150
loggerContext = (LoggerContext) LogManager.getContext(false);
152151
Configuration config = loggerContext.getConfiguration();
153152
LoggerConfig loggerConfig = config.getLoggerConfig(ClusterConnectionManager.class.getName());
154-
loggerConfig.removeAppender(testAppender.getName());
153+
loggerConfig.removeAppender(testLogsAppender.getName());
155154
loggerContext.updateLoggers();
156155
super.tearDown();
157156
}
@@ -190,9 +189,9 @@ public void testClusterStabilityWhenJoinRequestHappensDuringNodeLeftTask() throw
190189
);
191190
redTransportService.addRequestHandlingBehavior(FOLLOWER_CHECK_ACTION_NAME, simulatedFailureBehaviour);
192191

193-
// Loop runs 10 times to ensure race condition gets reproduced
194-
for (int i = 0; i < 10; i++) {
195-
// Fail followerchecker by force to trigger node disconnect and node left
192+
// Loop runs 5 times to ensure race condition gets reproduced
193+
testLogsAppender.clearCapturedLogs();
194+
for (int i = 0; i < 5; i++) {
196195
logger.info("--> simulating followerchecker failure to trigger node-left");
197196
succeedFollowerChecker.set(false);
198197
ClusterHealthResponse response1 = client().admin().cluster().prepareHealth().setWaitForNodes("2").get();
@@ -214,12 +213,14 @@ public void testClusterStabilityWhenJoinRequestHappensDuringNodeLeftTask() throw
214213
ClusterHealthResponse response = client().admin().cluster().prepareHealth().setWaitForNodes("3").get();
215214
assertThat(response.isTimedOut(), is(false));
216215

217-
// assert that the right exception message showed up in logs
218-
assertTrue(
219-
"Expected IllegalStateException was not logged",
220-
testAppender.containsExceptionMessage("IllegalStateException[cannot make a new connection as disconnect to node")
221-
);
222-
216+
// assert that join requests fail with the right exception
217+
boolean logFound = testLogsAppender.waitForLog("failed to join", 30, TimeUnit.SECONDS)
218+
&& testLogsAppender.waitForLog(
219+
"IllegalStateException[cannot make a new connection as disconnect to node",
220+
30,
221+
TimeUnit.SECONDS
222+
);
223+
assertTrue("Expected log was not found within the timeout period", logFound);
223224
}
224225

225226
public void testClusterStabilityWhenDisconnectDuringSlowNodeLeftTask() throws Exception {
@@ -259,8 +260,9 @@ public void testClusterStabilityWhenDisconnectDuringSlowNodeLeftTask() throws Ex
259260
);
260261
redTransportService.addRequestHandlingBehavior(FOLLOWER_CHECK_ACTION_NAME, simulatedFailureBehaviour);
261262

262-
// Loop runs 10 times to ensure race condition gets reproduced
263-
for (int i = 0; i < 10; i++) {
263+
// Loop runs 5 times to ensure race condition gets reproduced
264+
testLogsAppender.clearCapturedLogs();
265+
for (int i = 0; i < 5; i++) {
264266
// Fail followerchecker by force to trigger node disconnect and node left
265267
logger.info("--> simulating followerchecker failure to trigger node-left");
266268
succeedFollowerChecker.set(false);
@@ -291,11 +293,15 @@ public void testClusterStabilityWhenDisconnectDuringSlowNodeLeftTask() throws Ex
291293
ClusterHealthResponse response = client().admin().cluster().prepareHealth().setWaitForNodes("3").get();
292294
assertThat(response.isTimedOut(), is(false));
293295

294-
// assert that the right exception message showed up in logs
295-
assertTrue(
296-
"Expected IllegalStateException was not logged",
297-
testAppender.containsExceptionMessage("IllegalStateException[cannot make a new connection as disconnect to node")
296+
// assert that join requests fail with the right exception
297+
boolean logFound = testLogsAppender.waitForLog("failed to join", 30, TimeUnit.SECONDS);
298+
assertTrue("Expected log was not found within the timeout period", logFound);
299+
logFound = testLogsAppender.waitForLog(
300+
"IllegalStateException[cannot make a new connection as disconnect to node",
301+
30,
302+
TimeUnit.SECONDS
298303
);
304+
assertTrue("Expected log was not found within the timeout period", logFound);
299305
}
300306

301307
public void testRestartDataNode() throws Exception {
@@ -346,29 +352,4 @@ public void messageReceived(
346352
handler.messageReceived(request, channel, task);
347353
}
348354
}
349-
350-
private static class TestAppender extends AbstractAppender {
351-
private final List<String> logs = new ArrayList<>();
352-
353-
TestAppender() {
354-
super("TestAppender", null, PatternLayout.createDefaultLayout(), false, Property.EMPTY_ARRAY);
355-
start();
356-
}
357-
358-
@Override
359-
public void append(LogEvent event) {
360-
logs.add(event.getMessage().getFormattedMessage());
361-
if (event.getThrown() != null) {
362-
logs.add(event.getThrown().toString());
363-
for (StackTraceElement element : event.getThrown().getStackTrace()) {
364-
logs.add(element.toString());
365-
}
366-
}
367-
}
368-
369-
boolean containsExceptionMessage(String exceptionMessage) {
370-
Pattern pattern = Pattern.compile(Pattern.quote(exceptionMessage), Pattern.CASE_INSENSITIVE);
371-
return logs.stream().anyMatch(log -> pattern.matcher(log).find());
372-
}
373-
}
374355
}

server/src/test/java/org/opensearch/cluster/NodeConnectionsServiceTests.java

Lines changed: 74 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@
3535
import org.apache.logging.log4j.Level;
3636
import org.apache.logging.log4j.LogManager;
3737
import org.apache.logging.log4j.Logger;
38+
import org.apache.logging.log4j.core.LoggerContext;
39+
import org.apache.logging.log4j.core.config.Configuration;
40+
import org.apache.logging.log4j.core.config.LoggerConfig;
3841
import org.opensearch.OpenSearchTimeoutException;
3942
import org.opensearch.Version;
4043
import org.opensearch.action.support.PlainActionFuture;
@@ -53,9 +56,11 @@
5356
import org.opensearch.telemetry.tracing.noop.NoopTracer;
5457
import org.opensearch.test.MockLogAppender;
5558
import org.opensearch.test.OpenSearchTestCase;
59+
import org.opensearch.test.TestLogsAppender;
5660
import org.opensearch.test.junit.annotations.TestLogging;
5761
import org.opensearch.threadpool.TestThreadPool;
5862
import org.opensearch.threadpool.ThreadPool;
63+
import org.opensearch.transport.ClusterConnectionManager;
5964
import org.opensearch.transport.ConnectTransportException;
6065
import org.opensearch.transport.ConnectionProfile;
6166
import org.opensearch.transport.Transport;
@@ -69,6 +74,7 @@
6974
import org.junit.Before;
7075

7176
import java.util.ArrayList;
77+
import java.util.Arrays;
7278
import java.util.Collections;
7379
import java.util.HashSet;
7480
import java.util.List;
@@ -94,6 +100,8 @@ public class NodeConnectionsServiceTests extends OpenSearchTestCase {
94100
private ThreadPool threadPool;
95101
private TransportService transportService;
96102
private Map<DiscoveryNode, CheckedRunnable<Exception>> nodeConnectionBlocks;
103+
private TestLogsAppender testLogsAppender;
104+
private LoggerContext loggerContext;
97105

98106
private List<DiscoveryNode> generateNodes() {
99107
List<DiscoveryNode> nodes = new ArrayList<>();
@@ -516,7 +524,7 @@ public void testConnectionCheckerRetriesIfPendingDisconnection() throws Interrup
516524

517525
// setup the connections
518526
final DiscoveryNode node = new DiscoveryNode("node0", buildNewFakeTransportAddress(), Version.CURRENT);
519-
;
527+
520528
final DiscoveryNodes nodes = DiscoveryNodes.builder().add(node).build();
521529

522530
final AtomicBoolean connectionCompleted = new AtomicBoolean();
@@ -526,13 +534,15 @@ public void testConnectionCheckerRetriesIfPendingDisconnection() throws Interrup
526534

527535
// now trigger a disconnect, and then set pending disconnections to true to fail any new connections
528536
final long maxDisconnectionTime = 1000;
529-
final long disconnectionTime = 100;
530-
deterministicTaskQueue.scheduleAt(disconnectionTime, new Runnable() {
537+
deterministicTaskQueue.scheduleNow(new Runnable() {
531538
@Override
532539
public void run() {
533540
transportService.disconnectFromNode(node);
534541
logger.info("--> setting pending disconnections to fail next connection attempts");
535542
service.setPendingDisconnections(new HashSet<>(Collections.singleton(node)));
543+
// we reset the connection count during the first disconnection
544+
// we also clear the captured logs as we want to assert for exceptions that show up after this
545+
testLogsAppender.clearCapturedLogs();
536546
transportService.resetConnectToNodeCallCount();
537547
}
538548

@@ -541,20 +551,39 @@ public String toString() {
541551
return "scheduled disconnection of " + node;
542552
}
543553
});
554+
final long maxReconnectionTime = 2000;
555+
final int expectedReconnectionAttempts = 5;
544556

545-
// ensure the disconnect task completes, give extra time also for connection checker tasks
546-
runTasksUntil(deterministicTaskQueue, maxDisconnectionTime);
547-
548-
// verify that connectionchecker is trying to call connectToNode multiple times
557+
// ensure the disconnect task completes, and run for additional time to check for reconnections
558+
// exit early if we see enough reconnection attempts
549559
logger.info("--> verifying connectionchecker is trying to reconnect");
560+
runTasksUntilExpectedReconnectionAttempts(
561+
deterministicTaskQueue,
562+
maxDisconnectionTime + maxReconnectionTime,
563+
transportService,
564+
expectedReconnectionAttempts
565+
);
566+
567+
// assert that we saw at least the required number of reconnection attempts, and the exceptions that showed up are as expected
550568
logger.info("--> number of reconnection attempts: {}", transportService.getConnectToNodeCallCount());
551-
assertThat("ConnectToNode should be called multiple times", transportService.getConnectToNodeCallCount(), greaterThan(5));
569+
assertThat(
570+
"Did not see enough reconnection attempts from connection checker",
571+
transportService.getConnectToNodeCallCount(),
572+
greaterThan(expectedReconnectionAttempts)
573+
);
574+
boolean logFound = testLogsAppender.waitForLog("failed to connect", 1, TimeUnit.SECONDS)
575+
&& testLogsAppender.waitForLog(
576+
"IllegalStateException: cannot make a new connection as disconnect to node",
577+
1,
578+
TimeUnit.SECONDS
579+
);
580+
assertTrue("Expected log for reconnection failure was not found in the required time period", logFound);
552581
assertFalse("connected to " + node, transportService.nodeConnected(node));
553582

554583
// clear the pending disconnections and ensure the connection gets re-established automatically by connectionchecker
555584
logger.info("--> clearing pending disconnections to allow connections to re-establish");
556585
service.clearPendingDisconnections();
557-
runTasksUntil(deterministicTaskQueue, maxDisconnectionTime + 2 * reconnectIntervalMillis);
586+
runTasksUntil(deterministicTaskQueue, maxDisconnectionTime + maxReconnectionTime + 2 * reconnectIntervalMillis);
558587
assertConnectedExactlyToNodes(transportService, nodes);
559588
}
560589

@@ -569,6 +598,24 @@ private void runTasksUntil(DeterministicTaskQueue deterministicTaskQueue, long e
569598
deterministicTaskQueue.runAllRunnableTasks();
570599
}
571600

601+
private void runTasksUntilExpectedReconnectionAttempts(
602+
DeterministicTaskQueue deterministicTaskQueue,
603+
long endTimeMillis,
604+
TestTransportService transportService,
605+
int expectedReconnectionAttempts
606+
) {
607+
// break the loop if we timeout or if we have enough reconnection attempts
608+
while ((deterministicTaskQueue.getCurrentTimeMillis() < endTimeMillis)
609+
&& (transportService.getConnectToNodeCallCount() <= expectedReconnectionAttempts)) {
610+
if (deterministicTaskQueue.hasRunnableTasks() && randomBoolean()) {
611+
deterministicTaskQueue.runRandomTask();
612+
} else if (deterministicTaskQueue.hasDeferredTasks()) {
613+
deterministicTaskQueue.advanceTime();
614+
}
615+
}
616+
deterministicTaskQueue.runAllRunnableTasks();
617+
}
618+
572619
private void ensureConnections(NodeConnectionsService service) {
573620
final PlainActionFuture<Void> future = new PlainActionFuture<>();
574621
service.ensureConnections(() -> future.onResponse(null));
@@ -594,6 +641,16 @@ private void assertConnected(TransportService transportService, Iterable<Discove
594641
@Before
595642
public void setUp() throws Exception {
596643
super.setUp();
644+
// Add any other specific messages you want to capture
645+
List<String> messagesToCapture = Arrays.asList("failed to connect", "IllegalStateException");
646+
testLogsAppender = new TestLogsAppender(messagesToCapture);
647+
loggerContext = (LoggerContext) LogManager.getContext(false);
648+
Configuration config = loggerContext.getConfiguration();
649+
LoggerConfig loggerConfig = config.getLoggerConfig(NodeConnectionsService.class.getName());
650+
loggerConfig.addAppender(testLogsAppender, null, null);
651+
loggerConfig = config.getLoggerConfig(ClusterConnectionManager.class.getName());
652+
loggerConfig.addAppender(testLogsAppender, null, null);
653+
loggerContext.updateLoggers();
597654
ThreadPool threadPool = new TestThreadPool(getClass().getName());
598655
this.threadPool = threadPool;
599656
nodeConnectionBlocks = newConcurrentMap();
@@ -605,6 +662,14 @@ public void setUp() throws Exception {
605662
@Override
606663
@After
607664
public void tearDown() throws Exception {
665+
testLogsAppender.clearCapturedLogs();
666+
loggerContext = (LoggerContext) LogManager.getContext(false);
667+
Configuration config = loggerContext.getConfiguration();
668+
LoggerConfig loggerConfig = config.getLoggerConfig(NodeConnectionsService.class.getName());
669+
loggerConfig.removeAppender(testLogsAppender.getName());
670+
loggerConfig = config.getLoggerConfig(ClusterConnectionManager.class.getName());
671+
loggerConfig.removeAppender(testLogsAppender.getName());
672+
loggerContext.updateLoggers();
608673
transportService.stop();
609674
ThreadPool.terminate(threadPool, 30, TimeUnit.SECONDS);
610675
threadPool = null;

0 commit comments

Comments
 (0)