33
33
package org .opensearch .cluster .coordination ;
34
34
35
35
import org .opensearch .action .admin .cluster .health .ClusterHealthResponse ;
36
- import org .opensearch .cluster .ClusterChangedEvent ;
37
- import org .opensearch .cluster .ClusterStateApplier ;
38
36
import org .opensearch .cluster .NodeConnectionsService ;
39
37
import org .opensearch .cluster .metadata .IndexMetadata ;
40
- import org .opensearch .cluster .node .DiscoveryNode ;
41
38
import org .opensearch .cluster .service .ClusterService ;
42
39
import org .opensearch .common .settings .Settings ;
43
40
import org .opensearch .index .MockEngineFactoryPlugin ;
51
48
import org .opensearch .test .store .MockFSIndexStore ;
52
49
import org .opensearch .test .transport .MockTransportService ;
53
50
import org .opensearch .test .transport .StubbableTransport ;
54
- import org .opensearch .transport .Transport ;
55
51
import org .opensearch .transport .TransportChannel ;
56
- import org .opensearch .transport .TransportConnectionListener ;
57
52
import org .opensearch .transport .TransportRequest ;
58
53
import org .opensearch .transport .TransportRequestHandler ;
59
54
import org .opensearch .transport .TransportService ;
@@ -104,22 +99,14 @@ public void testTransientErrorsDuringRecovery1AreRetried() throws Exception {
104
99
.put (FollowersChecker .FOLLOWER_CHECK_INTERVAL_SETTING .getKey (), "100ms" )
105
100
.put (FollowersChecker .FOLLOWER_CHECK_RETRY_COUNT_SETTING .getKey (), 1 )
106
101
.build ();
107
- // start a cluster-manager node
102
+ // start a 3-node cluster with 1 cluster-manager
108
103
final String cm = internalCluster ().startNode (nodeSettings );
109
-
110
- logger .info ("--> spawning node t1" );
111
- final String blueNodeName = internalCluster ().startNode (
112
- Settings .builder ().put ("node.attr.color" , "blue" ).put (nodeSettings ).build ()
113
- );
114
- logger .info ("--> spawning node t2" );
104
+ internalCluster ().startNode (Settings .builder ().put ("node.attr.color" , "blue" ).put (nodeSettings ).build ());
115
105
final String redNodeName = internalCluster ().startNode (Settings .builder ().put ("node.attr.color" , "red" ).put (nodeSettings ).build ());
116
106
117
- logger .info ("--> initial health check" );
118
107
ClusterHealthResponse response = client ().admin ().cluster ().prepareHealth ().setWaitForNodes (">=3" ).get ();
119
108
assertThat (response .isTimedOut (), is (false ));
120
- logger .info ("--> done initial health check" );
121
109
122
- logger .info ("--> creating index" );
123
110
client ().admin ()
124
111
.indices ()
125
112
.prepareCreate (indexName )
@@ -130,63 +117,22 @@ public void testTransientErrorsDuringRecovery1AreRetried() throws Exception {
130
117
.put (IndexMetadata .SETTING_NUMBER_OF_REPLICAS , 0 )
131
118
)
132
119
.get ();
133
- logger .info ("--> done creating index" );
134
- MockTransportService cmTransportService = (MockTransportService ) internalCluster ().getInstance (TransportService .class , cm );
135
- MockTransportService redTransportService = (MockTransportService ) internalCluster ().getInstance (
136
- TransportService .class ,
137
- redNodeName
138
- );
139
120
140
121
ClusterService cmClsService = internalCluster ().getInstance (ClusterService .class , cm );
141
- // simulate a slow applier on the cm
142
- cmClsService .addStateApplier (new ClusterStateApplier () {
143
- @ Override
144
- public void applyClusterState (ClusterChangedEvent event ) {
145
- if (event .nodesRemoved ()) {
146
- try {
147
- Thread .sleep (3000 );
148
- } catch (InterruptedException e ) {
149
- throw new RuntimeException (e );
150
- }
122
+ // Simulate a slow applier on the cm to delay node-left state application
123
+ cmClsService .addStateApplier (event -> {
124
+ if (event .nodesRemoved ()) {
125
+ try {
126
+ Thread .sleep (3000 );
127
+ } catch (InterruptedException e ) {
128
+ throw new RuntimeException (e );
151
129
}
152
130
}
153
131
});
154
- cmTransportService .connectionManager ().addListener (new TransportConnectionListener () {
155
-
156
- @ Override
157
- public void onConnectionOpened (Transport .Connection connection ) {
158
- // try {
159
- // Thread.sleep(500);
160
- // } catch (InterruptedException e) {
161
- // throw new RuntimeException(e);
162
- // }
163
-
164
- }
165
-
166
- @ Override
167
- public void onNodeConnected (DiscoveryNode node , Transport .Connection connection ) {
168
- // if (node.getName().equals("node_t2")) {
169
- // try {
170
- // Thread.sleep(250);
171
- // } catch (InterruptedException e) {
172
- // throw new RuntimeException(e);
173
- // }
174
- // }
175
- }
176
-
177
- // @Override
178
- // public void onNodeDisconnected(DiscoveryNode node, Transport.Connection connection) {
179
- // try {
180
- // Thread.sleep(5000);
181
- // } catch (InterruptedException e) {
182
- // throw new RuntimeException(e);
183
- // }
184
- // }
185
- });
186
132
AtomicBoolean bb = new AtomicBoolean ();
187
- // simulate followerchecker failure
188
133
189
- ConnectionDelay handlingBehavior = new ConnectionDelay (FOLLOWER_CHECK_ACTION_NAME , () -> {
134
+ // Simulate a follower-check failure on one node while bb is false
135
+ ConnectionDelay handlingBehavior = new ConnectionDelay (() -> {
190
136
if (bb .get ()) {
191
137
return ;
192
138
}
@@ -197,55 +143,39 @@ public void onNodeConnected(DiscoveryNode node, Transport.Connection connection)
197
143
}
198
144
throw new NodeHealthCheckFailureException ("non writable exception" );
199
145
});
146
+ MockTransportService redTransportService = (MockTransportService ) internalCluster ().getInstance (
147
+ TransportService .class ,
148
+ redNodeName
149
+ );
200
150
redTransportService .addRequestHandlingBehavior (FOLLOWER_CHECK_ACTION_NAME , handlingBehavior );
201
151
152
+ // Repeat 10 times to make it likely that the race condition is reproduced
202
153
for (int i = 0 ; i < 10 ; i ++) {
203
- bb .set (false ); // fail followerchecker by force to trigger node disconnect
204
- logger .info ("--> disconnecting from red node, iteration: " + i );
205
- // cmTransportService.disconnectFromNode(redTransportService.getLocalDiscoNode());
154
+ bb .set (false );
155
+ // fail followerchecker by force to trigger node disconnect
206
156
// now followerchecker should fail and trigger node left
207
- logger .info ("--> checking cluster health 2 nodes, iteration: " + i );
208
157
ClusterHealthResponse response1 = client ().admin ().cluster ().prepareHealth ().setWaitForNodes ("2" ).get ();
209
158
assertThat (response1 .isTimedOut (), is (false ));
210
- logger .info ("--> completed checking cluster health 2 nodes, iteration: " + i );
211
159
212
160
// once we know a node has left, we can re-enable followerchecker to work normally
213
161
bb .set (true );
214
- Thread .sleep (1500 );
215
- logger .info ("--> checking cluster health 3 nodes, iteration: " + i );
216
162
ClusterHealthResponse response2 = client ().admin ().cluster ().prepareHealth ().setWaitForNodes ("3" ).get ();
217
163
assertThat (response2 .isTimedOut (), is (false ));
218
- logger .info ("--> completed checking cluster health 3 nodes, iteration: " + i );
219
-
220
- Thread .sleep (1500 );
221
164
222
- // Checking again
223
- logger .info ("--> checking cluster health 3 nodes again, iteration: " + i );
165
+ // Checking again to validate stability
224
166
ClusterHealthResponse response3 = client ().admin ().cluster ().prepareHealth ().setWaitForNodes ("3" ).get ();
225
167
assertThat (response3 .isTimedOut (), is (false ));
226
- logger .info ("--> completed checking cluster health 3 nodes again, iteration: " + i );
227
168
}
228
169
229
170
bb .set (true );
230
- logger .info ("-->first validation outside loop" );
231
- response = client ().admin ().cluster ().prepareHealth ().setWaitForNodes ("3" ).get ();
232
- assertThat (response .isTimedOut (), is (false ));
233
-
234
- logger .info ("-->sleeping for 20s" );
235
- Thread .sleep (20000 );
236
-
237
- logger .info ("-->second validation outside loop after sleep" );
238
171
response = client ().admin ().cluster ().prepareHealth ().setWaitForNodes ("3" ).get ();
239
172
assertThat (response .isTimedOut (), is (false ));
240
173
}
241
174
242
175
private class ConnectionDelay implements StubbableTransport .RequestHandlingBehavior <TransportRequest > {
243
-
244
- private final String actionName ;
245
176
private final Runnable connectionBreaker ;
246
177
247
- private ConnectionDelay (String actionName , Runnable connectionBreaker ) {
248
- this .actionName = actionName ;
178
+ private ConnectionDelay (Runnable connectionBreaker ) {
249
179
this .connectionBreaker = connectionBreaker ;
250
180
}
251
181
0 commit comments