Skip to content

Commit c1c7218

Browse files
Refresh potential lost connections at query start for _search (#130463)
CPS S2D9: Explicitly refresh the connection(s) to the remote cluster(s) before executing a query. Previously, connections were refreshed only when skip_unavailable=false; they are now also refreshed when operating in a CPS context. To avoid waiting too long for a connection, the wait is bounded by the setting search.ccs.force_connect_timeout, which will eventually be injected for the CPS environment.
1 parent b9573b3 commit c1c7218

File tree

4 files changed

+304
-68
lines changed

4 files changed

+304
-68
lines changed

docs/changelog/130463.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 130463
2+
summary: Refresh potential lost connections at query start for `_search`
3+
area: Search
4+
type: enhancement
5+
issues: []
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the "Elastic License
4+
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
5+
* Public License v 1"; you may not use this file except in compliance with, at
6+
* your election, the "Elastic License 2.0", the "GNU Affero General Public
7+
* License v3.0 only", or the "Server Side Public License, v 1".
8+
*/
9+
10+
package org.elasticsearch.indices.cluster;
11+
12+
import org.elasticsearch.action.search.SearchRequest;
13+
import org.elasticsearch.action.search.TransportSearchAction;
14+
import org.elasticsearch.common.settings.Setting;
15+
import org.elasticsearch.common.settings.Settings;
16+
import org.elasticsearch.common.util.CollectionUtils;
17+
import org.elasticsearch.plugins.ClusterPlugin;
18+
import org.elasticsearch.plugins.Plugin;
19+
import org.elasticsearch.test.AbstractMultiClustersTestCase;
20+
import org.elasticsearch.test.transport.MockTransportService;
21+
import org.elasticsearch.transport.TransportService;
22+
import org.hamcrest.Matchers;
23+
24+
import java.util.Collection;
25+
import java.util.List;
26+
import java.util.Map;
27+
import java.util.concurrent.CountDownLatch;
28+
29+
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
30+
31+
public class RemoteSearchForceConnectTimeoutIT extends AbstractMultiClustersTestCase {
32+
private static final String REMOTE_CLUSTER_1 = "cluster-a";
33+
34+
public static class ForceConnectTimeoutPlugin extends Plugin implements ClusterPlugin {
35+
@Override
36+
public List<Setting<?>> getSettings() {
37+
return List.of(ForceConnectTimeoutSetting);
38+
}
39+
}
40+
41+
private static final Setting<String> ForceConnectTimeoutSetting = Setting.simpleString(
42+
"search.ccs.force_connect_timeout",
43+
Setting.Property.NodeScope
44+
);
45+
46+
@Override
47+
protected List<String> remoteClusterAlias() {
48+
return List.of(REMOTE_CLUSTER_1);
49+
}
50+
51+
@Override
52+
protected Collection<Class<? extends Plugin>> nodePlugins(String clusterAlias) {
53+
return CollectionUtils.appendToCopy(super.nodePlugins(clusterAlias), ForceConnectTimeoutPlugin.class);
54+
}
55+
56+
@Override
57+
protected Settings nodeSettings() {
58+
/*
59+
* This is the setting that controls how long TransportSearchAction will wait for establishing a connection
60+
* with a remote. At present, we set it to low 1s to prevent stalling the test for too long -- this is consistent
61+
* with what we've done in other tests.
62+
*/
63+
return Settings.builder().put(super.nodeSettings()).put("search.ccs.force_connect_timeout", "1s").build();
64+
}
65+
66+
@Override
67+
protected Map<String, Boolean> skipUnavailableForRemoteClusters() {
68+
return Map.of(REMOTE_CLUSTER_1, true);
69+
}
70+
71+
public void testTimeoutSetting() {
72+
var latch = new CountDownLatch(1);
73+
for (String nodeName : cluster(LOCAL_CLUSTER).getNodeNames()) {
74+
MockTransportService mts = (MockTransportService) cluster(LOCAL_CLUSTER).getInstance(TransportService.class, nodeName);
75+
76+
mts.addConnectBehavior(
77+
cluster(REMOTE_CLUSTER_1).getInstance(TransportService.class, randomFrom(cluster(REMOTE_CLUSTER_1).getNodeNames())),
78+
((transport, discoveryNode, profile, listener) -> {
79+
try {
80+
latch.await();
81+
} catch (InterruptedException e) {
82+
throw new AssertionError(e);
83+
}
84+
85+
transport.openConnection(discoveryNode, profile, listener);
86+
})
87+
);
88+
}
89+
90+
// Add some dummy data to prove we are communicating fine with the remote.
91+
assertAcked(client(REMOTE_CLUSTER_1).admin().indices().prepareCreate("test-index"));
92+
client(REMOTE_CLUSTER_1).prepareIndex("test-index").setSource("sample-field", "sample-value").get();
93+
client(REMOTE_CLUSTER_1).admin().indices().prepareRefresh("test-index").get();
94+
95+
/*
96+
* Do a full restart so that our custom connect behaviour takes effect since it does not apply to
97+
* pre-existing connections -- they're already established by the time this test runs.
98+
*/
99+
try {
100+
cluster(REMOTE_CLUSTER_1).fullRestart();
101+
} catch (Exception e) {
102+
throw new AssertionError(e);
103+
} finally {
104+
var searchRequest = new SearchRequest("*", "*:*");
105+
searchRequest.allowPartialSearchResults(false);
106+
var result = safeGet(client().execute(TransportSearchAction.TYPE, searchRequest));
107+
108+
// The remote cluster should've failed.
109+
var failures = result.getClusters().getCluster(REMOTE_CLUSTER_1).getFailures();
110+
assertThat(failures.size(), Matchers.equalTo(1));
111+
112+
/*
113+
* Reason should be a timed out exception. The timeout should be equal to what we've set and there should
114+
* be a reference to the subscribable listener -- which is what we use to listen for a valid connection.
115+
*/
116+
var failureReason = failures.getFirst().reason();
117+
assertThat(
118+
failureReason,
119+
Matchers.containsString("org.elasticsearch.ElasticsearchTimeoutException: timed out after [1s/1000ms]")
120+
);
121+
assertThat(failureReason, Matchers.containsString("SubscribableListener"));
122+
latch.countDown();
123+
result.decRef();
124+
}
125+
}
126+
}

0 commit comments

Comments
 (0)