Skip to content

Commit 643f969

Browse files
author
Harsh Garg
committed
Adding warn logging for timed out remote state read tasks
Signed-off-by: Harsh Garg <gkharsh@amazon.com>
1 parent 52b8b05 commit 643f969

10 files changed

+89
-21
lines changed

server/src/main/java/org/opensearch/cluster/routing/remote/InternalRemoteRoutingTableService.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,8 @@ protected void doStart() {
256256
clusterName,
257257
threadPool,
258258
ThreadPool.Names.REMOTE_STATE_READ,
259-
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN
259+
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN,
260+
clusterSettings
260261
);
261262
}
262263

server/src/main/java/org/opensearch/common/remote/RemoteWriteableEntityBlobStore.java

+44-3
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,13 @@
88

99
package org.opensearch.common.remote;
1010

11+
import org.apache.logging.log4j.LogManager;
12+
import org.apache.logging.log4j.Logger;
1113
import org.opensearch.common.blobstore.BlobPath;
1214
import org.opensearch.common.blobstore.stream.write.WritePriority;
15+
import org.opensearch.common.settings.ClusterSettings;
16+
import org.opensearch.common.settings.Setting;
17+
import org.opensearch.common.unit.TimeValue;
1318
import org.opensearch.core.action.ActionListener;
1419
import org.opensearch.index.translog.transfer.BlobStoreTransferService;
1520
import org.opensearch.repositories.blobstore.BlobStoreRepository;
@@ -29,25 +34,44 @@
2934
*/
3035
public class RemoteWriteableEntityBlobStore<T, U extends RemoteWriteableBlobEntity<T>> implements RemoteWritableEntityStore<T, U> {
3136

37+
private static final Logger logger = LogManager.getLogger(RemoteWriteableEntityBlobStore.class);
3238
private final BlobStoreTransferService transferService;
3339
private final BlobStoreRepository blobStoreRepository;
3440
private final String clusterName;
3541
private final ExecutorService executorService;
3642
private final String pathToken;
43+
/**
44+
* To be used for identifying and logging read tasks/entities which take considerably more time in getting completed.
45+
* Threshold corresponds to the total time spent in reading the blob along with deserializing the i/p stream.
46+
*/
47+
public static final Setting<TimeValue> REMOTE_STORE_SLOW_READ_LOGGING_THRESHOLD_SETTING = Setting.positiveTimeSetting(
48+
"cluster.remote_store.slow_read_logging_threshold",
49+
TimeValue.timeValueSeconds(10),
50+
Setting.Property.Dynamic,
51+
Setting.Property.NodeScope
52+
);
53+
private volatile TimeValue slowReadLoggingThreshold;
3754

3855
public RemoteWriteableEntityBlobStore(
3956
final BlobStoreTransferService blobStoreTransferService,
4057
final BlobStoreRepository blobStoreRepository,
4158
final String clusterName,
4259
final ThreadPool threadPool,
4360
final String executor,
44-
final String pathToken
61+
final String pathToken,
62+
final ClusterSettings clusterSettings
4563
) {
4664
this.transferService = blobStoreTransferService;
4765
this.blobStoreRepository = blobStoreRepository;
4866
this.clusterName = clusterName;
4967
this.executorService = threadPool.executor(executor);
5068
this.pathToken = pathToken;
69+
this.slowReadLoggingThreshold = clusterSettings.get(REMOTE_STORE_SLOW_READ_LOGGING_THRESHOLD_SETTING);
70+
clusterSettings.addSettingsUpdateConsumer(REMOTE_STORE_SLOW_READ_LOGGING_THRESHOLD_SETTING, this::setSlowReadLoggingThreshold);
71+
}
72+
73+
private void setSlowReadLoggingThreshold(TimeValue slowReadLoggingThreshold) {
74+
this.slowReadLoggingThreshold = slowReadLoggingThreshold;
5175
}
5276

5377
@Override
@@ -71,10 +95,13 @@ public void writeAsync(final U entity, final ActionListener<Void> listener) {
7195

7296
@Override
7397
public T read(final U entity) throws IOException {
74-
// TODO Add timing logs and tracing
7598
assert entity.getFullBlobName() != null;
99+
final long readStartTimeNS = System.nanoTime();
76100
try (InputStream inputStream = transferService.downloadBlob(getBlobPathForDownload(entity), entity.getBlobFileName())) {
77-
return entity.deserialize(inputStream);
101+
final long deserializeStartTimeNS = System.nanoTime();
102+
T deserializedBlobEntity = entity.deserialize(inputStream);
103+
warnAboutSlowReadIfNeeded(entity, deserializeStartTimeNS, readStartTimeNS);
104+
return deserializedBlobEntity;
78105
}
79106
}
80107

@@ -122,4 +149,18 @@ private static String encodeString(String content) {
122149
return Base64.getUrlEncoder().withoutPadding().encodeToString(content.getBytes(StandardCharsets.UTF_8));
123150
}
124151

152+
private void warnAboutSlowReadIfNeeded(final U entity, final long deserializeStartTimeNS, final long readStartTimeNS) {
153+
long totalReadTimeMS = Math.max(0, TimeValue.nsecToMSec(System.nanoTime() - readStartTimeNS));
154+
long serdeTimeMS = Math.max(0, TimeValue.nsecToMSec(System.nanoTime() - deserializeStartTimeNS));
155+
if (totalReadTimeMS > slowReadLoggingThreshold.getMillis()) {
156+
logger.warn(
157+
"entity [{}] for [{}] took [{}] for serde out of total read time [{}] which is above the total warn threshold of [{}]",
158+
entity.getClass().getSimpleName(),
159+
entity.getBlobFileName(),
160+
serdeTimeMS,
161+
totalReadTimeMS,
162+
slowReadLoggingThreshold
163+
);
164+
}
165+
}
125166
}

server/src/main/java/org/opensearch/common/settings/ClusterSettings.java

+2
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@
8989
import org.opensearch.common.logging.Loggers;
9090
import org.opensearch.common.network.NetworkModule;
9191
import org.opensearch.common.network.NetworkService;
92+
import org.opensearch.common.remote.RemoteWriteableEntityBlobStore;
9293
import org.opensearch.common.settings.Setting.Property;
9394
import org.opensearch.common.util.FeatureFlags;
9495
import org.opensearch.common.util.PageCacheRecycler;
@@ -759,6 +760,7 @@ public void apply(Settings value, Settings current, Settings previous) {
759760
RemoteRoutingTableBlobStore.REMOTE_ROUTING_TABLE_PATH_HASH_ALGO_SETTING,
760761
RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING,
761762
RemoteRoutingTableBlobStore.CLUSTER_REMOTE_STORE_ROUTING_TABLE_PATH_PREFIX,
763+
RemoteWriteableEntityBlobStore.REMOTE_STORE_SLOW_READ_LOGGING_THRESHOLD_SETTING,
762764

763765
// Admission Control Settings
764766
AdmissionControlSettings.ADMISSION_CONTROL_TRANSPORT_LAYER_MODE,

server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateAttributesManager.java

+9-4
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import org.opensearch.common.remote.AbstractClusterMetadataWriteableBlobEntity;
1515
import org.opensearch.common.remote.AbstractRemoteWritableEntityManager;
1616
import org.opensearch.common.remote.RemoteWriteableEntityBlobStore;
17+
import org.opensearch.common.settings.ClusterSettings;
1718
import org.opensearch.core.action.ActionListener;
1819
import org.opensearch.core.common.io.stream.NamedWriteableRegistry;
1920
import org.opensearch.gateway.remote.model.RemoteClusterBlocks;
@@ -43,7 +44,8 @@ public class RemoteClusterStateAttributesManager extends AbstractRemoteWritableE
4344
BlobStoreRepository blobStoreRepository,
4445
BlobStoreTransferService blobStoreTransferService,
4546
NamedWriteableRegistry namedWriteableRegistry,
46-
ThreadPool threadpool
47+
ThreadPool threadpool,
48+
ClusterSettings clusterSettings
4749
) {
4850
this.remoteWritableEntityStores.put(
4951
RemoteDiscoveryNodes.DISCOVERY_NODES,
@@ -53,7 +55,8 @@ public class RemoteClusterStateAttributesManager extends AbstractRemoteWritableE
5355
clusterName,
5456
threadpool,
5557
ThreadPool.Names.REMOTE_STATE_READ,
56-
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN
58+
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN,
59+
clusterSettings
5760
)
5861
);
5962
this.remoteWritableEntityStores.put(
@@ -64,7 +67,8 @@ public class RemoteClusterStateAttributesManager extends AbstractRemoteWritableE
6467
clusterName,
6568
threadpool,
6669
ThreadPool.Names.REMOTE_STATE_READ,
67-
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN
70+
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN,
71+
clusterSettings
6872
)
6973
);
7074
this.remoteWritableEntityStores.put(
@@ -75,7 +79,8 @@ public class RemoteClusterStateAttributesManager extends AbstractRemoteWritableE
7579
clusterName,
7680
threadpool,
7781
ThreadPool.Names.REMOTE_STATE_READ,
78-
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN
82+
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN,
83+
clusterSettings
7984
)
8085
);
8186
}

server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java

+10-2
Original file line numberDiff line numberDiff line change
@@ -1114,7 +1114,8 @@ public void start() {
11141114
blobStoreRepository,
11151115
blobStoreTransferService,
11161116
namedWriteableRegistry,
1117-
threadpool
1117+
threadpool,
1118+
clusterSettings
11181119
);
11191120

11201121
remoteRoutingTableService.start();
@@ -1394,7 +1395,14 @@ ClusterState readClusterStateInParallel(
13941395
try {
13951396
if (latch.await(this.remoteStateReadTimeout.getMillis(), TimeUnit.MILLISECONDS) == false) {
13961397
RemoteStateTransferException exception = new RemoteStateTransferException(
1397-
"Timed out waiting to read cluster state from remote within timeout " + this.remoteStateReadTimeout
1398+
String.format(
1399+
Locale.ROOT,
1400+
"Timed out waiting to read total [%s] tasks for cluster state from remote within timeout of [%s]. Could only read [%s] tasks while [%s] tasks failed to be read",
1401+
totalReadTasks,
1402+
this.remoteStateReadTimeout,
1403+
latch.getCount(),
1404+
exceptionList.size()
1405+
)
13981406
);
13991407
exceptionList.forEach(exception::addSuppressed);
14001408
throw exception;

server/src/main/java/org/opensearch/gateway/remote/RemoteGlobalMetadataManager.java

+14-7
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,8 @@ public class RemoteGlobalMetadataManager extends AbstractRemoteWritableEntityMan
9191
clusterName,
9292
threadpool,
9393
ThreadPool.Names.REMOTE_STATE_READ,
94-
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN
94+
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN,
95+
clusterSettings
9596
)
9697
);
9798
this.remoteWritableEntityStores.put(
@@ -102,7 +103,8 @@ public class RemoteGlobalMetadataManager extends AbstractRemoteWritableEntityMan
102103
clusterName,
103104
threadpool,
104105
ThreadPool.Names.REMOTE_STATE_READ,
105-
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN
106+
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN,
107+
clusterSettings
106108
)
107109
);
108110
this.remoteWritableEntityStores.put(
@@ -113,7 +115,8 @@ public class RemoteGlobalMetadataManager extends AbstractRemoteWritableEntityMan
113115
clusterName,
114116
threadpool,
115117
ThreadPool.Names.REMOTE_STATE_READ,
116-
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN
118+
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN,
119+
clusterSettings
117120
)
118121
);
119122
this.remoteWritableEntityStores.put(
@@ -124,7 +127,8 @@ public class RemoteGlobalMetadataManager extends AbstractRemoteWritableEntityMan
124127
clusterName,
125128
threadpool,
126129
ThreadPool.Names.REMOTE_STATE_READ,
127-
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN
130+
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN,
131+
clusterSettings
128132
)
129133
);
130134
this.remoteWritableEntityStores.put(
@@ -135,7 +139,8 @@ public class RemoteGlobalMetadataManager extends AbstractRemoteWritableEntityMan
135139
clusterName,
136140
threadpool,
137141
ThreadPool.Names.REMOTE_STATE_READ,
138-
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN
142+
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN,
143+
clusterSettings
139144
)
140145
);
141146
this.remoteWritableEntityStores.put(
@@ -146,7 +151,8 @@ public class RemoteGlobalMetadataManager extends AbstractRemoteWritableEntityMan
146151
clusterName,
147152
threadpool,
148153
ThreadPool.Names.REMOTE_STATE_READ,
149-
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN
154+
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN,
155+
clusterSettings
150156
)
151157
);
152158
this.remoteWritableEntityStores.put(
@@ -157,7 +163,8 @@ public class RemoteGlobalMetadataManager extends AbstractRemoteWritableEntityMan
157163
clusterName,
158164
threadpool,
159165
ThreadPool.Names.REMOTE_STATE_READ,
160-
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN
166+
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN,
167+
clusterSettings
161168
)
162169
);
163170
clusterSettings.addSettingsUpdateConsumer(GLOBAL_METADATA_UPLOAD_TIMEOUT_SETTING, this::setGlobalMetadataUploadTimeout);

server/src/main/java/org/opensearch/gateway/remote/RemoteIndexMetadataManager.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,8 @@ public RemoteIndexMetadataManager(
9292
clusterName,
9393
threadpool,
9494
ThreadPool.Names.REMOTE_STATE_READ,
95-
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN
95+
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN,
96+
clusterSettings
9697
)
9798
);
9899
this.namedXContentRegistry = blobStoreRepository.getNamedXContentRegistry();

server/src/main/java/org/opensearch/gateway/remote/RemoteManifestManager.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,8 @@ public class RemoteManifestManager {
8585
clusterName,
8686
threadpool,
8787
ThreadPool.Names.REMOTE_STATE_READ,
88-
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN
88+
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN,
89+
clusterSettings
8990
);
9091
;
9192
clusterSettings.addSettingsUpdateConsumer(METADATA_MANIFEST_UPLOAD_TIMEOUT_SETTING, this::setMetadataManifestUploadTimeout);

server/src/main/java/org/opensearch/gateway/remote/model/RemoteRoutingTableBlobStore.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,8 @@ public RemoteRoutingTableBlobStore(
8787
clusterName,
8888
threadPool,
8989
executor,
90-
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN
90+
RemoteClusterStateUtils.CLUSTER_STATE_PATH_TOKEN,
91+
clusterSettings
9192
);
9293
this.pathType = clusterSettings.get(REMOTE_ROUTING_TABLE_PATH_TYPE_SETTING);
9394
this.pathHashAlgo = clusterSettings.get(REMOTE_ROUTING_TABLE_PATH_HASH_ALGO_SETTING);

server/src/test/java/org/opensearch/gateway/remote/RemoteClusterStateAttributesManagerTests.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,8 @@ public void setup() throws Exception {
9797
blobStoreRepository,
9898
blobStoreTransferService,
9999
writableRegistry(),
100-
threadPool
100+
threadPool,
101+
clusterSettings
101102
);
102103
}
103104

0 commit comments

Comments
 (0)