11
11
import org .apache .logging .log4j .Logger ;
12
12
import org .elasticsearch .ElasticsearchStatusException ;
13
13
import org .elasticsearch .ResourceNotFoundException ;
14
- import org .elasticsearch .TransportVersion ;
15
- import org .elasticsearch .TransportVersions ;
16
14
import org .elasticsearch .action .ActionListener ;
17
15
import org .elasticsearch .action .ActionRequestValidationException ;
18
16
import org .elasticsearch .action .support .master .AcknowledgedResponse ;
@@ -80,9 +78,6 @@ public class TrainedModelAssignmentClusterService implements ClusterStateListene
80
78
81
79
private static final Logger logger = LogManager .getLogger (TrainedModelAssignmentClusterService .class );
82
80
83
- private static final TransportVersion RENAME_ALLOCATION_TO_ASSIGNMENT_TRANSPORT_VERSION = TransportVersions .V_8_3_0 ;
84
- public static final TransportVersion DISTRIBUTED_MODEL_ALLOCATION_TRANSPORT_VERSION = TransportVersions .V_8_4_0 ;
85
-
86
81
private final ClusterService clusterService ;
87
82
private final ThreadPool threadPool ;
88
83
private final NodeLoadDetector nodeLoadDetector ;
@@ -170,14 +165,6 @@ public void clusterChanged(ClusterChangedEvent event) {
170
165
return ;
171
166
}
172
167
173
- if (eventStateMinTransportVersionIsBeforeDistributedModelAllocationTransportVersion (event )) {
174
- // we should not try to rebalance assignments while there may be nodes running on a version
175
- // prior to introducing distributed model allocation.
176
- // But we should remove routing to removed or shutting down nodes.
177
- removeRoutingToRemovedOrShuttingDownNodes (event );
178
- return ;
179
- }
180
-
181
168
if (event .nodesAdded ()) {
182
169
logMlNodeHeterogeneity ();
183
170
}
@@ -204,10 +191,6 @@ public void clusterChanged(ClusterChangedEvent event) {
204
191
}
205
192
}
206
193
207
- boolean eventStateMinTransportVersionIsBeforeDistributedModelAllocationTransportVersion (ClusterChangedEvent event ) {
208
- return event .state ().getMinTransportVersion ().before (DISTRIBUTED_MODEL_ALLOCATION_TRANSPORT_VERSION );
209
- }
210
-
211
194
boolean eventStateHasGlobalBlockStateNotRecoveredBlock (ClusterChangedEvent event ) {
212
195
return event .state ().blocks ().hasGlobalBlock (GatewayService .STATE_NOT_RECOVERED_BLOCK );
213
196
}
@@ -401,18 +384,6 @@ public void createNewModelAssignment(
401
384
CreateTrainedModelAssignmentAction .Request request ,
402
385
ActionListener <TrainedModelAssignment > listener
403
386
) {
404
- if (clusterService .state ().getMinTransportVersion ().before (DISTRIBUTED_MODEL_ALLOCATION_TRANSPORT_VERSION )) {
405
- listener .onFailure (
406
- new ElasticsearchStatusException (
407
- "cannot create new assignment [{}] for model [{}] while cluster upgrade is in progress" ,
408
- RestStatus .CONFLICT ,
409
- request .getTaskParams ().getDeploymentId (),
410
- request .getTaskParams ().getModelId ()
411
- )
412
- );
413
- return ;
414
- }
415
-
416
387
if (MlMetadata .getMlMetadata (clusterService .state ()).isResetMode ()) {
417
388
listener .onFailure (
418
389
new ElasticsearchStatusException (
@@ -524,12 +495,8 @@ private static ClusterState update(ClusterState currentState, TrainedModelAssign
524
495
private static ClusterState forceUpdate (ClusterState currentState , TrainedModelAssignmentMetadata .Builder modelAssignments ) {
525
496
logger .debug (() -> format ("updated assignments: %s" , modelAssignments .build ()));
526
497
Metadata .Builder metadata = Metadata .builder (currentState .metadata ());
527
- if (currentState .getMinTransportVersion ().onOrAfter (RENAME_ALLOCATION_TO_ASSIGNMENT_TRANSPORT_VERSION )) {
528
- metadata .putCustom (TrainedModelAssignmentMetadata .NAME , modelAssignments .build ())
529
- .removeProjectCustom (TrainedModelAssignmentMetadata .DEPRECATED_NAME );
530
- } else {
531
- metadata .putCustom (TrainedModelAssignmentMetadata .DEPRECATED_NAME , modelAssignments .buildOld ());
532
- }
498
+ metadata .putCustom (TrainedModelAssignmentMetadata .NAME , modelAssignments .build ())
499
+ .removeProjectCustom (TrainedModelAssignmentMetadata .DEPRECATED_NAME );
533
500
return ClusterState .builder (currentState ).metadata (metadata ).build ();
534
501
}
535
502
@@ -847,7 +814,7 @@ private void updateDeployment(
847
814
}
848
815
boolean hasUpdates = hasUpdates (numberOfAllocations , adaptiveAllocationsSettingsUpdates , existingAssignment );
849
816
if (hasUpdates == false ) {
850
- logger .info ("no updates" );
817
+ logger .debug ("no updates to be made for deployment [{}]" , deploymentId );
851
818
listener .onResponse (existingAssignment );
852
819
return ;
853
820
}
@@ -861,27 +828,17 @@ private void updateDeployment(
861
828
);
862
829
return ;
863
830
}
864
- if (clusterState .getMinTransportVersion ().before (DISTRIBUTED_MODEL_ALLOCATION_TRANSPORT_VERSION )) {
865
- listener .onFailure (
866
- new ElasticsearchStatusException (
867
- "cannot update deployment with model id [{}] while cluster upgrade is in progress." ,
868
- RestStatus .CONFLICT ,
869
- deploymentId
870
- )
871
- );
872
- return ;
873
- }
874
831
875
- ActionListener <ClusterState > updatedStateListener = ActionListener .wrap (
876
- updatedState -> submitUnbatchedTask ("update model deployment" , new ClusterStateUpdateTask () {
832
+ ActionListener <TrainedModelAssignmentMetadata . Builder > updatedAssignmentListener = ActionListener .wrap (
833
+ updatedAssignment -> submitUnbatchedTask ("update model deployment" , new ClusterStateUpdateTask () {
877
834
878
835
private volatile boolean isUpdated ;
879
836
880
837
@ Override
881
838
public ClusterState execute (ClusterState currentState ) {
882
839
if (areClusterStatesCompatibleForRebalance (clusterState , currentState )) {
883
840
isUpdated = true ;
884
- return updatedState ;
841
+ return update ( currentState , updatedAssignment ) ;
885
842
}
886
843
logger .debug (() -> format ("[%s] Retrying update as cluster state has been modified" , deploymentId ));
887
844
updateDeployment (currentState , deploymentId , numberOfAllocations , adaptiveAllocationsSettings , isInternal , listener );
@@ -913,7 +870,7 @@ public void clusterStateProcessed(ClusterState oldState, ClusterState newState)
913
870
listener ::onFailure
914
871
);
915
872
916
- updateAssignment (clusterState , existingAssignment , numberOfAllocations , adaptiveAllocationsSettings , updatedStateListener );
873
+ updateAssignment (clusterState , existingAssignment , numberOfAllocations , adaptiveAllocationsSettings , updatedAssignmentListener );
917
874
}
918
875
919
876
static boolean hasUpdates (
@@ -947,7 +904,7 @@ private void updateAssignment(
947
904
TrainedModelAssignment assignment ,
948
905
Integer numberOfAllocations ,
949
906
AdaptiveAllocationsSettings adaptiveAllocationsSettings ,
950
- ActionListener <ClusterState > listener
907
+ ActionListener <TrainedModelAssignmentMetadata . Builder > listener
951
908
) {
952
909
threadPool .executor (MachineLearning .UTILITY_THREAD_POOL_NAME ).execute (() -> {
953
910
if (numberOfAllocations == null || numberOfAllocations == assignment .getTaskParams ().getNumberOfAllocations ()) {
@@ -964,21 +921,21 @@ private void updateAndKeepNumberOfAllocations(
964
921
ClusterState clusterState ,
965
922
TrainedModelAssignment assignment ,
966
923
AdaptiveAllocationsSettings adaptiveAllocationsSettings ,
967
- ActionListener <ClusterState > listener
924
+ ActionListener <TrainedModelAssignmentMetadata . Builder > listener
968
925
) {
969
926
TrainedModelAssignment .Builder updatedAssignment = TrainedModelAssignment .Builder .fromAssignment (assignment )
970
927
.setAdaptiveAllocationsSettings (adaptiveAllocationsSettings );
971
928
TrainedModelAssignmentMetadata .Builder builder = TrainedModelAssignmentMetadata .builder (clusterState );
972
929
builder .updateAssignment (assignment .getDeploymentId (), updatedAssignment );
973
- listener .onResponse (update ( clusterState , builder ) );
930
+ listener .onResponse (builder );
974
931
}
975
932
976
933
private void increaseNumberOfAllocations (
977
934
ClusterState clusterState ,
978
935
TrainedModelAssignment assignment ,
979
936
int numberOfAllocations ,
980
937
AdaptiveAllocationsSettings adaptiveAllocationsSettings ,
981
- ActionListener <ClusterState > listener
938
+ ActionListener <TrainedModelAssignmentMetadata . Builder > listener
982
939
) {
983
940
try {
984
941
TrainedModelAssignment .Builder updatedAssignment = TrainedModelAssignment .Builder .fromAssignment (assignment )
@@ -998,7 +955,7 @@ private void increaseNumberOfAllocations(
998
955
)
999
956
);
1000
957
} else {
1001
- listener .onResponse (update ( clusterState , rebalancedMetadata ) );
958
+ listener .onResponse (rebalancedMetadata );
1002
959
}
1003
960
} catch (Exception e ) {
1004
961
listener .onFailure (e );
@@ -1010,7 +967,7 @@ private void decreaseNumberOfAllocations(
1010
967
TrainedModelAssignment assignment ,
1011
968
int numberOfAllocations ,
1012
969
AdaptiveAllocationsSettings adaptiveAllocationsSettings ,
1013
- ActionListener <ClusterState > listener
970
+ ActionListener <TrainedModelAssignmentMetadata . Builder > listener
1014
971
) {
1015
972
TrainedModelAssignment .Builder updatedAssignment = numberOfAllocations < assignment .totalTargetAllocations ()
1016
973
? new AllocationReducer (assignment , nodeAvailabilityZoneMapper .buildMlNodesByAvailabilityZone (clusterState )).reduceTo (
@@ -1025,7 +982,7 @@ private void decreaseNumberOfAllocations(
1025
982
}
1026
983
TrainedModelAssignmentMetadata .Builder builder = TrainedModelAssignmentMetadata .builder (clusterState );
1027
984
builder .updateAssignment (assignment .getDeploymentId (), updatedAssignment );
1028
- listener .onResponse (update ( clusterState , builder ) );
985
+ listener .onResponse (builder );
1029
986
}
1030
987
1031
988
static ClusterState setToStopping (ClusterState clusterState , String deploymentId , String reason ) {
0 commit comments