@@ -29,6 +29,7 @@ import com.google.api.services.compute.model.InstanceGroupManagerAutoHealingPoli
29
29
import com.google.api.services.compute.model.InstanceProperties
30
30
import com.google.api.services.compute.model.InstanceTemplate
31
31
import com.google.api.services.compute.model.NamedPort
32
+ import com.google.api.services.compute.model.Operation
32
33
import com.netflix.frigga.Names
33
34
import com.netflix.spectator.api.Registry
34
35
import com.netflix.spinnaker.cats.cache.Cache
@@ -64,6 +65,7 @@ import com.netflix.spinnaker.moniker.Namer
64
65
import groovy.util.logging.Slf4j
65
66
import org.springframework.beans.factory.annotation.Autowired
66
67
import org.springframework.stereotype.Component
68
+ import javax.annotation.PostConstruct
67
69
68
70
import static com.google.common.base.Preconditions.checkArgument
69
71
import static com.netflix.spinnaker.clouddriver.google.deploy.GCEUtil.BACKEND_SERVICE_NAMES
@@ -587,14 +589,33 @@ class BasicGoogleDeployHandler implements DeployHandler<BasicGoogleDeployDescrip
587
589
if (willCreateAutoscaler) {
588
590
task. updateStatus BASE_PHASE , " Creating regional autoscaler for $serverGroupName ... "
589
591
592
+ // Build autoscaler configuration from the deployment description
593
+ // The autoscaler will manage the instance group created above (migCreateOperation.targetLink)
590
594
Autoscaler autoscaler = GCEUtil . buildAutoscaler(serverGroupName,
591
595
migCreateOperation. targetLink,
592
596
description. autoscalingPolicy)
593
597
594
- timeExecute(
598
+ // Google Cloud autoscaler insert operations are asynchronous and return an Operation object.
599
+ //
600
+ // Per Google Cloud documentation:
601
+ // "When you perform any requests that modify data, a zoneOperations or regionOperations resource
602
+ // is returned, and you can query the operation to check the status of your change."
603
+ //
604
+ // Without waiting for autoscaler creation to complete, subsequent deployment steps (health checks, traffic routing)
605
+ // may execute before the autoscaler is active, leading to inconsistent behavior and potential deployment failures.
606
+ //
607
+ // This fix aligns Spinnaker GCP behavior with Spinnaker AWS behavior, where autoscaling group operations are synchronous.
608
+ Operation autoscalerOperation = timeExecute(
595
609
compute. regionAutoscalers(). insert(project, region, autoscaler),
596
610
" compute.regionAutoscalers.insert" ,
597
611
TAG_SCOPE , SCOPE_REGIONAL , TAG_REGION , region)
612
+
613
+ // When googleDeployDefaults.enableAsyncOperationWait is set, wait for regional autoscaler creation to complete before proceeding with deployment
614
+ // Uses GoogleOperationPoller which implements proper retry logic and handles operation status polling
615
+ if (googleDeployDefaults. enableAsyncOperationWait) {
616
+ googleOperationPoller. waitForRegionalOperation(compute, project, region, autoscalerOperation. getName(),
617
+ null , task, " regional autoscaler $serverGroupName " , BASE_PHASE )
618
+ }
598
619
}
599
620
}
600
621
} else {
@@ -611,46 +632,104 @@ class BasicGoogleDeployHandler implements DeployHandler<BasicGoogleDeployDescrip
611
632
if (willCreateAutoscaler) {
612
633
task. updateStatus BASE_PHASE , " Creating zonal autoscaler for $serverGroupName ... "
613
634
635
+ // Build autoscaler configuration from the deployment description
636
+ // The autoscaler will manage the instance group created above (migCreateOperation.targetLink)
614
637
Autoscaler autoscaler = GCEUtil . buildAutoscaler(serverGroupName,
615
638
migCreateOperation. targetLink,
616
639
description. autoscalingPolicy)
617
640
618
- timeExecute(compute. autoscalers(). insert(project, zone, autoscaler),
641
+ // Google Cloud autoscaler insert operations are asynchronous and return an Operation object.
642
+ //
643
+ // Per Google Cloud documentation:
644
+ // "When you perform any requests that modify data, a zoneOperations or regionOperations resource
645
+ // is returned, and you can query the operation to check the status of your change."
646
+ //
647
+ // Without waiting for autoscaler creation to complete, subsequent deployment steps (health checks, traffic routing)
648
+ // may execute before the autoscaler is active, leading to inconsistent behavior and potential deployment failures.
649
+ //
650
+ // This fix aligns Spinnaker GCP behavior with Spinnaker AWS behavior, where autoscaling group operations are synchronous.
651
+ Operation autoscalerOperation = timeExecute(compute. autoscalers(). insert(project, zone, autoscaler),
619
652
" compute.autoscalers.insert" ,
620
653
TAG_SCOPE , SCOPE_ZONAL , TAG_ZONE , zone)
654
+
655
+ // When googleDeployDefaults.enableAsyncOperationWait is set, wait for zonal autoscaler creation to complete before proceeding with deployment
656
+ // Uses GoogleOperationPoller which implements proper retry logic and handles operation status polling
657
+ if (googleDeployDefaults. enableAsyncOperationWait) {
658
+ googleOperationPoller. waitForZonalOperation(compute, project, zone, autoscalerOperation. getName(),
659
+ null , task, " autoscaler $serverGroupName " , BASE_PHASE )
660
+ }
621
661
}
622
662
}
623
663
}
624
664
625
665
task. updateStatus BASE_PHASE , " Done creating server group $serverGroupName in $location . "
626
666
627
- // Actually update the backend services.
667
+ // Update backend services and wait for operation completion
628
668
if (willUpdateBackendServices) {
629
669
backendServicesToUpdate. each { BackendService backendService ->
630
- safeRetry. doRetry(
670
+ // Execute backend service update with retry logic for transient errors
671
+ def operation = safeRetry. doRetry(
631
672
updateBackendServices(compute, project, backendService. name, backendService),
632
673
" Load balancer backend service" ,
633
674
task,
634
- [400 , 412 ],
675
+ [400 , 412 ], // Retry on Bad Request (400) and Precondition Failed (412)
635
676
[],
636
677
[action : " update" , phase : BASE_PHASE , operation : " updateBackendServices" , (TAG_SCOPE ): SCOPE_GLOBAL ],
637
678
registry
638
679
)
680
+
681
+ if (operation) {
682
+ // Wait for the backend service update operation to complete
683
+ //
684
+ // Per Google Cloud documentation:
685
+ // "Backend service operations are asynchronous and return an Operation resource.
686
+ // You can use an operation resource to manage asynchronous API requests.
687
+ // For global operations, use the globalOperations resource."
688
+ //
689
+ // Without waiting for backend service updates to complete,
690
+ // subsequent health checks and traffic routing may execute before the backend service
691
+ // knows about the new instance group, causing deployment failures.
692
+ //
693
+ // This ensures that health checks and WaitForUpInstancesTask execute only after
694
+ // backend services have been fully updated with the new instance group.
695
+ task. updateStatus BASE_PHASE , " Waiting for backend service ${ backendService.name} update to complete..."
696
+ googleOperationPoller. waitForGlobalOperation(compute, project, operation. getName(),
697
+ null , task, " backend service ${ backendService.name} " , BASE_PHASE )
698
+ }
699
+
639
700
task. updateStatus BASE_PHASE , " Done associating server group $serverGroupName with backend service ${ backendService.name} ."
640
701
}
641
702
}
642
703
704
+ // Update regional backend services and wait for operation completion
643
705
if (willUpdateRegionalBackendServices) {
644
706
regionBackendServicesToUpdate. each { BackendService backendService ->
645
- safeRetry. doRetry(
707
+ // Execute regional backend service update with retry logic for transient errors
708
+ def operation = safeRetry. doRetry(
646
709
updateRegionBackendServices(compute, project, region, backendService. name, backendService),
647
710
" Internal load balancer backend service" ,
648
711
task,
649
- [400 , 412 ],
712
+ [400 , 412 ], // Retry on Bad Request (400) and Precondition Failed (412)
650
713
[],
651
714
[action : " update" , phase : BASE_PHASE , operation : " updateRegionBackendServices" , (TAG_SCOPE ): SCOPE_REGIONAL , (TAG_REGION ): region],
652
715
registry
653
716
)
717
+
718
+ if (operation) {
719
+ // Wait for the regional backend service update operation to complete
720
+ //
721
+ // Per Google Cloud documentation:
722
+ // "Regional backend service operations are asynchronous and return an Operation resource.
723
+ // You can use an operation resource to manage asynchronous API requests.
724
+ // For regional operations, use the regionOperations resource."
725
+ //
726
+ // Similar to global backend services, regional backend services require explicit waiting to ensure the new instance group
727
+ // is properly registered before health checks and traffic routing decisions are made.
728
+ task. updateStatus BASE_PHASE , " Waiting for regional backend service ${ backendService.name} update to complete..."
729
+ googleOperationPoller. waitForRegionalOperation(compute, project, region, operation. getName(),
730
+ null , task, " regional backend service ${ backendService.name} " , BASE_PHASE )
731
+ }
732
+
654
733
task. updateStatus BASE_PHASE , " Done associating server group $serverGroupName with backend service ${ backendService.name} ."
655
734
}
656
735
}
@@ -667,6 +746,24 @@ class BasicGoogleDeployHandler implements DeployHandler<BasicGoogleDeployDescrip
667
746
}
668
747
}
669
748
749
+ /**
750
+ * Creates a closure to update regional backend services with new instance groups.
751
+ *
752
+ * Per Google Cloud documentation:
753
+ * "Backend service operations are asynchronous and return an Operation resource.
754
+ * You can use an operation resource to manage asynchronous API requests.
755
+ * Operations can be global, regional or zonal. For regional operations, use the regionOperations resource."
756
+ *
757
+ * The returned Operation object must be polled until completion to ensure the backend service
758
+ * update has been fully applied before proceeding with subsequent deployment steps.
759
+ *
760
+ * @param compute GCP Compute API client
761
+ * @param project GCP project ID
762
+ * @param region GCP region for regional backend service
763
+ * @param backendServiceName Name of the backend service to update
764
+ * @param backendService Backend service configuration with new backends to add
765
+ * @return Closure that returns an Operation object for the update request
766
+ */
670
767
private Closure updateRegionBackendServices (Compute compute , String project , String region , String backendServiceName , BackendService backendService ) {
671
768
return {
672
769
BackendService serviceToUpdate = timeExecute(
@@ -678,14 +775,32 @@ class BasicGoogleDeployHandler implements DeployHandler<BasicGoogleDeployDescrip
678
775
}
679
776
backendService?. backends?. each { serviceToUpdate. backends << it }
680
777
serviceToUpdate. getBackends(). unique { backend -> backend. group }
681
- timeExecute(
778
+ return timeExecute(
682
779
compute. regionBackendServices(). update(project, region, backendServiceName, serviceToUpdate),
683
780
" compute.regionBackendServices.update" ,
684
781
TAG_SCOPE , SCOPE_REGIONAL , TAG_REGION , region)
685
- null
686
782
}
687
783
}
688
784
785
+ /**
786
+ * Creates a closure to update global backend services with new instance groups.
787
+ *
788
+ * Per Google Cloud documentation:
789
+ * "Backend service operations are asynchronous and return an Operation resource.
790
+ * You can use an operation resource to manage asynchronous API requests.
791
+ * Operations can be global, regional or zonal. For global operations, use the globalOperations resource."
792
+ *
793
+ * The returned Operation object must be polled until completion to ensure the backend service
794
+ * update has been fully applied before proceeding with subsequent deployment steps. This prevents
795
+ * race conditions where health checks or traffic routing occurs before the backend service knows
796
+ * about the new instance group.
797
+ *
798
+ * @param compute GCP Compute API client
799
+ * @param project GCP project ID
800
+ * @param backendServiceName Name of the backend service to update
801
+ * @param backendService Backend service configuration with new backends to add
802
+ * @return Closure that returns an Operation object for the update request
803
+ */
689
804
private Closure updateBackendServices (Compute compute , String project , String backendServiceName , BackendService backendService ) {
690
805
return {
691
806
BackendService serviceToUpdate = timeExecute(
@@ -697,11 +812,10 @@ class BasicGoogleDeployHandler implements DeployHandler<BasicGoogleDeployDescrip
697
812
}
698
813
backendService?. backends?. each { serviceToUpdate. backends << it }
699
814
serviceToUpdate. getBackends(). unique { backend -> backend. group }
700
- timeExecute(
815
+ return timeExecute(
701
816
compute. backendServices(). update(project, backendServiceName, serviceToUpdate),
702
817
" compute.backendServices.update" ,
703
818
TAG_SCOPE , SCOPE_GLOBAL )
704
- null
705
819
}
706
820
}
707
821
@@ -731,6 +845,14 @@ class BasicGoogleDeployHandler implements DeployHandler<BasicGoogleDeployDescrip
731
845
return userData
732
846
}
733
847
848
/**
 * Emits a one-time warning when the enableAsyncOperationWait flag is turned on.
 *
 * Annotated with @PostConstruct, so the container invokes it once after this bean's
 * dependencies have been injected. The safe-navigation operator makes the check a no-op
 * when googleDeployDefaults is null. The message asks operators to report long waits or
 * other problems they attribute to the flag.
 */
+ @PostConstruct
849
+ void logEnableAsyncOperationWaitWarning () {
850
+ if (googleDeployDefaults?. enableAsyncOperationWait) {
851
+ log. warn(" [enableAsyncOperationWait]: If you see unjustified long waits or other issues caused by this flag, " +
852
+ " please drop a note in Spinnaker Slack or open a GitHub Issue with the related details." )
853
+ }
854
+ }
855
+
734
856
/**
 * Minimal GoogleLabeledResource implementation whose only state is a labels map.
 */
static class GoogleInstanceTemplate implements GoogleLabeledResource {
735
857
// Label key/value pairs carried by this resource.
Map<String , String > labels
736
858
}
0 commit comments