Skip to content

Commit 21102ec

Browse files
authored
fix(google): Add retry and status polling logic (#6379)
* chore(google): Add retry and status polling logic for autoscalers and backend services * chore(google): Added enableAsyncOperationWait=true flag for legacy behaviour * chore(google): Log enableAsyncOperationWait flag warning just once
1 parent 1730ad7 commit 21102ec

File tree

3 files changed

+244
-11
lines changed

3 files changed

+244
-11
lines changed

clouddriver-google/src/main/groovy/com/netflix/spinnaker/clouddriver/google/deploy/handlers/BasicGoogleDeployHandler.groovy

Lines changed: 133 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import com.google.api.services.compute.model.InstanceGroupManagerAutoHealingPoli
2929
import com.google.api.services.compute.model.InstanceProperties
3030
import com.google.api.services.compute.model.InstanceTemplate
3131
import com.google.api.services.compute.model.NamedPort
32+
import com.google.api.services.compute.model.Operation
3233
import com.netflix.frigga.Names
3334
import com.netflix.spectator.api.Registry
3435
import com.netflix.spinnaker.cats.cache.Cache
@@ -64,6 +65,7 @@ import com.netflix.spinnaker.moniker.Namer
6465
import groovy.util.logging.Slf4j
6566
import org.springframework.beans.factory.annotation.Autowired
6667
import org.springframework.stereotype.Component
68+
import javax.annotation.PostConstruct
6769

6870
import static com.google.common.base.Preconditions.checkArgument
6971
import static com.netflix.spinnaker.clouddriver.google.deploy.GCEUtil.BACKEND_SERVICE_NAMES
@@ -587,14 +589,33 @@ class BasicGoogleDeployHandler implements DeployHandler<BasicGoogleDeployDescrip
587589
if (willCreateAutoscaler) {
588590
task.updateStatus BASE_PHASE, "Creating regional autoscaler for $serverGroupName..."
589591

592+
// Build autoscaler configuration from the deployment description
593+
// The autoscaler will manage the instance group created above (migCreateOperation.targetLink)
590594
Autoscaler autoscaler = GCEUtil.buildAutoscaler(serverGroupName,
591595
migCreateOperation.targetLink,
592596
description.autoscalingPolicy)
593597

594-
timeExecute(
598+
// Google Cloud autoscaler insert operations are asynchronous and return an Operation object.
599+
//
600+
// Per Google Cloud documentation:
601+
// "When you perform any requests that modify data, a zoneOperations or regionOperations resource
602+
// is returned, and you can query the operation to check the status of your change."
603+
//
604+
// Without waiting for autoscaler creation to complete, subsequent deployment steps (health checks, traffic routing)
605+
// may execute before the autoscaler is active, leading to inconsistent behavior and potential deployment failures.
606+
//
607+
// This fix aligns Spinnaker GCP behavior with Spinnaker AWS behavior, where autoscaling group operations are synchronous.
608+
Operation autoscalerOperation = timeExecute(
595609
compute.regionAutoscalers().insert(project, region, autoscaler),
596610
"compute.regionAutoscalers.insert",
597611
TAG_SCOPE, SCOPE_REGIONAL, TAG_REGION, region)
612+
613+
// Wait for regional autoscaler creation to complete before proceeding with deployment
614+
// Uses GoogleOperationPoller which implements proper retry logic and handles operation status polling
615+
if (googleDeployDefaults.enableAsyncOperationWait) {
616+
googleOperationPoller.waitForRegionalOperation(compute, project, region, autoscalerOperation.getName(),
617+
null, task, "regional autoscaler $serverGroupName", BASE_PHASE)
618+
}
598619
}
599620
}
600621
} else {
@@ -611,46 +632,104 @@ class BasicGoogleDeployHandler implements DeployHandler<BasicGoogleDeployDescrip
611632
if (willCreateAutoscaler) {
612633
task.updateStatus BASE_PHASE, "Creating zonal autoscaler for $serverGroupName..."
613634

635+
// Build autoscaler configuration from the deployment description
636+
// The autoscaler will manage the instance group created above (migCreateOperation.targetLink)
614637
Autoscaler autoscaler = GCEUtil.buildAutoscaler(serverGroupName,
615638
migCreateOperation.targetLink,
616639
description.autoscalingPolicy)
617640

618-
timeExecute(compute.autoscalers().insert(project, zone, autoscaler),
641+
// Google Cloud autoscaler insert operations are asynchronous and return an Operation object.
642+
//
643+
// Per Google Cloud documentation:
644+
// "When you perform any requests that modify data, a zoneOperations or regionOperations resource
645+
// is returned, and you can query the operation to check the status of your change."
646+
//
647+
// Without waiting for autoscaler creation to complete, subsequent deployment steps (health checks, traffic routing)
648+
// may execute before the autoscaler is active, leading to inconsistent behavior and potential deployment failures.
649+
//
650+
// This fix aligns Spinnaker GCP behavior with Spinnaker AWS behavior, where autoscaling group operations are synchronous.
651+
Operation autoscalerOperation = timeExecute(compute.autoscalers().insert(project, zone, autoscaler),
619652
"compute.autoscalers.insert",
620653
TAG_SCOPE, SCOPE_ZONAL, TAG_ZONE, zone)
654+
655+
// Wait for zonal autoscaler creation to complete before proceeding with deployment
656+
// Uses GoogleOperationPoller which implements proper retry logic and handles operation status polling
657+
if (googleDeployDefaults.enableAsyncOperationWait) {
658+
googleOperationPoller.waitForZonalOperation(compute, project, zone, autoscalerOperation.getName(),
659+
null, task, "autoscaler $serverGroupName", BASE_PHASE)
660+
}
621661
}
622662
}
623663
}
624664

625665
task.updateStatus BASE_PHASE, "Done creating server group $serverGroupName in $location."
626666

627-
// Actually update the backend services.
667+
// Update backend services and wait for operation completion
628668
if (willUpdateBackendServices) {
629669
backendServicesToUpdate.each { BackendService backendService ->
630-
safeRetry.doRetry(
670+
// Execute backend service update with retry logic for transient errors
671+
def operation = safeRetry.doRetry(
631672
updateBackendServices(compute, project, backendService.name, backendService),
632673
"Load balancer backend service",
633674
task,
634-
[400, 412],
675+
[400, 412], // Retry on Bad Request (400) and Precondition Failed (412)
635676
[],
636677
[action: "update", phase: BASE_PHASE, operation: "updateBackendServices", (TAG_SCOPE): SCOPE_GLOBAL],
637678
registry
638679
)
680+
681+
if (operation) {
682+
// Wait for the backend service update operation to complete
683+
//
684+
// Per Google Cloud documentation:
685+
// "Backend service operations are asynchronous and return an Operation resource.
686+
// You can use an operation resource to manage asynchronous API requests.
687+
// For global operations, use the globalOperations resource."
688+
//
689+
// Without waiting for backend service updates to complete,
690+
// subsequent health checks and traffic routing may execute before the backend service
691+
// knows about the new instance group, causing deployment failures.
692+
//
693+
// This ensures that health checks and WaitForUpInstancesTask execute only after
694+
// backend services have been fully updated with the new instance group.
695+
task.updateStatus BASE_PHASE, "Waiting for backend service ${backendService.name} update to complete..."
696+
googleOperationPoller.waitForGlobalOperation(compute, project, operation.getName(),
697+
null, task, "backend service ${backendService.name}", BASE_PHASE)
698+
}
699+
639700
task.updateStatus BASE_PHASE, "Done associating server group $serverGroupName with backend service ${backendService.name}."
640701
}
641702
}
642703

704+
// Update regional backend services and wait for operation completion
643705
if (willUpdateRegionalBackendServices) {
644706
regionBackendServicesToUpdate.each { BackendService backendService ->
645-
safeRetry.doRetry(
707+
// Execute regional backend service update with retry logic for transient errors
708+
def operation = safeRetry.doRetry(
646709
updateRegionBackendServices(compute, project, region, backendService.name, backendService),
647710
"Internal load balancer backend service",
648711
task,
649-
[400, 412],
712+
[400, 412], // Retry on Bad Request (400) and Precondition Failed (412)
650713
[],
651714
[action: "update", phase: BASE_PHASE, operation: "updateRegionBackendServices", (TAG_SCOPE): SCOPE_REGIONAL, (TAG_REGION): region],
652715
registry
653716
)
717+
718+
if (operation) {
719+
// Wait for the regional backend service update operation to complete
720+
//
721+
// Per Google Cloud documentation:
722+
// "Regional backend service operations are asynchronous and return an Operation resource.
723+
// You can use an operation resource to manage asynchronous API requests.
724+
// For regional operations, use the regionOperations resource."
725+
//
726+
// Similar to global backend services, regional backend services require explicit waiting to ensure the new instance group
727+
// is properly registered before health checks and traffic routing decisions are made.
728+
task.updateStatus BASE_PHASE, "Waiting for regional backend service ${backendService.name} update to complete..."
729+
googleOperationPoller.waitForRegionalOperation(compute, project, region, operation.getName(),
730+
null, task, "regional backend service ${backendService.name}", BASE_PHASE)
731+
}
732+
654733
task.updateStatus BASE_PHASE, "Done associating server group $serverGroupName with backend service ${backendService.name}."
655734
}
656735
}
@@ -667,6 +746,24 @@ class BasicGoogleDeployHandler implements DeployHandler<BasicGoogleDeployDescrip
667746
}
668747
}
669748

749+
/**
750+
* Creates a closure to update regional backend services with new instance groups.
751+
*
752+
* Per Google Cloud documentation:
753+
* "Backend service operations are asynchronous and return an Operation resource.
754+
* You can use an operation resource to manage asynchronous API requests.
755+
* Operations can be global, regional or zonal. For regional operations, use the regionOperations resource."
756+
*
757+
* The returned Operation object must be polled until completion to ensure the backend service
758+
* update has been fully applied before proceeding with subsequent deployment steps.
759+
*
760+
* @param compute GCP Compute API client
761+
* @param project GCP project ID
762+
* @param region GCP region for regional backend service
763+
* @param backendServiceName Name of the backend service to update
764+
* @param backendService Backend service configuration with new backends to add
765+
* @return Closure that returns an Operation object for the update request
766+
*/
670767
private Closure updateRegionBackendServices(Compute compute, String project, String region, String backendServiceName, BackendService backendService) {
671768
return {
672769
BackendService serviceToUpdate = timeExecute(
@@ -678,14 +775,32 @@ class BasicGoogleDeployHandler implements DeployHandler<BasicGoogleDeployDescrip
678775
}
679776
backendService?.backends?.each { serviceToUpdate.backends << it }
680777
serviceToUpdate.getBackends().unique { backend -> backend.group }
681-
timeExecute(
778+
return timeExecute(
682779
compute.regionBackendServices().update(project, region, backendServiceName, serviceToUpdate),
683780
"compute.regionBackendServices.update",
684781
TAG_SCOPE, SCOPE_REGIONAL, TAG_REGION, region)
685-
null
686782
}
687783
}
688784

785+
/**
786+
* Creates a closure to update global backend services with new instance groups.
787+
*
788+
* Per Google Cloud documentation:
789+
* "Backend service operations are asynchronous and return an Operation resource.
790+
* You can use an operation resource to manage asynchronous API requests.
791+
* Operations can be global, regional or zonal. For global operations, use the globalOperations resource."
792+
*
793+
* The returned Operation object must be polled until completion to ensure the backend service
794+
* update has been fully applied before proceeding with subsequent deployment steps. This prevents
795+
* race conditions where health checks or traffic routing occurs before the backend service knows
796+
* about the new instance group.
797+
*
798+
* @param compute GCP Compute API client
799+
* @param project GCP project ID
800+
* @param backendServiceName Name of the backend service to update
801+
* @param backendService Backend service configuration with new backends to add
802+
* @return Closure that returns an Operation object for the update request
803+
*/
689804
private Closure updateBackendServices(Compute compute, String project, String backendServiceName, BackendService backendService) {
690805
return {
691806
BackendService serviceToUpdate = timeExecute(
@@ -697,11 +812,10 @@ class BasicGoogleDeployHandler implements DeployHandler<BasicGoogleDeployDescrip
697812
}
698813
backendService?.backends?.each { serviceToUpdate.backends << it }
699814
serviceToUpdate.getBackends().unique { backend -> backend.group }
700-
timeExecute(
815+
return timeExecute(
701816
compute.backendServices().update(project, backendServiceName, serviceToUpdate),
702817
"compute.backendServices.update",
703818
TAG_SCOPE, SCOPE_GLOBAL)
704-
null
705819
}
706820
}
707821

@@ -731,6 +845,14 @@ class BasicGoogleDeployHandler implements DeployHandler<BasicGoogleDeployDescrip
731845
return userData
732846
}
733847

848+
@PostConstruct
849+
void logEnableAsyncOperationWaitWarning() {
850+
if (googleDeployDefaults?.enableAsyncOperationWait) {
851+
log.warn("[enableAsyncOperationWait]: If you see unjustified long waits or other issues caused by this flag, " +
852+
"please drop a note in Spinnaker Slack or open a GitHub Issue with the related details.")
853+
}
854+
}
855+
734856
static class GoogleInstanceTemplate implements GoogleLabeledResource {
735857
Map<String, String> labels
736858
}

clouddriver-google/src/main/groovy/com/netflix/spinnaker/config/GoogleConfiguration.groovy

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,17 @@ class GoogleConfiguration {
6868
List<GoogleDisk> fallbackInstanceTypeDisks = []
6969
List<GoogleInstanceTypeDisk> instanceTypeDisks = []
7070

71+
/**
72+
* Feature flag: when true (default) Clouddriver will actively poll GCP asynchronous operations
73+
* that mutate backend services and autoscalers.
74+
* Setting this to false restores the legacy behaviour where Clouddriver does not wait for
75+
* completion of those operations, which can re-introduce race conditions during red/black
76+
* deployments but matches the historical behaviour.
77+
*
78+
* YAML path: google.defaults.enableAsyncOperationWait
79+
*/
80+
boolean enableAsyncOperationWait = true
81+
7182
GoogleInstanceTypeDisk determineInstanceTypeDisk(String instanceType) {
7283
GoogleInstanceTypeDisk instanceTypeDisk = instanceTypeDisks.find {
7384
it.instanceType == instanceType

clouddriver-google/src/test/groovy/com/netflix/spinnaker/clouddriver/google/deploy/handlers/BasicGoogleDeployHandlerSpec.groovy

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ package com.netflix.spinnaker.clouddriver.google.deploy.handlers
1818

1919
import com.google.api.services.compute.Compute
2020
import com.google.api.services.compute.ComputeRequest
21+
import com.google.api.services.compute.model.Autoscaler
22+
import com.google.api.services.compute.model.Backend
23+
import com.google.api.services.compute.model.BackendService
2124
import com.google.api.services.compute.model.Image
2225
import com.google.api.services.compute.model.ImageList
2326
import com.google.api.services.compute.model.Instance
@@ -26,10 +29,21 @@ import com.google.api.services.compute.model.MachineType
2629
import com.google.api.services.compute.model.MachineTypeList
2730
import com.google.api.services.compute.model.Network
2831
import com.google.api.services.compute.model.NetworkList
32+
import com.google.api.services.compute.model.Operation
33+
import com.netflix.spectator.api.Clock
34+
import com.netflix.spectator.api.Id
35+
import com.netflix.spectator.api.Registry
36+
import com.netflix.spectator.api.Timer
2937
import com.netflix.spinnaker.clouddriver.data.task.Task
3038
import com.netflix.spinnaker.clouddriver.data.task.TaskRepository
39+
import com.netflix.spinnaker.clouddriver.google.deploy.GoogleOperationPoller
40+
import com.netflix.spinnaker.clouddriver.google.deploy.SafeRetry
3141
import com.netflix.spinnaker.clouddriver.google.deploy.description.BasicGoogleDeployDescription
42+
import com.netflix.spinnaker.clouddriver.google.model.GoogleAutoscalingPolicy
3243
import com.netflix.spinnaker.clouddriver.google.security.GoogleCredentials
44+
import com.netflix.spinnaker.clouddriver.google.security.GoogleNamedAccountCredentials
45+
import com.netflix.spinnaker.clouddriver.names.NamerRegistry
46+
import com.netflix.spinnaker.moniker.Namer
3347
import spock.lang.Ignore
3448
import spock.lang.Shared
3549
import spock.lang.Specification
@@ -87,4 +101,90 @@ class BasicGoogleDeployHandlerSpec extends Specification {
87101
mock.list(_, _) >> list
88102
mock
89103
}
104+
105+
def "backend service update creates closure that returns operation"() {
106+
given:
107+
def mockCompute = Mock(Compute)
108+
def mockBackendServices = Mock(Compute.BackendServices)
109+
def mockGet = Mock(Compute.BackendServices.Get)
110+
def mockUpdate = Mock(Compute.BackendServices.Update)
111+
def mockOperation = new Operation(name: "operation-123", status: "DONE")
112+
113+
// Setup registry mocks for timeExecute
114+
def mockClock = Mock(Clock)
115+
def mockTimer = Mock(Timer)
116+
def mockId = Mock(Id)
117+
def mockRegistry = Mock(Registry)
118+
mockRegistry.clock() >> mockClock
119+
mockRegistry.createId(_, _) >> mockId
120+
mockId.withTags(_) >> mockId
121+
mockRegistry.timer(_) >> mockTimer
122+
mockClock.monotonicTime() >> 1000L
123+
124+
def handler = new BasicGoogleDeployHandler()
125+
handler.registry = mockRegistry
126+
127+
// Mock backend service data
128+
def existingBackendService = new BackendService(name: "test-backend-service", backends: [])
129+
def newBackendService = new BackendService(
130+
name: "test-backend-service",
131+
backends: [new Backend(group: "projects/test/zones/us-central1-a/instanceGroups/new-group")]
132+
)
133+
134+
when:
135+
def updateClosure = handler.updateBackendServices(mockCompute, "test-project", "test-backend-service", newBackendService)
136+
def result = updateClosure.call()
137+
138+
then:
139+
// Verify GCP API calls
140+
2 * mockCompute.backendServices() >> mockBackendServices
141+
1 * mockBackendServices.get("test-project", "test-backend-service") >> mockGet
142+
1 * mockGet.execute() >> existingBackendService
143+
1 * mockBackendServices.update("test-project", "test-backend-service", _) >> mockUpdate
144+
1 * mockUpdate.execute() >> mockOperation
145+
146+
result == mockOperation
147+
}
148+
149+
def "autoscaler operations return operations for waiting"() {
150+
setup:
151+
def mockRegistry = Mock(Registry)
152+
def mockClock = Mock(Clock)
153+
def mockTimer = Mock(Timer)
154+
def mockId = Mock(Id)
155+
def mockCompute = Mock(Compute)
156+
def mockAutoscalers = Mock(Compute.Autoscalers)
157+
def mockAutoscalerInsert = Mock(Compute.Autoscalers.Insert)
158+
def mockRegionAutoscalers = Mock(Compute.RegionAutoscalers)
159+
def mockRegAutoscalerInsert = Mock(Compute.RegionAutoscalers.Insert)
160+
161+
mockRegistry.clock() >> mockClock
162+
mockRegistry.createId(_, _) >> mockId
163+
mockId.withTags(_) >> mockId
164+
mockRegistry.timer(_) >> mockTimer
165+
mockClock.monotonicTime() >> 1000L
166+
167+
def zonalOperation = new Operation(name: "zonal-op", status: "DONE")
168+
def regionalOperation = new Operation(name: "regional-op", status: "DONE")
169+
170+
mockCompute.autoscalers() >> mockAutoscalers
171+
mockCompute.regionAutoscalers() >> mockRegionAutoscalers
172+
mockAutoscalers.insert(_, _, _) >> mockAutoscalerInsert
173+
mockAutoscalerInsert.execute() >> zonalOperation
174+
mockRegionAutoscalers.insert(_, _, _) >> mockRegAutoscalerInsert
175+
mockRegAutoscalerInsert.execute() >> regionalOperation
176+
177+
def handler = new BasicGoogleDeployHandler(registry: mockRegistry)
178+
179+
when:
180+
def zonalResult = handler.timeExecute(mockAutoscalerInsert, "compute.autoscalers.insert", "TAG_SCOPE", "SCOPE_ZONAL")
181+
def regionalResult = handler.timeExecute(mockRegAutoscalerInsert, "compute.regionAutoscalers.insert", "TAG_SCOPE", "SCOPE_REGIONAL")
182+
183+
then:
184+
1 * mockAutoscalerInsert.execute() >> zonalOperation
185+
1 * mockRegAutoscalerInsert.execute() >> regionalOperation
186+
187+
zonalResult.getName() == "zonal-op"
188+
regionalResult.getName() == "regional-op"
189+
}
90190
}

0 commit comments

Comments
 (0)