Skip to content

Commit c8c9d4c

Browse files
fix(ecs): evaluate targetgroup healthcheck in instance healthcheck (#6307) (#6371)
* fix(ecs): include healthcheck status from targetgroups in instance healthcheck * test(ecs): add tests for default behavior and targetgroup healthchecks * fix(ecs): check null for loadBalancer attribute * refactor(ecs): improve testing and use targetHealth to determine status only if containerHealthcheck is UNKNOWN * fix(ecs): evaluate TargetHealth check if container deployed does have target groups associated --------- Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> (cherry picked from commit 7ce44a3) Co-authored-by: Edgar Garcia <63310723+edgarulg@users.noreply.github.com>
1 parent 482dda7 commit c8c9d4c

File tree

2 files changed

+188
-4
lines changed

2 files changed

+188
-4
lines changed

clouddriver-ecs/src/main/java/com/netflix/spinnaker/clouddriver/ecs/services/ContainerInformationService.java

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,19 @@
1818

1919
import com.amazonaws.services.ec2.model.Instance;
2020
import com.amazonaws.services.ecs.model.ContainerDefinition;
21+
import com.amazonaws.services.ecs.model.LoadBalancer;
2122
import com.amazonaws.services.ecs.model.NetworkBinding;
2223
import com.amazonaws.services.ecs.model.TaskDefinition;
2324
import com.netflix.spinnaker.clouddriver.ecs.cache.Keys;
2425
import com.netflix.spinnaker.clouddriver.ecs.cache.client.ContainerInstanceCacheClient;
2526
import com.netflix.spinnaker.clouddriver.ecs.cache.client.EcsInstanceCacheClient;
2627
import com.netflix.spinnaker.clouddriver.ecs.cache.client.ServiceCacheClient;
28+
import com.netflix.spinnaker.clouddriver.ecs.cache.client.TargetHealthCacheClient;
2729
import com.netflix.spinnaker.clouddriver.ecs.cache.client.TaskCacheClient;
2830
import com.netflix.spinnaker.clouddriver.ecs.cache.client.TaskDefinitionCacheClient;
2931
import com.netflix.spinnaker.clouddriver.ecs.cache.client.TaskHealthCacheClient;
3032
import com.netflix.spinnaker.clouddriver.ecs.cache.model.ContainerInstance;
33+
import com.netflix.spinnaker.clouddriver.ecs.cache.model.EcsTargetHealth;
3134
import com.netflix.spinnaker.clouddriver.ecs.cache.model.Service;
3235
import com.netflix.spinnaker.clouddriver.ecs.cache.model.Task;
3336
import com.netflix.spinnaker.clouddriver.ecs.cache.model.TaskHealth;
@@ -37,6 +40,7 @@
3740
import java.util.List;
3841
import java.util.Map;
3942
import java.util.Set;
43+
import java.util.stream.Collectors;
4044
import org.springframework.beans.factory.annotation.Autowired;
4145
import org.springframework.stereotype.Component;
4246

@@ -50,6 +54,7 @@ public class ContainerInformationService {
5054
private final TaskDefinitionCacheClient taskDefinitionCacheClient;
5155
private final EcsInstanceCacheClient ecsInstanceCacheClient;
5256
private final ContainerInstanceCacheClient containerInstanceCacheClient;
57+
private final TargetHealthCacheClient targetHealthCacheClient;
5358

5459
@Autowired
5560
public ContainerInformationService(
@@ -59,14 +64,16 @@ public ContainerInformationService(
5964
TaskHealthCacheClient taskHealthCacheClient,
6065
TaskDefinitionCacheClient taskDefinitionCacheClient,
6166
EcsInstanceCacheClient ecsInstanceCacheClient,
62-
ContainerInstanceCacheClient containerInstanceCacheClient) {
67+
ContainerInstanceCacheClient containerInstanceCacheClient,
68+
TargetHealthCacheClient targetHealthCacheClient) {
6369
this.ecsCredentialsConfig = ecsCredentialsConfig;
6470
this.taskCacheClient = taskCacheClient;
6571
this.serviceCacheClient = serviceCacheClient;
6672
this.taskHealthCacheClient = taskHealthCacheClient;
6773
this.taskDefinitionCacheClient = taskDefinitionCacheClient;
6874
this.ecsInstanceCacheClient = ecsInstanceCacheClient;
6975
this.containerInstanceCacheClient = containerInstanceCacheClient;
76+
this.targetHealthCacheClient = targetHealthCacheClient;
7077
}
7178

7279
public List<Map<String, Object>> getHealthStatus(
@@ -101,8 +108,15 @@ public List<Map<String, Object>> getHealthStatus(
101108
// Task-based health
102109
if (task != null) {
103110
boolean hasHealthCheck = false;
111+
EcsTargetHealth targetHealth = null;
104112
if (service != null) {
105113
hasHealthCheck = taskHasHealthCheck(service, accountName, region);
114+
LoadBalancer loadBalancer = service.getLoadBalancers().stream().findFirst().orElse(null);
115+
if (loadBalancer != null) {
116+
String targetGroupKey =
117+
Keys.getTargetHealthKey(accountName, region, loadBalancer.getTargetGroupArn());
118+
targetHealth = targetHealthCacheClient.get(targetGroupKey);
119+
}
106120
}
107121

108122
Map<String, Object> taskPlatformHealth = new HashMap<>();
@@ -111,7 +125,8 @@ public List<Map<String, Object>> getHealthStatus(
111125
taskPlatformHealth.put("healthClass", "platform");
112126
taskPlatformHealth.put(
113127
"state",
114-
toPlatformHealthState(task.getLastStatus(), task.getHealthStatus(), hasHealthCheck));
128+
toPlatformHealthState(
129+
task.getLastStatus(), task.getHealthStatus(), hasHealthCheck, targetHealth));
115130
healthMetrics.add(taskPlatformHealth);
116131
}
117132

@@ -138,13 +153,20 @@ public boolean taskHasHealthCheck(Service service, String accountName, String re
138153
}
139154

140155
private String toPlatformHealthState(
141-
String ecsTaskStatus, String ecsTaskHealthStatus, boolean hasHealthCheck) {
156+
String ecsTaskStatus,
157+
String ecsTaskHealthStatus,
158+
boolean hasHealthCheck,
159+
EcsTargetHealth ecsTargetHealth) {
142160
if (hasHealthCheck && "UNKNOWN".equals(ecsTaskHealthStatus)) {
143161
return "Starting";
144162
} else if ("UNHEALTHY".equals(ecsTaskHealthStatus)) {
145163
return "Down";
146164
}
147165

166+
if (ecsTargetHealth != null) {
167+
return getPlatformHealthStateFromTargetGroup(ecsTargetHealth);
168+
}
169+
148170
switch (ecsTaskStatus) {
149171
case "PROVISIONING":
150172
case "PENDING":
@@ -157,6 +179,27 @@ private String toPlatformHealthState(
157179
}
158180
}
159181

182+
// based on:
183+
// https://docs.aws.amazon.com/elasticloadbalancing/latest/application/target-group-health-checks.html#target-health-states
184+
private String getPlatformHealthStateFromTargetGroup(EcsTargetHealth targetHealth) {
185+
Set<String> statuses =
186+
targetHealth.getTargetHealthDescriptions().stream()
187+
.map(tg -> tg.getTargetHealth().getState())
188+
.collect(Collectors.toSet());
189+
190+
for (String status : statuses) {
191+
if ("healthy".equalsIgnoreCase(status)) {
192+
return "Up";
193+
}
194+
if ("initial".equalsIgnoreCase(status)) {
195+
return "Starting";
196+
}
197+
}
198+
199+
// statuses: unhealthy, unused, draining, unavailable
200+
return "Down";
201+
}
202+
160203
public String getClusterArn(String accountName, String region, String taskId) {
161204
String key = Keys.getTaskKey(accountName, region, taskId);
162205
Task task = taskCacheClient.get(key);

clouddriver-ecs/src/test/groovy/com/netflix/spinnaker/clouddriver/ecs/services/ContainerInformationServiceSpec.groovy

Lines changed: 142 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,11 @@ import com.amazonaws.services.ecs.model.HealthCheck
2424
import com.amazonaws.services.ecs.model.LoadBalancer
2525
import com.amazonaws.services.ecs.model.NetworkBinding
2626
import com.amazonaws.services.ecs.model.TaskDefinition
27+
import com.amazonaws.services.elasticloadbalancingv2.model.TargetHealth
28+
import com.amazonaws.services.elasticloadbalancingv2.model.TargetHealthDescription
2729
import com.netflix.spinnaker.clouddriver.ecs.cache.client.*
2830
import com.netflix.spinnaker.clouddriver.ecs.cache.model.ContainerInstance
31+
import com.netflix.spinnaker.clouddriver.ecs.cache.model.EcsTargetHealth
2932
import com.netflix.spinnaker.clouddriver.ecs.cache.model.Service
3033
import com.netflix.spinnaker.clouddriver.ecs.cache.model.Task
3134
import com.netflix.spinnaker.clouddriver.ecs.cache.model.TaskHealth
@@ -42,6 +45,7 @@ class ContainerInformationServiceSpec extends Specification {
4245
def taskDefinitionCacheClient = Mock(TaskDefinitionCacheClient)
4346
def ecsInstanceCacheClient = Mock(EcsInstanceCacheClient)
4447
def containerInstanceCacheClient = Mock(ContainerInstanceCacheClient)
48+
def targetHealthCacheClient = Mock(TargetHealthCacheClient)
4549

4650
@Subject
4751
def service = new ContainerInformationService(ecsCredentialsConfig,
@@ -50,7 +54,8 @@ class ContainerInformationServiceSpec extends Specification {
5054
taskHealthCacheClient,
5155
taskDefinitionCacheClient,
5256
ecsInstanceCacheClient,
53-
containerInstanceCacheClient)
57+
containerInstanceCacheClient,
58+
targetHealthCacheClient)
5459

5560
def 'should return a proper health status'() {
5661
given:
@@ -247,6 +252,142 @@ class ContainerInformationServiceSpec extends Specification {
247252
'HEALTHY' | 'Up' | 'RUNNING'
248253
}
249254

255+
def 'should return Up health check status if task is running but healthcheck in container definition is null and targetHealthchecks related container is null'() {
256+
given:
257+
def taskId = 'task-id'
258+
def serviceName = 'test-service-name'
259+
def type = 'loadBalancer'
260+
261+
def cachedService = new Service(
262+
serviceName: serviceName,
263+
loadBalancers: [new LoadBalancer()]
264+
)
265+
266+
serviceCacheClient.get(_) >> cachedService
267+
taskCacheClient.get(_) >> new Task(lastStatus: lastStatus, healthStatus: healthStatus)
268+
taskDefinitionCacheClient.get(_) >> new TaskDefinition(containerDefinitions: Lists.newArrayList(new ContainerDefinition(healthCheck: null)))
269+
targetHealthCacheClient.get(_) >> null
270+
271+
def expectedHealthStatus = [
272+
[
273+
instanceId: taskId,
274+
state : 'Unknown',
275+
type : type
276+
],
277+
[
278+
instanceId: taskId,
279+
state : resultStatus,
280+
type : 'ecs',
281+
healthClass: 'platform'
282+
]
283+
]
284+
def retrievedHealthStatus = service.getHealthStatus(taskId, serviceName, 'test-account', 'us-west-1')
285+
286+
expect:
287+
retrievedHealthStatus == expectedHealthStatus
288+
289+
where:
290+
healthStatus | resultStatus | lastStatus
291+
'UNKNOWN' | 'Starting' | 'PROVISIONING'
292+
'UNKNOWN' | 'Starting' | 'PENDING'
293+
'UNKNOWN' | 'Starting' | 'ACTIVATING'
294+
'UNKNOWN' | 'Up' | 'RUNNING'
295+
}
296+
297+
def 'should return health status based on target group if task is running but healthcheck in container definition is null and container has a targetHealthcheck defined'() {
298+
given:
299+
def taskId = 'task-id'
300+
def serviceName = 'test-service-name'
301+
def type = 'loadBalancer'
302+
303+
def cachedService = new Service(
304+
serviceName: serviceName,
305+
loadBalancers: [new LoadBalancer()]
306+
)
307+
308+
serviceCacheClient.get(_) >> cachedService
309+
taskCacheClient.get(_) >> new Task(lastStatus: lastStatus, healthStatus: healthStatus)
310+
taskDefinitionCacheClient.get(_) >> new TaskDefinition(containerDefinitions: Lists.newArrayList(new ContainerDefinition(healthCheck: null)))
311+
targetHealthCacheClient.get(_) >> new EcsTargetHealth(targetHealthDescriptions: List.of(
312+
new TargetHealthDescription(targetHealth: new TargetHealth(state: targetHealthStatus))
313+
))
314+
315+
def expectedHealthStatus = [
316+
[
317+
instanceId: taskId,
318+
state : 'Unknown',
319+
type : type
320+
],
321+
[
322+
instanceId: taskId,
323+
state : resultStatus,
324+
type : 'ecs',
325+
healthClass: 'platform'
326+
]
327+
]
328+
def retrievedHealthStatus = service.getHealthStatus(taskId, serviceName, 'test-account', 'us-west-1')
329+
330+
expect:
331+
retrievedHealthStatus == expectedHealthStatus
332+
333+
where:
334+
healthStatus | resultStatus | lastStatus | targetHealthStatus
335+
'UNKNOWN' | 'Starting' | 'RUNNING' | 'initial'
336+
'UNKNOWN' | 'Up' | 'RUNNING' | 'healthy'
337+
'UNKNOWN' | 'Down' | 'RUNNING' | 'unhealthy'
338+
'UNKNOWN' | 'Down' | 'RUNNING' | 'unused'
339+
'UNKNOWN' | 'Down' | 'RUNNING' | 'draining'
340+
'UNKNOWN' | 'Down' | 'RUNNING' | 'unavailable'
341+
342+
}
343+
344+
def 'should return health status based on target group if task is running but healthcheck in container definition is null and container has multiple targetHealthcheck related'() {
345+
given:
346+
def taskId = 'task-id'
347+
def serviceName = 'test-service-name'
348+
def type = 'loadBalancer'
349+
350+
def cachedService = new Service(
351+
serviceName: serviceName,
352+
loadBalancers: [new LoadBalancer()]
353+
)
354+
355+
serviceCacheClient.get(_) >> cachedService
356+
taskCacheClient.get(_) >> new Task(lastStatus: lastStatus, healthStatus: healthStatus)
357+
taskDefinitionCacheClient.get(_) >> new TaskDefinition(containerDefinitions: Lists.newArrayList(new ContainerDefinition(healthCheck: null)))
358+
targetHealthCacheClient.get(_) >> new EcsTargetHealth(targetHealthDescriptions: List.of(
359+
new TargetHealthDescription(targetHealth: new TargetHealth(state: targetHealthStatus1)),
360+
new TargetHealthDescription(targetHealth: new TargetHealth(state: targetHealthStatus2))
361+
))
362+
363+
def expectedHealthStatus = [
364+
[
365+
instanceId: taskId,
366+
state : 'Unknown',
367+
type : type
368+
],
369+
[
370+
instanceId: taskId,
371+
state : resultStatus,
372+
type : 'ecs',
373+
healthClass: 'platform'
374+
]
375+
]
376+
def retrievedHealthStatus = service.getHealthStatus(taskId, serviceName, 'test-account', 'us-west-1')
377+
378+
expect:
379+
retrievedHealthStatus == expectedHealthStatus
380+
381+
where:
382+
healthStatus | resultStatus | lastStatus | targetHealthStatus1 | targetHealthStatus2
383+
'UNKNOWN' | 'Starting' | 'RUNNING' | 'initial' | 'draining'
384+
'UNKNOWN' | 'Starting' | 'RUNNING' | 'draining' | 'initial'
385+
'UNKNOWN' | 'Up' | 'RUNNING' | 'healthy' | 'draining'
386+
'UNKNOWN' | 'Up' | 'RUNNING' | 'draining' | 'healthy'
387+
'UNKNOWN' | 'Up' | 'RUNNING' | 'unhealthy' | 'healthy'
388+
389+
}
390+
250391
def 'should return a proper private address for a task'() {
251392
given:
252393
def account = 'test-account'

0 commit comments

Comments
 (0)