Skip to content

Commit 89e92d5

Browse files
authored
Make client stats reliable in case of downgrade (#9136)
1 parent 7c0f24f commit 89e92d5

File tree

2 files changed

+117
-5
lines changed

2 files changed

+117
-5
lines changed

dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -325,11 +325,6 @@ private void disable() {
325325
features.discover();
326326
if (!features.supportsMetrics()) {
327327
log.debug("Disabling metric reporting because an agent downgrade was detected");
328-
AgentTaskScheduler.Scheduled<?> cancellation = this.cancellation;
329-
if (null != cancellation) {
330-
cancellation.cancel();
331-
}
332-
this.thread.interrupt();
333328
this.pending.clear();
334329
this.batchPool.clear();
335330
this.inbox.clear();
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
package datadog.trace.common.metrics
2+
3+
import datadog.communication.ddagent.SharedCommunicationObjects
4+
import datadog.trace.api.Config
5+
import datadog.trace.core.test.DDCoreSpecification
6+
7+
import java.util.concurrent.CountDownLatch
8+
9+
import static datadog.trace.agent.test.server.http.TestHttpServer.httpServer
10+
11+
/**
 * Checks that client-side stats reporting follows what the agent advertises on "/info":
 * stats are sent while "/v0.6/stats" is listed, stop being sent once it disappears
 * (agent downgrade), and resume when it is listed again.
 */
class MetricsReliabilityTest extends DDCoreSpecification {

  /**
   * Mutable state shared between the feature method and the fake agent's request handlers.
   *
   * NOTE(review): the handlers presumably run on the HTTP server's own threads, not on the
   * test thread - confirm against TestHttpServer. Every field is therefore volatile so that a
   * write made on one side is guaranteed to be visible on the other.
   */
  static class State {
    // Upper bound for waiting on an "/info" request, so that a discovery which never
    // happens fails the test instead of blocking the build forever.
    static final long DISCOVERY_TIMEOUT_SECONDS = 30

    // Whether the fake agent advertises and accepts "/v0.6/stats".
    volatile boolean agentMetricsAvailable
    // Set by the "/v0.6/stats" handler when a stats payload is posted.
    volatile boolean receivedStats
    // Result of the last "/v0.4/traces" request: true when the
    // 'Datadog-Client-Computed-Stats' header was exactly "true".
    volatile boolean receivedClientComputedHeader
    // Counted down by the "/info" handler; replaced on every reset.
    volatile CountDownLatch latch

    /**
     * Starts a new phase of the scenario: records whether the agent supports stats, clears
     * both observation flags and installs a fresh latch for the next "/info" request.
     *
     * @param agentMetricsAvailable whether the fake agent should advertise the stats endpoint
     */
    def reset(agentMetricsAvailable) {
      this.agentMetricsAvailable = agentMetricsAvailable
      receivedStats = false
      receivedClientComputedHeader = false
      latch = new CountDownLatch(1)
    }

    /**
     * Waits for the "/info" handler to count the current latch down.
     *
     * @return true if "/info" was requested within DISCOVERY_TIMEOUT_SECONDS, false on timeout
     */
    boolean awaitDiscovery() {
      // Fully qualified to avoid touching the file's import section.
      return latch.await(DISCOVERY_TIMEOUT_SECONDS, java.util.concurrent.TimeUnit.SECONDS)
    }
  }

  /**
   * Builds a fake agent whose behaviour is driven by the given state.
   *
   * @param state flags read and updated by the request handlers
   * @return the configured (not yet started) test HTTP server
   */
  static newAgent(State state) {
    httpServer {
      handlers {
        get("/info") {
          // Only list the stats endpoint while the simulated agent supports it.
          response.send('{"endpoints":[' + (state.agentMetricsAvailable ? '"/v0.6/stats", ' : '')
            + '"/v0.4/traces"]}')
          // Signal the test only after the response has been handed off.
          state.latch.countDown()
        }
        post("/v0.6/stats", {
          state.receivedStats = true
          // A downgraded agent answers 404 on the stats endpoint.
          response.status(state.agentMetricsAvailable ? 200 : 404).send()
        })
        put("/v0.4/traces", {
          state.receivedClientComputedHeader = "true" == request.getHeader('Datadog-Client-Computed-Stats')
          response.status(200).send()
        })
      }
    }
  }

  def "metrics should reliably handle momentary downgrades"() {
    setup:
    def state = new State()
    state.reset(true)
    def agent = newAgent(state)
    agent.start()
    def props = new Properties()
    props.put("trace.agent.url", agent.getAddress().toString())
    props.put("trace.tracer.metrics.enabled", "true")
    def config = Config.get(props)
    def sharedComm = new SharedCommunicationObjects()
    sharedComm.createRemaining(config)
    def featuresDiscovery = sharedComm.featuresDiscovery(config)
    def tracer = tracerBuilder().sharedCommunicationObjects(sharedComm).config(config).build()

    when: "metrics enabled and discovery is performed"
    featuresDiscovery.discover()

    then: "should support metrics"
    assert state.awaitDiscovery()
    assert featuresDiscovery.supportsMetrics()

    when: "a span is published"
    tracer.startSpan("test", "test").finish()
    tracer.flush()
    tracer.flushMetrics()

    then: "should have sent statistics and informed the agent that we calculate the stats"
    assert state.receivedClientComputedHeader
    assert state.receivedStats

    when: "simulate an agent downgrade"
    state.reset(false)
    tracer.startSpan("test", "test").finish()
    tracer.flush()
    tracer.flushMetrics()

    then: "a discovery should have done - we do not support anymore stats calculation"
    // No explicit discover() call here: the "/info" request has to come from the tracer itself.
    assert state.awaitDiscovery()
    assert !featuresDiscovery.supportsMetrics()

    when: "a span is published"
    tracer.startSpan("test", "test").finish()
    tracer.flush()
    tracer.flushMetrics()

    then: "should have not sent statistics and informed the agent that we don't calculate the stats anymore"
    assert !state.receivedClientComputedHeader
    assert !state.receivedStats

    when: "we detect that the agent can calculate the stats again"
    state.reset(true)
    featuresDiscovery.discover()

    then: "we should understand it"
    assert state.awaitDiscovery()
    assert featuresDiscovery.supportsMetrics()

    when: "a span is published"
    tracer.startSpan("test", "test").finish()
    tracer.flush()
    tracer.flushMetrics()

    then: "we should have sent the stats and informed the agent to not calculate the stats on the trace payload"
    assert state.receivedClientComputedHeader
    assert state.receivedStats

    cleanup:
    // Either local may still be null if setup failed early; never let that mask the real
    // failure, and always stop the server even if closing the tracer throws.
    try {
      tracer?.close()
    } finally {
      agent?.stop()
    }
  }
}

0 commit comments

Comments
 (0)