Skip to content

Commit a2206f0

Browse files
committed
Prevent invalid timestamps in page_fault_count metric
Right after a container is started, the kubelet returns time == start_time for that metric, which causes the Monitoring API to return a 400 error.
1 parent d2a5314 commit a2206f0

File tree

3 files changed

+18
-2
lines changed

3 files changed

+18
-2
lines changed

kubelet-to-gcm/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
OUT_DIR = build
1616
PACKAGE = github.com/GoogleCloudPlatform/k8s-stackdriver/kubelet-to-gcm
1717
PREFIX = staging-k8s.gcr.io
18-
TAG = 1.3.4
18+
TAG = 1.3.5
1919

2020
# Rules for building the real image for deployment to gcr.io
2121

kubelet-to-gcm/monitor/kubelet/translate.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -519,7 +519,8 @@ func translateMemory(memory *stats.MemoryStats, tsFactory *timeSeriesFactory, st
519519
return nil, fmt.Errorf("Memory information missing.")
520520
}
521521

522-
if pageFaultsMD != nil {
522+
// Only send the page fault metric if the start time is strictly before the memory sample time. Right after a container is started, the kubelet can return start time == end time. This doesn't seem to happen with other metrics.
523+
if pageFaultsMD != nil && memory.Time.Time.After(startTime) {
523524
if memory.MajorPageFaults == nil {
524525
return nil, fmt.Errorf("MajorPageFaults missing in MemoryStats %v", memory)
525526
}

kubelet-to-gcm/monitor/kubelet/translate_test.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,8 @@ func TestTranslateContainers(t *testing.T) {
221221
noLogStatsContainer.Logs = nil
222222
noRootfsStatsContainer := *getContainerStats(false)
223223
noRootfsStatsContainer.Rootfs = nil
224+
badTimestampOnPageFaultContrainer := *getContainerStats(false)
225+
badTimestampOnPageFaultContrainer.Memory.Time = badTimestampOnPageFaultContrainer.StartTime
224226
legacyTsPerContainer := 11
225227
tsPerContainer := 8
226228
testCases := []struct {
@@ -307,6 +309,16 @@ func TestTranslateContainers(t *testing.T) {
307309
),
308310
},
309311
},
312+
{
313+
name: "bad timestamp for page_fault_count",
314+
ExpectedLegacyTSCount: legacyTsPerContainer - 2,
315+
ExpectedTSCount: tsPerContainer - 2,
316+
pods: []stats.PodStats{
317+
getPodStats(
318+
badTimestampOnPageFaultContrainer,
319+
),
320+
},
321+
},
310322
}
311323

312324
for _, tc := range testCases {
@@ -349,6 +361,9 @@ func getContainerStats(skipUsageNanoCores bool) *stats.ContainerStats {
349361
if skipUsageNanoCores {
350362
v.CPU.UsageNanoCores = nil
351363
}
364+
if v.Memory.Time.Time.Before(v.StartTime.Time) {
365+
v.Memory.Time, v.StartTime = v.StartTime, v.Memory.Time
366+
}
352367
return v
353368
}
354369

0 commit comments

Comments
 (0)