Skip to content

Commit 482283b

Browse files
authored
Add CF_TIP_BOOT_FAILED_COUNT metric for cuttlefish boot failures (#4713)
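This commit introduces the `CF_TIP_BOOT_FAILED_COUNT` metric to track failures when booting Cuttlefish with a tip-of-tree build. The metric is incremented once per flash attempt and is labelled with `build_id` and `is_succeeded`, so it records successful boots as well as failed ones. It is intended to be used as the basis for an alerting mechanism: by setting up an alerting policy on this metric, filtered on `is_succeeded` being false, we can be notified when the number of tip-of-tree boot failures exceeds a defined threshold.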
1 parent 1e1c2b6 commit 482283b

File tree

3 files changed

+94
-0
lines changed

3 files changed

+94
-0
lines changed

src/clusterfuzz/_internal/metrics/monitoring_metrics.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,15 @@
2323
monitor.BooleanField('success'),
2424
])
2525

26+
# Counter for Cuttlefish boot attempts against a tip-of-tree build. Each sample
# is labelled with the build id and with whether the boot succeeded, so the
# same metric carries both outcomes.
CF_TIP_BOOT_FAILED_COUNT = monitor.CounterMetric(
    'tip_boot_failure',
    description=('Count of failure in booting up cuttlefish with '
                 'tip-of-the-tree build '),
    field_spec=[
        monitor.StringField('build_id'),
        monitor.BooleanField('is_succeeded'),
    ])
34+
2635
JOB_BAD_BUILD_COUNT = monitor.CounterMetric(
2736
'task/fuzz/job/bad_build_count',
2837
description=("Count of fuzz task's bad build count "

src/clusterfuzz/_internal/platforms/android/flash.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from clusterfuzz._internal.base import persistent_cache
2222
from clusterfuzz._internal.datastore import locks
2323
from clusterfuzz._internal.metrics import logs
24+
from clusterfuzz._internal.metrics import monitoring_metrics
2425
from clusterfuzz._internal.system import archive
2526
from clusterfuzz._internal.system import environment
2627
from clusterfuzz._internal.system import shell
@@ -206,9 +207,17 @@ def flash_to_latest_build_if_needed():
206207
locks.release_lock(flash_lock_key_name, by_zone=True)
207208

208209
if adb.get_device_state() != 'device':
210+
monitoring_metrics.CF_TIP_BOOT_FAILED_COUNT.increment({
211+
'build_id': build_info['bid'],
212+
'is_succeeded': False
213+
})
209214
logs.error('Unable to find device. Reimaging failed.')
210215
adb.bad_state_reached()
211216

217+
monitoring_metrics.CF_TIP_BOOT_FAILED_COUNT.increment({
218+
'build_id': build_info['bid'],
219+
'is_succeeded': True
220+
})
212221
logs.info('Reimaging finished.')
213222

214223
# Reset all of our persistent keys after wipe.

src/clusterfuzz/_internal/tests/core/metrics/monitor_test.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,15 @@
1515
# pylint: disable=protected-access
1616

1717
import os
18+
import queue
1819
import time
1920
import unittest
21+
from unittest.mock import patch
2022

2123
from clusterfuzz._internal.metrics import monitor
2224
from clusterfuzz._internal.metrics import monitoring_metrics
25+
from clusterfuzz._internal.platforms.android import flash
26+
from clusterfuzz._internal.system import environment
2327
from clusterfuzz._internal.tests.test_libs import helpers
2428

2529

@@ -69,10 +73,82 @@ class MonitorTest(unittest.TestCase):
6973
def setUp(self):
  """Patches the monitor and Android flashing dependencies for each test."""
  patched_names = [
      'clusterfuzz._internal.metrics.monitor.check_module_loaded',
      'clusterfuzz._internal.base.persistent_cache.get_value',
      'clusterfuzz._internal.base.persistent_cache.set_value',
      'clusterfuzz._internal.base.persistent_cache.delete_value',
      'clusterfuzz._internal.platforms.android.settings.is_google_device',
      'clusterfuzz._internal.platforms.android.fetch_artifact.get_latest_artifact_info',
      'clusterfuzz._internal.system.environment.is_android_cuttlefish',
      'clusterfuzz._internal.platforms.android.flash.download_latest_build',
      'clusterfuzz._internal.platforms.android.adb.connect_to_cuttlefish_device',
      'clusterfuzz._internal.platforms.android.adb.recreate_cuttlefish_device',
      'clusterfuzz._internal.platforms.android.adb.get_device_state',
      'clusterfuzz._internal.platforms.android.adb.bad_state_reached',
  ]
  helpers.patch(self, patched_names)

  # Fake "latest build" record. Its branch and target match the BUILD_BRANCH
  # and BUILD_TARGET environment values set below, and its 'bid' is the
  # build id the boot-metric tests look for.
  latest_artifact = {
      'bid': 'test-bid',
      'branch': 'test-branch',
      'target': 'test-target'
  }

  self.mock.check_module_loaded.return_value = True
  self.mock.get_value.return_value = None
  self.mock.is_google_device.return_value = True
  self.mock.get_latest_artifact_info.return_value = latest_artifact
  self.mock.is_android_cuttlefish.return_value = True

  environment.set_value('BUILD_BRANCH', 'test-branch')
  environment.set_value('BUILD_TARGET', 'test-target')

  # Start every test from an empty metrics store.
  monitor.metrics_store().reset_for_testing()
75100

101+
def _setup_monitoring_daemon(self, mock_client):
  """Configures the monitor module for testing and starts its daemon.

  Args:
    mock_client: Mock standing in for the MetricServiceClient class; its
      return_value is installed as the monitor's client.

  Returns:
    A queue.Queue that receives the keyword arguments of every
    create_time_series call made on the mocked client.
  """
  client = mock_client.return_value

  # NOTE(review): these assignments overwrite module-level state on `monitor`
  # (and os.environ) and are not restored by this helper.
  monitor.credentials._use_anonymous_credentials = lambda: False
  monitor._monitoring_v3_client = client
  monitor.FLUSH_INTERVAL_SECONDS = 10
  monitor._monitoring_daemon = monitor._MonitoringDaemon(
      monitor._flush_metrics, monitor.FLUSH_INTERVAL_SECONDS)
  monitor.utils.get_application_id = lambda: 'google.com:clusterfuzz'
  os.environ['BOT_NAME'] = 'bot-1'
  monitor._initialize_monitored_resource()
  monitor._monitored_resource.labels['zone'] = 'us-central1-b'

  recorded_calls = queue.Queue()

  def _record_call(**kwargs):
    # Hand the call's keyword arguments over to the waiting test.
    recorded_calls.put(kwargs)

  client.create_time_series.side_effect = _record_call

  monitor._monitoring_daemon.start()
  return recorded_calls
118+
119+
def _assert_cuttlefish_boot_metric(self, time_series, is_succeeded):
  """Asserts that a Cuttlefish tip-of-tree boot metric was emitted.

  Args:
    time_series: Iterable of time series objects exposing `metric.type` and
      `metric.labels`.
    is_succeeded: Expected boot outcome. When None, series with any outcome
      are accepted and only the build id is checked.

  Fails when no `tip_boot_failure` series with the expected outcome is
  present, or when such a series carries an unexpected build id.
  """
  metric_type = 'custom.googleapis.com/tip_boot_failure'
  boot_series = [ts for ts in time_series if ts.metric.type == metric_type]

  if is_succeeded is not None:
    # Labels are compared against the string form of the boolean outcome.
    expected_label = str(is_succeeded)
    boot_series = [
        ts for ts in boot_series
        if ts.metric.labels['is_succeeded'] == expected_label
    ]

  # Without this check the helper would pass when the metric is missing.
  self.assertTrue(
      boot_series,
      'No %s time series with is_succeeded=%r was emitted.' %
      (metric_type, is_succeeded))

  for ts in boot_series:
    self.assertEqual(ts.metric.labels['build_id'], 'test-bid')
129+
@patch(
    'clusterfuzz._internal.metrics.monitor.monitoring_v3.MetricServiceClient')
def test_cuttlefish_boot_success_metric(self, mock_client):
  """Tests the metric emission for a successful Cuttlefish boot."""
  call_queue = self._setup_monitoring_daemon(mock_client)
  try:
    # A device state of 'device' makes the flash code take the success path.
    self.mock.get_device_state.return_value = 'device'
    flash.flash_to_latest_build_if_needed()

    # Wait for the daemon to call create_time_series on the mocked client;
    # the timeout is twice the 10 second flush interval.
    args = call_queue.get(timeout=20)
    self._assert_cuttlefish_boot_metric(args['time_series'], True)
  finally:
    # Stop the daemon even when the wait times out or an assertion fails, so
    # it does not keep running into later tests.
    monitor._monitoring_daemon.stop()
140+
141+
@patch(
    'clusterfuzz._internal.metrics.monitor.monitoring_v3.MetricServiceClient')
def test_cuttlefish_boot_failure_metric(self, mock_client):
  """Tests the metric emission for a failed Cuttlefish boot."""
  call_queue = self._setup_monitoring_daemon(mock_client)
  try:
    # NOTE(review): the failure path relies on the unconfigured
    # get_device_state mock returning something other than 'device' —
    # consider setting an explicit return value.
    flash.flash_to_latest_build_if_needed()

    # Wait for the daemon to call create_time_series on the mocked client;
    # the timeout is twice the 10 second flush interval.
    args = call_queue.get(timeout=20)
    self._assert_cuttlefish_boot_metric(args['time_series'], False)
  finally:
    # Stop the daemon even when the wait times out or an assertion fails, so
    # it does not keep running into later tests.
    monitor._monitoring_daemon.stop()
151+
76152
def test_counter_metric_success(self):
77153
self.assertIsInstance(
78154
monitor.CounterMetric('t', 'desc', field_spec=None),

0 commit comments

Comments
 (0)