Skip to content

Commit 0be9f65

Browse files
authored
feat: server monitors (#394)
* feat: add wildfire monitors
* feat: add infra team channel for notification
* feat: enable all servers
* feat: remove duplicate notification
1 parent aa3c983 commit 0be9f65

File tree

7 files changed

+436
-24
lines changed

7 files changed

+436
-24
lines changed

workflow-cli/configuration-opensearch/alerting/agent-heartbeat.monitor.json

Lines changed: 5 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -81,7 +81,7 @@
8181
"actions": [
8282
{
8383
"id": "<%= idgen('action-teams', server.name, agent.index) %>",
84-
"name": "Notify Teams Channel",
84+
"name": "Notify Appinfra Teams Channel",
8585
"destination_id": "appinfra-msteams",
8686
"message_template": {
8787
"source": "{ \"text\": \"Monitor {{ctx.monitor.name}} just entered alert status. Please investigate the issue.\n - Trigger: {{ctx.trigger.name}}\n - Severity: {{ctx.trigger.severity}}\n - Period start: {{ctx.periodStart}}\n - Period end: {{ctx.periodEnd}}\" }",
@@ -118,10 +118,10 @@
118118
{
119119
"$$OMIT": "<%= !serverTag('wildfire') || !serverTag('nonproduction') ? 'true' : 'false' %>",
120120
"id": "<%= idgen('action-nonprod-wf', server.name, agent.index) %>",
121-
"name": "notify",
121+
"name": "Notify Wildfire Nonprod Teams Channel",
122122
"destination_id": "wf-nonprod-msteams",
123123
"message_template": {
124-
"source": "{\"text\": \"No logs received from <%= server.name %> between {{ctx.periodStart}} and {{ctx.periodEnd}}\"}",
124+
"source": "{\"text\": \"Monitor {{ctx.monitor.name}}: No logs received from <%= server.name %> between {{ctx.periodStart}} and {{ctx.periodEnd}}\"}",
125125
"lang": "mustache"
126126
},
127127
"throttle_enabled": true,
@@ -134,32 +134,13 @@
134134
"unit": "MINUTES"
135135
}
136136
},
137-
{
138-
"$$OMIT": "<%= !serverTag('wildfire') || serverTag('production') ? 'true' : 'false' %>",
139-
"id": "<%= idgen('action-nonprod-wf', server.name, agent.index) %>",
140-
"name": "notify",
141-
"destination_id": "wf-nonprod-msteams",
142-
"message_template": {
143-
"source": "{\"text\": \"No logs received from <%= server.name %> between {{ctx.periodStart}} and {{ctx.periodEnd}}\"}",
144-
"lang": "mustache"
145-
},
146-
"throttle_enabled": true,
147-
"subject_template": {
148-
"source": "",
149-
"lang": "mustache"
150-
},
151-
"throttle": {
152-
"value": 240,
153-
"unit": "MINUTES"
154-
}
155-
},
156137
{
157138
"$$OMIT": "<%= !serverTag('wildfire') || !serverTag('production') ? 'true' : 'false' %>",
158139
"id": "<%= idgen('action-prod-wf', server.name, agent.index) %>",
159-
"name": "notify",
140+
"name": "Notify Wildfire Prod Teams Channel",
160141
"destination_id": "wf-prod-msteams",
161142
"message_template": {
162-
"source": "{\"text\": \"No logs received from <%= server.name %> between {{ctx.periodStart}} and {{ctx.periodEnd}}\"}",
143+
"source": "{\"text\": \"Monitor {{ctx.monitor.name}}: No logs received from <%= server.name %> between {{ctx.periodStart}} and {{ctx.periodEnd}}\"}",
163144
"lang": "mustache"
164145
},
165146
"throttle_enabled": true,
Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
{
2+
"type": "server"
3+
}
Lines changed: 142 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,142 @@
1+
{
2+
"name": "nrids_server_cpu_<%= server.name %>",
3+
"type": "monitor",
4+
"monitor_type": "query_level_monitor",
5+
"enabled": true,
6+
"schedule": {
7+
"period": {
8+
"interval": 15,
9+
"unit": "MINUTES"
10+
}
11+
},
12+
"inputs": [
13+
{
14+
"search": {
15+
"indices": [
16+
"nrm-metrics-*"
17+
],
18+
"query": {
19+
"size": 0,
20+
"aggregations": {
21+
"metric": {
22+
"avg": {
23+
"field": "host.cpu.usage"
24+
}
25+
}
26+
},
27+
"query": {
28+
"bool": {
29+
"filter": [
30+
{
31+
"range": {
32+
"@timestamp": {
33+
"from": "{{period_end}}||-30m",
34+
"to": "{{period_end}}",
35+
"include_lower": true,
36+
"include_upper": true,
37+
"format": "epoch_millis",
38+
"boost": 1.0
39+
}
40+
}
41+
},
42+
{
43+
"term": {
44+
"host.hostname": {
45+
"value": "<%= server.name %>",
46+
"boost": 1.0
47+
}
48+
}
49+
}
50+
],
51+
"adjust_pure_negative": true,
52+
"boost": 1.0
53+
}
54+
}
55+
}
56+
}
57+
}
58+
],
59+
"triggers": [
60+
{
61+
"query_level_trigger": {
62+
"id": "<%= idgen('trigger', server.name) %>",
63+
"name": "AbnormalCPU from server <%= server.name %>",
64+
"severity": "4",
65+
"condition": {
66+
"script": {
67+
"source": "return ctx.results[0].aggregations.metric.value == null ? false : ctx.results[0].aggregations.metric.value > 95",
68+
"lang": "painless"
69+
}
70+
},
71+
"actions": [
72+
{
73+
"id": "<%= idgen('action-teams', server.name) %>",
74+
"name": "Notify Appinfra Teams Channel",
75+
"destination_id": "appinfra-msteams",
76+
"message_template": {
77+
"source": "{ \"text\": \"Monitor {{ctx.monitor.name}} just entered alert status. Please investigate the issue.\n - Trigger: {{ctx.trigger.name}}\n - Severity: {{ctx.trigger.severity}}\n - Period start: {{ctx.periodStart}}\n - Period end: {{ctx.periodEnd}}\" }",
78+
"lang" : "mustache"
79+
},
80+
"throttle_enabled": true,
81+
"throttle": {
82+
"value": 15,
83+
"unit": "MINUTES"
84+
},
85+
"subject_template": {
86+
"source": "",
87+
"lang" : "mustache"
88+
}
89+
},
90+
{
91+
"$$OMIT": "<%= !serverTag('wildfire') || !serverTag('nonproduction') ? 'true' : 'false' %>",
92+
"id": "<%= idgen('action-nonprod-wf', server.name) %>",
93+
"name": "Notify Wildfire Nonprod Teams Channel",
94+
"destination_id": "wf-nonprod-msteams",
95+
"message_template": {
96+
"source": "{\"text\": \"Abnormal high CPU usage alerting received from <%= server.name %> between {{ctx.periodStart}} and {{ctx.periodEnd}}\"}",
97+
"lang": "mustache"
98+
},
99+
"throttle_enabled": true,
100+
"subject_template": {
101+
"source": "<%= server.name %> CPU Usage Alert",
102+
"lang": "mustache"
103+
},
104+
"throttle": {
105+
"value": 1440,
106+
"unit": "MINUTES"
107+
}
108+
},
109+
{
110+
"$$OMIT": "<%= !serverTag('wildfire') || !serverTag('production') ? 'true' : 'false' %>",
111+
"id": "<%= idgen('action-prod-wf', server.name) %>",
112+
"name": "Notify Wildfire Prod Teams Channel",
113+
"destination_id": "wf-prod-msteams",
114+
"message_template": {
115+
"source": "{\"text\": \"Abnormal high CPU usage alerting received from <%= server.name %> between {{ctx.periodStart}} and {{ctx.periodEnd}}\"}",
116+
"lang": "mustache"
117+
},
118+
"throttle_enabled": true,
119+
"subject_template": {
120+
"source": "<%= server.name %> CPU Usage Alert",
121+
"lang": "mustache"
122+
},
123+
"throttle": {
124+
"value": 30,
125+
"unit": "MINUTES"
126+
}
127+
}
128+
]
129+
}
130+
}
131+
],
132+
"data_sources": {
133+
"alerts_history_index": ".opendistro-alerting-alert-history-write",
134+
"alerts_history_index_pattern": "<.opendistro-alerting-alert-history-{now/d}-1>",
135+
"alerts_index": ".opendistro-alerting-alerts",
136+
"findings_enabled": false,
137+
"findings_index": ".opensearch-alerting-finding-history-write",
138+
"findings_index_pattern": "<.opensearch-alerting-finding-history-{now/d}-1>",
139+
"query_index": ".opensearch-alerting-queries",
140+
"query_index_mappings_by_type": {}
141+
}
142+
}
Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
{
2+
"type": "server"
3+
}
Lines changed: 140 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,140 @@
1+
{
2+
"name": "nrids_server_disk_<%= server.name %>",
3+
"type": "monitor",
4+
"monitor_type": "query_level_monitor",
5+
"enabled": true,
6+
"schedule": {
7+
"period": {
8+
"interval": 15,
9+
"unit": "MINUTES"
10+
}
11+
},
12+
"inputs": [
13+
{
14+
"search": {
15+
"indices": [
16+
"nrm-metrics-*"
17+
],
18+
"query": {
19+
"size": 0,
20+
"aggregations": {
21+
"metric": {
22+
"avg": {
23+
"field": "host.disk.used_percentage"
24+
}
25+
}
26+
},
27+
"query": {
28+
"bool": {
29+
"filter": [
30+
{
31+
"range": {
32+
"@timestamp": {
33+
"from": "{{period_end}}||-30m",
34+
"to": "{{period_end}}",
35+
"include_lower": true,
36+
"include_upper": true,
37+
"format": "epoch_millis",
38+
"boost": 1.0
39+
}
40+
}
41+
},
42+
{
43+
"term": {
44+
"host.hostname": {
45+
"value": "<%= server.name %>",
46+
"boost": 1.0
47+
}
48+
}
49+
}
50+
]
51+
}
52+
}
53+
}
54+
}
55+
}
56+
],
57+
"triggers": [
58+
{
59+
"query_level_trigger": {
60+
"id": "<%= idgen('trigger', server.name) %>",
61+
"name": "AbnormalDisk from server <%= server.name %>",
62+
"severity": "4",
63+
"condition": {
64+
"script": {
65+
"source": "return ctx.results[0].aggregations.metric.value == null ? false : ctx.results[0].aggregations.metric.value > 0.999",
66+
"lang": "painless"
67+
}
68+
},
69+
"actions": [
70+
{
71+
"id": "<%= idgen('action-teams', server.name) %>",
72+
"name": "Notify Appinfra Teams Channel",
73+
"destination_id": "appinfra-msteams",
74+
"message_template": {
75+
"source": "{ \"text\": \"Monitor {{ctx.monitor.name}} just entered alert status. Please investigate the issue.\n - Trigger: {{ctx.trigger.name}}\n - Severity: {{ctx.trigger.severity}}\n - Period start: {{ctx.periodStart}}\n - Period end: {{ctx.periodEnd}}\" }",
76+
"lang" : "mustache"
77+
},
78+
"throttle_enabled": true,
79+
"throttle": {
80+
"value": 15,
81+
"unit": "MINUTES"
82+
},
83+
"subject_template": {
84+
"source": "",
85+
"lang" : "mustache"
86+
}
87+
},
88+
{
89+
"$$OMIT": "<%= !serverTag('wildfire') || !serverTag('nonproduction') ? 'true' : 'false' %>",
90+
"id": "<%= idgen('action-nonprod-wf', server.name) %>",
91+
"name": "Notify Wildfire Nonprod Teams Channel",
92+
"destination_id": "wf-nonprod-msteams",
93+
"message_template": {
94+
"source": "{\"text\": \"AbnormalDisk alerting received from <%= server.name %> between {{ctx.periodStart}} and {{ctx.periodEnd}}\"}",
95+
"lang": "mustache"
96+
},
97+
"throttle_enabled": true,
98+
"subject_template": {
99+
"source": "<%= server.name %> Low Disk Space",
100+
"lang": "mustache"
101+
},
102+
"throttle": {
103+
"value": 1440,
104+
"unit": "MINUTES"
105+
}
106+
},
107+
{
108+
"$$OMIT": "<%= !serverTag('wildfire') || !serverTag('production') ? 'true' : 'false' %>",
109+
"id": "<%= idgen('action-prod-wf', server.name) %>",
110+
"name": "Notify Wildfire Prod Teams Channel",
111+
"destination_id": "wf-prod-msteams",
112+
"message_template": {
113+
"source": "{\"text\": \"AbnormalDisk alerting received from <%= server.name %> between {{ctx.periodStart}} and {{ctx.periodEnd}}\"}",
114+
"lang": "mustache"
115+
},
116+
"throttle_enabled": true,
117+
"subject_template": {
118+
"source": "<%= server.name %> Low Disk Space",
119+
"lang": "mustache"
120+
},
121+
"throttle": {
122+
"value": 30,
123+
"unit": "MINUTES"
124+
}
125+
}
126+
]
127+
}
128+
}
129+
],
130+
"data_sources": {
131+
"alerts_history_index": ".opendistro-alerting-alert-history-write",
132+
"alerts_history_index_pattern": "<.opendistro-alerting-alert-history-{now/d}-1>",
133+
"alerts_index": ".opendistro-alerting-alerts",
134+
"findings_enabled": false,
135+
"findings_index": ".opensearch-alerting-finding-history-write",
136+
"findings_index_pattern": "<.opensearch-alerting-finding-history-{now/d}-1>",
137+
"query_index": ".opensearch-alerting-queries",
138+
"query_index_mappings_by_type": {}
139+
}
140+
}
Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
{
2+
"type": "server"
3+
}

0 commit comments

Comments
 (0)