Skip to content

Commit d3c6ff9

Browse files
committed
feat: add epsilon spar app monitor (in progress)
1 parent f0c1347 commit d3c6ff9

File tree

4 files changed

+240
-11
lines changed

4 files changed

+240
-11
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{
2+
"type": "service",
3+
"serviceId": "644c4d322e2f63acef6bb84a",
4+
"environments": ["test", "production"]
5+
}
Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
{
2+
"name": "nrids_app_<%= service.name %>_<%= environment %>",
3+
"type": "monitor",
4+
"monitor_type": "query_level_monitor",
5+
"enabled": true,
6+
"schedule": {
7+
"period": {
8+
"interval": 1,
9+
"unit": "MINUTES"
10+
}
11+
},
12+
"inputs": [
13+
{
14+
"search": {
15+
"indices": [
16+
"nrm-metrics"
17+
],
18+
"query": {
19+
"size": 0,
20+
"aggregations": {},
21+
"query": {
22+
"bool": {
23+
"must": [
24+
{
25+
"range": {
26+
"http.response.status_code": {
27+
"gt": 499
28+
}
29+
}
30+
},
31+
{
32+
"term": {
33+
"service.environment": "<%= environment %>"
34+
}
35+
},
36+
{
37+
"range": {
38+
"@timestamp": {
39+
"gte": "now-5m"
40+
}
41+
}
42+
}
43+
]
44+
}
45+
}
46+
}
47+
}
48+
}
49+
],
50+
"triggers": [
51+
{
52+
"query_level_trigger": {
53+
"id": "<%= idgen('trigger', service.name, environment) %>",
54+
"name": "<%= service.name %> (<%= environment %>) is responding with http errors",
55+
"severity": "1",
56+
"condition": {
57+
"script": {
58+
"source": "ctx.results[0].hits.total.value > 0",
59+
"lang": "painless"
60+
}
61+
},
62+
"actions": [
63+
{
64+
"id": "<%= idgen('action-teams', service.name, environment) %>",
65+
"name": "Notify Teams Channel",
66+
"destination_id": "appinfra-msteams",
67+
"message_template": {
68+
"source": "{ \"text\": \"Monitor {{ctx.monitor.name}} just entered alert status. Please investigate the issue.\n - Trigger: {{ctx.trigger.name}}\n - Severity: {{ctx.trigger.severity}}\n - Period start: {{ctx.periodStart}}\n - Period end: {{ctx.periodEnd}}\" }",
69+
"lang" : "mustache"
70+
},
71+
"throttle_enabled": true,
72+
"throttle": {
73+
"value": 10,
74+
"unit": "MINUTES"
75+
},
76+
"subject_template": {
77+
"source": "",
78+
"lang" : "mustache"
79+
}
80+
},
81+
{
82+
"id": "<%= idgen('action-sqs', server.name, agent.index) %>",
83+
"name": "Notify Automation Queue",
84+
"destination_id": "automation-sqs-sns",
85+
"message_template": {
86+
"source": "{ \"type\": \"agent_down\", \"server\": \"<%= server.name %>\", \"agent\": \"fluent-bit.<%= agent.index %>\", \"periodStart\": \"{{ctx.periodStart}}\", \"periodEnd\": \"{{ctx.periodEnd}}\" }",
87+
"lang" : "mustache"
88+
},
89+
"throttle_enabled": true,
90+
"throttle": {
91+
"value": 10,
92+
"unit": "MINUTES"
93+
},
94+
"subject_template": {
95+
"source": "Notify",
96+
"lang" : "mustache"
97+
}
98+
},
99+
{
100+
"$$OMIT": "<%= !serverTag('wildfire') || !serverTag('nonproduction') ? 'true' : 'false' %>",
101+
"id": "<%= idgen('action-nonprod-wf', server.name, agent.index) %>",
102+
"name": "notify",
103+
"destination_id": "wf-nonprod-msteams",
104+
"message_template": {
105+
"source": "{\"text\": \"No logs received from <%= server.name %> between {{ctx.periodStart}} and {{ctx.periodEnd}}\"}",
106+
"lang": "mustache"
107+
},
108+
"throttle_enabled": true,
109+
"subject_template": {
110+
"source": "",
111+
"lang": "mustache"
112+
},
113+
"throttle": {
114+
"value": 240,
115+
"unit": "MINUTES"
116+
}
117+
},
118+
{
119+
"$$OMIT": "<%= !serverTag('wildfire') || serverTag('production') ? 'true' : 'false' %>",
120+
"id": "<%= idgen('action-nonprod-wf', server.name, agent.index) %>",
121+
"name": "notify",
122+
"destination_id": "wf-nonprod-msteams",
123+
"message_template": {
124+
"source": "{\"text\": \"No logs received from <%= server.name %> between {{ctx.periodStart}} and {{ctx.periodEnd}}\"}",
125+
"lang": "mustache"
126+
},
127+
"throttle_enabled": true,
128+
"subject_template": {
129+
"source": "",
130+
"lang": "mustache"
131+
},
132+
"throttle": {
133+
"value": 240,
134+
"unit": "MINUTES"
135+
}
136+
},
137+
{
138+
"$$OMIT": "<%= !serverTag('wildfire') || !serverTag('production') ? 'true' : 'false' %>",
139+
"id": "<%= idgen('action-prod-wf', server.name, agent.index) %>",
140+
"name": "notify",
141+
"destination_id": "wf-prod-msteams",
142+
"message_template": {
143+
"source": "{\"text\": \"No logs received from <%= server.name %> between {{ctx.periodStart}} and {{ctx.periodEnd}}\"}",
144+
"lang": "mustache"
145+
},
146+
"throttle_enabled": true,
147+
"subject_template": {
148+
"source": "",
149+
"lang": "mustache"
150+
},
151+
"throttle": {
152+
"value": 30,
153+
"unit": "MINUTES"
154+
}
155+
}
156+
]
157+
}
158+
}
159+
],
160+
"data_sources": {
161+
"alerts_history_index": ".opendistro-alerting-alert-history-write",
162+
"alerts_history_index_pattern": "<.opendistro-alerting-alert-history-{now/d}-1>",
163+
"alerts_index": ".opendistro-alerting-alerts",
164+
"findings_enabled": false,
165+
"findings_index": ".opensearch-alerting-finding-history-write",
166+
"findings_index_pattern": "<.opensearch-alerting-finding-history-{now/d}-1>",
167+
"query_index": ".opensearch-alerting-queries",
168+
"query_index_mappings_by_type": {}
169+
}
170+
}

workflow-cli/src/broker/broker.api.ts

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,16 @@ export class BrokerApi {
3636
};
3737
}
3838

39-
public async getProjectServices(): Promise<GraphServerInstallsResponseDto[]> {
39+
/**
 * Requests the details of a single service from the broker.
 *
 * @param serviceId Identifier of the service; interpolated into the request path.
 * @returns The pending axios GET request. It resolves with the axios response
 *   object, so the service payload is found on its `data` property.
 */
public getServiceDetails(serviceId: string) {
  const detailsPath = `v1/collection/service/${serviceId}/details`;
  return axios.get(detailsPath, this.axiosOptions);
}
45+
46+
public async getGraphServerInstalls(): Promise<
47+
GraphServerInstallsResponseDto[]
48+
> {
4049
if (!this.serverInstallsReq) {
4150
this.serverInstallsReq = axios.get<GraphServerInstallsResponseDto[]>(
4251
'v1/graph/data/server-installs',

workflow-cli/src/services/opensearch-monitor.service.ts

Lines changed: 55 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,32 @@ import { TYPES } from '../inversify.types';
1212

1313
const ID_MAX_LENGTH = 20;
1414

15-
export interface MonitorConfig {
15+
export interface MonitorConfiguration {
1616
name: string;
1717
server: string;
1818
agent: string;
1919
query_level_trigger_id: string;
2020
teams_channel_action_id: string;
2121
automation_queue_action_id: string;
2222
}
23+
/** Alert configuration variant tagged `type: 'agent'`; it declares no further fields. */
export interface AlertAgentConfiguration {
24+
type: 'agent';
25+
}
26+
/** Alert configuration variant tagged `type: 'server'`; it declares no further fields. */
export interface AlertServerConfiguration {
27+
type: 'server';
28+
}
29+
30+
/** Environment names that a service alert configuration may list. */
export type EnvironmentNames = 'tools' | 'development' | 'test' | 'production';
31+
/**
 * Alert configuration variant tagged `type: 'service'`.
 *
 * `serviceId` is the id handed to `BrokerApi.getServiceDetails`, and
 * `environments` selects the environments for which `sync` renders a monitor.
 */
export interface AlertServiceConfiguration {
32+
type: 'service';
33+
serviceId: string;
34+
environments: EnvironmentNames[];
35+
}
36+
37+
/**
 * Union of the alert configuration variants, discriminated by `type`.
 * `sync` types the parsed alert configuration file as this union.
 */
export type AlertConfiguration =
38+
| AlertAgentConfiguration
39+
| AlertServerConfiguration
40+
| AlertServiceConfiguration;
2341

2442
const ALERT_CONFIG_DIR = path.resolve(
2543
__dirname,
@@ -45,9 +63,23 @@ export default class OpenSearchMonitorService extends AwsService {
4563
}
4664
return 1;
4765
}
66+
4867
public async sync(settings: WorkflowSettings): Promise<any> {
49-
const servers = await this.brokerApi.getProjectServices();
68+
const servers = await this.brokerApi.getGraphServerInstalls();
69+
const environments: EnvironmentNames[] = [
70+
'tools',
71+
'development',
72+
'test',
73+
'production',
74+
];
5075
let monitors: any[] = [];
76+
// Builds a short deterministic id: the arguments are joined with commas,
// hashed with SHA-256, and the hex digest is cut to ID_MAX_LENGTH characters.
const idgen = (...args: any) => {
  const hasher = crypto.createHash('sha256');
  hasher.update(args.join(','));
  const hexDigest = hasher.digest('hex');
  return hexDigest.slice(0, ID_MAX_LENGTH);
};
5183

5284
if (settings.dryRun) {
5385
console.log('Dry run: No changes will be made');
@@ -61,26 +93,39 @@ export default class OpenSearchMonitorService extends AwsService {
6193
path.resolve(ALERT_CONFIG_DIR, alertFile),
6294
{ encoding: 'utf8' },
6395
);
64-
const alertConfig = JSON.parse(alertConfigStr);
96+
const alertConfig: AlertConfiguration = JSON.parse(alertConfigStr);
6597
const alertMonitorStr = fs.readFileSync(
6698
path.join(ALERT_CONFIG_DIR, `${alertFile.slice(0, -12)}.monitor.json`),
6799
{ encoding: 'utf8' },
68100
);
69101

102+
// Service alerts: render the monitor template once for every environment
// that the alert configuration lists.
if (alertConfig.type === 'service') {
  // getServiceDetails resolves with the axios response wrapper. The template
  // reads `service.name`, so it must receive the response payload (`data`),
  // not the wrapper, otherwise the name renders as undefined.
  const { data: serviceData } = await this.brokerApi.getServiceDetails(
    alertConfig.serviceId,
  );
  // NOTE(review): the per-server loop that follows this block still runs for
  // service configurations, and the service template's later actions still
  // reference `server`, `agent` and `serverTag`, none of which are supplied
  // here. Confirm both are intended before enabling this monitor.
  for (const environment of environments) {
    if (!alertConfig.environments.includes(environment)) {
      continue;
    }
    const renderedMonitor = ejs.render(alertMonitorStr, {
      service: serviceData,
      environment,
      idgen,
    });
    monitors.push(JSON.parse(renderedMonitor));
  }
}
121+
70122
for (const server of servers) {
71123
const fbInstance = this.getFluentBitInstance(server.instances);
72124
if (!fbInstance) {
73125
// skip
74126
continue;
75127
}
76128
const agentCount = this.getAgentCount(fbInstance);
77-
const idgen = (...args: any) => {
78-
return crypto
79-
.createHash('sha256')
80-
.update(args.join())
81-
.digest('hex')
82-
.substring(0, ID_MAX_LENGTH);
83-
};
84129
const installHas = (id: string) => {
85130
return (
86131
fbInstance.edgeProp &&

0 commit comments

Comments
 (0)