From a6e727c7d446206815200313d3935f91fa0b73a7 Mon Sep 17 00:00:00 2001 From: DarkIsDude Date: Thu, 23 Apr 2026 16:24:30 +0200 Subject: [PATCH 1/3] =?UTF-8?q?=E2=9C=A8=20define=20new=20alerts=20and=20e?= =?UTF-8?q?nable=20monitor=20for=20Zenko=20Operator?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue: ZENKO-5262 --- .github/workflows/alerts.yaml | 10 + monitoring/zenko-operator/alerts.test.yaml | 223 +++++++++++++++++++++ monitoring/zenko-operator/alerts.yaml | 73 +++++++ solution/build.sh | 5 + 4 files changed, 311 insertions(+) create mode 100644 monitoring/zenko-operator/alerts.test.yaml create mode 100644 monitoring/zenko-operator/alerts.yaml diff --git a/.github/workflows/alerts.yaml b/.github/workflows/alerts.yaml index d378eb72d4..cfaf2def81 100644 --- a/.github/workflows/alerts.yaml +++ b/.github/workflows/alerts.yaml @@ -87,3 +87,13 @@ jobs: mongo_jobs=zenko/data-db-mongodb-sharded-shard.* lifecycle_jobs=artesca-data-backbeat-lifecycle-.*-headless github_token: ${{ steps.app-token.outputs.token }} + + - name: Render and test zenko-operator alerts + uses: scality/action-prom-render-test@1.0.3 + with: + alert_file_path: monitoring/zenko-operator/alerts.yaml + test_file_path: monitoring/zenko-operator/alerts.test.yaml + alert_inputs: | + namespace=zenko + job=zenko-operator-metrics + github_token: ${{ steps.app-token.outputs.token }} diff --git a/monitoring/zenko-operator/alerts.test.yaml b/monitoring/zenko-operator/alerts.test.yaml new file mode 100644 index 0000000000..55f7cc308b --- /dev/null +++ b/monitoring/zenko-operator/alerts.test.yaml @@ -0,0 +1,223 @@ +evaluation_interval: 1m +rule_files: + - alerts.rendered.yaml + +tests: + # ZenkoOperatorReconcileErrors - no errors + ################################################################################################## + - name: No reconcile errors + interval: 1m + input_series: + - series: zenko_operator_reconcile_errors_total{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data"} + values: 0x10 + alert_rule_test: + - alertname: ZenkoOperatorReconcileErrors + eval_time: 10m + exp_alerts: [] + + # ZenkoOperatorReconcileErrors - errors trigger alert after for: 2m + ################################################################################################## + - name: Reconcile errors trigger alert + interval: 1m + input_series: + - series: zenko_operator_reconcile_errors_total{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data"} + values: 0+1x20 + alert_rule_test: + - alertname: ZenkoOperatorReconcileErrors + eval_time: 2m + exp_alerts: [] + - alertname: ZenkoOperatorReconcileErrors + eval_time: 4m + exp_alerts: + - exp_labels: + severity: warning + job: zenko-operator-metrics + controller: zenko + namespace: zenko + name: artesca-data + exp_annotations: + summary: 'Zenko operator reconciliation errors detected' + description: 'The Zenko operator controller zenko is experiencing reconciliation errors for instance zenko/artesca-data.' + + # ZenkoOperatorDeploymentFailed - condition never true + ################################################################################################## + - name: No deployment failure + interval: 1m + input_series: + - series: zenko_operator_instance_condition{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data",condition="DeploymentFailure"} + values: 0x20 + alert_rule_test: + - alertname: ZenkoOperatorDeploymentFailed + eval_time: 20m + exp_alerts: [] + + # ZenkoOperatorDeploymentFailed - condition fires after for: 5m + ################################################################################################## + - name: Deployment failure triggers alert + interval: 1m + input_series: + - series: zenko_operator_instance_condition{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data",condition="DeploymentFailure"} + values: 0x3 1x20 + alert_rule_test: + - alertname: ZenkoOperatorDeploymentFailed + eval_time: 7m + exp_alerts: [] + - alertname: ZenkoOperatorDeploymentFailed + eval_time: 9m + exp_alerts: + - exp_labels: + severity: warning + job: zenko-operator-metrics + controller: zenko + namespace: zenko + name: artesca-data + condition: DeploymentFailure + exp_annotations: + summary: 'Zenko operator deployment failure' + description: 'Zenko instance zenko/artesca-data has a DeploymentFailure condition. The operator is unable to reconcile the instance to the desired state.' + + # ZenkoOperatorSecurityOptionEnabled - skipTLSVerify never enabled + ################################################################################################## + - name: No security option enabled + interval: 1m + input_series: + - series: zenko_operator_debug_option{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data",option="skipTLSVerify"} + values: 0x30 + alert_rule_test: + - alertname: ZenkoOperatorSecurityOptionEnabled + eval_time: 30m + exp_alerts: [] + + # ZenkoOperatorSecurityOptionEnabled - skipTLSVerify fires after for: 15m + ################################################################################################## + - name: skipTLSVerify triggers security alert + interval: 1m + input_series: + - series: zenko_operator_debug_option{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data",option="skipTLSVerify"} + values: 0x3 1x30 + alert_rule_test: + - alertname: ZenkoOperatorSecurityOptionEnabled + eval_time: 17m + exp_alerts: [] + - alertname: ZenkoOperatorSecurityOptionEnabled + eval_time: 19m + exp_alerts: + - exp_labels: + severity: warning + job: zenko-operator-metrics + controller: zenko + namespace: zenko + name: artesca-data + option: skipTLSVerify + exp_annotations: + summary: 'Security-bypassing option enabled on Zenko instance' + description: 'The option skipTLSVerify is enabled on Zenko instance zenko/artesca-data. This option bypasses security controls and must not remain active in production environments.' + + # ZenkoOperatorSecurityOptionEnabled - noImplicitDeny fires after for: 15m + ################################################################################################## + - name: noImplicitDeny triggers security alert + interval: 1m + input_series: + - series: zenko_operator_debug_option{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data",option="noImplicitDeny"} + values: 0x3 1x30 + alert_rule_test: + - alertname: ZenkoOperatorSecurityOptionEnabled + eval_time: 19m + exp_alerts: + - exp_labels: + severity: warning + job: zenko-operator-metrics + controller: zenko + namespace: zenko + name: artesca-data + option: noImplicitDeny + exp_annotations: + summary: 'Security-bypassing option enabled on Zenko instance' + description: 'The option noImplicitDeny is enabled on Zenko instance zenko/artesca-data. This option bypasses security controls and must not remain active in production environments.' + + # ZenkoOperatorTestConfigActive - cron override fires after for: 1h + ################################################################################################## + - name: Cron override does not fire before 1h + interval: 1m + input_series: + - series: zenko_operator_debug_option{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data",option="lifecycleConductorCronOverride"} + values: 1x120 + alert_rule_test: + - alertname: ZenkoOperatorTestConfigActive + eval_time: 59m + exp_alerts: [] + + - name: Cron override triggers test config alert after 1h + interval: 1m + input_series: + - series: zenko_operator_debug_option{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data",option="lifecycleConductorCronOverride"} + values: 1x120 + alert_rule_test: + - alertname: ZenkoOperatorTestConfigActive + eval_time: 61m + exp_alerts: + - exp_labels: + severity: warning + job: zenko-operator-metrics + controller: zenko + namespace: zenko + name: artesca-data + option: lifecycleConductorCronOverride + exp_annotations: + summary: 'Test/debug configuration active on Zenko instance' + description: 'The option lifecycleConductorCronOverride has been active on Zenko instance zenko/artesca-data for more than 1 hour. This option is intended for testing only and should not remain enabled in production.' + + # ZenkoOperatorDRPhaseFailed - Bootstrap:Failed fires after for: 1m + ################################################################################################## + - name: DR phase not failed + interval: 1m + input_series: + - series: zenko_operator_instance_phase{namespace="zenko",job="zenko-operator-metrics",controller="drsink",name="artesca-data-dr",phase="Bootstrap:Failed"} + values: 0x10 + alert_rule_test: + - alertname: ZenkoOperatorDRPhaseFailed + eval_time: 10m + exp_alerts: [] + + - name: Bootstrap Failed triggers DR phase alert + interval: 1m + input_series: + - series: zenko_operator_instance_phase{namespace="zenko",job="zenko-operator-metrics",controller="drsink",name="artesca-data-dr",phase="Bootstrap:Failed"} + values: 0x2 1x20 + alert_rule_test: + - alertname: ZenkoOperatorDRPhaseFailed + eval_time: 2m + exp_alerts: [] + - alertname: ZenkoOperatorDRPhaseFailed + eval_time: 4m + exp_alerts: + - exp_labels: + severity: critical + job: zenko-operator-metrics + controller: drsink + namespace: zenko + name: artesca-data-dr + phase: Bootstrap:Failed + exp_annotations: + summary: 'DR controller entered a failed phase' + description: 'DR controller drsink for instance zenko/artesca-data-dr is in phase Bootstrap:Failed. Manual intervention is required to recover.' + + - name: Failback Failed triggers DR phase alert + interval: 1m + input_series: + - series: zenko_operator_instance_phase{namespace="zenko",job="zenko-operator-metrics",controller="drsink",name="artesca-data-dr",phase="Failback:Failed"} + values: 0x2 1x20 + alert_rule_test: + - alertname: ZenkoOperatorDRPhaseFailed + eval_time: 4m + exp_alerts: + - exp_labels: + severity: critical + job: zenko-operator-metrics + controller: drsink + namespace: zenko + name: artesca-data-dr + phase: Failback:Failed + exp_annotations: + summary: 'DR controller entered a failed phase' + description: 'DR controller drsink for instance zenko/artesca-data-dr is in phase Failback:Failed. Manual intervention is required to recover.' diff --git a/monitoring/zenko-operator/alerts.yaml b/monitoring/zenko-operator/alerts.yaml new file mode 100644 index 0000000000..5baddbfde5 --- /dev/null +++ b/monitoring/zenko-operator/alerts.yaml @@ -0,0 +1,73 @@ +x-inputs: + - name: namespace + type: constant + value: zenko + - name: job + type: constant + value: zenko-operator-metrics + +groups: +- name: ZenkoOperator + rules: + + - alert: ZenkoOperatorReconcileErrors + expr: | + rate(zenko_operator_reconcile_errors_total{namespace="${namespace}", job="${job}"}[5m]) > 0 + for: 2m + labels: + severity: warning + annotations: + summary: 'Zenko operator reconciliation errors detected' + description: >- + The Zenko operator controller {{ $labels.controller }} is experiencing reconciliation + errors for instance {{ $labels.namespace }}/{{ $labels.name }}. + + - alert: ZenkoOperatorDeploymentFailed + expr: | + zenko_operator_instance_condition{namespace="${namespace}", job="${job}", condition="DeploymentFailure"} == 1 + for: 5m + labels: + severity: warning + annotations: + summary: 'Zenko operator deployment failure' + description: >- + Zenko instance {{ $labels.namespace }}/{{ $labels.name }} has a DeploymentFailure condition. + The operator is unable to reconcile the instance to the desired state. + + - alert: ZenkoOperatorSecurityOptionEnabled + expr: | + zenko_operator_debug_option{namespace="${namespace}", job="${job}", option=~"skipTLSVerify|noImplicitDeny"} == 1 + for: 15m + labels: + severity: warning + annotations: + summary: 'Security-bypassing option enabled on Zenko instance' + description: >- + The option {{ $labels.option }} is enabled on Zenko instance + {{ $labels.namespace }}/{{ $labels.name }}. This option bypasses security + controls and must not remain active in production environments. + + - alert: ZenkoOperatorTestConfigActive + expr: | + zenko_operator_debug_option{namespace="${namespace}", job="${job}", option=~".*CronOverride|.*Verbose|triggerTransitions.*|triggerExpirations.*"} == 1 + for: 1h + labels: + severity: warning + annotations: + summary: 'Test/debug configuration active on Zenko instance' + description: >- + The option {{ $labels.option }} has been active on Zenko instance + {{ $labels.namespace }}/{{ $labels.name }} for more than 1 hour. + This option is intended for testing only and should not remain enabled in production. + + - alert: ZenkoOperatorDRPhaseFailed + expr: | + zenko_operator_instance_phase{namespace="${namespace}", job="${job}", phase=~"Bootstrap:Failed|Failback:Failed"} == 1 + for: 1m + labels: + severity: critical + annotations: + summary: 'DR controller entered a failed phase' + description: >- + DR controller {{ $labels.controller }} for instance {{ $labels.namespace }}/{{ $labels.name }} + is in phase {{ $labels.phase }}. Manual intervention is required to recover. diff --git a/solution/build.sh b/solution/build.sh index 21fe95024e..e0798dc0b9 100755 --- a/solution/build.sh +++ b/solution/build.sh @@ -77,6 +77,11 @@ spec: image: name: zenko-operator tag: '$(zenko_operator_tag)' + metrics: + enabled: true + scheme: http + port: 8080 + path: /metrics EOF } From 9a9f3f460ac1f6113ff8cdcfa4211dbeeeff6518 Mon Sep 17 00:00:00 2001 From: DarkIsDude Date: Thu, 23 Apr 2026 18:39:06 +0200 Subject: [PATCH 2/3] =?UTF-8?q?=F0=9F=99=88=20Add=20rendered=20tests=20ale?= =?UTF-8?q?rts=20to=20gitignore?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue: ZENKO-5262 --- monitoring/zenko-operator/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 monitoring/zenko-operator/.gitignore diff --git a/monitoring/zenko-operator/.gitignore b/monitoring/zenko-operator/.gitignore new file mode 100644 index 0000000000..502c44bbfa --- /dev/null +++ b/monitoring/zenko-operator/.gitignore @@ -0,0 +1 @@ +alerts.rendered.yaml From 62cece659f1e1e87b09ab728ed7aab9f12535bab Mon Sep 17 00:00:00 2001 From: DarkIsDude Date: Fri, 24 Apr 2026 12:10:31 +0200 Subject: [PATCH 3/3] =?UTF-8?q?=F0=9F=94=A5=20remove=20alerts=20from=20zen?= =?UTF-8?q?ko=20and=20move=20them=20to=20zenko-operator?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue: ZENKO-5262 --- .github/workflows/alerts.yaml | 10 - monitoring/zenko-operator/.gitignore | 1 - monitoring/zenko-operator/alerts.test.yaml | 223 --------------------- monitoring/zenko-operator/alerts.yaml | 73 ------- solution/deps.yaml | 1 + 5 files changed, 1 insertion(+), 307 deletions(-) delete mode 100644 monitoring/zenko-operator/.gitignore delete mode 100644 monitoring/zenko-operator/alerts.test.yaml delete mode 100644 monitoring/zenko-operator/alerts.yaml diff --git a/.github/workflows/alerts.yaml b/.github/workflows/alerts.yaml index cfaf2def81..d378eb72d4 100644 --- a/.github/workflows/alerts.yaml +++ b/.github/workflows/alerts.yaml @@ -87,13 +87,3 @@ jobs: mongo_jobs=zenko/data-db-mongodb-sharded-shard.* lifecycle_jobs=artesca-data-backbeat-lifecycle-.*-headless github_token: ${{ steps.app-token.outputs.token }} - - - name: Render and test zenko-operator alerts - uses: scality/action-prom-render-test@1.0.3 - with: - alert_file_path: monitoring/zenko-operator/alerts.yaml - test_file_path: monitoring/zenko-operator/alerts.test.yaml - alert_inputs: | - namespace=zenko - job=zenko-operator-metrics - github_token: ${{ steps.app-token.outputs.token }} diff --git a/monitoring/zenko-operator/.gitignore b/monitoring/zenko-operator/.gitignore deleted file mode 100644 index 502c44bbfa..0000000000 --- a/monitoring/zenko-operator/.gitignore +++ /dev/null @@ -1 +0,0 @@ -alerts.rendered.yaml diff --git a/monitoring/zenko-operator/alerts.test.yaml b/monitoring/zenko-operator/alerts.test.yaml deleted file mode 100644 index 55f7cc308b..0000000000 --- a/monitoring/zenko-operator/alerts.test.yaml +++ /dev/null @@ -1,223 +0,0 @@ -evaluation_interval: 1m -rule_files: - - alerts.rendered.yaml - -tests: - # ZenkoOperatorReconcileErrors - no errors - ################################################################################################## - - name: No reconcile errors - interval: 1m - input_series: - - series: zenko_operator_reconcile_errors_total{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data"} - values: 0x10 - alert_rule_test: - - alertname: ZenkoOperatorReconcileErrors - eval_time: 10m - exp_alerts: [] - - # ZenkoOperatorReconcileErrors - errors trigger alert after for: 2m - ################################################################################################## - - name: Reconcile errors trigger alert - interval: 1m - input_series: - - series: zenko_operator_reconcile_errors_total{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data"} - values: 0+1x20 - alert_rule_test: - - alertname: ZenkoOperatorReconcileErrors - eval_time: 2m - exp_alerts: [] - - alertname: ZenkoOperatorReconcileErrors - eval_time: 4m - exp_alerts: - - exp_labels: - severity: warning - job: zenko-operator-metrics - controller: zenko - namespace: zenko - name: artesca-data - exp_annotations: - summary: 'Zenko operator reconciliation errors detected' - description: 'The Zenko operator controller zenko is experiencing reconciliation errors for instance zenko/artesca-data.' - - # ZenkoOperatorDeploymentFailed - condition never true - ################################################################################################## - - name: No deployment failure - interval: 1m - input_series: - - series: zenko_operator_instance_condition{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data",condition="DeploymentFailure"} - values: 0x20 - alert_rule_test: - - alertname: ZenkoOperatorDeploymentFailed - eval_time: 20m - exp_alerts: [] - - # ZenkoOperatorDeploymentFailed - condition fires after for: 5m - ################################################################################################## - - name: Deployment failure triggers alert - interval: 1m - input_series: - - series: zenko_operator_instance_condition{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data",condition="DeploymentFailure"} - values: 0x3 1x20 - alert_rule_test: - - alertname: ZenkoOperatorDeploymentFailed - eval_time: 7m - exp_alerts: [] - - alertname: ZenkoOperatorDeploymentFailed - eval_time: 9m - exp_alerts: - - exp_labels: - severity: warning - job: zenko-operator-metrics - controller: zenko - namespace: zenko - name: artesca-data - condition: DeploymentFailure - exp_annotations: - summary: 'Zenko operator deployment failure' - description: 'Zenko instance zenko/artesca-data has a DeploymentFailure condition. The operator is unable to reconcile the instance to the desired state.' - - # ZenkoOperatorSecurityOptionEnabled - skipTLSVerify never enabled - ################################################################################################## - - name: No security option enabled - interval: 1m - input_series: - - series: zenko_operator_debug_option{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data",option="skipTLSVerify"} - values: 0x30 - alert_rule_test: - - alertname: ZenkoOperatorSecurityOptionEnabled - eval_time: 30m - exp_alerts: [] - - # ZenkoOperatorSecurityOptionEnabled - skipTLSVerify fires after for: 15m - ################################################################################################## - - name: skipTLSVerify triggers security alert - interval: 1m - input_series: - - series: zenko_operator_debug_option{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data",option="skipTLSVerify"} - values: 0x3 1x30 - alert_rule_test: - - alertname: ZenkoOperatorSecurityOptionEnabled - eval_time: 17m - exp_alerts: [] - - alertname: ZenkoOperatorSecurityOptionEnabled - eval_time: 19m - exp_alerts: - - exp_labels: - severity: warning - job: zenko-operator-metrics - controller: zenko - namespace: zenko - name: artesca-data - option: skipTLSVerify - exp_annotations: - summary: 'Security-bypassing option enabled on Zenko instance' - description: 'The option skipTLSVerify is enabled on Zenko instance zenko/artesca-data. This option bypasses security controls and must not remain active in production environments.' - - # ZenkoOperatorSecurityOptionEnabled - noImplicitDeny fires after for: 15m - ################################################################################################## - - name: noImplicitDeny triggers security alert - interval: 1m - input_series: - - series: zenko_operator_debug_option{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data",option="noImplicitDeny"} - values: 0x3 1x30 - alert_rule_test: - - alertname: ZenkoOperatorSecurityOptionEnabled - eval_time: 19m - exp_alerts: - - exp_labels: - severity: warning - job: zenko-operator-metrics - controller: zenko - namespace: zenko - name: artesca-data - option: noImplicitDeny - exp_annotations: - summary: 'Security-bypassing option enabled on Zenko instance' - description: 'The option noImplicitDeny is enabled on Zenko instance zenko/artesca-data. This option bypasses security controls and must not remain active in production environments.' - - # ZenkoOperatorTestConfigActive - cron override fires after for: 1h - ################################################################################################## - - name: Cron override does not fire before 1h - interval: 1m - input_series: - - series: zenko_operator_debug_option{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data",option="lifecycleConductorCronOverride"} - values: 1x120 - alert_rule_test: - - alertname: ZenkoOperatorTestConfigActive - eval_time: 59m - exp_alerts: [] - - - name: Cron override triggers test config alert after 1h - interval: 1m - input_series: - - series: zenko_operator_debug_option{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data",option="lifecycleConductorCronOverride"} - values: 1x120 - alert_rule_test: - - alertname: ZenkoOperatorTestConfigActive - eval_time: 61m - exp_alerts: - - exp_labels: - severity: warning - job: zenko-operator-metrics - controller: zenko - namespace: zenko - name: artesca-data - option: lifecycleConductorCronOverride - exp_annotations: - summary: 'Test/debug configuration active on Zenko instance' - description: 'The option lifecycleConductorCronOverride has been active on Zenko instance zenko/artesca-data for more than 1 hour. This option is intended for testing only and should not remain enabled in production.' - - # ZenkoOperatorDRPhaseFailed - Bootstrap:Failed fires after for: 1m - ################################################################################################## - - name: DR phase not failed - interval: 1m - input_series: - - series: zenko_operator_instance_phase{namespace="zenko",job="zenko-operator-metrics",controller="drsink",name="artesca-data-dr",phase="Bootstrap:Failed"} - values: 0x10 - alert_rule_test: - - alertname: ZenkoOperatorDRPhaseFailed - eval_time: 10m - exp_alerts: [] - - - name: Bootstrap Failed triggers DR phase alert - interval: 1m - input_series: - - series: zenko_operator_instance_phase{namespace="zenko",job="zenko-operator-metrics",controller="drsink",name="artesca-data-dr",phase="Bootstrap:Failed"} - values: 0x2 1x20 - alert_rule_test: - - alertname: ZenkoOperatorDRPhaseFailed - eval_time: 2m - exp_alerts: [] - - alertname: ZenkoOperatorDRPhaseFailed - eval_time: 4m - exp_alerts: - - exp_labels: - severity: critical - job: zenko-operator-metrics - controller: drsink - namespace: zenko - name: artesca-data-dr - phase: Bootstrap:Failed - exp_annotations: - summary: 'DR controller entered a failed phase' - description: 'DR controller drsink for instance zenko/artesca-data-dr is in phase Bootstrap:Failed. Manual intervention is required to recover.' - - - name: Failback Failed triggers DR phase alert - interval: 1m - input_series: - - series: zenko_operator_instance_phase{namespace="zenko",job="zenko-operator-metrics",controller="drsink",name="artesca-data-dr",phase="Failback:Failed"} - values: 0x2 1x20 - alert_rule_test: - - alertname: ZenkoOperatorDRPhaseFailed - eval_time: 4m - exp_alerts: - - exp_labels: - severity: critical - job: zenko-operator-metrics - controller: drsink - namespace: zenko - name: artesca-data-dr - phase: Failback:Failed - exp_annotations: - summary: 'DR controller entered a failed phase' - description: 'DR controller drsink for instance zenko/artesca-data-dr is in phase Failback:Failed. Manual intervention is required to recover.' diff --git a/monitoring/zenko-operator/alerts.yaml b/monitoring/zenko-operator/alerts.yaml deleted file mode 100644 index 5baddbfde5..0000000000 --- a/monitoring/zenko-operator/alerts.yaml +++ /dev/null @@ -1,73 +0,0 @@ -x-inputs: - - name: namespace - type: constant - value: zenko - - name: job - type: constant - value: zenko-operator-metrics - -groups: -- name: ZenkoOperator - rules: - - - alert: ZenkoOperatorReconcileErrors - expr: | - rate(zenko_operator_reconcile_errors_total{namespace="${namespace}", job="${job}"}[5m]) > 0 - for: 2m - labels: - severity: warning - annotations: - summary: 'Zenko operator reconciliation errors detected' - description: >- - The Zenko operator controller {{ $labels.controller }} is experiencing reconciliation - errors for instance {{ $labels.namespace }}/{{ $labels.name }}. - - - alert: ZenkoOperatorDeploymentFailed - expr: | - zenko_operator_instance_condition{namespace="${namespace}", job="${job}", condition="DeploymentFailure"} == 1 - for: 5m - labels: - severity: warning - annotations: - summary: 'Zenko operator deployment failure' - description: >- - Zenko instance {{ $labels.namespace }}/{{ $labels.name }} has a DeploymentFailure condition. - The operator is unable to reconcile the instance to the desired state. - - - alert: ZenkoOperatorSecurityOptionEnabled - expr: | - zenko_operator_debug_option{namespace="${namespace}", job="${job}", option=~"skipTLSVerify|noImplicitDeny"} == 1 - for: 15m - labels: - severity: warning - annotations: - summary: 'Security-bypassing option enabled on Zenko instance' - description: >- - The option {{ $labels.option }} is enabled on Zenko instance - {{ $labels.namespace }}/{{ $labels.name }}. This option bypasses security - controls and must not remain active in production environments. - - - alert: ZenkoOperatorTestConfigActive - expr: | - zenko_operator_debug_option{namespace="${namespace}", job="${job}", option=~".*CronOverride|.*Verbose|triggerTransitions.*|triggerExpirations.*"} == 1 - for: 1h - labels: - severity: warning - annotations: - summary: 'Test/debug configuration active on Zenko instance' - description: >- - The option {{ $labels.option }} has been active on Zenko instance - {{ $labels.namespace }}/{{ $labels.name }} for more than 1 hour. - This option is intended for testing only and should not remain enabled in production. - - - alert: ZenkoOperatorDRPhaseFailed - expr: | - zenko_operator_instance_phase{namespace="${namespace}", job="${job}", phase=~"Bootstrap:Failed|Failback:Failed"} == 1 - for: 1m - labels: - severity: critical - annotations: - summary: 'DR controller entered a failed phase' - description: >- - DR controller {{ $labels.controller }} for instance {{ $labels.namespace }}/{{ $labels.name }} - is in phase {{ $labels.phase }}. Manual intervention is required to recover. diff --git a/solution/deps.yaml b/solution/deps.yaml index 1a9f564eb0..5ff15c3143 100644 --- a/solution/deps.yaml +++ b/solution/deps.yaml @@ -129,6 +129,7 @@ vault: envsubst: VAULT_TAG zenko-operator: sourceRegistry: ghcr.io/scality + dashboard: zenko-operator/zenko-operator-dashboards image: zenko-operator tag: v1.8.5 envsubst: ZENKO_OPERATOR_TAG