From e64718cb6bd3d0ac24c69e19ec0070910fa94a18 Mon Sep 17 00:00:00 2001 From: Simon Pasquier Date: Wed, 19 Aug 2020 16:41:02 +0200 Subject: [PATCH] pkg: add prometheus_operator_reconcile_operations_total metric (#3415) * pkg: add prometheus_operator_reconcile_operations_total metric We already have the `prometheus_operator_reconcile_errors_total` metric to track the number of reconciliation attempts that failed, but we lack a metric for the total number of attempts, which makes it harder to alert on the failure rate. With this change, we can compute the ratio of reconciliations that failed. Signed-off-by: Simon Pasquier * Update alert definition with new metric --- example/mixin/alerts.yaml | 6 +++--- jsonnet/mixin/alerts/alerts.libsonnet | 4 ++-- pkg/alertmanager/operator.go | 1 + pkg/operator/operator.go | 13 ++++++++++++- pkg/prometheus/operator.go | 1 + pkg/thanos/operator.go | 1 + 6 files changed, 20 insertions(+), 6 deletions(-) diff --git a/example/mixin/alerts.yaml b/example/mixin/alerts.yaml index 92fc1f2a2..c4fcb1d0f 100644 --- a/example/mixin/alerts.yaml +++ b/example/mixin/alerts.yaml @@ -13,11 +13,11 @@ groups: severity: warning - alert: PrometheusOperatorReconcileErrors annotations: - description: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace - }} Namespace. + description: '{{ $value | humanizePercentage }} of reconciling operations failed + for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.' summary: Errors while reconciling controller. 
expr: | - rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1 + (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator"}[5m]))) > 0.1 for: 10m labels: severity: warning diff --git a/jsonnet/mixin/alerts/alerts.libsonnet b/jsonnet/mixin/alerts/alerts.libsonnet index 7e0d328ab..7782f7646 100644 --- a/jsonnet/mixin/alerts/alerts.libsonnet +++ b/jsonnet/mixin/alerts/alerts.libsonnet @@ -21,13 +21,13 @@ { alert: 'PrometheusOperatorReconcileErrors', expr: ||| - rate(prometheus_operator_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1 + (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{%(prometheusOperatorSelector)s}[5m]))) > 0.1 ||| % $._config, labels: { severity: 'warning', }, annotations: { - description: 'Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace }} Namespace.', + description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.', summary: 'Errors while reconciling controller.', }, 'for': '10m', diff --git a/pkg/alertmanager/operator.go b/pkg/alertmanager/operator.go index 979414549..91748d0da 100644 --- a/pkg/alertmanager/operator.go +++ b/pkg/alertmanager/operator.go @@ -282,6 +282,7 @@ func (c *Operator) processNextWorkItem() bool { } defer c.queue.Done(key) + c.metrics.ReconcileCounter().Inc() err := c.sync(key.(string)) if err == nil { c.queue.Forget(key) diff --git a/pkg/operator/operator.go b/pkg/operator/operator.go index 8ca015f1a..b0c7f7efc 100644 --- a/pkg/operator/operator.go +++ b/pkg/operator/operator.go @@ -31,6 +31,7 @@ type Metrics struct { listFailedCounter 
prometheus.Counter watchCounter prometheus.Counter watchFailedCounter prometheus.Counter + reconcileCounter prometheus.Counter reconcileErrorsCounter prometheus.Counter stsDeleteCreateCounter prometheus.Counter // triggerByCounter is a set of counters keeping track of the amount @@ -46,9 +47,13 @@ func NewMetrics(name string, r prometheus.Registerer) *Metrics { reg := prometheus.WrapRegistererWith(prometheus.Labels{"controller": name}, r) m := Metrics{ reg: reg, + reconcileCounter: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_operator_reconcile_operations_total", + Help: "Total number of reconcile operations", + }), reconcileErrorsCounter: prometheus.NewCounter(prometheus.CounterOpts{ Name: "prometheus_operator_reconcile_errors_total", - Help: "Number of errors that occurred while reconciling the statefulset", + Help: "Number of errors that occurred during reconcile operations", }), triggerByCounter: prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "prometheus_operator_triggered_total", @@ -77,6 +82,7 @@ func NewMetrics(name string, r prometheus.Registerer) *Metrics { }), } m.reg.MustRegister( + m.reconcileCounter, m.reconcileErrorsCounter, m.triggerByCounter, m.stsDeleteCreateCounter, @@ -88,6 +94,11 @@ func NewMetrics(name string, r prometheus.Registerer) *Metrics { return &m } +// ReconcileCounter returns a counter to track attempted reconciliations. +func (m *Metrics) ReconcileCounter() prometheus.Counter { + return m.reconcileCounter +} + // ReconcileErrorsCounter returns a counter to track reconciliation errors. 
func (m *Metrics) ReconcileErrorsCounter() prometheus.Counter { return m.reconcileErrorsCounter diff --git a/pkg/prometheus/operator.go b/pkg/prometheus/operator.go index 9bc4cb4d4..fe717f21d 100644 --- a/pkg/prometheus/operator.go +++ b/pkg/prometheus/operator.go @@ -1093,6 +1093,7 @@ func (c *Operator) processNextWorkItem() bool { } defer c.queue.Done(key) + c.metrics.ReconcileCounter().Inc() err := c.sync(key.(string)) if err == nil { c.queue.Forget(key) diff --git a/pkg/thanos/operator.go b/pkg/thanos/operator.go index 5f2efb573..18cdf2df6 100644 --- a/pkg/thanos/operator.go +++ b/pkg/thanos/operator.go @@ -554,6 +554,7 @@ func (o *Operator) processNextWorkItem() bool { } defer o.queue.Done(key) + o.metrics.ReconcileCounter().Inc() err := o.sync(key.(string)) if err == nil { o.queue.Forget(key)