diff --git a/example/mixin/alerts.yaml b/example/mixin/alerts.yaml index 92fc1f2a2..c4fcb1d0f 100644 --- a/example/mixin/alerts.yaml +++ b/example/mixin/alerts.yaml @@ -13,11 +13,11 @@ groups: severity: warning - alert: PrometheusOperatorReconcileErrors annotations: - description: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace - }} Namespace. + description: '{{ $value | humanizePercentage }} of reconciling operations failed + for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.' summary: Errors while reconciling controller. expr: | - rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1 + (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator"}[5m]))) > 0.1 for: 10m labels: severity: warning diff --git a/jsonnet/mixin/alerts/alerts.libsonnet b/jsonnet/mixin/alerts/alerts.libsonnet index 7e0d328ab..7782f7646 100644 --- a/jsonnet/mixin/alerts/alerts.libsonnet +++ b/jsonnet/mixin/alerts/alerts.libsonnet @@ -21,13 +21,13 @@ { alert: 'PrometheusOperatorReconcileErrors', expr: ||| - rate(prometheus_operator_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]) > 0.1 + (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{%(prometheusOperatorSelector)s}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{%(prometheusOperatorSelector)s}[5m]))) > 0.1 ||| % $._config, labels: { severity: 'warning', }, annotations: { - description: 'Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace }} Namespace.', + description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.', summary: 'Errors while reconciling controller.', }, 'for': '10m', diff --git 
a/pkg/alertmanager/operator.go b/pkg/alertmanager/operator.go index 979414549..91748d0da 100644 --- a/pkg/alertmanager/operator.go +++ b/pkg/alertmanager/operator.go @@ -282,6 +282,7 @@ func (c *Operator) processNextWorkItem() bool { } defer c.queue.Done(key) + c.metrics.ReconcileCounter().Inc() err := c.sync(key.(string)) if err == nil { c.queue.Forget(key) diff --git a/pkg/operator/operator.go b/pkg/operator/operator.go index 8ca015f1a..b0c7f7efc 100644 --- a/pkg/operator/operator.go +++ b/pkg/operator/operator.go @@ -31,6 +31,7 @@ type Metrics struct { listFailedCounter prometheus.Counter watchCounter prometheus.Counter watchFailedCounter prometheus.Counter + reconcileCounter prometheus.Counter reconcileErrorsCounter prometheus.Counter stsDeleteCreateCounter prometheus.Counter // triggerByCounter is a set of counters keeping track of the amount @@ -46,9 +47,13 @@ func NewMetrics(name string, r prometheus.Registerer) *Metrics { reg := prometheus.WrapRegistererWith(prometheus.Labels{"controller": name}, r) m := Metrics{ reg: reg, + reconcileCounter: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_operator_reconcile_operations_total", + Help: "Total number of reconcile operations", + }), reconcileErrorsCounter: prometheus.NewCounter(prometheus.CounterOpts{ Name: "prometheus_operator_reconcile_errors_total", - Help: "Number of errors that occurred while reconciling the statefulset", + Help: "Number of errors that occurred during reconcile operations", }), triggerByCounter: prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "prometheus_operator_triggered_total", @@ -77,6 +82,7 @@ func NewMetrics(name string, r prometheus.Registerer) *Metrics { }), } m.reg.MustRegister( + m.reconcileCounter, m.reconcileErrorsCounter, m.triggerByCounter, m.stsDeleteCreateCounter, @@ -88,6 +94,11 @@ func NewMetrics(name string, r prometheus.Registerer) *Metrics { return &m } +// ReconcileCounter returns a counter to track attempted reconciliations. 
+func (m *Metrics) ReconcileCounter() prometheus.Counter { + return m.reconcileCounter +} + // ReconcileErrorsCounter returns a counter to track reconciliation errors. func (m *Metrics) ReconcileErrorsCounter() prometheus.Counter { return m.reconcileErrorsCounter diff --git a/pkg/prometheus/operator.go b/pkg/prometheus/operator.go index 9bc4cb4d4..fe717f21d 100644 --- a/pkg/prometheus/operator.go +++ b/pkg/prometheus/operator.go @@ -1093,6 +1093,7 @@ func (c *Operator) processNextWorkItem() bool { } defer c.queue.Done(key) + c.metrics.ReconcileCounter().Inc() err := c.sync(key.(string)) if err == nil { c.queue.Forget(key) diff --git a/pkg/thanos/operator.go b/pkg/thanos/operator.go index 5f2efb573..18cdf2df6 100644 --- a/pkg/thanos/operator.go +++ b/pkg/thanos/operator.go @@ -554,6 +554,7 @@ func (o *Operator) processNextWorkItem() bool { } defer o.queue.Done(key) + o.metrics.ReconcileCounter().Inc() err := o.sync(key.(string)) if err == nil { o.queue.Forget(key)