*: add metrics for managed resources and sync status (#3421)

This change adds two metrics:

* `prometheus_operator_managed_resources`, the number of resources (service monitors, pod monitors, probes and rules) managed by the controller per state (selected/rejected).
* `prometheus_operator_syncs`, the number of objects per status (ok/failed) of their last sync operation, per controller.

It removes the `prometheus_operator_resources` metric, which wasn't accurate because it didn't take into account the objects that were filtered/rejected by the operator. The number of managed Prometheus, Alertmanager and Thanos objects can be derived from the `prometheus_operator_syncs` metric using `sum without(status) (prometheus_operator_syncs)`.

It also adds a new `PrometheusOperatorSyncFailed` alert that fires when `prometheus_operator_syncs{status="failed"}` stays greater than 0 for 10 minutes.

Signed-off-by: Simon Pasquier <spasquie@redhat.com>
2026-02-05 15:46:31 +01:00 · 2020-10-06 17:32:21 +02:00
parent 923fb60041
commit 4cae64bf07
8 changed files with 203 additions and 39 deletions
--- a/example/mixin/alerts.yaml
+++ b/example/mixin/alerts.yaml
@@ -21,6 +21,16 @@ groups:
    for: 15m
    labels:
      severity: warning
+  - alert: PrometheusOperatorSyncFailed
+    annotations:
+      description: Controller {{ $labels.controller }} in {{ $labels.namespace }}
+        namespace fails to reconcile {{ $value }} objects.
+      summary: Last controller reconciliation failed
+    expr: |
+      min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator"}[5m]) > 0
+    for: 10m
+    labels:
+      severity: warning
  - alert: PrometheusOperatorReconcileErrors
    annotations:
      description: '{{ $value | humanizePercentage }} of reconciling operations failed
--- a/jsonnet/mixin/alerts/alerts.libsonnet
+++ b/jsonnet/mixin/alerts/alerts.libsonnet
@@ -32,6 +32,20 @@
            },
            'for': '15m',
          },
+          {
+            alert: 'PrometheusOperatorSyncFailed',
+            expr: |||
+              min_over_time(prometheus_operator_syncs{status="failed",%(prometheusOperatorSelector)s}[5m]) > 0
+            ||| % $._config,
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              description: 'Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects.',
+              summary: 'Last controller reconciliation failed',
+            },
+            'for': '10m',
+          },
          {
            alert: 'PrometheusOperatorReconcileErrors',
            expr: |||
--- a/pkg/alertmanager/operator.go
+++ b/pkg/alertmanager/operator.go
@@ -280,6 +280,7 @@ func (c *Operator) processNextWorkItem(ctx context.Context) bool {

 	c.metrics.ReconcileCounter().Inc()
 	err := c.sync(ctx, key.(string))
+	c.metrics.SetSyncStatus(key.(string), err == nil)
 	if err == nil {
 		c.queue.Forget(key)
 		return true
@@ -399,6 +400,7 @@ func (c *Operator) sync(ctx context.Context, key string) error {
 	aobj, err := c.alrtInfs.Get(key)

 	if apierrors.IsNotFound(err) {
+		c.metrics.ForgetObject(key)
 		// Dependent resources are cleaned up by K8s via OwnerReferences
 		return nil
 	}
--- a/pkg/operator/operator.go
+++ b/pkg/operator/operator.go
@@ -16,10 +16,12 @@ package operator

 import (
 	"context"
+	"sync"
 	"time"

 	"github.com/go-kit/kit/log"
 	"github.com/go-kit/kit/log/level"
+
 	"github.com/prometheus/client_golang/prometheus"
 	appsv1 "k8s.io/api/apps/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -28,6 +30,21 @@ import (
 	"k8s.io/client-go/tools/cache"
 )

+var (
+	syncsDesc = prometheus.NewDesc(
+		"prometheus_operator_syncs",
+		"Number of objects per sync status (ok/failed)",
+		[]string{"status"},
+		nil,
+	)
+	resourcesDesc = prometheus.NewDesc(
+		"prometheus_operator_managed_resources",
+		"Number of resources managed by the operator's controller per state (selected/rejected)",
+		[]string{"resource", "state"},
+		nil,
+	)
+)
+
 // Metrics represents metrics associated to an operator.
 type Metrics struct {
 	reg prometheus.Registerer
@@ -45,6 +62,16 @@ type Metrics struct {
 	// corresponding actions (add, delete, update).
 	triggerByCounter *prometheus.CounterVec
 	ready            prometheus.Gauge
+
+	// mtx protects all fields below.
+	mtx       sync.RWMutex
+	syncs     map[string]bool
+	resources map[resourceKey]map[string]int
+}
+
+type resourceKey struct {
+	resource string
+	state    resourceState
 }

 // NewMetrics initializes operator metrics and registers them with the given registerer.
@@ -90,7 +117,11 @@ func NewMetrics(name string, r prometheus.Registerer) *Metrics {
 			Name: "prometheus_operator_ready",
 			Help: "1 when the controller is ready to reconcile resources, 0 otherwise",
 		}),
+
+		syncs:     make(map[string]bool),
+		resources: make(map[resourceKey]map[string]int),
 	}
+
 	m.reg.MustRegister(
 		m.reconcileCounter,
 		m.reconcileErrorsCounter,
@@ -101,7 +132,9 @@ func NewMetrics(name string, r prometheus.Registerer) *Metrics {
 		m.watchCounter,
 		m.watchFailedCounter,
 		m.ready,
+		&m,
 	)
+
 	return &m
 }

@@ -121,8 +154,68 @@ func (m *Metrics) StsDeleteCreateCounter() prometheus.Counter {
 }

 // TriggerByCounter returns a counter to track operator actions by operation (add/delete/update) and action.
-func (m *Metrics) TriggerByCounter(triggered_by, action string) prometheus.Counter {
-	return m.triggerByCounter.WithLabelValues(triggered_by, action)
+func (m *Metrics) TriggerByCounter(triggeredBy, action string) prometheus.Counter {
+	return m.triggerByCounter.WithLabelValues(triggeredBy, action)
+}
+
+const (
+	selected int = iota
+	rejected
+)
+
+type resourceState int
+
+func (r resourceState) String() string {
+	switch int(r) {
+	case selected:
+		return "selected"
+	case rejected:
+		return "rejected"
+	}
+	return ""
+}
+
+// SetSelectedResources sets the number of resources that the controller selected for the given object's key.
+func (m *Metrics) SetSelectedResources(objKey, resource string, v int) {
+	m.setResources(objKey, resourceKey{resource: resource, state: resourceState(selected)}, v)
+}
+
+// SetRejectedResources sets the number of resources that the controller rejected for the given object's key.
+func (m *Metrics) SetRejectedResources(objKey, resource string, v int) {
+	m.setResources(objKey, resourceKey{resource: resource, state: resourceState(rejected)}, v)
+}
+
+func (m *Metrics) setResources(objKey string, resKey resourceKey, v int) {
+	m.mtx.Lock()
+	defer m.mtx.Unlock()
+
+	if _, found := m.resources[resKey]; !found {
+		m.resources[resourceKey{resource: resKey.resource, state: resourceState(selected)}] = make(map[string]int)
+		m.resources[resourceKey{resource: resKey.resource, state: resourceState(rejected)}] = make(map[string]int)
+	}
+
+	m.resources[resKey][objKey] = v
+}
+
+// SetSyncStatus tracks the status of the last sync operation for the given object.
+func (m *Metrics) SetSyncStatus(objKey string, success bool) {
+	m.mtx.Lock()
+	defer m.mtx.Unlock()
+
+	m.syncs[objKey] = success
+}
+
+// ForgetObject removes the metrics tracked for the given object's key.
+// It should be called when the controller detects that the object has been deleted.
+func (m *Metrics) ForgetObject(objKey string) {
+	m.mtx.Lock()
+	defer m.mtx.Unlock()
+
+	delete(m.syncs, objKey)
+
+	for k := range m.resources {
+		delete(m.resources[k], objKey)
+	}
 }

 // Ready returns a gauge to track whether the controller is ready or not.
@@ -135,6 +228,54 @@ func (m *Metrics) MustRegister(metrics ...prometheus.Collector) {
 	m.reg.MustRegister(metrics...)
 }

+// Describe implements the prometheus.Collector interface.
+func (m *Metrics) Describe(ch chan<- *prometheus.Desc) {
+	ch <- resourcesDesc
+	ch <- syncsDesc
+}
+
+// Collect implements the prometheus.Collector interface.
+func (m *Metrics) Collect(ch chan<- prometheus.Metric) {
+	m.mtx.RLock()
+	defer m.mtx.RUnlock()
+
+	var ok, failed float64
+	for _, success := range m.syncs {
+		if success {
+			ok++
+		} else {
+			failed++
+		}
+	}
+
+	ch <- prometheus.MustNewConstMetric(
+		syncsDesc,
+		prometheus.GaugeValue,
+		ok,
+		"ok",
+	)
+	ch <- prometheus.MustNewConstMetric(
+		syncsDesc,
+		prometheus.GaugeValue,
+		failed,
+		"failed",
+	)
+
+	for rKey := range m.resources {
+		var total int
+		for _, v := range m.resources[rKey] {
+			total += v
+		}
+		ch <- prometheus.MustNewConstMetric(
+			resourcesDesc,
+			prometheus.GaugeValue,
+			float64(total),
+			rKey.resource,
+			rKey.state.String(),
+		)
+	}
+}
+
 type instrumentedListerWatcher struct {
 	next        cache.ListerWatcher
 	listTotal   prometheus.Counter
@@ -174,40 +315,6 @@ func (i *instrumentedListerWatcher) Watch(options metav1.ListOptions) (watch.Int
 	return ret, err
 }

-type storeCollector struct {
-	desc  *prometheus.Desc
-	store cache.Store
-}
-
-// NewStoreCollector returns a metrics collector that returns the current number of resources in the store.
-func NewStoreCollector(resource string, s cache.Store) prometheus.Collector {
-	return &storeCollector{
-		desc: prometheus.NewDesc(
-			"prometheus_operator_resources",
-			"Number of resources managed by the operator's controller",
-			nil,
-			map[string]string{
-				"resource": resource,
-			},
-		),
-		store: s,
-	}
-}
-
-// Describe implements the prometheus.Collector interface.
-func (c *storeCollector) Describe(ch chan<- *prometheus.Desc) {
-	ch <- c.desc
-}
-
-// Collect implements the prometheus.Collector interface.
-func (c *storeCollector) Collect(ch chan<- prometheus.Metric) {
-	ch <- prometheus.MustNewConstMetric(
-		c.desc,
-		prometheus.GaugeValue,
-		float64(len(c.store.List())),
-	)
-}
-
 // SanitizeSTS removes values for APIVersion and Kind from the VolumeClaimTemplates.
 // This prevents update failures due to these fields changing when applied.
 // See https://github.com/kubernetes/kubernetes/issues/87583
--- a/pkg/prometheus/operator.go
+++ b/pkg/prometheus/operator.go
@@ -1046,6 +1046,7 @@ func (c *Operator) processNextWorkItem(ctx context.Context) bool {

 	c.metrics.ReconcileCounter().Inc()
 	err := c.sync(ctx, key.(string))
+	c.metrics.SetSyncStatus(key.(string), err == nil)
 	if err == nil {
 		c.queue.Forget(key)
 		return true
@@ -1140,6 +1141,7 @@ func (c *Operator) sync(ctx context.Context, key string) error {
 	pobj, err := c.promInfs.Get(key)

 	if apierrors.IsNotFound(err) {
+		c.metrics.ForgetObject(key)
 		// Dependent resources are cleaned up by K8s via OwnerReferences
 		return nil
 	}
@@ -1611,6 +1613,7 @@ func (c *Operator) selectServiceMonitors(ctx context.Context, p *monitoringv1.Pr
 		})
 	}

+	var rejected int
 	res := make(map[string]*monitoringv1.ServiceMonitor, len(serviceMonitors))
 	for namespaceAndName, sm := range serviceMonitors {
 		var err error
@@ -1642,6 +1645,7 @@ func (c *Operator) selectServiceMonitors(ctx context.Context, p *monitoringv1.Pr
 		}

 		if err != nil {
+			rejected++
 			level.Warn(c.logger).Log(
 				"msg", "skipping servicemonitor",
 				"error", err.Error(),
@@ -1661,6 +1665,11 @@ func (c *Operator) selectServiceMonitors(ctx context.Context, p *monitoringv1.Pr
 	}
 	level.Debug(c.logger).Log("msg", "selected ServiceMonitors", "servicemonitors", strings.Join(smKeys, ","), "namespace", p.Namespace, "prometheus", p.Name)

+	if pKey, ok := c.keyFunc(p); ok {
+		c.metrics.SetSelectedResources(pKey, monitoringv1.ServiceMonitorsKind, len(res))
+		c.metrics.SetRejectedResources(pKey, monitoringv1.ServiceMonitorsKind, rejected)
+	}
+
 	return res, nil
 }

@@ -1700,6 +1709,7 @@ func (c *Operator) selectPodMonitors(ctx context.Context, p *monitoringv1.Promet
 		})
 	}

+	var rejected int
 	res := make(map[string]*monitoringv1.PodMonitor, len(podMonitors))
 	for namespaceAndName, pm := range podMonitors {
 		var err error
@@ -1723,6 +1733,7 @@ func (c *Operator) selectPodMonitors(ctx context.Context, p *monitoringv1.Promet
 		}

 		if err != nil {
+			rejected++
 			level.Warn(c.logger).Log(
 				"msg", "skipping podmonitor",
 				"error", err.Error(),
@@ -1742,6 +1753,11 @@ func (c *Operator) selectPodMonitors(ctx context.Context, p *monitoringv1.Promet
 	}
 	level.Debug(c.logger).Log("msg", "selected PodMonitors", "podmonitors", strings.Join(pmKeys, ","), "namespace", p.Namespace, "prometheus", p.Name)

+	if pKey, ok := c.keyFunc(p); ok {
+		c.metrics.SetSelectedResources(pKey, monitoringv1.PodMonitorsKind, len(res))
+		c.metrics.SetRejectedResources(pKey, monitoringv1.PodMonitorsKind, rejected)
+	}
+
 	return res, nil
 }

@@ -1784,6 +1800,11 @@ func (c *Operator) selectProbes(p *monitoringv1.Prometheus) (map[string]*monitor

 	level.Debug(c.logger).Log("msg", "selected Probes", "probes", strings.Join(probes, ","), "namespace", p.Namespace, "prometheus", p.Name)

+	if pKey, ok := c.keyFunc(p); ok {
+		c.metrics.SetSelectedResources(pKey, monitoringv1.ProbesKind, len(res))
+		c.metrics.SetRejectedResources(pKey, monitoringv1.ProbesKind, 0)
+	}
+
 	return res, nil
 }

--- a/pkg/prometheus/rules.go
+++ b/pkg/prometheus/rules.go
@@ -213,14 +213,18 @@ func (c *Operator) selectRules(p *monitoringv1.Prometheus, namespaces []string)
 		"prometheus", p.Name,
 	)

+	if pKey, ok := c.keyFunc(p); ok {
+		c.metrics.SetSelectedResources(pKey, monitoringv1.PrometheusRuleKind, len(rules))
+		c.metrics.SetRejectedResources(pKey, monitoringv1.PrometheusRuleKind, 0)
+	}
+
 	return rules, nil
 }

 func generateContent(promRule monitoringv1.PrometheusRuleSpec) (string, error) {
-
 	content, err := yaml.Marshal(promRule)
 	if err != nil {
-		return "", errors.Wrap(err, "failed to unmarshal content")
+		return "", errors.Wrap(err, "failed to marshal content")
 	}
 	return string(content), nil
 }
--- a/pkg/thanos/operator.go
+++ b/pkg/thanos/operator.go
@@ -561,6 +561,7 @@ func (o *Operator) processNextWorkItem(ctx context.Context) bool {

 	o.metrics.ReconcileCounter().Inc()
 	err := o.sync(ctx, key.(string))
+	o.metrics.SetSyncStatus(key.(string), err == nil)
 	if err == nil {
 		o.queue.Forget(key)
 		return true
@@ -576,6 +577,7 @@ func (o *Operator) processNextWorkItem(ctx context.Context) bool {
 func (o *Operator) sync(ctx context.Context, key string) error {
 	trobj, err := o.thanosRulerInfs.Get(key)
 	if apierrors.IsNotFound(err) {
+		o.metrics.ForgetObject(key)
 		// Dependent resources are cleaned up by K8s via OwnerReferences
 		return nil
 	}
--- a/pkg/thanos/rules.go
+++ b/pkg/thanos/rules.go
@@ -213,6 +213,10 @@ func (o *Operator) selectRules(t *monitoringv1.ThanosRuler, namespaces []string)
 		"thanos", t.Name,
 	)

+	if tKey, ok := o.keyFunc(t); ok {
+		o.metrics.SetSelectedResources(tKey, monitoringv1.PrometheusRuleKind, len(rules))
+		o.metrics.SetRejectedResources(tKey, monitoringv1.PrometheusRuleKind, 0)
+	}
 	return rules, nil
 }

@@ -220,7 +224,7 @@ func generateContent(promRule monitoringv1.PrometheusRuleSpec) (string, error) {

 	content, err := yaml.Marshal(promRule)
 	if err != nil {
-		return "", errors.Wrap(err, "failed to unmarshal content")
+		return "", errors.Wrap(err, "failed to marshal content")
 	}
 	return string(content), nil
 }