
*: add metrics for managed resources and sync status (#3421)

This change adds two metrics:
* `prometheus_operator_managed_resources`, the number of resources
  (service monitors, pod monitors, probes and rules) managed by the
  controller per state (selected/rejected).
* `prometheus_operator_syncs`, the number of objects per sync status
  (ok/failed) and per controller.
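
For illustration, a scrape of the operator's /metrics endpoint might expose
series along these lines (the values are made up, and the `controller` label
is assumed to be attached by the operator's metrics registry, as the new
alert's use of `{{ $labels.controller }}` suggests):

    prometheus_operator_syncs{controller="prometheus",status="ok"} 3
    prometheus_operator_syncs{controller="prometheus",status="failed"} 1
    prometheus_operator_managed_resources{controller="prometheus",resource="ServiceMonitor",state="selected"} 12
    prometheus_operator_managed_resources{controller="prometheus",resource="ServiceMonitor",state="rejected"} 2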

It removes the `prometheus_operator_resources` metric, which wasn't
accurate since it didn't take into account the objects that were
filtered/rejected by the operator.

The number of managed Prometheus, Alertmanager and ThanosRuler objects can be
derived from the `prometheus_operator_syncs` metric using `sum
without(status) (prometheus_operator_syncs)`.
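
For instance, with the illustrative samples above, the query would return
something like:

    sum without(status) (prometheus_operator_syncs)
    => {controller="prometheus"} 4

i.e. 3 objects whose last sync succeeded plus 1 that failed, for a total of
4 managed Prometheus objects.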

It also adds a new `PrometheusOperatorSyncFailed` alert that fires when
`prometheus_operator_syncs{status="failed"}` is greater than 0.

Signed-off-by: Simon Pasquier <spasquie@redhat.com>
Simon Pasquier
2020-10-06 17:32:21 +02:00
committed by GitHub
parent 923fb60041
commit 4cae64bf07
8 changed files with 203 additions and 39 deletions

View File

@@ -21,6 +21,16 @@ groups:
    for: 15m
    labels:
      severity: warning
  - alert: PrometheusOperatorSyncFailed
    annotations:
      description: Controller {{ $labels.controller }} in {{ $labels.namespace }}
        namespace fails to reconcile {{ $value }} objects.
      summary: Last controller reconciliation failed
    expr: |
      min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator"}[5m]) > 0
    for: 10m
    labels:
      severity: warning
  - alert: PrometheusOperatorReconcileErrors
    annotations:
      description: '{{ $value | humanizePercentage }} of reconciling operations failed

View File

@@ -32,6 +32,20 @@
          },
          'for': '15m',
        },
        {
          alert: 'PrometheusOperatorSyncFailed',
          expr: |||
            min_over_time(prometheus_operator_syncs{status="failed",%(prometheusOperatorSelector)s}[5m]) > 0
          ||| % $._config,
          labels: {
            severity: 'warning',
          },
          annotations: {
            description: 'Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects.',
            summary: 'Last controller reconciliation failed',
          },
          'for': '10m',
        },
        {
          alert: 'PrometheusOperatorReconcileErrors',
          expr: |||
View File

@@ -280,6 +280,7 @@ func (c *Operator) processNextWorkItem(ctx context.Context) bool {
    c.metrics.ReconcileCounter().Inc()
    err := c.sync(ctx, key.(string))
    c.metrics.SetSyncStatus(key.(string), err == nil)
    if err == nil {
        c.queue.Forget(key)
        return true
@@ -399,6 +400,7 @@ func (c *Operator) sync(ctx context.Context, key string) error {
    aobj, err := c.alrtInfs.Get(key)
    if apierrors.IsNotFound(err) {
        c.metrics.ForgetObject(key)
        // Dependent resources are cleaned up by K8s via OwnerReferences
        return nil
    }

View File

@@ -16,10 +16,12 @@ package operator
import (
    "context"
    "sync"
    "time"

    "github.com/go-kit/kit/log"
    "github.com/go-kit/kit/log/level"
    "github.com/prometheus/client_golang/prometheus"
    appsv1 "k8s.io/api/apps/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -28,6 +30,21 @@ import (
    "k8s.io/client-go/tools/cache"
)

var (
    syncsDesc = prometheus.NewDesc(
        "prometheus_operator_syncs",
        "Number of objects per sync status (ok/failed)",
        []string{"status"},
        nil,
    )
    resourcesDesc = prometheus.NewDesc(
        "prometheus_operator_managed_resources",
        "Number of resources managed by the operator's controller per state (selected/rejected)",
        []string{"resource", "state"},
        nil,
    )
)

// Metrics represents metrics associated to an operator.
type Metrics struct {
    reg prometheus.Registerer
@@ -45,6 +62,16 @@ type Metrics struct {
    // corresponding actions (add, delete, update).
    triggerByCounter *prometheus.CounterVec
    ready            prometheus.Gauge

    // mtx protects all fields below.
    mtx       sync.RWMutex
    syncs     map[string]bool
    resources map[resourceKey]map[string]int
}

type resourceKey struct {
    resource string
    state    resourceState
}

// NewMetrics initializes operator metrics and registers them with the given registerer.
@@ -90,7 +117,11 @@ func NewMetrics(name string, r prometheus.Registerer) *Metrics {
            Name: "prometheus_operator_ready",
            Help: "1 when the controller is ready to reconcile resources, 0 otherwise",
        }),
        syncs:     make(map[string]bool),
        resources: make(map[resourceKey]map[string]int),
    }
    m.reg.MustRegister(
        m.reconcileCounter,
        m.reconcileErrorsCounter,
@@ -101,7 +132,9 @@ func NewMetrics(name string, r prometheus.Registerer) *Metrics {
        m.watchCounter,
        m.watchFailedCounter,
        m.ready,
        &m,
    )

    return &m
}
@@ -121,8 +154,68 @@ func (m *Metrics) StsDeleteCreateCounter() prometheus.Counter {
}

// TriggerByCounter returns a counter to track operator actions by operation (add/delete/update) and action.
-func (m *Metrics) TriggerByCounter(triggered_by, action string) prometheus.Counter {
-    return m.triggerByCounter.WithLabelValues(triggered_by, action)
+func (m *Metrics) TriggerByCounter(triggeredBy, action string) prometheus.Counter {
+    return m.triggerByCounter.WithLabelValues(triggeredBy, action)
}
const (
    selected int = iota
    rejected
)

type resourceState int

func (r resourceState) String() string {
    switch int(r) {
    case selected:
        return "selected"
    case rejected:
        return "rejected"
    }
    return ""
}

// SetSelectedResources sets the number of resources that the controller selected for the given object's key.
func (m *Metrics) SetSelectedResources(objKey, resource string, v int) {
    m.setResources(objKey, resourceKey{resource: resource, state: resourceState(selected)}, v)
}

// SetRejectedResources sets the number of resources that the controller rejected for the given object's key.
func (m *Metrics) SetRejectedResources(objKey, resource string, v int) {
    m.setResources(objKey, resourceKey{resource: resource, state: resourceState(rejected)}, v)
}

func (m *Metrics) setResources(objKey string, resKey resourceKey, v int) {
    m.mtx.Lock()
    defer m.mtx.Unlock()
    if _, found := m.resources[resKey]; !found {
        m.resources[resourceKey{resource: resKey.resource, state: resourceState(selected)}] = make(map[string]int)
        m.resources[resourceKey{resource: resKey.resource, state: resourceState(rejected)}] = make(map[string]int)
    }
    m.resources[resKey][objKey] = v
}

// SetSyncStatus tracks the status of the last sync operation for the given object.
func (m *Metrics) SetSyncStatus(objKey string, success bool) {
    m.mtx.Lock()
    defer m.mtx.Unlock()
    m.syncs[objKey] = success
}

// ForgetObject removes the metrics tracked for the given object's key.
// It should be called when the controller detects that the object has been deleted.
func (m *Metrics) ForgetObject(objKey string) {
    m.mtx.Lock()
    defer m.mtx.Unlock()
    delete(m.syncs, objKey)
    for k := range m.resources {
        delete(m.resources[k], objKey)
    }
}
// Ready returns a gauge to track whether the controller is ready or not.
@@ -135,6 +228,54 @@ func (m *Metrics) MustRegister(metrics ...prometheus.Collector) {
    m.reg.MustRegister(metrics...)
}

// Describe implements the prometheus.Collector interface.
func (m *Metrics) Describe(ch chan<- *prometheus.Desc) {
    ch <- resourcesDesc
    ch <- syncsDesc
}

// Collect implements the prometheus.Collector interface.
func (m *Metrics) Collect(ch chan<- prometheus.Metric) {
    m.mtx.RLock()
    defer m.mtx.RUnlock()
    var ok, failed float64
    for _, success := range m.syncs {
        if success {
            ok++
        } else {
            failed++
        }
    }
    ch <- prometheus.MustNewConstMetric(
        syncsDesc,
        prometheus.GaugeValue,
        ok,
        "ok",
    )
    ch <- prometheus.MustNewConstMetric(
        syncsDesc,
        prometheus.GaugeValue,
        failed,
        "failed",
    )
    for rKey := range m.resources {
        var total int
        for _, v := range m.resources[rKey] {
            total += v
        }
        ch <- prometheus.MustNewConstMetric(
            resourcesDesc,
            prometheus.GaugeValue,
            float64(total),
            rKey.resource,
            rKey.state.String(),
        )
    }
}
type instrumentedListerWatcher struct {
    next      cache.ListerWatcher
    listTotal prometheus.Counter
@@ -174,40 +315,6 @@ func (i *instrumentedListerWatcher) Watch(options metav1.ListOptions) (watch.Int
    return ret, err
}

type storeCollector struct {
    desc  *prometheus.Desc
    store cache.Store
}

// NewStoreCollector returns a metrics collector that returns the current number of resources in the store.
func NewStoreCollector(resource string, s cache.Store) prometheus.Collector {
    return &storeCollector{
        desc: prometheus.NewDesc(
            "prometheus_operator_resources",
            "Number of resources managed by the operator's controller",
            nil,
            map[string]string{
                "resource": resource,
            },
        ),
        store: s,
    }
}

// Describe implements the prometheus.Collector interface.
func (c *storeCollector) Describe(ch chan<- *prometheus.Desc) {
    ch <- c.desc
}

// Collect implements the prometheus.Collector interface.
func (c *storeCollector) Collect(ch chan<- prometheus.Metric) {
    ch <- prometheus.MustNewConstMetric(
        c.desc,
        prometheus.GaugeValue,
        float64(len(c.store.List())),
    )
}
// SanitizeSTS removes values for APIVersion and Kind from the VolumeClaimTemplates.
// This prevents update failures due to these fields changing when applied.
// See https://github.com/kubernetes/kubernetes/issues/87583

View File

@@ -1046,6 +1046,7 @@ func (c *Operator) processNextWorkItem(ctx context.Context) bool {
    c.metrics.ReconcileCounter().Inc()
    err := c.sync(ctx, key.(string))
    c.metrics.SetSyncStatus(key.(string), err == nil)
    if err == nil {
        c.queue.Forget(key)
        return true
@@ -1140,6 +1141,7 @@ func (c *Operator) sync(ctx context.Context, key string) error {
    pobj, err := c.promInfs.Get(key)
    if apierrors.IsNotFound(err) {
        c.metrics.ForgetObject(key)
        // Dependent resources are cleaned up by K8s via OwnerReferences
        return nil
    }
@@ -1611,6 +1613,7 @@ func (c *Operator) selectServiceMonitors(ctx context.Context, p *monitoringv1.Pr
        })
    }
    var rejected int
    res := make(map[string]*monitoringv1.ServiceMonitor, len(serviceMonitors))
    for namespaceAndName, sm := range serviceMonitors {
        var err error
@@ -1642,6 +1645,7 @@ func (c *Operator) selectServiceMonitors(ctx context.Context, p *monitoringv1.Pr
        }
        if err != nil {
            rejected++
            level.Warn(c.logger).Log(
                "msg", "skipping servicemonitor",
                "error", err.Error(),
@@ -1661,6 +1665,11 @@ func (c *Operator) selectServiceMonitors(ctx context.Context, p *monitoringv1.Pr
    }
    level.Debug(c.logger).Log("msg", "selected ServiceMonitors", "servicemonitors", strings.Join(smKeys, ","), "namespace", p.Namespace, "prometheus", p.Name)

    if pKey, ok := c.keyFunc(p); ok {
        c.metrics.SetSelectedResources(pKey, monitoringv1.ServiceMonitorsKind, len(res))
        c.metrics.SetRejectedResources(pKey, monitoringv1.ServiceMonitorsKind, rejected)
    }
    return res, nil
}
@@ -1700,6 +1709,7 @@ func (c *Operator) selectPodMonitors(ctx context.Context, p *monitoringv1.Promet
        })
    }
    var rejected int
    res := make(map[string]*monitoringv1.PodMonitor, len(podMonitors))
    for namespaceAndName, pm := range podMonitors {
        var err error
@@ -1723,6 +1733,7 @@ func (c *Operator) selectPodMonitors(ctx context.Context, p *monitoringv1.Promet
        }
        if err != nil {
            rejected++
            level.Warn(c.logger).Log(
                "msg", "skipping podmonitor",
                "error", err.Error(),
@@ -1742,6 +1753,11 @@ func (c *Operator) selectPodMonitors(ctx context.Context, p *monitoringv1.Promet
    }
    level.Debug(c.logger).Log("msg", "selected PodMonitors", "podmonitors", strings.Join(pmKeys, ","), "namespace", p.Namespace, "prometheus", p.Name)

    if pKey, ok := c.keyFunc(p); ok {
        c.metrics.SetSelectedResources(pKey, monitoringv1.PodMonitorsKind, len(res))
        c.metrics.SetRejectedResources(pKey, monitoringv1.PodMonitorsKind, rejected)
    }
    return res, nil
}
@@ -1784,6 +1800,11 @@ func (c *Operator) selectProbes(p *monitoringv1.Prometheus) (map[string]*monitor
    level.Debug(c.logger).Log("msg", "selected Probes", "probes", strings.Join(probes, ","), "namespace", p.Namespace, "prometheus", p.Name)

    if pKey, ok := c.keyFunc(p); ok {
        c.metrics.SetSelectedResources(pKey, monitoringv1.ProbesKind, len(res))
        c.metrics.SetRejectedResources(pKey, monitoringv1.ProbesKind, 0)
    }
    return res, nil
}

View File

@@ -213,14 +213,18 @@ func (c *Operator) selectRules(p *monitoringv1.Prometheus, namespaces []string)
        "prometheus", p.Name,
    )
    if pKey, ok := c.keyFunc(p); ok {
        c.metrics.SetSelectedResources(pKey, monitoringv1.PrometheusRuleKind, len(rules))
        c.metrics.SetRejectedResources(pKey, monitoringv1.PrometheusRuleKind, 0)
    }
    return rules, nil
}

func generateContent(promRule monitoringv1.PrometheusRuleSpec) (string, error) {
    content, err := yaml.Marshal(promRule)
    if err != nil {
-        return "", errors.Wrap(err, "failed to unmarshal content")
+        return "", errors.Wrap(err, "failed to marshal content")
    }
    return string(content), nil
}

View File

@@ -561,6 +561,7 @@ func (o *Operator) processNextWorkItem(ctx context.Context) bool {
    o.metrics.ReconcileCounter().Inc()
    err := o.sync(ctx, key.(string))
    o.metrics.SetSyncStatus(key.(string), err == nil)
    if err == nil {
        o.queue.Forget(key)
        return true
@@ -576,6 +577,7 @@ func (o *Operator) processNextWorkItem(ctx context.Context) bool {
func (o *Operator) sync(ctx context.Context, key string) error {
    trobj, err := o.thanosRulerInfs.Get(key)
    if apierrors.IsNotFound(err) {
        o.metrics.ForgetObject(key)
        // Dependent resources are cleaned up by K8s via OwnerReferences
        return nil
    }

View File

@@ -213,6 +213,10 @@ func (o *Operator) selectRules(t *monitoringv1.ThanosRuler, namespaces []string)
        "thanos", t.Name,
    )
    if tKey, ok := o.keyFunc(t); ok {
        o.metrics.SetSelectedResources(tKey, monitoringv1.PrometheusRuleKind, len(rules))
        o.metrics.SetRejectedResources(tKey, monitoringv1.PrometheusRuleKind, 0)
    }
    return rules, nil
}
@@ -220,7 +224,7 @@ func generateContent(promRule monitoringv1.PrometheusRuleSpec) (string, error) {
    content, err := yaml.Marshal(promRule)
    if err != nil {
-        return "", errors.Wrap(err, "failed to unmarshal content")
+        return "", errors.Wrap(err, "failed to marshal content")
    }
    return string(content), nil
}