mirror of
https://github.com/coreos/prometheus-operator.git
synced 2026-02-05 15:46:31 +01:00
*: add metrics for managed resources and sync status (#3421)
This change adds two metrics:
* `prometheus_operator_managed_resources`, the number of resources
(service monitors, pod monitors, probes and rules) managed by the
controller per state (selected/rejected).
* `prometheus_operator_syncs`, the number of objects per status of their
last sync operation (ok/failed) and controller.
It removes the `prometheus_operator_resources` metric which wasn't
accurate since it didn't take into account the objects that were
filtered/rejected by the operator.
The number of managed Prometheus, Alertmanager and Thanos objects can be
derived from the `prometheus_operator_syncs` metric using `sum
without(status) (prometheus_operator_syncs)`.
It also adds a new `PrometheusOperatorSyncFailed` alert that fires when
the 5-minute minimum of `prometheus_operator_syncs{status="failed"}` stays
greater than 0 for 10 minutes.
Signed-off-by: Simon Pasquier <spasquie@redhat.com>
This commit is contained in:
@@ -21,6 +21,16 @@ groups:
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: PrometheusOperatorSyncFailed
|
||||
annotations:
|
||||
description: Controller {{ $labels.controller }} in {{ $labels.namespace }}
|
||||
namespace fails to reconcile {{ $value }} objects.
|
||||
summary: Last controller reconciliation failed
|
||||
expr: |
|
||||
min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator"}[5m]) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: PrometheusOperatorReconcileErrors
|
||||
annotations:
|
||||
description: '{{ $value | humanizePercentage }} of reconciling operations failed
|
||||
|
||||
@@ -32,6 +32,20 @@
|
||||
},
|
||||
'for': '15m',
|
||||
},
|
||||
{
|
||||
alert: 'PrometheusOperatorSyncFailed',
|
||||
expr: |||
|
||||
min_over_time(prometheus_operator_syncs{status="failed",%(prometheusOperatorSelector)s}[5m]) > 0
|
||||
||| % $._config,
|
||||
labels: {
|
||||
severity: 'warning',
|
||||
},
|
||||
annotations: {
|
||||
description: 'Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects.',
|
||||
summary: 'Last controller reconciliation failed',
|
||||
},
|
||||
'for': '10m',
|
||||
},
|
||||
{
|
||||
alert: 'PrometheusOperatorReconcileErrors',
|
||||
expr: |||
|
||||
|
||||
@@ -280,6 +280,7 @@ func (c *Operator) processNextWorkItem(ctx context.Context) bool {
|
||||
|
||||
c.metrics.ReconcileCounter().Inc()
|
||||
err := c.sync(ctx, key.(string))
|
||||
c.metrics.SetSyncStatus(key.(string), err == nil)
|
||||
if err == nil {
|
||||
c.queue.Forget(key)
|
||||
return true
|
||||
@@ -399,6 +400,7 @@ func (c *Operator) sync(ctx context.Context, key string) error {
|
||||
aobj, err := c.alrtInfs.Get(key)
|
||||
|
||||
if apierrors.IsNotFound(err) {
|
||||
c.metrics.ForgetObject(key)
|
||||
// Dependent resources are cleaned up by K8s via OwnerReferences
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -16,10 +16,12 @@ package operator
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/go-kit/kit/log"
|
||||
"github.com/go-kit/kit/log/level"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
appsv1 "k8s.io/api/apps/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
@@ -28,6 +30,21 @@ import (
|
||||
"k8s.io/client-go/tools/cache"
|
||||
)
|
||||
|
||||
var (
|
||||
syncsDesc = prometheus.NewDesc(
|
||||
"prometheus_operator_syncs",
|
||||
"Number of objects per sync status (ok/failed)",
|
||||
[]string{"status"},
|
||||
nil,
|
||||
)
|
||||
resourcesDesc = prometheus.NewDesc(
|
||||
"prometheus_operator_managed_resources",
|
||||
"Number of resources managed by the operator's controller per state (selected/rejected)",
|
||||
[]string{"resource", "state"},
|
||||
nil,
|
||||
)
|
||||
)
|
||||
|
||||
// Metrics represents metrics associated to an operator.
|
||||
type Metrics struct {
|
||||
reg prometheus.Registerer
|
||||
@@ -45,6 +62,16 @@ type Metrics struct {
|
||||
// corresponding actions (add, delete, update).
|
||||
triggerByCounter *prometheus.CounterVec
|
||||
ready prometheus.Gauge
|
||||
|
||||
// mtx protects all fields below.
|
||||
mtx sync.RWMutex
|
||||
syncs map[string]bool
|
||||
resources map[resourceKey]map[string]int
|
||||
}
|
||||
|
||||
type resourceKey struct {
|
||||
resource string
|
||||
state resourceState
|
||||
}
|
||||
|
||||
// NewMetrics initializes operator metrics and registers them with the given registerer.
|
||||
@@ -90,7 +117,11 @@ func NewMetrics(name string, r prometheus.Registerer) *Metrics {
|
||||
Name: "prometheus_operator_ready",
|
||||
Help: "1 when the controller is ready to reconcile resources, 0 otherwise",
|
||||
}),
|
||||
|
||||
syncs: make(map[string]bool),
|
||||
resources: make(map[resourceKey]map[string]int),
|
||||
}
|
||||
|
||||
m.reg.MustRegister(
|
||||
m.reconcileCounter,
|
||||
m.reconcileErrorsCounter,
|
||||
@@ -101,7 +132,9 @@ func NewMetrics(name string, r prometheus.Registerer) *Metrics {
|
||||
m.watchCounter,
|
||||
m.watchFailedCounter,
|
||||
m.ready,
|
||||
&m,
|
||||
)
|
||||
|
||||
return &m
|
||||
}
|
||||
|
||||
@@ -121,8 +154,68 @@ func (m *Metrics) StsDeleteCreateCounter() prometheus.Counter {
|
||||
}
|
||||
|
||||
// TriggerByCounter returns a counter to track operator actions by operation (add/delete/update) and action.
|
||||
func (m *Metrics) TriggerByCounter(triggered_by, action string) prometheus.Counter {
|
||||
return m.triggerByCounter.WithLabelValues(triggered_by, action)
|
||||
func (m *Metrics) TriggerByCounter(triggeredBy, action string) prometheus.Counter {
|
||||
return m.triggerByCounter.WithLabelValues(triggeredBy, action)
|
||||
}
|
||||
|
||||
// resourceState is the state under which a resource is counted by the
// metrics: either selected or rejected.
type resourceState int

// The constants carry the resourceState type so they can be used directly
// wherever a resourceState is expected; explicit resourceState(...)
// conversions at existing call sites remain valid.
const (
	selected resourceState = iota
	rejected
)

// String returns the label value for the state: "selected" or "rejected".
// Any other value yields the empty string.
func (r resourceState) String() string {
	switch r {
	case selected:
		return "selected"
	case rejected:
		return "rejected"
	}
	return ""
}
|
||||
|
||||
// SetSelectedResources sets the number of resources that the controller selected for the given object's key.
|
||||
func (m *Metrics) SetSelectedResources(objKey, resource string, v int) {
|
||||
m.setResources(objKey, resourceKey{resource: resource, state: resourceState(selected)}, v)
|
||||
}
|
||||
|
||||
// SetRejectedResources sets the number of resources that the controller rejected for the given object's key.
|
||||
func (m *Metrics) SetRejectedResources(objKey, resource string, v int) {
|
||||
m.setResources(objKey, resourceKey{resource: resource, state: resourceState(rejected)}, v)
|
||||
}
|
||||
|
||||
func (m *Metrics) setResources(objKey string, resKey resourceKey, v int) {
|
||||
m.mtx.Lock()
|
||||
defer m.mtx.Unlock()
|
||||
|
||||
if _, found := m.resources[resKey]; !found {
|
||||
m.resources[resourceKey{resource: resKey.resource, state: resourceState(selected)}] = make(map[string]int)
|
||||
m.resources[resourceKey{resource: resKey.resource, state: resourceState(rejected)}] = make(map[string]int)
|
||||
}
|
||||
|
||||
m.resources[resKey][objKey] = v
|
||||
}
|
||||
|
||||
// SetSyncStatus tracks the status of the last sync operation for the given object.
|
||||
func (m *Metrics) SetSyncStatus(objKey string, success bool) {
|
||||
m.mtx.Lock()
|
||||
defer m.mtx.Unlock()
|
||||
|
||||
m.syncs[objKey] = success
|
||||
}
|
||||
|
||||
// ForgetObject removes the metrics tracked for the given object's key.
|
||||
// It should be called when the controller detects that the object has been deleted.
|
||||
func (m *Metrics) ForgetObject(objKey string) {
|
||||
m.mtx.Lock()
|
||||
defer m.mtx.Unlock()
|
||||
|
||||
delete(m.syncs, objKey)
|
||||
|
||||
for k := range m.resources {
|
||||
delete(m.resources[k], objKey)
|
||||
}
|
||||
}
|
||||
|
||||
// Ready returns a gauge to track whether the controller is ready or not.
|
||||
@@ -135,6 +228,54 @@ func (m *Metrics) MustRegister(metrics ...prometheus.Collector) {
|
||||
m.reg.MustRegister(metrics...)
|
||||
}
|
||||
|
||||
// Describe implements the prometheus.Collector interface.
|
||||
func (m *Metrics) Describe(ch chan<- *prometheus.Desc) {
|
||||
ch <- resourcesDesc
|
||||
ch <- syncsDesc
|
||||
}
|
||||
|
||||
// Collect implements the prometheus.Collector interface.
|
||||
func (m *Metrics) Collect(ch chan<- prometheus.Metric) {
|
||||
m.mtx.RLock()
|
||||
defer m.mtx.RUnlock()
|
||||
|
||||
var ok, failed float64
|
||||
for _, success := range m.syncs {
|
||||
if success {
|
||||
ok++
|
||||
} else {
|
||||
failed++
|
||||
}
|
||||
}
|
||||
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
syncsDesc,
|
||||
prometheus.GaugeValue,
|
||||
ok,
|
||||
"ok",
|
||||
)
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
syncsDesc,
|
||||
prometheus.GaugeValue,
|
||||
failed,
|
||||
"failed",
|
||||
)
|
||||
|
||||
for rKey := range m.resources {
|
||||
var total int
|
||||
for _, v := range m.resources[rKey] {
|
||||
total += v
|
||||
}
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
resourcesDesc,
|
||||
prometheus.GaugeValue,
|
||||
float64(total),
|
||||
rKey.resource,
|
||||
rKey.state.String(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
type instrumentedListerWatcher struct {
|
||||
next cache.ListerWatcher
|
||||
listTotal prometheus.Counter
|
||||
@@ -174,40 +315,6 @@ func (i *instrumentedListerWatcher) Watch(options metav1.ListOptions) (watch.Int
|
||||
return ret, err
|
||||
}
|
||||
|
||||
type storeCollector struct {
|
||||
desc *prometheus.Desc
|
||||
store cache.Store
|
||||
}
|
||||
|
||||
// NewStoreCollector returns a metrics collector that returns the current number of resources in the store.
|
||||
func NewStoreCollector(resource string, s cache.Store) prometheus.Collector {
|
||||
return &storeCollector{
|
||||
desc: prometheus.NewDesc(
|
||||
"prometheus_operator_resources",
|
||||
"Number of resources managed by the operator's controller",
|
||||
nil,
|
||||
map[string]string{
|
||||
"resource": resource,
|
||||
},
|
||||
),
|
||||
store: s,
|
||||
}
|
||||
}
|
||||
|
||||
// Describe implements the prometheus.Collector interface.
|
||||
func (c *storeCollector) Describe(ch chan<- *prometheus.Desc) {
|
||||
ch <- c.desc
|
||||
}
|
||||
|
||||
// Collect implements the prometheus.Collector interface.
|
||||
func (c *storeCollector) Collect(ch chan<- prometheus.Metric) {
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
c.desc,
|
||||
prometheus.GaugeValue,
|
||||
float64(len(c.store.List())),
|
||||
)
|
||||
}
|
||||
|
||||
// SanitizeSTS removes values for APIVersion and Kind from the VolumeClaimTemplates.
|
||||
// This prevents update failures due to these fields changing when applied.
|
||||
// See https://github.com/kubernetes/kubernetes/issues/87583
|
||||
|
||||
@@ -1046,6 +1046,7 @@ func (c *Operator) processNextWorkItem(ctx context.Context) bool {
|
||||
|
||||
c.metrics.ReconcileCounter().Inc()
|
||||
err := c.sync(ctx, key.(string))
|
||||
c.metrics.SetSyncStatus(key.(string), err == nil)
|
||||
if err == nil {
|
||||
c.queue.Forget(key)
|
||||
return true
|
||||
@@ -1140,6 +1141,7 @@ func (c *Operator) sync(ctx context.Context, key string) error {
|
||||
pobj, err := c.promInfs.Get(key)
|
||||
|
||||
if apierrors.IsNotFound(err) {
|
||||
c.metrics.ForgetObject(key)
|
||||
// Dependent resources are cleaned up by K8s via OwnerReferences
|
||||
return nil
|
||||
}
|
||||
@@ -1611,6 +1613,7 @@ func (c *Operator) selectServiceMonitors(ctx context.Context, p *monitoringv1.Pr
|
||||
})
|
||||
}
|
||||
|
||||
var rejected int
|
||||
res := make(map[string]*monitoringv1.ServiceMonitor, len(serviceMonitors))
|
||||
for namespaceAndName, sm := range serviceMonitors {
|
||||
var err error
|
||||
@@ -1642,6 +1645,7 @@ func (c *Operator) selectServiceMonitors(ctx context.Context, p *monitoringv1.Pr
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
rejected++
|
||||
level.Warn(c.logger).Log(
|
||||
"msg", "skipping servicemonitor",
|
||||
"error", err.Error(),
|
||||
@@ -1661,6 +1665,11 @@ func (c *Operator) selectServiceMonitors(ctx context.Context, p *monitoringv1.Pr
|
||||
}
|
||||
level.Debug(c.logger).Log("msg", "selected ServiceMonitors", "servicemonitors", strings.Join(smKeys, ","), "namespace", p.Namespace, "prometheus", p.Name)
|
||||
|
||||
if pKey, ok := c.keyFunc(p); ok {
|
||||
c.metrics.SetSelectedResources(pKey, monitoringv1.ServiceMonitorsKind, len(res))
|
||||
c.metrics.SetRejectedResources(pKey, monitoringv1.ServiceMonitorsKind, rejected)
|
||||
}
|
||||
|
||||
return res, nil
|
||||
}
|
||||
|
||||
@@ -1700,6 +1709,7 @@ func (c *Operator) selectPodMonitors(ctx context.Context, p *monitoringv1.Promet
|
||||
})
|
||||
}
|
||||
|
||||
var rejected int
|
||||
res := make(map[string]*monitoringv1.PodMonitor, len(podMonitors))
|
||||
for namespaceAndName, pm := range podMonitors {
|
||||
var err error
|
||||
@@ -1723,6 +1733,7 @@ func (c *Operator) selectPodMonitors(ctx context.Context, p *monitoringv1.Promet
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
rejected++
|
||||
level.Warn(c.logger).Log(
|
||||
"msg", "skipping podmonitor",
|
||||
"error", err.Error(),
|
||||
@@ -1742,6 +1753,11 @@ func (c *Operator) selectPodMonitors(ctx context.Context, p *monitoringv1.Promet
|
||||
}
|
||||
level.Debug(c.logger).Log("msg", "selected PodMonitors", "podmonitors", strings.Join(pmKeys, ","), "namespace", p.Namespace, "prometheus", p.Name)
|
||||
|
||||
if pKey, ok := c.keyFunc(p); ok {
|
||||
c.metrics.SetSelectedResources(pKey, monitoringv1.PodMonitorsKind, len(res))
|
||||
c.metrics.SetRejectedResources(pKey, monitoringv1.PodMonitorsKind, rejected)
|
||||
}
|
||||
|
||||
return res, nil
|
||||
}
|
||||
|
||||
@@ -1784,6 +1800,11 @@ func (c *Operator) selectProbes(p *monitoringv1.Prometheus) (map[string]*monitor
|
||||
|
||||
level.Debug(c.logger).Log("msg", "selected Probes", "probes", strings.Join(probes, ","), "namespace", p.Namespace, "prometheus", p.Name)
|
||||
|
||||
if pKey, ok := c.keyFunc(p); ok {
|
||||
c.metrics.SetSelectedResources(pKey, monitoringv1.ProbesKind, len(res))
|
||||
c.metrics.SetRejectedResources(pKey, monitoringv1.ProbesKind, 0)
|
||||
}
|
||||
|
||||
return res, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -213,14 +213,18 @@ func (c *Operator) selectRules(p *monitoringv1.Prometheus, namespaces []string)
|
||||
"prometheus", p.Name,
|
||||
)
|
||||
|
||||
if pKey, ok := c.keyFunc(p); ok {
|
||||
c.metrics.SetSelectedResources(pKey, monitoringv1.PrometheusRuleKind, len(rules))
|
||||
c.metrics.SetRejectedResources(pKey, monitoringv1.PrometheusRuleKind, 0)
|
||||
}
|
||||
|
||||
return rules, nil
|
||||
}
|
||||
|
||||
func generateContent(promRule monitoringv1.PrometheusRuleSpec) (string, error) {
|
||||
|
||||
content, err := yaml.Marshal(promRule)
|
||||
if err != nil {
|
||||
return "", errors.Wrap(err, "failed to unmarshal content")
|
||||
return "", errors.Wrap(err, "failed to marshal content")
|
||||
}
|
||||
return string(content), nil
|
||||
}
|
||||
|
||||
@@ -561,6 +561,7 @@ func (o *Operator) processNextWorkItem(ctx context.Context) bool {
|
||||
|
||||
o.metrics.ReconcileCounter().Inc()
|
||||
err := o.sync(ctx, key.(string))
|
||||
o.metrics.SetSyncStatus(key.(string), err == nil)
|
||||
if err == nil {
|
||||
o.queue.Forget(key)
|
||||
return true
|
||||
@@ -576,6 +577,7 @@ func (o *Operator) processNextWorkItem(ctx context.Context) bool {
|
||||
func (o *Operator) sync(ctx context.Context, key string) error {
|
||||
trobj, err := o.thanosRulerInfs.Get(key)
|
||||
if apierrors.IsNotFound(err) {
|
||||
o.metrics.ForgetObject(key)
|
||||
// Dependent resources are cleaned up by K8s via OwnerReferences
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -213,6 +213,10 @@ func (o *Operator) selectRules(t *monitoringv1.ThanosRuler, namespaces []string)
|
||||
"thanos", t.Name,
|
||||
)
|
||||
|
||||
if tKey, ok := o.keyFunc(t); ok {
|
||||
o.metrics.SetSelectedResources(tKey, monitoringv1.PrometheusRuleKind, len(rules))
|
||||
o.metrics.SetRejectedResources(tKey, monitoringv1.PrometheusRuleKind, 0)
|
||||
}
|
||||
return rules, nil
|
||||
}
|
||||
|
||||
@@ -220,7 +224,7 @@ func generateContent(promRule monitoringv1.PrometheusRuleSpec) (string, error) {
|
||||
|
||||
content, err := yaml.Marshal(promRule)
|
||||
if err != nil {
|
||||
return "", errors.Wrap(err, "failed to unmarshal content")
|
||||
return "", errors.Wrap(err, "failed to marshal content")
|
||||
}
|
||||
return string(content), nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user