mirror of
https://github.com/prometheus/alertmanager.git
synced 2026-02-05 15:45:34 +01:00
fix(provider): reduce lock contention (#4809)
The provider loops over all alerts per state, which results in 3 loops over all store alerts and one call per alert to the marker per loop. Add a custom collector for counting alerts by state. This reduces the number of calls to store and marker to 1/3. Signed-off-by: Siavash Safi <siavash@cloudflare.com>
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
// Copyright 2016 Prometheus Team
|
||||
// Copyright The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
@@ -77,22 +77,7 @@ type listeningAlerts struct {
|
||||
}
|
||||
|
||||
func (a *Alerts) registerMetrics(r prometheus.Registerer) {
|
||||
newMemAlertByStatus := func(s types.AlertState) prometheus.GaugeFunc {
|
||||
return promauto.With(r).NewGaugeFunc(
|
||||
prometheus.GaugeOpts{
|
||||
Name: "alertmanager_alerts",
|
||||
Help: "How many alerts by state.",
|
||||
ConstLabels: prometheus.Labels{"state": string(s)},
|
||||
},
|
||||
func() float64 {
|
||||
return float64(a.count(s))
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
newMemAlertByStatus(types.AlertStateActive)
|
||||
newMemAlertByStatus(types.AlertStateSuppressed)
|
||||
newMemAlertByStatus(types.AlertStateUnprocessed)
|
||||
r.MustRegister(&alertsCollector{alerts: a})
|
||||
|
||||
a.subscriberChannelWrites = promauto.With(r).NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
@@ -311,23 +296,46 @@ func (a *Alerts) Put(ctx context.Context, alerts ...*types.Alert) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// count returns the number of non-resolved alerts we currently have stored filtered by the provided state.
|
||||
func (a *Alerts) count(state types.AlertState) int {
|
||||
var count int
|
||||
// countByState returns the number of non-resolved alerts by state.
|
||||
func (a *Alerts) countByState() (active, suppressed, unprocessed int) {
|
||||
for _, alert := range a.alerts.List() {
|
||||
if alert.Resolved() {
|
||||
continue
|
||||
}
|
||||
|
||||
status := a.marker.Status(alert.Fingerprint())
|
||||
if status.State != state {
|
||||
continue
|
||||
switch a.marker.Status(alert.Fingerprint()).State {
|
||||
case types.AlertStateActive:
|
||||
active++
|
||||
case types.AlertStateSuppressed:
|
||||
suppressed++
|
||||
case types.AlertStateUnprocessed:
|
||||
unprocessed++
|
||||
}
|
||||
|
||||
count++
|
||||
}
|
||||
return active, suppressed, unprocessed
|
||||
}
|
||||
|
||||
return count
|
||||
// alertsCollector implements prometheus.Collector to collect all alert count metrics in a single pass.
// Its Collect method calls countByState once and emits one gauge per alert
// state from that single result.
|
||||
type alertsCollector struct {
|
||||
// alerts is the provider whose countByState method supplies the per-state counts.
alerts *Alerts
|
||||
}
|
||||
|
||||
// alertsDesc is the descriptor shared by every sample emitted by
// alertsCollector: the "alertmanager_alerts" metric with a single variable
// label, "state", and no constant labels.
var alertsDesc = prometheus.NewDesc(
|
||||
"alertmanager_alerts",
|
||||
"How many alerts by state.",
|
||||
[]string{"state"}, nil,
|
||||
)
|
||||
|
||||
// Describe sends alertsDesc, the only descriptor this collector uses, on ch.
func (c *alertsCollector) Describe(ch chan<- *prometheus.Desc) {
|
||||
ch <- alertsDesc
|
||||
}
|
||||
|
||||
// Collect obtains the active, suppressed and unprocessed counts from a single
// countByState call and sends one gauge sample per state on ch, each labelled
// with the string form of its types.AlertState value.
func (c *alertsCollector) Collect(ch chan<- prometheus.Metric) {
|
||||
// One call yields all three counts, so the samples below share one pass.
active, suppressed, unprocessed := c.alerts.countByState()
|
||||
|
||||
ch <- prometheus.MustNewConstMetric(alertsDesc, prometheus.GaugeValue, float64(active), string(types.AlertStateActive))
|
||||
ch <- prometheus.MustNewConstMetric(alertsDesc, prometheus.GaugeValue, float64(suppressed), string(types.AlertStateSuppressed))
|
||||
ch <- prometheus.MustNewConstMetric(alertsDesc, prometheus.GaugeValue, float64(unprocessed), string(types.AlertStateUnprocessed))
|
||||
}
|
||||
|
||||
type noopCallback struct{}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
// Copyright 2016 Prometheus Team
|
||||
// Copyright The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
@@ -425,22 +425,14 @@ func TestAlertsStoreCallback(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestAlerts_Count(t *testing.T) {
|
||||
func TestAlerts_CountByState(t *testing.T) {
|
||||
marker := types.NewMarker(prometheus.NewRegistry())
|
||||
alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, nil, promslog.NewNopLogger(), nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
states := []types.AlertState{types.AlertStateActive, types.AlertStateSuppressed, types.AlertStateUnprocessed}
|
||||
|
||||
countByState := func(st types.AlertState) int {
|
||||
return alerts.count(st)
|
||||
}
|
||||
countTotal := func() int {
|
||||
var count int
|
||||
for _, st := range states {
|
||||
count += countByState(st)
|
||||
}
|
||||
return count
|
||||
active, suppressed, unprocessed := alerts.countByState()
|
||||
return active + suppressed + unprocessed
|
||||
}
|
||||
|
||||
// First, there shouldn't be any alerts.
|
||||
@@ -462,7 +454,8 @@ func TestAlerts_Count(t *testing.T) {
|
||||
|
||||
ctx := context.Background()
|
||||
alerts.Put(ctx, a1)
|
||||
require.Equal(t, 1, countByState(types.AlertStateUnprocessed))
|
||||
_, _, unprocessed := alerts.countByState()
|
||||
require.Equal(t, 1, unprocessed)
|
||||
require.Equal(t, 1, countTotal())
|
||||
require.Eventually(t, func() bool {
|
||||
// When the alert will eventually expire and is considered resolved - it won't count.
|
||||
@@ -485,7 +478,8 @@ func TestAlerts_Count(t *testing.T) {
|
||||
// When insert an alert, and then silence it. It shows up with the correct filter.
|
||||
alerts.Put(ctx, a2)
|
||||
marker.SetActiveOrSilenced(a2.Fingerprint(), 1, []string{"1"}, nil)
|
||||
require.Equal(t, 1, countByState(types.AlertStateSuppressed))
|
||||
_, suppressed, _ := alerts.countByState()
|
||||
require.Equal(t, 1, suppressed)
|
||||
require.Equal(t, 1, countTotal())
|
||||
|
||||
require.Eventually(t, func() bool {
|
||||
@@ -604,7 +598,8 @@ func TestAlertsConcurrently(t *testing.T) {
|
||||
time.Sleep(expire)
|
||||
require.Eventually(t, func() bool {
|
||||
// When the alert will eventually expire and is considered resolved - it won't count.
|
||||
return a.count(types.AlertStateActive) == 0
|
||||
active, _, _ := a.countByState()
|
||||
return active == 0
|
||||
}, 2*expire, expire)
|
||||
require.Equal(t, int32(0), callback.alerts.Load())
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user