1
0
mirror of https://github.com/prometheus/alertmanager.git synced 2026-02-05 15:45:34 +01:00

fix(provider): reduce lock contention (#4809)

The provider loops over all alerts per state, which results in 3 loops
over all store alerts and one call per alert to the marker per loop.

Add a custom collector for counting alerts by state.
This reduces the number of calls to the store and the marker to one third.

Signed-off-by: Siavash Safi <siavash@cloudflare.com>
This commit is contained in:
Siavash Safi
2025-12-18 08:14:59 +01:00
committed by GitHub
parent 1bab012c4c
commit 5d9cce4599
2 changed files with 44 additions and 41 deletions

View File

@@ -1,4 +1,4 @@
// Copyright 2016 Prometheus Team
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
@@ -77,22 +77,7 @@ type listeningAlerts struct {
}
func (a *Alerts) registerMetrics(r prometheus.Registerer) {
newMemAlertByStatus := func(s types.AlertState) prometheus.GaugeFunc {
return promauto.With(r).NewGaugeFunc(
prometheus.GaugeOpts{
Name: "alertmanager_alerts",
Help: "How many alerts by state.",
ConstLabels: prometheus.Labels{"state": string(s)},
},
func() float64 {
return float64(a.count(s))
},
)
}
newMemAlertByStatus(types.AlertStateActive)
newMemAlertByStatus(types.AlertStateSuppressed)
newMemAlertByStatus(types.AlertStateUnprocessed)
r.MustRegister(&alertsCollector{alerts: a})
a.subscriberChannelWrites = promauto.With(r).NewCounterVec(
prometheus.CounterOpts{
@@ -311,23 +296,46 @@ func (a *Alerts) Put(ctx context.Context, alerts ...*types.Alert) error {
return nil
}
// count returns the number of non-resolved alerts we currently have stored filtered by the provided state.
func (a *Alerts) count(state types.AlertState) int {
var count int
// countByState returns the number of non-resolved alerts by state.
func (a *Alerts) countByState() (active, suppressed, unprocessed int) {
for _, alert := range a.alerts.List() {
if alert.Resolved() {
continue
}
status := a.marker.Status(alert.Fingerprint())
if status.State != state {
continue
switch a.marker.Status(alert.Fingerprint()).State {
case types.AlertStateActive:
active++
case types.AlertStateSuppressed:
suppressed++
case types.AlertStateUnprocessed:
unprocessed++
}
count++
}
return active, suppressed, unprocessed
}
return count
// alertsCollector implements prometheus.Collector so that the alert counts for
// every state are gathered in a single pass over the stored alerts instead of
// one pass per state.
type alertsCollector struct {
// alerts is the provider whose countByState method supplies the counts.
alerts *Alerts
}
// alertsDesc describes the "alertmanager_alerts" metric. It declares a single
// variable label, "state", and no constant labels.
var alertsDesc = prometheus.NewDesc(
"alertmanager_alerts",
"How many alerts by state.",
[]string{"state"}, nil,
)
// Describe implements prometheus.Collector by sending the one descriptor this
// collector exposes, alertsDesc, on the supplied channel.
func (c *alertsCollector) Describe(descs chan<- *prometheus.Desc) {
	descs <- alertsDesc
}
// Collect implements prometheus.Collector. It tallies the non-resolved alerts
// once via countByState and then sends one gauge sample per alert state on ch,
// in the order active, suppressed, unprocessed.
func (c *alertsCollector) Collect(ch chan<- prometheus.Metric) {
	active, suppressed, unprocessed := c.alerts.countByState()

	// Pair each state with its tally so every sample is built the same way.
	samples := [...]struct {
		state types.AlertState
		total int
	}{
		{types.AlertStateActive, active},
		{types.AlertStateSuppressed, suppressed},
		{types.AlertStateUnprocessed, unprocessed},
	}
	for _, s := range samples {
		ch <- prometheus.MustNewConstMetric(alertsDesc, prometheus.GaugeValue, float64(s.total), string(s.state))
	}
}
// noopCallback is an empty struct type.
// NOTE(review): presumably a do-nothing alert callback implementation; its
// methods are outside this view — confirm.
type noopCallback struct{}

View File

@@ -1,4 +1,4 @@
// Copyright 2016 Prometheus Team
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
@@ -425,22 +425,14 @@ func TestAlertsStoreCallback(t *testing.T) {
}
}
func TestAlerts_Count(t *testing.T) {
func TestAlerts_CountByState(t *testing.T) {
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, nil, promslog.NewNopLogger(), nil)
require.NoError(t, err)
states := []types.AlertState{types.AlertStateActive, types.AlertStateSuppressed, types.AlertStateUnprocessed}
countByState := func(st types.AlertState) int {
return alerts.count(st)
}
countTotal := func() int {
var count int
for _, st := range states {
count += countByState(st)
}
return count
active, suppressed, unprocessed := alerts.countByState()
return active + suppressed + unprocessed
}
// First, there shouldn't be any alerts.
@@ -462,7 +454,8 @@ func TestAlerts_Count(t *testing.T) {
ctx := context.Background()
alerts.Put(ctx, a1)
require.Equal(t, 1, countByState(types.AlertStateUnprocessed))
_, _, unprocessed := alerts.countByState()
require.Equal(t, 1, unprocessed)
require.Equal(t, 1, countTotal())
require.Eventually(t, func() bool {
// Once the alert eventually expires and is considered resolved, it no longer counts.
@@ -485,7 +478,8 @@ func TestAlerts_Count(t *testing.T) {
// When we insert an alert and then silence it, it shows up with the correct filter.
alerts.Put(ctx, a2)
marker.SetActiveOrSilenced(a2.Fingerprint(), 1, []string{"1"}, nil)
require.Equal(t, 1, countByState(types.AlertStateSuppressed))
_, suppressed, _ := alerts.countByState()
require.Equal(t, 1, suppressed)
require.Equal(t, 1, countTotal())
require.Eventually(t, func() bool {
@@ -604,7 +598,8 @@ func TestAlertsConcurrently(t *testing.T) {
time.Sleep(expire)
require.Eventually(t, func() bool {
// Once the alert eventually expires and is considered resolved, it no longer counts.
return a.count(types.AlertStateActive) == 0
active, _, _ := a.countByState()
return active == 0
}, 2*expire, expire)
require.Equal(t, int32(0), callback.alerts.Load())
}