1
0
mirror of https://github.com/prometheus/alertmanager.git synced 2026-02-05 15:45:34 +01:00

Alert metric reports different results to what the user sees via API (#2943)

* Alert metric reports different results to what the user sees via API

Fixes #1439 and #2619.

The previous metric is not _technically_ reporting incorrect results, as the alerts _are_ still around and will be re-used if that same alert (equal fingerprint) is received before it is GCed. Therefore, I have kept the old metric under a new name `alertmanager_marked_alerts` and repurposed the current metric to match what the user sees in the UI.

Signed-off-by: gotjosh <josue.abreu@gmail.com>
This commit is contained in:
gotjosh
2022-06-16 11:16:06 +01:00
committed by GitHub
parent 4d09995c26
commit 805e505288
6 changed files with 187 additions and 18 deletions

View File

@@ -339,7 +339,7 @@ func run() int {
go peer.Settle(ctx, *gossipInterval*10)
}
alerts, err := mem.NewAlerts(context.Background(), marker, *alertGCInterval, nil, logger)
alerts, err := mem.NewAlerts(context.Background(), marker, *alertGCInterval, nil, logger, prometheus.DefaultRegisterer)
if err != nil {
level.Error(logger).Log("err", err)
return 1

View File

@@ -366,7 +366,7 @@ route:
logger := log.NewNopLogger()
route := NewRoute(conf.Route, nil)
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger)
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil)
if err != nil {
t.Fatal(err)
}
@@ -504,7 +504,7 @@ route:
logger := log.NewNopLogger()
route := NewRoute(conf.Route, nil)
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger)
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil)
if err != nil {
t.Fatal(err)
}
@@ -625,7 +625,7 @@ func newAlert(labels model.LabelSet) *types.Alert {
func TestDispatcherRace(t *testing.T) {
logger := log.NewNopLogger()
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger)
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil)
if err != nil {
t.Fatal(err)
}
@@ -642,7 +642,7 @@ func TestDispatcherRaceOnFirstAlertNotDeliveredWhenGroupWaitIsZero(t *testing.T)
logger := log.NewNopLogger()
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger)
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil)
if err != nil {
t.Fatal(err)
}

View File

@@ -20,6 +20,7 @@ import (
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/model"
"github.com/prometheus/alertmanager/provider"
@@ -33,8 +34,10 @@ const alertChannelLength = 200
type Alerts struct {
cancel context.CancelFunc
alerts *store.Alerts
marker types.Marker
mtx sync.Mutex
alerts *store.Alerts
listeners map[int]listeningAlerts
next int
@@ -62,14 +65,34 @@ type listeningAlerts struct {
done chan struct{}
}
func (a *Alerts) registerMetrics(r prometheus.Registerer) {
newMemAlertByStatus := func(s types.AlertState) prometheus.GaugeFunc {
return prometheus.NewGaugeFunc(
prometheus.GaugeOpts{
Name: "alertmanager_alerts",
Help: "How many alerts by state.",
ConstLabels: prometheus.Labels{"state": string(s)},
},
func() float64 {
return float64(a.count(s))
},
)
}
r.MustRegister(newMemAlertByStatus(types.AlertStateActive))
r.MustRegister(newMemAlertByStatus(types.AlertStateSuppressed))
r.MustRegister(newMemAlertByStatus(types.AlertStateUnprocessed))
}
// NewAlerts returns a new alert provider.
func NewAlerts(ctx context.Context, m types.Marker, intervalGC time.Duration, alertCallback AlertStoreCallback, l log.Logger) (*Alerts, error) {
func NewAlerts(ctx context.Context, m types.Marker, intervalGC time.Duration, alertCallback AlertStoreCallback, l log.Logger, r prometheus.Registerer) (*Alerts, error) {
if alertCallback == nil {
alertCallback = noopCallback{}
}
ctx, cancel := context.WithCancel(ctx)
a := &Alerts{
marker: m,
alerts: store.NewAlerts(),
cancel: cancel,
listeners: map[int]listeningAlerts{},
@@ -98,6 +121,11 @@ func NewAlerts(ctx context.Context, m types.Marker, intervalGC time.Duration, al
}
a.mtx.Unlock()
})
if r != nil {
a.registerMetrics(r)
}
go a.alerts.Run(ctx, intervalGC)
return a, nil
@@ -212,6 +240,25 @@ func (a *Alerts) Put(alerts ...*types.Alert) error {
return nil
}
// count returns the number of non-resolved alerts we currently have stored filtered by the provided state.
func (a *Alerts) count(state types.AlertState) int {
var count int
for _, alert := range a.alerts.List() {
if alert.Resolved() {
continue
}
status := a.marker.Status(alert.Fingerprint())
if status.State != state {
continue
}
count++
}
return count
}
type noopCallback struct{}
func (n noopCallback) PreStore(_ *types.Alert, _ bool) error { return nil }

View File

@@ -86,7 +86,7 @@ func init() {
// a listener can not unsubscribe as the lock is hold by `alerts.Lock`.
func TestAlertsSubscribePutStarvation(t *testing.T) {
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger())
alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger(), nil)
if err != nil {
t.Fatal(err)
}
@@ -137,7 +137,7 @@ func TestAlertsSubscribePutStarvation(t *testing.T) {
func TestAlertsPut(t *testing.T) {
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger())
alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger(), nil)
if err != nil {
t.Fatal(err)
}
@@ -165,7 +165,7 @@ func TestAlertsSubscribe(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
alerts, err := NewAlerts(ctx, marker, 30*time.Minute, noopCallback{}, log.NewNopLogger())
alerts, err := NewAlerts(ctx, marker, 30*time.Minute, noopCallback{}, log.NewNopLogger(), nil)
if err != nil {
t.Fatal(err)
}
@@ -242,7 +242,7 @@ func TestAlertsSubscribe(t *testing.T) {
func TestAlertsGetPending(t *testing.T) {
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger())
alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger(), nil)
if err != nil {
t.Fatal(err)
}
@@ -285,7 +285,7 @@ func TestAlertsGetPending(t *testing.T) {
func TestAlertsGC(t *testing.T) {
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, noopCallback{}, log.NewNopLogger())
alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, noopCallback{}, log.NewNopLogger(), nil)
if err != nil {
t.Fatal(err)
}
@@ -322,7 +322,7 @@ func TestAlertsStoreCallback(t *testing.T) {
cb := &limitCountCallback{limit: 3}
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, cb, log.NewNopLogger())
alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, cb, log.NewNopLogger(), nil)
if err != nil {
t.Fatal(err)
}
@@ -383,6 +383,74 @@ func TestAlertsStoreCallback(t *testing.T) {
}
}
func TestAlerts_Count(t *testing.T) {
marker := types.NewMarker(prometheus.NewRegistry())
alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, nil, log.NewNopLogger(), nil)
require.NoError(t, err)
states := []types.AlertState{types.AlertStateActive, types.AlertStateSuppressed, types.AlertStateUnprocessed}
countByState := func(st types.AlertState) int {
return alerts.count(st)
}
countTotal := func() int {
var count int
for _, st := range states {
count += countByState(st)
}
return count
}
// First, there shouldn't be any alerts.
require.Equal(t, 0, countTotal())
// When you insert a new alert that will eventually be active, it should be unprocessed first.
now := time.Now()
a1 := &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{"bar": "foo"},
Annotations: model.LabelSet{"foo": "bar"},
StartsAt: now,
EndsAt: now.Add(400 * time.Millisecond),
GeneratorURL: "http://example.com/prometheus",
},
UpdatedAt: now,
Timeout: false,
}
alerts.Put(a1)
require.Equal(t, 1, countByState(types.AlertStateUnprocessed))
require.Equal(t, 1, countTotal())
require.Eventually(t, func() bool {
// When the alert will eventually expire and is considered resolved - it won't count.
return countTotal() == 0
}, 600*time.Millisecond, 100*time.Millisecond)
now = time.Now()
a2 := &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{"bar": "foo"},
Annotations: model.LabelSet{"foo": "bar"},
StartsAt: now,
EndsAt: now.Add(400 * time.Millisecond),
GeneratorURL: "http://example.com/prometheus",
},
UpdatedAt: now,
Timeout: false,
}
// When insert an alert, and then silence it. It shows up with the correct filter.
alerts.Put(a2)
marker.SetSilenced(a2.Fingerprint(), 1, []string{"1"}, nil)
require.Equal(t, 1, countByState(types.AlertStateSuppressed))
require.Equal(t, 1, countTotal())
require.Eventually(t, func() bool {
// When the alert will eventually expire and is considered resolved - it won't count.
return countTotal() == 0
}, 600*time.Millisecond, 100*time.Millisecond)
}
func alertsEqual(a1, a2 *types.Alert) bool {
if a1 == nil || a2 == nil {
return false

View File

@@ -108,11 +108,11 @@ type memMarker struct {
}
func (m *memMarker) registerMetrics(r prometheus.Registerer) {
newAlertMetricByState := func(st AlertState) prometheus.GaugeFunc {
newMarkedAlertMetricByState := func(st AlertState) prometheus.GaugeFunc {
return prometheus.NewGaugeFunc(
prometheus.GaugeOpts{
Name: "alertmanager_alerts",
Help: "How many alerts by state.",
Name: "alertmanager_marked_alerts",
Help: "How many alerts by state are currently marked in the Alertmanager regardless of their expiry.",
ConstLabels: prometheus.Labels{"state": string(st)},
},
func() float64 {
@@ -121,11 +121,13 @@ func (m *memMarker) registerMetrics(r prometheus.Registerer) {
)
}
alertsActive := newAlertMetricByState(AlertStateActive)
alertsSuppressed := newAlertMetricByState(AlertStateSuppressed)
alertsActive := newMarkedAlertMetricByState(AlertStateActive)
alertsSuppressed := newMarkedAlertMetricByState(AlertStateSuppressed)
alertStateUnprocessed := newMarkedAlertMetricByState(AlertStateUnprocessed)
r.MustRegister(alertsActive)
r.MustRegister(alertsSuppressed)
r.MustRegister(alertStateUnprocessed)
}
// Count implements Marker.

View File

@@ -25,6 +25,58 @@ import (
"github.com/stretchr/testify/require"
)
func TestMemMarker_Count(t *testing.T) {
r := prometheus.NewRegistry()
marker := NewMarker(r)
now := time.Now()
states := []AlertState{AlertStateSuppressed, AlertStateActive, AlertStateUnprocessed}
countByState := func(state AlertState) int {
return marker.Count(state)
}
countTotal := func() int {
var count int
for _, s := range states {
count += countByState(s)
}
return count
}
require.Equal(t, 0, countTotal())
a1 := model.Alert{
StartsAt: now.Add(-2 * time.Minute),
EndsAt: now.Add(2 * time.Minute),
Labels: model.LabelSet{"test": "active"},
}
a2 := model.Alert{
StartsAt: now.Add(-2 * time.Minute),
EndsAt: now.Add(2 * time.Minute),
Labels: model.LabelSet{"test": "suppressed"},
}
a3 := model.Alert{
StartsAt: now.Add(-2 * time.Minute),
EndsAt: now.Add(-1 * time.Minute),
Labels: model.LabelSet{"test": "resolved"},
}
// Insert an active alert.
marker.SetSilenced(a1.Fingerprint(), 1, nil, nil)
require.Equal(t, 1, countByState(AlertStateActive))
require.Equal(t, 1, countTotal())
// Insert a suppressed alert.
marker.SetSilenced(a2.Fingerprint(), 1, []string{"1"}, nil)
require.Equal(t, 1, countByState(AlertStateSuppressed))
require.Equal(t, 2, countTotal())
// Insert a resolved alert - it'll count as active.
marker.SetSilenced(a3.Fingerprint(), 1, []string{"1"}, nil)
require.Equal(t, 1, countByState(AlertStateActive))
require.Equal(t, 3, countTotal())
}
func TestAlertMerge(t *testing.T) {
now := time.Now()