mirror of
https://github.com/prometheus/alertmanager.git
synced 2026-02-05 15:45:34 +01:00
Alert metric reports different results to what the user sees via API (#2943)
* Alert metric reports different results to what the user sees via API Fixes #1439 and #2619. The previous metric is not _technically_ reporting incorrect results as the alerts _are_ still around and will be re-used if that same alert (equal fingerprint) is received before it is GCed. Therefore, I have kept the old metric under a new name `alertmanager_marked_alerts` and repurpose the current metric to match what the user sees in the UI. Signed-off-by: gotjosh <josue.abreu@gmail.com>
This commit is contained in:
@@ -339,7 +339,7 @@ func run() int {
|
||||
go peer.Settle(ctx, *gossipInterval*10)
|
||||
}
|
||||
|
||||
alerts, err := mem.NewAlerts(context.Background(), marker, *alertGCInterval, nil, logger)
|
||||
alerts, err := mem.NewAlerts(context.Background(), marker, *alertGCInterval, nil, logger, prometheus.DefaultRegisterer)
|
||||
if err != nil {
|
||||
level.Error(logger).Log("err", err)
|
||||
return 1
|
||||
|
||||
@@ -366,7 +366,7 @@ route:
|
||||
logger := log.NewNopLogger()
|
||||
route := NewRoute(conf.Route, nil)
|
||||
marker := types.NewMarker(prometheus.NewRegistry())
|
||||
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger)
|
||||
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -504,7 +504,7 @@ route:
|
||||
logger := log.NewNopLogger()
|
||||
route := NewRoute(conf.Route, nil)
|
||||
marker := types.NewMarker(prometheus.NewRegistry())
|
||||
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger)
|
||||
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -625,7 +625,7 @@ func newAlert(labels model.LabelSet) *types.Alert {
|
||||
func TestDispatcherRace(t *testing.T) {
|
||||
logger := log.NewNopLogger()
|
||||
marker := types.NewMarker(prometheus.NewRegistry())
|
||||
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger)
|
||||
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -642,7 +642,7 @@ func TestDispatcherRaceOnFirstAlertNotDeliveredWhenGroupWaitIsZero(t *testing.T)
|
||||
|
||||
logger := log.NewNopLogger()
|
||||
marker := types.NewMarker(prometheus.NewRegistry())
|
||||
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger)
|
||||
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
@@ -20,6 +20,7 @@ import (
|
||||
|
||||
"github.com/go-kit/log"
|
||||
"github.com/go-kit/log/level"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/common/model"
|
||||
|
||||
"github.com/prometheus/alertmanager/provider"
|
||||
@@ -33,8 +34,10 @@ const alertChannelLength = 200
|
||||
type Alerts struct {
|
||||
cancel context.CancelFunc
|
||||
|
||||
alerts *store.Alerts
|
||||
marker types.Marker
|
||||
|
||||
mtx sync.Mutex
|
||||
alerts *store.Alerts
|
||||
listeners map[int]listeningAlerts
|
||||
next int
|
||||
|
||||
@@ -62,14 +65,34 @@ type listeningAlerts struct {
|
||||
done chan struct{}
|
||||
}
|
||||
|
||||
func (a *Alerts) registerMetrics(r prometheus.Registerer) {
|
||||
newMemAlertByStatus := func(s types.AlertState) prometheus.GaugeFunc {
|
||||
return prometheus.NewGaugeFunc(
|
||||
prometheus.GaugeOpts{
|
||||
Name: "alertmanager_alerts",
|
||||
Help: "How many alerts by state.",
|
||||
ConstLabels: prometheus.Labels{"state": string(s)},
|
||||
},
|
||||
func() float64 {
|
||||
return float64(a.count(s))
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
r.MustRegister(newMemAlertByStatus(types.AlertStateActive))
|
||||
r.MustRegister(newMemAlertByStatus(types.AlertStateSuppressed))
|
||||
r.MustRegister(newMemAlertByStatus(types.AlertStateUnprocessed))
|
||||
}
|
||||
|
||||
// NewAlerts returns a new alert provider.
|
||||
func NewAlerts(ctx context.Context, m types.Marker, intervalGC time.Duration, alertCallback AlertStoreCallback, l log.Logger) (*Alerts, error) {
|
||||
func NewAlerts(ctx context.Context, m types.Marker, intervalGC time.Duration, alertCallback AlertStoreCallback, l log.Logger, r prometheus.Registerer) (*Alerts, error) {
|
||||
if alertCallback == nil {
|
||||
alertCallback = noopCallback{}
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithCancel(ctx)
|
||||
a := &Alerts{
|
||||
marker: m,
|
||||
alerts: store.NewAlerts(),
|
||||
cancel: cancel,
|
||||
listeners: map[int]listeningAlerts{},
|
||||
@@ -98,6 +121,11 @@ func NewAlerts(ctx context.Context, m types.Marker, intervalGC time.Duration, al
|
||||
}
|
||||
a.mtx.Unlock()
|
||||
})
|
||||
|
||||
if r != nil {
|
||||
a.registerMetrics(r)
|
||||
}
|
||||
|
||||
go a.alerts.Run(ctx, intervalGC)
|
||||
|
||||
return a, nil
|
||||
@@ -212,6 +240,25 @@ func (a *Alerts) Put(alerts ...*types.Alert) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// count returns the number of non-resolved alerts we currently have stored filtered by the provided state.
|
||||
func (a *Alerts) count(state types.AlertState) int {
|
||||
var count int
|
||||
for _, alert := range a.alerts.List() {
|
||||
if alert.Resolved() {
|
||||
continue
|
||||
}
|
||||
|
||||
status := a.marker.Status(alert.Fingerprint())
|
||||
if status.State != state {
|
||||
continue
|
||||
}
|
||||
|
||||
count++
|
||||
}
|
||||
|
||||
return count
|
||||
}
|
||||
|
||||
type noopCallback struct{}
|
||||
|
||||
func (n noopCallback) PreStore(_ *types.Alert, _ bool) error { return nil }
|
||||
|
||||
@@ -86,7 +86,7 @@ func init() {
|
||||
// a listener can not unsubscribe as the lock is hold by `alerts.Lock`.
|
||||
func TestAlertsSubscribePutStarvation(t *testing.T) {
|
||||
marker := types.NewMarker(prometheus.NewRegistry())
|
||||
alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger())
|
||||
alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger(), nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -137,7 +137,7 @@ func TestAlertsSubscribePutStarvation(t *testing.T) {
|
||||
|
||||
func TestAlertsPut(t *testing.T) {
|
||||
marker := types.NewMarker(prometheus.NewRegistry())
|
||||
alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger())
|
||||
alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger(), nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -165,7 +165,7 @@ func TestAlertsSubscribe(t *testing.T) {
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
alerts, err := NewAlerts(ctx, marker, 30*time.Minute, noopCallback{}, log.NewNopLogger())
|
||||
alerts, err := NewAlerts(ctx, marker, 30*time.Minute, noopCallback{}, log.NewNopLogger(), nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -242,7 +242,7 @@ func TestAlertsSubscribe(t *testing.T) {
|
||||
|
||||
func TestAlertsGetPending(t *testing.T) {
|
||||
marker := types.NewMarker(prometheus.NewRegistry())
|
||||
alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger())
|
||||
alerts, err := NewAlerts(context.Background(), marker, 30*time.Minute, noopCallback{}, log.NewNopLogger(), nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -285,7 +285,7 @@ func TestAlertsGetPending(t *testing.T) {
|
||||
|
||||
func TestAlertsGC(t *testing.T) {
|
||||
marker := types.NewMarker(prometheus.NewRegistry())
|
||||
alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, noopCallback{}, log.NewNopLogger())
|
||||
alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, noopCallback{}, log.NewNopLogger(), nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -322,7 +322,7 @@ func TestAlertsStoreCallback(t *testing.T) {
|
||||
cb := &limitCountCallback{limit: 3}
|
||||
|
||||
marker := types.NewMarker(prometheus.NewRegistry())
|
||||
alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, cb, log.NewNopLogger())
|
||||
alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, cb, log.NewNopLogger(), nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -383,6 +383,74 @@ func TestAlertsStoreCallback(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestAlerts_Count(t *testing.T) {
|
||||
marker := types.NewMarker(prometheus.NewRegistry())
|
||||
alerts, err := NewAlerts(context.Background(), marker, 200*time.Millisecond, nil, log.NewNopLogger(), nil)
|
||||
require.NoError(t, err)
|
||||
|
||||
states := []types.AlertState{types.AlertStateActive, types.AlertStateSuppressed, types.AlertStateUnprocessed}
|
||||
|
||||
countByState := func(st types.AlertState) int {
|
||||
return alerts.count(st)
|
||||
}
|
||||
countTotal := func() int {
|
||||
var count int
|
||||
for _, st := range states {
|
||||
count += countByState(st)
|
||||
}
|
||||
return count
|
||||
}
|
||||
|
||||
// First, there shouldn't be any alerts.
|
||||
require.Equal(t, 0, countTotal())
|
||||
|
||||
// When you insert a new alert that will eventually be active, it should be unprocessed first.
|
||||
now := time.Now()
|
||||
a1 := &types.Alert{
|
||||
Alert: model.Alert{
|
||||
Labels: model.LabelSet{"bar": "foo"},
|
||||
Annotations: model.LabelSet{"foo": "bar"},
|
||||
StartsAt: now,
|
||||
EndsAt: now.Add(400 * time.Millisecond),
|
||||
GeneratorURL: "http://example.com/prometheus",
|
||||
},
|
||||
UpdatedAt: now,
|
||||
Timeout: false,
|
||||
}
|
||||
|
||||
alerts.Put(a1)
|
||||
require.Equal(t, 1, countByState(types.AlertStateUnprocessed))
|
||||
require.Equal(t, 1, countTotal())
|
||||
require.Eventually(t, func() bool {
|
||||
// When the alert will eventually expire and is considered resolved - it won't count.
|
||||
return countTotal() == 0
|
||||
}, 600*time.Millisecond, 100*time.Millisecond)
|
||||
|
||||
now = time.Now()
|
||||
a2 := &types.Alert{
|
||||
Alert: model.Alert{
|
||||
Labels: model.LabelSet{"bar": "foo"},
|
||||
Annotations: model.LabelSet{"foo": "bar"},
|
||||
StartsAt: now,
|
||||
EndsAt: now.Add(400 * time.Millisecond),
|
||||
GeneratorURL: "http://example.com/prometheus",
|
||||
},
|
||||
UpdatedAt: now,
|
||||
Timeout: false,
|
||||
}
|
||||
|
||||
// When insert an alert, and then silence it. It shows up with the correct filter.
|
||||
alerts.Put(a2)
|
||||
marker.SetSilenced(a2.Fingerprint(), 1, []string{"1"}, nil)
|
||||
require.Equal(t, 1, countByState(types.AlertStateSuppressed))
|
||||
require.Equal(t, 1, countTotal())
|
||||
|
||||
require.Eventually(t, func() bool {
|
||||
// When the alert will eventually expire and is considered resolved - it won't count.
|
||||
return countTotal() == 0
|
||||
}, 600*time.Millisecond, 100*time.Millisecond)
|
||||
}
|
||||
|
||||
func alertsEqual(a1, a2 *types.Alert) bool {
|
||||
if a1 == nil || a2 == nil {
|
||||
return false
|
||||
|
||||
@@ -108,11 +108,11 @@ type memMarker struct {
|
||||
}
|
||||
|
||||
func (m *memMarker) registerMetrics(r prometheus.Registerer) {
|
||||
newAlertMetricByState := func(st AlertState) prometheus.GaugeFunc {
|
||||
newMarkedAlertMetricByState := func(st AlertState) prometheus.GaugeFunc {
|
||||
return prometheus.NewGaugeFunc(
|
||||
prometheus.GaugeOpts{
|
||||
Name: "alertmanager_alerts",
|
||||
Help: "How many alerts by state.",
|
||||
Name: "alertmanager_marked_alerts",
|
||||
Help: "How many alerts by state are currently marked in the Alertmanager regardless of their expiry.",
|
||||
ConstLabels: prometheus.Labels{"state": string(st)},
|
||||
},
|
||||
func() float64 {
|
||||
@@ -121,11 +121,13 @@ func (m *memMarker) registerMetrics(r prometheus.Registerer) {
|
||||
)
|
||||
}
|
||||
|
||||
alertsActive := newAlertMetricByState(AlertStateActive)
|
||||
alertsSuppressed := newAlertMetricByState(AlertStateSuppressed)
|
||||
alertsActive := newMarkedAlertMetricByState(AlertStateActive)
|
||||
alertsSuppressed := newMarkedAlertMetricByState(AlertStateSuppressed)
|
||||
alertStateUnprocessed := newMarkedAlertMetricByState(AlertStateUnprocessed)
|
||||
|
||||
r.MustRegister(alertsActive)
|
||||
r.MustRegister(alertsSuppressed)
|
||||
r.MustRegister(alertStateUnprocessed)
|
||||
}
|
||||
|
||||
// Count implements Marker.
|
||||
|
||||
@@ -25,6 +25,58 @@ import (
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestMemMarker_Count(t *testing.T) {
|
||||
r := prometheus.NewRegistry()
|
||||
marker := NewMarker(r)
|
||||
now := time.Now()
|
||||
|
||||
states := []AlertState{AlertStateSuppressed, AlertStateActive, AlertStateUnprocessed}
|
||||
countByState := func(state AlertState) int {
|
||||
return marker.Count(state)
|
||||
}
|
||||
|
||||
countTotal := func() int {
|
||||
var count int
|
||||
for _, s := range states {
|
||||
count += countByState(s)
|
||||
}
|
||||
return count
|
||||
}
|
||||
|
||||
require.Equal(t, 0, countTotal())
|
||||
|
||||
a1 := model.Alert{
|
||||
StartsAt: now.Add(-2 * time.Minute),
|
||||
EndsAt: now.Add(2 * time.Minute),
|
||||
Labels: model.LabelSet{"test": "active"},
|
||||
}
|
||||
a2 := model.Alert{
|
||||
StartsAt: now.Add(-2 * time.Minute),
|
||||
EndsAt: now.Add(2 * time.Minute),
|
||||
Labels: model.LabelSet{"test": "suppressed"},
|
||||
}
|
||||
a3 := model.Alert{
|
||||
StartsAt: now.Add(-2 * time.Minute),
|
||||
EndsAt: now.Add(-1 * time.Minute),
|
||||
Labels: model.LabelSet{"test": "resolved"},
|
||||
}
|
||||
|
||||
// Insert an active alert.
|
||||
marker.SetSilenced(a1.Fingerprint(), 1, nil, nil)
|
||||
require.Equal(t, 1, countByState(AlertStateActive))
|
||||
require.Equal(t, 1, countTotal())
|
||||
|
||||
// Insert a suppressed alert.
|
||||
marker.SetSilenced(a2.Fingerprint(), 1, []string{"1"}, nil)
|
||||
require.Equal(t, 1, countByState(AlertStateSuppressed))
|
||||
require.Equal(t, 2, countTotal())
|
||||
|
||||
// Insert a resolved alert - it'll count as active.
|
||||
marker.SetSilenced(a3.Fingerprint(), 1, []string{"1"}, nil)
|
||||
require.Equal(t, 1, countByState(AlertStateActive))
|
||||
require.Equal(t, 3, countTotal())
|
||||
}
|
||||
|
||||
func TestAlertMerge(t *testing.T) {
|
||||
now := time.Now()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user