alertmanager/dispatch/dispatch_test.go

// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package dispatch
import (
"context"
"fmt"
"log/slog"
"reflect"
"sort"
"sync"
"testing"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
"github.com/prometheus/common/model"
"github.com/prometheus/common/promslog"
"github.com/stretchr/testify/require"
"github.com/prometheus/alertmanager/config"
"github.com/prometheus/alertmanager/notify"
"github.com/prometheus/alertmanager/provider/mem"
"github.com/prometheus/alertmanager/types"
)
const testMaintenanceInterval = 30 * time.Second
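// TestAggrGroup verifies that an aggregation group waits for group_wait before the first
// flush, re-flushes every group_interval, and empties itself once all alerts are resolved.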
func TestAggrGroup(t *testing.T) {
lset := model.LabelSet{
"a": "v1",
"b": "v2",
}
opts := &RouteOpts{
Receiver: "n1",
GroupBy: map[model.LabelName]struct{}{
"a": {},
"b": {},
},
GroupWait: 1 * time.Second,
GroupInterval: 300 * time.Millisecond,
RepeatInterval: 1 * time.Hour,
}
route := &Route{
RouteOpts: *opts,
}
var (
a1 = &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{
"a": "v1",
"b": "v2",
"c": "v3",
},
StartsAt: time.Now().Add(time.Minute),
EndsAt: time.Now().Add(time.Hour),
},
UpdatedAt: time.Now(),
}
a2 = &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{
"a": "v1",
"b": "v2",
"c": "v4",
},
StartsAt: time.Now().Add(-time.Hour),
EndsAt: time.Now().Add(2 * time.Hour),
},
UpdatedAt: time.Now(),
}
a3 = &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{
"a": "v1",
"b": "v2",
"c": "v5",
},
StartsAt: time.Now().Add(time.Minute),
EndsAt: time.Now().Add(5 * time.Minute),
},
UpdatedAt: time.Now(),
}
)
var (
last = time.Now()
current = time.Now()
lastCurMtx = &sync.Mutex{}
alertsCh = make(chan types.AlertSlice)
)
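// ntfy stands in for the notification pipeline: it validates the context it receives,
// records the flush timestamps, and forwards the batch for inspection.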
ntfy := func(ctx context.Context, alerts ...*types.Alert) bool {
// Validate that the context is properly populated.
if _, ok := notify.Now(ctx); !ok {
t.Errorf("now missing")
}
if _, ok := notify.GroupKey(ctx); !ok {
t.Errorf("group key missing")
}
if lbls, ok := notify.GroupLabels(ctx); !ok || !reflect.DeepEqual(lbls, lset) {
t.Errorf("wrong group labels: %q", lbls)
}
if rcv, ok := notify.ReceiverName(ctx); !ok || rcv != opts.Receiver {
t.Errorf("wrong receiver: %q", rcv)
}
if ri, ok := notify.RepeatInterval(ctx); !ok || ri != opts.RepeatInterval {
t.Errorf("wrong repeat interval: %q", ri)
}
lastCurMtx.Lock()
last = current
// Subtract a millisecond to allow for races.
current = time.Now().Add(-time.Millisecond)
lastCurMtx.Unlock()
alertsCh <- types.AlertSlice(alerts)
return true
}
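// removeEndsAt clears EndsAt on the expected alerts, mirroring how the aggregation group
// zeroes EndsAt for alerts that have not resolved yet before notifying.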
removeEndsAt := func(as types.AlertSlice) types.AlertSlice {
for i, a := range as {
ac := *a
ac.EndsAt = time.Time{}
as[i] = &ac
}
return as
}
// Test the regular situation where we wait for group_wait before sending out alerts.
ag := newAggrGroup(context.Background(), lset, route, nil, types.NewMarker(prometheus.NewRegistry()), promslog.NewNopLogger())
go ag.run(ntfy)
ag.insert(a1)
select {
case <-time.After(2 * opts.GroupWait):
t.Fatalf("expected initial batch after group_wait")
case batch := <-alertsCh:
lastCurMtx.Lock()
s := time.Since(last)
lastCurMtx.Unlock()
if s < opts.GroupWait {
t.Fatalf("received batch too early after %v", s)
}
exp := removeEndsAt(types.AlertSlice{a1})
sort.Sort(batch)
if !reflect.DeepEqual(batch, exp) {
t.Fatalf("expected alerts %v but got %v", exp, batch)
}
}
for range 3 {
// New alert should come in after group interval.
ag.insert(a3)
select {
case <-time.After(2 * opts.GroupInterval):
t.Fatalf("expected new batch after group interval but received none")
case batch := <-alertsCh:
lastCurMtx.Lock()
s := time.Since(last)
lastCurMtx.Unlock()
if s < opts.GroupInterval {
t.Fatalf("received batch too early after %v", s)
}
exp := removeEndsAt(types.AlertSlice{a1, a3})
sort.Sort(batch)
if !reflect.DeepEqual(batch, exp) {
t.Fatalf("expected alerts %v but got %v", exp, batch)
}
}
}
ag.stop()
// Finally, set all alerts to be resolved. After a successful notify, the aggregation
// group should empty itself.
ag = newAggrGroup(context.Background(), lset, route, nil, types.NewMarker(prometheus.NewRegistry()), promslog.NewNopLogger())
go ag.run(ntfy)
ag.insert(a1)
ag.insert(a2)
batch := <-alertsCh
exp := removeEndsAt(types.AlertSlice{a1, a2})
sort.Sort(batch)
if !reflect.DeepEqual(batch, exp) {
t.Fatalf("expected alerts %v but got %v", exp, batch)
}
for range 3 {
// New alert should come in after group interval.
ag.insert(a3)
select {
case <-time.After(2 * opts.GroupInterval):
t.Fatalf("expected new batch after group interval but received none")
case batch := <-alertsCh:
lastCurMtx.Lock()
s := time.Since(last)
lastCurMtx.Unlock()
if s < opts.GroupInterval {
t.Fatalf("received batch too early after %v", s)
}
exp := removeEndsAt(types.AlertSlice{a1, a2, a3})
sort.Sort(batch)
if !reflect.DeepEqual(batch, exp) {
t.Fatalf("expected alerts %v but got %v", exp, batch)
}
}
}
// Resolve an alert; it should be removed after the next batch has been sent.
a1r := *a1
a1r.EndsAt = time.Now()
ag.insert(&a1r)
exp = append(types.AlertSlice{&a1r}, removeEndsAt(types.AlertSlice{a2, a3})...)
select {
case <-time.After(2 * opts.GroupInterval):
t.Fatalf("expected new batch after group interval but received none")
case batch := <-alertsCh:
lastCurMtx.Lock()
s := time.Since(last)
lastCurMtx.Unlock()
if s < opts.GroupInterval {
t.Fatalf("received batch too early after %v", s)
}
sort.Sort(batch)
if !reflect.DeepEqual(batch, exp) {
t.Fatalf("expected alerts %v but got %v", exp, batch)
}
}
// Resolve all remaining alerts; they should be removed after the next batch has been sent.
// Do not add a1r as it should have been deleted following the previous batch.
a2r, a3r := *a2, *a3
resolved := types.AlertSlice{&a2r, &a3r}
for _, a := range resolved {
a.EndsAt = time.Now()
ag.insert(a)
}
select {
case <-time.After(2 * opts.GroupInterval):
t.Fatalf("expected new batch after group interval but received none")
case batch := <-alertsCh:
lastCurMtx.Lock()
s := time.Since(last)
lastCurMtx.Unlock()
if s < opts.GroupInterval {
t.Fatalf("received batch too early after %v", s)
}
sort.Sort(batch)
if !reflect.DeepEqual(batch, resolved) {
t.Fatalf("expected alerts %v but got %v", resolved, batch)
}
if !ag.empty() {
t.Fatalf("Expected aggregation group to be empty after resolving alerts: %v", ag)
}
}
ag.stop()
}
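// TestGroupLabels checks that getGroupLabels keeps only the labels listed in the route's group_by.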
func TestGroupLabels(t *testing.T) {
a := &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{
"a": "v1",
"b": "v2",
"c": "v3",
},
},
}
route := &Route{
RouteOpts: RouteOpts{
GroupBy: map[model.LabelName]struct{}{
"a": {},
"b": {},
},
GroupByAll: false,
},
}
expLs := model.LabelSet{
"a": "v1",
"b": "v2",
}
ls := getGroupLabels(a, route)
if !reflect.DeepEqual(ls, expLs) {
t.Fatalf("expected labels are %v, but got %v", expLs, ls)
}
}
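// TestGroupByAllLabels checks that getGroupLabels returns all of the alert's labels when group_by_all is set.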
func TestGroupByAllLabels(t *testing.T) {
a := &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{
"a": "v1",
"b": "v2",
"c": "v3",
},
},
}
route := &Route{
RouteOpts: RouteOpts{
GroupBy: map[model.LabelName]struct{}{},
GroupByAll: true,
},
}
expLs := model.LabelSet{
"a": "v1",
"b": "v2",
"c": "v3",
}
ls := getGroupLabels(a, route)
if !reflect.DeepEqual(ls, expLs) {
t.Fatalf("expected labels are %v, but got %v", expLs, ls)
}
}
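// TestGroups routes a set of alerts through a small routing tree and verifies the
// aggregation groups and receivers reported by Dispatcher.Groups.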
func TestGroups(t *testing.T) {
confData := `receivers:
- name: 'kafka'
- name: 'prod'
- name: 'testing'
route:
group_by: ['alertname']
group_wait: 10ms
group_interval: 10ms
receiver: 'prod'
routes:
- match:
env: 'testing'
receiver: 'testing'
group_by: ['alertname', 'service']
- match:
env: 'prod'
receiver: 'prod'
group_by: ['alertname', 'service', 'cluster']
continue: true
- match:
kafka: 'yes'
receiver: 'kafka'
group_by: ['alertname', 'service', 'cluster']`
conf, err := config.Load(confData)
if err != nil {
t.Fatal(err)
}
logger := promslog.NewNopLogger()
route := NewRoute(conf.Route, nil)
reg := prometheus.NewRegistry()
marker := types.NewMarker(reg)
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, reg)
if err != nil {
t.Fatal(err)
}
defer alerts.Close()
timeout := func(d time.Duration) time.Duration { return time.Duration(0) }
recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*types.Alert)}
dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, testMaintenanceInterval, nil, logger, NewDispatcherMetrics(false, reg))
go dispatcher.Run(time.Now())
defer dispatcher.Stop()
// Create alerts. The dispatcher will automatically create the groups.
inputAlerts := []*types.Alert{
// Matches the parent route.
newAlert(model.LabelSet{"alertname": "OtherAlert", "cluster": "cc", "service": "dd"}),
// Matches the first sub-route.
newAlert(model.LabelSet{"env": "testing", "alertname": "TestingAlert", "service": "api", "instance": "inst1"}),
// Matches the second sub-route.
newAlert(model.LabelSet{"env": "prod", "alertname": "HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst1"}),
newAlert(model.LabelSet{"env": "prod", "alertname": "HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst2"}),
// Matches the second sub-route.
newAlert(model.LabelSet{"env": "prod", "alertname": "HighErrorRate", "cluster": "bb", "service": "api", "instance": "inst1"}),
// Matches the second and third sub-route.
newAlert(model.LabelSet{"env": "prod", "alertname": "HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst3"}),
}
alerts.Put(inputAlerts...)
// Let alerts get processed.
for i := 0; len(recorder.Alerts()) != 7 && i < 10; i++ {
time.Sleep(200 * time.Millisecond)
}
require.Len(t, recorder.Alerts(), 7)
alertGroups, receivers, _ := dispatcher.Groups(context.Background(),
func(*Route) bool {
return true
}, func(*types.Alert, time.Time) bool {
return true
},
)
require.Equal(t, AlertGroups{
&AlertGroup{
Alerts: []*types.Alert{inputAlerts[0]},
Labels: model.LabelSet{
"alertname": "OtherAlert",
},
Receiver: "prod",
GroupKey: "{}:{alertname=\"OtherAlert\"}",
RouteID: "{}",
},
&AlertGroup{
Alerts: []*types.Alert{inputAlerts[1]},
Labels: model.LabelSet{
"alertname": "TestingAlert",
"service": "api",
},
Receiver: "testing",
GroupKey: "{}/{env=\"testing\"}:{alertname=\"TestingAlert\", service=\"api\"}",
RouteID: "{}/{env=\"testing\"}/0",
},
&AlertGroup{
Alerts: []*types.Alert{inputAlerts[2], inputAlerts[3]},
Labels: model.LabelSet{
"alertname": "HighErrorRate",
"service": "api",
"cluster": "aa",
},
Receiver: "prod",
GroupKey: "{}/{env=\"prod\"}:{alertname=\"HighErrorRate\", cluster=\"aa\", service=\"api\"}",
RouteID: "{}/{env=\"prod\"}/1",
},
&AlertGroup{
Alerts: []*types.Alert{inputAlerts[4]},
Labels: model.LabelSet{
"alertname": "HighErrorRate",
"service": "api",
"cluster": "bb",
},
Receiver: "prod",
GroupKey: "{}/{env=\"prod\"}:{alertname=\"HighErrorRate\", cluster=\"bb\", service=\"api\"}",
RouteID: "{}/{env=\"prod\"}/1",
},
&AlertGroup{
Alerts: []*types.Alert{inputAlerts[5]},
Labels: model.LabelSet{
"alertname": "HighLatency",
"service": "db",
"cluster": "bb",
},
Receiver: "kafka",
GroupKey: "{}/{kafka=\"yes\"}:{alertname=\"HighLatency\", cluster=\"bb\", service=\"db\"}",
RouteID: "{}/{kafka=\"yes\"}/2",
},
&AlertGroup{
Alerts: []*types.Alert{inputAlerts[5]},
Labels: model.LabelSet{
"alertname": "HighLatency",
"service": "db",
"cluster": "bb",
},
Receiver: "prod",
GroupKey: "{}/{env=\"prod\"}:{alertname=\"HighLatency\", cluster=\"bb\", service=\"db\"}",
RouteID: "{}/{env=\"prod\"}/1",
},
}, alertGroups)
require.Equal(t, map[model.Fingerprint][]string{
inputAlerts[0].Fingerprint(): {"prod"},
inputAlerts[1].Fingerprint(): {"testing"},
inputAlerts[2].Fingerprint(): {"prod"},
inputAlerts[3].Fingerprint(): {"prod"},
inputAlerts[4].Fingerprint(): {"prod"},
inputAlerts[5].Fingerprint(): {"kafka", "prod"},
}, receivers)
}
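// TestGroupsWithLimits verifies that the dispatcher stops creating aggregation groups
// once the configured limit is reached and increments the corresponding metric.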
func TestGroupsWithLimits(t *testing.T) {
confData := `receivers:
- name: 'kafka'
- name: 'prod'
- name: 'testing'
route:
group_by: ['alertname']
group_wait: 10ms
group_interval: 10ms
receiver: 'prod'
routes:
- match:
env: 'testing'
receiver: 'testing'
group_by: ['alertname', 'service']
- match:
env: 'prod'
receiver: 'prod'
group_by: ['alertname', 'service', 'cluster']
continue: true
- match:
kafka: 'yes'
receiver: 'kafka'
group_by: ['alertname', 'service', 'cluster']`
conf, err := config.Load(confData)
if err != nil {
t.Fatal(err)
}
logger := promslog.NewNopLogger()
route := NewRoute(conf.Route, nil)
reg := prometheus.NewRegistry()
marker := types.NewMarker(reg)
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, reg)
if err != nil {
t.Fatal(err)
}
defer alerts.Close()
timeout := func(d time.Duration) time.Duration { return time.Duration(0) }
recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*types.Alert)}
lim := limits{groups: 6}
m := NewDispatcherMetrics(true, reg)
dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, testMaintenanceInterval, lim, logger, m)
go dispatcher.Run(time.Now())
defer dispatcher.Stop()
// Create alerts. The dispatcher will automatically create the groups.
inputAlerts := []*types.Alert{
// Matches the parent route.
newAlert(model.LabelSet{"alertname": "OtherAlert", "cluster": "cc", "service": "dd"}),
// Matches the first sub-route.
newAlert(model.LabelSet{"env": "testing", "alertname": "TestingAlert", "service": "api", "instance": "inst1"}),
// Matches the second sub-route.
newAlert(model.LabelSet{"env": "prod", "alertname": "HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst1"}),
newAlert(model.LabelSet{"env": "prod", "alertname": "HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst2"}),
// Matches the second sub-route.
newAlert(model.LabelSet{"env": "prod", "alertname": "HighErrorRate", "cluster": "bb", "service": "api", "instance": "inst1"}),
// Matches the second and third sub-route.
newAlert(model.LabelSet{"env": "prod", "alertname": "HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst3"}),
}
err = alerts.Put(inputAlerts...)
if err != nil {
t.Fatal(err)
}
// Let alerts get processed.
for i := 0; len(recorder.Alerts()) != 7 && i < 10; i++ {
time.Sleep(200 * time.Millisecond)
}
require.Len(t, recorder.Alerts(), 7)
routeFilter := func(*Route) bool { return true }
alertFilter := func(*types.Alert, time.Time) bool { return true }
alertGroups, _, _ := dispatcher.Groups(context.Background(), routeFilter, alertFilter)
require.Len(t, alertGroups, 6)
require.Equal(t, 0.0, testutil.ToFloat64(m.aggrGroupLimitReached))
// Try to store a new alert. This time, we will hit the limit on the number of groups.
err = alerts.Put(newAlert(model.LabelSet{"env": "prod", "alertname": "NewAlert", "cluster": "new-cluster", "service": "db"}))
if err != nil {
t.Fatal(err)
}
// Let the alert get processed.
for i := 0; testutil.ToFloat64(m.aggrGroupLimitReached) == 0 && i < 10; i++ {
time.Sleep(200 * time.Millisecond)
}
require.Equal(t, 1.0, testutil.ToFloat64(m.aggrGroupLimitReached))
// Verify there are still only 6 groups.
alertGroups, _, _ = dispatcher.Groups(context.Background(), routeFilter, alertFilter)
require.Len(t, alertGroups, 6)
}
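// recordStage is a notify.Stage that records every alert it receives, keyed by group key and fingerprint.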
type recordStage struct {
mtx sync.RWMutex
alerts map[string]map[model.Fingerprint]*types.Alert
}
func (r *recordStage) Alerts() []*types.Alert {
r.mtx.RLock()
defer r.mtx.RUnlock()
alerts := make([]*types.Alert, 0)
for k := range r.alerts {
for _, a := range r.alerts[k] {
alerts = append(alerts, a)
}
}
return alerts
}
func (r *recordStage) Exec(ctx context.Context, l *slog.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
r.mtx.Lock()
defer r.mtx.Unlock()
gk, ok := notify.GroupKey(ctx)
if !ok {
panic("GroupKey not present!")
}
if _, ok := r.alerts[gk]; !ok {
r.alerts[gk] = make(map[model.Fingerprint]*types.Alert)
}
for _, a := range alerts {
r.alerts[gk][a.Fingerprint()] = a
}
return ctx, nil, nil
}
var (
// Set the start time in the past to trigger a flush immediately.
t0 = time.Now().Add(-time.Minute)
// Set the end time in the future to avoid deleting the alert.
t1 = t0.Add(2 * time.Minute)
)
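// newAlert returns a firing alert with the given labels, starting at t0 and ending at t1.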
func newAlert(labels model.LabelSet) *types.Alert {
return &types.Alert{
Alert: model.Alert{
Labels: labels,
Annotations: model.LabelSet{"foo": "bar"},
StartsAt: t0,
EndsAt: t1,
GeneratorURL: "http://example.com/prometheus",
},
UpdatedAt: t0,
Timeout: false,
}
}
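// TestDispatcherRace ensures that stopping the dispatcher right after starting it does not race.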
func TestDispatcherRace(t *testing.T) {
logger := promslog.NewNopLogger()
reg := prometheus.NewRegistry()
marker := types.NewMarker(reg)
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, reg)
if err != nil {
t.Fatal(err)
}
defer alerts.Close()
timeout := func(d time.Duration) time.Duration { return time.Duration(0) }
dispatcher := NewDispatcher(alerts, nil, nil, marker, timeout, testMaintenanceInterval, nil, logger, NewDispatcherMetrics(false, reg))
go dispatcher.Run(time.Now())
dispatcher.Stop()
}
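// TestDispatcherRaceOnFirstAlertNotDeliveredWhenGroupWaitIsZero checks that no alert is
// lost when group_wait is zero and many groups are created concurrently.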
func TestDispatcherRaceOnFirstAlertNotDeliveredWhenGroupWaitIsZero(t *testing.T) {
const numAlerts = 5000
logger := promslog.NewNopLogger()
reg := prometheus.NewRegistry()
marker := types.NewMarker(reg)
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, reg)
if err != nil {
t.Fatal(err)
}
defer alerts.Close()
route := &Route{
RouteOpts: RouteOpts{
Receiver: "default",
GroupBy: map[model.LabelName]struct{}{"alertname": {}},
GroupWait: 0,
GroupInterval: 1 * time.Hour, // Should never hit in this test.
RepeatInterval: 1 * time.Hour, // Should never hit in this test.
},
}
timeout := func(d time.Duration) time.Duration { return d }
recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*types.Alert)}
dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, testMaintenanceInterval, nil, logger, NewDispatcherMetrics(false, reg))
go dispatcher.Run(time.Now())
defer dispatcher.Stop()
// Push all alerts.
for i := range numAlerts {
alert := newAlert(model.LabelSet{"alertname": model.LabelValue(fmt.Sprintf("Alert_%d", i))})
require.NoError(t, alerts.Put(alert))
}
// Wait until the alerts have been notified or the waiting timeout expires.
for deadline := time.Now().Add(5 * time.Second); time.Now().Before(deadline); {
if len(recorder.Alerts()) >= numAlerts {
break
}
// Throttle.
time.Sleep(10 * time.Millisecond)
}
// We expect all alerts to be notified immediately, since they all belong to different groups.
require.Len(t, recorder.Alerts(), numAlerts)
}
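// limits caps the number of aggregation groups the dispatcher may create.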
type limits struct {
groups int
}
func (l limits) MaxNumberOfAggregationGroups() int {
return l.groups
}
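// TestDispatcher_DoMaintenance verifies that doMaintenance garbage-collects empty
// aggregation groups and deletes their muted markers.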
func TestDispatcher_DoMaintenance(t *testing.T) {
r := prometheus.NewRegistry()
marker := types.NewMarker(r)
alerts, err := mem.NewAlerts(context.Background(), marker, time.Minute, nil, promslog.NewNopLogger(), r)
if err != nil {
t.Fatal(err)
}
route := &Route{
RouteOpts: RouteOpts{
GroupBy: map[model.LabelName]struct{}{"alertname": {}},
GroupWait: 0,
GroupInterval: 5 * time.Minute, // Should never hit in this test.
},
}
timeout := func(d time.Duration) time.Duration { return d }
recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*types.Alert)}
ctx := context.Background()
dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, testMaintenanceInterval, nil, promslog.NewNopLogger(), NewDispatcherMetrics(false, r))
aggrGroups := make(map[*Route]map[model.Fingerprint]*aggrGroup)
aggrGroups[route] = make(map[model.Fingerprint]*aggrGroup)
// Insert an aggregation group with no alerts.
labels := model.LabelSet{"alertname": "1"}
aggrGroup1 := newAggrGroup(ctx, labels, route, timeout, types.NewMarker(prometheus.NewRegistry()), promslog.NewNopLogger())
aggrGroups[route][aggrGroup1.fingerprint()] = aggrGroup1
dispatcher.aggrGroupsPerRoute = aggrGroups
// Must run, otherwise doMaintenance blocks on aggrGroup1.stop().
go aggrGroup1.run(func(context.Context, ...*types.Alert) bool { return true })
// Insert a marker for the aggregation group's group key.
marker.SetMuted(route.ID(), aggrGroup1.GroupKey(), []string{"weekends"})
mutedBy, isMuted := marker.Muted(route.ID(), aggrGroup1.GroupKey())
require.True(t, isMuted)
require.Equal(t, []string{"weekends"}, mutedBy)
// Run the maintenance; the marker should be removed.
dispatcher.doMaintenance()
mutedBy, isMuted = marker.Muted(route.ID(), aggrGroup1.GroupKey())
require.False(t, isMuted)
require.Empty(t, mutedBy)
}
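// TestDispatcher_DeleteResolvedAlertsFromMarker verifies that a flush deletes the markers
// of resolved alerts only when the notification succeeds and the alert was not modified in the meantime.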
func TestDispatcher_DeleteResolvedAlertsFromMarker(t *testing.T) {
t.Run("successful flush deletes markers for resolved alerts", func(t *testing.T) {
ctx := context.Background()
marker := types.NewMarker(prometheus.NewRegistry())
labels := model.LabelSet{"alertname": "TestAlert"}
route := &Route{
RouteOpts: RouteOpts{
Receiver: "test",
GroupBy: map[model.LabelName]struct{}{"alertname": {}},
GroupWait: 0,
GroupInterval: time.Minute,
RepeatInterval: time.Hour,
},
}
timeout := func(d time.Duration) time.Duration { return d }
logger := promslog.NewNopLogger()
// Create an aggregation group
ag := newAggrGroup(ctx, labels, route, timeout, marker, logger)
// Create test alerts: one active and one resolved
now := time.Now()
activeAlert := &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{
"alertname": "TestAlert",
"instance": "1",
},
StartsAt: now.Add(-time.Hour),
EndsAt: now.Add(time.Hour), // Active alert
},
UpdatedAt: now,
}
resolvedAlert := &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{
"alertname": "TestAlert",
"instance": "2",
},
StartsAt: now.Add(-time.Hour),
EndsAt: now.Add(-time.Minute), // Resolved alert
},
UpdatedAt: now,
}
// Insert alerts into the aggregation group
ag.insert(activeAlert)
ag.insert(resolvedAlert)
// Set markers for both alerts
marker.SetActiveOrSilenced(activeAlert.Fingerprint(), 0, nil, nil)
marker.SetActiveOrSilenced(resolvedAlert.Fingerprint(), 0, nil, nil)
// Verify markers exist before flush
require.True(t, marker.Active(activeAlert.Fingerprint()))
require.True(t, marker.Active(resolvedAlert.Fingerprint()))
// Create a notify function that succeeds
notifyFunc := func(alerts ...*types.Alert) bool {
return true
}
// Flush the alerts
ag.flush(notifyFunc)
// Verify that the resolved alert's marker was deleted
require.True(t, marker.Active(activeAlert.Fingerprint()), "active alert marker should still exist")
require.False(t, marker.Active(resolvedAlert.Fingerprint()), "resolved alert marker should be deleted")
})
t.Run("failed flush does not delete markers", func(t *testing.T) {
ctx := context.Background()
marker := types.NewMarker(prometheus.NewRegistry())
labels := model.LabelSet{"alertname": "TestAlert"}
route := &Route{
RouteOpts: RouteOpts{
Receiver: "test",
GroupBy: map[model.LabelName]struct{}{"alertname": {}},
GroupWait: 0,
GroupInterval: time.Minute,
RepeatInterval: time.Hour,
},
}
timeout := func(d time.Duration) time.Duration { return d }
logger := promslog.NewNopLogger()
// Create an aggregation group
ag := newAggrGroup(ctx, labels, route, timeout, marker, logger)
// Create a resolved alert
now := time.Now()
resolvedAlert := &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{
"alertname": "TestAlert",
"instance": "1",
},
StartsAt: now.Add(-time.Hour),
EndsAt: now.Add(-time.Minute), // Resolved alert
},
UpdatedAt: now,
}
// Insert alert into the aggregation group
ag.insert(resolvedAlert)
// Set marker for the alert
marker.SetActiveOrSilenced(resolvedAlert.Fingerprint(), 0, nil, nil)
// Verify marker exists before flush
require.True(t, marker.Active(resolvedAlert.Fingerprint()))
// Create a notify function that fails
notifyFunc := func(alerts ...*types.Alert) bool {
return false
}
// Flush the alerts (notify will fail)
ag.flush(notifyFunc)
// Verify that the marker was NOT deleted due to failed notification
require.True(t, marker.Active(resolvedAlert.Fingerprint()), "marker should not be deleted when notify fails")
})
t.Run("markers not deleted when alert is modified during flush", func(t *testing.T) {
ctx := context.Background()
marker := types.NewMarker(prometheus.NewRegistry())
labels := model.LabelSet{"alertname": "TestAlert"}
route := &Route{
RouteOpts: RouteOpts{
Receiver: "test",
GroupBy: map[model.LabelName]struct{}{"alertname": {}},
GroupWait: 0,
GroupInterval: time.Minute,
RepeatInterval: time.Hour,
},
}
timeout := func(d time.Duration) time.Duration { return d }
logger := promslog.NewNopLogger()
// Create an aggregation group
ag := newAggrGroup(ctx, labels, route, timeout, marker, logger)
// Create a resolved alert
now := time.Now()
resolvedAlert := &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{
"alertname": "TestAlert",
"instance": "1",
},
StartsAt: now.Add(-time.Hour),
EndsAt: now.Add(-time.Minute), // Resolved alert
},
UpdatedAt: now,
}
// Insert alert into the aggregation group
ag.insert(resolvedAlert)
// Set marker for the alert
marker.SetActiveOrSilenced(resolvedAlert.Fingerprint(), 0, nil, nil)
// Verify marker exists before flush
require.True(t, marker.Active(resolvedAlert.Fingerprint()))
// Create a notify function that modifies the alert before returning
notifyFunc := func(alerts ...*types.Alert) bool {
// Simulate the alert being modified (e.g., firing again) during flush
modifiedAlert := &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{
"alertname": "TestAlert",
"instance": "1",
},
StartsAt: now.Add(-time.Hour),
EndsAt: now.Add(time.Hour), // Active again
},
UpdatedAt: now.Add(time.Second), // More recent update
}
// Update the alert in the store
ag.alerts.Set(modifiedAlert)
return true
}
// Flush the alerts
ag.flush(notifyFunc)
// Verify that the marker was NOT deleted because the alert was modified
// during the flush (DeleteIfNotModified should have failed)
require.True(t, marker.Active(resolvedAlert.Fingerprint()), "marker should not be deleted when alert is modified during flush")
})
}
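// TestDispatchOnStartup verifies that when the dispatcher is started with a future start
// time, alerts are first flushed after that start time plus group_wait, and later alerts
// after group_interval.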
func TestDispatchOnStartup(t *testing.T) {
logger := promslog.NewNopLogger()
reg := prometheus.NewRegistry()
marker := types.NewMarker(reg)
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, reg)
if err != nil {
t.Fatal(err)
}
defer alerts.Close()
// Set up a route with GroupBy to separate alerts into different aggregation groups.
route := &Route{
RouteOpts: RouteOpts{
Receiver: "default",
GroupBy: map[model.LabelName]struct{}{"instance": {}},
GroupWait: 1 * time.Second,
GroupInterval: 3 * time.Minute,
RepeatInterval: 1 * time.Hour,
},
}
recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*types.Alert)}
timeout := func(d time.Duration) time.Duration { return d }
// Set the start time 2 seconds in the future.
now := time.Now()
startDelay := 2 * time.Second
startTime := time.Now().Add(startDelay)
dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, testMaintenanceInterval, nil, logger, NewDispatcherMetrics(false, reg))
go dispatcher.Run(startTime)
defer dispatcher.Stop()
// Create 2 similar alerts with start times in the past
alert1 := &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{"alertname": "TestAlert1", "instance": "1"},
Annotations: model.LabelSet{"foo": "bar"},
StartsAt: now.Add(-1 * time.Hour),
EndsAt: now.Add(time.Hour),
GeneratorURL: "http://example.com/prometheus",
},
UpdatedAt: now,
Timeout: false,
}
alert2 := &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{"alertname": "TestAlert2", "instance": "2"},
Annotations: model.LabelSet{"foo": "bar"},
StartsAt: now.Add(-1 * time.Hour),
EndsAt: now.Add(time.Hour),
GeneratorURL: "http://example.com/prometheus",
},
UpdatedAt: now,
Timeout: false,
}
// Send alert1
require.NoError(t, alerts.Put(alert1))
var recordedAlerts []*types.Alert
// Expect a recorded alert after startTime + GroupWait, which is in the future.
require.Eventually(t, func() bool {
recordedAlerts = recorder.Alerts()
return len(recordedAlerts) == 1
}, startDelay+route.RouteOpts.GroupWait, 500*time.Millisecond)
require.Equal(t, alert1.Fingerprint(), recordedAlerts[0].Fingerprint(), "expected alert1 to be dispatched after GroupWait")
// Send alert2
require.NoError(t, alerts.Put(alert2))
// Expect a recorded alert after GroupInterval
require.Eventually(t, func() bool {
recordedAlerts = recorder.Alerts()
return len(recordedAlerts) == 2
}, route.RouteOpts.GroupInterval, 100*time.Millisecond)
// Sort alerts by fingerprint for deterministic ordering
sort.Slice(recordedAlerts, func(i, j int) bool {
return recordedAlerts[i].Fingerprint() < recordedAlerts[j].Fingerprint()
})
require.Equal(t, alert2.Fingerprint(), recordedAlerts[1].Fingerprint(), "expected alert2 to be dispatched after GroupInterval")
// Verify both alerts are present
fingerprints := make(map[model.Fingerprint]bool)
for _, a := range recordedAlerts {
fingerprints[a.Fingerprint()] = true
}
require.True(t, fingerprints[alert1.Fingerprint()], "expected alert1 to be present")
require.True(t, fingerprints[alert2.Fingerprint()], "expected alert2 to be present")
}