1
0
mirror of https://github.com/prometheus/alertmanager.git synced 2026-02-05 15:45:34 +01:00
Files
alertmanager/silence/silence_bench_test.go
Siavash Safi 18939cee8f feat: add distributed tracing support (#4745)
Add tracing support using otel to the the following components:
- api: extract trace and span IDs from request context
- provider: mem put
- dispatch: split logic and use better naming
- inhibit: source and target traces, mutes, etc. drop metrics
- silence: query, expire, mutes
- notify: add distributed tracing support to stages and all http requests

Note: inhibitor metrics are dropped since we have tracing now and they
are not needed. We have not released any version with these metrics so
we can drop them safely, this is not a breaking change.

This change borrows part of the implementation from #3673
Fixes #3670

Signed-off-by: Dave Henderson <dhenderson@gmail.com>
Signed-off-by: Siavash Safi <siavash@cloudflare.com>
Co-authored-by: Dave Henderson <dhenderson@gmail.com>
2025-12-05 22:58:44 +01:00

656 lines
18 KiB
Go

// Copyright 2024 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package silence
import (
"context"
"fmt"
"math/rand"
"strconv"
"sync"
"testing"
"time"
"github.com/coder/quartz"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/model"
"github.com/prometheus/common/promslog"
"github.com/stretchr/testify/require"
"github.com/prometheus/alertmanager/silence/silencepb"
"github.com/prometheus/alertmanager/types"
)
// BenchmarkMutes benchmarks the Mutes method for the Muter interface for
// different numbers of silences with varying match ratios.
func BenchmarkMutes(b *testing.B) {
b.Run("0 total, 0 matching", func(b *testing.B) {
benchmarkMutes(b, 0, 0)
})
b.Run("1 total, 1 matching", func(b *testing.B) {
benchmarkMutes(b, 1, 1)
})
b.Run("100 total, 10 matching", func(b *testing.B) {
benchmarkMutes(b, 100, 10)
})
b.Run("1000 total, 1 matching", func(b *testing.B) {
benchmarkMutes(b, 1000, 1)
})
b.Run("1000 total, 10 matching", func(b *testing.B) {
benchmarkMutes(b, 1000, 10)
})
b.Run("1000 total, 100 matching", func(b *testing.B) {
benchmarkMutes(b, 1000, 100)
})
b.Run("10000 total, 0 matching", func(b *testing.B) {
benchmarkMutes(b, 10000, 10)
})
b.Run("10000 total, 10 matching", func(b *testing.B) {
benchmarkMutes(b, 10000, 10)
})
b.Run("10000 total, 1000 matching", func(b *testing.B) {
benchmarkMutes(b, 10000, 1000)
})
}
func benchmarkMutes(b *testing.B, totalSilences, matchingSilences int) {
require.LessOrEqual(b, matchingSilences, totalSilences)
silences, err := New(Options{Metrics: prometheus.NewRegistry()})
require.NoError(b, err)
clock := quartz.NewMock(b).WithLogger(quartz.NoOpLogger)
silences.clock = clock
now := clock.Now()
// Calculate interval to intersperse matching silences
var interval int
if matchingSilences > 0 {
interval = totalSilences / matchingSilences
}
// Create silences with matching ones interspersed throughout
matchingCreated := 0
for i := range totalSilences {
var s *silencepb.Silence
// Create matching silences at calculated intervals, but make sure there are always enough
if matchingCreated < matchingSilences && (i%interval == 0 || i == totalSilences-matchingSilences+matchingCreated) {
// Create a matching silence
s = &silencepb.Silence{
Matchers: []*silencepb.Matcher{{
Type: silencepb.Matcher_EQUAL,
Name: "foo",
Pattern: "bar",
}},
StartsAt: now,
EndsAt: now.Add(time.Minute),
}
matchingCreated++
} else {
// Create a non-matching silence
s = &silencepb.Silence{
Matchers: []*silencepb.Matcher{{
Type: silencepb.Matcher_EQUAL,
Name: "job",
Pattern: "job" + strconv.Itoa(i),
}},
StartsAt: now,
EndsAt: now.Add(time.Minute),
}
}
require.NoError(b, silences.Set(b.Context(), s))
}
m := types.NewMarker(prometheus.NewRegistry())
s := NewSilencer(silences, m, promslog.NewNopLogger())
for b.Loop() {
s.Mutes(context.Background(), model.LabelSet{"foo": "bar"})
}
b.StopTimer()
// The alert should be marked as silenced for each matching silence.
activeIDs, pendingIDs, _, silenced := m.Silenced(model.LabelSet{"foo": "bar"}.Fingerprint())
require.True(b, silenced || matchingSilences == 0)
require.Empty(b, pendingIDs)
require.Len(b, activeIDs, matchingSilences)
}
// BenchmarkMutesIncremental tests the incremental query optimization when a small
// number of silences are added to a large existing set. This measures the real-world
// scenario that the QSince optimization is designed for.
func BenchmarkMutesIncremental(b *testing.B) {
cases := []struct {
name string
baseSize int
}{
{"1000 base silences", 1000},
{"3000 base silences", 3000},
{"7000 base silences", 7000},
{"10000 base silences", 10000},
}
for _, tc := range cases {
b.Run(tc.name, func(b *testing.B) {
silences, err := New(Options{Metrics: prometheus.NewRegistry()})
require.NoError(b, err)
clock := quartz.NewMock(b).WithLogger(quartz.NoOpLogger)
silences.clock = clock
now := clock.Now()
// Create base set of silences - most don't match, some do
// This simulates a realistic production scenario
// Intersperse matching silences throughout the base set
for i := 0; i < tc.baseSize; i++ {
var s *silencepb.Silence
if i%2000 == 0 && i > 0 {
// Sprinkle 1 silence matching every 2000
s = &silencepb.Silence{
Matchers: []*silencepb.Matcher{
{
Type: silencepb.Matcher_EQUAL,
Name: "service",
Pattern: "test",
},
{
Type: silencepb.Matcher_EQUAL,
Name: "instance",
Pattern: "instance1",
},
},
StartsAt: now,
EndsAt: now.Add(time.Hour),
}
} else {
s = &silencepb.Silence{
Matchers: []*silencepb.Matcher{{
Type: silencepb.Matcher_EQUAL,
Name: "job",
Pattern: "job" + strconv.Itoa(i),
}},
StartsAt: now,
EndsAt: now.Add(time.Hour),
}
}
require.NoError(b, silences.Set(b.Context(), s))
}
marker := types.NewMarker(prometheus.NewRegistry())
silencer := NewSilencer(silences, marker, promslog.NewNopLogger())
// Warm up: Establish marker state (markerVersion = current version)
// This simulates a system that has been running for a while
lset := model.LabelSet{"service": "test", "instance": "instance1"}
silencer.Mutes(context.Background(), lset)
// Benchmark: Measure Mutes() performance with incremental additions
// Every other iteration adds 1 new silence, all iterations call Mutes()
// This simulates realistic traffic with a mix of incremental and cached queries
// With QSince optimization, this should only scan new silences when added
b.ResetTimer()
iteration := 0
for b.Loop() {
// Don't measure the Set() time, only Mutes()
b.StopTimer()
// Add one new silence every other iteration to simulate realistic traffic
// where Mutes() is sometimes called without new silences
if iteration%2 == 0 {
var s *silencepb.Silence
if iteration%20 == 0 && iteration > 0 {
// Only 1 in 20 silences matches the labelset
s = &silencepb.Silence{
Matchers: []*silencepb.Matcher{
{
Type: silencepb.Matcher_EQUAL,
Name: "service",
Pattern: "test",
},
{
Type: silencepb.Matcher_EQUAL,
Name: "instance",
Pattern: "instance1",
},
},
StartsAt: now,
EndsAt: now.Add(time.Hour),
}
} else {
// Most don't match
s = &silencepb.Silence{
Matchers: []*silencepb.Matcher{{
Type: silencepb.Matcher_EQUAL,
Name: "instance",
Pattern: "host" + strconv.Itoa(iteration),
}},
StartsAt: now,
EndsAt: now.Add(time.Hour),
}
}
require.NoError(b, silences.Set(b.Context(), s))
}
b.StartTimer()
// Now query - should use incremental path or cached paths
silencer.Mutes(context.Background(), lset)
iteration++
}
})
}
}
// BenchmarkQuery benchmarks the Query method for the Silences struct
// for different numbers of silences. Not all silences match the query
// to prevent compiler and runtime optimizations from affecting the benchmarks.
func BenchmarkQuery(b *testing.B) {
b.Run("100 silences", func(b *testing.B) {
benchmarkQuery(b, 100)
})
b.Run("1000 silences", func(b *testing.B) {
benchmarkQuery(b, 1000)
})
b.Run("10000 silences", func(b *testing.B) {
benchmarkQuery(b, 10000)
})
}
func benchmarkQuery(b *testing.B, numSilences int) {
s, err := New(Options{Metrics: prometheus.NewRegistry()})
require.NoError(b, err)
clock := quartz.NewMock(b).WithLogger(quartz.NoOpLogger)
s.clock = clock
now := clock.Now()
lset := model.LabelSet{"aaaa": "AAAA", "bbbb": "BBBB", "cccc": "CCCC"}
// Create silences using Set() to properly populate indices
for i := range numSilences {
id := strconv.Itoa(i)
// Include an offset to avoid optimizations.
patA := "A{4}|" + id
patB := id // Does not match.
if i%10 == 0 {
// Every 10th time, have an actually matching pattern.
patB = "B(B|C)B.|" + id
}
sil := &silencepb.Silence{
Matchers: []*silencepb.Matcher{
{Type: silencepb.Matcher_REGEXP, Name: "aaaa", Pattern: patA},
{Type: silencepb.Matcher_REGEXP, Name: "bbbb", Pattern: patB},
},
StartsAt: now.Add(-time.Minute),
EndsAt: now.Add(time.Hour),
UpdatedAt: now.Add(-time.Hour),
}
require.NoError(b, s.Set(b.Context(), sil))
}
// Run things once to populate the matcherCache.
sils, _, err := s.Query(
b.Context(),
QState(types.SilenceStateActive),
QMatches(lset),
)
require.NoError(b, err)
require.Len(b, sils, numSilences/10)
for b.Loop() {
sils, _, err := s.Query(
b.Context(),
QState(types.SilenceStateActive),
QMatches(lset),
)
require.NoError(b, err)
require.Len(b, sils, numSilences/10)
}
}
// BenchmarkQueryParallel benchmarks concurrent queries to demonstrate
// the performance improvement from using read locks (RLock) instead of
// write locks (Lock). With the pre-compiled matcher cache, multiple
// queries can now execute in parallel.
func BenchmarkQueryParallel(b *testing.B) {
b.Run("100 silences", func(b *testing.B) {
benchmarkQueryParallel(b, 100)
})
b.Run("1000 silences", func(b *testing.B) {
benchmarkQueryParallel(b, 1000)
})
b.Run("10000 silences", func(b *testing.B) {
benchmarkQueryParallel(b, 10000)
})
}
func benchmarkQueryParallel(b *testing.B, numSilences int) {
s, err := New(Options{Metrics: prometheus.NewRegistry()})
require.NoError(b, err)
clock := quartz.NewMock(b).WithLogger(quartz.NoOpLogger)
s.clock = clock
now := clock.Now()
lset := model.LabelSet{"aaaa": "AAAA", "bbbb": "BBBB", "cccc": "CCCC"}
// Create silences with pre-compiled matchers
for i := range numSilences {
id := strconv.Itoa(i)
patA := "A{4}|" + id
patB := id
if i%10 == 0 {
patB = "B(B|C)B.|" + id
}
sil := &silencepb.Silence{
Matchers: []*silencepb.Matcher{
{Type: silencepb.Matcher_REGEXP, Name: "aaaa", Pattern: patA},
{Type: silencepb.Matcher_REGEXP, Name: "bbbb", Pattern: patB},
},
StartsAt: now.Add(-time.Minute),
EndsAt: now.Add(time.Hour),
UpdatedAt: now.Add(-time.Hour),
}
require.NoError(b, s.Set(b.Context(), sil))
}
// Verify initial query works
sils, _, err := s.Query(
b.Context(),
QState(types.SilenceStateActive),
QMatches(lset),
)
require.NoError(b, err)
require.Len(b, sils, numSilences/10)
b.ResetTimer()
// Run queries in parallel across multiple goroutines
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
sils, _, err := s.Query(
b.Context(),
QState(types.SilenceStateActive),
QMatches(lset),
)
if err != nil {
b.Error(err)
}
if len(sils) != numSilences/10 {
b.Errorf("expected %d silences, got %d", numSilences/10, len(sils))
}
}
})
}
// BenchmarkQueryWithConcurrentAdds benchmarks the behavior when queries
// are running concurrently with silence additions. This demonstrates how
// the system handles read-heavy workloads with occasional writes.
func BenchmarkQueryWithConcurrentAdds(b *testing.B) {
b.Run("1000 initial silences, 10% add rate", func(b *testing.B) {
benchmarkQueryWithConcurrentAdds(b, 1000, 0.1)
})
b.Run("1000 initial silences, 1% add rate", func(b *testing.B) {
benchmarkQueryWithConcurrentAdds(b, 1000, 0.01)
})
b.Run("1000 initial silences, 0.1% add rate", func(b *testing.B) {
benchmarkQueryWithConcurrentAdds(b, 1000, 0.001)
})
b.Run("10000 initial silences, 1% add rate", func(b *testing.B) {
benchmarkQueryWithConcurrentAdds(b, 10000, 0.01)
})
b.Run("10000 initial silences, 0.1% add rate", func(b *testing.B) {
benchmarkQueryWithConcurrentAdds(b, 10000, 0.001)
})
}
func benchmarkQueryWithConcurrentAdds(b *testing.B, initialSilences int, addRatio float64) {
s, err := New(Options{Metrics: prometheus.NewRegistry()})
require.NoError(b, err)
clock := quartz.NewMock(b).WithLogger(quartz.NoOpLogger)
s.clock = clock
now := clock.Now()
lset := model.LabelSet{"aaaa": "AAAA", "bbbb": "BBBB", "cccc": "CCCC"}
// Create initial silences
for i := range initialSilences {
id := strconv.Itoa(i)
patA := "A{4}|" + id
patB := id
if i%10 == 0 {
patB = "B(B|C)B.|" + id
}
sil := &silencepb.Silence{
Matchers: []*silencepb.Matcher{
{Type: silencepb.Matcher_REGEXP, Name: "aaaa", Pattern: patA},
{Type: silencepb.Matcher_REGEXP, Name: "bbbb", Pattern: patB},
},
StartsAt: now.Add(-time.Minute),
EndsAt: now.Add(time.Hour),
UpdatedAt: now.Add(-time.Hour),
}
require.NoError(b, s.Set(b.Context(), sil))
}
var addCounter int
var mu sync.Mutex
b.ResetTimer()
// Run parallel operations
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
// Determine if this iteration should add a silence
mu.Lock()
shouldAdd := float64(addCounter) < float64(b.N)*addRatio
if shouldAdd {
addCounter++
}
localCounter := addCounter + initialSilences
mu.Unlock()
if shouldAdd {
// Add a new silence
id := strconv.Itoa(localCounter)
patA := "A{4}|" + id
patB := "B(B|C)B.|" + id
sil := &silencepb.Silence{
Matchers: []*silencepb.Matcher{
{Type: silencepb.Matcher_REGEXP, Name: "aaaa", Pattern: patA},
{Type: silencepb.Matcher_REGEXP, Name: "bbbb", Pattern: patB},
},
StartsAt: now.Add(-time.Minute),
EndsAt: now.Add(time.Hour),
UpdatedAt: now.Add(-time.Hour),
}
if err := s.Set(b.Context(), sil); err != nil {
b.Error(err)
}
} else {
// Query silences (the common operation)
_, _, err := s.Query(
b.Context(),
QState(types.SilenceStateActive),
QMatches(lset),
)
if err != nil {
b.Error(err)
}
}
}
})
}
// BenchmarkMutesParallel benchmarks concurrent Mutes calls to demonstrate
// the performance improvement from parallel query execution.
func BenchmarkMutesParallel(b *testing.B) {
b.Run("100 silences", func(b *testing.B) {
benchmarkMutesParallel(b, 100)
})
b.Run("1000 silences", func(b *testing.B) {
benchmarkMutesParallel(b, 1000)
})
b.Run("10000 silences", func(b *testing.B) {
benchmarkMutesParallel(b, 10000)
})
}
func benchmarkMutesParallel(b *testing.B, numSilences int) {
silences, err := New(Options{Metrics: prometheus.NewRegistry()})
require.NoError(b, err)
clock := quartz.NewMock(b).WithLogger(quartz.NoOpLogger)
silences.clock = clock
now := clock.Now()
// Create silences that will match the alert
for range numSilences {
s := &silencepb.Silence{
Matchers: []*silencepb.Matcher{{
Type: silencepb.Matcher_EQUAL,
Name: "foo",
Pattern: "bar",
}},
StartsAt: now,
EndsAt: now.Add(time.Minute),
}
require.NoError(b, silences.Set(b.Context(), s))
}
m := types.NewMarker(prometheus.NewRegistry())
silencer := NewSilencer(silences, m, promslog.NewNopLogger())
b.ResetTimer()
// Run Mutes in parallel
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
silencer.Mutes(b.Context(), model.LabelSet{"foo": "bar"})
}
})
}
// BenchmarkGC benchmarks the garbage collection performance for different
// numbers of silences and different ratios of expired silences.
func BenchmarkGC(b *testing.B) {
b.Run("1000 silences, 0% expired", func(b *testing.B) {
benchmarkGC(b, 1000, 0.0)
})
b.Run("1000 silences, 30% expired", func(b *testing.B) {
benchmarkGC(b, 1000, 0.3)
})
b.Run("1000 silences, 80% expired", func(b *testing.B) {
benchmarkGC(b, 1000, 0.8)
})
b.Run("10000 silences, 0% expired", func(b *testing.B) {
benchmarkGC(b, 10000, 0.0)
})
b.Run("10000 silences, 10% expired", func(b *testing.B) {
benchmarkGC(b, 10000, 0.1)
})
b.Run("10000 silences, 50% expired", func(b *testing.B) {
benchmarkGC(b, 10000, 0.5)
})
b.Run("10000 silences, 80% expired", func(b *testing.B) {
benchmarkGC(b, 10000, 0.8)
})
}
func benchmarkGC(b *testing.B, numSilences int, expiredRatio float64) {
b.ReportAllocs()
clock := quartz.NewMock(b).WithLogger(quartz.NoOpLogger)
now := clock.Now()
numExpired := int(float64(numSilences) * expiredRatio)
numActive := numSilences - numExpired
matchers := []*silencepb.Matcher{{
Type: silencepb.Matcher_EQUAL,
Name: "foo",
Pattern: "bar",
}}
startTime := now.Add(-2 * time.Hour)
updateTime := now.Add(-2 * time.Hour)
endTime := now.Add(-time.Hour)
expireTime := now.Add(-time.Minute)
activeTime := now.Add(2 * time.Hour)
sils := make([]*silencepb.MeshSilence, 0, numSilences)
for _, j := range rand.Perm(numSilences) {
if j < numExpired {
sil := &silencepb.MeshSilence{
Silence: &silencepb.Silence{
Id: fmt.Sprintf("expired-%d", j),
Matchers: matchers,
StartsAt: startTime,
EndsAt: endTime,
UpdatedAt: updateTime,
},
ExpiresAt: expireTime,
}
sils = append(sils, sil)
} else {
sil := &silencepb.MeshSilence{
Silence: &silencepb.Silence{
Id: fmt.Sprintf("active-%d", j),
Matchers: matchers,
StartsAt: startTime,
EndsAt: endTime,
UpdatedAt: updateTime,
},
ExpiresAt: activeTime,
}
sils = append(sils, sil)
}
}
b.ResetTimer()
for b.Loop() {
b.StopTimer()
s, err := New(Options{
Metrics: prometheus.NewRegistry(),
})
require.NoError(b, err)
s.clock = clock
for _, sil := range sils {
s.st[sil.Silence.Id] = sil
s.indexSilence(sil.Silence)
}
b.StartTimer()
n1, err := s.GC()
require.NoError(b, err)
n2, err := s.GC()
require.NoError(b, err)
b.StopTimer()
require.NoError(b, err)
require.Equal(b, numExpired, n1)
require.Equal(b, 0, n2)
require.Len(b, s.st, numActive)
require.Len(b, s.mi, numActive)
b.StartTimer()
}
}