1
0
mirror of https://github.com/prometheus/alertmanager.git synced 2026-02-05 15:45:34 +01:00

feat(inhibit): add inhibition metrics (#4629)

Add metrics for inhibitor:
- alertmanager_inhibitor_source_alerts_cache_items
- alertmanager_inhibitor_source_alerts_index_items
- alertmanager_inhibitor_mutes_duration_seconds

Add metrics for inhibition rules:
- alertmanager_inhibit_rule_source_alerts_cache_items
- alertmanager_inhibit_rule_source_alerts_index_items
- alertmanager_inhibit_rule_matches_duration_seconds
- alertmanager_inhibit_rule_mutes_duration_seconds

Other changes:
- Add debug logs for duplicate inhibition rule names
- Add Len() method to store.Alerts struct
- Add Len() method to inhibit.index struct
- update docs

Signed-off-by: Siavash Safi <siavash@cloudflare.com>
This commit is contained in:
Siavash Safi
2025-10-25 13:25:27 +02:00
committed by GitHub
parent 1f2df03c44
commit f2fbb318cf
10 changed files with 713 additions and 24 deletions

View File

@@ -104,7 +104,9 @@ var (
prometheus.GaugeOpts{
Name: "alertmanager_inhibition_rules",
Help: "Number of configured inhibition rules.",
})
},
)
promslogConfig = promslog.Config{}
)
@@ -408,6 +410,7 @@ func run() int {
)
dispMetrics := dispatch.NewDispatcherMetrics(false, prometheus.DefaultRegisterer)
inhibitMetrics := inhibit.NewInhibitorMetrics(prometheus.DefaultRegisterer)
pipelineBuilder := notify.NewPipelineBuilder(prometheus.DefaultRegisterer, ff)
configLogger := logger.With("component", "configuration")
configCoordinator := config.NewCoordinator(
@@ -462,7 +465,7 @@ func run() int {
inhibitor.Stop()
disp.Stop()
inhibitor = inhibit.NewInhibitor(alerts, conf.InhibitRules, marker, logger)
inhibitor = inhibit.NewInhibitor(alerts, conf.InhibitRules, marker, logger, inhibitMetrics)
silencer := silence.NewSilencer(silences, marker, logger)
// An interface value that holds a nil concrete value is non-nil.

View File

@@ -442,6 +442,7 @@ to reason about and does not trigger this special case.
```yaml
# Optional name of the inhibition rule.
# Duplicate names are allowed, but rules that share a name also share the same per-rule metric series.
name: <string>
# DEPRECATED: Use target_matchers below.

2
go.mod
View File

@@ -35,6 +35,7 @@ require (
github.com/oklog/run v1.2.0
github.com/oklog/ulid v1.3.1
github.com/prometheus/client_golang v1.23.2
github.com/prometheus/client_model v0.6.2
github.com/prometheus/common v0.67.1
github.com/prometheus/exporter-toolkit v0.14.1
github.com/prometheus/sigv4 v0.2.1
@@ -104,7 +105,6 @@ require (
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/procfs v0.16.1 // indirect
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 // indirect
github.com/xhit/go-str2duration/v2 v2.1.0 // indirect

View File

@@ -55,3 +55,10 @@ func (c *index) Delete(key model.Fingerprint) {
delete(c.items, key)
}
// Len returns the number of items currently held by the index. The count is
// read while holding the index's read lock.
func (c *index) Len() int {
	c.mtx.RLock()
	n := len(c.items)
	c.mtx.RUnlock()

	return n
}

View File

@@ -20,6 +20,7 @@ import (
"time"
"github.com/oklog/run"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/model"
"github.com/prometheus/alertmanager/config"
@@ -33,26 +34,39 @@ import (
// currently active alerts and a set of inhibition rules. It implements the
// Muter interface.
type Inhibitor struct {
alerts provider.Alerts
rules []*InhibitRule
marker types.AlertMarker
logger *slog.Logger
alerts provider.Alerts
rules []*InhibitRule
marker types.AlertMarker
logger *slog.Logger
metrics *InhibitorMetrics
mtx sync.RWMutex
cancel func()
}
// NewInhibitor returns a new Inhibitor.
func NewInhibitor(ap provider.Alerts, rs []config.InhibitRule, mk types.AlertMarker, logger *slog.Logger) *Inhibitor {
func NewInhibitor(ap provider.Alerts, rs []config.InhibitRule, mk types.AlertMarker, logger *slog.Logger, metrics *InhibitorMetrics) *Inhibitor {
ih := &Inhibitor{
alerts: ap,
marker: mk,
logger: logger,
alerts: ap,
marker: mk,
logger: logger,
metrics: metrics,
}
for _, cr := range rs {
r := NewInhibitRule(cr)
ruleNames := make(map[string]struct{})
for i, cr := range rs {
if _, ok := ruleNames[cr.Name]; ok {
ih.logger.Debug("duplicate inhibition rule name", "index", i, "name", cr.Name)
}
r := NewInhibitRule(cr, NewRuleMetrics(cr.Name, metrics))
ih.rules = append(ih.rules, r)
if cr.Name != "" {
ruleNames[cr.Name] = struct{}{}
}
}
return ih
}
@@ -70,16 +84,30 @@ func (ih *Inhibitor) run(ctx context.Context) {
continue
}
// Update the inhibition rules' cache.
cachedSum := 0
indexedSum := 0
for _, r := range ih.rules {
if r.SourceMatchers.Matches(a.Labels) {
if err := r.scache.Set(a); err != nil {
ih.logger.Error("error on set alert", "err", err)
continue
}
r.updateIndex(a)
cached := r.scache.Len()
indexed := r.sindex.Len()
if r.Name != "" {
r.metrics.sourceAlertsCacheItems.With(prometheus.Labels{"rule": r.Name}).Set(float64(cached))
r.metrics.sourceAlertsIndexItems.With(prometheus.Labels{"rule": r.Name}).Set(float64(indexed))
}
cachedSum += cached
indexedSum += indexed
}
}
ih.metrics.sourceAlertsCacheItems.Set(float64(cachedSum))
ih.metrics.sourceAlertsIndexItems.Set(float64(indexedSum))
}
}
}
@@ -128,21 +156,29 @@ func (ih *Inhibitor) Stop() {
// Mutes reports whether the given label set is inhibited by any rule. It
// implements the Muter interface.
//
// Rules are evaluated in order and evaluation stops at the first rule that
// inhibits lset. The result is recorded on the marker via SetInhibited, and
// the time spent is observed in the per-rule matches/mutes summaries and in the
// inhibitor-wide mutes summary.
func (ih *Inhibitor) Mutes(lset model.LabelSet) bool {
	began := time.Now()
	fp := lset.Fingerprint()

	for _, rule := range ih.rules {
		ruleBegan := time.Now()

		targetMatched := rule.TargetMatchers.Matches(lset)
		matchedLabel := "false"
		if targetMatched {
			matchedLabel = "true"
		}
		rule.metrics.matchesDuration.
			With(prometheus.Labels{"rule": rule.Name, "matched": matchedLabel}).
			Observe(time.Since(ruleBegan).Seconds())

		// A rule whose target side does not match cannot inhibit this label set.
		if !targetMatched {
			continue
		}

		// The target side matches. If the source side matches as well, hasEqual
		// needs to know so it can exclude inhibiting alerts for which the same
		// is true.
		sourceMatched := rule.SourceMatchers.Matches(lset)
		inhibitedByFP, eq := rule.hasEqual(lset, sourceMatched)
		if !eq {
			rule.metrics.mutesDuration.
				With(prometheus.Labels{"rule": rule.Name, "muted": "false"}).
				Observe(time.Since(ruleBegan).Seconds())
			continue
		}

		ih.marker.SetInhibited(fp, inhibitedByFP.String())
		ih.metrics.mutesDuration.
			With(prometheus.Labels{"muted": "true"}).
			Observe(time.Since(began).Seconds())
		rule.metrics.mutesDuration.
			With(prometheus.Labels{"rule": rule.Name, "muted": "true"}).
			Observe(time.Since(ruleBegan).Seconds())

		return true
	}

	// No rule inhibits the label set: call SetInhibited without any inhibitor.
	ih.marker.SetInhibited(fp)
	ih.metrics.mutesDuration.
		With(prometheus.Labels{"muted": "false"}).
		Observe(time.Since(began).Seconds())

	return false
}
@@ -173,14 +209,17 @@ type InhibitRule struct {
// The index items might overwrite each other if multiple source alerts have exactly equal labels.
// Overwrites only happen if the new source alert has a bigger EndsAt value.
sindex *index
metrics *RuleMetrics
}
// NewInhibitRule returns a new InhibitRule based on a configuration definition.
func NewInhibitRule(cr config.InhibitRule) *InhibitRule {
func NewInhibitRule(cr config.InhibitRule, metrics *RuleMetrics) *InhibitRule {
var (
sourcem labels.Matchers
targetm labels.Matchers
)
// cr.SourceMatch will be deprecated. This for loop appends regex matchers.
for ln, lv := range cr.SourceMatch {
matcher, err := labels.NewMatcher(labels.MatchEqual, ln, lv)
@@ -235,6 +274,7 @@ func NewInhibitRule(cr config.InhibitRule) *InhibitRule {
Equal: equal,
scache: store.NewAlerts(),
sindex: newIndex(),
metrics: metrics,
}
rule.scache.SetGCCallback(rule.gcCallback)
@@ -310,6 +350,10 @@ func (r *InhibitRule) gcCallback(alerts []types.Alert) {
fp := r.fingerprintEquals(a.Labels)
r.sindex.Delete(fp)
}
if r.Name != "" {
r.metrics.sourceAlertsCacheItems.With(prometheus.Labels{"rule": r.Name}).Set(float64(r.scache.Len()))
r.metrics.sourceAlertsIndexItems.With(prometheus.Labels{"rule": r.Name}).Set(float64(r.sindex.Len()))
}
}
// hasEqual checks whether the source cache contains alerts matching the equal

View File

@@ -198,7 +198,7 @@ func benchmarkMutes(b *testing.B, opts benchmarkOptions) {
}
}
ih := NewInhibitor(s, rules, m, promslog.NewNopLogger())
ih := NewInhibitor(s, rules, m, promslog.NewNopLogger(), NewInhibitorMetrics(r))
defer ih.Stop()
go ih.Run()

View File

@@ -125,9 +125,10 @@ func TestInhibitRuleHasEqual(t *testing.T) {
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
r := &InhibitRule{
Equal: map[model.LabelName]struct{}{},
scache: store.NewAlerts(),
sindex: newIndex(),
Equal: map[model.LabelName]struct{}{},
scache: store.NewAlerts(),
sindex: newIndex(),
metrics: NewRuleMetrics("test", NewInhibitorMetrics(prometheus.NewRegistry())),
}
for _, ln := range c.equal {
r.Equal[ln] = struct{}{}
@@ -159,7 +160,7 @@ func TestInhibitRuleMatches(t *testing.T) {
}
m := types.NewMarker(prometheus.NewRegistry())
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger)
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger, NewInhibitorMetrics(prometheus.NewRegistry()))
now := time.Now()
// Active alert that matches the source filter of rule1.
sourceAlert1 := &types.Alert{
@@ -260,7 +261,7 @@ func TestInhibitRuleMatchers(t *testing.T) {
}
m := types.NewMarker(prometheus.NewRegistry())
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger)
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger, NewInhibitorMetrics(prometheus.NewRegistry()))
now := time.Now()
// Active alert that matches the source filter of rule1.
sourceAlert1 := &types.Alert{
@@ -369,8 +370,8 @@ func TestInhibitRuleName(t *testing.T) {
Equal: []string{"instance"},
}
rule1 := NewInhibitRule(config1)
rule2 := NewInhibitRule(config2)
rule1 := NewInhibitRule(config1, nil)
rule2 := NewInhibitRule(config2, nil)
require.Equal(t, "test-rule", rule1.Name, "Expected named rule to have adopt name from config")
require.Empty(t, rule2.Name, "Expected unnamed rule to have empty name")
@@ -498,7 +499,7 @@ func TestInhibit(t *testing.T) {
} {
ap := newFakeAlerts(tc.alerts)
mk := types.NewMarker(prometheus.NewRegistry())
inhibitor := NewInhibitor(ap, []config.InhibitRule{inhibitRule()}, mk, nopLogger)
inhibitor := NewInhibitor(ap, []config.InhibitRule{inhibitRule()}, mk, nopLogger, NewInhibitorMetrics(prometheus.NewRegistry()))
go func() {
for ap.finished != nil {

124
inhibit/metric.go Normal file
View File

@@ -0,0 +1,124 @@
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package inhibit
import (
"github.com/prometheus/client_golang/prometheus"
)
// InhibitorMetrics bundles the Prometheus collectors used by an Inhibitor and
// by the inhibition rules it owns.
type InhibitorMetrics struct {
	// Inhibitor-wide collectors.

	// mutesDuration observes how long a whole Mutes call took, labelled by "muted".
	mutesDuration *prometheus.SummaryVec
	// sourceAlertsCacheItems is the inhibitor-wide gauge of cached source alerts.
	sourceAlertsCacheItems prometheus.Gauge
	// sourceAlertsIndexItems is the inhibitor-wide gauge of indexed source alerts.
	sourceAlertsIndexItems prometheus.Gauge

	// Per-rule collectors; every series carries a "rule" label.

	// ruleMatchesDuration is additionally labelled by "matched".
	ruleMatchesDuration *prometheus.SummaryVec
	// ruleMutesDuration is additionally labelled by "muted".
	ruleMutesDuration *prometheus.SummaryVec
	// ruleSourceAlertsCacheItems is the per-rule gauge of cached source alerts.
	ruleSourceAlertsCacheItems *prometheus.GaugeVec
	// ruleSourceAlertsIndexItems is the per-rule gauge of indexed source alerts.
	ruleSourceAlertsIndexItems *prometheus.GaugeVec
}
// NewInhibitorMetrics creates all inhibitor-wide and per-rule collectors and
// returns them as an InhibitorMetrics. When reg is non-nil the collectors are
// registered with it via MustRegister; a nil reg yields unregistered
// collectors. The two inhibitor-wide gauges are explicitly set to 0.
func NewInhibitorMetrics(reg prometheus.Registerer) *InhibitorMetrics {
	gauge := func(name, help string) prometheus.Gauge {
		return prometheus.NewGauge(prometheus.GaugeOpts{Name: name, Help: help})
	}
	gaugeVec := func(name, help string, labelNames ...string) *prometheus.GaugeVec {
		return prometheus.NewGaugeVec(prometheus.GaugeOpts{Name: name, Help: help}, labelNames)
	}
	summaryVec := func(name, help string, labelNames ...string) *prometheus.SummaryVec {
		return prometheus.NewSummaryVec(prometheus.SummaryOpts{Name: name, Help: help}, labelNames)
	}

	m := &InhibitorMetrics{
		// Inhibitor-wide collectors.
		sourceAlertsCacheItems: gauge(
			"alertmanager_inhibitor_source_alerts_cache_items",
			"Number of source alerts cached in inhibition rules.",
		),
		sourceAlertsIndexItems: gauge(
			"alertmanager_inhibitor_source_alerts_index_items",
			"Number of source alerts indexed in inhibition rules.",
		),
		mutesDuration: summaryVec(
			"alertmanager_inhibitor_mutes_duration_seconds",
			"Summary of latencies for the muting of alerts by inhibition rules.",
			"muted",
		),

		// Per-rule collectors.
		ruleSourceAlertsCacheItems: gaugeVec(
			"alertmanager_inhibit_rule_source_alerts_cache_items",
			"Number of source alerts cached in inhibition rules.",
			"rule",
		),
		ruleSourceAlertsIndexItems: gaugeVec(
			"alertmanager_inhibit_rule_source_alerts_index_items",
			"Number of source alerts indexed in inhibition rules.",
			"rule",
		),
		ruleMatchesDuration: summaryVec(
			"alertmanager_inhibit_rule_matches_duration_seconds",
			"Summary of latencies for the matching of alerts by inhibition rules.",
			"rule", "matched",
		),
		ruleMutesDuration: summaryVec(
			"alertmanager_inhibit_rule_mutes_duration_seconds",
			"Summary of latencies for the muting of alerts by inhibition rules.",
			"rule", "muted",
		),
	}

	if reg != nil {
		collectors := []prometheus.Collector{
			m.sourceAlertsCacheItems,
			m.sourceAlertsIndexItems,
			m.mutesDuration,
			m.ruleSourceAlertsCacheItems,
			m.ruleSourceAlertsIndexItems,
			m.ruleMatchesDuration,
			m.ruleMutesDuration,
		}
		reg.MustRegister(collectors...)
	}

	m.sourceAlertsCacheItems.Set(0)
	m.sourceAlertsIndexItems.Set(0)

	return m
}
// RuleMetrics holds the name of a single inhibition rule together with the
// per-rule metric vectors it reports into.
type RuleMetrics struct {
	// ruleName is the rule name this RuleMetrics was created for.
	ruleName string

	// Duration summaries.
	matchesDuration *prometheus.SummaryVec
	mutesDuration   *prometheus.SummaryVec

	// Source alert gauges.
	sourceAlertsCacheItems *prometheus.GaugeVec
	sourceAlertsIndexItems *prometheus.GaugeVec
}
// NewRuleMetrics returns the RuleMetrics for the rule called name, wired to the
// per-rule vectors of metrics. metrics must not be nil because its fields are
// read unconditionally. The cache and index gauges for the rule label are set
// to 0 on creation.
func NewRuleMetrics(name string, metrics *InhibitorMetrics) *RuleMetrics {
	rm := new(RuleMetrics)
	rm.ruleName = name
	rm.matchesDuration = metrics.ruleMatchesDuration
	rm.mutesDuration = metrics.ruleMutesDuration
	rm.sourceAlertsCacheItems = metrics.ruleSourceAlertsCacheItems
	rm.sourceAlertsIndexItems = metrics.ruleSourceAlertsIndexItems

	for _, g := range []*prometheus.GaugeVec{rm.sourceAlertsCacheItems, rm.sourceAlertsIndexItems} {
		g.With(prometheus.Labels{"rule": rm.ruleName}).Set(0)
	}

	return rm
}

501
inhibit/metric_test.go Normal file
View File

@@ -0,0 +1,501 @@
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package inhibit
import (
"testing"
"time"
"github.com/prometheus/client_golang/prometheus"
io_prometheus_client "github.com/prometheus/client_model/go"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/require"
"github.com/prometheus/alertmanager/config"
"github.com/prometheus/alertmanager/pkg/labels"
"github.com/prometheus/alertmanager/provider/mem"
"github.com/prometheus/alertmanager/types"
)
// getMetricValue gathers reg and looks for the first sample in the family
// metricName whose labels contain every pair in labels. For a gauge it returns
// the gauge value, for a summary the sample count. The boolean is false when no
// gauge or summary sample matched.
func getMetricValue(t *testing.T, reg *prometheus.Registry, metricName string, labels map[string]string) (float64, uint64, bool) {
	t.Helper()

	families, err := reg.Gather()
	require.NoError(t, err)

	for _, family := range families {
		if family.GetName() != metricName {
			continue
		}
		for _, sample := range family.GetMetric() {
			if !labelsMatch(sample, labels) {
				continue
			}
			// Other metric types are skipped and the search continues.
			switch family.GetType() {
			case io_prometheus_client.MetricType_GAUGE:
				return sample.GetGauge().GetValue(), 0, true
			case io_prometheus_client.MetricType_SUMMARY:
				return 0, sample.GetSummary().GetSampleCount(), true
			}
		}
	}

	return 0, 0, false
}
// labelsMatch reports whether metric carries every name/value pair listed in
// wantLabels. Extra labels on the metric are ignored, and an empty wantLabels
// matches any metric.
func labelsMatch(metric *io_prometheus_client.Metric, wantLabels map[string]string) bool {
	hasPair := func(name, value string) bool {
		for _, lp := range metric.GetLabel() {
			if lp.GetName() == name && lp.GetValue() == value {
				return true
			}
		}
		return false
	}

	for name, value := range wantLabels {
		if !hasPair(name, value) {
			return false
		}
	}

	return true
}
// TestInhibitorMetrics_RuleMatchesDuration checks that Mutes records one sample
// in the per-rule matches-duration summary, labelled matched="true" when the
// rule's target matchers match and matched="false" when they do not.
func TestInhibitorMetrics_RuleMatchesDuration(t *testing.T) {
	reg := prometheus.NewRegistry()
	metrics := NewInhibitorMetrics(reg)

	rules := []config.InhibitRule{
		{
			Name: "test-rule",
			SourceMatchers: []*labels.Matcher{
				{Type: labels.MatchEqual, Name: "severity", Value: "critical"},
			},
			TargetMatchers: []*labels.Matcher{
				{Type: labels.MatchEqual, Name: "severity", Value: "warning"},
			},
			Equal: []string{"instance"},
		},
	}

	marker := types.NewMarker(reg)
	inhibitor := NewInhibitor(nil, rules, marker, nopLogger, metrics)

	cases := []struct {
		lset     model.LabelSet
		matched  string
		foundMsg string
		countMsg string
	}{
		{
			// Target matches (should record matched="true").
			lset:     model.LabelSet{"severity": "warning", "instance": "server1"},
			matched:  "true",
			foundMsg: "Should find matched=true metric",
			countMsg: "Should have 1 sample for matched=true",
		},
		{
			// Target doesn't match (should record matched="false").
			lset:     model.LabelSet{"severity": "info", "instance": "server2"},
			matched:  "false",
			foundMsg: "Should find matched=false metric",
			countMsg: "Should have 1 sample for matched=false",
		},
	}

	for _, c := range cases {
		inhibitor.Mutes(c.lset)

		_, count, found := getMetricValue(t, reg, "alertmanager_inhibit_rule_matches_duration_seconds",
			map[string]string{"rule": "test-rule", "matched": c.matched})
		require.True(t, found, c.foundMsg)
		require.Equal(t, uint64(1), count, c.countMsg)
	}
}
// TestInhibitorMetrics_RuleMutesDuration_Muted checks that muting a target
// alert records one muted="true" sample in both the per-rule and the
// inhibitor-wide mutes-duration summaries.
func TestInhibitorMetrics_RuleMutesDuration_Muted(t *testing.T) {
	reg := prometheus.NewRegistry()
	metrics := NewInhibitorMetrics(reg)

	rules := []config.InhibitRule{
		{
			Name: "test-rule",
			SourceMatchers: []*labels.Matcher{
				{Type: labels.MatchEqual, Name: "severity", Value: "critical"},
			},
			TargetMatchers: []*labels.Matcher{
				{Type: labels.MatchEqual, Name: "severity", Value: "warning"},
			},
			Equal: []string{"instance"},
		},
	}

	marker := types.NewMarker(reg)
	inhibitor := NewInhibitor(nil, rules, marker, nopLogger, metrics)

	// Add a source alert that will inhibit.
	sourceAlert := &types.Alert{
		Alert: model.Alert{
			Labels: model.LabelSet{
				"severity": "critical",
				"instance": "server1",
			},
			StartsAt: time.Now().Add(-time.Minute),
			EndsAt:   time.Now().Add(time.Hour),
		},
	}
	// Fail fast if the fixture cannot be stored, instead of surfacing later as
	// an unrelated "Alert should be muted" failure.
	require.NoError(t, inhibitor.rules[0].scache.Set(sourceAlert))
	inhibitor.rules[0].updateIndex(sourceAlert)

	// Test that target alert is muted.
	targetAlert := model.LabelSet{
		"severity": "warning",
		"instance": "server1",
	}

	muted := inhibitor.Mutes(targetAlert)
	require.True(t, muted, "Alert should be muted")

	// Verify per-rule muted="true" metric was recorded.
	_, count, found := getMetricValue(t, reg, "alertmanager_inhibit_rule_mutes_duration_seconds",
		map[string]string{"rule": "test-rule", "muted": "true"})
	require.True(t, found, "Should find per-rule muted=true metric")
	require.Equal(t, uint64(1), count, "Should have 1 sample for per-rule muted=true")

	// Verify global muted="true" metric was recorded.
	_, count, found = getMetricValue(t, reg, "alertmanager_inhibitor_mutes_duration_seconds",
		map[string]string{"muted": "true"})
	require.True(t, found, "Should find global muted=true metric")
	require.Equal(t, uint64(1), count, "Should have 1 sample for global muted=true")
}
// TestInhibitorMetrics_RuleMutesDuration_NotMuted checks that a target alert
// which matches the rule's target matchers but is not inhibited records one
// muted="false" sample in both the per-rule and the inhibitor-wide
// mutes-duration summaries.
func TestInhibitorMetrics_RuleMutesDuration_NotMuted(t *testing.T) {
	reg := prometheus.NewRegistry()
	metrics := NewInhibitorMetrics(reg)

	rules := []config.InhibitRule{
		{
			Name: "test-rule",
			SourceMatchers: []*labels.Matcher{
				{Type: labels.MatchEqual, Name: "severity", Value: "critical"},
			},
			TargetMatchers: []*labels.Matcher{
				{Type: labels.MatchEqual, Name: "severity", Value: "warning"},
			},
			Equal: []string{"instance"},
		},
	}

	marker := types.NewMarker(reg)
	inhibitor := NewInhibitor(nil, rules, marker, nopLogger, metrics)

	// Add a source alert with a different instance than the target below.
	sourceAlert := &types.Alert{
		Alert: model.Alert{
			Labels: model.LabelSet{
				"severity": "critical",
				"instance": "server1",
			},
			StartsAt: time.Now().Add(-time.Minute),
			EndsAt:   time.Now().Add(time.Hour),
		},
	}
	require.NoError(t, inhibitor.rules[0].scache.Set(sourceAlert))
	// Index the source alert as well, mirroring what the inhibitor's run loop
	// does after caching an alert, so that the rule state is fully populated
	// when the non-matching target is evaluated.
	inhibitor.rules[0].updateIndex(sourceAlert)

	// Test that target alert with different instance is NOT muted.
	targetAlert := model.LabelSet{
		"severity": "warning",
		"instance": "server2",
	}

	muted := inhibitor.Mutes(targetAlert)
	require.False(t, muted, "Alert should not be muted")

	// Verify per-rule muted="false" metric was recorded.
	_, count, found := getMetricValue(t, reg, "alertmanager_inhibit_rule_mutes_duration_seconds",
		map[string]string{"rule": "test-rule", "muted": "false"})
	require.True(t, found, "Should find per-rule muted=false metric")
	require.Equal(t, uint64(1), count, "Should have 1 sample for per-rule muted=false")

	// Verify global muted="false" metric was recorded.
	_, count, found = getMetricValue(t, reg, "alertmanager_inhibitor_mutes_duration_seconds",
		map[string]string{"muted": "false"})
	require.True(t, found, "Should find global muted=false metric")
	require.Equal(t, uint64(1), count, "Should have 1 sample for global muted=false")
}
// TestInhibitorMetrics_NoRuleMatches checks the metrics recorded when an alert
// matches no rule's target matchers: one inhibitor-wide muted="false" sample
// and one per-rule matched="false" sample.
func TestInhibitorMetrics_NoRuleMatches(t *testing.T) {
	reg := prometheus.NewRegistry()
	metrics := NewInhibitorMetrics(reg)

	rules := []config.InhibitRule{
		{
			Name: "test-rule",
			SourceMatchers: []*labels.Matcher{
				{Type: labels.MatchEqual, Name: "severity", Value: "critical"},
			},
			TargetMatchers: []*labels.Matcher{
				{Type: labels.MatchEqual, Name: "severity", Value: "warning"},
			},
			Equal: []string{"instance"},
		},
	}

	marker := types.NewMarker(reg)
	inhibitor := NewInhibitor(nil, rules, marker, nopLogger, metrics)

	// An alert that doesn't match any rule's target.
	nonMatchingAlert := model.LabelSet{
		"severity": "info",
		"instance": "server1",
	}
	require.False(t, inhibitor.Mutes(nonMatchingAlert), "Alert should not be muted")

	checks := []struct {
		metric   string
		labels   map[string]string
		foundMsg string
		countMsg string
	}{
		{
			// Global muted="false" sample.
			metric:   "alertmanager_inhibitor_mutes_duration_seconds",
			labels:   map[string]string{"muted": "false"},
			foundMsg: "Should find global muted=false metric",
			countMsg: "Should have 1 sample for global muted=false",
		},
		{
			// Per-rule matched="false" sample.
			metric:   "alertmanager_inhibit_rule_matches_duration_seconds",
			labels:   map[string]string{"rule": "test-rule", "matched": "false"},
			foundMsg: "Should find rule matched=false metric",
			countMsg: "Should have 1 sample for rule matched=false",
		},
	}

	for _, c := range checks {
		_, count, found := getMetricValue(t, reg, c.metric, c.labels)
		require.True(t, found, c.foundMsg)
		require.Equal(t, uint64(1), count, c.countMsg)
	}
}
// TestInhibitorMetrics_MultipleRules checks that per-rule metrics are recorded
// under the right rule label when several rules are configured, and that the
// inhibitor-wide mutes summary records one sample per Mutes call.
func TestInhibitorMetrics_MultipleRules(t *testing.T) {
	reg := prometheus.NewRegistry()
	metrics := NewInhibitorMetrics(reg)

	rules := []config.InhibitRule{
		{
			Name: "rule-1",
			SourceMatchers: []*labels.Matcher{
				{Type: labels.MatchEqual, Name: "severity", Value: "critical"},
			},
			TargetMatchers: []*labels.Matcher{
				{Type: labels.MatchEqual, Name: "severity", Value: "warning"},
			},
			Equal: []string{"instance"},
		},
		{
			Name: "rule-2",
			SourceMatchers: []*labels.Matcher{
				{Type: labels.MatchEqual, Name: "team", Value: "sre"},
			},
			TargetMatchers: []*labels.Matcher{
				{Type: labels.MatchEqual, Name: "team", Value: "dev"},
			},
			Equal: []string{"service"},
		},
	}

	marker := types.NewMarker(reg)
	inhibitor := NewInhibitor(nil, rules, marker, nopLogger, metrics)

	// requireOneSample asserts that the summary series identified by metricName
	// and lbls exists and holds exactly one sample.
	requireOneSample := func(metricName string, lbls map[string]string, foundMsg string) {
		t.Helper()
		_, count, found := getMetricValue(t, reg, metricName, lbls)
		require.True(t, found, foundMsg)
		require.Equal(t, uint64(1), count, "Should have 1 sample")
	}

	// Add source alert for rule-1.
	sourceAlert1 := &types.Alert{
		Alert: model.Alert{
			Labels: model.LabelSet{
				"severity": "critical",
				"instance": "server1",
			},
			StartsAt: time.Now().Add(-time.Minute),
			EndsAt:   time.Now().Add(time.Hour),
		},
	}
	// Fail fast if the fixture cannot be stored.
	require.NoError(t, inhibitor.rules[0].scache.Set(sourceAlert1))
	inhibitor.rules[0].updateIndex(sourceAlert1)

	// Test alert that matches rule-1.
	targetAlert1 := model.LabelSet{
		"severity": "warning",
		"instance": "server1",
	}

	muted1 := inhibitor.Mutes(targetAlert1)
	require.True(t, muted1, "Alert should be muted by rule-1")

	// Verify metrics for rule-1.
	requireOneSample("alertmanager_inhibit_rule_matches_duration_seconds",
		map[string]string{"rule": "rule-1", "matched": "true"},
		"Should find rule-1 matched=true metric")
	requireOneSample("alertmanager_inhibit_rule_mutes_duration_seconds",
		map[string]string{"rule": "rule-1", "muted": "true"},
		"Should find rule-1 muted=true metric")

	// Verify global muted="true" metric.
	requireOneSample("alertmanager_inhibitor_mutes_duration_seconds",
		map[string]string{"muted": "true"},
		"Should find global muted=true metric")

	// Test alert that matches rule-2 target but has no source.
	targetAlert2 := model.LabelSet{
		"team":    "dev",
		"service": "api",
	}

	muted2 := inhibitor.Mutes(targetAlert2)
	require.False(t, muted2, "Alert should not be muted")

	// Both rules evaluate this alert: rule-1's target matchers do not match, so
	// evaluation continues to rule-2, whose target matchers do match.
	requireOneSample("alertmanager_inhibit_rule_matches_duration_seconds",
		map[string]string{"rule": "rule-1", "matched": "false"},
		"Should find rule-1 matched=false metric")
	requireOneSample("alertmanager_inhibit_rule_matches_duration_seconds",
		map[string]string{"rule": "rule-2", "matched": "true"},
		"Should find rule-2 matched=true metric")
	requireOneSample("alertmanager_inhibit_rule_mutes_duration_seconds",
		map[string]string{"rule": "rule-2", "muted": "false"},
		"Should find rule-2 muted=false metric")

	// Verify global muted="false" metric.
	requireOneSample("alertmanager_inhibitor_mutes_duration_seconds",
		map[string]string{"muted": "false"},
		"Should find global muted=false metric")
}
// TestInhibitorMetrics_CacheAndIndexItems runs an inhibitor against an
// in-memory alert provider and checks the cache/index gauges after three
// source alerts were put into the provider. Both configured rules use the same
// source matcher, so each rule is expected to hold all three alerts.
func TestInhibitorMetrics_CacheAndIndexItems(t *testing.T) {
	reg := prometheus.NewRegistry()
	metrics := NewInhibitorMetrics(reg)

	rules := []config.InhibitRule{
		{
			Name: "named-rule",
			SourceMatchers: []*labels.Matcher{
				{Type: labels.MatchEqual, Name: "severity", Value: "critical"},
			},
			TargetMatchers: []*labels.Matcher{
				{Type: labels.MatchEqual, Name: "severity", Value: "warning"},
			},
			Equal: []string{"instance"},
		},
		{
			// Unnamed rule.
			SourceMatchers: []*labels.Matcher{
				{Type: labels.MatchEqual, Name: "severity", Value: "critical"},
			},
			TargetMatchers: []*labels.Matcher{
				{Type: labels.MatchEqual, Name: "severity", Value: "warning"},
			},
			Equal: []string{"cluster"},
		},
	}

	marker := types.NewMarker(reg)
	alertStore, err := mem.NewAlerts(t.Context(), marker, 15*time.Minute, nil, nopLogger, reg)
	require.NoError(t, err)

	inhibitor := NewInhibitor(alertStore, rules, marker, nopLogger, metrics)
	go inhibitor.Run()

	// Add multiple source alerts.
	for _, suffix := range []string{"1", "2", "3"} {
		lset := model.LabelSet{
			"severity": "critical",
			"instance": model.LabelValue("server" + suffix),
			"cluster":  model.LabelValue("cluster" + suffix),
		}
		alert := &types.Alert{
			Alert: model.Alert{
				Labels:   lset,
				StartsAt: time.Now().Add(-time.Minute),
				EndsAt:   time.Now().Add(time.Hour),
			},
		}
		require.NoError(t, alertStore.Put(alert))
	}

	// Run() processes alerts asynchronously, so poll until the global cache
	// gauge reports all of them.
	cacheReachedSix := func() bool {
		value, _, found := getMetricValue(t, reg, "alertmanager_inhibitor_source_alerts_cache_items",
			map[string]string{})
		return found && value == 6
	}
	require.Eventually(t, cacheReachedSix, 2*time.Second, 50*time.Millisecond, "Cache items metric should reach 6")

	// Stop the inhibitor.
	inhibitor.Stop()

	gaugeChecks := []struct {
		metric   string
		labels   map[string]string
		want     float64
		foundMsg string
		valueMsg string
	}{
		// Global metrics (no labels) show the sum across all rules.
		{
			metric:   "alertmanager_inhibitor_source_alerts_cache_items",
			labels:   map[string]string{},
			want:     float64(6),
			foundMsg: "Should find global cache items metric",
			valueMsg: "Global cache should contain 6 alerts total",
		},
		{
			metric:   "alertmanager_inhibitor_source_alerts_index_items",
			labels:   map[string]string{},
			want:     float64(6),
			foundMsg: "Should find global index items metric",
			valueMsg: "Global index should contain 6 entries total",
		},
		// Per-rule metrics show individual rule values.
		{
			metric:   "alertmanager_inhibit_rule_source_alerts_cache_items",
			labels:   map[string]string{"rule": "named-rule"},
			want:     float64(3),
			foundMsg: "Should find per-rule cache items metric",
			valueMsg: "Named rule cache should contain 3 alerts",
		},
		{
			metric:   "alertmanager_inhibit_rule_source_alerts_index_items",
			labels:   map[string]string{"rule": "named-rule"},
			want:     float64(3),
			foundMsg: "Should find per-rule index items metric",
			valueMsg: "Named rule index should contain 3 entries",
		},
	}

	for _, c := range gaugeChecks {
		value, _, found := getMetricValue(t, reg, c.metric, c.labels)
		require.True(t, found, c.foundMsg)
		require.Equal(t, c.want, value, c.valueMsg)
	}
}
// TestInhibitorMetrics_Registration checks that every inhibitor and per-rule
// metric family shows up in the registry's Gather output once a rule exists
// and Mutes has been called.
func TestInhibitorMetrics_Registration(t *testing.T) {
	reg := prometheus.NewRegistry()
	metrics := NewInhibitorMetrics(reg)
	require.NotNil(t, metrics, "Metrics should be created")

	// Create a rule and use the metrics so they appear in Gather() output.
	rules := []config.InhibitRule{
		{
			Name: "test-rule",
			SourceMatchers: []*labels.Matcher{
				{Type: labels.MatchEqual, Name: "severity", Value: "critical"},
			},
			TargetMatchers: []*labels.Matcher{
				{Type: labels.MatchEqual, Name: "severity", Value: "warning"},
			},
			Equal: []string{"instance"},
		},
	}

	marker := types.NewMarker(reg)
	inhibitor := NewInhibitor(nil, rules, marker, nopLogger, metrics)

	// Use the metrics to ensure they show up in Gather().
	inhibitor.Mutes(model.LabelSet{
		"severity": "warning",
		"instance": "server1",
	})

	families, err := reg.Gather()
	require.NoError(t, err)

	// Collect the names of all gathered metric families.
	gathered := make(map[string]bool, len(families))
	for _, mf := range families {
		gathered[mf.GetName()] = true
	}

	// Verify all metrics are registered and have data.
	wantNames := []string{
		"alertmanager_inhibitor_source_alerts_cache_items",
		"alertmanager_inhibitor_source_alerts_index_items",
		"alertmanager_inhibitor_mutes_duration_seconds",
		"alertmanager_inhibit_rule_source_alerts_cache_items",
		"alertmanager_inhibit_rule_source_alerts_index_items",
		"alertmanager_inhibit_rule_matches_duration_seconds",
		"alertmanager_inhibit_rule_mutes_duration_seconds",
	}
	for _, metricName := range wantNames {
		require.True(t, gathered[metricName], "Metric %s should be registered", metricName)
	}
}

View File

@@ -150,3 +150,11 @@ func (a *Alerts) Empty() bool {
return len(a.c) == 0
}
// Len reports how many alerts the store currently holds. The count is read
// while holding the store's lock.
func (a *Alerts) Len() int {
	a.Lock()
	n := len(a.c)
	a.Unlock()

	return n
}