mirror of
https://github.com/prometheus/alertmanager.git
synced 2026-02-05 06:45:45 +01:00
feat: add distributed tracing support (#4745)
Add tracing support using otel to the the following components: - api: extract trace and span IDs from request context - provider: mem put - dispatch: split logic and use better naming - inhibit: source and target traces, mutes, etc. drop metrics - silence: query, expire, mutes - notify: add distributed tracing support to stages and all http requests Note: inhibitor metrics are dropped since we have tracing now and they are not needed. We have not released any version with these metrics so we can drop them safely, this is not a breaking change. This change borrows part of the implementation from #3673 Fixes #3670 Signed-off-by: Dave Henderson <dhenderson@gmail.com> Signed-off-by: Siavash Safi <siavash@cloudflare.com> Co-authored-by: Dave Henderson <dhenderson@gmail.com>
This commit is contained in:
@@ -28,6 +28,7 @@ import (
|
||||
"github.com/prometheus/common/model"
|
||||
"github.com/prometheus/common/promslog"
|
||||
"github.com/prometheus/common/route"
|
||||
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
|
||||
|
||||
apiv2 "github.com/prometheus/alertmanager/api/v2"
|
||||
"github.com/prometheus/alertmanager/cluster"
|
||||
@@ -201,7 +202,7 @@ func (api *API) Register(r *route.Router, routePrefix string) *http.ServeMux {
|
||||
|
||||
// Update config and resolve timeout of each API. APIv2 also needs
|
||||
// setAlertStatus to be updated.
|
||||
func (api *API) Update(cfg *config.Config, setAlertStatus func(model.LabelSet)) {
|
||||
func (api *API) Update(cfg *config.Config, setAlertStatus func(ctx context.Context, labels model.LabelSet)) {
|
||||
api.v2.Update(cfg, setAlertStatus)
|
||||
}
|
||||
|
||||
@@ -242,7 +243,7 @@ func (api *API) instrumentHandler(prefix string, h http.Handler) http.Handler {
|
||||
}
|
||||
promhttp.InstrumentHandlerDuration(
|
||||
api.requestDuration.MustCurryWith(prometheus.Labels{"handler": path}),
|
||||
h,
|
||||
otelhttp.WithRouteTag(path, h),
|
||||
).ServeHTTP(w, r)
|
||||
})
|
||||
}
|
||||
|
||||
@@ -33,6 +33,8 @@ import (
|
||||
prometheus_model "github.com/prometheus/common/model"
|
||||
"github.com/prometheus/common/version"
|
||||
"github.com/rs/cors"
|
||||
"go.opentelemetry.io/otel"
|
||||
"go.opentelemetry.io/otel/codes"
|
||||
|
||||
"github.com/prometheus/alertmanager/api/metrics"
|
||||
open_api_models "github.com/prometheus/alertmanager/api/v2/models"
|
||||
@@ -54,6 +56,8 @@ import (
|
||||
"github.com/prometheus/alertmanager/types"
|
||||
)
|
||||
|
||||
var tracer = otel.Tracer("github.com/prometheus/alertmanager/api/v2")
|
||||
|
||||
// API represents an Alertmanager API v2.
|
||||
type API struct {
|
||||
peer cluster.ClusterPeer
|
||||
@@ -82,7 +86,7 @@ type (
|
||||
groupsFn func(context.Context, func(*dispatch.Route) bool, func(*types.Alert, time.Time) bool) (dispatch.AlertGroups, map[prometheus_model.Fingerprint][]string, error)
|
||||
groupMutedFunc func(routeID, groupKey string) ([]string, bool)
|
||||
getAlertStatusFn func(prometheus_model.Fingerprint) types.AlertStatus
|
||||
setAlertStatusFn func(prometheus_model.LabelSet)
|
||||
setAlertStatusFn func(ctx context.Context, labels prometheus_model.LabelSet)
|
||||
)
|
||||
|
||||
// NewAPI returns a new Alertmanager API v2.
|
||||
@@ -173,6 +177,9 @@ func (api *API) getStatusHandler(params general_ops.GetStatusParams) middleware.
|
||||
api.mtx.RLock()
|
||||
defer api.mtx.RUnlock()
|
||||
|
||||
_, span := tracer.Start(params.HTTPRequest.Context(), "api.getStatusHandler")
|
||||
defer span.End()
|
||||
|
||||
original := api.alertmanagerConfig.String()
|
||||
uptime := strfmt.DateTime(api.uptime)
|
||||
|
||||
@@ -229,6 +236,9 @@ func (api *API) getReceiversHandler(params receiver_ops.GetReceiversParams) midd
|
||||
api.mtx.RLock()
|
||||
defer api.mtx.RUnlock()
|
||||
|
||||
_, span := tracer.Start(params.HTTPRequest.Context(), "api.getReceiversHandler")
|
||||
defer span.End()
|
||||
|
||||
receivers := make([]*open_api_models.Receiver, 0, len(api.alertmanagerConfig.Receivers))
|
||||
for i := range api.alertmanagerConfig.Receivers {
|
||||
receivers = append(receivers, &open_api_models.Receiver{Name: &api.alertmanagerConfig.Receivers[i].Name})
|
||||
@@ -243,11 +253,13 @@ func (api *API) getAlertsHandler(params alert_ops.GetAlertsParams) middleware.Re
|
||||
// Initialize result slice to prevent api returning `null` when there
|
||||
// are no alerts present
|
||||
res = open_api_models.GettableAlerts{}
|
||||
ctx = params.HTTPRequest.Context()
|
||||
|
||||
logger = api.requestLogger(params.HTTPRequest)
|
||||
)
|
||||
|
||||
ctx, span := tracer.Start(params.HTTPRequest.Context(), "api.getAlertsHandler")
|
||||
defer span.End()
|
||||
|
||||
matchers, err := parseFilter(params.Filter)
|
||||
if err != nil {
|
||||
logger.Debug("Failed to parse matchers", "err", err)
|
||||
@@ -274,6 +286,7 @@ func (api *API) getAlertsHandler(params alert_ops.GetAlertsParams) middleware.Re
|
||||
|
||||
api.mtx.RLock()
|
||||
for a := range alerts.Next() {
|
||||
alert := a.Data
|
||||
if err = alerts.Err(); err != nil {
|
||||
break
|
||||
}
|
||||
@@ -281,7 +294,7 @@ func (api *API) getAlertsHandler(params alert_ops.GetAlertsParams) middleware.Re
|
||||
break
|
||||
}
|
||||
|
||||
routes := api.route.Match(a.Labels)
|
||||
routes := api.route.Match(alert.Labels)
|
||||
receivers := make([]string, 0, len(routes))
|
||||
for _, r := range routes {
|
||||
receivers = append(receivers, r.RouteOpts.Receiver)
|
||||
@@ -291,13 +304,13 @@ func (api *API) getAlertsHandler(params alert_ops.GetAlertsParams) middleware.Re
|
||||
continue
|
||||
}
|
||||
|
||||
if !alertFilter(a, now) {
|
||||
if !alertFilter(alert, now) {
|
||||
continue
|
||||
}
|
||||
|
||||
alert := AlertToOpenAPIAlert(a, api.getAlertStatus(a.Fingerprint()), receivers, nil)
|
||||
openAlert := AlertToOpenAPIAlert(alert, api.getAlertStatus(alert.Fingerprint()), receivers, nil)
|
||||
|
||||
res = append(res, alert)
|
||||
res = append(res, openAlert)
|
||||
}
|
||||
api.mtx.RUnlock()
|
||||
|
||||
@@ -315,7 +328,11 @@ func (api *API) getAlertsHandler(params alert_ops.GetAlertsParams) middleware.Re
|
||||
func (api *API) postAlertsHandler(params alert_ops.PostAlertsParams) middleware.Responder {
|
||||
logger := api.requestLogger(params.HTTPRequest)
|
||||
|
||||
alerts := OpenAPIAlertsToAlerts(params.Alerts)
|
||||
ctx, span := tracer.Start(params.HTTPRequest.Context(), "api.postAlertsHandler")
|
||||
defer span.End()
|
||||
|
||||
alerts := OpenAPIAlertsToAlerts(ctx, params.Alerts)
|
||||
|
||||
now := time.Now()
|
||||
|
||||
api.mtx.RLock()
|
||||
@@ -361,13 +378,19 @@ func (api *API) postAlertsHandler(params alert_ops.PostAlertsParams) middleware.
|
||||
}
|
||||
validAlerts = append(validAlerts, a)
|
||||
}
|
||||
if err := api.alerts.Put(validAlerts...); err != nil {
|
||||
logger.Error("Failed to create alerts", "err", err)
|
||||
if err := api.alerts.Put(ctx, validAlerts...); err != nil {
|
||||
message := "Failed to create alerts"
|
||||
logger.Error(message, "err", err)
|
||||
span.SetStatus(codes.Error, message)
|
||||
span.RecordError(err)
|
||||
return alert_ops.NewPostAlertsInternalServerError().WithPayload(err.Error())
|
||||
}
|
||||
|
||||
if validationErrs.Len() > 0 {
|
||||
logger.Error("Failed to validate alerts", "err", validationErrs.Error())
|
||||
message := "Failed to validate alerts"
|
||||
logger.Error(message, "err", validationErrs.Error())
|
||||
span.SetStatus(codes.Error, message)
|
||||
span.RecordError(validationErrs)
|
||||
return alert_ops.NewPostAlertsBadRequest().WithPayload(validationErrs.Error())
|
||||
}
|
||||
|
||||
@@ -377,6 +400,9 @@ func (api *API) postAlertsHandler(params alert_ops.PostAlertsParams) middleware.
|
||||
func (api *API) getAlertGroupsHandler(params alertgroup_ops.GetAlertGroupsParams) middleware.Responder {
|
||||
logger := api.requestLogger(params.HTTPRequest)
|
||||
|
||||
ctx, span := tracer.Start(params.HTTPRequest.Context(), "api.getAlertGroupsHandler")
|
||||
defer span.End()
|
||||
|
||||
matchers, err := parseFilter(params.Filter)
|
||||
if err != nil {
|
||||
logger.Debug("Failed to parse matchers", "err", err)
|
||||
@@ -407,8 +433,12 @@ func (api *API) getAlertGroupsHandler(params alertgroup_ops.GetAlertGroupsParams
|
||||
}(receiverFilter)
|
||||
|
||||
af := api.alertFilter(matchers, *params.Silenced, *params.Inhibited, *params.Active)
|
||||
alertGroups, allReceivers, err := api.alertGroups(params.HTTPRequest.Context(), rf, af)
|
||||
alertGroups, allReceivers, err := api.alertGroups(ctx, rf, af)
|
||||
if err != nil {
|
||||
message := "Failed to get alert groups"
|
||||
logger.Error(message, "err", err)
|
||||
span.SetStatus(codes.Error, message)
|
||||
span.RecordError(err)
|
||||
return alertgroup_ops.NewGetAlertGroupsInternalServerError()
|
||||
}
|
||||
|
||||
@@ -441,12 +471,15 @@ func (api *API) getAlertGroupsHandler(params alertgroup_ops.GetAlertGroupsParams
|
||||
|
||||
func (api *API) alertFilter(matchers []*labels.Matcher, silenced, inhibited, active bool) func(a *types.Alert, now time.Time) bool {
|
||||
return func(a *types.Alert, now time.Time) bool {
|
||||
ctx, span := tracer.Start(context.Background(), "alertFilter")
|
||||
defer span.End()
|
||||
|
||||
if !a.EndsAt.IsZero() && a.EndsAt.Before(now) {
|
||||
return false
|
||||
}
|
||||
|
||||
// Set alert's current status based on its label set.
|
||||
api.setAlertStatus(a.Labels)
|
||||
api.setAlertStatus(ctx, a.Labels)
|
||||
|
||||
// Get alert's current status after seeing if it is suppressed.
|
||||
status := api.getAlertStatus(a.Fingerprint())
|
||||
@@ -510,13 +543,16 @@ func matchFilterLabels(matchers []*labels.Matcher, sms map[string]string) bool {
|
||||
func (api *API) getSilencesHandler(params silence_ops.GetSilencesParams) middleware.Responder {
|
||||
logger := api.requestLogger(params.HTTPRequest)
|
||||
|
||||
ctx, span := tracer.Start(params.HTTPRequest.Context(), "api.getSilencesHandler")
|
||||
defer span.End()
|
||||
|
||||
matchers, err := parseFilter(params.Filter)
|
||||
if err != nil {
|
||||
logger.Debug("Failed to parse matchers", "err", err)
|
||||
return silence_ops.NewGetSilencesBadRequest().WithPayload(err.Error())
|
||||
}
|
||||
|
||||
psils, _, err := api.silences.Query()
|
||||
psils, _, err := api.silences.Query(ctx)
|
||||
if err != nil {
|
||||
logger.Error("Failed to get silences", "err", err)
|
||||
return silence_ops.NewGetSilencesInternalServerError().WithPayload(err.Error())
|
||||
@@ -606,7 +642,10 @@ func CheckSilenceMatchesFilterLabels(s *silencepb.Silence, matchers []*labels.Ma
|
||||
func (api *API) getSilenceHandler(params silence_ops.GetSilenceParams) middleware.Responder {
|
||||
logger := api.requestLogger(params.HTTPRequest)
|
||||
|
||||
sils, _, err := api.silences.Query(silence.QIDs(params.SilenceID.String()))
|
||||
ctx, span := tracer.Start(params.HTTPRequest.Context(), "api.getSilenceHandler")
|
||||
defer span.End()
|
||||
|
||||
sils, _, err := api.silences.Query(ctx, silence.QIDs(params.SilenceID.String()))
|
||||
if err != nil {
|
||||
logger.Error("Failed to get silence by id", "err", err, "id", params.SilenceID.String())
|
||||
return silence_ops.NewGetSilenceInternalServerError().WithPayload(err.Error())
|
||||
@@ -629,8 +668,11 @@ func (api *API) getSilenceHandler(params silence_ops.GetSilenceParams) middlewar
|
||||
func (api *API) deleteSilenceHandler(params silence_ops.DeleteSilenceParams) middleware.Responder {
|
||||
logger := api.requestLogger(params.HTTPRequest)
|
||||
|
||||
ctx, span := tracer.Start(params.HTTPRequest.Context(), "api.deleteSilenceHandler")
|
||||
defer span.End()
|
||||
|
||||
sid := params.SilenceID.String()
|
||||
if err := api.silences.Expire(sid); err != nil {
|
||||
if err := api.silences.Expire(ctx, sid); err != nil {
|
||||
logger.Error("Failed to expire silence", "err", err)
|
||||
if errors.Is(err, silence.ErrNotFound) {
|
||||
return silence_ops.NewDeleteSilenceNotFound()
|
||||
@@ -643,6 +685,9 @@ func (api *API) deleteSilenceHandler(params silence_ops.DeleteSilenceParams) mid
|
||||
func (api *API) postSilencesHandler(params silence_ops.PostSilencesParams) middleware.Responder {
|
||||
logger := api.requestLogger(params.HTTPRequest)
|
||||
|
||||
ctx, span := tracer.Start(params.HTTPRequest.Context(), "api.postSilencesHandler")
|
||||
defer span.End()
|
||||
|
||||
sil, err := PostableSilenceToProto(params.Silence)
|
||||
if err != nil {
|
||||
logger.Error("Failed to marshal silence to proto", "err", err)
|
||||
@@ -663,7 +708,7 @@ func (api *API) postSilencesHandler(params silence_ops.PostSilencesParams) middl
|
||||
return silence_ops.NewPostSilencesBadRequest().WithPayload(msg)
|
||||
}
|
||||
|
||||
if err = api.silences.Set(sil); err != nil {
|
||||
if err = api.silences.Set(ctx, sil); err != nil {
|
||||
logger.Error("Failed to create silence", "err", err)
|
||||
if errors.Is(err, silence.ErrNotFound) {
|
||||
return silence_ops.NewPostSilencesNotFound().WithPayload(err.Error())
|
||||
|
||||
@@ -53,7 +53,15 @@ func TestGetStatusHandlerWithNilPeer(t *testing.T) {
|
||||
}
|
||||
|
||||
// Test ensures this method call does not panic.
|
||||
status := api.getStatusHandler(general_ops.GetStatusParams{}).(*general_ops.GetStatusOK)
|
||||
status := api.getStatusHandler(
|
||||
general_ops.GetStatusParams{
|
||||
HTTPRequest: httptest.NewRequest(
|
||||
"GET",
|
||||
"/api/v2/status",
|
||||
nil,
|
||||
),
|
||||
},
|
||||
).(*general_ops.GetStatusOK)
|
||||
|
||||
c := status.Payload.Cluster
|
||||
|
||||
@@ -160,7 +168,7 @@ func TestDeleteSilenceHandler(t *testing.T) {
|
||||
EndsAt: now.Add(time.Hour),
|
||||
UpdatedAt: now,
|
||||
}
|
||||
require.NoError(t, silences.Set(unexpiredSil))
|
||||
require.NoError(t, silences.Set(t.Context(), unexpiredSil))
|
||||
|
||||
expiredSil := &silencepb.Silence{
|
||||
Matchers: []*silencepb.Matcher{m},
|
||||
@@ -168,8 +176,8 @@ func TestDeleteSilenceHandler(t *testing.T) {
|
||||
EndsAt: now.Add(time.Hour),
|
||||
UpdatedAt: now,
|
||||
}
|
||||
require.NoError(t, silences.Set(expiredSil))
|
||||
require.NoError(t, silences.Expire(expiredSil.Id))
|
||||
require.NoError(t, silences.Set(t.Context(), expiredSil))
|
||||
require.NoError(t, silences.Expire(t.Context(), expiredSil.Id))
|
||||
|
||||
for i, tc := range []struct {
|
||||
sid string
|
||||
@@ -222,7 +230,7 @@ func TestPostSilencesHandler(t *testing.T) {
|
||||
EndsAt: now.Add(time.Hour),
|
||||
UpdatedAt: now,
|
||||
}
|
||||
require.NoError(t, silences.Set(unexpiredSil))
|
||||
require.NoError(t, silences.Set(t.Context(), unexpiredSil))
|
||||
|
||||
expiredSil := &silencepb.Silence{
|
||||
Matchers: []*silencepb.Matcher{m},
|
||||
@@ -230,8 +238,8 @@ func TestPostSilencesHandler(t *testing.T) {
|
||||
EndsAt: now.Add(time.Hour),
|
||||
UpdatedAt: now,
|
||||
}
|
||||
require.NoError(t, silences.Set(expiredSil))
|
||||
require.NoError(t, silences.Expire(expiredSil.Id))
|
||||
require.NoError(t, silences.Set(t.Context(), expiredSil))
|
||||
require.NoError(t, silences.Expire(t.Context(), expiredSil.Id))
|
||||
|
||||
t.Run("Silences CRUD", func(t *testing.T) {
|
||||
for i, tc := range []struct {
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
package v2
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
@@ -170,7 +171,10 @@ func AlertToOpenAPIAlert(alert *types.Alert, status types.AlertStatus, receivers
|
||||
}
|
||||
|
||||
// OpenAPIAlertsToAlerts converts open_api_models.PostableAlerts to []*types.Alert.
|
||||
func OpenAPIAlertsToAlerts(apiAlerts open_api_models.PostableAlerts) []*types.Alert {
|
||||
func OpenAPIAlertsToAlerts(ctx context.Context, apiAlerts open_api_models.PostableAlerts) []*types.Alert {
|
||||
_, span := tracer.Start(ctx, "OpenAPIAlertsToAlerts")
|
||||
defer span.End()
|
||||
|
||||
alerts := []*types.Alert{}
|
||||
for _, apiAlert := range apiAlerts {
|
||||
alert := types.Alert{
|
||||
|
||||
@@ -59,6 +59,7 @@ import (
|
||||
"github.com/prometheus/alertmanager/silence"
|
||||
"github.com/prometheus/alertmanager/template"
|
||||
"github.com/prometheus/alertmanager/timeinterval"
|
||||
"github.com/prometheus/alertmanager/tracing"
|
||||
"github.com/prometheus/alertmanager/types"
|
||||
"github.com/prometheus/alertmanager/ui"
|
||||
)
|
||||
@@ -404,13 +405,14 @@ func run() int {
|
||||
return d + waitFunc()
|
||||
}
|
||||
|
||||
tracingManager := tracing.NewManager(logger.With("component", "tracing"))
|
||||
|
||||
var (
|
||||
inhibitor *inhibit.Inhibitor
|
||||
tmpl *template.Template
|
||||
)
|
||||
|
||||
dispMetrics := dispatch.NewDispatcherMetrics(false, prometheus.DefaultRegisterer)
|
||||
inhibitMetrics := inhibit.NewInhibitorMetrics(prometheus.DefaultRegisterer)
|
||||
pipelineBuilder := notify.NewPipelineBuilder(prometheus.DefaultRegisterer, ff)
|
||||
configLogger := logger.With("component", "configuration")
|
||||
configCoordinator := config.NewCoordinator(
|
||||
@@ -465,7 +467,7 @@ func run() int {
|
||||
inhibitor.Stop()
|
||||
disp.Stop()
|
||||
|
||||
inhibitor = inhibit.NewInhibitor(alerts, conf.InhibitRules, marker, logger, inhibitMetrics)
|
||||
inhibitor = inhibit.NewInhibitor(alerts, conf.InhibitRules, marker, logger)
|
||||
silencer := silence.NewSilencer(silences, marker, logger)
|
||||
|
||||
// An interface value that holds a nil concrete value is non-nil.
|
||||
@@ -491,9 +493,9 @@ func run() int {
|
||||
configuredIntegrations.Set(float64(integrationsNum))
|
||||
configuredInhibitionRules.Set(float64(len(conf.InhibitRules)))
|
||||
|
||||
api.Update(conf, func(labels model.LabelSet) {
|
||||
inhibitor.Mutes(labels)
|
||||
silencer.Mutes(labels)
|
||||
api.Update(conf, func(ctx context.Context, labels model.LabelSet) {
|
||||
inhibitor.Mutes(ctx, labels)
|
||||
silencer.Mutes(ctx, labels)
|
||||
})
|
||||
|
||||
newDisp := dispatch.NewDispatcher(
|
||||
@@ -546,6 +548,13 @@ func run() int {
|
||||
newDisp.WaitForLoading()
|
||||
disp = newDisp
|
||||
|
||||
err = tracingManager.ApplyConfig(conf)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to apply tracing config: %w", err)
|
||||
}
|
||||
|
||||
go tracingManager.Run()
|
||||
|
||||
return nil
|
||||
})
|
||||
|
||||
@@ -574,7 +583,10 @@ func run() int {
|
||||
|
||||
mux := api.Register(router, *routePrefix)
|
||||
|
||||
srv := &http.Server{Handler: mux}
|
||||
srv := &http.Server{
|
||||
// instrument all handlers with tracing
|
||||
Handler: tracing.Middleware(mux),
|
||||
}
|
||||
srvc := make(chan struct{})
|
||||
|
||||
go func() {
|
||||
@@ -605,6 +617,11 @@ func run() int {
|
||||
errc <- configCoordinator.Reload()
|
||||
case <-term:
|
||||
logger.Info("Received SIGTERM, exiting gracefully...")
|
||||
|
||||
// shut down the tracing manager to flush any remaining spans.
|
||||
// this blocks for up to 5s
|
||||
tracingManager.Stop()
|
||||
|
||||
return 0
|
||||
case <-srvc:
|
||||
return 1
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
// Copyright 2015 Prometheus Team
|
||||
// Copyright The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
@@ -351,6 +351,8 @@ type Config struct {
|
||||
MuteTimeIntervals []MuteTimeInterval `yaml:"mute_time_intervals,omitempty" json:"mute_time_intervals,omitempty"`
|
||||
TimeIntervals []TimeInterval `yaml:"time_intervals,omitempty" json:"time_intervals,omitempty"`
|
||||
|
||||
TracingConfig TracingConfig `yaml:"tracing,omitempty" json:"tracing,omitempty"`
|
||||
|
||||
// original is the input from which the config was parsed.
|
||||
original string
|
||||
}
|
||||
|
||||
92
config/tracing.go
Normal file
92
config/tracing.go
Normal file
@@ -0,0 +1,92 @@
|
||||
// Copyright The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package config
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
|
||||
commoncfg "github.com/prometheus/common/config"
|
||||
"github.com/prometheus/common/model"
|
||||
)
|
||||
|
||||
// TODO: probably move these into prometheus/common since they're copied from
|
||||
// prometheus/prometheus?
|
||||
|
||||
type TracingClientType string
|
||||
|
||||
const (
|
||||
TracingClientHTTP TracingClientType = "http"
|
||||
TracingClientGRPC TracingClientType = "grpc"
|
||||
|
||||
GzipCompression = "gzip"
|
||||
)
|
||||
|
||||
// UnmarshalYAML implements the yaml.Unmarshaler interface.
|
||||
func (t *TracingClientType) UnmarshalYAML(unmarshal func(any) error) error {
|
||||
*t = TracingClientType("")
|
||||
type plain TracingClientType
|
||||
if err := unmarshal((*plain)(t)); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
switch *t {
|
||||
case TracingClientHTTP, TracingClientGRPC:
|
||||
return nil
|
||||
default:
|
||||
return fmt.Errorf("expected tracing client type to be to be %s or %s, but got %s",
|
||||
TracingClientHTTP, TracingClientGRPC, *t,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// TracingConfig configures the tracing options.
type TracingConfig struct {
	// ClientType selects the OTLP transport; UnmarshalYAML defaults it to grpc.
	ClientType TracingClientType `yaml:"client_type,omitempty"`
	// Endpoint is the collector address; required (validated in UnmarshalYAML).
	Endpoint string `yaml:"endpoint,omitempty"`
	// SamplingFraction controls how spans are sampled.
	// NOTE(review): presumably the fraction of traces to sample — confirm
	// against the tracing manager that consumes this config.
	SamplingFraction float64 `yaml:"sampling_fraction,omitempty"`
	// Insecure disables transport security for the exporter connection.
	Insecure bool `yaml:"insecure,omitempty"`
	// TLSConfig is optional; nil when tls_config is absent from the YAML.
	TLSConfig *commoncfg.TLSConfig `yaml:"tls_config,omitempty"`
	// Headers are optional extra headers sent with each export request;
	// nil when absent from the YAML.
	Headers *commoncfg.Headers `yaml:"headers,omitempty"`
	// Compression names the payload compression; only "gzip" (or empty)
	// passes validation in UnmarshalYAML.
	Compression string `yaml:"compression,omitempty"`
	// Timeout bounds each export request.
	Timeout model.Duration `yaml:"timeout,omitempty"`
}
|
||||
|
||||
// SetDirectory joins any relative file paths with dir.
|
||||
func (t *TracingConfig) SetDirectory(dir string) {
|
||||
t.TLSConfig.SetDirectory(dir)
|
||||
t.Headers.SetDirectory(dir)
|
||||
}
|
||||
|
||||
// UnmarshalYAML implements the yaml.Unmarshaler interface.
|
||||
func (t *TracingConfig) UnmarshalYAML(unmarshal func(any) error) error {
|
||||
*t = TracingConfig{
|
||||
ClientType: TracingClientGRPC,
|
||||
}
|
||||
type plain TracingConfig
|
||||
if err := unmarshal((*plain)(t)); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if t.Endpoint == "" {
|
||||
return errors.New("tracing endpoint must be set")
|
||||
}
|
||||
|
||||
if t.Compression != "" && t.Compression != GzipCompression {
|
||||
return fmt.Errorf("invalid compression type %s provided, valid options: %s",
|
||||
t.Compression, GzipCompression)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
@@ -27,6 +27,11 @@ import (
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||
"github.com/prometheus/common/model"
|
||||
"go.opentelemetry.io/otel"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/codes"
|
||||
"go.opentelemetry.io/otel/propagation"
|
||||
"go.opentelemetry.io/otel/trace"
|
||||
|
||||
"github.com/prometheus/alertmanager/notify"
|
||||
"github.com/prometheus/alertmanager/provider"
|
||||
@@ -40,6 +45,8 @@ const (
|
||||
DispatcherStateRunning
|
||||
)
|
||||
|
||||
var tracer = otel.Tracer("github.com/prometheus/alertmanager/dispatch")
|
||||
|
||||
// DispatcherMetrics represents metrics associated to a dispatcher.
|
||||
type DispatcherMetrics struct {
|
||||
aggrGroups prometheus.Gauge
|
||||
@@ -79,12 +86,13 @@ func NewDispatcherMetrics(registerLimitMetrics bool, r prometheus.Registerer) *D
|
||||
// Dispatcher sorts incoming alerts into aggregation groups and
|
||||
// assigns the correct notifiers to each.
|
||||
type Dispatcher struct {
|
||||
route *Route
|
||||
alerts provider.Alerts
|
||||
stage notify.Stage
|
||||
marker types.GroupMarker
|
||||
metrics *DispatcherMetrics
|
||||
limits Limits
|
||||
route *Route
|
||||
alerts provider.Alerts
|
||||
stage notify.Stage
|
||||
marker types.GroupMarker
|
||||
metrics *DispatcherMetrics
|
||||
limits Limits
|
||||
propagator propagation.TextMapPropagator
|
||||
|
||||
timeout func(time.Duration) time.Duration
|
||||
|
||||
@@ -138,6 +146,7 @@ func NewDispatcher(
|
||||
logger: logger.With("component", "dispatcher"),
|
||||
metrics: metrics,
|
||||
limits: limits,
|
||||
propagator: otel.GetTextMapPropagator(),
|
||||
state: DispatcherStateUnknown,
|
||||
}
|
||||
disp.loadingFinished.Add(1)
|
||||
@@ -161,7 +170,7 @@ func (d *Dispatcher) Run(dispatchStartTime time.Time) {
|
||||
|
||||
initalAlerts, it := d.alerts.SlurpAndSubscribe("dispatcher")
|
||||
for _, alert := range initalAlerts {
|
||||
d.ingestAlert(alert)
|
||||
d.routeAlert(d.ctx, alert)
|
||||
}
|
||||
d.loadingFinished.Done()
|
||||
|
||||
@@ -186,15 +195,18 @@ func (d *Dispatcher) run(it provider.AlertIterator) {
|
||||
return
|
||||
}
|
||||
|
||||
d.logger.Debug("Received alert", "alert", alert)
|
||||
|
||||
// Log errors but keep trying.
|
||||
if err := it.Err(); err != nil {
|
||||
d.logger.Error("Error on alert update", "err", err)
|
||||
continue
|
||||
}
|
||||
|
||||
d.ingestAlert(alert)
|
||||
ctx := d.ctx
|
||||
if alert.Header != nil {
|
||||
ctx = d.propagator.Extract(ctx, propagation.MapCarrier(alert.Header))
|
||||
}
|
||||
|
||||
d.routeAlert(ctx, alert.Data)
|
||||
|
||||
case <-d.startTimer.C:
|
||||
if d.state == DispatcherStateWaitingToStart {
|
||||
@@ -216,6 +228,30 @@ func (d *Dispatcher) run(it provider.AlertIterator) {
|
||||
}
|
||||
}
|
||||
|
||||
func (d *Dispatcher) routeAlert(ctx context.Context, alert *types.Alert) {
|
||||
d.logger.Debug("Received alert", "alert", alert)
|
||||
|
||||
ctx, span := tracer.Start(ctx, "dispatch.Dispatcher.routeAlert",
|
||||
trace.WithAttributes(
|
||||
attribute.String("alerting.alert.name", alert.Name()),
|
||||
attribute.String("alerting.alert.fingerprint", alert.Fingerprint().String()),
|
||||
),
|
||||
trace.WithSpanKind(trace.SpanKindInternal),
|
||||
)
|
||||
defer span.End()
|
||||
|
||||
now := time.Now()
|
||||
for _, r := range d.route.Match(alert.Labels) {
|
||||
span.AddEvent("dispatching alert to route",
|
||||
trace.WithAttributes(
|
||||
attribute.String("alerting.route.receiver.name", r.RouteOpts.Receiver),
|
||||
),
|
||||
)
|
||||
d.groupAlert(ctx, alert, r)
|
||||
}
|
||||
d.metrics.processingDuration.Observe(time.Since(now).Seconds())
|
||||
}
|
||||
|
||||
func (d *Dispatcher) doMaintenance() {
|
||||
d.mtx.Lock()
|
||||
defer d.mtx.Unlock()
|
||||
@@ -232,14 +268,6 @@ func (d *Dispatcher) doMaintenance() {
|
||||
}
|
||||
}
|
||||
|
||||
func (d *Dispatcher) ingestAlert(alert *types.Alert) {
|
||||
now := time.Now()
|
||||
for _, r := range d.route.Match(alert.Labels) {
|
||||
d.processAlert(alert, r)
|
||||
}
|
||||
d.metrics.processingDuration.Observe(time.Since(now).Seconds())
|
||||
}
|
||||
|
||||
func (d *Dispatcher) WaitForLoading() {
|
||||
d.loadingFinished.Wait()
|
||||
}
|
||||
@@ -379,12 +407,22 @@ func (d *Dispatcher) Stop() {
|
||||
|
||||
// notifyFunc is a function that performs notification for the alert
|
||||
// with the given fingerprint. It aborts on context cancelation.
|
||||
// Returns false iff notifying failed.
|
||||
// Returns false if notifying failed.
|
||||
type notifyFunc func(context.Context, ...*types.Alert) bool
|
||||
|
||||
// processAlert determines in which aggregation group the alert falls
|
||||
// groupAlert determines in which aggregation group the alert falls
|
||||
// and inserts it.
|
||||
func (d *Dispatcher) processAlert(alert *types.Alert, route *Route) {
|
||||
func (d *Dispatcher) groupAlert(ctx context.Context, alert *types.Alert, route *Route) {
|
||||
_, span := tracer.Start(ctx, "dispatch.Dispatcher.groupAlert",
|
||||
trace.WithAttributes(
|
||||
attribute.String("alerting.alert.name", alert.Name()),
|
||||
attribute.String("alerting.alert.fingerprint", alert.Fingerprint().String()),
|
||||
attribute.String("alerting.route.receiver.name", route.RouteOpts.Receiver),
|
||||
),
|
||||
trace.WithSpanKind(trace.SpanKindInternal),
|
||||
)
|
||||
defer span.End()
|
||||
|
||||
now := time.Now()
|
||||
groupLabels := getGroupLabels(alert, route)
|
||||
|
||||
@@ -401,14 +439,23 @@ func (d *Dispatcher) processAlert(alert *types.Alert, route *Route) {
|
||||
|
||||
ag, ok := routeGroups[fp]
|
||||
if ok {
|
||||
ag.insert(alert)
|
||||
ag.insert(ctx, alert)
|
||||
return
|
||||
}
|
||||
|
||||
// If the group does not exist, create it. But check the limit first.
|
||||
if limit := d.limits.MaxNumberOfAggregationGroups(); limit > 0 && d.aggrGroupsNum >= limit {
|
||||
d.metrics.aggrGroupLimitReached.Inc()
|
||||
d.logger.Error("Too many aggregation groups, cannot create new group for alert", "groups", d.aggrGroupsNum, "limit", limit, "alert", alert.Name())
|
||||
err := errors.New("too many aggregation groups, cannot create new group for alert")
|
||||
message := "Failed to create aggregation group"
|
||||
d.logger.Error(message, "err", err.Error(), "groups", d.aggrGroupsNum, "limit", limit, "alert", alert.Name())
|
||||
span.SetStatus(codes.Error, message)
|
||||
span.RecordError(err,
|
||||
trace.WithAttributes(
|
||||
attribute.Int("alerting.aggregation_group.count", d.aggrGroupsNum),
|
||||
attribute.Int("alerting.aggregation_group.limit", limit),
|
||||
),
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -416,26 +463,35 @@ func (d *Dispatcher) processAlert(alert *types.Alert, route *Route) {
|
||||
routeGroups[fp] = ag
|
||||
d.aggrGroupsNum++
|
||||
d.metrics.aggrGroups.Inc()
|
||||
span.AddEvent("new AggregationGroup created",
|
||||
trace.WithAttributes(
|
||||
attribute.String("alerting.aggregation_group.key", ag.GroupKey()),
|
||||
attribute.Int("alerting.aggregation_group.count", d.aggrGroupsNum),
|
||||
),
|
||||
)
|
||||
|
||||
// Insert the 1st alert in the group before starting the group's run()
|
||||
// function, to make sure that when the run() will be executed the 1st
|
||||
// alert is already there.
|
||||
ag.insert(alert)
|
||||
ag.insert(ctx, alert)
|
||||
|
||||
if alert.StartsAt.Add(ag.opts.GroupWait).Before(now) {
|
||||
ag.logger.Debug(
|
||||
"Alert is old enough for immediate flush, resetting timer to zero",
|
||||
"alert", alert.Name(),
|
||||
"fingerprint", alert.Fingerprint(),
|
||||
"startsAt", alert.StartsAt,
|
||||
message := "Alert is old enough for immediate flush, resetting timer to zero"
|
||||
ag.logger.Debug(message, "alert", alert.Name(), "fingerprint", alert.Fingerprint(), "startsAt", alert.StartsAt)
|
||||
span.AddEvent(message,
|
||||
trace.WithAttributes(
|
||||
attribute.String("alerting.alert.StartsAt", alert.StartsAt.Format(time.RFC3339)),
|
||||
),
|
||||
)
|
||||
ag.resetTimer(0)
|
||||
}
|
||||
// Check dispatcher and alert state to determine if we should run the AG now.
|
||||
switch d.state {
|
||||
case DispatcherStateWaitingToStart:
|
||||
span.AddEvent("Not starting Aggregation Group, dispatcher is not running")
|
||||
d.logger.Debug("Dispatcher still waiting to start")
|
||||
case DispatcherStateRunning:
|
||||
span.AddEvent("Starting Aggregation Group")
|
||||
d.runAG(ag)
|
||||
default:
|
||||
d.logger.Warn("unknown state detected", "state", "unknown")
|
||||
@@ -570,7 +626,20 @@ func (ag *aggrGroup) run(nf notifyFunc) {
|
||||
ag.resetTimer(ag.opts.GroupInterval)
|
||||
|
||||
ag.flush(func(alerts ...*types.Alert) bool {
|
||||
return nf(ctx, alerts...)
|
||||
ctx, span := tracer.Start(ctx, "dispatch.AggregationGroup.flush",
|
||||
trace.WithAttributes(
|
||||
attribute.String("alerting.aggregation_group.key", ag.GroupKey()),
|
||||
attribute.Int("alerting.alerts.count", len(alerts)),
|
||||
),
|
||||
trace.WithSpanKind(trace.SpanKindInternal),
|
||||
)
|
||||
defer span.End()
|
||||
|
||||
success := nf(ctx, alerts...)
|
||||
if !success {
|
||||
span.SetStatus(codes.Error, "notification failed")
|
||||
}
|
||||
return success
|
||||
})
|
||||
|
||||
cancel()
|
||||
@@ -594,9 +663,21 @@ func (ag *aggrGroup) resetTimer(t time.Duration) {
|
||||
}
|
||||
|
||||
// insert inserts the alert into the aggregation group.
|
||||
func (ag *aggrGroup) insert(alert *types.Alert) {
|
||||
func (ag *aggrGroup) insert(ctx context.Context, alert *types.Alert) {
|
||||
_, span := tracer.Start(ctx, "dispatch.AggregationGroup.insert",
|
||||
trace.WithAttributes(
|
||||
attribute.String("alerting.alert.name", alert.Name()),
|
||||
attribute.String("alerting.alert.fingerprint", alert.Fingerprint().String()),
|
||||
attribute.String("alerting.aggregation_group.key", ag.GroupKey()),
|
||||
),
|
||||
trace.WithSpanKind(trace.SpanKindInternal),
|
||||
)
|
||||
defer span.End()
|
||||
if err := ag.alerts.Set(alert); err != nil {
|
||||
ag.logger.Error("error on set alert", "err", err)
|
||||
message := "error on set alert"
|
||||
span.SetStatus(codes.Error, message)
|
||||
span.RecordError(err)
|
||||
ag.logger.Error(message, "err", err)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -144,7 +144,8 @@ func TestAggrGroup(t *testing.T) {
|
||||
ag := newAggrGroup(context.Background(), lset, route, nil, types.NewMarker(prometheus.NewRegistry()), promslog.NewNopLogger())
|
||||
go ag.run(ntfy)
|
||||
|
||||
ag.insert(a1)
|
||||
ctx := context.Background()
|
||||
ag.insert(ctx, a1)
|
||||
|
||||
select {
|
||||
case <-time.After(2 * opts.GroupWait):
|
||||
@@ -167,7 +168,7 @@ func TestAggrGroup(t *testing.T) {
|
||||
|
||||
for range 3 {
|
||||
// New alert should come in after group interval.
|
||||
ag.insert(a3)
|
||||
ag.insert(ctx, a3)
|
||||
|
||||
select {
|
||||
case <-time.After(2 * opts.GroupInterval):
|
||||
@@ -196,8 +197,8 @@ func TestAggrGroup(t *testing.T) {
|
||||
ag = newAggrGroup(context.Background(), lset, route, nil, types.NewMarker(prometheus.NewRegistry()), promslog.NewNopLogger())
|
||||
go ag.run(ntfy)
|
||||
|
||||
ag.insert(a1)
|
||||
ag.insert(a2)
|
||||
ag.insert(ctx, a1)
|
||||
ag.insert(ctx, a2)
|
||||
|
||||
batch := <-alertsCh
|
||||
exp := removeEndsAt(types.AlertSlice{a1, a2})
|
||||
@@ -209,7 +210,7 @@ func TestAggrGroup(t *testing.T) {
|
||||
|
||||
for range 3 {
|
||||
// New alert should come in after group interval.
|
||||
ag.insert(a3)
|
||||
ag.insert(ctx, a3)
|
||||
|
||||
select {
|
||||
case <-time.After(2 * opts.GroupInterval):
|
||||
@@ -234,7 +235,7 @@ func TestAggrGroup(t *testing.T) {
|
||||
// Resolve an alert, and it should be removed after the next batch was sent.
|
||||
a1r := *a1
|
||||
a1r.EndsAt = time.Now()
|
||||
ag.insert(&a1r)
|
||||
ag.insert(ctx, &a1r)
|
||||
exp = append(types.AlertSlice{&a1r}, removeEndsAt(types.AlertSlice{a2, a3})...)
|
||||
|
||||
select {
|
||||
@@ -260,7 +261,7 @@ func TestAggrGroup(t *testing.T) {
|
||||
resolved := types.AlertSlice{&a2r, &a3r}
|
||||
for _, a := range resolved {
|
||||
a.EndsAt = time.Now()
|
||||
ag.insert(a)
|
||||
ag.insert(ctx, a)
|
||||
}
|
||||
|
||||
select {
|
||||
@@ -412,7 +413,7 @@ route:
|
||||
// Matches the second and third sub-route.
|
||||
newAlert(model.LabelSet{"env": "prod", "alertname": "HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst3"}),
|
||||
}
|
||||
alerts.Put(inputAlerts...)
|
||||
alerts.Put(context.Background(), inputAlerts...)
|
||||
|
||||
// Let alerts get processed.
|
||||
for i := 0; len(recorder.Alerts()) != 7 && i < 10; i++ {
|
||||
@@ -565,7 +566,7 @@ route:
|
||||
// Matches the second and third sub-route.
|
||||
newAlert(model.LabelSet{"env": "prod", "alertname": "HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst3"}),
|
||||
}
|
||||
err = alerts.Put(inputAlerts...)
|
||||
err = alerts.Put(context.Background(), inputAlerts...)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -585,7 +586,7 @@ route:
|
||||
require.Equal(t, 0.0, testutil.ToFloat64(m.aggrGroupLimitReached))
|
||||
|
||||
// Try to store new alert. This time, we will hit limit for number of groups.
|
||||
err = alerts.Put(newAlert(model.LabelSet{"env": "prod", "alertname": "NewAlert", "cluster": "new-cluster", "service": "db"}))
|
||||
err = alerts.Put(context.Background(), newAlert(model.LabelSet{"env": "prod", "alertname": "NewAlert", "cluster": "new-cluster", "service": "db"}))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -702,7 +703,7 @@ func TestDispatcherRaceOnFirstAlertNotDeliveredWhenGroupWaitIsZero(t *testing.T)
|
||||
// Push all alerts.
|
||||
for i := range numAlerts {
|
||||
alert := newAlert(model.LabelSet{"alertname": model.LabelValue(fmt.Sprintf("Alert_%d", i))})
|
||||
require.NoError(t, alerts.Put(alert))
|
||||
require.NoError(t, alerts.Put(context.Background(), alert))
|
||||
}
|
||||
|
||||
// Wait until the alerts have been notified or the waiting timeout expires.
|
||||
@@ -818,8 +819,8 @@ func TestDispatcher_DeleteResolvedAlertsFromMarker(t *testing.T) {
|
||||
}
|
||||
|
||||
// Insert alerts into the aggregation group
|
||||
ag.insert(activeAlert)
|
||||
ag.insert(resolvedAlert)
|
||||
ag.insert(ctx, activeAlert)
|
||||
ag.insert(ctx, resolvedAlert)
|
||||
|
||||
// Set markers for both alerts
|
||||
marker.SetActiveOrSilenced(activeAlert.Fingerprint(), 0, nil, nil)
|
||||
@@ -876,7 +877,7 @@ func TestDispatcher_DeleteResolvedAlertsFromMarker(t *testing.T) {
|
||||
}
|
||||
|
||||
// Insert alert into the aggregation group
|
||||
ag.insert(resolvedAlert)
|
||||
ag.insert(ctx, resolvedAlert)
|
||||
|
||||
// Set marker for the alert
|
||||
marker.SetActiveOrSilenced(resolvedAlert.Fingerprint(), 0, nil, nil)
|
||||
@@ -930,7 +931,7 @@ func TestDispatcher_DeleteResolvedAlertsFromMarker(t *testing.T) {
|
||||
}
|
||||
|
||||
// Insert alert into the aggregation group
|
||||
ag.insert(resolvedAlert)
|
||||
ag.insert(ctx, resolvedAlert)
|
||||
|
||||
// Set marker for the alert
|
||||
marker.SetActiveOrSilenced(resolvedAlert.Fingerprint(), 0, nil, nil)
|
||||
@@ -1024,7 +1025,7 @@ func TestDispatchOnStartup(t *testing.T) {
|
||||
}
|
||||
|
||||
// Send alert1
|
||||
require.NoError(t, alerts.Put(alert1))
|
||||
require.NoError(t, alerts.Put(context.Background(), alert1))
|
||||
|
||||
var recordedAlerts []*types.Alert
|
||||
// Expect a recorded alert after startTime + GroupWait which is in future
|
||||
@@ -1036,7 +1037,7 @@ func TestDispatchOnStartup(t *testing.T) {
|
||||
require.Equal(t, alert1.Fingerprint(), recordedAlerts[0].Fingerprint(), "expected alert1 to be dispatched after GroupWait")
|
||||
|
||||
// Send alert2
|
||||
require.NoError(t, alerts.Put(alert2))
|
||||
require.NoError(t, alerts.Put(context.Background(), alert2))
|
||||
|
||||
// Expect a recorded alert after GroupInterval
|
||||
require.Eventually(t, func() bool {
|
||||
|
||||
@@ -120,3 +120,8 @@ receivers:
|
||||
- name: 'team-DB-pager'
|
||||
pagerduty_configs:
|
||||
- service_key: <team-DB-key>
|
||||
|
||||
tracing:
|
||||
endpoint: localhost:4317
|
||||
insecure: true
|
||||
sampling_fraction: 1.0
|
||||
|
||||
@@ -831,14 +831,20 @@ tls_config:
|
||||
# Custom HTTP headers to be sent along with each request.
|
||||
# Headers that are set by Prometheus itself can't be overwritten.
|
||||
http_headers:
|
||||
# Header name.
|
||||
[ <string>:
|
||||
[ <http_header> ]
|
||||
```
|
||||
|
||||
#### `<http_header>`
|
||||
|
||||
```yaml
|
||||
# Header name.
|
||||
<string>:
|
||||
# Header values.
|
||||
[ values: [<string>, ...] ]
|
||||
# Headers values. Hidden in configuration page.
|
||||
[ secrets: [<secret>, ...] ]
|
||||
# Files to read header values from.
|
||||
[ files: [<string>, ...] ] ]
|
||||
[ files: [<string>, ...] ]
|
||||
```
|
||||
|
||||
#### `<oauth2>`
|
||||
@@ -1907,3 +1913,34 @@ room_id: <tmpl_string>
|
||||
# The HTTP client's configuration. You must use this configuration to supply the bot token as part of the HTTP `Authorization` header.
|
||||
[ http_config: <http_config> | default = global.http_config ]
|
||||
```
|
||||
|
||||
## Tracing Configuration
|
||||
### `<tracing_config>`
|
||||
|
||||
```yaml
|
||||
# The tracing client type, supported values are `http` and `grpc`.
|
||||
[ client_type: <tracing_client_type> | default = "grpc" ]
|
||||
|
||||
# The tracing endpoint.
|
||||
[ endpoint: <string> | default = "" ]
|
||||
|
||||
# The sampling fraction.
|
||||
[ sampling_fraction: <float> | default = 0.0 ]
|
||||
|
||||
# Whether to disable TLS.
|
||||
[ insecure: <boolean> | default = false ]
|
||||
|
||||
# The HTTP client's configuration.
|
||||
[ tls_config: <tls_config> ]
|
||||
|
||||
# Custom HTTP headers.
|
||||
[ http_headers:
|
||||
[ <http_header> ] ]
|
||||
|
||||
# The tracing compression.
|
||||
[ compression: <string> | default = "gzip" ]
|
||||
|
||||
# The tracing timeout.
|
||||
[ timeout: <duration> | default = 0s ]
|
||||
```
|
||||
|
||||
|
||||
21
go.mod
21
go.mod
@@ -34,7 +34,6 @@ require (
|
||||
github.com/oklog/run v1.2.0
|
||||
github.com/oklog/ulid/v2 v2.1.1
|
||||
github.com/prometheus/client_golang v1.23.2
|
||||
github.com/prometheus/client_model v0.6.2
|
||||
github.com/prometheus/common v0.67.4
|
||||
github.com/prometheus/exporter-toolkit v0.15.0
|
||||
github.com/prometheus/sigv4 v0.3.0
|
||||
@@ -43,11 +42,20 @@ require (
|
||||
github.com/shurcooL/vfsgen v0.0.0-20230704071429-0000e147ea92
|
||||
github.com/stretchr/testify v1.11.1
|
||||
github.com/xlab/treeprint v1.2.0
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/httptrace/otelhttptrace v0.63.0
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0
|
||||
go.opentelemetry.io/otel v1.38.0
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0
|
||||
go.opentelemetry.io/otel/sdk v1.38.0
|
||||
go.opentelemetry.io/otel/trace v1.38.0
|
||||
go.uber.org/automaxprocs v1.6.0
|
||||
golang.org/x/mod v0.30.0
|
||||
golang.org/x/net v0.47.0
|
||||
golang.org/x/text v0.31.0
|
||||
golang.org/x/tools v0.39.0
|
||||
google.golang.org/grpc v1.75.0
|
||||
gopkg.in/telebot.v3 v3.3.8
|
||||
gopkg.in/yaml.v2 v2.4.0
|
||||
)
|
||||
@@ -64,10 +72,12 @@ require (
|
||||
github.com/aws/aws-sdk-go-v2/service/sso v1.30.6 // indirect
|
||||
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.11 // indirect
|
||||
github.com/beorn7/perks v1.0.1 // indirect
|
||||
github.com/cenkalti/backoff/v5 v5.0.3 // indirect
|
||||
github.com/coreos/go-systemd/v22 v22.6.0 // indirect
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/docker/go-units v0.5.0 // indirect
|
||||
github.com/emersion/go-sasl v0.0.0-20241020182733-b788ff22d5a6 // indirect
|
||||
github.com/felixge/httpsnoop v1.0.4 // indirect
|
||||
github.com/go-logr/logr v1.4.3 // indirect
|
||||
github.com/go-logr/stdr v1.2.2 // indirect
|
||||
github.com/go-openapi/jsonpointer v0.22.1 // indirect
|
||||
@@ -85,8 +95,9 @@ require (
|
||||
github.com/go-openapi/swag/yamlutils v0.25.4 // indirect
|
||||
github.com/go-viper/mapstructure/v2 v2.4.0 // indirect
|
||||
github.com/golang-jwt/jwt/v5 v5.3.0 // indirect
|
||||
github.com/golang/protobuf v1.5.3 // indirect
|
||||
github.com/golang/protobuf v1.5.4 // indirect
|
||||
github.com/google/btree v1.0.0 // indirect
|
||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 // indirect
|
||||
github.com/hashicorp/errwrap v1.1.0 // indirect
|
||||
github.com/hashicorp/go-immutable-radix v1.3.1 // indirect
|
||||
github.com/hashicorp/go-metrics v0.5.4 // indirect
|
||||
@@ -104,14 +115,14 @@ require (
|
||||
github.com/oklog/ulid v1.3.1 // indirect
|
||||
github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||
github.com/prometheus/client_model v0.6.2 // indirect
|
||||
github.com/prometheus/procfs v0.16.1 // indirect
|
||||
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 // indirect
|
||||
github.com/xhit/go-str2duration/v2 v2.1.0 // indirect
|
||||
go.mongodb.org/mongo-driver v1.17.6 // indirect
|
||||
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
|
||||
go.opentelemetry.io/otel v1.38.0 // indirect
|
||||
go.opentelemetry.io/otel/metric v1.38.0 // indirect
|
||||
go.opentelemetry.io/otel/trace v1.38.0 // indirect
|
||||
go.opentelemetry.io/proto/otlp v1.7.1 // indirect
|
||||
go.yaml.in/yaml/v2 v2.4.3 // indirect
|
||||
go.yaml.in/yaml/v3 v3.0.4 // indirect
|
||||
golang.org/x/crypto v0.44.0 // indirect
|
||||
@@ -121,6 +132,8 @@ require (
|
||||
golang.org/x/telemetry v0.0.0-20251111182119-bc8e575c7b54 // indirect
|
||||
golang.org/x/time v0.13.0 // indirect
|
||||
golang.org/x/tools/godoc v0.1.0-deprecated // indirect
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5 // indirect
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 // indirect
|
||||
google.golang.org/protobuf v1.36.10 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
)
|
||||
|
||||
32
go.sum
32
go.sum
@@ -114,6 +114,8 @@ github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6r
|
||||
github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs=
|
||||
github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8=
|
||||
github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE=
|
||||
github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM=
|
||||
github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw=
|
||||
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
|
||||
github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc=
|
||||
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||
@@ -165,6 +167,8 @@ github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5Kwzbycv
|
||||
github.com/fatih/color v1.9.0/go.mod h1:eQcE1qtQxscV5RaZvpXrrb8Drkc3/DdQ+uUYCNjL+zU=
|
||||
github.com/fatih/color v1.10.0/go.mod h1:ELkj/draVOlAH/xkhN6mQ50Qd0MPOk5AAr3maGEBuJM=
|
||||
github.com/fatih/color v1.13.0/go.mod h1:kLAiJbzzSOZDVNGyDpeOxJ47H46qBXwg5ILebYFFOfk=
|
||||
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
|
||||
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
|
||||
github.com/frankban/quicktest v1.14.3/go.mod h1:mgiwOwqx65TmIk1wJ6Q7wvnVMocbUorkibMOrVTHZps=
|
||||
github.com/fsnotify/fsnotify v1.5.4/go.mod h1:OVB6XrOHzAwXMpEM7uPOzcehqUV2UqJxmVXmkdnm1bU=
|
||||
github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
|
||||
@@ -275,8 +279,8 @@ github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw
|
||||
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
|
||||
github.com/golang/protobuf v1.5.1/go.mod h1:DopwsBzvsk0Fs44TXzsVbJyPhcCPeIwnvohx4u74HPM=
|
||||
github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
|
||||
github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg=
|
||||
github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
|
||||
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
|
||||
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
|
||||
github.com/golang/snappy v0.0.3/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
|
||||
github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
|
||||
github.com/google/btree v1.0.0 h1:0udJVsspx3VBr5FwtLhQQtuAsVc79tTq0ocGIPAU6qo=
|
||||
@@ -331,6 +335,8 @@ github.com/googleapis/gax-go/v2 v2.4.0/go.mod h1:XOTVJ59hdnfJLIP/dh8n5CGryZR2LxK
|
||||
github.com/googleapis/google-cloud-go-testing v0.0.0-20200911160855-bcd43fbb19e8/go.mod h1:dvDLG8qkwmyD9a/MJJN3XJcT3xFxOKAvTZGvuZmac9g=
|
||||
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk=
|
||||
github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw=
|
||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 h1:8Tjv8EJ+pM1xP8mK6egEbD1OgnVTyacbefKhmbLhIhU=
|
||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2/go.mod h1:pkJQ2tZHJ0aFOVEEot6oZmaVEZcRme73eIFmhiVuRWs=
|
||||
github.com/hashicorp/consul/api v1.12.0/go.mod h1:6pVBMo0ebnYdt2S3H87XhekM/HHrUoTD2XXb/VrZVy0=
|
||||
github.com/hashicorp/consul/sdk v0.8.0/go.mod h1:GBvyrGALthsZObzUGsfgHZQDXjg4lOjagTIwIR1vPms=
|
||||
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
|
||||
@@ -573,15 +579,29 @@ go.opencensus.io v0.22.5/go.mod h1:5pWMHQbX5EPX2/62yrJeAkowc+lfs/XD7Uxpq3pI6kk=
|
||||
go.opencensus.io v0.23.0/go.mod h1:XItmlyltB5F7CS4xOC1DcqMoFqwtC6OG2xF7mCv7P7E=
|
||||
go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64=
|
||||
go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/httptrace/otelhttptrace v0.63.0 h1:2pn7OzMewmYRiNtv1doZnLo3gONcnMHlFnmOR8Vgt+8=
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/httptrace/otelhttptrace v0.63.0/go.mod h1:rjbQTDEPQymPE0YnRQp9/NuPwwtL0sesz/fnqRW/v84=
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 h1:RbKq8BG0FI8OiXhBfcRtqqHcZcka+gU3cskNuf05R18=
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0/go.mod h1:h06DGIukJOevXaj/xrNjhi/2098RZzcLTbc0jDAUbsg=
|
||||
go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8=
|
||||
go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 h1:GqRJVj7UmLjCVyVJ3ZFLdPRmhDUp2zFmQe3RHIOsw24=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0/go.mod h1:ri3aaHSmCTVYu2AWv44YMauwAQc0aqI9gHKIcSbI1pU=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 h1:lwI4Dc5leUqENgGuQImwLo4WnuXFPetmPpkLi2IrX54=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0/go.mod h1:Kz/oCE7z5wuyhPxsXDuaPteSWqjSBD5YaSdbxZYGbGk=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0 h1:aTL7F04bJHUlztTsNGJ2l+6he8c+y/b//eR0jjjemT4=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0/go.mod h1:kldtb7jDTeol0l3ewcmd8SDvx3EmIE7lyvqbasU3QC4=
|
||||
go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA=
|
||||
go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI=
|
||||
go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E=
|
||||
go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg=
|
||||
go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM=
|
||||
go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA=
|
||||
go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE=
|
||||
go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs=
|
||||
go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI=
|
||||
go.opentelemetry.io/proto/otlp v1.7.1 h1:gTOMpGDb0WTBOP8JaO72iL3auEZhVmAQg4ipjOVAtj4=
|
||||
go.opentelemetry.io/proto/otlp v1.7.1/go.mod h1:b2rVh6rfI/s2pHWNlB7ILJcRALpcNDzKhACevjI+ZnE=
|
||||
go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
|
||||
go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs=
|
||||
go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8=
|
||||
@@ -892,6 +912,8 @@ golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8T
|
||||
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20220411194840-2f41105eb62f/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20220517211312-f3a8303e98df/go.mod h1:K8+ghG5WaK9qNqU5K3HdILfMLy1f3aNYFI/wnl100a8=
|
||||
gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
|
||||
gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
|
||||
google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE=
|
||||
google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M=
|
||||
google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg=
|
||||
@@ -1016,6 +1038,10 @@ google.golang.org/genproto v0.0.0-20220421151946-72621c1f0bd3/go.mod h1:8w6bsBMX
|
||||
google.golang.org/genproto v0.0.0-20220429170224-98d788798c3e/go.mod h1:8w6bsBMX6yCPbAVTeqQHvzxW0EIFigd5lZyahWgyfDo=
|
||||
google.golang.org/genproto v0.0.0-20220505152158-f39f71e6c8f3/go.mod h1:RAyBrSAP7Fh3Nc84ghnVLDPuV51xc9agzmm4Ph6i0Q4=
|
||||
google.golang.org/genproto v0.0.0-20220519153652-3a47de7e79bd/go.mod h1:RAyBrSAP7Fh3Nc84ghnVLDPuV51xc9agzmm4Ph6i0Q4=
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5 h1:BIRfGDEjiHRrk0QKZe3Xv2ieMhtgRGeLcZQ0mIVn4EY=
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20250825161204-c5933d9347a5/go.mod h1:j3QtIyytwqGr1JUDtYXwtMXWPKsEa5LtzIFN1Wn5WvE=
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5 h1:eaY8u2EuxbRv7c3NiGK0/NedzVsCcV6hDuU5qPX5EGE=
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20250825161204-c5933d9347a5/go.mod h1:M4/wBTSeyLxupu3W3tJtOgB14jILAS/XWPSSa3TAlJc=
|
||||
google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
|
||||
google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38=
|
||||
google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM=
|
||||
@@ -1046,6 +1072,8 @@ google.golang.org/grpc v1.44.0/go.mod h1:k+4IHHFw41K8+bbowsex27ge2rCb65oeWqe4jJ5
|
||||
google.golang.org/grpc v1.45.0/go.mod h1:lN7owxKUQEqMfSyQikvvk5tf/6zMPsrK+ONuO11+0rQ=
|
||||
google.golang.org/grpc v1.46.0/go.mod h1:vN9eftEi1UMyUsIF80+uQXhHjbXYbm0uXoFCACuMGWk=
|
||||
google.golang.org/grpc v1.46.2/go.mod h1:vN9eftEi1UMyUsIF80+uQXhHjbXYbm0uXoFCACuMGWk=
|
||||
google.golang.org/grpc v1.75.0 h1:+TW+dqTd2Biwe6KKfhE5JpiYIBWq865PhKGSXiivqt4=
|
||||
google.golang.org/grpc v1.75.0/go.mod h1:JtPAzKiq4v1xcAB2hydNlWI2RnF85XXcV0mhKXr2ecQ=
|
||||
google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw=
|
||||
google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
|
||||
google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
|
||||
|
||||
@@ -20,8 +20,12 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/oklog/run"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/common/model"
|
||||
"go.opentelemetry.io/otel"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/codes"
|
||||
"go.opentelemetry.io/otel/propagation"
|
||||
"go.opentelemetry.io/otel/trace"
|
||||
|
||||
"github.com/prometheus/alertmanager/config"
|
||||
"github.com/prometheus/alertmanager/pkg/labels"
|
||||
@@ -30,15 +34,17 @@ import (
|
||||
"github.com/prometheus/alertmanager/types"
|
||||
)
|
||||
|
||||
var tracer = otel.Tracer("github.com/prometheus/alertmanager/inhibit")
|
||||
|
||||
// An Inhibitor determines whether a given label set is muted based on the
|
||||
// currently active alerts and a set of inhibition rules. It implements the
|
||||
// Muter interface.
|
||||
type Inhibitor struct {
|
||||
alerts provider.Alerts
|
||||
rules []*InhibitRule
|
||||
marker types.AlertMarker
|
||||
logger *slog.Logger
|
||||
metrics *InhibitorMetrics
|
||||
alerts provider.Alerts
|
||||
rules []*InhibitRule
|
||||
marker types.AlertMarker
|
||||
logger *slog.Logger
|
||||
propagator propagation.TextMapPropagator
|
||||
|
||||
mtx sync.RWMutex
|
||||
loadingFinished sync.WaitGroup
|
||||
@@ -46,12 +52,12 @@ type Inhibitor struct {
|
||||
}
|
||||
|
||||
// NewInhibitor returns a new Inhibitor.
|
||||
func NewInhibitor(ap provider.Alerts, rs []config.InhibitRule, mk types.AlertMarker, logger *slog.Logger, metrics *InhibitorMetrics) *Inhibitor {
|
||||
func NewInhibitor(ap provider.Alerts, rs []config.InhibitRule, mk types.AlertMarker, logger *slog.Logger) *Inhibitor {
|
||||
ih := &Inhibitor{
|
||||
alerts: ap,
|
||||
marker: mk,
|
||||
logger: logger,
|
||||
metrics: metrics,
|
||||
alerts: ap,
|
||||
marker: mk,
|
||||
logger: logger,
|
||||
propagator: otel.GetTextMapPropagator(),
|
||||
}
|
||||
|
||||
ih.loadingFinished.Add(1)
|
||||
@@ -61,7 +67,7 @@ func NewInhibitor(ap provider.Alerts, rs []config.InhibitRule, mk types.AlertMar
|
||||
ih.logger.Debug("duplicate inhibition rule name", "index", i, "name", cr.Name)
|
||||
}
|
||||
|
||||
r := NewInhibitRule(cr, NewRuleMetrics(cr.Name, metrics))
|
||||
r := NewInhibitRule(cr)
|
||||
ih.rules = append(ih.rules, r)
|
||||
|
||||
if cr.Name != "" {
|
||||
@@ -76,7 +82,7 @@ func (ih *Inhibitor) run(ctx context.Context) {
|
||||
defer it.Close()
|
||||
|
||||
for _, a := range initalAlerts {
|
||||
ih.processAlert(a)
|
||||
ih.processAlert(ctx, a)
|
||||
}
|
||||
|
||||
ih.loadingFinished.Done()
|
||||
@@ -90,37 +96,41 @@ func (ih *Inhibitor) run(ctx context.Context) {
|
||||
ih.logger.Error("Error iterating alerts", "err", err)
|
||||
continue
|
||||
}
|
||||
ih.processAlert(a)
|
||||
traceCtx := context.Background()
|
||||
if a.Header != nil {
|
||||
traceCtx = ih.propagator.Extract(traceCtx, propagation.MapCarrier(a.Header))
|
||||
}
|
||||
ih.processAlert(traceCtx, a.Data)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (ih *Inhibitor) processAlert(a *types.Alert) {
|
||||
func (ih *Inhibitor) processAlert(ctx context.Context, a *types.Alert) {
|
||||
_, span := tracer.Start(ctx, "inhibit.Inhibitor.processAlert",
|
||||
trace.WithAttributes(
|
||||
attribute.String("alerting.alert.name", a.Name()),
|
||||
attribute.String("alerting.alert.fingerprint", a.Fingerprint().String()),
|
||||
),
|
||||
trace.WithSpanKind(trace.SpanKindInternal),
|
||||
)
|
||||
defer span.End()
|
||||
|
||||
// Update the inhibition rules' cache.
|
||||
cachedSum := 0
|
||||
indexedSum := 0
|
||||
for _, r := range ih.rules {
|
||||
if r.SourceMatchers.Matches(a.Labels) {
|
||||
attr := attribute.String("alerting.inhibit_rule.name", r.Name)
|
||||
span.AddEvent("alert matched rule source", trace.WithAttributes(attr))
|
||||
if err := r.scache.Set(a); err != nil {
|
||||
ih.logger.Error("error on set alert", "err", err)
|
||||
message := "error on set alert"
|
||||
ih.logger.Error(message, "err", err)
|
||||
span.SetStatus(codes.Error, message)
|
||||
span.RecordError(err)
|
||||
continue
|
||||
}
|
||||
span.SetAttributes(attr)
|
||||
r.updateIndex(a)
|
||||
|
||||
}
|
||||
cached := r.scache.Len()
|
||||
indexed := r.sindex.Len()
|
||||
|
||||
if r.Name != "" {
|
||||
r.metrics.sourceAlertsCacheItems.With(prometheus.Labels{"rule": r.Name}).Set(float64(cached))
|
||||
r.metrics.sourceAlertsIndexItems.With(prometheus.Labels{"rule": r.Name}).Set(float64(indexed))
|
||||
}
|
||||
|
||||
cachedSum += cached
|
||||
indexedSum += indexed
|
||||
}
|
||||
ih.metrics.sourceAlertsCacheItems.Set(float64(cachedSum))
|
||||
ih.metrics.sourceAlertsIndexItems.Set(float64(indexedSum))
|
||||
}
|
||||
|
||||
func (ih *Inhibitor) WaitForLoading() {
|
||||
@@ -170,33 +180,40 @@ func (ih *Inhibitor) Stop() {
|
||||
|
||||
// Mutes returns true iff the given label set is muted. It implements the Muter
|
||||
// interface.
|
||||
func (ih *Inhibitor) Mutes(lset model.LabelSet) bool {
|
||||
start := time.Now()
|
||||
func (ih *Inhibitor) Mutes(ctx context.Context, lset model.LabelSet) bool {
|
||||
fp := lset.Fingerprint()
|
||||
|
||||
_, span := tracer.Start(ctx, "inhibit.Inhibitor.Mutes",
|
||||
trace.WithAttributes(attribute.String("alerting.alert.fingerprint", fp.String())),
|
||||
trace.WithSpanKind(trace.SpanKindInternal),
|
||||
)
|
||||
defer span.End()
|
||||
|
||||
now := time.Now()
|
||||
for _, r := range ih.rules {
|
||||
ruleStart := time.Now()
|
||||
if !r.TargetMatchers.Matches(lset) {
|
||||
// If target side of rule doesn't match, we don't need to look any further.
|
||||
r.metrics.matchesDurationNotMatched.Observe(time.Since(ruleStart).Seconds())
|
||||
continue
|
||||
}
|
||||
r.metrics.matchesDurationMatched.Observe(time.Since(ruleStart).Seconds())
|
||||
span.AddEvent("alert matched rule target",
|
||||
trace.WithAttributes(
|
||||
attribute.String("alerting.inhibit_rule.name", r.Name),
|
||||
),
|
||||
)
|
||||
// If we are here, the target side matches. If the source side matches, too, we
|
||||
// need to exclude inhibiting alerts for which the same is true.
|
||||
if inhibitedByFP, eq := r.hasEqual(lset, r.SourceMatchers.Matches(lset), ruleStart); eq {
|
||||
if inhibitedByFP, eq := r.hasEqual(lset, r.SourceMatchers.Matches(lset), now); eq {
|
||||
ih.marker.SetInhibited(fp, inhibitedByFP.String())
|
||||
now := time.Now()
|
||||
sinceStart := now.Sub(start)
|
||||
sinceRuleStart := now.Sub(ruleStart)
|
||||
ih.metrics.mutesDurationMuted.Observe(sinceStart.Seconds())
|
||||
r.metrics.mutesDurationMuted.Observe(sinceRuleStart.Seconds())
|
||||
span.AddEvent("alert inhibited",
|
||||
trace.WithAttributes(
|
||||
attribute.String("alerting.inhibit_rule.source.fingerprint", inhibitedByFP.String()),
|
||||
),
|
||||
)
|
||||
return true
|
||||
}
|
||||
r.metrics.mutesDurationNotMuted.Observe(time.Since(ruleStart).Seconds())
|
||||
}
|
||||
ih.marker.SetInhibited(fp)
|
||||
ih.metrics.mutesDurationNotMuted.Observe(time.Since(start).Seconds())
|
||||
span.AddEvent("alert not inhibited")
|
||||
|
||||
return false
|
||||
}
|
||||
@@ -227,12 +244,10 @@ type InhibitRule struct {
|
||||
// The index items might overwrite eachother if multiple source alerts have exact equal labels.
|
||||
// Overwrites only happen if the new source alert has bigger EndsAt value.
|
||||
sindex *index
|
||||
|
||||
metrics *RuleMetrics
|
||||
}
|
||||
|
||||
// NewInhibitRule returns a new InhibitRule based on a configuration definition.
|
||||
func NewInhibitRule(cr config.InhibitRule, metrics *RuleMetrics) *InhibitRule {
|
||||
func NewInhibitRule(cr config.InhibitRule) *InhibitRule {
|
||||
var (
|
||||
sourcem labels.Matchers
|
||||
targetm labels.Matchers
|
||||
@@ -292,7 +307,6 @@ func NewInhibitRule(cr config.InhibitRule, metrics *RuleMetrics) *InhibitRule {
|
||||
Equal: equal,
|
||||
scache: store.NewAlerts(),
|
||||
sindex: newIndex(),
|
||||
metrics: metrics,
|
||||
}
|
||||
|
||||
rule.scache.SetGCCallback(rule.gcCallback)
|
||||
@@ -368,10 +382,6 @@ func (r *InhibitRule) gcCallback(alerts []types.Alert) {
|
||||
fp := r.fingerprintEquals(a.Labels)
|
||||
r.sindex.Delete(fp)
|
||||
}
|
||||
if r.Name != "" {
|
||||
r.metrics.sourceAlertsCacheItems.With(prometheus.Labels{"rule": r.Name}).Set(float64(r.scache.Len()))
|
||||
r.metrics.sourceAlertsIndexItems.With(prometheus.Labels{"rule": r.Name}).Set(float64(r.sindex.Len()))
|
||||
}
|
||||
}
|
||||
|
||||
// hasEqual checks whether the source cache contains alerts matching the equal
|
||||
|
||||
@@ -88,7 +88,7 @@ type benchmarkOptions struct {
|
||||
// It is called n times.
|
||||
newAlertsFunc func(idx int, r config.InhibitRule) []types.Alert
|
||||
// benchFunc runs the benchmark.
|
||||
benchFunc func(mutesFunc func(model.LabelSet) bool) error
|
||||
benchFunc func(mutesFunc func(context.Context, model.LabelSet) bool) error
|
||||
}
|
||||
|
||||
// allRulesMatchBenchmark returns a new benchmark where all inhibition rules
|
||||
@@ -130,8 +130,8 @@ func allRulesMatchBenchmark(b *testing.B, numInhibitionRules, numInhibitingAlert
|
||||
})
|
||||
}
|
||||
return alerts
|
||||
}, benchFunc: func(mutesFunc func(set model.LabelSet) bool) error {
|
||||
if ok := mutesFunc(model.LabelSet{"dst": "0"}); !ok {
|
||||
}, benchFunc: func(mutesFunc func(context.Context, model.LabelSet) bool) error {
|
||||
if ok := mutesFunc(context.Background(), model.LabelSet{"dst": "0"}); !ok {
|
||||
return errors.New("expected dst=0 to be muted")
|
||||
}
|
||||
return nil
|
||||
@@ -172,8 +172,8 @@ func lastRuleMatchesBenchmark(b *testing.B, n int) benchmarkOptions {
|
||||
},
|
||||
},
|
||||
}}
|
||||
}, benchFunc: func(mutesFunc func(set model.LabelSet) bool) error {
|
||||
if ok := mutesFunc(model.LabelSet{"dst": "0"}); !ok {
|
||||
}, benchFunc: func(mutesFunc func(context.Context, model.LabelSet) bool) error {
|
||||
if ok := mutesFunc(context.Background(), model.LabelSet{"dst": "0"}); !ok {
|
||||
return errors.New("expected dst=0 to be muted")
|
||||
}
|
||||
return nil
|
||||
@@ -193,12 +193,12 @@ func benchmarkMutes(b *testing.B, opts benchmarkOptions) {
|
||||
alerts, rules := benchmarkFromOptions(opts)
|
||||
for _, a := range alerts {
|
||||
tmp := a
|
||||
if err = s.Put(&tmp); err != nil {
|
||||
if err = s.Put(context.Background(), &tmp); err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
ih := NewInhibitor(s, rules, m, promslog.NewNopLogger(), NewInhibitorMetrics(r))
|
||||
ih := NewInhibitor(s, rules, m, promslog.NewNopLogger())
|
||||
defer ih.Stop()
|
||||
go ih.Run()
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
package inhibit
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@@ -125,10 +126,9 @@ func TestInhibitRuleHasEqual(t *testing.T) {
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
r := &InhibitRule{
|
||||
Equal: map[model.LabelName]struct{}{},
|
||||
scache: store.NewAlerts(),
|
||||
sindex: newIndex(),
|
||||
metrics: NewRuleMetrics("test", NewInhibitorMetrics(prometheus.NewRegistry())),
|
||||
Equal: map[model.LabelName]struct{}{},
|
||||
scache: store.NewAlerts(),
|
||||
sindex: newIndex(),
|
||||
}
|
||||
for _, ln := range c.equal {
|
||||
r.Equal[ln] = struct{}{}
|
||||
@@ -160,7 +160,7 @@ func TestInhibitRuleMatches(t *testing.T) {
|
||||
}
|
||||
|
||||
m := types.NewMarker(prometheus.NewRegistry())
|
||||
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger, NewInhibitorMetrics(prometheus.NewRegistry()))
|
||||
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger)
|
||||
now := time.Now()
|
||||
// Active alert that matches the source filter of rule1.
|
||||
sourceAlert1 := &types.Alert{
|
||||
@@ -240,7 +240,7 @@ func TestInhibitRuleMatches(t *testing.T) {
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
if actual := ih.Mutes(c.target); actual != c.expected {
|
||||
if actual := ih.Mutes(context.Background(), c.target); actual != c.expected {
|
||||
t.Errorf("Expected (*Inhibitor).Mutes(%v) to return %t but got %t", c.target, c.expected, actual)
|
||||
}
|
||||
}
|
||||
@@ -261,7 +261,7 @@ func TestInhibitRuleMatchers(t *testing.T) {
|
||||
}
|
||||
|
||||
m := types.NewMarker(prometheus.NewRegistry())
|
||||
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger, NewInhibitorMetrics(prometheus.NewRegistry()))
|
||||
ih := NewInhibitor(nil, []config.InhibitRule{rule1, rule2}, m, nopLogger)
|
||||
now := time.Now()
|
||||
// Active alert that matches the source filter of rule1.
|
||||
sourceAlert1 := &types.Alert{
|
||||
@@ -341,7 +341,7 @@ func TestInhibitRuleMatchers(t *testing.T) {
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
if actual := ih.Mutes(c.target); actual != c.expected {
|
||||
if actual := ih.Mutes(context.Background(), c.target); actual != c.expected {
|
||||
t.Errorf("Expected (*Inhibitor).Mutes(%v) to return %t but got %t", c.target, c.expected, actual)
|
||||
}
|
||||
}
|
||||
@@ -370,8 +370,8 @@ func TestInhibitRuleName(t *testing.T) {
|
||||
Equal: []string{"instance"},
|
||||
}
|
||||
|
||||
rule1 := NewInhibitRule(config1, nil)
|
||||
rule2 := NewInhibitRule(config2, nil)
|
||||
rule1 := NewInhibitRule(config1)
|
||||
rule2 := NewInhibitRule(config2)
|
||||
|
||||
require.Equal(t, "test-rule", rule1.Name, "Expected named rule to have adopt name from config")
|
||||
require.Empty(t, rule2.Name, "Expected unnamed rule to have empty name")
|
||||
@@ -391,21 +391,27 @@ func newFakeAlerts(alerts []*types.Alert) *fakeAlerts {
|
||||
|
||||
func (f *fakeAlerts) GetPending() provider.AlertIterator { return nil }
|
||||
func (f *fakeAlerts) Get(model.Fingerprint) (*types.Alert, error) { return nil, nil }
|
||||
func (f *fakeAlerts) Put(...*types.Alert) error { return nil }
|
||||
func (f *fakeAlerts) Put(context.Context, ...*types.Alert) error { return nil }
|
||||
func (f *fakeAlerts) Subscribe(name string) provider.AlertIterator {
|
||||
ch := make(chan *types.Alert)
|
||||
ch := make(chan *provider.Alert)
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
for _, a := range f.alerts {
|
||||
ch <- a
|
||||
ch <- &provider.Alert{
|
||||
Data: a,
|
||||
Header: map[string]string{},
|
||||
}
|
||||
}
|
||||
// Send another (meaningless) alert to make sure that the inhibitor has
|
||||
// processed everything.
|
||||
ch <- &types.Alert{
|
||||
Alert: model.Alert{
|
||||
Labels: model.LabelSet{},
|
||||
StartsAt: time.Now(),
|
||||
ch <- &provider.Alert{
|
||||
Data: &types.Alert{
|
||||
Alert: model.Alert{
|
||||
Labels: model.LabelSet{},
|
||||
StartsAt: time.Now(),
|
||||
},
|
||||
},
|
||||
Header: map[string]string{},
|
||||
}
|
||||
close(f.finished)
|
||||
<-done
|
||||
@@ -414,19 +420,25 @@ func (f *fakeAlerts) Subscribe(name string) provider.AlertIterator {
|
||||
}
|
||||
|
||||
func (f *fakeAlerts) SlurpAndSubscribe(name string) ([]*types.Alert, provider.AlertIterator) {
|
||||
ch := make(chan *types.Alert)
|
||||
ch := make(chan *provider.Alert)
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
for _, a := range f.alerts {
|
||||
ch <- a
|
||||
ch <- &provider.Alert{
|
||||
Data: a,
|
||||
Header: map[string]string{},
|
||||
}
|
||||
}
|
||||
// Send another (meaningless) alert to make sure that the inhibitor has
|
||||
// processed everything.
|
||||
ch <- &types.Alert{
|
||||
Alert: model.Alert{
|
||||
Labels: model.LabelSet{},
|
||||
StartsAt: time.Now(),
|
||||
ch <- &provider.Alert{
|
||||
Data: &types.Alert{
|
||||
Alert: model.Alert{
|
||||
Labels: model.LabelSet{},
|
||||
StartsAt: time.Now(),
|
||||
},
|
||||
},
|
||||
Header: map[string]string{},
|
||||
}
|
||||
close(f.finished)
|
||||
<-done
|
||||
@@ -520,7 +532,7 @@ func TestInhibit(t *testing.T) {
|
||||
} {
|
||||
ap := newFakeAlerts(tc.alerts)
|
||||
mk := types.NewMarker(prometheus.NewRegistry())
|
||||
inhibitor := NewInhibitor(ap, []config.InhibitRule{inhibitRule()}, mk, nopLogger, NewInhibitorMetrics(prometheus.NewRegistry()))
|
||||
inhibitor := NewInhibitor(ap, []config.InhibitRule{inhibitRule()}, mk, nopLogger)
|
||||
|
||||
go func() {
|
||||
for ap.finished != nil {
|
||||
@@ -535,7 +547,7 @@ func TestInhibit(t *testing.T) {
|
||||
inhibitor.Run()
|
||||
|
||||
for _, expected := range tc.expected {
|
||||
if inhibitor.Mutes(expected.lbls) != expected.muted {
|
||||
if inhibitor.Mutes(context.Background(), expected.lbls) != expected.muted {
|
||||
mute := "unmuted"
|
||||
if expected.muted {
|
||||
mute = "muted"
|
||||
|
||||
@@ -1,129 +0,0 @@
|
||||
// Copyright The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package inhibit
|
||||
|
||||
import (
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||
)
|
||||
|
||||
// InhibitorMetrics represents metrics associated to an inhibitor.
|
||||
type InhibitorMetrics struct {
|
||||
// Inhibitor metrics
|
||||
sourceAlertsCacheItems prometheus.Gauge
|
||||
sourceAlertsIndexItems prometheus.Gauge
|
||||
mutesDuration *prometheus.SummaryVec
|
||||
mutesDurationMuted prometheus.Observer
|
||||
mutesDurationNotMuted prometheus.Observer
|
||||
|
||||
// Rule metrics
|
||||
ruleSourceAlertsCacheItems *prometheus.GaugeVec
|
||||
ruleSourceAlertsIndexItems *prometheus.GaugeVec
|
||||
ruleMatchesDuration *prometheus.SummaryVec
|
||||
ruleMutesDuration *prometheus.SummaryVec
|
||||
}
|
||||
|
||||
// NewInhibitorMetrics returns a new InhibitorMetrics.
|
||||
func NewInhibitorMetrics(reg prometheus.Registerer) *InhibitorMetrics {
|
||||
if reg == nil {
|
||||
return nil
|
||||
}
|
||||
metrics := &InhibitorMetrics{
|
||||
sourceAlertsCacheItems: promauto.With(reg).NewGauge(
|
||||
prometheus.GaugeOpts{
|
||||
Name: "alertmanager_inhibitor_source_alerts_cache_items",
|
||||
Help: "Number of source alerts cached in inhibition rules.",
|
||||
},
|
||||
),
|
||||
sourceAlertsIndexItems: promauto.With(reg).NewGauge(
|
||||
prometheus.GaugeOpts{
|
||||
Name: "alertmanager_inhibitor_source_alerts_index_items",
|
||||
Help: "Number of source alerts indexed in inhibition rules.",
|
||||
},
|
||||
),
|
||||
mutesDuration: promauto.With(reg).NewSummaryVec(
|
||||
prometheus.SummaryOpts{
|
||||
Name: "alertmanager_inhibitor_mutes_duration_seconds",
|
||||
Help: "Summary of latencies for the muting of alerts by inhibition rules.",
|
||||
},
|
||||
[]string{"muted"},
|
||||
),
|
||||
|
||||
ruleSourceAlertsCacheItems: promauto.With(reg).NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Name: "alertmanager_inhibit_rule_source_alerts_cache_items",
|
||||
Help: "Number of source alerts cached in inhibition rules.",
|
||||
},
|
||||
[]string{"rule"},
|
||||
),
|
||||
ruleSourceAlertsIndexItems: promauto.With(reg).NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Name: "alertmanager_inhibit_rule_source_alerts_index_items",
|
||||
Help: "Number of source alerts indexed in inhibition rules.",
|
||||
},
|
||||
[]string{"rule"},
|
||||
),
|
||||
ruleMatchesDuration: promauto.With(reg).NewSummaryVec(
|
||||
prometheus.SummaryOpts{
|
||||
Name: "alertmanager_inhibit_rule_matches_duration_seconds",
|
||||
Help: "Summary of latencies for the matching of alerts by inhibition rules.",
|
||||
},
|
||||
[]string{"rule", "matched"},
|
||||
),
|
||||
ruleMutesDuration: promauto.With(reg).NewSummaryVec(
|
||||
prometheus.SummaryOpts{
|
||||
Name: "alertmanager_inhibit_rule_mutes_duration_seconds",
|
||||
Help: "Summary of latencies for the muting of alerts by inhibition rules.",
|
||||
},
|
||||
[]string{"rule", "muted"},
|
||||
),
|
||||
}
|
||||
|
||||
metrics.mutesDurationMuted = metrics.mutesDuration.With(prometheus.Labels{"muted": "true"})
|
||||
metrics.mutesDurationNotMuted = metrics.mutesDuration.With(prometheus.Labels{"muted": "false"})
|
||||
|
||||
metrics.sourceAlertsCacheItems.Set(0)
|
||||
metrics.sourceAlertsIndexItems.Set(0)
|
||||
|
||||
return metrics
|
||||
}
|
||||
|
||||
type RuleMetrics struct {
|
||||
ruleName string
|
||||
matchesDurationMatched prometheus.Observer
|
||||
matchesDurationNotMatched prometheus.Observer
|
||||
|
||||
mutesDurationMuted prometheus.Observer
|
||||
mutesDurationNotMuted prometheus.Observer
|
||||
|
||||
sourceAlertsCacheItems *prometheus.GaugeVec
|
||||
sourceAlertsIndexItems *prometheus.GaugeVec
|
||||
}
|
||||
|
||||
func NewRuleMetrics(name string, metrics *InhibitorMetrics) *RuleMetrics {
|
||||
rm := &RuleMetrics{
|
||||
ruleName: name,
|
||||
matchesDurationMatched: metrics.ruleMatchesDuration.With(prometheus.Labels{"rule": name, "matched": "true"}),
|
||||
matchesDurationNotMatched: metrics.ruleMatchesDuration.With(prometheus.Labels{"rule": name, "matched": "false"}),
|
||||
mutesDurationMuted: metrics.ruleMutesDuration.With(prometheus.Labels{"rule": name, "muted": "true"}),
|
||||
mutesDurationNotMuted: metrics.ruleMutesDuration.With(prometheus.Labels{"rule": name, "muted": "false"}),
|
||||
sourceAlertsCacheItems: metrics.ruleSourceAlertsCacheItems,
|
||||
sourceAlertsIndexItems: metrics.ruleSourceAlertsIndexItems,
|
||||
}
|
||||
|
||||
rm.sourceAlertsCacheItems.With(prometheus.Labels{"rule": rm.ruleName}).Set(0)
|
||||
rm.sourceAlertsIndexItems.With(prometheus.Labels{"rule": rm.ruleName}).Set(0)
|
||||
|
||||
return rm
|
||||
}
|
||||
@@ -1,501 +0,0 @@
|
||||
// Copyright The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package inhibit
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
io_prometheus_client "github.com/prometheus/client_model/go"
|
||||
"github.com/prometheus/common/model"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/prometheus/alertmanager/config"
|
||||
"github.com/prometheus/alertmanager/pkg/labels"
|
||||
"github.com/prometheus/alertmanager/provider/mem"
|
||||
"github.com/prometheus/alertmanager/types"
|
||||
)
|
||||
|
||||
// getMetricValue retrieves a specific metric value from the registry.
|
||||
func getMetricValue(t *testing.T, reg *prometheus.Registry, metricName string, labels map[string]string) (float64, uint64, bool) {
|
||||
t.Helper()
|
||||
metricFamilies, err := reg.Gather()
|
||||
require.NoError(t, err)
|
||||
|
||||
for _, mf := range metricFamilies {
|
||||
if mf.GetName() != metricName {
|
||||
continue
|
||||
}
|
||||
for _, metric := range mf.GetMetric() {
|
||||
if labelsMatch(metric, labels) {
|
||||
if mf.GetType() == io_prometheus_client.MetricType_GAUGE {
|
||||
return metric.GetGauge().GetValue(), 0, true
|
||||
}
|
||||
if mf.GetType() == io_prometheus_client.MetricType_SUMMARY {
|
||||
return 0, metric.GetSummary().GetSampleCount(), true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0, 0, false
|
||||
}
|
||||
|
||||
func labelsMatch(metric *io_prometheus_client.Metric, wantLabels map[string]string) bool {
|
||||
for wantKey, wantVal := range wantLabels {
|
||||
found := false
|
||||
for _, labelPair := range metric.GetLabel() {
|
||||
if labelPair.GetName() == wantKey && labelPair.GetValue() == wantVal {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func TestInhibitorMetrics_RuleMatchesDuration(t *testing.T) {
|
||||
reg := prometheus.NewRegistry()
|
||||
metrics := NewInhibitorMetrics(reg)
|
||||
|
||||
rules := []config.InhibitRule{
|
||||
{
|
||||
Name: "test-rule",
|
||||
SourceMatchers: []*labels.Matcher{
|
||||
{Type: labels.MatchEqual, Name: "severity", Value: "critical"},
|
||||
},
|
||||
TargetMatchers: []*labels.Matcher{
|
||||
{Type: labels.MatchEqual, Name: "severity", Value: "warning"},
|
||||
},
|
||||
Equal: []string{"instance"},
|
||||
},
|
||||
}
|
||||
|
||||
marker := types.NewMarker(reg)
|
||||
inhibitor := NewInhibitor(nil, rules, marker, nopLogger, metrics)
|
||||
|
||||
// Test case 1: Target matches (should record matched="true")
|
||||
targetAlert := model.LabelSet{
|
||||
"severity": "warning",
|
||||
"instance": "server1",
|
||||
}
|
||||
inhibitor.Mutes(targetAlert)
|
||||
|
||||
_, count, found := getMetricValue(t, reg, "alertmanager_inhibit_rule_matches_duration_seconds",
|
||||
map[string]string{"rule": "test-rule", "matched": "true"})
|
||||
require.True(t, found, "Should find matched=true metric")
|
||||
require.Equal(t, uint64(1), count, "Should have 1 sample for matched=true")
|
||||
|
||||
// Test case 2: Target doesn't match (should record matched="false")
|
||||
nonMatchingAlert := model.LabelSet{
|
||||
"severity": "info",
|
||||
"instance": "server2",
|
||||
}
|
||||
inhibitor.Mutes(nonMatchingAlert)
|
||||
|
||||
_, count, found = getMetricValue(t, reg, "alertmanager_inhibit_rule_matches_duration_seconds",
|
||||
map[string]string{"rule": "test-rule", "matched": "false"})
|
||||
require.True(t, found, "Should find matched=false metric")
|
||||
require.Equal(t, uint64(1), count, "Should have 1 sample for matched=false")
|
||||
}
|
||||
|
||||
func TestInhibitorMetrics_RuleMutesDuration_Muted(t *testing.T) {
|
||||
reg := prometheus.NewRegistry()
|
||||
metrics := NewInhibitorMetrics(reg)
|
||||
|
||||
rules := []config.InhibitRule{
|
||||
{
|
||||
Name: "test-rule",
|
||||
SourceMatchers: []*labels.Matcher{
|
||||
{Type: labels.MatchEqual, Name: "severity", Value: "critical"},
|
||||
},
|
||||
TargetMatchers: []*labels.Matcher{
|
||||
{Type: labels.MatchEqual, Name: "severity", Value: "warning"},
|
||||
},
|
||||
Equal: []string{"instance"},
|
||||
},
|
||||
}
|
||||
|
||||
marker := types.NewMarker(reg)
|
||||
inhibitor := NewInhibitor(nil, rules, marker, nopLogger, metrics)
|
||||
|
||||
// Add a source alert that will inhibit
|
||||
sourceAlert := &types.Alert{
|
||||
Alert: model.Alert{
|
||||
Labels: model.LabelSet{
|
||||
"severity": "critical",
|
||||
"instance": "server1",
|
||||
},
|
||||
StartsAt: time.Now().Add(-time.Minute),
|
||||
EndsAt: time.Now().Add(time.Hour),
|
||||
},
|
||||
}
|
||||
inhibitor.rules[0].scache.Set(sourceAlert)
|
||||
inhibitor.rules[0].updateIndex(sourceAlert)
|
||||
|
||||
// Test that target alert is muted
|
||||
targetAlert := model.LabelSet{
|
||||
"severity": "warning",
|
||||
"instance": "server1",
|
||||
}
|
||||
muted := inhibitor.Mutes(targetAlert)
|
||||
require.True(t, muted, "Alert should be muted")
|
||||
|
||||
// Verify per-rule muted="true" metric was recorded
|
||||
_, count, found := getMetricValue(t, reg, "alertmanager_inhibit_rule_mutes_duration_seconds",
|
||||
map[string]string{"rule": "test-rule", "muted": "true"})
|
||||
require.True(t, found, "Should find per-rule muted=true metric")
|
||||
require.Equal(t, uint64(1), count, "Should have 1 sample for per-rule muted=true")
|
||||
|
||||
// Verify global muted="true" metric was recorded
|
||||
_, count, found = getMetricValue(t, reg, "alertmanager_inhibitor_mutes_duration_seconds",
|
||||
map[string]string{"muted": "true"})
|
||||
require.True(t, found, "Should find global muted=true metric")
|
||||
require.Equal(t, uint64(1), count, "Should have 1 sample for global muted=true")
|
||||
}
|
||||
|
||||
func TestInhibitorMetrics_RuleMutesDuration_NotMuted(t *testing.T) {
|
||||
reg := prometheus.NewRegistry()
|
||||
metrics := NewInhibitorMetrics(reg)
|
||||
|
||||
rules := []config.InhibitRule{
|
||||
{
|
||||
Name: "test-rule",
|
||||
SourceMatchers: []*labels.Matcher{
|
||||
{Type: labels.MatchEqual, Name: "severity", Value: "critical"},
|
||||
},
|
||||
TargetMatchers: []*labels.Matcher{
|
||||
{Type: labels.MatchEqual, Name: "severity", Value: "warning"},
|
||||
},
|
||||
Equal: []string{"instance"},
|
||||
},
|
||||
}
|
||||
|
||||
marker := types.NewMarker(reg)
|
||||
inhibitor := NewInhibitor(nil, rules, marker, nopLogger, metrics)
|
||||
|
||||
// Add a source alert with different instance
|
||||
sourceAlert := &types.Alert{
|
||||
Alert: model.Alert{
|
||||
Labels: model.LabelSet{
|
||||
"severity": "critical",
|
||||
"instance": "server1",
|
||||
},
|
||||
StartsAt: time.Now().Add(-time.Minute),
|
||||
EndsAt: time.Now().Add(time.Hour),
|
||||
},
|
||||
}
|
||||
inhibitor.rules[0].scache.Set(sourceAlert)
|
||||
|
||||
// Test that target alert with different instance is NOT muted
|
||||
targetAlert := model.LabelSet{
|
||||
"severity": "warning",
|
||||
"instance": "server2",
|
||||
}
|
||||
muted := inhibitor.Mutes(targetAlert)
|
||||
require.False(t, muted, "Alert should not be muted")
|
||||
|
||||
// Verify per-rule muted="false" metric was recorded
|
||||
_, count, found := getMetricValue(t, reg, "alertmanager_inhibit_rule_mutes_duration_seconds",
|
||||
map[string]string{"rule": "test-rule", "muted": "false"})
|
||||
require.True(t, found, "Should find per-rule muted=false metric")
|
||||
require.Equal(t, uint64(1), count, "Should have 1 sample for per-rule muted=false")
|
||||
|
||||
// Verify global muted="false" metric was recorded
|
||||
_, count, found = getMetricValue(t, reg, "alertmanager_inhibitor_mutes_duration_seconds",
|
||||
map[string]string{"muted": "false"})
|
||||
require.True(t, found, "Should find global muted=false metric")
|
||||
require.Equal(t, uint64(1), count, "Should have 1 sample for global muted=false")
|
||||
}
|
||||
|
||||
func TestInhibitorMetrics_NoRuleMatches(t *testing.T) {
|
||||
reg := prometheus.NewRegistry()
|
||||
metrics := NewInhibitorMetrics(reg)
|
||||
|
||||
rules := []config.InhibitRule{
|
||||
{
|
||||
Name: "test-rule",
|
||||
SourceMatchers: []*labels.Matcher{
|
||||
{Type: labels.MatchEqual, Name: "severity", Value: "critical"},
|
||||
},
|
||||
TargetMatchers: []*labels.Matcher{
|
||||
{Type: labels.MatchEqual, Name: "severity", Value: "warning"},
|
||||
},
|
||||
Equal: []string{"instance"},
|
||||
},
|
||||
}
|
||||
|
||||
marker := types.NewMarker(reg)
|
||||
inhibitor := NewInhibitor(nil, rules, marker, nopLogger, metrics)
|
||||
|
||||
// Test with alert that doesn't match any rule's target
|
||||
nonMatchingAlert := model.LabelSet{
|
||||
"severity": "info",
|
||||
"instance": "server1",
|
||||
}
|
||||
muted := inhibitor.Mutes(nonMatchingAlert)
|
||||
require.False(t, muted, "Alert should not be muted")
|
||||
|
||||
// Verify that global muted="false" metric was recorded
|
||||
_, count, found := getMetricValue(t, reg, "alertmanager_inhibitor_mutes_duration_seconds",
|
||||
map[string]string{"muted": "false"})
|
||||
require.True(t, found, "Should find global muted=false metric")
|
||||
require.Equal(t, uint64(1), count, "Should have 1 sample for global muted=false")
|
||||
|
||||
// Verify per-rule matched="false" was recorded
|
||||
_, count, found = getMetricValue(t, reg, "alertmanager_inhibit_rule_matches_duration_seconds",
|
||||
map[string]string{"rule": "test-rule", "matched": "false"})
|
||||
require.True(t, found, "Should find rule matched=false metric")
|
||||
require.Equal(t, uint64(1), count, "Should have 1 sample for rule matched=false")
|
||||
}
|
||||
|
||||
func TestInhibitorMetrics_MultipleRules(t *testing.T) {
|
||||
reg := prometheus.NewRegistry()
|
||||
metrics := NewInhibitorMetrics(reg)
|
||||
|
||||
rules := []config.InhibitRule{
|
||||
{
|
||||
Name: "rule-1",
|
||||
SourceMatchers: []*labels.Matcher{
|
||||
{Type: labels.MatchEqual, Name: "severity", Value: "critical"},
|
||||
},
|
||||
TargetMatchers: []*labels.Matcher{
|
||||
{Type: labels.MatchEqual, Name: "severity", Value: "warning"},
|
||||
},
|
||||
Equal: []string{"instance"},
|
||||
},
|
||||
{
|
||||
Name: "rule-2",
|
||||
SourceMatchers: []*labels.Matcher{
|
||||
{Type: labels.MatchEqual, Name: "team", Value: "sre"},
|
||||
},
|
||||
TargetMatchers: []*labels.Matcher{
|
||||
{Type: labels.MatchEqual, Name: "team", Value: "dev"},
|
||||
},
|
||||
Equal: []string{"service"},
|
||||
},
|
||||
}
|
||||
|
||||
marker := types.NewMarker(reg)
|
||||
inhibitor := NewInhibitor(nil, rules, marker, nopLogger, metrics)
|
||||
|
||||
// Add source alert for rule-1
|
||||
sourceAlert1 := &types.Alert{
|
||||
Alert: model.Alert{
|
||||
Labels: model.LabelSet{
|
||||
"severity": "critical",
|
||||
"instance": "server1",
|
||||
},
|
||||
StartsAt: time.Now().Add(-time.Minute),
|
||||
EndsAt: time.Now().Add(time.Hour),
|
||||
},
|
||||
}
|
||||
inhibitor.rules[0].scache.Set(sourceAlert1)
|
||||
inhibitor.rules[0].updateIndex(sourceAlert1)
|
||||
|
||||
// Test alert that matches rule-1
|
||||
targetAlert1 := model.LabelSet{
|
||||
"severity": "warning",
|
||||
"instance": "server1",
|
||||
}
|
||||
muted1 := inhibitor.Mutes(targetAlert1)
|
||||
require.True(t, muted1, "Alert should be muted by rule-1")
|
||||
|
||||
// Verify metrics for rule-1
|
||||
_, count, found := getMetricValue(t, reg, "alertmanager_inhibit_rule_matches_duration_seconds",
|
||||
map[string]string{"rule": "rule-1", "matched": "true"})
|
||||
require.True(t, found, "Should find rule-1 matched=true metric")
|
||||
require.Equal(t, 1, int(count))
|
||||
|
||||
_, count, found = getMetricValue(t, reg, "alertmanager_inhibit_rule_mutes_duration_seconds",
|
||||
map[string]string{"rule": "rule-1", "muted": "true"})
|
||||
require.True(t, found, "Should find rule-1 muted=true metric")
|
||||
require.Equal(t, 1, int(count))
|
||||
|
||||
// Verify global muted="true" metric
|
||||
_, count, found = getMetricValue(t, reg, "alertmanager_inhibitor_mutes_duration_seconds",
|
||||
map[string]string{"muted": "true"})
|
||||
require.True(t, found, "Should find global muted=true metric")
|
||||
require.Equal(t, 1, int(count))
|
||||
|
||||
// Test alert that matches rule-2 target but has no source
|
||||
targetAlert2 := model.LabelSet{
|
||||
"team": "dev",
|
||||
"service": "api",
|
||||
}
|
||||
muted2 := inhibitor.Mutes(targetAlert2)
|
||||
require.False(t, muted2, "Alert should not be muted")
|
||||
|
||||
// Verify metrics for rule-2 (both rules process this alert since rule-1 doesn't match target)
|
||||
_, count, found = getMetricValue(t, reg, "alertmanager_inhibit_rule_matches_duration_seconds",
|
||||
map[string]string{"rule": "rule-1", "matched": "false"})
|
||||
require.True(t, found, "Should find rule-1 matched=false metric")
|
||||
require.Equal(t, 1, int(count))
|
||||
|
||||
_, count, found = getMetricValue(t, reg, "alertmanager_inhibit_rule_matches_duration_seconds",
|
||||
map[string]string{"rule": "rule-2", "matched": "true"})
|
||||
require.True(t, found, "Should find rule-2 matched=true metric")
|
||||
require.Equal(t, 1, int(count))
|
||||
|
||||
_, count, found = getMetricValue(t, reg, "alertmanager_inhibit_rule_mutes_duration_seconds",
|
||||
map[string]string{"rule": "rule-2", "muted": "false"})
|
||||
require.True(t, found, "Should find rule-2 muted=false metric")
|
||||
require.Equal(t, 1, int(count))
|
||||
|
||||
// Verify global muted="false" metric
|
||||
_, count, found = getMetricValue(t, reg, "alertmanager_inhibitor_mutes_duration_seconds",
|
||||
map[string]string{"muted": "false"})
|
||||
require.True(t, found, "Should find global muted=false metric")
|
||||
require.Equal(t, 1, int(count), "Should have 1 samples")
|
||||
}
|
||||
|
||||
func TestInhibitorMetrics_CacheAndIndexItems(t *testing.T) {
|
||||
reg := prometheus.NewRegistry()
|
||||
metrics := NewInhibitorMetrics(reg)
|
||||
|
||||
rules := []config.InhibitRule{
|
||||
{
|
||||
Name: "named-rule",
|
||||
SourceMatchers: []*labels.Matcher{
|
||||
{Type: labels.MatchEqual, Name: "severity", Value: "critical"},
|
||||
},
|
||||
TargetMatchers: []*labels.Matcher{
|
||||
{Type: labels.MatchEqual, Name: "severity", Value: "warning"},
|
||||
},
|
||||
Equal: []string{"instance"},
|
||||
},
|
||||
{
|
||||
SourceMatchers: []*labels.Matcher{
|
||||
{Type: labels.MatchEqual, Name: "severity", Value: "critical"},
|
||||
},
|
||||
TargetMatchers: []*labels.Matcher{
|
||||
{Type: labels.MatchEqual, Name: "severity", Value: "warning"},
|
||||
},
|
||||
Equal: []string{"cluster"},
|
||||
},
|
||||
}
|
||||
|
||||
marker := types.NewMarker(reg)
|
||||
provider, err := mem.NewAlerts(t.Context(), marker, 15*time.Minute, nil, nopLogger, reg)
|
||||
require.NoError(t, err)
|
||||
inhibitor := NewInhibitor(provider, rules, marker, nopLogger, metrics)
|
||||
go inhibitor.Run()
|
||||
|
||||
// Add multiple source alerts
|
||||
for i := 1; i <= 3; i++ {
|
||||
sourceAlert := &types.Alert{
|
||||
Alert: model.Alert{
|
||||
Labels: model.LabelSet{
|
||||
"severity": "critical",
|
||||
"instance": model.LabelValue("server" + string(rune('0'+i))),
|
||||
"cluster": model.LabelValue("cluster" + string(rune('0'+i))),
|
||||
},
|
||||
StartsAt: time.Now().Add(-time.Minute),
|
||||
EndsAt: time.Now().Add(time.Hour),
|
||||
},
|
||||
}
|
||||
require.NoError(t, provider.Put(sourceAlert))
|
||||
}
|
||||
|
||||
// Wait for the inhibitor to process alerts and update metrics
|
||||
// The Run() goroutine processes alerts asynchronously
|
||||
require.Eventually(t, func() bool {
|
||||
value, _, found := getMetricValue(t, reg, "alertmanager_inhibitor_source_alerts_cache_items",
|
||||
map[string]string{})
|
||||
return found && value == 6
|
||||
}, 2*time.Second, 50*time.Millisecond, "Cache items metric should reach 6")
|
||||
|
||||
// Stop the inhibitor
|
||||
inhibitor.Stop()
|
||||
|
||||
// Global metrics (no labels) show the sum across all rules
|
||||
value, _, found := getMetricValue(t, reg, "alertmanager_inhibitor_source_alerts_cache_items",
|
||||
map[string]string{})
|
||||
require.True(t, found, "Should find global cache items metric")
|
||||
require.Equal(t, float64(6), value, "Global cache should contain 6 alerts total")
|
||||
|
||||
value, _, found = getMetricValue(t, reg, "alertmanager_inhibitor_source_alerts_index_items",
|
||||
map[string]string{})
|
||||
require.True(t, found, "Should find global index items metric")
|
||||
require.Equal(t, float64(6), value, "Global index should contain 6 entries total")
|
||||
|
||||
// Per-rule metrics show individual rule values
|
||||
value, _, found = getMetricValue(t, reg, "alertmanager_inhibit_rule_source_alerts_cache_items",
|
||||
map[string]string{"rule": "named-rule"})
|
||||
require.True(t, found, "Should find per-rule cache items metric")
|
||||
require.Equal(t, float64(3), value, "Named rule cache should contain 3 alerts")
|
||||
|
||||
value, _, found = getMetricValue(t, reg, "alertmanager_inhibit_rule_source_alerts_index_items",
|
||||
map[string]string{"rule": "named-rule"})
|
||||
require.True(t, found, "Should find per-rule index items metric")
|
||||
require.Equal(t, float64(3), value, "Named rule index should contain 3 entries")
|
||||
}
|
||||
|
||||
func TestInhibitorMetrics_Registration(t *testing.T) {
|
||||
reg := prometheus.NewRegistry()
|
||||
metrics := NewInhibitorMetrics(reg)
|
||||
|
||||
require.NotNil(t, metrics, "Metrics should be created")
|
||||
|
||||
// Create a rule and use the metrics so they appear in Gather() output
|
||||
rules := []config.InhibitRule{
|
||||
{
|
||||
Name: "test-rule",
|
||||
SourceMatchers: []*labels.Matcher{
|
||||
{Type: labels.MatchEqual, Name: "severity", Value: "critical"},
|
||||
},
|
||||
TargetMatchers: []*labels.Matcher{
|
||||
{Type: labels.MatchEqual, Name: "severity", Value: "warning"},
|
||||
},
|
||||
Equal: []string{"instance"},
|
||||
},
|
||||
}
|
||||
|
||||
marker := types.NewMarker(reg)
|
||||
inhibitor := NewInhibitor(nil, rules, marker, nopLogger, metrics)
|
||||
|
||||
// Use the metrics to ensure they show up in Gather()
|
||||
testAlert := model.LabelSet{
|
||||
"severity": "warning",
|
||||
"instance": "server1",
|
||||
}
|
||||
inhibitor.Mutes(testAlert)
|
||||
|
||||
// Verify all metrics are registered and have data
|
||||
metricFamilies, err := reg.Gather()
|
||||
require.NoError(t, err)
|
||||
|
||||
registeredMetrics := map[string]bool{
|
||||
"alertmanager_inhibitor_source_alerts_cache_items": false,
|
||||
"alertmanager_inhibitor_source_alerts_index_items": false,
|
||||
"alertmanager_inhibitor_mutes_duration_seconds": false,
|
||||
"alertmanager_inhibit_rule_source_alerts_cache_items": false,
|
||||
"alertmanager_inhibit_rule_source_alerts_index_items": false,
|
||||
"alertmanager_inhibit_rule_matches_duration_seconds": false,
|
||||
"alertmanager_inhibit_rule_mutes_duration_seconds": false,
|
||||
}
|
||||
|
||||
for _, mf := range metricFamilies {
|
||||
if _, exists := registeredMetrics[mf.GetName()]; exists {
|
||||
registeredMetrics[mf.GetName()] = true
|
||||
}
|
||||
}
|
||||
|
||||
for metricName, registered := range registeredMetrics {
|
||||
require.True(t, registered, "Metric %s should be registered", metricName)
|
||||
}
|
||||
}
|
||||
105
notify/notify.go
105
notify/notify.go
@@ -27,6 +27,10 @@ import (
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||
"github.com/prometheus/common/model"
|
||||
"go.opentelemetry.io/otel"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/codes"
|
||||
"go.opentelemetry.io/otel/trace"
|
||||
|
||||
"github.com/prometheus/alertmanager/featurecontrol"
|
||||
"github.com/prometheus/alertmanager/inhibit"
|
||||
@@ -37,6 +41,8 @@ import (
|
||||
"github.com/prometheus/alertmanager/types"
|
||||
)
|
||||
|
||||
var tracer = otel.Tracer("github.com/prometheus/alertmanager/notify")
|
||||
|
||||
// ResolvedSender returns true if resolved notifications should be sent.
|
||||
type ResolvedSender interface {
|
||||
SendResolved() bool
|
||||
@@ -81,8 +87,24 @@ func NewIntegration(notifier Notifier, rs ResolvedSender, name string, idx int,
|
||||
}
|
||||
|
||||
// Notify implements the Notifier interface.
|
||||
func (i *Integration) Notify(ctx context.Context, alerts ...*types.Alert) (bool, error) {
|
||||
return i.notifier.Notify(ctx, alerts...)
|
||||
func (i *Integration) Notify(ctx context.Context, alerts ...*types.Alert) (recoverable bool, err error) {
|
||||
ctx, span := tracer.Start(ctx, "notify.Integration.Notify",
|
||||
trace.WithAttributes(attribute.String("alerting.notify.integration.name", i.name)),
|
||||
trace.WithAttributes(attribute.Int("alerting.alerts.count", len(alerts))),
|
||||
trace.WithSpanKind(trace.SpanKindClient),
|
||||
)
|
||||
|
||||
defer func() {
|
||||
span.SetAttributes(attribute.Bool("alerting.notify.error.recoverable", recoverable))
|
||||
if err != nil {
|
||||
span.SetStatus(codes.Error, err.Error())
|
||||
span.RecordError(err)
|
||||
}
|
||||
span.End()
|
||||
}()
|
||||
|
||||
recoverable, err = i.notifier.Notify(ctx, alerts...)
|
||||
return recoverable, err
|
||||
}
|
||||
|
||||
// SendResolved implements the ResolvedSender interface.
|
||||
@@ -454,6 +476,15 @@ func (rs RoutingStage) Exec(ctx context.Context, l *slog.Logger, alerts ...*type
|
||||
return ctx, nil, errors.New("receiver missing")
|
||||
}
|
||||
|
||||
ctx, span := tracer.Start(ctx, "notify.RoutingStage.Exec",
|
||||
trace.WithAttributes(
|
||||
attribute.String("alerting.notify.receiver.name", receiver),
|
||||
attribute.Int("alerting.alerts.count", len(alerts)),
|
||||
),
|
||||
trace.WithSpanKind(trace.SpanKindInternal),
|
||||
)
|
||||
defer span.End()
|
||||
|
||||
s, ok := rs[receiver]
|
||||
if !ok {
|
||||
return ctx, nil, errors.New("stage for receiver missing")
|
||||
@@ -548,6 +579,12 @@ func NewMuteStage(m types.Muter, metrics *Metrics) *MuteStage {
|
||||
|
||||
// Exec implements the Stage interface.
|
||||
func (n *MuteStage) Exec(ctx context.Context, logger *slog.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
|
||||
ctx, span := tracer.Start(ctx, "notify.MuteStage.Exec",
|
||||
trace.WithAttributes(attribute.Int("alerting.alerts.count", len(alerts))),
|
||||
trace.WithSpanKind(trace.SpanKindInternal),
|
||||
)
|
||||
defer span.End()
|
||||
|
||||
var (
|
||||
filtered []*types.Alert
|
||||
muted []*types.Alert
|
||||
@@ -555,7 +592,7 @@ func (n *MuteStage) Exec(ctx context.Context, logger *slog.Logger, alerts ...*ty
|
||||
for _, a := range alerts {
|
||||
// TODO(fabxc): increment total alerts counter.
|
||||
// Do not send the alert if muted.
|
||||
if n.muter.Mutes(a.Labels) {
|
||||
if n.muter.Mutes(ctx, a.Labels) {
|
||||
muted = append(muted, a)
|
||||
} else {
|
||||
filtered = append(filtered, a)
|
||||
@@ -572,6 +609,11 @@ func (n *MuteStage) Exec(ctx context.Context, logger *slog.Logger, alerts ...*ty
|
||||
reason = SuppressedReasonInhibition
|
||||
default:
|
||||
}
|
||||
span.SetAttributes(
|
||||
attribute.Int("alerting.alerts.muted.count", len(muted)),
|
||||
attribute.Int("alerting.alerts.filtered.count", len(filtered)),
|
||||
attribute.String("alerting.suppressed.reason", reason),
|
||||
)
|
||||
n.metrics.numNotificationSuppressedTotal.WithLabelValues(reason).Add(float64(len(muted)))
|
||||
logger.Debug("Notifications will not be sent for muted alerts", "alerts", fmt.Sprintf("%v", muted), "reason", reason)
|
||||
}
|
||||
@@ -700,6 +742,13 @@ func (n *DedupStage) Exec(ctx context.Context, _ *slog.Logger, alerts ...*types.
|
||||
return ctx, nil, errors.New("group key missing")
|
||||
}
|
||||
|
||||
ctx, span := tracer.Start(ctx, "notify.DedupStage.Exec",
|
||||
trace.WithAttributes(attribute.String("alerting.group.key", gkey)),
|
||||
trace.WithAttributes(attribute.Int("alerting.alerts.count", len(alerts))),
|
||||
trace.WithSpanKind(trace.SpanKindInternal),
|
||||
)
|
||||
defer span.End()
|
||||
|
||||
repeatInterval, ok := RepeatInterval(ctx)
|
||||
if !ok {
|
||||
return ctx, nil, errors.New("repeat interval missing")
|
||||
@@ -740,6 +789,7 @@ func (n *DedupStage) Exec(ctx context.Context, _ *slog.Logger, alerts ...*types.
|
||||
}
|
||||
|
||||
if n.needsUpdate(entry, firingSet, resolvedSet, repeatInterval) {
|
||||
span.AddEvent("notify.DedupStage.Exec nflog needs update")
|
||||
return ctx, alerts, nil
|
||||
}
|
||||
return ctx, nil, nil
|
||||
@@ -772,10 +822,23 @@ func NewRetryStage(i Integration, groupName string, metrics *Metrics) *RetryStag
|
||||
|
||||
func (r RetryStage) Exec(ctx context.Context, l *slog.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
|
||||
r.metrics.numNotifications.WithLabelValues(r.labelValues...).Inc()
|
||||
|
||||
ctx, span := tracer.Start(ctx, "notify.RetryStage.Exec",
|
||||
trace.WithAttributes(attribute.String("alerting.group.name", r.groupName)),
|
||||
trace.WithAttributes(attribute.String("alerting.integration.name", r.integration.name)),
|
||||
trace.WithAttributes(attribute.StringSlice("alerting.label.values", r.labelValues)),
|
||||
trace.WithAttributes(attribute.Int("alerting.alerts.count", len(alerts))),
|
||||
trace.WithSpanKind(trace.SpanKindInternal),
|
||||
)
|
||||
defer span.End()
|
||||
|
||||
ctx, alerts, err := r.exec(ctx, l, alerts...)
|
||||
|
||||
failureReason := DefaultReason.String()
|
||||
if err != nil {
|
||||
span.SetStatus(codes.Error, err.Error())
|
||||
span.RecordError(err)
|
||||
|
||||
var e *ErrorWithReason
|
||||
if errors.As(err, &e) {
|
||||
failureReason = e.Reason.String()
|
||||
@@ -917,6 +980,13 @@ func (n SetNotifiesStage) Exec(ctx context.Context, l *slog.Logger, alerts ...*t
|
||||
return ctx, nil, errors.New("group key missing")
|
||||
}
|
||||
|
||||
ctx, span := tracer.Start(ctx, "notify.SetNotifiesStage.Exec",
|
||||
trace.WithAttributes(attribute.String("alerting.group.key", gkey)),
|
||||
trace.WithAttributes(attribute.Int("alerting.alerts.count", len(alerts))),
|
||||
trace.WithSpanKind(trace.SpanKindInternal),
|
||||
)
|
||||
defer span.End()
|
||||
|
||||
firing, ok := FiringAlerts(ctx)
|
||||
if !ok {
|
||||
return ctx, nil, errors.New("firing alerts missing")
|
||||
@@ -933,6 +1003,11 @@ func (n SetNotifiesStage) Exec(ctx context.Context, l *slog.Logger, alerts ...*t
|
||||
}
|
||||
expiry := 2 * repeat
|
||||
|
||||
span.SetAttributes(
|
||||
attribute.Int("alerting.alerts.firing.count", len(firing)),
|
||||
attribute.Int("alerting.alerts.resolved.count", len(resolved)),
|
||||
)
|
||||
|
||||
return ctx, alerts, n.nflog.Log(n.recv, gkey, firing, resolved, expiry)
|
||||
}
|
||||
|
||||
@@ -951,15 +1026,26 @@ func NewTimeMuteStage(muter types.TimeMuter, marker types.GroupMarker, metrics *
|
||||
// Exec implements the stage interface for TimeMuteStage.
|
||||
// TimeMuteStage is responsible for muting alerts whose route is not in an active time.
|
||||
func (tms TimeMuteStage) Exec(ctx context.Context, l *slog.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
|
||||
ctx, span := tracer.Start(ctx, "notify.TimeMuteStage.Exec",
|
||||
trace.WithAttributes(attribute.Int("alerting.alerts.count", len(alerts))),
|
||||
trace.WithSpanKind(trace.SpanKindInternal),
|
||||
)
|
||||
defer span.End()
|
||||
|
||||
routeID, ok := RouteID(ctx)
|
||||
if !ok {
|
||||
return ctx, nil, errors.New("route ID missing")
|
||||
err := errors.New("route ID missing")
|
||||
span.SetStatus(codes.Error, err.Error())
|
||||
span.RecordError(err)
|
||||
return ctx, nil, err
|
||||
}
|
||||
span.SetAttributes(attribute.String("alerting.route.id", routeID))
|
||||
|
||||
gkey, ok := GroupKey(ctx)
|
||||
if !ok {
|
||||
return ctx, nil, errors.New("group key missing")
|
||||
}
|
||||
span.SetAttributes(attribute.String("alerting.group.key", gkey))
|
||||
|
||||
muteTimeIntervalNames, ok := MuteTimeIntervalNames(ctx)
|
||||
if !ok {
|
||||
@@ -977,6 +1063,8 @@ func (tms TimeMuteStage) Exec(ctx context.Context, l *slog.Logger, alerts ...*ty
|
||||
|
||||
muted, mutedBy, err := tms.muter.Mutes(muteTimeIntervalNames, now)
|
||||
if err != nil {
|
||||
span.SetStatus(codes.Error, err.Error())
|
||||
span.RecordError(err)
|
||||
return ctx, alerts, err
|
||||
}
|
||||
// If muted is false then mutedBy is nil and the muted marker is removed.
|
||||
@@ -986,6 +1074,7 @@ func (tms TimeMuteStage) Exec(ctx context.Context, l *slog.Logger, alerts ...*ty
|
||||
if muted {
|
||||
tms.metrics.numNotificationSuppressedTotal.WithLabelValues(SuppressedReasonMuteTimeInterval).Add(float64(len(alerts)))
|
||||
l.Debug("Notifications not sent, route is within mute time", "alerts", len(alerts))
|
||||
span.AddEvent("notify.TimeMuteStage.Exec muted the alerts")
|
||||
return ctx, nil, nil
|
||||
}
|
||||
|
||||
@@ -1006,6 +1095,13 @@ func (tas TimeActiveStage) Exec(ctx context.Context, l *slog.Logger, alerts ...*
|
||||
return ctx, nil, errors.New("route ID missing")
|
||||
}
|
||||
|
||||
ctx, span := tracer.Start(ctx, "notify.TimeActiveStage.Exec",
|
||||
trace.WithAttributes(attribute.String("alerting.route.id", routeID)),
|
||||
trace.WithAttributes(attribute.Int("alerting.alerts.count", len(alerts))),
|
||||
trace.WithSpanKind(trace.SpanKindInternal),
|
||||
)
|
||||
defer span.End()
|
||||
|
||||
gkey, ok := GroupKey(ctx)
|
||||
if !ok {
|
||||
return ctx, nil, errors.New("group key missing")
|
||||
@@ -1042,6 +1138,7 @@ func (tas TimeActiveStage) Exec(ctx context.Context, l *slog.Logger, alerts ...*
|
||||
|
||||
// If the current time is not inside an active time, all alerts are removed from the pipeline
|
||||
if !active {
|
||||
span.AddEvent("notify.TimeActiveStage.Exec not active, removing all alerts")
|
||||
tas.metrics.numNotificationSuppressedTotal.WithLabelValues(SuppressedReasonActiveTimeInterval).Add(float64(len(alerts)))
|
||||
l.Debug("Notifications not sent, route is not within active time", "alerts", len(alerts))
|
||||
return ctx, nil, nil
|
||||
|
||||
@@ -664,7 +664,7 @@ func TestSetNotifiesStage(t *testing.T) {
|
||||
|
||||
func TestMuteStage(t *testing.T) {
|
||||
// Mute all label sets that have a "mute" key.
|
||||
muter := types.MuteFunc(func(lset model.LabelSet) bool {
|
||||
muter := types.MuteFunc(func(ctx context.Context, lset model.LabelSet) bool {
|
||||
_, ok := lset["mute"]
|
||||
return ok
|
||||
})
|
||||
@@ -724,7 +724,7 @@ func TestMuteStageWithSilences(t *testing.T) {
|
||||
EndsAt: utcNow().Add(time.Hour),
|
||||
Matchers: []*silencepb.Matcher{{Name: "mute", Pattern: "me"}},
|
||||
}
|
||||
if err = silences.Set(sil); err != nil {
|
||||
if err = silences.Set(t.Context(), sil); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
@@ -801,11 +801,11 @@ func TestMuteStageWithSilences(t *testing.T) {
|
||||
}
|
||||
|
||||
// Expire the silence and verify that no alerts are silenced now.
|
||||
if err := silences.Expire(sil.Id); err != nil {
|
||||
if err := silences.Expire(t.Context(), sil.Id); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
_, alerts, err = stage.Exec(context.Background(), promslog.NewNopLogger(), inAlerts...)
|
||||
_, alerts, err = stage.Exec(t.Context(), promslog.NewNopLogger(), inAlerts...)
|
||||
if err != nil {
|
||||
t.Fatalf("Exec failed: %s", err)
|
||||
}
|
||||
|
||||
@@ -28,6 +28,7 @@ import (
|
||||
"github.com/prometheus/common/version"
|
||||
|
||||
"github.com/prometheus/alertmanager/template"
|
||||
"github.com/prometheus/alertmanager/tracing"
|
||||
"github.com/prometheus/alertmanager/types"
|
||||
)
|
||||
|
||||
@@ -75,6 +76,10 @@ func request(ctx context.Context, client *http.Client, method, url, bodyType str
|
||||
if bodyType != "" {
|
||||
req.Header.Set("Content-Type", bodyType)
|
||||
}
|
||||
|
||||
// Inject trancing transport
|
||||
client.Transport = tracing.Transport(client.Transport)
|
||||
|
||||
return client.Do(req.WithContext(ctx))
|
||||
}
|
||||
|
||||
|
||||
@@ -22,6 +22,10 @@ import (
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||
"github.com/prometheus/common/model"
|
||||
"go.opentelemetry.io/otel"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/propagation"
|
||||
"go.opentelemetry.io/otel/trace"
|
||||
|
||||
"github.com/prometheus/alertmanager/provider"
|
||||
"github.com/prometheus/alertmanager/store"
|
||||
@@ -30,6 +34,8 @@ import (
|
||||
|
||||
const alertChannelLength = 200
|
||||
|
||||
var tracer = otel.Tracer("github.com/prometheus/alertmanager/provider/mem")
|
||||
|
||||
// Alerts gives access to a set of alerts. All methods are goroutine-safe.
|
||||
type Alerts struct {
|
||||
cancel context.CancelFunc
|
||||
@@ -44,7 +50,8 @@ type Alerts struct {
|
||||
|
||||
callback AlertStoreCallback
|
||||
|
||||
logger *slog.Logger
|
||||
logger *slog.Logger
|
||||
propagator propagation.TextMapPropagator
|
||||
|
||||
subscriberChannelWrites *prometheus.CounterVec
|
||||
}
|
||||
@@ -65,7 +72,7 @@ type AlertStoreCallback interface {
|
||||
|
||||
type listeningAlerts struct {
|
||||
name string
|
||||
alerts chan *types.Alert
|
||||
alerts chan *provider.Alert
|
||||
done chan struct{}
|
||||
}
|
||||
|
||||
@@ -104,13 +111,14 @@ func NewAlerts(ctx context.Context, m types.AlertMarker, intervalGC time.Duratio
|
||||
|
||||
ctx, cancel := context.WithCancel(ctx)
|
||||
a := &Alerts{
|
||||
marker: m,
|
||||
alerts: store.NewAlerts(),
|
||||
cancel: cancel,
|
||||
listeners: map[int]listeningAlerts{},
|
||||
next: 0,
|
||||
logger: l.With("component", "provider"),
|
||||
callback: alertCallback,
|
||||
marker: m,
|
||||
alerts: store.NewAlerts(),
|
||||
cancel: cancel,
|
||||
listeners: map[int]listeningAlerts{},
|
||||
next: 0,
|
||||
logger: l.With("component", "provider"),
|
||||
propagator: otel.GetTextMapPropagator(),
|
||||
callback: alertCallback,
|
||||
}
|
||||
|
||||
if r != nil {
|
||||
@@ -175,11 +183,14 @@ func (a *Alerts) Subscribe(name string) provider.AlertIterator {
|
||||
var (
|
||||
done = make(chan struct{})
|
||||
alerts = a.alerts.List()
|
||||
ch = make(chan *types.Alert, max(len(alerts), alertChannelLength))
|
||||
ch = make(chan *provider.Alert, max(len(alerts), alertChannelLength))
|
||||
)
|
||||
|
||||
for _, a := range alerts {
|
||||
ch <- a
|
||||
ch <- &provider.Alert{
|
||||
Header: map[string]string{},
|
||||
Data: a,
|
||||
}
|
||||
}
|
||||
|
||||
a.listeners[a.next] = listeningAlerts{name: name, alerts: ch, done: done}
|
||||
@@ -195,7 +206,7 @@ func (a *Alerts) SlurpAndSubscribe(name string) ([]*types.Alert, provider.AlertI
|
||||
var (
|
||||
done = make(chan struct{})
|
||||
alerts = a.alerts.List()
|
||||
ch = make(chan *types.Alert, alertChannelLength)
|
||||
ch = make(chan *provider.Alert, alertChannelLength)
|
||||
)
|
||||
|
||||
a.listeners[a.next] = listeningAlerts{name: name, alerts: ch, done: done}
|
||||
@@ -208,7 +219,7 @@ func (a *Alerts) SlurpAndSubscribe(name string) ([]*types.Alert, provider.AlertI
|
||||
// pending notifications.
|
||||
func (a *Alerts) GetPending() provider.AlertIterator {
|
||||
var (
|
||||
ch = make(chan *types.Alert, alertChannelLength)
|
||||
ch = make(chan *provider.Alert, alertChannelLength)
|
||||
done = make(chan struct{})
|
||||
)
|
||||
a.mtx.Lock()
|
||||
@@ -219,7 +230,10 @@ func (a *Alerts) GetPending() provider.AlertIterator {
|
||||
defer close(ch)
|
||||
for _, a := range alerts {
|
||||
select {
|
||||
case ch <- a:
|
||||
case ch <- &provider.Alert{
|
||||
Header: map[string]string{},
|
||||
Data: a,
|
||||
}:
|
||||
case <-done:
|
||||
return
|
||||
}
|
||||
@@ -237,10 +251,18 @@ func (a *Alerts) Get(fp model.Fingerprint) (*types.Alert, error) {
|
||||
}
|
||||
|
||||
// Put adds the given alert to the set.
|
||||
func (a *Alerts) Put(alerts ...*types.Alert) error {
|
||||
func (a *Alerts) Put(ctx context.Context, alerts ...*types.Alert) error {
|
||||
a.mtx.Lock()
|
||||
defer a.mtx.Unlock()
|
||||
|
||||
ctx, span := tracer.Start(ctx, "provider.mem.Put",
|
||||
trace.WithAttributes(
|
||||
attribute.Int("alerting.alerts.count", len(alerts)),
|
||||
),
|
||||
trace.WithSpanKind(trace.SpanKindProducer),
|
||||
)
|
||||
defer span.End()
|
||||
|
||||
for _, alert := range alerts {
|
||||
fp := alert.Fingerprint()
|
||||
|
||||
@@ -270,9 +292,16 @@ func (a *Alerts) Put(alerts ...*types.Alert) error {
|
||||
|
||||
a.callback.PostStore(alert, existing)
|
||||
|
||||
metadata := map[string]string{}
|
||||
a.propagator.Inject(ctx, propagation.MapCarrier(metadata))
|
||||
msg := &provider.Alert{
|
||||
Data: alert,
|
||||
Header: metadata,
|
||||
}
|
||||
|
||||
for _, l := range a.listeners {
|
||||
select {
|
||||
case l.alerts <- alert:
|
||||
case l.alerts <- msg:
|
||||
a.subscriberChannelWrites.WithLabelValues(l.name).Inc()
|
||||
case <-l.done:
|
||||
}
|
||||
|
||||
@@ -109,7 +109,7 @@ func TestAlertsSubscribePutStarvation(t *testing.T) {
|
||||
putIsDone := make(chan struct{})
|
||||
putsErr := make(chan error, 1)
|
||||
go func() {
|
||||
if err := alerts.Put(alertsToInsert...); err != nil {
|
||||
if err := alerts.Put(context.Background(), alertsToInsert...); err != nil {
|
||||
putsErr <- err
|
||||
return
|
||||
}
|
||||
@@ -157,7 +157,7 @@ func TestDeadLock(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
if err := alerts.Put(alertsToInsert...); err != nil {
|
||||
if err := alerts.Put(context.Background(), alertsToInsert...); err != nil {
|
||||
t.Fatal("Unable to add alerts")
|
||||
}
|
||||
done := make(chan bool)
|
||||
@@ -197,7 +197,7 @@ func TestAlertsPut(t *testing.T) {
|
||||
|
||||
insert := []*types.Alert{alert1, alert2, alert3}
|
||||
|
||||
if err := alerts.Put(insert...); err != nil {
|
||||
if err := alerts.Put(context.Background(), insert...); err != nil {
|
||||
t.Fatalf("Insert failed: %s", err)
|
||||
}
|
||||
|
||||
@@ -220,7 +220,7 @@ func TestAlertsSubscribe(t *testing.T) {
|
||||
}
|
||||
|
||||
// Add alert1 to validate if pending alerts will be sent.
|
||||
if err := alerts.Put(alert1); err != nil {
|
||||
if err := alerts.Put(ctx, alert1); err != nil {
|
||||
t.Fatalf("Insert failed: %s", err)
|
||||
}
|
||||
|
||||
@@ -256,12 +256,12 @@ func TestAlertsSubscribe(t *testing.T) {
|
||||
fatalc <- fmt.Sprintf("Iterator %d: %v", i, it.Err())
|
||||
return
|
||||
}
|
||||
expected := expectedAlerts[got.Fingerprint()]
|
||||
if err := alertDiff(got, expected); err != nil {
|
||||
expected := expectedAlerts[got.Data.Fingerprint()]
|
||||
if err := alertDiff(got.Data, expected); err != nil {
|
||||
fatalc <- fmt.Sprintf("Unexpected alert (iterator %d)\n%s", i, err.Error())
|
||||
return
|
||||
}
|
||||
received[got.Fingerprint()] = struct{}{}
|
||||
received[got.Data.Fingerprint()] = struct{}{}
|
||||
if len(received) == len(expectedAlerts) {
|
||||
return
|
||||
}
|
||||
@@ -274,10 +274,10 @@ func TestAlertsSubscribe(t *testing.T) {
|
||||
}
|
||||
|
||||
// Add more alerts that should be received by the subscribers.
|
||||
if err := alerts.Put(alert2); err != nil {
|
||||
if err := alerts.Put(ctx, alert2); err != nil {
|
||||
t.Fatalf("Insert failed: %s", err)
|
||||
}
|
||||
if err := alerts.Put(alert3); err != nil {
|
||||
if err := alerts.Put(ctx, alert3); err != nil {
|
||||
t.Fatalf("Insert failed: %s", err)
|
||||
}
|
||||
|
||||
@@ -296,7 +296,8 @@ func TestAlertsGetPending(t *testing.T) {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := alerts.Put(alert1, alert2); err != nil {
|
||||
ctx := context.Background()
|
||||
if err := alerts.Put(ctx, alert1, alert2); err != nil {
|
||||
t.Fatalf("Insert failed: %s", err)
|
||||
}
|
||||
|
||||
@@ -306,11 +307,11 @@ func TestAlertsGetPending(t *testing.T) {
|
||||
}
|
||||
iterator := alerts.GetPending()
|
||||
for actual := range iterator.Next() {
|
||||
expected := expectedAlerts[actual.Fingerprint()]
|
||||
require.NoError(t, alertDiff(actual, expected))
|
||||
expected := expectedAlerts[actual.Data.Fingerprint()]
|
||||
require.NoError(t, alertDiff(actual.Data, expected))
|
||||
}
|
||||
|
||||
if err := alerts.Put(alert3); err != nil {
|
||||
if err := alerts.Put(ctx, alert3); err != nil {
|
||||
t.Fatalf("Insert failed: %s", err)
|
||||
}
|
||||
|
||||
@@ -321,8 +322,8 @@ func TestAlertsGetPending(t *testing.T) {
|
||||
}
|
||||
iterator = alerts.GetPending()
|
||||
for actual := range iterator.Next() {
|
||||
expected := expectedAlerts[actual.Fingerprint()]
|
||||
require.NoError(t, alertDiff(actual, expected))
|
||||
expected := expectedAlerts[actual.Data.Fingerprint()]
|
||||
require.NoError(t, alertDiff(actual.Data, expected))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -335,7 +336,7 @@ func TestAlertsGC(t *testing.T) {
|
||||
|
||||
insert := []*types.Alert{alert1, alert2, alert3}
|
||||
|
||||
if err := alerts.Put(insert...); err != nil {
|
||||
if err := alerts.Put(context.Background(), insert...); err != nil {
|
||||
t.Fatalf("Insert failed: %s", err)
|
||||
}
|
||||
|
||||
@@ -370,7 +371,8 @@ func TestAlertsStoreCallback(t *testing.T) {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
err = alerts.Put(alert1, alert2, alert3)
|
||||
ctx := context.Background()
|
||||
err = alerts.Put(ctx, alert1, alert2, alert3)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -393,7 +395,7 @@ func TestAlertsStoreCallback(t *testing.T) {
|
||||
Timeout: false,
|
||||
}
|
||||
|
||||
err = alerts.Put(&alert1Mod, alert4)
|
||||
err = alerts.Put(ctx, &alert1Mod, alert4)
|
||||
// Verify that we failed to put new alert into store (not reported via error, only checked using Load)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error %v", err)
|
||||
@@ -417,7 +419,7 @@ func TestAlertsStoreCallback(t *testing.T) {
|
||||
t.Fatalf("unexpected number of alerts in the store, expected %v, got %v", 0, num)
|
||||
}
|
||||
|
||||
err = alerts.Put(alert4)
|
||||
err = alerts.Put(ctx, alert4)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@@ -458,7 +460,8 @@ func TestAlerts_Count(t *testing.T) {
|
||||
Timeout: false,
|
||||
}
|
||||
|
||||
alerts.Put(a1)
|
||||
ctx := context.Background()
|
||||
alerts.Put(ctx, a1)
|
||||
require.Equal(t, 1, countByState(types.AlertStateUnprocessed))
|
||||
require.Equal(t, 1, countTotal())
|
||||
require.Eventually(t, func() bool {
|
||||
@@ -480,7 +483,7 @@ func TestAlerts_Count(t *testing.T) {
|
||||
}
|
||||
|
||||
// When insert an alert, and then silence it. It shows up with the correct filter.
|
||||
alerts.Put(a2)
|
||||
alerts.Put(ctx, a2)
|
||||
marker.SetActiveOrSilenced(a2.Fingerprint(), 1, []string{"1"}, nil)
|
||||
require.Equal(t, 1, countByState(types.AlertStateSuppressed))
|
||||
require.Equal(t, 1, countTotal())
|
||||
@@ -575,7 +578,7 @@ func TestAlertsConcurrently(t *testing.T) {
|
||||
default:
|
||||
}
|
||||
now := time.Now()
|
||||
err := a.Put(&types.Alert{
|
||||
err := a.Put(context.Background(), &types.Alert{
|
||||
Alert: model.Alert{
|
||||
Labels: model.LabelSet{"bar": model.LabelValue(strconv.Itoa(j))},
|
||||
StartsAt: now,
|
||||
@@ -685,7 +688,7 @@ func TestSubscriberChannelMetrics(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
err = alerts.Put(alertsToSend...)
|
||||
err = alerts.Put(context.Background(), alertsToSend...)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Verify the counter incremented for each successful write
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
package provider
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"github.com/prometheus/common/model"
|
||||
@@ -24,6 +25,12 @@ import (
|
||||
// ErrNotFound is returned if a provider cannot find a requested item.
|
||||
var ErrNotFound = fmt.Errorf("item not found")
|
||||
|
||||
type Alert struct {
|
||||
// Header contains metadata, for example propagated tracing information.
|
||||
Header map[string]string
|
||||
Data *types.Alert
|
||||
}
|
||||
|
||||
// Iterator provides the functions common to all iterators. To be useful, a
|
||||
// specific iterator interface (e.g. AlertIterator) has to be implemented that
|
||||
// provides a Next method.
|
||||
@@ -44,11 +51,11 @@ type AlertIterator interface {
|
||||
// exhausted. It is not necessary to exhaust the iterator but Close must
|
||||
// be called in any case to release resources used by the iterator (even
|
||||
// if the iterator is exhausted).
|
||||
Next() <-chan *types.Alert
|
||||
Next() <-chan *Alert
|
||||
}
|
||||
|
||||
// NewAlertIterator returns a new AlertIterator based on the generic alertIterator type.
|
||||
func NewAlertIterator(ch <-chan *types.Alert, done chan struct{}, err error) AlertIterator {
|
||||
func NewAlertIterator(ch <-chan *Alert, done chan struct{}, err error) AlertIterator {
|
||||
return &alertIterator{
|
||||
ch: ch,
|
||||
done: done,
|
||||
@@ -58,12 +65,12 @@ func NewAlertIterator(ch <-chan *types.Alert, done chan struct{}, err error) Ale
|
||||
|
||||
// alertIterator implements AlertIterator. So far, this one fits all providers.
|
||||
type alertIterator struct {
|
||||
ch <-chan *types.Alert
|
||||
ch <-chan *Alert
|
||||
done chan struct{}
|
||||
err error
|
||||
}
|
||||
|
||||
func (ai alertIterator) Next() <-chan *types.Alert {
|
||||
func (ai alertIterator) Next() <-chan *Alert {
|
||||
return ai.ch
|
||||
}
|
||||
|
||||
@@ -94,5 +101,5 @@ type Alerts interface {
|
||||
// Get returns the alert for a given fingerprint.
|
||||
Get(model.Fingerprint) (*types.Alert, error)
|
||||
// Put adds the given set of alerts to the set.
|
||||
Put(...*types.Alert) error
|
||||
Put(ctx context.Context, alerts ...*types.Alert) error
|
||||
}
|
||||
|
||||
@@ -17,6 +17,7 @@ package silence
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
@@ -37,6 +38,10 @@ import (
|
||||
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||
"github.com/prometheus/common/model"
|
||||
"github.com/prometheus/common/promslog"
|
||||
"go.opentelemetry.io/otel"
|
||||
"go.opentelemetry.io/otel/attribute"
|
||||
"go.opentelemetry.io/otel/codes"
|
||||
"go.opentelemetry.io/otel/trace"
|
||||
|
||||
"github.com/prometheus/alertmanager/cluster"
|
||||
"github.com/prometheus/alertmanager/matcher/compat"
|
||||
@@ -45,6 +50,8 @@ import (
|
||||
"github.com/prometheus/alertmanager/types"
|
||||
)
|
||||
|
||||
var tracer = otel.Tracer("github.com/prometheus/alertmanager/silence")
|
||||
|
||||
// ErrNotFound is returned if a silence was not found.
|
||||
var ErrNotFound = errors.New("silence not found")
|
||||
|
||||
@@ -143,10 +150,18 @@ func NewSilencer(s *Silences, m types.AlertMarker, l *slog.Logger) *Silencer {
|
||||
}
|
||||
|
||||
// Mutes implements the Muter interface.
|
||||
func (s *Silencer) Mutes(lset model.LabelSet) bool {
|
||||
func (s *Silencer) Mutes(ctx context.Context, lset model.LabelSet) bool {
|
||||
fp := lset.Fingerprint()
|
||||
activeIDs, pendingIDs, markerVersion, _ := s.marker.Silenced(fp)
|
||||
|
||||
ctx, span := tracer.Start(ctx, "silence.Silencer.Mutes",
|
||||
trace.WithAttributes(
|
||||
attribute.String("alerting.alert.fingerprint", fp.String()),
|
||||
),
|
||||
trace.WithSpanKind(trace.SpanKindInternal),
|
||||
)
|
||||
defer span.End()
|
||||
|
||||
var (
|
||||
oldSils []*pb.Silence
|
||||
newSils []*pb.Silence
|
||||
@@ -158,6 +173,11 @@ func (s *Silencer) Mutes(lset model.LabelSet) bool {
|
||||
if markerIsUpToDate && totalMarkerSilences == 0 {
|
||||
// Very fast path: no new silences have been added and this lset was not
|
||||
// silenced last time we checked.
|
||||
span.AddEvent("No new silences to match since last check",
|
||||
trace.WithAttributes(
|
||||
attribute.Int("alerting.silences.count", totalMarkerSilences),
|
||||
),
|
||||
)
|
||||
return false
|
||||
}
|
||||
// Either there are new silences and we need to check if those match lset or there were
|
||||
@@ -170,10 +190,13 @@ func (s *Silencer) Mutes(lset model.LabelSet) bool {
|
||||
var err error
|
||||
allIDs := append(append(make([]string, 0, totalMarkerSilences), activeIDs...), pendingIDs...)
|
||||
oldSils, _, err = s.silences.Query(
|
||||
ctx,
|
||||
QIDs(allIDs...),
|
||||
QState(types.SilenceStateActive, types.SilenceStatePending),
|
||||
)
|
||||
if err != nil {
|
||||
span.SetStatus(codes.Error, err.Error())
|
||||
span.RecordError(err)
|
||||
s.logger.Error(
|
||||
"Querying old silences failed, alerts might not get silenced correctly",
|
||||
"err", err,
|
||||
@@ -188,11 +211,14 @@ func (s *Silencer) Mutes(lset model.LabelSet) bool {
|
||||
// newer than markerVersion.
|
||||
var err error
|
||||
newSils, newVersion, err = s.silences.Query(
|
||||
ctx,
|
||||
QSince(markerVersion),
|
||||
QState(types.SilenceStateActive, types.SilenceStatePending),
|
||||
QMatches(lset),
|
||||
)
|
||||
if err != nil {
|
||||
span.SetStatus(codes.Error, err.Error())
|
||||
span.RecordError(err)
|
||||
s.logger.Error(
|
||||
"Querying silences failed, alerts might not get silenced correctly",
|
||||
"err", err,
|
||||
@@ -207,6 +233,9 @@ func (s *Silencer) Mutes(lset model.LabelSet) bool {
|
||||
if totalSilences == 0 {
|
||||
// Easy case, neither active nor pending silences anymore.
|
||||
s.marker.SetActiveOrSilenced(fp, newVersion, nil, nil)
|
||||
span.AddEvent("No silences to match", trace.WithAttributes(
|
||||
attribute.Int("alerting.silences.count", totalSilences),
|
||||
))
|
||||
return false
|
||||
}
|
||||
|
||||
@@ -242,8 +271,12 @@ func (s *Silencer) Mutes(lset model.LabelSet) bool {
|
||||
sort.Strings(pendingIDs)
|
||||
|
||||
s.marker.SetActiveOrSilenced(fp, newVersion, activeIDs, pendingIDs)
|
||||
|
||||
return len(activeIDs) > 0
|
||||
mutes := len(activeIDs) > 0
|
||||
span.AddEvent("Silencer mutes alert", trace.WithAttributes(
|
||||
attribute.Int("alerting.silences.active.count", len(activeIDs)),
|
||||
attribute.Int("alerting.silences.pending.count", len(pendingIDs)),
|
||||
))
|
||||
return mutes
|
||||
}
|
||||
|
||||
// Silences holds a silence state that can be modified, queried, and snapshot.
|
||||
@@ -308,7 +341,7 @@ func newSilenceMetricByState(s *Silences, st types.SilenceState) prometheus.Gaug
|
||||
ConstLabels: prometheus.Labels{"state": string(st)},
|
||||
},
|
||||
func() float64 {
|
||||
count, err := s.CountState(st)
|
||||
count, err := s.CountState(context.Background(), st)
|
||||
if err != nil {
|
||||
s.logger.Error("Counting silences failed", "err", err)
|
||||
}
|
||||
@@ -739,7 +772,10 @@ func (s *Silences) setSilence(msil *pb.MeshSilence, now time.Time) error {
|
||||
|
||||
// Set the specified silence. If a silence with the ID already exists and the modification
|
||||
// modifies history, the old silence gets expired and a new one is created.
|
||||
func (s *Silences) Set(sil *pb.Silence) error {
|
||||
func (s *Silences) Set(ctx context.Context, sil *pb.Silence) error {
|
||||
_, span := tracer.Start(ctx, "silences.Set")
|
||||
defer span.End()
|
||||
|
||||
now := s.nowUTC()
|
||||
if sil.StartsAt.IsZero() {
|
||||
sil.StartsAt = now
|
||||
@@ -830,9 +866,15 @@ func canUpdate(a, b *pb.Silence, now time.Time) bool {
|
||||
}
|
||||
|
||||
// Expire the silence with the given ID immediately.
|
||||
func (s *Silences) Expire(id string) error {
|
||||
func (s *Silences) Expire(ctx context.Context, id string) error {
|
||||
s.mtx.Lock()
|
||||
defer s.mtx.Unlock()
|
||||
|
||||
_, span := tracer.Start(ctx, "silences.Expire", trace.WithAttributes(
|
||||
attribute.String("alerting.silence.id", id),
|
||||
))
|
||||
defer span.End()
|
||||
|
||||
return s.expire(id)
|
||||
}
|
||||
|
||||
@@ -945,8 +987,12 @@ func QState(states ...types.SilenceState) QueryParam {
|
||||
|
||||
// QueryOne queries with the given parameters and returns the first result.
|
||||
// Returns ErrNotFound if the query result is empty.
|
||||
func (s *Silences) QueryOne(params ...QueryParam) (*pb.Silence, error) {
|
||||
res, _, err := s.Query(params...)
|
||||
func (s *Silences) QueryOne(ctx context.Context, params ...QueryParam) (*pb.Silence, error) {
|
||||
_, span := tracer.Start(ctx, "inhibit.Silences.QueryOne",
|
||||
trace.WithSpanKind(trace.SpanKindInternal),
|
||||
)
|
||||
defer span.End()
|
||||
res, _, err := s.Query(ctx, params...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -958,7 +1004,11 @@ func (s *Silences) QueryOne(params ...QueryParam) (*pb.Silence, error) {
|
||||
|
||||
// Query for silences based on the given query parameters. It returns the
|
||||
// resulting silences and the state version the result is based on.
|
||||
func (s *Silences) Query(params ...QueryParam) ([]*pb.Silence, int, error) {
|
||||
func (s *Silences) Query(ctx context.Context, params ...QueryParam) ([]*pb.Silence, int, error) {
|
||||
_, span := tracer.Start(ctx, "inhibit.Silences.Query",
|
||||
trace.WithSpanKind(trace.SpanKindInternal),
|
||||
)
|
||||
defer span.End()
|
||||
s.metrics.queriesTotal.Inc()
|
||||
defer prometheus.NewTimer(s.metrics.queryDuration).ObserveDuration()
|
||||
|
||||
@@ -984,9 +1034,13 @@ func (s *Silences) Version() int {
|
||||
}
|
||||
|
||||
// CountState counts silences by state.
|
||||
func (s *Silences) CountState(states ...types.SilenceState) (int, error) {
|
||||
func (s *Silences) CountState(ctx context.Context, states ...types.SilenceState) (int, error) {
|
||||
_, span := tracer.Start(ctx, "inhibit.Silences.CountState",
|
||||
trace.WithSpanKind(trace.SpanKindInternal),
|
||||
)
|
||||
defer span.End()
|
||||
// This could probably be optimized.
|
||||
sils, _, err := s.Query(QState(states...))
|
||||
sils, _, err := s.Query(ctx, QState(states...))
|
||||
if err != nil {
|
||||
return -1, err
|
||||
}
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
package silence
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math/rand"
|
||||
"strconv"
|
||||
@@ -108,14 +109,14 @@ func benchmarkMutes(b *testing.B, totalSilences, matchingSilences int) {
|
||||
EndsAt: now.Add(time.Minute),
|
||||
}
|
||||
}
|
||||
require.NoError(b, silences.Set(s))
|
||||
require.NoError(b, silences.Set(b.Context(), s))
|
||||
}
|
||||
|
||||
m := types.NewMarker(prometheus.NewRegistry())
|
||||
s := NewSilencer(silences, m, promslog.NewNopLogger())
|
||||
|
||||
for b.Loop() {
|
||||
s.Mutes(model.LabelSet{"foo": "bar"})
|
||||
s.Mutes(context.Background(), model.LabelSet{"foo": "bar"})
|
||||
}
|
||||
b.StopTimer()
|
||||
|
||||
@@ -183,7 +184,7 @@ func BenchmarkMutesIncremental(b *testing.B) {
|
||||
EndsAt: now.Add(time.Hour),
|
||||
}
|
||||
}
|
||||
require.NoError(b, silences.Set(s))
|
||||
require.NoError(b, silences.Set(b.Context(), s))
|
||||
}
|
||||
|
||||
marker := types.NewMarker(prometheus.NewRegistry())
|
||||
@@ -192,7 +193,7 @@ func BenchmarkMutesIncremental(b *testing.B) {
|
||||
// Warm up: Establish marker state (markerVersion = current version)
|
||||
// This simulates a system that has been running for a while
|
||||
lset := model.LabelSet{"service": "test", "instance": "instance1"}
|
||||
silencer.Mutes(lset)
|
||||
silencer.Mutes(context.Background(), lset)
|
||||
|
||||
// Benchmark: Measure Mutes() performance with incremental additions
|
||||
// Every other iteration adds 1 new silence, all iterations call Mutes()
|
||||
@@ -238,12 +239,12 @@ func BenchmarkMutesIncremental(b *testing.B) {
|
||||
EndsAt: now.Add(time.Hour),
|
||||
}
|
||||
}
|
||||
require.NoError(b, silences.Set(s))
|
||||
require.NoError(b, silences.Set(b.Context(), s))
|
||||
}
|
||||
|
||||
b.StartTimer()
|
||||
// Now query - should use incremental path or cached paths
|
||||
silencer.Mutes(lset)
|
||||
silencer.Mutes(context.Background(), lset)
|
||||
iteration++
|
||||
}
|
||||
})
|
||||
@@ -295,11 +296,12 @@ func benchmarkQuery(b *testing.B, numSilences int) {
|
||||
EndsAt: now.Add(time.Hour),
|
||||
UpdatedAt: now.Add(-time.Hour),
|
||||
}
|
||||
require.NoError(b, s.Set(sil))
|
||||
require.NoError(b, s.Set(b.Context(), sil))
|
||||
}
|
||||
|
||||
// Run things once to populate the matcherCache.
|
||||
sils, _, err := s.Query(
|
||||
b.Context(),
|
||||
QState(types.SilenceStateActive),
|
||||
QMatches(lset),
|
||||
)
|
||||
@@ -308,6 +310,7 @@ func benchmarkQuery(b *testing.B, numSilences int) {
|
||||
|
||||
for b.Loop() {
|
||||
sils, _, err := s.Query(
|
||||
b.Context(),
|
||||
QState(types.SilenceStateActive),
|
||||
QMatches(lset),
|
||||
)
|
||||
@@ -360,11 +363,12 @@ func benchmarkQueryParallel(b *testing.B, numSilences int) {
|
||||
EndsAt: now.Add(time.Hour),
|
||||
UpdatedAt: now.Add(-time.Hour),
|
||||
}
|
||||
require.NoError(b, s.Set(sil))
|
||||
require.NoError(b, s.Set(b.Context(), sil))
|
||||
}
|
||||
|
||||
// Verify initial query works
|
||||
sils, _, err := s.Query(
|
||||
b.Context(),
|
||||
QState(types.SilenceStateActive),
|
||||
QMatches(lset),
|
||||
)
|
||||
@@ -377,6 +381,7 @@ func benchmarkQueryParallel(b *testing.B, numSilences int) {
|
||||
b.RunParallel(func(pb *testing.PB) {
|
||||
for pb.Next() {
|
||||
sils, _, err := s.Query(
|
||||
b.Context(),
|
||||
QState(types.SilenceStateActive),
|
||||
QMatches(lset),
|
||||
)
|
||||
@@ -439,7 +444,7 @@ func benchmarkQueryWithConcurrentAdds(b *testing.B, initialSilences int, addRati
|
||||
EndsAt: now.Add(time.Hour),
|
||||
UpdatedAt: now.Add(-time.Hour),
|
||||
}
|
||||
require.NoError(b, s.Set(sil))
|
||||
require.NoError(b, s.Set(b.Context(), sil))
|
||||
}
|
||||
|
||||
var addCounter int
|
||||
@@ -474,12 +479,13 @@ func benchmarkQueryWithConcurrentAdds(b *testing.B, initialSilences int, addRati
|
||||
EndsAt: now.Add(time.Hour),
|
||||
UpdatedAt: now.Add(-time.Hour),
|
||||
}
|
||||
if err := s.Set(sil); err != nil {
|
||||
if err := s.Set(b.Context(), sil); err != nil {
|
||||
b.Error(err)
|
||||
}
|
||||
} else {
|
||||
// Query silences (the common operation)
|
||||
_, _, err := s.Query(
|
||||
b.Context(),
|
||||
QState(types.SilenceStateActive),
|
||||
QMatches(lset),
|
||||
)
|
||||
@@ -524,7 +530,7 @@ func benchmarkMutesParallel(b *testing.B, numSilences int) {
|
||||
StartsAt: now,
|
||||
EndsAt: now.Add(time.Minute),
|
||||
}
|
||||
require.NoError(b, silences.Set(s))
|
||||
require.NoError(b, silences.Set(b.Context(), s))
|
||||
}
|
||||
|
||||
m := types.NewMarker(prometheus.NewRegistry())
|
||||
@@ -535,7 +541,7 @@ func benchmarkMutesParallel(b *testing.B, numSilences int) {
|
||||
// Run Mutes in parallel
|
||||
b.RunParallel(func(pb *testing.PB) {
|
||||
for pb.Next() {
|
||||
silencer.Mutes(model.LabelSet{"foo": "bar"})
|
||||
silencer.Mutes(b.Context(), model.LabelSet{"foo": "bar"})
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@@ -126,7 +126,7 @@ func TestSilenceGCOverTime(t *testing.T) {
|
||||
StartsAt: clock.Now(),
|
||||
EndsAt: clock.Now().Add(time.Minute),
|
||||
}
|
||||
require.NoError(t, s.Set(sil1))
|
||||
require.NoError(t, s.Set(t.Context(), sil1))
|
||||
require.Len(t, s.st, 1)
|
||||
require.Len(t, s.mi, 1)
|
||||
// Move time forward and both silence and cache entry should be garbage
|
||||
@@ -153,7 +153,7 @@ func TestSilenceGCOverTime(t *testing.T) {
|
||||
StartsAt: clock.Now(),
|
||||
EndsAt: clock.Now().Add(time.Minute),
|
||||
}
|
||||
require.NoError(t, s.Set(sil1))
|
||||
require.NoError(t, s.Set(t.Context(), sil1))
|
||||
require.Len(t, s.st, 1)
|
||||
require.Len(t, s.mi, 1)
|
||||
// must clone sil1 before replacing it.
|
||||
@@ -163,7 +163,7 @@ func TestSilenceGCOverTime(t *testing.T) {
|
||||
Name: "bar",
|
||||
Pattern: "baz",
|
||||
}}
|
||||
require.NoError(t, s.Set(sil2))
|
||||
require.NoError(t, s.Set(t.Context(), sil2))
|
||||
require.Len(t, s.st, 2)
|
||||
require.Len(t, s.mi, 2)
|
||||
// Move time forward and both silence and cache entry should be garbage
|
||||
@@ -198,11 +198,11 @@ func TestSilenceGCOverTime(t *testing.T) {
|
||||
require.Len(t, s.mi, 1)
|
||||
// must clone sil1 before updating it.
|
||||
sil2 := cloneSilence(sil1)
|
||||
require.NoError(t, s.Set(sil2))
|
||||
require.NoError(t, s.Set(t.Context(), sil2))
|
||||
// The memory leak occurred because updating a silence would add a new
|
||||
// entry in the matcher cache even though no new silence was created.
|
||||
// This check asserts that this no longer happens.
|
||||
s.Query(QMatches(model.LabelSet{"foo": "bar"}))
|
||||
s.Query(t.Context(), QMatches(model.LabelSet{"foo": "bar"}))
|
||||
require.Len(t, s.st, 1)
|
||||
require.Len(t, s.mi, 1)
|
||||
// Move time forward and both silence and cache entry should be garbage
|
||||
@@ -351,7 +351,7 @@ func TestSilenceGCOverTime(t *testing.T) {
|
||||
StartsAt: now,
|
||||
EndsAt: now.Add(time.Minute),
|
||||
}
|
||||
require.NoError(t, s.Set(validSil))
|
||||
require.NoError(t, s.Set(t.Context(), validSil))
|
||||
validID := validSil.Id
|
||||
|
||||
// Manually add an erroneous silence with zero expiration
|
||||
@@ -614,7 +614,7 @@ func TestSilenceSet(t *testing.T) {
|
||||
EndsAt: start1.Add(5 * time.Minute),
|
||||
}
|
||||
versionBeforeOp := s.Version()
|
||||
require.NoError(t, s.Set(sil1))
|
||||
require.NoError(t, s.Set(t.Context(), sil1))
|
||||
require.NotEmpty(t, sil1.Id)
|
||||
require.NotEqual(t, versionBeforeOp, s.Version())
|
||||
|
||||
@@ -641,7 +641,7 @@ func TestSilenceSet(t *testing.T) {
|
||||
EndsAt: start2.Add(1 * time.Minute),
|
||||
}
|
||||
versionBeforeOp = s.Version()
|
||||
require.NoError(t, s.Set(sil2))
|
||||
require.NoError(t, s.Set(t.Context(), sil2))
|
||||
require.NotEmpty(t, sil2.Id)
|
||||
require.NotEqual(t, versionBeforeOp, s.Version())
|
||||
|
||||
@@ -664,7 +664,7 @@ func TestSilenceSet(t *testing.T) {
|
||||
// keep the same ID.
|
||||
sil3 := cloneSilence(sil2)
|
||||
versionBeforeOp = s.Version()
|
||||
require.NoError(t, s.Set(sil3))
|
||||
require.NoError(t, s.Set(t.Context(), sil3))
|
||||
require.Equal(t, sil2.Id, sil3.Id)
|
||||
require.Equal(t, versionBeforeOp, s.Version())
|
||||
|
||||
@@ -673,7 +673,7 @@ func TestSilenceSet(t *testing.T) {
|
||||
sil4 := cloneSilence(sil3)
|
||||
sil4.Comment = "c"
|
||||
versionBeforeOp = s.Version()
|
||||
require.NoError(t, s.Set(sil4))
|
||||
require.NoError(t, s.Set(t.Context(), sil4))
|
||||
require.Equal(t, sil3.Id, sil4.Id)
|
||||
require.Equal(t, versionBeforeOp, s.Version())
|
||||
|
||||
@@ -684,7 +684,7 @@ func TestSilenceSet(t *testing.T) {
|
||||
sil5 := cloneSilence(sil4)
|
||||
sil5.EndsAt = start5.Add(100 * time.Minute)
|
||||
versionBeforeOp = s.Version()
|
||||
require.NoError(t, s.Set(sil5))
|
||||
require.NoError(t, s.Set(t.Context(), sil5))
|
||||
require.Equal(t, sil4.Id, sil5.Id)
|
||||
want = state{
|
||||
sil1.Id: want[sil1.Id],
|
||||
@@ -714,7 +714,7 @@ func TestSilenceSet(t *testing.T) {
|
||||
sil6 := cloneSilence(sil5)
|
||||
sil6.Matchers = []*pb.Matcher{{Name: "a", Pattern: "c"}}
|
||||
versionBeforeOp = s.Version()
|
||||
require.NoError(t, s.Set(sil6))
|
||||
require.NoError(t, s.Set(t.Context(), sil6))
|
||||
require.NotEqual(t, sil5.Id, sil6.Id)
|
||||
want = state{
|
||||
sil1.Id: want[sil1.Id],
|
||||
@@ -753,7 +753,7 @@ func TestSilenceSet(t *testing.T) {
|
||||
sil7.StartsAt = start1
|
||||
sil7.EndsAt = start1.Add(5 * time.Minute)
|
||||
versionBeforeOp = s.Version()
|
||||
require.NoError(t, s.Set(sil7))
|
||||
require.NoError(t, s.Set(t.Context(), sil7))
|
||||
require.NotEqual(t, sil2.Id, sil7.Id)
|
||||
want = state{
|
||||
sil1.Id: want[sil1.Id],
|
||||
@@ -780,11 +780,11 @@ func TestSilenceSet(t *testing.T) {
|
||||
sil8 := cloneSilence(sil7)
|
||||
sil8.EndsAt = time.Time{}
|
||||
versionBeforeOp = s.Version()
|
||||
require.EqualError(t, s.Set(sil8), "invalid silence: invalid zero end timestamp")
|
||||
require.EqualError(t, s.Set(t.Context(), sil8), "invalid silence: invalid zero end timestamp")
|
||||
|
||||
// sil7 should not be expired because the update failed.
|
||||
clock.Advance(time.Millisecond)
|
||||
sil7, err = s.QueryOne(QIDs(sil7.Id))
|
||||
sil7, err = s.QueryOne(t.Context(), QIDs(sil7.Id))
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, types.SilenceStateActive, getState(sil7, s.nowUTC()))
|
||||
require.Equal(t, versionBeforeOp, s.Version())
|
||||
@@ -806,7 +806,7 @@ func TestSilenceLimits(t *testing.T) {
|
||||
StartsAt: time.Now(),
|
||||
EndsAt: time.Now().Add(5 * time.Minute),
|
||||
}
|
||||
require.NoError(t, s.Set(sil1))
|
||||
require.NoError(t, s.Set(t.Context(), sil1))
|
||||
|
||||
// Insert sil2 should fail because maximum number of silences has been
|
||||
// exceeded.
|
||||
@@ -815,17 +815,17 @@ func TestSilenceLimits(t *testing.T) {
|
||||
StartsAt: time.Now(),
|
||||
EndsAt: time.Now().Add(5 * time.Minute),
|
||||
}
|
||||
require.EqualError(t, s.Set(sil2), "exceeded maximum number of silences: 1 (limit: 1)")
|
||||
require.EqualError(t, s.Set(t.Context(), sil2), "exceeded maximum number of silences: 1 (limit: 1)")
|
||||
|
||||
// Expire sil1 and run the GC. This should allow sil2 to be inserted.
|
||||
require.NoError(t, s.Expire(sil1.Id))
|
||||
require.NoError(t, s.Expire(t.Context(), sil1.Id))
|
||||
n, err := s.GC()
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, n)
|
||||
require.NoError(t, s.Set(sil2))
|
||||
require.NoError(t, s.Set(t.Context(), sil2))
|
||||
|
||||
// Expire sil2 and run the GC.
|
||||
require.NoError(t, s.Expire(sil2.Id))
|
||||
require.NoError(t, s.Expire(t.Context(), sil2.Id))
|
||||
n, err = s.GC()
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, n)
|
||||
@@ -847,7 +847,7 @@ func TestSilenceLimits(t *testing.T) {
|
||||
StartsAt: time.Now(),
|
||||
EndsAt: time.Now().Add(5 * time.Minute),
|
||||
}
|
||||
require.EqualError(t, s.Set(sil3), fmt.Sprintf("silence exceeded maximum size: %d bytes (limit: 4096 bytes)", s.toMeshSilence(sil3).Size()))
|
||||
require.EqualError(t, s.Set(t.Context(), sil3), fmt.Sprintf("silence exceeded maximum size: %d bytes (limit: 4096 bytes)", s.toMeshSilence(sil3).Size()))
|
||||
|
||||
// Should be able to insert sil4.
|
||||
sil4 := &pb.Silence{
|
||||
@@ -855,19 +855,19 @@ func TestSilenceLimits(t *testing.T) {
|
||||
StartsAt: time.Now(),
|
||||
EndsAt: time.Now().Add(5 * time.Minute),
|
||||
}
|
||||
require.NoError(t, s.Set(sil4))
|
||||
require.NoError(t, s.Set(t.Context(), sil4))
|
||||
|
||||
// Should be able to update sil4 without modifications. It is expected to
|
||||
// keep the same ID.
|
||||
sil5 := cloneSilence(sil4)
|
||||
require.NoError(t, s.Set(sil5))
|
||||
require.NoError(t, s.Set(t.Context(), sil5))
|
||||
require.Equal(t, sil4.Id, sil5.Id)
|
||||
|
||||
// Should be able to update the comment. It is also expected to keep the
|
||||
// same ID.
|
||||
sil6 := cloneSilence(sil5)
|
||||
sil6.Comment = "m"
|
||||
require.NoError(t, s.Set(sil6))
|
||||
require.NoError(t, s.Set(t.Context(), sil6))
|
||||
require.Equal(t, sil5.Id, sil6.Id)
|
||||
|
||||
// Should not be able to update the start and end time as this requires
|
||||
@@ -875,12 +875,12 @@ func TestSilenceLimits(t *testing.T) {
|
||||
// exceed the maximum number of silences, which counts both active and
|
||||
// expired silences.
|
||||
sil7 := cloneSilence(sil6)
|
||||
sil7.StartsAt = time.Now().Add(5 * time.Minute)
|
||||
sil7.StartsAt = time.Now().Add(1 * time.Minute)
|
||||
sil7.EndsAt = time.Now().Add(10 * time.Minute)
|
||||
require.EqualError(t, s.Set(sil7), "exceeded maximum number of silences: 1 (limit: 1)")
|
||||
require.EqualError(t, s.Set(t.Context(), sil7), "exceeded maximum number of silences: 1 (limit: 1)")
|
||||
|
||||
// sil6 should not be expired because the update failed.
|
||||
sil6, err = s.QueryOne(QIDs(sil6.Id))
|
||||
sil6, err = s.QueryOne(t.Context(), QIDs(sil6.Id))
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, types.SilenceStateActive, getState(sil6, s.nowUTC()))
|
||||
|
||||
@@ -889,10 +889,10 @@ func TestSilenceLimits(t *testing.T) {
|
||||
s.limits.MaxSilences = func() int { return 2 }
|
||||
sil8 := cloneSilence(sil6)
|
||||
sil8.Comment = strings.Repeat("m", 2<<11)
|
||||
require.EqualError(t, s.Set(sil8), fmt.Sprintf("silence exceeded maximum size: %d bytes (limit: 4096 bytes)", s.toMeshSilence(sil8).Size()))
|
||||
require.EqualError(t, s.Set(t.Context(), sil8), fmt.Sprintf("silence exceeded maximum size: %d bytes (limit: 4096 bytes)", s.toMeshSilence(sil8).Size()))
|
||||
|
||||
// sil6 should not be expired because the update failed.
|
||||
sil6, err = s.QueryOne(QIDs(sil6.Id))
|
||||
sil6, err = s.QueryOne(t.Context(), QIDs(sil6.Id))
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, types.SilenceStateActive, getState(sil6, s.nowUTC()))
|
||||
|
||||
@@ -904,10 +904,10 @@ func TestSilenceLimits(t *testing.T) {
|
||||
// should still be active.
|
||||
sil9 := cloneSilence(sil8)
|
||||
sil9.Matchers = []*pb.Matcher{{Name: "n", Pattern: "o"}}
|
||||
require.EqualError(t, s.Set(sil9), fmt.Sprintf("silence exceeded maximum size: %d bytes (limit: 4096 bytes)", s.toMeshSilence(sil9).Size()))
|
||||
require.EqualError(t, s.Set(t.Context(), sil9), fmt.Sprintf("silence exceeded maximum size: %d bytes (limit: 4096 bytes)", s.toMeshSilence(sil9).Size()))
|
||||
|
||||
// sil6 should not be expired because the update failed.
|
||||
sil6, err = s.QueryOne(QIDs(sil6.Id))
|
||||
sil6, err = s.QueryOne(t.Context(), QIDs(sil6.Id))
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, types.SilenceStateActive, getState(sil6, s.nowUTC()))
|
||||
}
|
||||
@@ -926,7 +926,7 @@ func TestSilenceNoLimits(t *testing.T) {
|
||||
EndsAt: time.Now().Add(5 * time.Minute),
|
||||
Comment: strings.Repeat("c", 2<<9),
|
||||
}
|
||||
require.NoError(t, s.Set(sil))
|
||||
require.NoError(t, s.Set(t.Context(), sil))
|
||||
require.NotEmpty(t, sil.Id)
|
||||
}
|
||||
|
||||
@@ -949,7 +949,7 @@ func TestSetActiveSilence(t *testing.T) {
|
||||
StartsAt: startsAt,
|
||||
EndsAt: endsAt,
|
||||
}
|
||||
require.NoError(t, s.Set(sil1))
|
||||
require.NoError(t, s.Set(t.Context(), sil1))
|
||||
|
||||
// Update silence with 2 extra nanoseconds so the "seconds" part should not change
|
||||
|
||||
@@ -963,7 +963,7 @@ func TestSetActiveSilence(t *testing.T) {
|
||||
|
||||
clock.Advance(time.Minute)
|
||||
now = s.nowUTC()
|
||||
require.NoError(t, s.Set(sil2))
|
||||
require.NoError(t, s.Set(t.Context(), sil2))
|
||||
require.Equal(t, sil1.Id, sil2.Id)
|
||||
|
||||
want := state{
|
||||
@@ -1005,7 +1005,7 @@ func TestSilencesSetFail(t *testing.T) {
|
||||
},
|
||||
}
|
||||
for _, c := range cases {
|
||||
checkErr(t, c.err, s.Set(c.s))
|
||||
checkErr(t, c.err, s.Set(t.Context(), c.s))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1215,7 +1215,7 @@ func TestQSince(t *testing.T) {
|
||||
silences.st = st
|
||||
silences.vi = c.index
|
||||
|
||||
res, _, err := silences.Query(QSince(c.since))
|
||||
res, _, err := silences.Query(t.Context(), QSince(c.since))
|
||||
require.NoError(t, err)
|
||||
resultIds := []string{}
|
||||
for _, sil := range res {
|
||||
@@ -1327,28 +1327,28 @@ func TestQIDs(t *testing.T) {
|
||||
}
|
||||
|
||||
// Test QIDs with empty arguments returns an error
|
||||
_, _, err = s.Query(QIDs())
|
||||
_, _, err = s.Query(t.Context(), QIDs())
|
||||
require.Error(t, err, "expected error when QIDs is called with no arguments")
|
||||
require.Contains(t, err.Error(), "QIDs filter must have at least one id")
|
||||
|
||||
// Test QIDs with empty arguments returns an error via QueryOne
|
||||
_, err = s.QueryOne(QIDs())
|
||||
_, err = s.QueryOne(t.Context(), QIDs())
|
||||
require.Error(t, err, "expected error when QIDs is called with no arguments")
|
||||
require.Contains(t, err.Error(), "QIDs filter must have at least one id")
|
||||
|
||||
// Test QIDs with single ID works
|
||||
res, _, err := s.Query(QIDs("1"))
|
||||
res, _, err := s.Query(t.Context(), QIDs("1"))
|
||||
require.NoError(t, err)
|
||||
require.Len(t, res, 1)
|
||||
require.Equal(t, "1", res[0].Id)
|
||||
|
||||
// Test QIDs with multiple IDs works
|
||||
res, _, err = s.Query(QIDs("1", "2"))
|
||||
res, _, err = s.Query(t.Context(), QIDs("1", "2"))
|
||||
require.NoError(t, err)
|
||||
require.Len(t, res, 2)
|
||||
|
||||
// Test QueryOne with single ID works
|
||||
sil, err := s.QueryOne(QIDs("1"))
|
||||
sil, err := s.QueryOne(t.Context(), QIDs("1"))
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, "1", sil.Id)
|
||||
}
|
||||
@@ -1523,20 +1523,20 @@ func TestSilenceExpire(t *testing.T) {
|
||||
silenceVersion{id: "active"},
|
||||
silenceVersion{id: "expired"},
|
||||
}
|
||||
count, err := s.CountState(types.SilenceStatePending)
|
||||
count, err := s.CountState(t.Context(), types.SilenceStatePending)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, count)
|
||||
|
||||
count, err = s.CountState(types.SilenceStateExpired)
|
||||
count, err = s.CountState(t.Context(), types.SilenceStateExpired)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, count)
|
||||
|
||||
require.NoError(t, s.Expire("pending"))
|
||||
require.NoError(t, s.Expire("active"))
|
||||
require.NoError(t, s.Expire(t.Context(), "pending"))
|
||||
require.NoError(t, s.Expire(t.Context(), "active"))
|
||||
|
||||
require.NoError(t, s.Expire("expired"))
|
||||
require.NoError(t, s.Expire(t.Context(), "expired"))
|
||||
|
||||
sil, err := s.QueryOne(QIDs("pending"))
|
||||
sil, err := s.QueryOne(t.Context(), QIDs("pending"))
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, &pb.Silence{
|
||||
Id: "pending",
|
||||
@@ -1549,11 +1549,11 @@ func TestSilenceExpire(t *testing.T) {
|
||||
// Let time pass...
|
||||
clock.Advance(time.Second)
|
||||
|
||||
count, err = s.CountState(types.SilenceStatePending)
|
||||
count, err = s.CountState(t.Context(), types.SilenceStatePending)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 0, count)
|
||||
|
||||
count, err = s.CountState(types.SilenceStateExpired)
|
||||
count, err = s.CountState(t.Context(), types.SilenceStateExpired)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 3, count)
|
||||
|
||||
@@ -1562,7 +1562,7 @@ func TestSilenceExpire(t *testing.T) {
|
||||
silenceState := types.CalcSilenceState(sil.StartsAt, sil.EndsAt)
|
||||
require.Equal(t, types.SilenceStateExpired, silenceState)
|
||||
|
||||
sil, err = s.QueryOne(QIDs("active"))
|
||||
sil, err = s.QueryOne(t.Context(), QIDs("active"))
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, &pb.Silence{
|
||||
Id: "active",
|
||||
@@ -1572,7 +1572,7 @@ func TestSilenceExpire(t *testing.T) {
|
||||
UpdatedAt: now,
|
||||
}, sil)
|
||||
|
||||
sil, err = s.QueryOne(QIDs("expired"))
|
||||
sil, err = s.QueryOne(t.Context(), QIDs("expired"))
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, &pb.Silence{
|
||||
Id: "expired",
|
||||
@@ -1625,15 +1625,15 @@ func TestSilenceExpireWithZeroRetention(t *testing.T) {
|
||||
silenceVersion{id: "expired"},
|
||||
}
|
||||
|
||||
count, err := s.CountState(types.SilenceStatePending)
|
||||
count, err := s.CountState(t.Context(), types.SilenceStatePending)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, count)
|
||||
|
||||
count, err = s.CountState(types.SilenceStateActive)
|
||||
count, err = s.CountState(t.Context(), types.SilenceStateActive)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, count)
|
||||
|
||||
count, err = s.CountState(types.SilenceStateExpired)
|
||||
count, err = s.CountState(t.Context(), types.SilenceStateExpired)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, count)
|
||||
|
||||
@@ -1642,9 +1642,9 @@ func TestSilenceExpireWithZeroRetention(t *testing.T) {
|
||||
// one tick for updates to take effect.
|
||||
clock.Advance(1 * time.Millisecond)
|
||||
|
||||
require.NoError(t, s.Expire("pending"))
|
||||
require.NoError(t, s.Expire("active"))
|
||||
require.NoError(t, s.Expire("expired"))
|
||||
require.NoError(t, s.Expire(t.Context(), "pending"))
|
||||
require.NoError(t, s.Expire(t.Context(), "active"))
|
||||
require.NoError(t, s.Expire(t.Context(), "expired"))
|
||||
|
||||
// Advance time again. Despite what the function name says, s.Expire() does
|
||||
// not expire a silence. It sets the silence to EndAt the current time. This
|
||||
@@ -1652,15 +1652,15 @@ func TestSilenceExpireWithZeroRetention(t *testing.T) {
|
||||
clock.Advance(1 * time.Millisecond)
|
||||
|
||||
// Verify all silences have expired.
|
||||
count, err = s.CountState(types.SilenceStatePending)
|
||||
count, err = s.CountState(t.Context(), types.SilenceStatePending)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 0, count)
|
||||
|
||||
count, err = s.CountState(types.SilenceStateActive)
|
||||
count, err = s.CountState(t.Context(), types.SilenceStateActive)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 0, count)
|
||||
|
||||
count, err = s.CountState(types.SilenceStateExpired)
|
||||
count, err = s.CountState(t.Context(), types.SilenceStateExpired)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 3, count)
|
||||
}
|
||||
@@ -1689,19 +1689,19 @@ func TestSilenceExpireInvalid(t *testing.T) {
|
||||
s.vi = versionIndex{silenceVersion{id: "active"}}
|
||||
|
||||
// The silence should be active.
|
||||
count, err := s.CountState(types.SilenceStateActive)
|
||||
count, err := s.CountState(t.Context(), types.SilenceStateActive)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, count)
|
||||
|
||||
clock.Advance(time.Millisecond)
|
||||
require.NoError(t, s.Expire("active"))
|
||||
require.NoError(t, s.Expire(t.Context(), "active"))
|
||||
clock.Advance(time.Millisecond)
|
||||
|
||||
// The silence should be expired.
|
||||
count, err = s.CountState(types.SilenceStateActive)
|
||||
count, err = s.CountState(t.Context(), types.SilenceStateActive)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 0, count)
|
||||
count, err = s.CountState(types.SilenceStateExpired)
|
||||
count, err = s.CountState(t.Context(), types.SilenceStateExpired)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, count)
|
||||
}
|
||||
@@ -1717,35 +1717,35 @@ func TestSilencer(t *testing.T) {
|
||||
m := types.NewMarker(prometheus.NewRegistry())
|
||||
s := NewSilencer(ss, m, promslog.NewNopLogger())
|
||||
|
||||
require.False(t, s.Mutes(model.LabelSet{"foo": "bar"}), "expected alert not silenced without any silences")
|
||||
require.False(t, s.Mutes(t.Context(), model.LabelSet{"foo": "bar"}), "expected alert not silenced without any silences")
|
||||
|
||||
sil1 := &pb.Silence{
|
||||
Matchers: []*pb.Matcher{{Name: "foo", Pattern: "baz"}},
|
||||
StartsAt: now.Add(-time.Hour),
|
||||
EndsAt: now.Add(5 * time.Minute),
|
||||
}
|
||||
require.NoError(t, ss.Set(sil1))
|
||||
require.NoError(t, ss.Set(t.Context(), sil1))
|
||||
|
||||
require.False(t, s.Mutes(model.LabelSet{"foo": "bar"}), "expected alert not silenced by non-matching silence")
|
||||
require.False(t, s.Mutes(t.Context(), model.LabelSet{"foo": "bar"}), "expected alert not silenced by non-matching silence")
|
||||
|
||||
sil2 := &pb.Silence{
|
||||
Matchers: []*pb.Matcher{{Name: "foo", Pattern: "bar"}},
|
||||
StartsAt: now.Add(-time.Hour),
|
||||
EndsAt: now.Add(5 * time.Minute),
|
||||
}
|
||||
require.NoError(t, ss.Set(sil2))
|
||||
require.NoError(t, ss.Set(t.Context(), sil2))
|
||||
require.NotEmpty(t, sil2.Id)
|
||||
|
||||
require.True(t, s.Mutes(model.LabelSet{"foo": "bar"}), "expected alert silenced by matching silence")
|
||||
require.True(t, s.Mutes(t.Context(), model.LabelSet{"foo": "bar"}), "expected alert silenced by matching silence")
|
||||
|
||||
// One hour passes, silence expires.
|
||||
clock.Advance(time.Hour)
|
||||
now = ss.nowUTC()
|
||||
|
||||
require.False(t, s.Mutes(model.LabelSet{"foo": "bar"}), "expected alert not silenced by expired silence")
|
||||
require.False(t, s.Mutes(t.Context(), model.LabelSet{"foo": "bar"}), "expected alert not silenced by expired silence")
|
||||
|
||||
// Update silence to start in the future.
|
||||
err = ss.Set(&pb.Silence{
|
||||
err = ss.Set(t.Context(), &pb.Silence{
|
||||
Id: sil2.Id,
|
||||
Matchers: []*pb.Matcher{{Name: "foo", Pattern: "bar"}},
|
||||
StartsAt: now.Add(time.Hour),
|
||||
@@ -1753,16 +1753,16 @@ func TestSilencer(t *testing.T) {
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
require.False(t, s.Mutes(model.LabelSet{"foo": "bar"}), "expected alert not silenced by future silence")
|
||||
require.False(t, s.Mutes(t.Context(), model.LabelSet{"foo": "bar"}), "expected alert not silenced by future silence")
|
||||
|
||||
// Two hours pass, silence becomes active.
|
||||
clock.Advance(2 * time.Hour)
|
||||
now = ss.nowUTC()
|
||||
|
||||
// Exposes issue #2426.
|
||||
require.True(t, s.Mutes(model.LabelSet{"foo": "bar"}), "expected alert silenced by activated silence")
|
||||
require.True(t, s.Mutes(t.Context(), model.LabelSet{"foo": "bar"}), "expected alert silenced by activated silence")
|
||||
|
||||
err = ss.Set(&pb.Silence{
|
||||
err = ss.Set(t.Context(), &pb.Silence{
|
||||
Matchers: []*pb.Matcher{{Name: "foo", Pattern: "b..", Type: pb.Matcher_REGEXP}},
|
||||
StartsAt: now.Add(time.Hour),
|
||||
EndsAt: now.Add(3 * time.Hour),
|
||||
@@ -1770,13 +1770,13 @@ func TestSilencer(t *testing.T) {
|
||||
require.NoError(t, err)
|
||||
|
||||
// Note that issue #2426 doesn't apply anymore because we added a new silence.
|
||||
require.True(t, s.Mutes(model.LabelSet{"foo": "bar"}), "expected alert still silenced by activated silence")
|
||||
require.True(t, s.Mutes(t.Context(), model.LabelSet{"foo": "bar"}), "expected alert still silenced by activated silence")
|
||||
|
||||
// Two hours pass, first silence expires, overlapping second silence becomes active.
|
||||
clock.Advance(2 * time.Hour)
|
||||
|
||||
// Another variant of issue #2426 (overlapping silences).
|
||||
require.True(t, s.Mutes(model.LabelSet{"foo": "bar"}), "expected alert silenced by activated second silence")
|
||||
require.True(t, s.Mutes(t.Context(), model.LabelSet{"foo": "bar"}), "expected alert silenced by activated second silence")
|
||||
}
|
||||
|
||||
func TestValidateClassicMatcher(t *testing.T) {
|
||||
|
||||
50
tracing/http.go
Normal file
50
tracing/http.go
Normal file
@@ -0,0 +1,50 @@
|
||||
// Copyright 2024 Prometheus Team
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package tracing
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptrace"
|
||||
|
||||
"go.opentelemetry.io/contrib/instrumentation/net/http/httptrace/otelhttptrace"
|
||||
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
|
||||
)
|
||||
|
||||
// TODO: maybe move these into prometheus/common?
|
||||
|
||||
// Transport wraps the provided http.RoundTripper with one that starts a span
|
||||
// and injects the span context into the outbound request headers. If the
|
||||
// provided http.RoundTripper is nil, http.DefaultTransport will be used as the
|
||||
// base http.RoundTripper.
|
||||
func Transport(rt http.RoundTripper) http.RoundTripper {
|
||||
rt = otelhttp.NewTransport(rt,
|
||||
otelhttp.WithClientTrace(func(ctx context.Context) *httptrace.ClientTrace {
|
||||
return otelhttptrace.NewClientTrace(ctx)
|
||||
}),
|
||||
)
|
||||
|
||||
return rt
|
||||
}
|
||||
|
||||
// Middleware returns a new HTTP handler that will trace all requests with the
|
||||
// HTTP method and path as the span name.
|
||||
func Middleware(handler http.Handler) http.Handler {
|
||||
return otelhttp.NewHandler(handler, "",
|
||||
otelhttp.WithSpanNameFormatter(func(_ string, r *http.Request) string {
|
||||
return fmt.Sprintf("%s %s", r.Method, r.URL.Path)
|
||||
}),
|
||||
)
|
||||
}
|
||||
22
tracing/testdata/ca.cer
vendored
Normal file
22
tracing/testdata/ca.cer
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
-----BEGIN CERTIFICATE-----
|
||||
MIIDkTCCAnmgAwIBAgIJAJNsnimNN3tmMA0GCSqGSIb3DQEBCwUAMF8xCzAJBgNV
|
||||
BAYTAlhYMRUwEwYDVQQHDAxEZWZhdWx0IENpdHkxHDAaBgNVBAoME0RlZmF1bHQg
|
||||
Q29tcGFueSBMdGQxGzAZBgNVBAMMElByb21ldGhldXMgVGVzdCBDQTAeFw0xNTA4
|
||||
MDQxNDA5MjFaFw0yNTA4MDExNDA5MjFaMF8xCzAJBgNVBAYTAlhYMRUwEwYDVQQH
|
||||
DAxEZWZhdWx0IENpdHkxHDAaBgNVBAoME0RlZmF1bHQgQ29tcGFueSBMdGQxGzAZ
|
||||
BgNVBAMMElByb21ldGhldXMgVGVzdCBDQTCCASIwDQYJKoZIhvcNAQEBBQADggEP
|
||||
ADCCAQoCggEBAOlSBU3yWpUELbhzizznR0hnAL7dbEHzfEtEc6N3PoSvMNcqrUVq
|
||||
t4kjBRWzqkZ5uJVkzBPERKEBoOI9pWcrqtMTBkMzHJY2Ep7GHTab10e9KC2IFQT6
|
||||
FKP/jCYixaIVx3azEfajRJooD8r79FGoagWUfHdHyCFWJb/iLt8z8+S91kelSRMS
|
||||
yB9M1ypWomzBz1UFXZp1oiNO5o7/dgXW4MgLUfC2obJ9j5xqpc6GkhWMW4ZFwEr/
|
||||
VLjuzxG9B8tLfQuhnXKGn1W8+WzZVWCWMD/sLfZfmjKaWlwcXzL51g8E+IEIBJqV
|
||||
w51aMI6lDkcvAM7gLq1auLZMVXyKWSKw7XMCAwEAAaNQME4wHQYDVR0OBBYEFMz1
|
||||
BZnlqxJp2HiJSjHK8IsLrWYbMB8GA1UdIwQYMBaAFMz1BZnlqxJp2HiJSjHK8IsL
|
||||
rWYbMAwGA1UdEwQFMAMBAf8wDQYJKoZIhvcNAQELBQADggEBAI2iA3w3TK5J15Pu
|
||||
e4fPFB4jxQqsbUwuyXbCCv/jKLeFNCD4BjM181WZEYjPMumeTBVzU3aF45LWQIG1
|
||||
0DJcrCL4mjMz9qgAoGqA7aDDXiJGbukMgYYsn7vrnVmrZH8T3E8ySlltr7+W578k
|
||||
pJ5FxnbCroQwn0zLyVB3sFbS8E3vpBr3L8oy8PwPHhIScexcNVc3V6/m4vTZsXTH
|
||||
U+vUm1XhDgpDcFMTg2QQiJbfpOYUkwIgnRDAT7t282t2KQWtnlqc3zwPQ1F/6Cpx
|
||||
j19JeNsaF1DArkD7YlyKj/GhZLtHwFHG5cxznH0mLDJTW7bQvqqh2iQTeXmBk1lU
|
||||
mM5lH/s=
|
||||
-----END CERTIFICATE-----
|
||||
273
tracing/tracing.go
Normal file
273
tracing/tracing.go
Normal file
@@ -0,0 +1,273 @@
|
||||
// Copyright 2021 The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package tracing
|
||||
|
||||
import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"reflect"
	"time"

	commoncfg "github.com/prometheus/common/config"
	"github.com/prometheus/common/version"
	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/exporters/otlp/otlptrace"
	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
	"go.opentelemetry.io/otel/propagation"
	"go.opentelemetry.io/otel/sdk/resource"
	tracesdk "go.opentelemetry.io/otel/sdk/trace"
	semconv "go.opentelemetry.io/otel/semconv/v1.37.0"
	"go.opentelemetry.io/otel/trace"
	"go.opentelemetry.io/otel/trace/noop"
	"google.golang.org/grpc/credentials"

	"github.com/prometheus/alertmanager/config"
)
|
||||
|
||||
const serviceName = "alertmanager"
|
||||
|
||||
// Manager is capable of building, (re)installing and shutting down
// the tracer provider.
type Manager struct {
	logger *slog.Logger
	// done is closed by Stop to unblock Run.
	done chan struct{}
	// config is the last applied tracing configuration; compared against
	// incoming configs in ApplyConfig to detect changes.
	config config.TracingConfig
	// shutdownFunc shuts down the currently installed tracer provider;
	// nil when no provider is installed.
	shutdownFunc func() error
}
|
||||
|
||||
// NewManager creates a new tracing manager.
|
||||
func NewManager(logger *slog.Logger) *Manager {
|
||||
return &Manager{
|
||||
logger: logger,
|
||||
done: make(chan struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
// Run starts the tracing manager. It registers the global text map propagator and error handler.
// It is blocking.
func (m *Manager) Run() {
	// Propagate trace context across process boundaries via W3C Trace Context headers.
	otel.SetTextMapPropagator(propagation.TraceContext{})
	// Surface asynchronous OpenTelemetry errors (e.g. export failures) in our logs.
	otel.SetErrorHandler(otelErrHandler(func(err error) {
		m.logger.Error("OpenTelemetry handler returned an error", "err", err)
	}))
	// Block until Stop closes the done channel.
	<-m.done
}
|
||||
|
||||
// ApplyConfig takes care of refreshing the tracing configuration by shutting down
// the current tracer provider (if any is registered) and installing a new one.
func (m *Manager) ApplyConfig(cfg *config.Config) error {
	// Update only if a config change is detected. If TLS configuration is
	// set, we have to restart the manager to make sure that new TLS
	// certificates are picked up.
	var blankTLSConfig commoncfg.TLSConfig
	if reflect.DeepEqual(m.config, cfg.TracingConfig) && (m.config.TLSConfig == nil || *m.config.TLSConfig == blankTLSConfig) {
		return nil
	}

	// Shut down the previously installed provider (flushing its spans)
	// before replacing it.
	if m.shutdownFunc != nil {
		if err := m.shutdownFunc(); err != nil {
			return fmt.Errorf("failed to shut down the tracer provider: %w", err)
		}
	}

	// If no endpoint is set, assume tracing should be disabled.
	if cfg.TracingConfig.Endpoint == "" {
		m.config = cfg.TracingConfig
		m.shutdownFunc = nil
		// Install a no-op provider so instrumented code keeps working.
		otel.SetTracerProvider(noop.NewTracerProvider())
		m.logger.Info("Tracing provider uninstalled.")
		return nil
	}

	tp, shutdownFunc, err := buildTracerProvider(context.Background(), cfg.TracingConfig)
	if err != nil {
		return fmt.Errorf("failed to install a new tracer provider: %w", err)
	}

	// Record the new state before publishing the provider globally.
	m.shutdownFunc = shutdownFunc
	m.config = cfg.TracingConfig
	otel.SetTracerProvider(tp)

	m.logger.Info("Successfully installed a new tracer provider.")
	return nil
}
|
||||
|
||||
// Stop gracefully shuts down the tracer provider and stops the tracing manager.
|
||||
func (m *Manager) Stop() {
|
||||
defer close(m.done)
|
||||
|
||||
if m.shutdownFunc == nil {
|
||||
return
|
||||
}
|
||||
|
||||
if err := m.shutdownFunc(); err != nil {
|
||||
m.logger.Error("failed to shut down the tracer provider", "err", err)
|
||||
}
|
||||
|
||||
m.logger.Info("Tracing manager stopped")
|
||||
}
|
||||
|
||||
// otelErrHandler adapts a plain function to the otel.ErrorHandler interface.
type otelErrHandler func(err error)

// Handle implements otel.ErrorHandler by invoking the wrapped function.
func (o otelErrHandler) Handle(err error) {
	o(err)
}
|
||||
|
||||
// buildTracerProvider return a new tracer provider ready for installation, together
|
||||
// with a shutdown function.
|
||||
func buildTracerProvider(ctx context.Context, tracingCfg config.TracingConfig) (trace.TracerProvider, func() error, error) {
|
||||
client, err := getClient(tracingCfg)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
exp, err := otlptrace.New(ctx, client)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
// Create a resource describing the service and the runtime.
|
||||
res, err := resource.New(
|
||||
ctx,
|
||||
resource.WithSchemaURL(semconv.SchemaURL),
|
||||
resource.WithAttributes(
|
||||
semconv.ServiceNameKey.String(serviceName),
|
||||
semconv.ServiceVersionKey.String(version.Version),
|
||||
),
|
||||
resource.WithProcessRuntimeDescription(),
|
||||
resource.WithTelemetrySDK(),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
tp := tracesdk.NewTracerProvider(
|
||||
tracesdk.WithBatcher(exp),
|
||||
tracesdk.WithSampler(tracesdk.ParentBased(
|
||||
tracesdk.TraceIDRatioBased(tracingCfg.SamplingFraction),
|
||||
)),
|
||||
tracesdk.WithResource(res),
|
||||
)
|
||||
|
||||
return tp, func() error {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
err := tp.Shutdown(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}, nil
|
||||
}
|
||||
|
||||
// headersToMap converts prometheus/common Headers to a simple map[string]string.
|
||||
// It takes the first value from Values, Secrets, or Files for each header.
|
||||
func headersToMap(headers *commoncfg.Headers) (map[string]string, error) {
|
||||
if headers == nil || len(headers.Headers) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
result := make(map[string]string)
|
||||
for name, header := range headers.Headers {
|
||||
if len(header.Values) > 0 {
|
||||
result[name] = header.Values[0]
|
||||
} else if len(header.Secrets) > 0 {
|
||||
result[name] = string(header.Secrets[0])
|
||||
} else if len(header.Files) > 0 {
|
||||
// Note: Files would need to be read at runtime. For tracing config,
|
||||
// we only support direct values and secrets.
|
||||
return nil, fmt.Errorf("header files are not supported for tracing configuration")
|
||||
}
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// getClient returns an appropriate OTLP client (either gRPC or HTTP), based
// on the provided tracing configuration.
func getClient(tracingCfg config.TracingConfig) (otlptrace.Client, error) {
	var client otlptrace.Client
	switch tracingCfg.ClientType {
	case config.TracingClientGRPC:
		opts := []otlptracegrpc.Option{otlptracegrpc.WithEndpoint(tracingCfg.Endpoint)}

		switch {
		case tracingCfg.Insecure:
			opts = append(opts, otlptracegrpc.WithInsecure())
		case tracingCfg.TLSConfig != nil:
			// Use of TLS Credentials forces the use of TLS. Therefore it can
			// only be set when `insecure` is set to false.
			tlsConf, err := commoncfg.NewTLSConfig(tracingCfg.TLSConfig)
			if err != nil {
				return nil, err
			}
			opts = append(opts, otlptracegrpc.WithTLSCredentials(credentials.NewTLS(tlsConf)))
		}

		// gRPC accepts any registered compressor name.
		if tracingCfg.Compression != "" {
			opts = append(opts, otlptracegrpc.WithCompressor(tracingCfg.Compression))
		}

		// Forward statically configured headers (values or secrets) with each
		// export request.
		headers, err := headersToMap(tracingCfg.Headers)
		if err != nil {
			return nil, err
		}
		if len(headers) > 0 {
			opts = append(opts, otlptracegrpc.WithHeaders(headers))
		}

		if tracingCfg.Timeout != 0 {
			opts = append(opts, otlptracegrpc.WithTimeout(time.Duration(tracingCfg.Timeout)))
		}

		client = otlptracegrpc.NewClient(opts...)
	case config.TracingClientHTTP:
		opts := []otlptracehttp.Option{otlptracehttp.WithEndpoint(tracingCfg.Endpoint)}

		switch {
		case tracingCfg.Insecure:
			opts = append(opts, otlptracehttp.WithInsecure())
		case tracingCfg.TLSConfig != nil:
			tlsConf, err := commoncfg.NewTLSConfig(tracingCfg.TLSConfig)
			if err != nil {
				return nil, err
			}
			opts = append(opts, otlptracehttp.WithTLSClientConfig(tlsConf))
		}

		// The HTTP exporter only supports gzip compression.
		if tracingCfg.Compression == config.GzipCompression {
			opts = append(opts, otlptracehttp.WithCompression(otlptracehttp.GzipCompression))
		}

		headers, err := headersToMap(tracingCfg.Headers)
		if err != nil {
			return nil, err
		}
		if len(headers) > 0 {
			opts = append(opts, otlptracehttp.WithHeaders(headers))
		}

		if tracingCfg.Timeout != 0 {
			opts = append(opts, otlptracehttp.WithTimeout(time.Duration(tracingCfg.Timeout)))
		}

		client = otlptracehttp.NewClient(opts...)
	default:
		return nil, fmt.Errorf("unknown tracing client type: %s", tracingCfg.ClientType)
	}

	return client, nil
}
|
||||
143
tracing/tracing_test.go
Normal file
143
tracing/tracing_test.go
Normal file
@@ -0,0 +1,143 @@
|
||||
// Copyright 2021 The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package tracing
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
commoncfg "github.com/prometheus/common/config"
|
||||
"github.com/prometheus/common/promslog"
|
||||
"github.com/stretchr/testify/require"
|
||||
"go.opentelemetry.io/otel"
|
||||
"go.opentelemetry.io/otel/trace/noop"
|
||||
|
||||
"github.com/prometheus/alertmanager/config"
|
||||
)
|
||||
|
||||
func TestInstallingNewTracerProvider(t *testing.T) {
|
||||
tpBefore := otel.GetTracerProvider()
|
||||
|
||||
m := NewManager(promslog.NewNopLogger())
|
||||
cfg := config.Config{
|
||||
TracingConfig: config.TracingConfig{
|
||||
Endpoint: "localhost:1234",
|
||||
ClientType: config.TracingClientGRPC,
|
||||
},
|
||||
}
|
||||
|
||||
require.NoError(t, m.ApplyConfig(&cfg))
|
||||
require.NotEqual(t, tpBefore, otel.GetTracerProvider())
|
||||
}
|
||||
|
||||
func TestReinstallingTracerProvider(t *testing.T) {
|
||||
m := NewManager(promslog.NewNopLogger())
|
||||
cfg := config.Config{
|
||||
TracingConfig: config.TracingConfig{
|
||||
Endpoint: "localhost:1234",
|
||||
ClientType: config.TracingClientGRPC,
|
||||
Headers: &commoncfg.Headers{
|
||||
Headers: map[string]commoncfg.Header{
|
||||
"foo": {Values: []string{"bar"}},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
require.NoError(t, m.ApplyConfig(&cfg))
|
||||
tpFirstConfig := otel.GetTracerProvider()
|
||||
|
||||
// Trying to apply the same config should not reinstall provider.
|
||||
require.NoError(t, m.ApplyConfig(&cfg))
|
||||
require.Equal(t, tpFirstConfig, otel.GetTracerProvider())
|
||||
|
||||
cfg2 := config.Config{
|
||||
TracingConfig: config.TracingConfig{
|
||||
Endpoint: "localhost:1234",
|
||||
ClientType: config.TracingClientHTTP,
|
||||
Headers: &commoncfg.Headers{
|
||||
Headers: map[string]commoncfg.Header{
|
||||
"bar": {Values: []string{"foo"}},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
require.NoError(t, m.ApplyConfig(&cfg2))
|
||||
require.NotEqual(t, tpFirstConfig, otel.GetTracerProvider())
|
||||
tpSecondConfig := otel.GetTracerProvider()
|
||||
|
||||
// Setting previously unset option should reinstall provider.
|
||||
cfg2.TracingConfig.Compression = "gzip"
|
||||
require.NoError(t, m.ApplyConfig(&cfg2))
|
||||
require.NotEqual(t, tpSecondConfig, otel.GetTracerProvider())
|
||||
}
|
||||
|
||||
func TestReinstallingTracerProviderWithTLS(t *testing.T) {
|
||||
m := NewManager(promslog.NewNopLogger())
|
||||
cfg := config.Config{
|
||||
TracingConfig: config.TracingConfig{
|
||||
Endpoint: "localhost:1234",
|
||||
ClientType: config.TracingClientGRPC,
|
||||
TLSConfig: &commoncfg.TLSConfig{
|
||||
CAFile: "testdata/ca.cer",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
require.NoError(t, m.ApplyConfig(&cfg))
|
||||
tpFirstConfig := otel.GetTracerProvider()
|
||||
|
||||
// Trying to apply the same config with TLS should reinstall provider.
|
||||
require.NoError(t, m.ApplyConfig(&cfg))
|
||||
require.NotEqual(t, tpFirstConfig, otel.GetTracerProvider())
|
||||
}
|
||||
|
||||
func TestUninstallingTracerProvider(t *testing.T) {
|
||||
m := NewManager(promslog.NewNopLogger())
|
||||
cfg := config.Config{
|
||||
TracingConfig: config.TracingConfig{
|
||||
Endpoint: "localhost:1234",
|
||||
ClientType: config.TracingClientGRPC,
|
||||
},
|
||||
}
|
||||
|
||||
require.NoError(t, m.ApplyConfig(&cfg))
|
||||
require.NotEqual(t, noop.NewTracerProvider(), otel.GetTracerProvider())
|
||||
|
||||
// Uninstall by passing empty config.
|
||||
cfg2 := config.Config{
|
||||
TracingConfig: config.TracingConfig{},
|
||||
}
|
||||
|
||||
require.NoError(t, m.ApplyConfig(&cfg2))
|
||||
// Make sure we get a no-op tracer provider after uninstallation.
|
||||
require.Equal(t, noop.NewTracerProvider(), otel.GetTracerProvider())
|
||||
}
|
||||
|
||||
func TestTracerProviderShutdown(t *testing.T) {
|
||||
m := NewManager(promslog.NewNopLogger())
|
||||
cfg := config.Config{
|
||||
TracingConfig: config.TracingConfig{
|
||||
Endpoint: "localhost:1234",
|
||||
ClientType: config.TracingClientGRPC,
|
||||
},
|
||||
}
|
||||
|
||||
require.NoError(t, m.ApplyConfig(&cfg))
|
||||
m.Stop()
|
||||
|
||||
// Check if we closed the done channel.
|
||||
_, ok := <-m.done
|
||||
require.False(t, ok)
|
||||
}
|
||||
@@ -14,6 +14,7 @@
|
||||
package types
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
@@ -471,7 +472,7 @@ func (a *Alert) Merge(o *Alert) *Alert {
|
||||
// maintain an underlying AlertMarker are expected to update it during a call of
|
||||
// Mutes.
|
||||
type Muter interface {
|
||||
Mutes(model.LabelSet) bool
|
||||
Mutes(ctx context.Context, lset model.LabelSet) bool
|
||||
}
|
||||
|
||||
// A TimeMuter determines if the time is muted by one or more active or mute
|
||||
@@ -482,10 +483,10 @@ type TimeMuter interface {
|
||||
}
|
||||
|
||||
// A MuteFunc is a function that implements the Muter interface.
|
||||
type MuteFunc func(model.LabelSet) bool
|
||||
type MuteFunc func(ctx context.Context, lset model.LabelSet) bool
|
||||
|
||||
// Mutes implements the Muter interface.
|
||||
func (f MuteFunc) Mutes(lset model.LabelSet) bool { return f(lset) }
|
||||
func (f MuteFunc) Mutes(ctx context.Context, lset model.LabelSet) bool { return f(ctx, lset) }
|
||||
|
||||
// A Silence determines whether a given label set is muted.
|
||||
type Silence struct {
|
||||
|
||||
Reference in New Issue
Block a user