1
0
mirror of https://github.com/prometheus/alertmanager.git synced 2026-02-05 06:45:45 +01:00

fix: set context timeout for resolvePeers (#4343)

* fix: set context timeout for resolvePeers

`resolvePeers` is context-aware, yet it is called with a background context, which may lead to a blocking call. This is because `LookupIPAddr` is called under the hood to perform a DNS lookup, which may block on some platforms.

If DNS lookups are blocked, the calling goroutine may hang. Because the main goroutine is used for the call, the application hangs and becomes unresponsive, and the readiness check fails.
---------

Signed-off-by: emreya <e.yazici1990@gmail.com>
This commit is contained in:
Emre Yazıcı
2025-11-06 21:09:08 +01:00
committed by GitHub
parent fe2c526235
commit 1fe77cfa58
4 changed files with 40 additions and 19 deletions

View File

@@ -358,6 +358,7 @@ be configured to communicate with each other. This is configured using the
- `--cluster.advertise-address` string: cluster advertise address
- `--cluster.peer` value: initial peers (repeat flag for each additional peer)
- `--cluster.peer-timeout` value: peer timeout period (default "15s")
- `--cluster.peers-resolve-timeout` value: peers resolve timeout period (default "15s")
- `--cluster.gossip-interval` value: cluster message propagation speed
(default "200ms")
- `--cluster.pushpull-interval` value: lower values will increase

View File

@@ -60,7 +60,8 @@ type Peer struct {
mlist *memberlist.Memberlist
delegate *delegate
resolvedPeers []string
resolvedPeers []string
resolvePeersTimeout time.Duration
mtx sync.RWMutex
states map[string]State
@@ -117,15 +118,16 @@ func (s PeerStatus) String() string {
}
const (
DefaultPushPullInterval = 60 * time.Second
DefaultGossipInterval = 200 * time.Millisecond
DefaultTCPTimeout = 10 * time.Second
DefaultProbeTimeout = 500 * time.Millisecond
DefaultProbeInterval = 1 * time.Second
DefaultReconnectInterval = 10 * time.Second
DefaultReconnectTimeout = 6 * time.Hour
DefaultRefreshInterval = 15 * time.Second
MaxGossipPacketSize = 1400
DefaultPushPullInterval = 60 * time.Second
DefaultGossipInterval = 200 * time.Millisecond
DefaultTCPTimeout = 10 * time.Second
DefaultProbeTimeout = 500 * time.Millisecond
DefaultProbeInterval = 1 * time.Second
DefaultReconnectInterval = 10 * time.Second
DefaultReconnectTimeout = 6 * time.Hour
DefaultRefreshInterval = 15 * time.Second
DefaultResolvePeersTimeout = 15 * time.Second
MaxGossipPacketSize = 1400
)
func Create(
@@ -138,6 +140,7 @@ func Create(
pushPullInterval time.Duration,
gossipInterval time.Duration,
tcpTimeout time.Duration,
resolveTimeout time.Duration,
probeTimeout time.Duration,
probeInterval time.Duration,
tlsTransportConfig *TLSTransportConfig,
@@ -168,7 +171,9 @@ func Create(
}
}
resolvedPeers, err := resolvePeers(context.Background(), knownPeers, advertiseAddr, &net.Resolver{}, waitIfEmpty)
ctx, cancel := context.WithTimeout(context.Background(), resolveTimeout)
defer cancel()
resolvedPeers, err := resolvePeers(ctx, knownPeers, advertiseAddr, &net.Resolver{}, waitIfEmpty)
if err != nil {
return nil, fmt.Errorf("resolve peers: %w", err)
}
@@ -199,13 +204,14 @@ func Create(
}
p := &Peer{
states: map[string]State{},
stopc: make(chan struct{}),
readyc: make(chan struct{}),
logger: l,
peers: map[string]peer{},
resolvedPeers: resolvedPeers,
knownPeers: knownPeers,
states: map[string]State{},
stopc: make(chan struct{}),
readyc: make(chan struct{}),
logger: l,
peers: map[string]peer{},
resolvedPeers: resolvedPeers,
resolvePeersTimeout: resolveTimeout,
knownPeers: knownPeers,
}
p.register(reg, name)
@@ -445,7 +451,9 @@ func (p *Peer) reconnect() {
func (p *Peer) refresh() {
logger := p.logger.With("msg", "refresh")
resolvedPeers, err := resolvePeers(context.Background(), p.knownPeers, p.advertiseAddr, &net.Resolver{}, false)
ctx, cancel := context.WithTimeout(context.Background(), p.resolvePeersTimeout)
defer cancel()
resolvedPeers, err := resolvePeers(ctx, p.knownPeers, p.advertiseAddr, &net.Resolver{}, false)
if err != nil {
logger.Debug(fmt.Sprintf("%v", p.knownPeers), "err", err)
return

View File

@@ -53,6 +53,7 @@ func testJoinLeave(t *testing.T) {
DefaultPushPullInterval,
DefaultGossipInterval,
DefaultTCPTimeout,
DefaultResolvePeersTimeout,
DefaultProbeTimeout,
DefaultProbeInterval,
nil,
@@ -89,6 +90,7 @@ func testJoinLeave(t *testing.T) {
DefaultPushPullInterval,
DefaultGossipInterval,
DefaultTCPTimeout,
DefaultResolvePeersTimeout,
DefaultProbeTimeout,
DefaultProbeInterval,
nil,
@@ -126,6 +128,7 @@ func testReconnect(t *testing.T) {
DefaultPushPullInterval,
DefaultGossipInterval,
DefaultTCPTimeout,
DefaultResolvePeersTimeout,
DefaultProbeTimeout,
DefaultProbeInterval,
nil,
@@ -153,6 +156,7 @@ func testReconnect(t *testing.T) {
DefaultPushPullInterval,
DefaultGossipInterval,
DefaultTCPTimeout,
DefaultResolvePeersTimeout,
DefaultProbeTimeout,
DefaultProbeInterval,
nil,
@@ -195,6 +199,7 @@ func testRemoveFailedPeers(t *testing.T) {
DefaultPushPullInterval,
DefaultGossipInterval,
DefaultTCPTimeout,
DefaultResolvePeersTimeout,
DefaultProbeTimeout,
DefaultProbeInterval,
nil,
@@ -248,6 +253,7 @@ func testInitiallyFailingPeers(t *testing.T) {
DefaultPushPullInterval,
DefaultGossipInterval,
DefaultTCPTimeout,
DefaultResolvePeersTimeout,
DefaultProbeTimeout,
DefaultProbeInterval,
nil,
@@ -297,6 +303,7 @@ func testTLSConnection(t *testing.T) {
DefaultPushPullInterval,
DefaultGossipInterval,
DefaultTCPTimeout,
DefaultResolvePeersTimeout,
DefaultProbeTimeout,
DefaultProbeInterval,
tlsTransportConfig1,
@@ -330,6 +337,7 @@ func testTLSConnection(t *testing.T) {
DefaultPushPullInterval,
DefaultGossipInterval,
DefaultTCPTimeout,
DefaultResolvePeersTimeout,
DefaultProbeTimeout,
DefaultProbeInterval,
tlsTransportConfig2,
@@ -369,6 +377,7 @@ func testPeerNames(t *testing.T, name1, name2 string) {
DefaultPushPullInterval,
DefaultGossipInterval,
DefaultTCPTimeout,
DefaultResolvePeersTimeout,
DefaultProbeTimeout,
DefaultProbeInterval,
nil,
@@ -405,6 +414,7 @@ func testPeerNames(t *testing.T, name1, name2 string) {
DefaultPushPullInterval,
DefaultGossipInterval,
DefaultTCPTimeout,
DefaultResolvePeersTimeout,
DefaultProbeTimeout,
DefaultProbeInterval,
nil,

View File

@@ -159,6 +159,7 @@ func run() int {
clusterPeerName = kingpin.Flag("cluster.peer-name", "Explicit name of the peer, rather than generating a random one").Default("").String()
peers = kingpin.Flag("cluster.peer", "Initial peers (may be repeated).").Strings()
peerTimeout = kingpin.Flag("cluster.peer-timeout", "Time to wait between peers to send notifications.").Default("15s").Duration()
peersResolveTimeout = kingpin.Flag("cluster.peers-resolve-timeout", "Time to resolve peers.").Default(cluster.DefaultResolvePeersTimeout.String()).Duration()
gossipInterval = kingpin.Flag("cluster.gossip-interval", "Interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated across the cluster more quickly at the expense of increased bandwidth.").Default(cluster.DefaultGossipInterval.String()).Duration()
pushPullInterval = kingpin.Flag("cluster.pushpull-interval", "Interval for gossip state syncs. Setting this interval lower (more frequent) will increase convergence speeds across larger clusters at the expense of increased bandwidth usage.").Default(cluster.DefaultPushPullInterval.String()).Duration()
tcpTimeout = kingpin.Flag("cluster.tcp-timeout", "Timeout for establishing a stream connection with a remote node for a full state sync, and for stream read and write operations.").Default(cluster.DefaultTCPTimeout.String()).Duration()
@@ -245,6 +246,7 @@ func run() int {
*pushPullInterval,
*gossipInterval,
*tcpTimeout,
*peersResolveTimeout,
*probeTimeout,
*probeInterval,
tlsTransportConfig,