mirror of
https://github.com/prometheus/alertmanager.git
synced 2026-02-05 06:45:45 +01:00
fix: set context timeout for resolvePeers (#4343)
* fix: set context timeout for resolvePeers. resolvePeers is context-aware, yet it is called with a background context, which can turn it into a blocking call. Under the hood it calls `LookupIPAddr`, which performs a DNS lookup that may block on some platforms. Because the call is made on the main goroutine, a blocked DNS lookup leaves that goroutine hanging, which makes the application hang and become unresponsive and causes the readiness check to fail. --------- Signed-off-by: emreya <e.yazici1990@gmail.com>
This commit is contained in:
@@ -358,6 +358,7 @@ be configured to communicate with each other. This is configured using the
|
||||
- `--cluster.advertise-address` string: cluster advertise address
|
||||
- `--cluster.peer` value: initial peers (repeat flag for each additional peer)
|
||||
- `--cluster.peer-timeout` value: peer timeout period (default "15s")
|
||||
- `--cluster.peers-resolve-timeout` value: timeout period for resolving peers (default "15s")
|
||||
- `--cluster.gossip-interval` value: cluster message propagation speed
|
||||
(default "200ms")
|
||||
- `--cluster.pushpull-interval` value: lower values will increase
|
||||
|
||||
@@ -60,7 +60,8 @@ type Peer struct {
|
||||
mlist *memberlist.Memberlist
|
||||
delegate *delegate
|
||||
|
||||
resolvedPeers []string
|
||||
resolvedPeers []string
|
||||
resolvePeersTimeout time.Duration
|
||||
|
||||
mtx sync.RWMutex
|
||||
states map[string]State
|
||||
@@ -117,15 +118,16 @@ func (s PeerStatus) String() string {
|
||||
}
|
||||
|
||||
const (
|
||||
DefaultPushPullInterval = 60 * time.Second
|
||||
DefaultGossipInterval = 200 * time.Millisecond
|
||||
DefaultTCPTimeout = 10 * time.Second
|
||||
DefaultProbeTimeout = 500 * time.Millisecond
|
||||
DefaultProbeInterval = 1 * time.Second
|
||||
DefaultReconnectInterval = 10 * time.Second
|
||||
DefaultReconnectTimeout = 6 * time.Hour
|
||||
DefaultRefreshInterval = 15 * time.Second
|
||||
MaxGossipPacketSize = 1400
|
||||
DefaultPushPullInterval = 60 * time.Second
|
||||
DefaultGossipInterval = 200 * time.Millisecond
|
||||
DefaultTCPTimeout = 10 * time.Second
|
||||
DefaultProbeTimeout = 500 * time.Millisecond
|
||||
DefaultProbeInterval = 1 * time.Second
|
||||
DefaultReconnectInterval = 10 * time.Second
|
||||
DefaultReconnectTimeout = 6 * time.Hour
|
||||
DefaultRefreshInterval = 15 * time.Second
|
||||
DefaultResolvePeersTimeout = 15 * time.Second
|
||||
MaxGossipPacketSize = 1400
|
||||
)
|
||||
|
||||
func Create(
|
||||
@@ -138,6 +140,7 @@ func Create(
|
||||
pushPullInterval time.Duration,
|
||||
gossipInterval time.Duration,
|
||||
tcpTimeout time.Duration,
|
||||
resolveTimeout time.Duration,
|
||||
probeTimeout time.Duration,
|
||||
probeInterval time.Duration,
|
||||
tlsTransportConfig *TLSTransportConfig,
|
||||
@@ -168,7 +171,9 @@ func Create(
|
||||
}
|
||||
}
|
||||
|
||||
resolvedPeers, err := resolvePeers(context.Background(), knownPeers, advertiseAddr, &net.Resolver{}, waitIfEmpty)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), resolveTimeout)
|
||||
defer cancel()
|
||||
resolvedPeers, err := resolvePeers(ctx, knownPeers, advertiseAddr, &net.Resolver{}, waitIfEmpty)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("resolve peers: %w", err)
|
||||
}
|
||||
@@ -199,13 +204,14 @@ func Create(
|
||||
}
|
||||
|
||||
p := &Peer{
|
||||
states: map[string]State{},
|
||||
stopc: make(chan struct{}),
|
||||
readyc: make(chan struct{}),
|
||||
logger: l,
|
||||
peers: map[string]peer{},
|
||||
resolvedPeers: resolvedPeers,
|
||||
knownPeers: knownPeers,
|
||||
states: map[string]State{},
|
||||
stopc: make(chan struct{}),
|
||||
readyc: make(chan struct{}),
|
||||
logger: l,
|
||||
peers: map[string]peer{},
|
||||
resolvedPeers: resolvedPeers,
|
||||
resolvePeersTimeout: resolveTimeout,
|
||||
knownPeers: knownPeers,
|
||||
}
|
||||
|
||||
p.register(reg, name)
|
||||
@@ -445,7 +451,9 @@ func (p *Peer) reconnect() {
|
||||
func (p *Peer) refresh() {
|
||||
logger := p.logger.With("msg", "refresh")
|
||||
|
||||
resolvedPeers, err := resolvePeers(context.Background(), p.knownPeers, p.advertiseAddr, &net.Resolver{}, false)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), p.resolvePeersTimeout)
|
||||
defer cancel()
|
||||
resolvedPeers, err := resolvePeers(ctx, p.knownPeers, p.advertiseAddr, &net.Resolver{}, false)
|
||||
if err != nil {
|
||||
logger.Debug(fmt.Sprintf("%v", p.knownPeers), "err", err)
|
||||
return
|
||||
|
||||
@@ -53,6 +53,7 @@ func testJoinLeave(t *testing.T) {
|
||||
DefaultPushPullInterval,
|
||||
DefaultGossipInterval,
|
||||
DefaultTCPTimeout,
|
||||
DefaultResolvePeersTimeout,
|
||||
DefaultProbeTimeout,
|
||||
DefaultProbeInterval,
|
||||
nil,
|
||||
@@ -89,6 +90,7 @@ func testJoinLeave(t *testing.T) {
|
||||
DefaultPushPullInterval,
|
||||
DefaultGossipInterval,
|
||||
DefaultTCPTimeout,
|
||||
DefaultResolvePeersTimeout,
|
||||
DefaultProbeTimeout,
|
||||
DefaultProbeInterval,
|
||||
nil,
|
||||
@@ -126,6 +128,7 @@ func testReconnect(t *testing.T) {
|
||||
DefaultPushPullInterval,
|
||||
DefaultGossipInterval,
|
||||
DefaultTCPTimeout,
|
||||
DefaultResolvePeersTimeout,
|
||||
DefaultProbeTimeout,
|
||||
DefaultProbeInterval,
|
||||
nil,
|
||||
@@ -153,6 +156,7 @@ func testReconnect(t *testing.T) {
|
||||
DefaultPushPullInterval,
|
||||
DefaultGossipInterval,
|
||||
DefaultTCPTimeout,
|
||||
DefaultResolvePeersTimeout,
|
||||
DefaultProbeTimeout,
|
||||
DefaultProbeInterval,
|
||||
nil,
|
||||
@@ -195,6 +199,7 @@ func testRemoveFailedPeers(t *testing.T) {
|
||||
DefaultPushPullInterval,
|
||||
DefaultGossipInterval,
|
||||
DefaultTCPTimeout,
|
||||
DefaultResolvePeersTimeout,
|
||||
DefaultProbeTimeout,
|
||||
DefaultProbeInterval,
|
||||
nil,
|
||||
@@ -248,6 +253,7 @@ func testInitiallyFailingPeers(t *testing.T) {
|
||||
DefaultPushPullInterval,
|
||||
DefaultGossipInterval,
|
||||
DefaultTCPTimeout,
|
||||
DefaultResolvePeersTimeout,
|
||||
DefaultProbeTimeout,
|
||||
DefaultProbeInterval,
|
||||
nil,
|
||||
@@ -297,6 +303,7 @@ func testTLSConnection(t *testing.T) {
|
||||
DefaultPushPullInterval,
|
||||
DefaultGossipInterval,
|
||||
DefaultTCPTimeout,
|
||||
DefaultResolvePeersTimeout,
|
||||
DefaultProbeTimeout,
|
||||
DefaultProbeInterval,
|
||||
tlsTransportConfig1,
|
||||
@@ -330,6 +337,7 @@ func testTLSConnection(t *testing.T) {
|
||||
DefaultPushPullInterval,
|
||||
DefaultGossipInterval,
|
||||
DefaultTCPTimeout,
|
||||
DefaultResolvePeersTimeout,
|
||||
DefaultProbeTimeout,
|
||||
DefaultProbeInterval,
|
||||
tlsTransportConfig2,
|
||||
@@ -369,6 +377,7 @@ func testPeerNames(t *testing.T, name1, name2 string) {
|
||||
DefaultPushPullInterval,
|
||||
DefaultGossipInterval,
|
||||
DefaultTCPTimeout,
|
||||
DefaultResolvePeersTimeout,
|
||||
DefaultProbeTimeout,
|
||||
DefaultProbeInterval,
|
||||
nil,
|
||||
@@ -405,6 +414,7 @@ func testPeerNames(t *testing.T, name1, name2 string) {
|
||||
DefaultPushPullInterval,
|
||||
DefaultGossipInterval,
|
||||
DefaultTCPTimeout,
|
||||
DefaultResolvePeersTimeout,
|
||||
DefaultProbeTimeout,
|
||||
DefaultProbeInterval,
|
||||
nil,
|
||||
|
||||
@@ -159,6 +159,7 @@ func run() int {
|
||||
clusterPeerName = kingpin.Flag("cluster.peer-name", "Explicit name of the peer, rather than generating a random one").Default("").String()
|
||||
peers = kingpin.Flag("cluster.peer", "Initial peers (may be repeated).").Strings()
|
||||
peerTimeout = kingpin.Flag("cluster.peer-timeout", "Time to wait between peers to send notifications.").Default("15s").Duration()
|
||||
peersResolveTimeout = kingpin.Flag("cluster.peers-resolve-timeout", "Time to resolve peers.").Default(cluster.DefaultResolvePeersTimeout.String()).Duration()
|
||||
gossipInterval = kingpin.Flag("cluster.gossip-interval", "Interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated across the cluster more quickly at the expense of increased bandwidth.").Default(cluster.DefaultGossipInterval.String()).Duration()
|
||||
pushPullInterval = kingpin.Flag("cluster.pushpull-interval", "Interval for gossip state syncs. Setting this interval lower (more frequent) will increase convergence speeds across larger clusters at the expense of increased bandwidth usage.").Default(cluster.DefaultPushPullInterval.String()).Duration()
|
||||
tcpTimeout = kingpin.Flag("cluster.tcp-timeout", "Timeout for establishing a stream connection with a remote node for a full state sync, and for stream read and write operations.").Default(cluster.DefaultTCPTimeout.String()).Duration()
|
||||
@@ -245,6 +246,7 @@ func run() int {
|
||||
*pushPullInterval,
|
||||
*gossipInterval,
|
||||
*tcpTimeout,
|
||||
*peersResolveTimeout,
|
||||
*probeTimeout,
|
||||
*probeInterval,
|
||||
tlsTransportConfig,
|
||||
|
||||
Reference in New Issue
Block a user