diff --git a/README.md b/README.md index bcde889fa..2fdfdd976 100644 --- a/README.md +++ b/README.md @@ -358,6 +358,7 @@ be configured to communicate with each other. This is configured using the - `--cluster.advertise-address` string: cluster advertise address - `--cluster.peer` value: initial peers (repeat flag for each additional peer) - `--cluster.peer-timeout` value: peer timeout period (default "15s") +- `--cluster.peers-resolve-timeout` value: peer resolution timeout period (default "15s") - `--cluster.gossip-interval` value: cluster message propagation speed (default "200ms") - `--cluster.pushpull-interval` value: lower values will increase diff --git a/cluster/cluster.go b/cluster/cluster.go index fbd425f0e..8eb8f60e2 100644 --- a/cluster/cluster.go +++ b/cluster/cluster.go @@ -60,7 +60,8 @@ type Peer struct { mlist *memberlist.Memberlist delegate *delegate - resolvedPeers []string + resolvedPeers []string + resolvePeersTimeout time.Duration mtx sync.RWMutex states map[string]State @@ -117,15 +118,16 @@ func (s PeerStatus) String() string { } const ( - DefaultPushPullInterval = 60 * time.Second - DefaultGossipInterval = 200 * time.Millisecond - DefaultTCPTimeout = 10 * time.Second - DefaultProbeTimeout = 500 * time.Millisecond - DefaultProbeInterval = 1 * time.Second - DefaultReconnectInterval = 10 * time.Second - DefaultReconnectTimeout = 6 * time.Hour - DefaultRefreshInterval = 15 * time.Second - MaxGossipPacketSize = 1400 + DefaultPushPullInterval = 60 * time.Second + DefaultGossipInterval = 200 * time.Millisecond + DefaultTCPTimeout = 10 * time.Second + DefaultProbeTimeout = 500 * time.Millisecond + DefaultProbeInterval = 1 * time.Second + DefaultReconnectInterval = 10 * time.Second + DefaultReconnectTimeout = 6 * time.Hour + DefaultRefreshInterval = 15 * time.Second + DefaultResolvePeersTimeout = 15 * time.Second + MaxGossipPacketSize = 1400 ) func Create( @@ -138,6 +140,7 @@ func Create( pushPullInterval time.Duration, gossipInterval time.Duration, 
tcpTimeout time.Duration, + resolveTimeout time.Duration, probeTimeout time.Duration, probeInterval time.Duration, tlsTransportConfig *TLSTransportConfig, @@ -168,7 +171,9 @@ func Create( } } - resolvedPeers, err := resolvePeers(context.Background(), knownPeers, advertiseAddr, &net.Resolver{}, waitIfEmpty) + ctx, cancel := context.WithTimeout(context.Background(), resolveTimeout) + defer cancel() + resolvedPeers, err := resolvePeers(ctx, knownPeers, advertiseAddr, &net.Resolver{}, waitIfEmpty) if err != nil { return nil, fmt.Errorf("resolve peers: %w", err) } @@ -199,13 +204,14 @@ func Create( } p := &Peer{ - states: map[string]State{}, - stopc: make(chan struct{}), - readyc: make(chan struct{}), - logger: l, - peers: map[string]peer{}, - resolvedPeers: resolvedPeers, - knownPeers: knownPeers, + states: map[string]State{}, + stopc: make(chan struct{}), + readyc: make(chan struct{}), + logger: l, + peers: map[string]peer{}, + resolvedPeers: resolvedPeers, + resolvePeersTimeout: resolveTimeout, + knownPeers: knownPeers, } p.register(reg, name) @@ -445,7 +451,9 @@ func (p *Peer) reconnect() { func (p *Peer) refresh() { logger := p.logger.With("msg", "refresh") - resolvedPeers, err := resolvePeers(context.Background(), p.knownPeers, p.advertiseAddr, &net.Resolver{}, false) + ctx, cancel := context.WithTimeout(context.Background(), p.resolvePeersTimeout) + defer cancel() + resolvedPeers, err := resolvePeers(ctx, p.knownPeers, p.advertiseAddr, &net.Resolver{}, false) if err != nil { logger.Debug(fmt.Sprintf("%v", p.knownPeers), "err", err) return diff --git a/cluster/cluster_test.go b/cluster/cluster_test.go index 0b2f58052..33295f842 100644 --- a/cluster/cluster_test.go +++ b/cluster/cluster_test.go @@ -53,6 +53,7 @@ func testJoinLeave(t *testing.T) { DefaultPushPullInterval, DefaultGossipInterval, DefaultTCPTimeout, + DefaultResolvePeersTimeout, DefaultProbeTimeout, DefaultProbeInterval, nil, @@ -89,6 +90,7 @@ func testJoinLeave(t *testing.T) { DefaultPushPullInterval, 
DefaultGossipInterval, DefaultTCPTimeout, + DefaultResolvePeersTimeout, DefaultProbeTimeout, DefaultProbeInterval, nil, @@ -126,6 +128,7 @@ func testReconnect(t *testing.T) { DefaultPushPullInterval, DefaultGossipInterval, DefaultTCPTimeout, + DefaultResolvePeersTimeout, DefaultProbeTimeout, DefaultProbeInterval, nil, @@ -153,6 +156,7 @@ func testReconnect(t *testing.T) { DefaultPushPullInterval, DefaultGossipInterval, DefaultTCPTimeout, + DefaultResolvePeersTimeout, DefaultProbeTimeout, DefaultProbeInterval, nil, @@ -195,6 +199,7 @@ func testRemoveFailedPeers(t *testing.T) { DefaultPushPullInterval, DefaultGossipInterval, DefaultTCPTimeout, + DefaultResolvePeersTimeout, DefaultProbeTimeout, DefaultProbeInterval, nil, @@ -248,6 +253,7 @@ func testInitiallyFailingPeers(t *testing.T) { DefaultPushPullInterval, DefaultGossipInterval, DefaultTCPTimeout, + DefaultResolvePeersTimeout, DefaultProbeTimeout, DefaultProbeInterval, nil, @@ -297,6 +303,7 @@ func testTLSConnection(t *testing.T) { DefaultPushPullInterval, DefaultGossipInterval, DefaultTCPTimeout, + DefaultResolvePeersTimeout, DefaultProbeTimeout, DefaultProbeInterval, tlsTransportConfig1, @@ -330,6 +337,7 @@ func testTLSConnection(t *testing.T) { DefaultPushPullInterval, DefaultGossipInterval, DefaultTCPTimeout, + DefaultResolvePeersTimeout, DefaultProbeTimeout, DefaultProbeInterval, tlsTransportConfig2, @@ -369,6 +377,7 @@ func testPeerNames(t *testing.T, name1, name2 string) { DefaultPushPullInterval, DefaultGossipInterval, DefaultTCPTimeout, + DefaultResolvePeersTimeout, DefaultProbeTimeout, DefaultProbeInterval, nil, @@ -405,6 +414,7 @@ func testPeerNames(t *testing.T, name1, name2 string) { DefaultPushPullInterval, DefaultGossipInterval, DefaultTCPTimeout, + DefaultResolvePeersTimeout, DefaultProbeTimeout, DefaultProbeInterval, nil, diff --git a/cmd/alertmanager/main.go b/cmd/alertmanager/main.go index d29f0576c..3e0bb994e 100644 --- a/cmd/alertmanager/main.go +++ b/cmd/alertmanager/main.go @@ -159,6 +159,7 
@@ func run() int { clusterPeerName = kingpin.Flag("cluster.peer-name", "Explicit name of the peer, rather than generating a random one").Default("").String() peers = kingpin.Flag("cluster.peer", "Initial peers (may be repeated).").Strings() peerTimeout = kingpin.Flag("cluster.peer-timeout", "Time to wait between peers to send notifications.").Default("15s").Duration() + peersResolveTimeout = kingpin.Flag("cluster.peers-resolve-timeout", "Time to resolve peers.").Default(cluster.DefaultResolvePeersTimeout.String()).Duration() gossipInterval = kingpin.Flag("cluster.gossip-interval", "Interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated across the cluster more quickly at the expense of increased bandwidth.").Default(cluster.DefaultGossipInterval.String()).Duration() pushPullInterval = kingpin.Flag("cluster.pushpull-interval", "Interval for gossip state syncs. Setting this interval lower (more frequent) will increase convergence speeds across larger clusters at the expense of increased bandwidth usage.").Default(cluster.DefaultPushPullInterval.String()).Duration() tcpTimeout = kingpin.Flag("cluster.tcp-timeout", "Timeout for establishing a stream connection with a remote node for a full state sync, and for stream read and write operations.").Default(cluster.DefaultTCPTimeout.String()).Duration() @@ -245,6 +246,7 @@ func run() int { *pushPullInterval, *gossipInterval, *tcpTimeout, + *peersResolveTimeout, *probeTimeout, *probeInterval, tlsTransportConfig,