mirror of
https://github.com/openshift/installer.git
synced 2026-02-05 06:46:36 +01:00
Revert "create: add check for cluster operator stability"
This commit is contained in:
@@ -7,7 +7,6 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/pkg/errors"
|
||||
@@ -53,11 +52,6 @@ const (
|
||||
exitCodeInfrastructureFailed
|
||||
exitCodeBootstrapFailed
|
||||
exitCodeInstallFailed
|
||||
exitCodeOperatorStabilityFailed
|
||||
|
||||
// coStabilityThreshold is how long a cluster operator must have Progressing=False
|
||||
// in order to be considered stable. Measured in seconds.
|
||||
coStabilityThreshold float64 = 30
|
||||
)
|
||||
|
||||
// each target is a variable to preserve the order when creating subcommands and still
|
||||
@@ -495,7 +489,7 @@ func waitForInitializedCluster(ctx context.Context, config *rest.Config) error {
|
||||
defer cancel()
|
||||
|
||||
failing := configv1.ClusterStatusConditionType("Failing")
|
||||
timer.StartTimer("Cluster Operators Available")
|
||||
timer.StartTimer("Cluster Operators")
|
||||
var lastError string
|
||||
_, err = clientwatch.UntilWithSync(
|
||||
clusterVersionContext,
|
||||
@@ -513,7 +507,7 @@ func waitForInitializedCluster(ctx context.Context, config *rest.Config) error {
|
||||
if cov1helpers.IsStatusConditionTrue(cv.Status.Conditions, configv1.OperatorAvailable) &&
|
||||
cov1helpers.IsStatusConditionFalse(cv.Status.Conditions, failing) &&
|
||||
cov1helpers.IsStatusConditionFalse(cv.Status.Conditions, configv1.OperatorProgressing) {
|
||||
timer.StopTimer("Cluster Operators Available")
|
||||
timer.StopTimer("Cluster Operators")
|
||||
return true, nil
|
||||
}
|
||||
if cov1helpers.IsStatusConditionTrue(cv.Status.Conditions, failing) {
|
||||
@@ -545,57 +539,6 @@ func waitForInitializedCluster(ctx context.Context, config *rest.Config) error {
|
||||
return errors.Wrap(err, "failed to initialize the cluster")
|
||||
}
|
||||
|
||||
// waitForStableOperators ensures that each cluster operator is "stable", i.e. the
|
||||
// operator has not been in a progressing state for at least a certain duration,
|
||||
// 30 seconds by default. Returns an error if any operator does meet this threshold
|
||||
// after a deadline, 5 minutes by default.
|
||||
func waitForStableOperators(ctx context.Context, config *rest.Config) error {
|
||||
timer.StartTimer("Cluster Operators Stable")
|
||||
|
||||
stabilityCheckDuration := 5 * time.Minute
|
||||
stabilityContext, cancel := context.WithTimeout(ctx, stabilityCheckDuration)
|
||||
defer cancel()
|
||||
|
||||
untilTime := time.Now().Add(stabilityCheckDuration)
|
||||
logrus.Infof("Waiting up to %v (until %v) to ensure each cluster operator has finished progressing...",
|
||||
stabilityCheckDuration, untilTime.Format(time.Kitchen))
|
||||
|
||||
cc, err := configclient.NewForConfig(config)
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "failed to create a config client")
|
||||
}
|
||||
|
||||
coNames, err := getClusterOperatorNames(ctx, cc)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// stabilityCheck closure maintains state of whether any cluster operator
|
||||
// encounters a stability error
|
||||
stabilityCheck := coStabilityChecker()
|
||||
|
||||
var wg sync.WaitGroup
|
||||
for _, co := range coNames {
|
||||
wg.Add(1)
|
||||
go func(co string) {
|
||||
defer wg.Done()
|
||||
status, statusErr := getCOProgressingStatus(stabilityContext, cc, co)
|
||||
err = stabilityCheck(co, status, statusErr)
|
||||
}(co)
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
timer.StopTimer("Cluster Operators Stable")
|
||||
|
||||
logrus.Info("All cluster operators have completed progressing")
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// getConsole returns the console URL from the route 'console' in namespace openshift-console
|
||||
func getConsole(ctx context.Context, config *rest.Config) (string, error) {
|
||||
url := ""
|
||||
@@ -678,10 +621,6 @@ func waitForInstallComplete(ctx context.Context, config *rest.Config, directory
|
||||
return err
|
||||
}
|
||||
|
||||
if err := waitForStableOperators(ctx, config); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
consoleURL, err := getConsole(ctx, config)
|
||||
if err == nil {
|
||||
if err = addRouterCAToClusterCA(ctx, config, rootOpts.dir); err != nil {
|
||||
@@ -700,90 +639,3 @@ The cluster should be accessible for troubleshooting as detailed in the document
|
||||
https://docs.openshift.com/container-platform/latest/support/troubleshooting/troubleshooting-installations.html
|
||||
The 'wait-for install-complete' subcommand can then be used to continue the installation`)
|
||||
}
|
||||
|
||||
func getClusterOperatorNames(ctx context.Context, cc *configclient.Clientset) ([]string, error) {
|
||||
listCtx, cancel := context.WithTimeout(ctx, 1*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
cos, err := cc.ConfigV1().ClusterOperators().List(listCtx, metav1.ListOptions{})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
names := []string{}
|
||||
for _, v := range cos.Items {
|
||||
names = append(names, v.Name)
|
||||
}
|
||||
return names, nil
|
||||
}
|
||||
|
||||
func getCOProgressingStatus(ctx context.Context, cc *configclient.Clientset, name string) (*configv1.ClusterOperatorStatusCondition, error) {
|
||||
coListWatcher := cache.NewListWatchFromClient(cc.ConfigV1().RESTClient(),
|
||||
"clusteroperators",
|
||||
"",
|
||||
fields.OneTermEqualSelector("metadata.name", name))
|
||||
|
||||
var pStatus *configv1.ClusterOperatorStatusCondition
|
||||
|
||||
_, err := clientwatch.UntilWithSync(
|
||||
ctx,
|
||||
coListWatcher,
|
||||
&configv1.ClusterOperator{},
|
||||
nil,
|
||||
func(event watch.Event) (bool, error) {
|
||||
switch event.Type {
|
||||
case watch.Added, watch.Modified:
|
||||
cos, ok := event.Object.(*configv1.ClusterOperator)
|
||||
if !ok {
|
||||
logrus.Debugf("Cluster Operator %s status not found", name)
|
||||
return false, nil
|
||||
}
|
||||
progressing := cov1helpers.FindStatusCondition(cos.Status.Conditions, configv1.OperatorProgressing)
|
||||
if progressing == nil {
|
||||
logrus.Debugf("Cluster Operator %s progressing == nil", name)
|
||||
return false, nil
|
||||
}
|
||||
pStatus = progressing
|
||||
|
||||
if meetsStabilityThreshold(pStatus) {
|
||||
logrus.Debugf("Cluster Operator %s is stable", name)
|
||||
return true, nil
|
||||
}
|
||||
logrus.Debugf("Cluster Operator %s is Progressing=%s LastTransitionTime=%v DurationSinceTransition=%.fs Reason=%s Message=%s", name, progressing.Status, progressing.LastTransitionTime.Time, time.Since(progressing.LastTransitionTime.Time).Seconds(), progressing.Reason, progressing.Message)
|
||||
}
|
||||
return false, nil
|
||||
},
|
||||
)
|
||||
return pStatus, err
|
||||
}
|
||||
|
||||
// coStabilityChecker returns a closure which references a shared error variable. err
|
||||
// tracks whether any operator has had a stability error. The closure function will
|
||||
// return an error if any operator has had an instability error, even if the operator
|
||||
// currently being checked is stable.
|
||||
func coStabilityChecker() func(string, *configv1.ClusterOperatorStatusCondition, error) error {
|
||||
var err error
|
||||
|
||||
return func(name string, status *configv1.ClusterOperatorStatusCondition, statusErr error) error {
|
||||
if statusErr == nil {
|
||||
return err
|
||||
}
|
||||
if !errors.Is(statusErr, wait.ErrWaitTimeout) {
|
||||
logrus.Errorf("Error checking cluster operator %s Progressing status: %q", name, statusErr)
|
||||
logrus.Exit(exitCodeOperatorStabilityFailed)
|
||||
err = errors.New("cluster operators are not stable")
|
||||
}
|
||||
if meetsStabilityThreshold(status) {
|
||||
logrus.Debugf("Cluster operator %s is now stable: Progressing=%s LastTransitionTime=%v DurationSinceTransition=%.fs Reason=%s Message=%s", name, status.Status, status.LastTransitionTime.Time, time.Since(status.LastTransitionTime.Time).Seconds(), status.Reason, status.Message)
|
||||
} else {
|
||||
logrus.Errorf("Cluster operator %s does not meet stability threshold of Progressing=false for greater than %.f seconds with Reason: %q and Message: %q", name, coStabilityThreshold, status.Reason, status.Message)
|
||||
logrus.Exit(exitCodeOperatorStabilityFailed)
|
||||
err = errors.New("cluster operators are not stable")
|
||||
}
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
func meetsStabilityThreshold(progressing *configv1.ClusterOperatorStatusCondition) bool {
|
||||
return progressing.Status == configv1.ConditionFalse && time.Since(progressing.LastTransitionTime.Time).Seconds() > coStabilityThreshold
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user