
Add assisted-service validations and events to monitor-add-nodes

Richard Su
2024-04-15 20:42:13 -04:00
parent 6c69310306
commit 51a57560cf
5 changed files with 138 additions and 92 deletions

View File

@@ -10,6 +10,7 @@ import (
"github.com/openshift/installer/cmd/openshift-install/command"
agentpkg "github.com/openshift/installer/pkg/agent"
"github.com/openshift/installer/pkg/asset/agent/workflow"
)
const (
@@ -71,7 +72,7 @@ func newWaitForBootstrapCompleteCmd() *cobra.Command {
}
ctx := context.Background()
cluster, err := agentpkg.NewCluster(ctx, kubeconfigPath, rendezvousIP, sshKey)
cluster, err := agentpkg.NewCluster(ctx, kubeconfigPath, rendezvousIP, sshKey, workflow.AgentWorkflowTypeInstall)
if err != nil {
logrus.Exit(exitCodeBootstrapFailed)
}
@@ -106,7 +107,7 @@ func newWaitForInstallCompleteCmd() *cobra.Command {
}
ctx := context.Background()
cluster, err := agentpkg.NewCluster(ctx, kubeconfigPath, rendezvousIP, sshKey)
cluster, err := agentpkg.NewCluster(ctx, kubeconfigPath, rendezvousIP, sshKey, workflow.AgentWorkflowTypeInstall)
if err != nil {
logrus.Exit(exitCodeBootstrapFailed)
}

View File

@@ -2,6 +2,7 @@ package agent
import (
"context"
"fmt"
"net"
"os"
"path/filepath"
@@ -14,6 +15,7 @@ import (
"github.com/openshift/assisted-service/client/installer"
"github.com/openshift/assisted-service/models"
"github.com/openshift/installer/pkg/asset/agent/workflow"
"github.com/openshift/installer/pkg/gather/ssh"
)
@@ -27,6 +29,7 @@ type Cluster struct {
clusterID *strfmt.UUID
clusterInfraEnvID *strfmt.UUID
installHistory *clusterInstallStatusHistory
workflow workflow.AgentWorkflowType
}
type clientSet struct {
@@ -63,7 +66,7 @@ type clusterInstallStatusHistory struct {
}
// NewCluster initializes a Cluster object
func NewCluster(ctx context.Context, kubeconfigPath, rendezvousIP, sshKey string) (*Cluster, error) {
func NewCluster(ctx context.Context, kubeconfigPath, rendezvousIP, sshKey string, workflowType workflow.AgentWorkflowType) (*Cluster, error) {
czero := &Cluster{}
capi := &clientSet{}
@@ -108,6 +111,7 @@ func NewCluster(ctx context.Context, kubeconfigPath, rendezvousIP, sshKey string
czero.Ctx = ctx
czero.API = capi
czero.workflow = workflowType
czero.clusterID = nil
czero.clusterInfraEnvID = nil
czero.assetDir = kubeconfigPath
@@ -167,7 +171,6 @@ func (czero *Cluster) IsBootstrapComplete() (bool, bool, error) {
if configmap {
logrus.Info("Bootstrap configMap status is complete")
czero.installHistory.ClusterBootstrapComplete = true
return true, false, nil
}
if err != nil {
logrus.Debug(err)
@@ -176,104 +179,121 @@ func (czero *Cluster) IsBootstrapComplete() (bool, bool, error) {
// Agent Rest API is available
if agentRestAPILive {
// First time we see the agent Rest API
if !czero.installHistory.RestAPISeen {
logrus.Debug("Agent Rest API Initialized")
czero.installHistory.RestAPISeen = true
czero.installHistory.NotReadyTime = time.Now()
}
// Lazy loading of the clusterID and clusterInfraEnvID
if czero.clusterID == nil {
clusterID, err := czero.API.Rest.getClusterID()
if err != nil {
return false, false, errors.Wrap(err, "Unable to retrieve clusterID from Agent Rest API")
}
czero.clusterID = clusterID
}
if czero.clusterInfraEnvID == nil {
clusterInfraEnvID, err := czero.API.Rest.getClusterInfraEnvID()
if err != nil {
return false, false, errors.Wrap(err, "Unable to retrieve clusterInfraEnvID from Agent Rest API")
}
czero.clusterInfraEnvID = clusterInfraEnvID
}
// Getting cluster metadata from Agent Rest API
clusterMetadata, err := czero.GetClusterRestAPIMetadata()
isBootstrapComplete, exitOnErr, err := czero.LogAssistedServiceStatus()
if err != nil {
return false, false, errors.Wrap(err, "Unable to retrieve cluster metadata from Agent Rest API")
return isBootstrapComplete, exitOnErr, err
}
}
if clusterMetadata == nil {
return false, false, errors.New("cluster metadata returned nil from Agent Rest API")
// cluster bootstrap is not complete
return false, false, nil
}
func (czero *Cluster) LogAssistedServiceStatus() (bool, bool, error) {
resource := "cluster"
logPrefix := ""
if czero.workflow == workflow.AgentWorkflowTypeAddNodes {
resource = "host"
logPrefix = fmt.Sprintf("Node %s: ", czero.API.Rest.NodeZeroIP)
}
// First time we see the agent Rest API
if !czero.installHistory.RestAPISeen {
logrus.Debugf("%sAgent Rest API Initialized", logPrefix)
czero.installHistory.RestAPISeen = true
czero.installHistory.NotReadyTime = time.Now()
}
// Lazy loading of the clusterID and clusterInfraEnvID
if czero.clusterID == nil {
clusterID, err := czero.API.Rest.getClusterID()
if err != nil {
return false, false, errors.Wrap(err, "Unable to retrieve clusterID from Agent Rest API")
}
czero.clusterID = clusterID
}
czero.PrintInstallStatus(clusterMetadata)
if czero.clusterInfraEnvID == nil {
clusterInfraEnvID, err := czero.API.Rest.getClusterInfraEnvID()
if err != nil {
return false, false, errors.Wrap(err, "Unable to retrieve clusterInfraEnvID from Agent Rest API")
}
czero.clusterInfraEnvID = clusterInfraEnvID
}
// If status indicates pending action, log host info to help pinpoint what is missing
if (*clusterMetadata.Status != czero.installHistory.RestAPIPreviousClusterStatus) &&
(*clusterMetadata.Status == models.ClusterStatusInstallingPendingUserAction) {
for _, host := range clusterMetadata.Hosts {
if *host.Status == models.ClusterStatusInstallingPendingUserAction {
// Getting cluster metadata from Agent Rest API
clusterMetadata, err := czero.GetClusterRestAPIMetadata()
if err != nil {
return false, false, errors.Wrap(err, "Unable to retrieve cluster metadata from Agent Rest API")
}
if clusterMetadata == nil {
return false, false, errors.New("cluster metadata returned nil from Agent Rest API")
}
czero.PrintInstallStatus(clusterMetadata)
// If status indicates pending action, log host info to help pinpoint what is missing
if (*clusterMetadata.Status != czero.installHistory.RestAPIPreviousClusterStatus) &&
(*clusterMetadata.Status == models.ClusterStatusInstallingPendingUserAction) {
for _, host := range clusterMetadata.Hosts {
if *host.Status == models.ClusterStatusInstallingPendingUserAction {
if logPrefix != "" {
logrus.Warningf("%s%s %s", logPrefix, host.RequestedHostname, *host.StatusInfo)
} else {
logrus.Warningf("Host %s %s", host.RequestedHostname, *host.StatusInfo)
}
}
}
}
if *clusterMetadata.Status == models.ClusterStatusReady {
stuck, err := czero.IsClusterStuckInReady()
if err != nil {
return false, stuck, err
}
} else {
czero.installHistory.NotReadyTime = time.Now()
}
czero.installHistory.RestAPIPreviousClusterStatus = *clusterMetadata.Status
installing, _ := czero.IsInstalling(*clusterMetadata.Status)
if !installing {
errored, _ := czero.HasErrored(*clusterMetadata.Status)
if errored {
return false, false, errors.New("cluster has stopped installing... working to recover installation")
} else if *clusterMetadata.Status == models.ClusterStatusCancelled {
return false, true, errors.New("cluster installation was cancelled")
}
}
validationsErr := checkValidations(clusterMetadata, czero.installHistory.ValidationResults, logrus.StandardLogger())
if validationsErr != nil {
return false, false, errors.Wrap(validationsErr, "cluster host validations failed")
}
// Print most recent event associated with the clusterInfraEnvID
eventList, err := czero.API.Rest.GetInfraEnvEvents(czero.clusterInfraEnvID)
if *clusterMetadata.Status == models.ClusterStatusReady {
stuck, err := czero.IsClusterStuckInReady()
if err != nil {
return false, false, errors.Wrap(err, "Unable to retrieve events about the cluster from the Agent Rest API")
return false, stuck, err
}
if len(eventList) == 0 {
// No cluster events detected from the Agent Rest API
} else {
mostRecentEvent := eventList[len(eventList)-1]
// Don't print the same status message back to back
if *mostRecentEvent.Message != czero.installHistory.RestAPIPreviousEventMessage {
if *mostRecentEvent.Severity == models.EventSeverityInfo {
logrus.Info(*mostRecentEvent.Message)
} else {
logrus.Warn(*mostRecentEvent.Message)
}
}
czero.installHistory.RestAPIPreviousEventMessage = *mostRecentEvent.Message
czero.installHistory.RestAPIInfraEnvEventList = eventList
} else {
czero.installHistory.NotReadyTime = time.Now()
}
czero.installHistory.RestAPIPreviousClusterStatus = *clusterMetadata.Status
installing, _ := czero.IsInstalling(*clusterMetadata.Status)
if !installing {
errored, _ := czero.HasErrored(*clusterMetadata.Status)
if errored {
return false, false, errors.New(fmt.Sprintf("%s has stopped installing... working to recover installation", resource))
} else if *clusterMetadata.Status == models.ClusterStatusCancelled {
return false, true, errors.New(fmt.Sprintf("%s installation was cancelled", resource))
}
}
validationsErr := checkValidations(clusterMetadata, czero.installHistory.ValidationResults, logrus.StandardLogger(), logPrefix)
if validationsErr != nil {
return false, false, errors.Wrap(validationsErr, "host validations failed")
}
// cluster bootstrap is not complete
// Print most recent event associated with the clusterInfraEnvID
eventList, err := czero.API.Rest.GetInfraEnvEvents(czero.clusterInfraEnvID)
if err != nil {
return false, false, errors.Wrap(err, fmt.Sprintf("Unable to retrieve events about the %s from the Agent Rest API", resource))
}
if len(eventList) == 0 {
// No cluster events detected from the Agent Rest API
} else {
mostRecentEvent := eventList[len(eventList)-1]
// Don't print the same status message back to back
if *mostRecentEvent.Message != czero.installHistory.RestAPIPreviousEventMessage {
if *mostRecentEvent.Severity == models.EventSeverityInfo {
logrus.Infof("%s%s", logPrefix, *mostRecentEvent.Message)
} else {
logrus.Warnf("%s%s", logPrefix, *mostRecentEvent.Message)
}
}
czero.installHistory.RestAPIPreviousEventMessage = *mostRecentEvent.Message
czero.installHistory.RestAPIInfraEnvEventList = eventList
}
return false, false, nil
}
@@ -431,7 +451,7 @@ func (czero *Cluster) PrintInstallationComplete() error {
// PrintInstallStatus Print a human friendly message using the models from the Agent Rest API.
func (czero *Cluster) PrintInstallStatus(cluster *models.Cluster) error {
friendlyStatus := humanFriendlyClusterInstallStatus(*cluster.Status)
friendlyStatus := czero.humanFriendlyClusterInstallStatus(*cluster.Status)
// Don't print the same status message back to back
if *cluster.Status != czero.installHistory.RestAPIPreviousClusterStatus {
logrus.Info(friendlyStatus)
@@ -453,7 +473,7 @@ func (czero *Cluster) CanSSHToNodeZero() bool {
}
// Human friendly install status strings mapped to the Agent Rest API cluster statuses
func humanFriendlyClusterInstallStatus(status string) string {
func (czero *Cluster) humanFriendlyClusterInstallStatus(status string) string {
clusterStoppedInstallingStates := map[string]string{
models.ClusterStatusAddingHosts: "Cluster is adding hosts",
models.ClusterStatusCancelled: "Cluster installation cancelled",
@@ -466,6 +486,22 @@ func humanFriendlyClusterInstallStatus(status string) string {
models.ClusterStatusPreparingForInstallation: "Preparing cluster for installation",
models.ClusterStatusReady: "Cluster is ready for install",
}
return clusterStoppedInstallingStates[status]
hostStoppedInstallingStates := map[string]string{
models.ClusterStatusAddingHosts: "Cluster is adding host",
models.ClusterStatusCancelled: "Host installation cancelled",
models.ClusterStatusError: "Host has error(s)",
models.ClusterStatusFinalizing: "Finalizing host installation",
models.ClusterStatusInstalling: "Host installation in progress",
models.ClusterStatusInstallingPendingUserAction: "Host installation started but now requires user input",
models.ClusterStatusInsufficient: "Host is not ready for install. Check validations",
models.ClusterStatusPendingForInput: "User input is required to continue host installation",
models.ClusterStatusPreparingForInstallation: "Preparing host for installation",
models.ClusterStatusReady: "Host is ready for install",
}
switch czero.workflow {
case workflow.AgentWorkflowTypeAddNodes:
return fmt.Sprintf("Node %s: %s", czero.API.Rest.NodeZeroIP, hostStoppedInstallingStates[status])
default:
return clusterStoppedInstallingStates[status]
}
}
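
The status-reporting code moved in the long hunk above is easier to see in isolation. Below is a minimal, stand-alone Go sketch of the pattern it introduces: one piece of reporting code serves both workflows, and the add-nodes workflow switches the wording from "cluster" to "host" and prefixes log lines with the node IP. The types, constants, IP address and status table entries are simplified stand-ins, not the installer's own.

package main

import "fmt"

type agentWorkflowType string

const (
	workflowInstall  agentWorkflowType = "install"
	workflowAddNodes agentWorkflowType = "addnodes"
)

type cluster struct {
	workflow   agentWorkflowType
	nodeZeroIP string
}

// logPrefix mirrors the selection at the top of LogAssistedServiceStatus:
// the add-nodes workflow reports on a "host" and prefixes every line with
// the node's IP; the install workflow keeps the original "cluster" wording.
func (c *cluster) logPrefix() (resource, prefix string) {
	resource, prefix = "cluster", ""
	if c.workflow == workflowAddNodes {
		resource = "host"
		prefix = fmt.Sprintf("Node %s: ", c.nodeZeroIP)
	}
	return resource, prefix
}

// friendlyStatus mirrors humanFriendlyClusterInstallStatus becoming a
// method: one wording table per workflow, keyed by the same status string.
func (c *cluster) friendlyStatus(status string) string {
	clusterWording := map[string]string{"ready": "Cluster is ready for install"}
	hostWording := map[string]string{"ready": "Host is ready for install"}
	if c.workflow == workflowAddNodes {
		return fmt.Sprintf("Node %s: %s", c.nodeZeroIP, hostWording[status])
	}
	return clusterWording[status]
}

func main() {
	c := &cluster{workflow: workflowAddNodes, nodeZeroIP: "192.168.111.20"}
	resource, prefix := c.logPrefix()
	fmt.Printf("%s%s has stopped installing... working to recover installation\n", prefix, resource)
	fmt.Println(c.friendlyStatus("ready"))
}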

View File

@@ -252,6 +252,12 @@ func MonitorAddNodes(cluster *Cluster, nodeIPAddress string) error {
mon := newAddNodeMonitor(nodeIPAddress, cluster)
wait.Until(func() {
if mon.onOrAfter(AssistedServiceIsUp) &&
mon.onOrBefore(HostInstallationStarted) {
cluster.LogAssistedServiceStatus()
}
for _, checkFunc := range defaultChecks {
lastState := mon.currentState
checkFunc(mon)
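
The condition added to MonitorAddNodes logs assisted-service status only while the monitor sits between the AssistedServiceIsUp and HostInstallationStarted stages. A sketch of that gate under stated assumptions: the stage names come from the hunk, but the monitor type, stage ordering, polling interval and stop channel are stand-ins, since the hunk does not show them.

package sketch

import (
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

// stage and nodeMonitor are stand-ins for the add-node monitor's ordered
// install stages and state; the real definitions live outside this hunk.
type stage int

const (
	assistedServiceIsUp stage = iota
	hostInstallationStarted
)

type nodeMonitor struct {
	currentState stage
}

func (m *nodeMonitor) onOrAfter(s stage) bool  { return m.currentState >= s }
func (m *nodeMonitor) onOrBefore(s stage) bool { return m.currentState <= s }

// statusLogger is a stand-in for the Cluster method added in this commit.
type statusLogger interface {
	LogAssistedServiceStatus() (bool, bool, error)
}

// monitorLoop shows the shape of the gating: assisted-service status,
// validations and events are reported only while the host is between
// "assisted service up" and "installation started".
func monitorLoop(mon *nodeMonitor, cluster statusLogger, stop <-chan struct{}) {
	wait.Until(func() {
		if mon.onOrAfter(assistedServiceIsUp) && mon.onOrBefore(hostInstallationStarted) {
			cluster.LogAssistedServiceStatus() //nolint:errcheck // return values unused here, as in the diff
		}
		// ...the per-stage check functions from the original loop would run here.
	}, 5*time.Second, stop)
}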

View File

@@ -34,7 +34,7 @@ type validationResultHistory struct {
previousMessage string
}
func checkValidations(cluster *models.Cluster, validationResults *validationResults, log *logrus.Logger) error {
func checkValidations(cluster *models.Cluster, validationResults *validationResults, log *logrus.Logger, hostLogPrefix string) error {
clusterLogPrefix := "Cluster validation: "
updatedClusterValidationHistory, err := updateValidationResultHistory(clusterLogPrefix, cluster.ValidationsInfo, validationResults.ClusterValidationHistory, log)
if err != nil {
@@ -43,7 +43,9 @@ func checkValidations(cluster *models.Cluster, validationResults *validationResu
validationResults.ClusterValidationHistory = updatedClusterValidationHistory
for _, h := range cluster.Hosts {
hostLogPrefix := "Host " + h.RequestedHostname + " validation: "
if hostLogPrefix == "" {
hostLogPrefix = "Host " + h.RequestedHostname + " validation: "
}
if _, ok := validationResults.HostValidationHistory[h.RequestedHostname]; !ok {
validationResults.HostValidationHistory[h.RequestedHostname] = make(map[string]*validationResultHistory)
}
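
A small stand-alone sketch of what the new hostLogPrefix parameter does: an empty prefix (the install workflow) keeps the per-host default, while a non-empty one (the "Node <ip>: " prefix passed from LogAssistedServiceStatus in the add-nodes workflow) replaces it. The host names, IP and message text below are made up.

package main

import "fmt"

// validationPrefix mirrors the defaulting added to checkValidations.
func validationPrefix(hostLogPrefix, requestedHostname string) string {
	if hostLogPrefix == "" {
		hostLogPrefix = "Host " + requestedHostname + " validation: "
	}
	return hostLogPrefix
}

func main() {
	// Install workflow: checkValidations is called with an empty prefix.
	fmt.Println(validationPrefix("", "master-0") + "validation message goes here")
	// Add-nodes workflow: the node-IP prefix from LogAssistedServiceStatus wins.
	fmt.Println(validationPrefix("Node 192.168.111.20: ", "extra-worker-0") + "validation message goes here")
}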

View File

@@ -6,11 +6,12 @@ import (
"github.com/sirupsen/logrus"
agentpkg "github.com/openshift/installer/pkg/agent"
"github.com/openshift/installer/pkg/asset/agent/workflow"
)
// NewMonitorAddNodesCommand creates a new command for monitor add nodes.
func NewMonitorAddNodesCommand(directory, kubeconfigPath string, ips []string) error {
cluster, err := agentpkg.NewCluster(context.Background(), kubeconfigPath, ips[0], "")
cluster, err := agentpkg.NewCluster(context.Background(), kubeconfigPath, ips[0], "", workflow.AgentWorkflowTypeAddNodes)
if err != nil {
// TODO exit code enumerate
logrus.Exit(1)
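
Putting the pieces together, a sketch of how the add-nodes monitoring path is wired after this commit, compilable against the installer module. The function name and parameters here are placeholders; only calls that appear verbatim in the hunks above are used, and how the real NewMonitorAddNodesCommand handles multiple IPs and exit codes is not shown in these hunks, so this follows the single-node path only.

package sketch

import (
	"context"

	agentpkg "github.com/openshift/installer/pkg/agent"
	"github.com/openshift/installer/pkg/asset/agent/workflow"
)

// monitorAddNodesSketch strings together the call sites shown in this
// commit: NewCluster is told it is running the add-nodes workflow, and
// MonitorAddNodes then polls the node, logging assisted-service status,
// validations and events between the relevant stages.
func monitorAddNodesSketch(kubeconfigPath string, ips []string) error {
	cluster, err := agentpkg.NewCluster(context.Background(), kubeconfigPath,
		ips[0], "", workflow.AgentWorkflowTypeAddNodes)
	if err != nil {
		return err
	}
	return agentpkg.MonitorAddNodes(cluster, ips[0])
}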