diff --git a/_topic_maps/_topic_map.yml b/_topic_maps/_topic_map.yml index cfe73df11f..1aef7104fd 100644 --- a/_topic_maps/_topic_map.yml +++ b/_topic_maps/_topic_map.yml @@ -3669,6 +3669,8 @@ Topics: Topics: - Name: About disaster recovery File: about-disaster-recovery + - Name: Quorum restoration + File: quorum-restoration - Name: Restoring to a previous cluster state File: scenario-2-restoring-cluster-state - Name: Recovering from expired control plane certificates diff --git a/backup_and_restore/control_plane_backup_and_restore/disaster_recovery/about-disaster-recovery.adoc b/backup_and_restore/control_plane_backup_and_restore/disaster_recovery/about-disaster-recovery.adoc index 2813696814..326c9ecee5 100644 --- a/backup_and_restore/control_plane_backup_and_restore/disaster_recovery/about-disaster-recovery.adoc +++ b/backup_and_restore/control_plane_backup_and_restore/disaster_recovery/about-disaster-recovery.adoc @@ -17,10 +17,17 @@ state. Disaster recovery requires you to have at least one healthy control plane host. ==== +xref:../../../backup_and_restore/control_plane_backup_and_restore/disaster_recovery/quorum-restoration.adoc#dr-quorum-restoration[Quorum restoration]:: This solution handles situations where you have lost the majority of your control plane hosts, leading to etcd quorum loss and the cluster going offline. This solution does not require an etcd backup. ++ +[NOTE] +==== +If you have a majority of your control plane nodes still available and have an etcd quorum, then xref:../../../backup_and_restore/control_plane_backup_and_restore/replacing-unhealthy-etcd-member.adoc#replacing-unhealthy-etcd-member[replace a single unhealthy etcd member]. +==== + xref:../../../backup_and_restore/control_plane_backup_and_restore/disaster_recovery/scenario-2-restoring-cluster-state.adoc#dr-restoring-cluster-state[Restoring to a previous cluster state]:: This solution handles situations where you want to restore your cluster to a previous state, for example, if an administrator deletes something critical. -This also includes situations where you have lost the majority of your control plane hosts, leading to etcd quorum loss and the cluster going offline. As long as you have taken an etcd backup, you can follow this procedure to restore your cluster to a previous state. +If you have taken an etcd backup, you can restore your cluster to a previous state. + If applicable, you might also need to xref:../../../backup_and_restore/control_plane_backup_and_restore/disaster_recovery/scenario-3-expired-certs.adoc#dr-recovering-expired-certs[recover from expired control plane certificates]. + @@ -30,11 +37,6 @@ Restoring to a previous cluster state is a destructive and destablizing action t Prior to performing a restore, see xref:../../../backup_and_restore/control_plane_backup_and_restore/disaster_recovery/scenario-2-restoring-cluster-state.adoc#dr-scenario-2-restoring-cluster-state-about_dr-restoring-cluster-state[About restoring cluster state] for more information on the impact to the cluster. ==== -+ -[NOTE] -==== -If you have a majority of your masters still available and have an etcd quorum, then follow the procedure to xref:../../../backup_and_restore/control_plane_backup_and_restore/replacing-unhealthy-etcd-member.adoc#replacing-unhealthy-etcd-member[replace a single unhealthy etcd member]. 
-==== xref:../../../backup_and_restore/control_plane_backup_and_restore/disaster_recovery/scenario-3-expired-certs.adoc#dr-recovering-expired-certs[Recovering from expired control plane certificates]:: This solution handles situations where your control plane certificates have @@ -42,3 +44,9 @@ expired. For example, if you shut down your cluster before the first certificate rotation, which occurs 24 hours after installation, your certificates will not be rotated and will expire. You can follow this procedure to recover from expired control plane certificates. + +// Testing restore procedures +include::modules/dr-testing-restore-procedures.adoc[leveloffset=+1] +[role="_additional-resources"] +.Additional resources +* xref:../../../backup_and_restore/control_plane_backup_and_restore/disaster_recovery/scenario-2-restoring-cluster-state.adoc#dr-restoring-cluster-state[Restoring to a previous cluster state] \ No newline at end of file diff --git a/backup_and_restore/control_plane_backup_and_restore/disaster_recovery/quorum-restoration.adoc b/backup_and_restore/control_plane_backup_and_restore/disaster_recovery/quorum-restoration.adoc new file mode 100644 index 0000000000..2f8c6ef950 --- /dev/null +++ b/backup_and_restore/control_plane_backup_and_restore/disaster_recovery/quorum-restoration.adoc @@ -0,0 +1,12 @@ +:_mod-docs-content-type: ASSEMBLY +[id="dr-quorum-restoration"] += Quorum restoration +include::_attributes/common-attributes.adoc[] +:context: dr-quorum-restoration + +toc::[] + +You can use the `quorum-restore.sh` script to restore etcd quorum on clusters that are offline due to quorum loss. + +// Restoring etcd quorum for high availability clusters +include::modules/dr-restoring-etcd-quorum-ha.adoc[leveloffset=+1] \ No newline at end of file diff --git a/backup_and_restore/control_plane_backup_and_restore/disaster_recovery/scenario-2-restoring-cluster-state.adoc b/backup_and_restore/control_plane_backup_and_restore/disaster_recovery/scenario-2-restoring-cluster-state.adoc index 3230a8f43a..a3189163ff 100644 --- a/backup_and_restore/control_plane_backup_and_restore/disaster_recovery/scenario-2-restoring-cluster-state.adoc +++ b/backup_and_restore/control_plane_backup_and_restore/disaster_recovery/scenario-2-restoring-cluster-state.adoc @@ -11,6 +11,9 @@ To restore the cluster to a previous state, you must have previously xref:../../ // About restoring to a previous cluster state include::modules/dr-restoring-cluster-state-about.adoc[leveloffset=+1] +// Restoring to a previous cluster state for a single node +include::modules/dr-restoring-cluster-state-sno.adoc[leveloffset=+1] + // Restoring to a previous cluster state include::modules/dr-restoring-cluster-state.adoc[leveloffset=+1] @@ -23,5 +26,3 @@ include::modules/dr-restoring-cluster-state.adoc[leveloffset=+1] * xref:../../../installing/installing_bare_metal/ipi/ipi-install-expanding-the-cluster.adoc#replacing-a-bare-metal-control-plane-node_ipi-install-expanding[Replacing a bare-metal control plane node] include::modules/dr-scenario-cluster-state-issues.adoc[leveloffset=+1] - - diff --git a/modules/dr-restoring-cluster-state-about.adoc b/modules/dr-restoring-cluster-state-about.adoc index dc29df6599..0390ba4c3f 100644 --- a/modules/dr-restoring-cluster-state-about.adoc +++ b/modules/dr-restoring-cluster-state-about.adoc @@ -18,7 +18,7 @@ Restoring to a previous cluster state is a destructive and destablizing action t If you are able to retrieve data using the Kubernetes API server, then etcd is available and you should not restore 
using an etcd backup. ==== -Restoring etcd effectively takes a cluster back in time and all clients will experience a conflicting, parallel history. This can impact the behavior of watching components like kubelets, Kubernetes controller managers, persistent volume controllers, and OpenShift operators, including the network operator. +Restoring etcd effectively takes a cluster back in time and all clients will experience a conflicting, parallel history. This can impact the behavior of watching components like kubelets, Kubernetes controller managers, persistent volume controllers, and {product-title} Operators, including the network Operator. It can cause Operator churn when the content in etcd does not match the actual content on disk, causing Operators for the Kubernetes API server, Kubernetes controller manager, Kubernetes scheduler, and etcd to get stuck when files on disk conflict with content in etcd. This can require manual actions to resolve the issues. diff --git a/modules/dr-restoring-cluster-state-sno.adoc b/modules/dr-restoring-cluster-state-sno.adoc new file mode 100644 index 0000000000..ae59b40ab8 --- /dev/null +++ b/modules/dr-restoring-cluster-state-sno.adoc @@ -0,0 +1,45 @@ +// Module included in the following assemblies: +// +// * disaster_recovery/scenario-2-restoring-cluster-state.adoc + +:_mod-docs-content-type: PROCEDURE +[id="dr-restoring-cluster-state-sno_{context}"] += Restoring to a previous cluster state for a single node + +You can use a saved etcd backup to restore a previous cluster state on a single node. + +[IMPORTANT] +==== +When you restore your cluster, you must use an etcd backup that was taken from the same z-stream release. For example, an {product-title} {product-version}.2 cluster must use an etcd backup that was taken from {product-version}.2. +==== + +.Prerequisites + +* Access to the cluster as a user with the `cluster-admin` role through a certificate-based `kubeconfig` file, like the one that was used during installation. +* You have SSH access to control plane hosts. +* A backup directory containing both the etcd snapshot and the resources for the static pods, which were from the same backup. The file names in the directory must be in the following formats: `snapshot_.db` and `static_kuberesources_.tar.gz`. + +.Procedure + +. Use SSH to connect to the single node and copy the etcd backup to the `/home/core` directory by running the following command: ++ +[source,terminal] +---- +$ cp /home/core +---- + +. Run the following command on the single node to restore the cluster from a previous backup: ++ +[source,terminal] +---- +$ sudo -E /usr/local/bin/cluster-restore.sh /home/core/ +---- + +. Exit the SSH session. + +. Monitor the recovery progress of the control plane by running the following command: ++ +[source,terminal] +---- +$ oc adm wait-for-stable-cluster +---- \ No newline at end of file
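A quick sanity check before running the single-node restore can prevent a failed attempt. The following sketch is illustrative only: it assumes the backup was copied into a directory under `/home/core` (the directory name `backup` is an example) and uses shell globs for the timestamped file names that the module describes:

[source,terminal]
----
$ BACKUP_DIR=/home/core/backup <1>
$ ls -l "${BACKUP_DIR}"/snapshot_*.db "${BACKUP_DIR}"/static_kuberesources_*.tar.gz
----
<1> Example path; substitute the directory that you copied to the node.

If either file is missing, copy the backup to the node again before you run `cluster-restore.sh`.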
diff --git a/modules/dr-restoring-cluster-state.adoc b/modules/dr-restoring-cluster-state.adoc index 226d74e142..d9b869c284 100644 --- a/modules/dr-restoring-cluster-state.adoc +++ b/modules/dr-restoring-cluster-state.adoc @@ -3,6 +3,8 @@ // * disaster_recovery/scenario-2-restoring-cluster-state.adoc // * post_installation_configuration/cluster-tasks.adoc +// Contributors: The documentation for this section changed drastically for 4.18+. + // Contributors: Some changes for the `etcd` restore procedure are only valid for 4.14+. // In the 4.14+ documentation, OVN-K requires different steps because there is no centralized OVN // control plane to be converted. For more information, see PR #64939. @@ -14,23 +16,25 @@ [id="dr-scenario-2-restoring-cluster-state_{context}"] = Restoring to a previous cluster state -You can use a saved `etcd` backup to restore a previous cluster state or restore a cluster that has lost the majority of control plane hosts. +You can use a saved etcd backup to restore a previous cluster state or restore a cluster that has lost the majority of control plane hosts. + +For a three-node high availability (HA) cluster, you must shut down etcd on two of the hosts to avoid a cluster split. Quorum requires a simple majority of nodes, so the minimum number of nodes required for quorum on a three-node HA cluster is two. If you start a new cluster from the backup on your recovery host while etcd is still running on the other hosts, the other etcd members might still be able to form quorum and continue service. [NOTE] ==== -If your cluster uses a control plane machine set, see "Troubleshooting the control plane machine set" for a more simple `etcd` recovery procedure. +If your cluster uses a control plane machine set, see "Troubleshooting the control plane machine set" for a simpler etcd recovery procedure. For {product-title} on a single node, see "Restoring to a previous cluster state for a single node". ==== [IMPORTANT] ==== -When you restore your cluster, you must use an `etcd` backup that was taken from the same z-stream release. For example, an {product-title} 4.7.2 cluster must use an `etcd` backup that was taken from 4.7.2. +When you restore your cluster, you must use an etcd backup that was taken from the same z-stream release. For example, an {product-title} {product-version}.2 cluster must use an etcd backup that was taken from {product-version}.2. ==== .Prerequisites * Access to the cluster as a user with the `cluster-admin` role through a certificate-based `kubeconfig` file, like the one that was used during installation. * A healthy control plane host to use as the recovery host. -* SSH access to control plane hosts. +* You have SSH access to control plane hosts. * A backup directory containing both the `etcd` snapshot and the resources for the static pods, which were from the same backup. The file names in the directory must be in the following formats: `snapshot_.db` and `static_kuberesources_.tar.gz`. [IMPORTANT] @@ -40,7 +44,7 @@ For non-recovery control plane nodes, it is not required to establish SSH connec .Procedure -. Select a control plane host to use as the recovery host. This is the host that you will run the restore operation on. +. Select a control plane host to use as the recovery host. This is the host that you run the restore operation on. . Establish SSH connectivity to each of the control plane nodes, including the recovery host. + @@ -51,641 +55,52 @@ For non-recovery control plane nodes, it is not required to establish SSH connec If you do not complete this step, you will not be able to access the control plane hosts to complete the restore procedure, and you will be unable to recover your cluster from this state. ==== -. Copy the `etcd` backup directory to the recovery control plane host. -+ -This procedure assumes that you copied the `backup` directory containing the `etcd` snapshot and the resources for the static pods to the `/home/core/` directory of your recovery control plane host. - -. Stop the static pods on any other control plane nodes.
-+ -[NOTE] -==== -You do not need to stop the static pods on the recovery host. -==== - -.. Access a control plane host that is not the recovery host. - -.. Move the existing etcd pod file out of the kubelet manifest directory by running: +. Using SSH, connect to each control plane node and run the following command to disable etcd: + [source,terminal] ---- -$ sudo mv -v /etc/kubernetes/manifests/etcd-pod.yaml /tmp +$ sudo -E /usr/local/bin/disable-etcd.sh ---- -.. Verify that the `etcd` pods are stopped by using: +. Copy the etcd backup directory to the recovery control plane host. ++ +This procedure assumes that you copied the `backup` directory containing the etcd snapshot and the resources for the static pods to the `/home/core/` directory of your recovery control plane host. + +. Use SSH to connect to the recovery host and restore the cluster from a previous backup by running the following command: + [source,terminal] ---- -$ sudo crictl ps | grep etcd | egrep -v "operator|etcd-guard" ----- -+ -If the output of this command is not empty, wait a few minutes and check again. - -.. Move the existing `kube-apiserver` file out of the kubelet manifest directory by running: -+ -[source,terminal] ----- -$ sudo mv -v /etc/kubernetes/manifests/kube-apiserver-pod.yaml /tmp +$ sudo -E /usr/local/bin/cluster-restore.sh /home/core/ ---- -.. Verify that the `kube-apiserver` containers are stopped by running: -+ -[source,terminal] ----- -$ sudo crictl ps | grep kube-apiserver | egrep -v "operator|guard" ----- -+ -If the output of this command is not empty, wait a few minutes and check again. +. Exit the SSH session. -.. Move the existing `kube-controller-manager` file out of the kubelet manifest directory by using: -+ -[source,terminal] ----- -$ sudo mv -v /etc/kubernetes/manifests/kube-controller-manager-pod.yaml /tmp ----- - -.. Verify that the `kube-controller-manager` containers are stopped by running: -+ -[source,terminal] ----- -$ sudo crictl ps | grep kube-controller-manager | egrep -v "operator|guard" ----- -If the output of this command is not empty, wait a few minutes and check again. - -.. Move the existing `kube-scheduler` file out of the kubelet manifest directory by using: -+ -[source,terminal] ----- -$ sudo mv -v /etc/kubernetes/manifests/kube-scheduler-pod.yaml /tmp ----- - -.. Verify that the `kube-scheduler` containers are stopped by using: -+ -[source,terminal] ----- -$ sudo crictl ps | grep kube-scheduler | egrep -v "operator|guard" ----- -If the output of this command is not empty, wait a few minutes and check again. - -.. Move the `etcd` data directory to a different location with the following example: -+ -[source,terminal] ----- -$ sudo mv -v /var/lib/etcd/ /tmp ----- - -.. If the `/etc/kubernetes/manifests/keepalived.yaml` file exists and the node is deleted, follow these steps: - -... Move the `/etc/kubernetes/manifests/keepalived.yaml` file out of the kubelet manifest directory: -+ -[source,terminal] ----- -$ sudo mv -v /etc/kubernetes/manifests/keepalived.yaml /tmp ----- - -... Verify that any containers managed by the `keepalived` daemon are stopped: -+ -[source,terminal] ----- -$ sudo crictl ps --name keepalived ----- -+ -The output of this command should be empty. If it is not empty, wait a few minutes and check again. - -... Check if the control plane has any Virtual IPs (VIPs) assigned to it: -+ -[source,terminal] ----- -$ ip -o address | egrep '|' ----- - -... 
For each reported VIP, run the following command to remove it: -+ -[source,terminal] ----- -$ sudo ip address del dev ----- - -.. Repeat this step on each of the other control plane hosts that is not the recovery host. - -. Access the recovery control plane host. - -. If the `keepalived` daemon is in use, verify that the recovery control plane node owns the VIP: -+ -[source,terminal] ----- -$ ip -o address | grep ----- -+ -The address of the VIP is highlighted in the output if it exists. This command returns an empty string if the VIP is not set or configured incorrectly. - -. If the cluster-wide proxy is enabled, be sure that you have exported the `NO_PROXY`, `HTTP_PROXY`, and `HTTPS_PROXY` environment variables. -+ -[TIP] -==== -You can check whether the proxy is enabled by reviewing the output of `oc get proxy cluster -o yaml`. The proxy is enabled if the `httpProxy`, `httpsProxy`, and `noProxy` fields have values set. -==== - -. Run the restore script on the recovery control plane host and pass in the path to the `etcd` backup directory: -+ -[source,terminal] ----- -$ sudo -E /usr/local/bin/cluster-restore.sh /home/core/assets/backup ----- -+ -.Example script output -[source,terminal] ----- -...stopping kube-scheduler-pod.yaml -...stopping kube-controller-manager-pod.yaml -...stopping etcd-pod.yaml -...stopping kube-apiserver-pod.yaml -Waiting for container etcd to stop -.complete -Waiting for container etcdctl to stop -.............................complete -Waiting for container etcd-metrics to stop -complete -Waiting for container kube-controller-manager to stop -complete -Waiting for container kube-apiserver to stop -..........................................................................................complete -Waiting for container kube-scheduler to stop -complete -Moving etcd data-dir /var/lib/etcd/member to /var/lib/etcd-backup -starting restore-etcd static pod -starting kube-apiserver-pod.yaml -static-pod-resources/kube-apiserver-pod-7/kube-apiserver-pod.yaml -starting kube-controller-manager-pod.yaml -static-pod-resources/kube-controller-manager-pod-7/kube-controller-manager-pod.yaml -starting kube-scheduler-pod.yaml -static-pod-resources/kube-scheduler-pod-8/kube-scheduler-pod.yaml ----- -+ - -The cluster-restore.sh script must show that `etcd`, `kube-apiserver`, `kube-controller-manager`, and `kube-scheduler` pods are stopped and then started at the end of the restore process. -+ -[NOTE] -==== -The restore process can cause nodes to enter the `NotReady` state if the node certificates were updated after the last `etcd` backup. -==== - -. Check the nodes to ensure they are in the `Ready` state. -.. Run the following command: -+ -[source,terminal] ----- -$ oc get nodes -w ----- -+ -.Sample output -[source,terminal] ----- -NAME STATUS ROLES AGE VERSION -host-172-25-75-28 Ready master 3d20h v1.31.3 -host-172-25-75-38 Ready infra,worker 3d20h v1.31.3 -host-172-25-75-40 Ready master 3d20h v1.31.3 -host-172-25-75-65 Ready master 3d20h v1.31.3 -host-172-25-75-74 Ready infra,worker 3d20h v1.31.3 -host-172-25-75-79 Ready worker 3d20h v1.31.3 -host-172-25-75-86 Ready worker 3d20h v1.31.3 -host-172-25-75-98 Ready infra,worker 3d20h v1.31.3 ----- -+ -It can take several minutes for all nodes to report their state. - -.. If any nodes are in the `NotReady` state, log in to the nodes and remove all of the PEM files from the `/var/lib/kubelet/pki` directory on each node. You can SSH into the nodes or use the terminal window in the web console. 
-+ -[source,terminal] ----- -$ ssh -i core@ ----- -+ -.Sample `pki` directory -[source,terminal] ----- -sh-4.4# pwd -/var/lib/kubelet/pki -sh-4.4# ls -kubelet-client-2022-04-28-11-24-09.pem kubelet-server-2022-04-28-11-24-15.pem -kubelet-client-current.pem kubelet-server-current.pem ----- - -. Restart the kubelet service on all control plane hosts. - -.. From the recovery host, run: -+ -[source,terminal] ----- -$ sudo systemctl restart kubelet.service ----- - -.. Repeat this step on all other control plane hosts. - -. Approve the pending Certificate Signing Requests (CSRs): -+ -[NOTE] -==== -Clusters with no worker nodes, such as single-node clusters or clusters consisting of three schedulable control plane nodes, will not have any pending CSRs to approve. You can skip all the commands listed in this step. -==== - -.. Get the list of current CSRs by running: -+ -[source,terminal] ----- -$ oc get csr ----- -+ -.Example output ----- -NAME AGE SIGNERNAME REQUESTOR CONDITION -csr-2s94x 8m3s kubernetes.io/kubelet-serving system:node: Pending <1> -csr-4bd6t 8m3s kubernetes.io/kubelet-serving system:node: Pending <1> -csr-4hl85 13m kubernetes.io/kube-apiserver-client-kubelet system:serviceaccount:openshift-machine-config-operator:node-bootstrapper Pending <2> -csr-zhhhp 3m8s kubernetes.io/kube-apiserver-client-kubelet system:serviceaccount:openshift-machine-config-operator:node-bootstrapper Pending <2> -... ----- -<1> A pending kubelet serving CSR, requested by the node for the kubelet serving endpoint. -<2> A pending kubelet client CSR, requested with the `node-bootstrapper` node bootstrap credentials. - -.. Review the details of a CSR to verify that it is valid by running: -+ -[source,terminal] ----- -$ oc describe csr <1> ----- -<1> `` is the name of a CSR from the list of current CSRs. - -.. Approve each valid `node-bootstrapper` CSR by running: -+ -[source,terminal] ----- -$ oc adm certificate approve ----- - -.. For user-provisioned installations, approve each valid kubelet service CSR by running: -+ -[source,terminal] ----- -$ oc adm certificate approve ----- - -. Verify that the single member control plane has started successfully. - -.. From the recovery host, verify that the `etcd` container is running by using: -+ -[source,terminal] ----- -$ sudo crictl ps | grep etcd | egrep -v "operator|etcd-guard" ----- -+ -.Example output -[source,terminal] ----- -3ad41b7908e32 36f86e2eeaaffe662df0d21041eb22b8198e0e58abeeae8c743c3e6e977e8009 About a minute ago Running etcd 0 7c05f8af362f0 ----- - -.. From the recovery host, verify that the `etcd` pod is running by using: -+ -[source,terminal] ----- -$ oc -n openshift-etcd get pods -l k8s-app=etcd ----- -+ -.Example output -[source,terminal] ----- -NAME READY STATUS RESTARTS AGE -etcd-ip-10-0-143-125.ec2.internal 1/1 Running 1 2m47s ----- -+ -If the status is `Pending`, or the output lists more than one running `etcd` pod, wait a few minutes and check again. - -. If you are using the `OVNKubernetes` network plugin, you must restart `ovnkube-controlplane` pods. -.. Delete all of the `ovnkube-controlplane` pods by running: -+ -[source,terminal] ----- -$ oc -n openshift-ovn-kubernetes delete pod -l app=ovnkube-control-plane ----- -.. Verify that all of the `ovnkube-controlplane` pods were redeployed by using: -+ -[source,terminal] ----- -$ oc -n openshift-ovn-kubernetes get pod -l app=ovnkube-control-plane ----- - -. If you are using the OVN-Kubernetes network plugin, restart the Open Virtual Network (OVN) Kubernetes pods on all the nodes one by one. 
Use the following steps to restart OVN-Kubernetes pods on each node: -+ -[IMPORTANT] -==== -.Restart OVN-Kubernetes pods in the following order -. The recovery control plane host -. The other control plane hosts (if available) -. The other nodes -==== -+ -[NOTE] -==== -Validating and mutating admission webhooks can reject pods. If you add any additional webhooks with the `failurePolicy` set to `Fail`, then they can reject pods and the restoration process can fail. You can avoid this by saving and deleting webhooks while restoring the cluster state. After the cluster state is restored successfully, you can enable the webhooks again. - -Alternatively, you can temporarily set the `failurePolicy` to `Ignore` while restoring the cluster state. After the cluster state is restored successfully, you can set the `failurePolicy` to `Fail`. -==== - -.. Remove the northbound database (nbdb) and southbound database (sbdb). Access the recovery host and the remaining control plane nodes by using Secure Shell (SSH) and run: -+ -[source,terminal] ----- -$ sudo rm -f /var/lib/ovn-ic/etc/*.db ----- - -.. Restart the OpenVSwitch services. Access the node by using Secure Shell (SSH) and run the following command: -+ -[source,terminal] ----- -$ sudo systemctl restart ovs-vswitchd ovsdb-server ----- - -.. Delete the `ovnkube-node` pod on the node by running the following command, replacing `` with the name of the node that you are restarting: -+ -[source,terminal] ----- -$ oc -n openshift-ovn-kubernetes delete pod -l app=ovnkube-node --field-selector=spec.nodeName== ----- -+ - -.. Verify that the `ovnkube-node` pod is running again with: -+ -[source,terminal] ----- -$ oc -n openshift-ovn-kubernetes get pod -l app=ovnkube-node --field-selector=spec.nodeName== ----- -+ -[NOTE] -==== -It might take several minutes for the pods to restart. -==== - -. Delete and re-create other non-recovery, control plane machines, one by one. After the machines are re-created, a new revision is forced and `etcd` automatically scales up. -+ -** If you use a user-provisioned bare metal installation, you can re-create a control plane machine by using the same method that you used to originally create it. For more information, see "Installing a user-provisioned cluster on bare metal". -+ -[WARNING] -==== -Do not delete and re-create the machine for the recovery host. -==== -+ -** If you are running installer-provisioned infrastructure, or you used the Machine API to create your machines, follow these steps: -+ -[WARNING] -==== -Do not delete and re-create the machine for the recovery host. - -For bare metal installations on installer-provisioned infrastructure, control plane machines are not re-created. For more information, see "Replacing a bare-metal control plane node". -==== -.. Obtain the machine for one of the lost control plane hosts. 
-+ -In a terminal that has access to the cluster as a cluster-admin user, run the following command: -+ -[source,terminal] ----- -$ oc get machines -n openshift-machine-api -o wide ----- -+ -Example output: -+ -[source,terminal] ----- -NAME PHASE TYPE REGION ZONE AGE NODE PROVIDERID STATE -clustername-8qw5l-master-0 Running m4.xlarge us-east-1 us-east-1a 3h37m ip-10-0-131-183.ec2.internal aws:///us-east-1a/i-0ec2782f8287dfb7e stopped <1> -clustername-8qw5l-master-1 Running m4.xlarge us-east-1 us-east-1b 3h37m ip-10-0-143-125.ec2.internal aws:///us-east-1b/i-096c349b700a19631 running -clustername-8qw5l-master-2 Running m4.xlarge us-east-1 us-east-1c 3h37m ip-10-0-154-194.ec2.internal aws:///us-east-1c/i-02626f1dba9ed5bba running -clustername-8qw5l-worker-us-east-1a-wbtgd Running m4.large us-east-1 us-east-1a 3h28m ip-10-0-129-226.ec2.internal aws:///us-east-1a/i-010ef6279b4662ced running -clustername-8qw5l-worker-us-east-1b-lrdxb Running m4.large us-east-1 us-east-1b 3h28m ip-10-0-144-248.ec2.internal aws:///us-east-1b/i-0cb45ac45a166173b running -clustername-8qw5l-worker-us-east-1c-pkg26 Running m4.large us-east-1 us-east-1c 3h28m ip-10-0-170-181.ec2.internal aws:///us-east-1c/i-06861c00007751b0a running ----- -<1> This is the control plane machine for the lost control plane host, `ip-10-0-131-183.ec2.internal`. - -.. Delete the machine of the lost control plane host by running: -+ -[source,terminal] ----- -$ oc delete machine -n openshift-machine-api clustername-8qw5l-master-0 <1> ----- -<1> Specify the name of the control plane machine for the lost control plane host. -+ -A new machine is automatically provisioned after deleting the machine of the lost control plane host. - -.. Verify that a new machine has been created by running: -+ -[source,terminal] ----- -$ oc get machines -n openshift-machine-api -o wide ----- -+ -Example output: -+ -[source,terminal] ----- -NAME PHASE TYPE REGION ZONE AGE NODE PROVIDERID STATE -clustername-8qw5l-master-1 Running m4.xlarge us-east-1 us-east-1b 3h37m ip-10-0-143-125.ec2.internal aws:///us-east-1b/i-096c349b700a19631 running -clustername-8qw5l-master-2 Running m4.xlarge us-east-1 us-east-1c 3h37m ip-10-0-154-194.ec2.internal aws:///us-east-1c/i-02626f1dba9ed5bba running -clustername-8qw5l-master-3 Provisioning m4.xlarge us-east-1 us-east-1a 85s ip-10-0-173-171.ec2.internal aws:///us-east-1a/i-015b0888fe17bc2c8 running <1> -clustername-8qw5l-worker-us-east-1a-wbtgd Running m4.large us-east-1 us-east-1a 3h28m ip-10-0-129-226.ec2.internal aws:///us-east-1a/i-010ef6279b4662ced running -clustername-8qw5l-worker-us-east-1b-lrdxb Running m4.large us-east-1 us-east-1b 3h28m ip-10-0-144-248.ec2.internal aws:///us-east-1b/i-0cb45ac45a166173b running -clustername-8qw5l-worker-us-east-1c-pkg26 Running m4.large us-east-1 us-east-1c 3h28m ip-10-0-170-181.ec2.internal aws:///us-east-1c/i-06861c00007751b0a running ----- -<1> The new machine, `clustername-8qw5l-master-3` is being created and is ready after the phase changes from `Provisioning` to `Running`. -+ -It might take a few minutes for the new machine to be created. The `etcd` cluster Operator will automatically sync when the machine or node returns to a healthy state. - -.. Repeat these steps for each lost control plane host that is not the recovery host. - -. Turn off the quorum guard by entering: +. 
After the API responds, turn off the etcd Operator quorum guard by running the following command: + [source,terminal] ---- $ oc patch etcd/cluster --type=merge -p '{"spec": {"unsupportedConfigOverrides": {"useUnsupportedUnsafeNonHANonProductionUnstableEtcd": true}}}' ---- -+ -This command ensures that you can successfully re-create secrets and roll out the static pods. -. In a separate terminal window within the recovery host, export the recovery `kubeconfig` file by running: +. Monitor the recovery progress of the control plane by running the following command: + [source,terminal] ---- -$ export KUBECONFIG=/etc/kubernetes/static-pod-resources/kube-apiserver-certs/secrets/node-kubeconfigs/localhost-recovery.kubeconfig +$ oc adm wait-for-stable-cluster ---- -. Force `etcd` redeployment. -+ -In the same terminal window where you exported the recovery `kubeconfig` file, run: -+ -[source,terminal] ---- -$ oc patch etcd cluster -p='{"spec": {"forceRedeploymentReason": "recovery-'"$( date --rfc-3339=ns )"'"}}' --type=merge <1> ---- -<1> The `forceRedeploymentReason` value must be unique, which is why a timestamp is appended. -+ -The `etcd` redeployment starts. -+ -When the `etcd` cluster Operator performs a redeployment, the existing nodes are started with new pods similar to the initial bootstrap scale up. - -. Turn the quorum guard back on by entering: +. After the cluster recovers, enable the quorum guard by running the following command: + [source,terminal] ---- $ oc patch etcd/cluster --type=merge -p '{"spec": {"unsupportedConfigOverrides": null}}' ---- -. You can verify that the `unsupportedConfigOverrides` section is removed from the object by running: -+ -[source,terminal] ---- -$ oc get etcd/cluster -oyaml ---- +.Troubleshooting -. Verify all nodes are updated to the latest revision. -+ -In a terminal that has access to the cluster as a `cluster-admin` user, run: -+ -[source,terminal] ---- -$ oc get etcd -o=jsonpath='{range .items[0].status.conditions[?(@.type=="NodeInstallerProgressing")]}{.reason}{"\n"}{.message}{"\n"}' ---- -+ -Review the `NodeInstallerProgressing` status condition for `etcd` to verify that all nodes are at the latest revision. The output shows `AllNodesAtLatestRevision` upon successful update: -+ -[source,terminal] ---- -AllNodesAtLatestRevision -3 nodes are at revision 7 <1> ---- -<1> In this example, the latest revision number is `7`. -+ -If the output includes multiple revision numbers, such as `2 nodes are at revision 6; 1 nodes are at revision 7`, this means that the update is still in progress. Wait a few minutes and try again. - -. After `etcd` is redeployed, force new rollouts for the control plane. `kube-apiserver` will reinstall itself on the other nodes because the kubelet is connected to API servers using an internal load balancer. -+ -In a terminal that has access to the cluster as a `cluster-admin` user, run: - -.. Force a new rollout for `kube-apiserver`: -+ -[source,terminal] ---- -$ oc patch kubeapiserver cluster -p='{"spec": {"forceRedeploymentReason": "recovery-'"$( date --rfc-3339=ns )"'"}}' --type=merge ---- -+ -Verify all nodes are updated to the latest revision. -+ -[source,terminal] ---- -$ oc get kubeapiserver -o=jsonpath='{range .items[0].status.conditions[?(@.type=="NodeInstallerProgressing")]}{.reason}{"\n"}{.message}{"\n"}' ---- -+ -Review the `NodeInstallerProgressing` status condition to verify that all nodes are at the latest revision.
The output shows `AllNodesAtLatestRevision` upon successful update: -+ -[source,terminal] ----- -AllNodesAtLatestRevision -3 nodes are at revision 7 <1> ----- -<1> In this example, the latest revision number is `7`. -+ -If the output includes multiple revision numbers, such as `2 nodes are at revision 6; 1 nodes are at revision 7`, this means that the update is still in progress. Wait a few minutes and try again. - -.. Force a new rollout for the Kubernetes controller manager by running the following command: -+ -[source,terminal] ----- -$ oc patch kubecontrollermanager cluster -p='{"spec": {"forceRedeploymentReason": "recovery-'"$( date --rfc-3339=ns )"'"}}' --type=merge ----- -+ -Verify all nodes are updated to the latest revision by running: -+ -[source,terminal] ----- -$ oc get kubecontrollermanager -o=jsonpath='{range .items[0].status.conditions[?(@.type=="NodeInstallerProgressing")]}{.reason}{"\n"}{.message}{"\n"}' ----- -+ -Review the `NodeInstallerProgressing` status condition to verify that all nodes are at the latest revision. The output shows `AllNodesAtLatestRevision` upon successful update: -+ -[source,terminal] ----- -AllNodesAtLatestRevision -3 nodes are at revision 7 <1> ----- -<1> In this example, the latest revision number is `7`. -+ -If the output includes multiple revision numbers, such as `2 nodes are at revision 6; 1 nodes are at revision 7`, this means that the update is still in progress. Wait a few minutes and try again. - -.. Force a new rollout for the `kube-scheduler` by running: -+ -[source,terminal] ----- -$ oc patch kubescheduler cluster -p='{"spec": {"forceRedeploymentReason": "recovery-'"$( date --rfc-3339=ns )"'"}}' --type=merge ----- -+ -Verify all nodes are updated to the latest revision by using: -+ -[source,terminal] ----- -$ oc get kubescheduler -o=jsonpath='{range .items[0].status.conditions[?(@.type=="NodeInstallerProgressing")]}{.reason}{"\n"}{.message}{"\n"}' ----- -+ -Review the `NodeInstallerProgressing` status condition to verify that all nodes are at the latest revision. The output shows `AllNodesAtLatestRevision` upon successful update: -+ -[source,terminal] ----- -AllNodesAtLatestRevision -3 nodes are at revision 7 <1> ----- -<1> In this example, the latest revision number is `7`. -+ -If the output includes multiple revision numbers, such as `2 nodes are at revision 6; 1 nodes are at revision 7`, this means that the update is still in progress. Wait a few minutes and try again. - -. Monitor the platform Operators by running: -+ -[source,terminal] ----- -$ oc adm wait-for-stable-cluster ----- -+ -This process can take up to 15 minutes. - -. Verify that all control plane hosts have started and joined the cluster. -+ -In a terminal that has access to the cluster as a `cluster-admin` user, run the following command: -+ -[source,terminal] ----- -$ oc -n openshift-etcd get pods -l k8s-app=etcd ----- -+ -.Example output -[source,terminal] ----- -etcd-ip-10-0-143-125.ec2.internal 2/2 Running 0 9h -etcd-ip-10-0-154-194.ec2.internal 2/2 Running 0 9h -etcd-ip-10-0-173-171.ec2.internal 2/2 Running 0 9h ----- - -To ensure that all workloads return to normal operation following a recovery procedure, restart all control plane nodes. - -[NOTE] -==== -On completion of the previous procedural steps, you might need to wait a few minutes for all services to return to their restored state. For example, authentication by using `oc login` might not immediately work until the OAuth server pods are restarted. 
- -Consider using the `system:admin` `kubeconfig` file for immediate authentication. This method basis its authentication on SSL/TLS client certificates as against OAuth tokens. You can authenticate with this file by issuing the following command: +If you see no progress rolling out the etcd static pods, you can force redeployment from the `cluster-etcd-operator` pod by running the following command: [source,terminal] ---- -$ export KUBECONFIG=/auth/kubeconfig ---- - -Issue the following command to display your authenticated user name: - -[source,terminal] ---- -$ oc whoami ---- -==== +$ oc patch etcd cluster -p='{"spec": {"forceRedeploymentReason": "recovery-'"$(date --rfc-3339=ns )"'"}}' --type=merge +---- \ No newline at end of file diff --git a/modules/dr-restoring-etcd-quorum-ha.adoc b/modules/dr-restoring-etcd-quorum-ha.adoc new file mode 100644 index 0000000000..89e17e6ff5 --- /dev/null +++ b/modules/dr-restoring-etcd-quorum-ha.adoc @@ -0,0 +1,47 @@ +// Module included in the following assemblies: +// +// * disaster_recovery/quorum-restoration.adoc + +:_mod-docs-content-type: PROCEDURE +[id="dr-restoring-etcd-quorum-ha_{context}"] += Restoring etcd quorum for high availability clusters + +You can use the `quorum-restore.sh` script to instantly bring up a new single-member etcd cluster based on the local data directory of the recovery host, marking all other members as invalid by retiring the previous cluster identifier. No prior backup is required to restore the control plane. + +[WARNING] +==== +You might experience data loss if the host that runs the restoration does not have all data replicated to it. +==== + +.Prerequisites + +* You have SSH access to the node used to restore quorum. + +.Procedure + +. Select a control plane host to use as the recovery host. You run the restore operation on this host. + +. Using SSH, connect to the chosen recovery node and run the following command to restore etcd quorum: ++ +[source,terminal] +---- +$ sudo -E /usr/local/bin/quorum-restore.sh +---- + +. Exit the SSH session. + +. Wait until the control plane recovers by running the following command: ++ +[source,terminal] +---- +$ oc adm wait-for-stable-cluster +---- + +.Troubleshooting + +If you see no progress rolling out the etcd static pods, you can force redeployment from the `cluster-etcd-operator` pod by running the following command: + +[source,terminal] +---- +$ oc patch etcd cluster -p='{"spec": {"forceRedeploymentReason": "recovery-'"$(date --rfc-3339=ns )"'"}}' --type=merge +---- \ No newline at end of file
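As a rough post-restore verification sketch for the quorum-restoration module above, assuming the default `openshift-etcd` namespace and a placeholder pod name, the member list should show a single voting member immediately after `quorum-restore.sh` completes:

[source,terminal]
----
$ oc -n openshift-etcd get pods -l k8s-app=etcd
$ oc -n openshift-etcd rsh <etcd_pod_name> <1>
$ etcdctl member list -w table <2>
----
<1> `<etcd_pod_name>` is a placeholder for one of the pod names returned by the previous command.
<2> Run this command inside the shell that `oc rsh` opens.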
diff --git a/modules/dr-testing-restore-procedures.adoc b/modules/dr-testing-restore-procedures.adoc new file mode 100644 index 0000000000..c93a101ace --- /dev/null +++ b/modules/dr-testing-restore-procedures.adoc @@ -0,0 +1,78 @@ +// Module included in the following assemblies: +// +// * disaster_recovery/about-disaster-recovery.adoc + +:_mod-docs-content-type: PROCEDURE +[id="dr-testing-restore-procedures_{context}"] += Testing restore procedures + +Testing the restore procedure is important to ensure that your automation and workloads handle the new cluster state gracefully. Because of the complex nature of etcd quorum, and because the etcd Operator attempts to mend the cluster automatically, it is often difficult to bring your cluster into a sufficiently broken state to exercise a restore. + +[WARNING] +==== +You *must* have SSH access to the cluster. Without SSH access, you might lose your cluster entirely. +==== + +.Prerequisites + +* You have SSH access to control plane hosts. +* You have installed the {oc-first}. + +.Procedure + +. Use SSH to connect to each of your nonrecovery nodes and run the following commands to disable etcd and the `kubelet` service: + +.. Disable etcd by running the following command: ++ +[source,terminal] +---- +$ sudo /usr/local/bin/disable-etcd.sh +---- + +.. Delete variable data for etcd by running the following command: ++ +[source,terminal] +---- +$ sudo rm -rf /var/lib/etcd +---- + +.. Disable the `kubelet` service by running the following command: ++ +[source,terminal] +---- +$ sudo systemctl disable kubelet.service +---- + +. Exit every SSH session. + +. Run the following command to ensure that your nonrecovery nodes are in the `NotReady` state: ++ +[source,terminal] +---- +$ oc get nodes +---- + +. Follow the steps in "Restoring to a previous cluster state" to restore your cluster. + +. After you restore the cluster and the API responds, use SSH to connect to each nonrecovery node and enable the `kubelet` service: ++ +[source,terminal] +---- +$ sudo systemctl enable kubelet.service +---- + +. Exit every SSH session. + +. Run the following command to observe your nodes coming back into the `Ready` state: ++ +[source,terminal] +---- +$ oc get nodes +---- + +. Run the following command to verify that etcd is available: ++ +[source,terminal] +---- +$ oc get pods -n openshift-etcd +---- \ No newline at end of file
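Beyond the etcd pod check in the final step above, it can also help to confirm that the rest of the control plane has settled after the test restore. A minimal sketch, assuming standard `oc` tooling and an illustrative 30-minute timeout:

[source,terminal]
----
$ oc wait node --all --for=condition=Ready --timeout=30m
$ oc get clusteroperators
----

All nodes should report `Ready`, and no cluster Operator should remain `Degraded` or stuck in `Progressing`.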