From ff50a48c669cca87ec3793c3695b5a7598c7d45e Mon Sep 17 00:00:00 2001 From: xenolinux Date: Sat, 18 May 2024 00:21:36 +0530 Subject: [PATCH] OSDOCS#10478: Add disaster recovery using the OADP Operator --- _topic_maps/_topic_map.yml | 4 + .../hcp_high_availability/about-hcp-ha.adoc | 15 +++ .../hcp-disaster-recovery-oadp.adoc | 85 ++++++++++++ modules/hcp-dr-oadp-backup-cp-workload.adoc | 124 ++++++++++++++++++ modules/hcp-dr-oadp-observe-velero.adoc | 34 +++++ modules/hcp-dr-oadp-observe.adoc | 39 ++++++ modules/hcp-dr-oadp-restore.adoc | 115 ++++++++++++++++ modules/hcp-mgmt-component-loss-impact.adoc | 33 +++++ 8 files changed, 449 insertions(+) create mode 100644 hosted_control_planes/hcp_high_availability/about-hcp-ha.adoc create mode 100644 hosted_control_planes/hcp_high_availability/hcp-disaster-recovery-oadp.adoc create mode 100644 modules/hcp-dr-oadp-backup-cp-workload.adoc create mode 100644 modules/hcp-dr-oadp-observe-velero.adoc create mode 100644 modules/hcp-dr-oadp-observe.adoc create mode 100644 modules/hcp-dr-oadp-restore.adoc create mode 100644 modules/hcp-mgmt-component-loss-impact.adoc diff --git a/_topic_maps/_topic_map.yml b/_topic_maps/_topic_map.yml index d6e5411eeb..aadf61fd9c 100644 --- a/_topic_maps/_topic_map.yml +++ b/_topic_maps/_topic_map.yml @@ -2403,6 +2403,8 @@ Topics: - Name: High availability for hosted control planes Dir: hcp_high_availability Topics: + - Name: About high availability for hosted control planes + File: about-hcp-ha - Name: Recovering a failing etcd cluster File: hcp-recovering-etcd-cluster - Name: Backing up and restoring etcd in an on-premise environment @@ -2411,6 +2413,8 @@ Topics: File: hcp-backup-restore-aws - Name: Disaster recovery for a hosted cluster in AWS File: hcp-disaster-recovery-aws + - Name: Disaster recovery for a hosted cluster by using OADP + File: hcp-disaster-recovery-oadp - Name: Troubleshooting hosted control planes File: hcp-troubleshooting --- diff --git 
a/hosted_control_planes/hcp_high_availability/about-hcp-ha.adoc b/hosted_control_planes/hcp_high_availability/about-hcp-ha.adoc new file mode 100644 index 0000000000..e304b9444c --- /dev/null +++ b/hosted_control_planes/hcp_high_availability/about-hcp-ha.adoc @@ -0,0 +1,15 @@ +:_mod-docs-content-type: ASSEMBLY +[id="about-hcp-ha"] += About high availability for hosted control planes +include::_attributes/common-attributes.adoc[] +:context: about-hcp-ha + +toc::[] + +You can maintain high availability (HA) of hosted control planes by implementing the following actions: + +* Recover etcd members for a hosted cluster. +* Back up and restore etcd for a hosted cluster. +* Perform a disaster recovery process for a hosted cluster. + +include::modules/hcp-mgmt-component-loss-impact.adoc[leveloffset=+1] diff --git a/hosted_control_planes/hcp_high_availability/hcp-disaster-recovery-oadp.adoc b/hosted_control_planes/hcp_high_availability/hcp-disaster-recovery-oadp.adoc new file mode 100644 index 0000000000..15616ff3cd --- /dev/null +++ b/hosted_control_planes/hcp_high_availability/hcp-disaster-recovery-oadp.adoc @@ -0,0 +1,85 @@ +:_mod-docs-content-type: ASSEMBLY +[id="hcp-disaster-recovery-oadp"] += Disaster recovery for a hosted cluster by using {oadp-short} +include::_attributes/common-attributes.adoc[] +:context: hcp-disaster-recovery-oadp + +toc::[] + +You can use the {oadp-first} Operator to perform disaster recovery on {aws-first} and bare metal. + +The disaster recovery process with {oadp-first} involves the following steps: + +. Preparing your platform, such as {aws-full} or bare metal, to use {oadp-short} +. Backing up the data plane workload +. Backing up the control plane workload +. 
Restoring a hosted cluster by using {oadp-short} + +[id="prerequisites_{context}"] +== Prerequisites + +You must meet the following prerequisites on the management cluster: + +* You xref:../../backup_and_restore/application_backup_and_restore/installing/oadp-installing-operator.adoc#oadp-installing-operator[installed the {oadp-short} Operator]. +* You created a storage class. +* You have access to the cluster with `cluster-admin` privileges. +* You have access to the {oadp-short} subscription through a catalog source. +* You have access to a cloud storage provider that is compatible with {oadp-short}, such as S3, {azure-full}, {gcp-full}, or MinIO. +* In a disconnected environment, you have access to a self-hosted storage provider, for example link:https://docs.redhat.com/en/documentation/red_hat_openshift_data_foundation/[{odf-full}] or link:https://min.io/[MinIO], that is compatible with {oadp-short}. +* Your hosted control planes pods are up and running. + +[id="prepare-aws-oadp_{context}"] +== Preparing {aws-short} to use {oadp-short} + +To perform disaster recovery for a hosted cluster, you can use {oadp-first} on {aws-first} S3 compatible storage. After creating the `DataProtectionApplication` object, new `velero` deployment and `node-agent` pods are created in the `openshift-adp` namespace. + +To prepare {aws-short} to use {oadp-short}, see "Configuring the {oadp-full} with Multicloud Object Gateway". + +[role="_additional-resources"] +.Additional resources + +* xref:../../backup_and_restore/application_backup_and_restore/installing/installing-oadp-aws.adoc#installing-oadp-aws[Configuring the {oadp-full} with Multicloud Object Gateway] + +.Next steps + +* Backing up the data plane workload +* Backing up the control plane workload + +[id="prepare-bm-dr-oadp_{context}"] +== Preparing bare metal to use {oadp-short} + +To perform disaster recovery for a hosted cluster, you can use {oadp-first} on bare metal. 
After creating the `DataProtectionApplication` object, new `velero` deployment and `node-agent` pods are created in the `openshift-adp` namespace. + +To prepare bare metal to use {oadp-short}, see "Configuring the {oadp-full} with AWS S3 compatible storage". + +[role="_additional-resources"] +.Additional resources + +* xref:../../backup_and_restore/application_backup_and_restore/installing/installing-oadp-mcg.adoc#installing-oadp-mcg[Configuring the {oadp-full} with AWS S3 compatible storage] + +.Next steps + +* Backing up the data plane workload +* Backing up the control plane workload + +[id="backing-up-data-plane-oadp_{context}"] +== Backing up the data plane workload + +If the data plane workload is not important, you can skip this procedure. To back up the data plane workload by using the {oadp-short} Operator, see "Backing up applications". + +[role="_additional-resources"] +.Additional resources + +* xref:../../backup_and_restore/application_backup_and_restore/backing_up_and_restoring/backing-up-applications.adoc#backing-up-applications[Backing up applications] + +.Next steps + +* Restoring a hosted cluster by using {oadp-short} + +include::modules/hcp-dr-oadp-backup-cp-workload.adoc[leveloffset=+1] + +include::modules/hcp-dr-oadp-restore.adoc[leveloffset=+1] + +include::modules/hcp-dr-oadp-observe.adoc[leveloffset=+1] + +include::modules/hcp-dr-oadp-observe-velero.adoc[leveloffset=+1] diff --git a/modules/hcp-dr-oadp-backup-cp-workload.adoc b/modules/hcp-dr-oadp-backup-cp-workload.adoc new file mode 100644 index 0000000000..f17ab43afd --- /dev/null +++ b/modules/hcp-dr-oadp-backup-cp-workload.adoc @@ -0,0 +1,124 @@ +// Module included in the following assemblies: +// +// * hosted_control_planes/hcp-disaster-recovery-oadp.adoc + +:_mod-docs-content-type: REFERENCE +[id="hcp-dr-oadp-backup-cp-workload_{context}"] += Backing up the control plane workload + +You can back up the control plane workload by creating the `Backup` custom resource (CR). 
+ +To monitor and observe the backup process, see "Observing the backup and restore process". + +.Procedure + +. Scale down the `NodePool` replicas to `0` by running the following command: ++ +[source,terminal] +---- +$ oc --kubeconfig \ + scale nodepool -n \ + --replicas 0 +---- + +. Pause the reconciliation of the `HostedCluster` resource by running the following command: ++ +[source,terminal] +---- +$ oc --kubeconfig \ + patch hostedcluster -n \ + --type json -p '[{"op": "add", "path": "/spec/pausedUntil", "value": "true"}]' +---- + +. Pause the reconciliation of the `NodePool` resource by running the following command: ++ +[source,terminal] +---- +$ oc --kubeconfig \ + patch nodepool -n \ + --type json -p '[{"op": "add", "path": "/spec/pausedUntil", "value": "true"}]' +---- + +. Create a YAML file that defines the `Backup` CR: ++ +.Example `backup-control-plane.yaml` file +[%collapsible] +==== +[source,yaml] +---- +apiVersion: velero.io/v1 +kind: Backup +metadata: + name: <1> + namespace: openshift-adp + labels: + velero.io/storage-location: default +spec: + hooks: {} + includedNamespaces: <2> + - <3> + - <4> + includedResources: + - sa + - role + - rolebinding + - pod + - pvc + - pv + - bmh + - configmap + - infraenv <5> + - priorityclasses + - pdb + - agents + - hostedcluster + - nodepool + - secrets + - hostedcontrolplane + - cluster + - agentcluster + - agentmachinetemplate + - agentmachine + - machinedeployment + - machineset + - machine + excludedResources: [] + storageLocation: default + ttl: 2h0m0s + snapshotMoveData: true <6> + datamover: "velero" <6> + defaultVolumesToFsBackup: true <7> +---- +==== +<1> Replace `backup_resource_name` with the name of your `Backup` resource. +<2> Selects specific namespaces to back up objects from them. You must include your hosted cluster namespace and the hosted control plane namespace. +<3> Replace `` with the name of the hosted cluster namespace, for example, `clusters`. 
+<4> Replace `<hosted_control_plane_namespace>` with the name of the hosted control plane namespace, for example, `clusters-hosted`. +<5> You must create the `infraenv` resource in a separate namespace. Do not delete the `infraenv` resource during the backup process. +<6> Enables the CSI volume snapshots and uploads the control plane workload automatically to the cloud storage. +<7> Sets the `fs-backup` backing up method for persistent volumes (PVs) as default. This setting is useful when you use a combination of Container Storage Interface (CSI) volume snapshots and the `fs-backup` method. ++ +[NOTE] +==== +If you want to use CSI volume snapshots, you must add the `backup.velero.io/backup-volumes-excludes=<pv_name>` annotation to your PVs. +==== + +. Apply the `Backup` CR by running the following command: ++ +[source,terminal] +---- +$ oc apply -f backup-control-plane.yaml +---- + +.Verification + +* Verify if the value of the `status.phase` is `Completed` by running the following command: ++ +[source,terminal] +---- +$ oc get backup <backup_resource_name> -n openshift-adp -o jsonpath='{.status.phase}' +---- + +.Next steps + +* Restoring a hosted cluster by using {oadp-short} diff --git a/modules/hcp-dr-oadp-observe-velero.adoc b/modules/hcp-dr-oadp-observe-velero.adoc new file mode 100644 index 0000000000..05bcc8eac8 --- /dev/null +++ b/modules/hcp-dr-oadp-observe-velero.adoc @@ -0,0 +1,34 @@ +// Module included in the following assemblies: +// +// * hosted_control_planes/hcp_high_availability/hcp-disaster-recovery-oadp.adoc + +:_mod-docs-content-type: PROCEDURE +[id="hcp-dr-oadp-observe-velero_{context}"] += Using the velero CLI to describe the Backup and Restore resources + +When using {oadp-full}, you can get more details of the `Backup` and `Restore` resources by using the `velero` command-line interface (CLI). + +.Procedure + +. Create an alias to use the `velero` CLI from a container by running the following command: ++ +[source,terminal] +---- +$ alias velero='oc -n openshift-adp exec deployment/velero -c velero -it -- ./velero' +---- + +. 
Get details of your `Restore` custom resource (CR) by running the following command: ++ +[source,terminal] +---- +$ velero restore describe <restore_resource_name> --details <1> +---- +<1> Replace `<restore_resource_name>` with the name of your `Restore` resource. + +. Get details of your `Backup` CR by running the following command: ++ +[source,terminal] +---- +$ velero backup describe <backup_resource_name> --details <1> +---- +<1> Replace `<backup_resource_name>` with the name of your `Backup` resource. diff --git a/modules/hcp-dr-oadp-observe.adoc b/modules/hcp-dr-oadp-observe.adoc new file mode 100644 index 0000000000..f66640aa0f --- /dev/null +++ b/modules/hcp-dr-oadp-observe.adoc @@ -0,0 +1,39 @@ +// Module included in the following assemblies: +// +// * hosted_control_planes/hcp_high_availability/hcp-disaster-recovery-oadp.adoc + +:_mod-docs-content-type: PROCEDURE +[id="hcp-dr-oadp-observe_{context}"] += Observing the backup and restore process + +When using {oadp-first} to back up and restore a hosted cluster, you can monitor and observe the process. + +.Procedure + +. Observe the backup process by running the following command: ++ +[source,terminal] +---- +$ watch "oc get backup -n openshift-adp <backup_resource_name> -o jsonpath='{.status}'" +---- + +. Observe the restore process by running the following command: ++ +[source,terminal] +---- +$ watch "oc get restore -n openshift-adp <restore_resource_name> -o jsonpath='{.status}'" +---- + +. Observe the Velero logs by running the following command: ++ +[source,terminal] +---- +$ oc logs -n openshift-adp -ldeploy=velero -f +---- + +. 
Observe the progress of all of the {oadp-short} objects by running the following command: ++ +[source,terminal] +---- +$ watch "echo BackupRepositories:;echo;oc get backuprepositories.velero.io -A;echo; echo BackupStorageLocations: ;echo; oc get backupstoragelocations.velero.io -A;echo;echo DataUploads: ;echo;oc get datauploads.velero.io -A;echo;echo DataDownloads: ;echo;oc get datadownloads.velero.io -n openshift-adp; echo;echo VolumeSnapshotLocations: ;echo;oc get volumesnapshotlocations.velero.io -A;echo;echo Backups:;echo;oc get backup -A; echo;echo Restores:;echo;oc get restore -A" +---- diff --git a/modules/hcp-dr-oadp-restore.adoc b/modules/hcp-dr-oadp-restore.adoc new file mode 100644 index 0000000000..f3c5e9ddab --- /dev/null +++ b/modules/hcp-dr-oadp-restore.adoc @@ -0,0 +1,115 @@ +// Module included in the following assemblies: +// +// * hosted_control_planes/hcp-disaster-recovery-oadp.adoc + +:_mod-docs-content-type: PROCEDURE +[id="hcp-dr-oadp-restore_{context}"] += Restoring a hosted cluster by using {oadp-short} + +You can restore the hosted cluster by creating the `Restore` custom resource (CR). + +* If you are using an _in-place_ update, InfraEnv does not need spare nodes. You need to re-provision the worker nodes from the new management cluster. +* If you are using a _replace_ update, you need some spare nodes for InfraEnv to deploy the worker nodes. + +[IMPORTANT] +==== +After you back up your hosted cluster, you must destroy it to initiate the restoring process. To initiate node provisioning, you must back up workloads in the data plane before deleting the hosted cluster. +==== + +.Prerequisites + +* You completed the steps in link:https://docs.redhat.com/en/documentation/red_hat_advanced_cluster_management_for_kubernetes/2.11/html/clusters/cluster_mce_overview#remove-a-cluster-by-using-the-console[Removing a cluster by using the console] to delete your hosted cluster. 
+* You completed the steps in link:https://docs.redhat.com/en/documentation/red_hat_advanced_cluster_management_for_kubernetes/2.11/html/clusters/cluster_mce_overview#removing-a-cluster-from-management-in-special-cases[Removing remaining resources after removing a cluster]. + +To monitor and observe the backup process, see "Observing the backup and restore process". + +.Procedure + +. Verify that no pods and persistent volume claims (PVCs) are present in the hosted control plane namespace by running the following command: ++ +[source,terminal] +---- +$ oc get pod pvc -n +---- ++ +.Expected output +[source,terminal] +---- +No resources found +---- + +. Create a YAML file that defines the `Restore` CR: ++ +.Example `restore-hosted-cluster.yaml` file +[source,yaml] +---- +apiVersion: velero.io/v1 +kind: Restore +metadata: + name: <1> + namespace: openshift-adp +spec: + backupName: <2> + restorePVs: true <3> + existingResourcePolicy: update <4> + excludedResources: + - nodes + - events + - events.events.k8s.io + - backups.velero.io + - restores.velero.io + - resticrepositories.velero.io +---- +<1> Replace `` with the name of your `Restore` resource. +<2> Replace `` with the name of your `Backup` resource. +<3> Initiates the recovery of persistent volumes (PVs) and its pods. +<4> Ensures that the existing objects are overwritten with the backed up content. ++ +[IMPORTANT] +==== +You must create the `infraenv` resource in a separate namespace. Do not delete the `infraenv` resource during the restore process. The `infraenv` resource is mandatory for the new nodes to be reprovisioned. +==== + +. Apply the `Restore` CR by running the following command: ++ +[source,terminal] +---- +$ oc apply -f restore-hosted-cluster.yaml +---- + +. Verify if the value of the `status.phase` is `Completed` by running the following command: ++ +[source,terminal] +---- +$ oc get hostedcluster -n -o jsonpath='{.status.phase}' +---- + +. 
After the restore process is complete, start the reconciliation of the `HostedCluster` and `NodePool` resources that you paused during backing up of the control plane workload: + +.. Start the reconciliation of the `HostedCluster` resource by running the following command: ++ +[source,terminal] +---- +$ oc --kubeconfig \ + patch hostedcluster -n \ + --type json -p '[{"op": "add", "path": "/spec/pausedUntil", "value": "false"}]' +---- + +.. Start the reconciliation of the `NodePool` resource by running the following command: ++ +[source,terminal] +---- +$ oc --kubeconfig \ + patch nodepool -n \ + --type json -p '[{"op": "add", "path": "/spec/pausedUntil", "value": "false"}]' +---- + +. Scale the `NodePool` resource to the desired number of replicas by running the following command: ++ +[source,terminal] +---- +$ oc --kubeconfig \ + scale nodepool -n \ + --replicas <1> +---- +<1> Replace `` by an integer value, for example, `3`. diff --git a/modules/hcp-mgmt-component-loss-impact.adoc b/modules/hcp-mgmt-component-loss-impact.adoc new file mode 100644 index 0000000000..5fde391502 --- /dev/null +++ b/modules/hcp-mgmt-component-loss-impact.adoc @@ -0,0 +1,33 @@ +// Module included in the following assemblies: +// +// * hosted_control_planes/hcp-updating.adoc + +:_mod-docs-content-type: CONCEPT +[id="hcp-mgmt-component-loss-impact_{context}"] += Impact of the failed management cluster component + +If the management cluster component fails, your workload remains unaffected. In the {product-title} management cluster, the control plane is decoupled from the data plane to provide resiliency. + +The following table covers the impact of a failed management cluster component on the control plane and the data plane. However, the table does not cover all scenarios for the management cluster component failures. 
+ +.Impact of the failed component on hosted control planes +[cols="1,1,1",options="header"] +|=== +|Name of the failed component |Hosted control plane API status |Hosted cluster data plane status + +|Worker node +|Available +|Available + +|Availability zone +|Available +|Available + +|Management cluster control plane +|Available +|Available + +|Management cluster control plane and worker nodes +|Not available +|Available +|===