diff --git a/_topic_map.yml b/_topic_map.yml index 615c77b973..fbbc0f3231 100644 --- a/_topic_map.yml +++ b/_topic_map.yml @@ -785,7 +785,7 @@ Topics: - Name: Exposing custom application metrics for autoscaling File: exposing-custom-application-metrics-for-autoscaling --- -Name: Metering +Name: Metering Dir: metering Distros: openshift-enterprise,openshift-origin Topics: @@ -826,18 +826,23 @@ Topics: - Name: What huge pages do and how they are consumed by apps File: what-huge-pages-do-and-how-they-are-consumed-by-apps --- -Name: Disaster recovery -Dir: disaster_recovery +Name: Backup and restore +Dir: backup_and_restore Distros: openshift-origin,openshift-enterprise Topics: - Name: Backing up etcd data File: backing-up-etcd -- Name: Recovering from lost master hosts - File: scenario-1-infra-recovery -- Name: Restoring back to a previous cluster state - File: scenario-2-restoring-cluster-state -- Name: Recovering from expired control plane certificates - File: scenario-3-expired-certs +- Name: Disaster recovery + Dir: disaster_recovery + Topics: + - Name: About disaster recovery + File: about-disaster-recovery + - Name: Recovering from lost master hosts + File: scenario-1-infra-recovery + - Name: Restoring to a previous cluster state + File: scenario-2-restoring-cluster-state + - Name: Recovering from expired control plane certificates + File: scenario-3-expired-certs --- Name: CLI reference Dir: cli_reference diff --git a/backup_and_restore/backing-up-etcd.adoc b/backup_and_restore/backing-up-etcd.adoc new file mode 100644 index 0000000000..fef698e660 --- /dev/null +++ b/backup_and_restore/backing-up-etcd.adoc @@ -0,0 +1,26 @@ +[id="backup-etcd"] += Backing up etcd +include::modules/common-attributes.adoc[] +:context: backup-etcd + +toc::[] + +etcd is the key-value store for {product-title}, which persists the state of all +resource objects. 
+ +Back up your cluster's etcd data regularly and store it in a secure location, +ideally outside the {product-title} environment. Do not take an etcd backup +before the first certificate rotation completes, which occurs 24 hours after +installation; otherwise, the backup will contain expired certificates. It is also +recommended to take etcd backups during non-peak usage hours, because taking a +backup is a blocking action. + +Once you have an etcd backup, you can xref:../backup_and_restore/disaster_recovery/scenario-1-infra-recovery.adoc#dr-infrastructure-recovery[recover from lost master hosts] +and xref:../backup_and_restore/disaster_recovery/scenario-2-restoring-cluster-state.adoc#dr-restoring-cluster-state[restore to a previous cluster state]. + +You can perform the xref:../backup_and_restore/backing-up-etcd.adoc#backing-up-etcd-data_backup-etcd[etcd data backup process] +on any master host that has connectivity to the etcd cluster, where the proper +certificates are provided. + +// Backing up etcd data +include::modules/backup-etcd.adoc[leveloffset=+1] diff --git a/backup_and_restore/disaster_recovery/about-disaster-recovery.adoc b/backup_and_restore/disaster_recovery/about-disaster-recovery.adoc new file mode 100644 index 0000000000..61aff6617f --- /dev/null +++ b/backup_and_restore/disaster_recovery/about-disaster-recovery.adoc @@ -0,0 +1,35 @@ +[id="about-dr"] += About disaster recovery +include::modules/common-attributes.adoc[] +:context: about-dr + +toc::[] + +The disaster recovery documentation provides information for administrators on +how to recover from several disaster situations that might occur with their +{product-title} cluster. As an administrator, you might need to follow one or +more of the following procedures to return your cluster to a working +state. 
+ +xref:../../backup_and_restore/disaster_recovery/scenario-1-infra-recovery.adoc#dr-infrastructure-recovery[Recovering from lost master hosts]:: +This solution handles situations where you have lost the majority of your master +hosts, leading to etcd quorum loss and the cluster going offline. As long as you +have taken an etcd backup and have at least one remaining healthy master host, +you can follow this procedure to recover your cluster. ++ +If applicable, you might also need to xref:../../backup_and_restore/disaster_recovery/scenario-3-expired-certs.adoc#dr-recovering-expired-certs[recover from expired control plane certificates]. + +xref:../../backup_and_restore/disaster_recovery/scenario-2-restoring-cluster-state.adoc#dr-restoring-cluster-state[Restoring to a previous cluster state]:: +This solution handles situations where you want to restore your cluster to +a previous state, for example, if an administrator deletes something critical. +As long as you have taken an etcd backup, you can follow this procedure to +restore your cluster to a previous state. ++ +If applicable, you might also need to xref:../../backup_and_restore/disaster_recovery/scenario-3-expired-certs.adoc#dr-recovering-expired-certs[recover from expired control plane certificates]. + +xref:../../backup_and_restore/disaster_recovery/scenario-3-expired-certs.adoc#dr-recovering-expired-certs[Recovering from expired control plane certificates]:: +This solution handles situations where your control plane certificates have +expired. For example, if you shut down your cluster before the first certificate +rotation, which occurs 24 hours after installation, your certificates will not +be rotated and will expire. You can follow this procedure to recover from +expired control plane certificates. 
diff --git a/disaster_recovery/images b/backup_and_restore/disaster_recovery/images similarity index 100% rename from disaster_recovery/images rename to backup_and_restore/disaster_recovery/images diff --git a/disaster_recovery/modules b/backup_and_restore/disaster_recovery/modules similarity index 100% rename from disaster_recovery/modules rename to backup_and_restore/disaster_recovery/modules diff --git a/disaster_recovery/scenario-1-infra-recovery.adoc b/backup_and_restore/disaster_recovery/scenario-1-infra-recovery.adoc similarity index 70% rename from disaster_recovery/scenario-1-infra-recovery.adoc rename to backup_and_restore/disaster_recovery/scenario-1-infra-recovery.adoc index 018fa272c7..74199350e7 100644 --- a/disaster_recovery/scenario-1-infra-recovery.adoc +++ b/backup_and_restore/disaster_recovery/scenario-1-infra-recovery.adoc @@ -6,7 +6,7 @@ include::modules/common-attributes.adoc[] toc::[] This document describes the process to recover from a complete loss of a master host. This includes -situations where a majority of master hosts have been lost, leading to etcd quorum loss and the cluster going offline. +situations where a majority of master hosts have been lost, leading to etcd quorum loss and the cluster going offline. This procedure assumes that you have at least one healthy master host. At a high level, the procedure is to: @@ -15,7 +15,7 @@ At a high level, the procedure is to: . Correct DNS and load balancer entries. . Grow etcd to full membership. -If the majority of master hosts have been lost, you will need a xref:../disaster_recovery/backing-up-etcd.html#backing-up-etcd-data_backup-etcd[backed up etcd snapshot] to restore etcd quorum on the remaining master host. +If the majority of master hosts have been lost, you will need a xref:../../backup_and_restore/backing-up-etcd.adoc#backing-up-etcd-data_backup-etcd[backed up etcd snapshot] to restore etcd quorum on the remaining master host. 
// Recovering from lost master hosts include::modules/dr-recover-lost-control-plane-hosts.adoc[leveloffset=+1] diff --git a/backup_and_restore/disaster_recovery/scenario-2-restoring-cluster-state.adoc b/backup_and_restore/disaster_recovery/scenario-2-restoring-cluster-state.adoc new file mode 100644 index 0000000000..6ad134bfe4 --- /dev/null +++ b/backup_and_restore/disaster_recovery/scenario-2-restoring-cluster-state.adoc @@ -0,0 +1,11 @@ +[id="dr-restoring-cluster-state"] += Restoring to a previous cluster state +include::modules/common-attributes.adoc[] +:context: dr-restoring-cluster-state + +toc::[] + +To restore the cluster to a previous state, you must have previously xref:../../backup_and_restore/backing-up-etcd.adoc#backing-up-etcd-data_backup-etcd[backed up etcd data] by creating a snapshot. You will use this snapshot to restore the cluster state. + +// Restoring to a previous cluster state +include::modules/dr-restoring-cluster-state.adoc[leveloffset=+1] diff --git a/disaster_recovery/scenario-3-expired-certs.adoc b/backup_and_restore/disaster_recovery/scenario-3-expired-certs.adoc similarity index 100% rename from disaster_recovery/scenario-3-expired-certs.adoc rename to backup_and_restore/disaster_recovery/scenario-3-expired-certs.adoc diff --git a/disaster_recovery/backing-up-etcd.adoc b/disaster_recovery/backing-up-etcd.adoc deleted file mode 100644 index 88d23be092..0000000000 --- a/disaster_recovery/backing-up-etcd.adoc +++ /dev/null @@ -1,9 +0,0 @@ -[id="backup-etcd"] -= Backing up etcd -include::modules/common-attributes.adoc[] -:context: backup-etcd - -toc::[] - -// Backing up etcd data -include::modules/backup-etcd.adoc[leveloffset=+1] diff --git a/disaster_recovery/scenario-2-restoring-cluster-state.adoc b/disaster_recovery/scenario-2-restoring-cluster-state.adoc deleted file mode 100644 index 9520749dc6..0000000000 --- a/disaster_recovery/scenario-2-restoring-cluster-state.adoc +++ /dev/null @@ -1,11 +0,0 @@ -[id="dr-restoring-cluster-state"] 
-= Restoring back to a previous cluster state -include::modules/common-attributes.adoc[] -:context: dr-restoring-cluster-state - -toc::[] - -In order to restore the cluster to a previous state, you must have previously xref:../disaster_recovery/backing-up-etcd.html#backing-up-etcd-data_backup-etcd[backed up etcd data] by creating a snapshot. You will use this snapshot to restore the cluster state. - -// Restoring back to a previous cluster state -include::modules/dr-restoring-cluster-state.adoc[leveloffset=+1] diff --git a/modules/dr-restoring-cluster-state.adoc b/modules/dr-restoring-cluster-state.adoc index d3bc9c21a8..0878081adb 100644 --- a/modules/dr-restoring-cluster-state.adoc +++ b/modules/dr-restoring-cluster-state.adoc @@ -3,7 +3,7 @@ // * disaster_recovery/scenario-2-restoring-cluster-state.adoc [id="dr-scenario-2-restoring-cluster-state_{context}"] -= Restoring back to a previous cluster state += Restoring to a previous cluster state You can use a saved etcd snapshot to restore back to a previous cluster state.