From c966c2cf45ef48967c3eca031a91cc9b743a3ba0 Mon Sep 17 00:00:00 2001 From: srir Date: Thu, 4 Sep 2025 18:18:18 +0530 Subject: [PATCH] OSDOCS#12987: Two-node OpenShift cluster with fencing (Technology Preview) --- _topic_maps/_topic_map.yml | 17 + .../installing_two_node_cluster/_attributes | 1 + .../about-two-node-arbiter-installation.adoc | 16 + installing/installing_two_node_cluster/images | 1 + .../installing_tnf/_attributes | 1 + .../installing_tnf/images | 1 + .../installing_tnf/install-post-tnf.adoc | 33 ++ .../installing_tnf/install-tnf.adoc | 18 + .../installing-two-node-fencing.adoc | 67 ++++ .../installing_tnf/modules | 1 + .../installing_tnf/snippets | 1 + .../installing_two_node_cluster/modules | 1 + .../installing_two_node_cluster/snippets | 1 + modules/installation-dns-installer-infra.adoc | 7 + modules/installation-dns-user-infra.adoc | 96 +++-- ...overing-when-auto-recovery-is-unavail.adoc | 161 +++++++++ ...llation-replacing-control-plane-nodes.adoc | 330 ++++++++++++++++++ ...e-install-config-two-node-fencing-ipi.adoc | 70 ++++ ...e-install-config-two-node-fencing-upi.adoc | 48 +++ ...on-two-node-cluster-min-resource-reqs.adoc | 23 ++ ...o-node-creating-manifest-custom-br-ex.adoc | 7 + ...ion-two-node-ingress-lb-configuration.adoc | 71 ++++ .../installation-verifying-etcd-health.adoc | 52 +++ modules/ipi-install-network-requirements.adoc | 52 +-- ...nodes-cluster-enabling-features-about.adoc | 2 + snippets/dns-requirements.adoc | 52 +++ 26 files changed, 1049 insertions(+), 81 deletions(-) create mode 120000 installing/installing_two_node_cluster/_attributes create mode 100644 installing/installing_two_node_cluster/about-two-node-arbiter-installation.adoc create mode 120000 installing/installing_two_node_cluster/images create mode 120000 installing/installing_two_node_cluster/installing_tnf/_attributes create mode 120000 installing/installing_two_node_cluster/installing_tnf/images create mode 100644 installing/installing_two_node_cluster/installing_tnf/install-post-tnf.adoc create mode 100644 installing/installing_two_node_cluster/installing_tnf/install-tnf.adoc create mode 100644 installing/installing_two_node_cluster/installing_tnf/installing-two-node-fencing.adoc create mode 120000 installing/installing_two_node_cluster/installing_tnf/modules create mode 120000 installing/installing_two_node_cluster/installing_tnf/snippets create mode 120000 installing/installing_two_node_cluster/modules create mode 120000 installing/installing_two_node_cluster/snippets create mode 100644 modules/installation-dns-installer-infra.adoc create mode 100644 modules/installation-manual-recovering-when-auto-recovery-is-unavail.adoc create mode 100644 modules/installation-replacing-control-plane-nodes.adoc create mode 100644 modules/installation-sample-install-config-two-node-fencing-ipi.adoc create mode 100644 modules/installation-sample-install-config-two-node-fencing-upi.adoc create mode 100644 modules/installation-two-node-cluster-min-resource-reqs.adoc create mode 100644 modules/installation-two-node-creating-manifest-custom-br-ex.adoc create mode 100644 modules/installation-two-node-ingress-lb-configuration.adoc create mode 100644 modules/installation-verifying-etcd-health.adoc create mode 100644 snippets/dns-requirements.adoc diff --git a/_topic_maps/_topic_map.yml b/_topic_maps/_topic_map.yml index 745942f92f..4fae32fdaf 100644 --- a/_topic_maps/_topic_map.yml +++ b/_topic_maps/_topic_map.yml @@ -427,6 +427,23 @@ Topics: File: install-sno-preparing-to-install-sno - Name: Installing 
OpenShift on a single node File: install-sno-installing-sno +- Name: Installing a Two Node OpenShift Cluster + Dir: installing_two_node_cluster + Distros: openshift-origin,openshift-enterprise + Topics: + - Name: Two-Node with Arbiter + File: about-two-node-arbiter-installation + - Name: Two-node with Fencing + Dir: installing_tnf + Distros: openshift-enterprise,openshift-origin + Topics: + - Name: Preparing to install a two-node OpenShift cluster with fencing + File: installing-two-node-fencing + - Name: Installing a two-node OpenShift cluster with fencing + File: install-tnf + - Name: Post-installation troubleshooting and recovery + File: install-post-tnf + - Name: Installing on bare metal Dir: installing_bare_metal Distros: openshift-origin,openshift-enterprise diff --git a/installing/installing_two_node_cluster/_attributes b/installing/installing_two_node_cluster/_attributes new file mode 120000 index 0000000000..20cc1dcb77 --- /dev/null +++ b/installing/installing_two_node_cluster/_attributes @@ -0,0 +1 @@ +../../_attributes/ \ No newline at end of file diff --git a/installing/installing_two_node_cluster/about-two-node-arbiter-installation.adoc b/installing/installing_two_node_cluster/about-two-node-arbiter-installation.adoc new file mode 100644 index 0000000000..e845f1a214 --- /dev/null +++ b/installing/installing_two_node_cluster/about-two-node-arbiter-installation.adoc @@ -0,0 +1,16 @@ +:_mod-docs-content-type: ASSEMBLY +[id="about-two-node-arbiter-installation"] += Two-Node with Arbiter +:context: about-two-node-arbiter-installation + +A Two-Node OpenShift with Arbiter (TNA) cluster is a compact, cost-effective {product-title} topology. The topology consists of two control plane nodes and a lightweight arbiter node. The arbiter node stores the full etcd data, maintaining an etcd quorum and preventing split brain. The arbiter node does not run the additional control plane components `kube-apiserver` and `kube-controller-manager`, nor does it run workloads. + +To install a Two-Node OpenShift with Arbiter cluster, assign an arbiter role to at least one of the nodes and set the control plane node count for the cluster to 2. Although {product-title} does not currently impose a limit on the number of arbiter nodes, the typical deployment includes only one to minimize the use of hardware resources. + +After installation, you can add additional arbiter nodes to a Two-Node OpenShift with Arbiter cluster but not to a standard multi-node cluster. It is also not possible to convert between a Two-Node OpenShift with Arbiter and standard topology. 
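+
+For illustration only, the machine pools for this topology follow the pattern in the sketch below; the exact `install-config.yaml` schema, including the `arbiter` machine pool and the arbiter host role, is defined in the procedures linked after this paragraph, so treat the field names shown here as assumptions to verify against your installer version:
+
+[source,yaml]
+----
+# Hypothetical sketch of the machine pools for a Two-Node OpenShift with Arbiter cluster.
+controlPlane:
+  name: master
+  replicas: 2
+arbiter:
+  name: arbiter
+  replicas: 1
+compute:
+- name: worker
+  replicas: 0
+----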
+ +You can install a Two-Node Arbiter cluster by using one of the following methods: + +* Installing on bare metal: xref:../installing_bare_metal/ipi/ipi-install-installation-workflow.adoc#ipi-install-config-local-arbiter-node_ipi-install-installation-workflow[Configuring a local arbiter node] + +* Installing with the Agent-based Installer: xref:../../installing/installing_with_agent_based_installer/installing-with-agent-based-installer.adoc#installing-ocp-agent-local-arbiter-node_installing-with-agent-based-installer[Configuring a local arbiter node] diff --git a/installing/installing_two_node_cluster/images b/installing/installing_two_node_cluster/images new file mode 120000 index 0000000000..847b03ed05 --- /dev/null +++ b/installing/installing_two_node_cluster/images @@ -0,0 +1 @@ +../../images/ \ No newline at end of file diff --git a/installing/installing_two_node_cluster/installing_tnf/_attributes b/installing/installing_two_node_cluster/installing_tnf/_attributes new file mode 120000 index 0000000000..20cc1dcb77 --- /dev/null +++ b/installing/installing_two_node_cluster/installing_tnf/_attributes @@ -0,0 +1 @@ +../../_attributes/ \ No newline at end of file diff --git a/installing/installing_two_node_cluster/installing_tnf/images b/installing/installing_two_node_cluster/installing_tnf/images new file mode 120000 index 0000000000..847b03ed05 --- /dev/null +++ b/installing/installing_two_node_cluster/installing_tnf/images @@ -0,0 +1 @@ +../../images/ \ No newline at end of file diff --git a/installing/installing_two_node_cluster/installing_tnf/install-post-tnf.adoc b/installing/installing_two_node_cluster/installing_tnf/install-post-tnf.adoc new file mode 100644 index 0000000000..1d6cc76fb1 --- /dev/null +++ b/installing/installing_two_node_cluster/installing_tnf/install-post-tnf.adoc @@ -0,0 +1,33 @@ +:_mod-docs-content-type: ASSEMBLY +[id="installing-post-tnf"] += Post-installation troubleshooting and recovery +include::_attributes/common-attributes.adoc[] +:context: install-post-tnf + +toc::[] + +The following sections help with recovering from issues in a two-node OpenShift cluster with fencing. + +:FeatureName: Two-node OpenShift cluster with fencing +include::snippets/technology-preview.adoc[leveloffset=+1] + +// Manually recovering from a disruption event when automated recovery is unavailable +include::modules/installation-manual-recovering-when-auto-recovery-is-unavail.adoc[leveloffset=+1] + +[role="_additional-resources"] +== Additional resources + +* xref:../../../backup_and_restore/control_plane_backup_and_restore/backing-up-etcd.adoc#backup-etcd-restoring_backing-up-etcd[Restoring etcd from a backup]. + +* xref:../installing_tnf/install-post-tnf.adoc#installation-verifying-etcd-health_install-post-tnf[Verifying etcd health in a two-node OpenShift cluster with fencing] + +// Replacing control plane nodes +include::modules/installation-replacing-control-plane-nodes.adoc[leveloffset=+1] + +[role="_additional-resources"] +== Additional resources + +* xref:../../../backup_and_restore/control_plane_backup_and_restore/backing-up-etcd.adoc#backup-etcd-restoring_backing-up-etcd[Restoring etcd from a backup]. 
+
+// Verifying etcd health in a two-node OpenShift cluster with fencing
+include::modules/installation-verifying-etcd-health.adoc[leveloffset=+1]
\ No newline at end of file
diff --git a/installing/installing_two_node_cluster/installing_tnf/install-tnf.adoc b/installing/installing_two_node_cluster/installing_tnf/install-tnf.adoc
new file mode 100644
index 0000000000..a0622fa772
--- /dev/null
+++ b/installing/installing_two_node_cluster/installing_tnf/install-tnf.adoc
@@ -0,0 +1,18 @@
+:_mod-docs-content-type: ASSEMBLY
+[id="installing-tnf"]
+= Installing a two-node OpenShift cluster with fencing
+include::_attributes/common-attributes.adoc[]
+:context: install-tnf
+
+toc::[]
+
+You can deploy a two-node OpenShift cluster with fencing by using either the installer-provisioned infrastructure or the user-provisioned infrastructure installation method. The following examples provide sample `install-config.yaml` configurations for both methods.
+
+:FeatureName: Two-node OpenShift cluster with fencing
+include::snippets/technology-preview.adoc[leveloffset=+1]
+
+// Sample install-config.yaml for a two-node installer-provisioned infrastructure cluster with fencing
+include::modules/installation-sample-install-config-two-node-fencing-ipi.adoc[leveloffset=+1]
+
+// Sample install-config.yaml for a two-node user-provisioned infrastructure cluster with fencing
+include::modules/installation-sample-install-config-two-node-fencing-upi.adoc[leveloffset=+1]
diff --git a/installing/installing_two_node_cluster/installing_tnf/installing-two-node-fencing.adoc b/installing/installing_two_node_cluster/installing_tnf/installing-two-node-fencing.adoc
new file mode 100644
index 0000000000..b9e06dd77e
--- /dev/null
+++ b/installing/installing_two_node_cluster/installing_tnf/installing-two-node-fencing.adoc
@@ -0,0 +1,67 @@
+:_mod-docs-content-type: ASSEMBLY
+[id="installing-two-node-fencing"]
+= Preparing to install a two-node OpenShift cluster with fencing
+include::_attributes/common-attributes.adoc[]
+:context: installing-two-node-fencing
+
+toc::[]
+
+:FeatureName: Two-node OpenShift cluster with fencing
+include::snippets/technology-preview.adoc[leveloffset=+1]
+
+A two-node OpenShift cluster with fencing provides high availability (HA) with a reduced hardware footprint. This configuration is designed for distributed or edge environments where deploying a full three-node control plane cluster is not practical.
+
+A two-node cluster does not include compute nodes. The two control plane machines run user workloads in addition to managing the cluster.
+
+Fencing is managed by Pacemaker, which can isolate an unresponsive node by using the Baseboard Management Controller (BMC) of the node. After the unresponsive node is fenced, the remaining node can safely continue operating the cluster without the risk of resource corruption.
+
+[NOTE]
+====
+You can deploy a two-node OpenShift cluster with fencing by using either the user-provisioned infrastructure method or the installer-provisioned infrastructure method.
+====
+
+The two-node OpenShift cluster with fencing requires the following hosts:
+
+.Minimum required hosts
+[options="header"]
+|===
+
+|Hosts |Description
+
+|Two control plane machines
+|The control plane machines run the Kubernetes and {product-title} services that form the control plane.
+
+|One temporary bootstrap machine
+|You need a bootstrap machine to deploy the {product-title} cluster on the control plane machines. You can remove the bootstrap machine after you install the cluster.
+
+|===
+
+The bootstrap and control plane machines must use Red Hat Enterprise Linux CoreOS (RHCOS) as the operating system. For instructions on installing RHCOS and starting the bootstrap process, see xref:../../../installing/installing_bare_metal/upi/installing-bare-metal-network-customizations.adoc#creating-machines-bare-metal_installing-bare-metal-network-customizations[Installing {op-system} and starting the {product-title} bootstrap process].
+
+[NOTE]
+====
+The requirement to use RHCOS applies only to user-provisioned infrastructure deployments. For installer-provisioned infrastructure deployments, the bootstrap and control plane machines are provisioned automatically by the installation program, and you do not need to manually install RHCOS.
+====
+
+include::modules/installation-two-node-cluster-min-resource-reqs.adoc[leveloffset=+1]
+
+// Two-node-dns-requirements - user-provisioned infrastructure
+include::modules/installation-dns-user-infra.adoc[leveloffset=+1]
+
+// Two-node-dns-requirements - installer-provisioned infrastructure
+include::modules/installation-dns-installer-infra.adoc[leveloffset=+1]
+
+// Configuring an Ingress load balancer to work with Pacemaker
+include::modules/installation-two-node-ingress-lb-configuration.adoc[leveloffset=+1]
+
+// Creating a manifest object that includes a customized br-ex bridge
+include::modules/installation-two-node-creating-manifest-custom-br-ex.adoc[leveloffset=+1]
+
+[role="_additional-resources"]
+== Additional resources
+
+* xref:../../../installing/installing_bare_metal/ipi/ipi-install-installation-workflow.adoc#creating-manifest-file-customized-br-ex-bridge_ipi-install-installation-workflow[Creating a manifest file for a customized br-ex bridge]
+
+* link:https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/9/html/configuring_and_managing_high_availability_clusters/index[Configuring and managing high availability clusters in RHEL]
+ + diff --git a/installing/installing_two_node_cluster/installing_tnf/modules b/installing/installing_two_node_cluster/installing_tnf/modules new file mode 120000 index 0000000000..8b0e854007 --- /dev/null +++ b/installing/installing_two_node_cluster/installing_tnf/modules @@ -0,0 +1 @@ +../../modules \ No newline at end of file diff --git a/installing/installing_two_node_cluster/installing_tnf/snippets b/installing/installing_two_node_cluster/installing_tnf/snippets new file mode 120000 index 0000000000..5a3f5add14 --- /dev/null +++ b/installing/installing_two_node_cluster/installing_tnf/snippets @@ -0,0 +1 @@ +../../snippets/ \ No newline at end of file diff --git a/installing/installing_two_node_cluster/modules b/installing/installing_two_node_cluster/modules new file mode 120000 index 0000000000..8b0e854007 --- /dev/null +++ b/installing/installing_two_node_cluster/modules @@ -0,0 +1 @@ +../../modules \ No newline at end of file diff --git a/installing/installing_two_node_cluster/snippets b/installing/installing_two_node_cluster/snippets new file mode 120000 index 0000000000..5a3f5add14 --- /dev/null +++ b/installing/installing_two_node_cluster/snippets @@ -0,0 +1 @@ +../../snippets/ \ No newline at end of file diff --git a/modules/installation-dns-installer-infra.adoc b/modules/installation-dns-installer-infra.adoc new file mode 100644 index 0000000000..19518e6494 --- /dev/null +++ b/modules/installation-dns-installer-infra.adoc @@ -0,0 +1,7 @@ + + +:_mod-docs-content-type: CONCEPT +[id="installation-installer-user-infra_{context}"] += Installer-provisioned DNS requirements + +include::snippets/dns-requirements.adoc[leveloffset=+1] \ No newline at end of file diff --git a/modules/installation-dns-user-infra.adoc b/modules/installation-dns-user-infra.adoc index 05cca0fc00..0320d75734 100644 --- a/modules/installation-dns-user-infra.adoc +++ b/modules/installation-dns-user-infra.adoc @@ -46,9 +46,18 @@ In {product-title} deployments, DNS name resolution is required for the followin * The Kubernetes API * The {product-title} application wildcard -* The bootstrap, control plane, and compute machines +* The bootstrap and control plane machines +ifeval::["{context}" != "installing-two-node-fencing"] +* The compute machines +endif::[] +ifeval::["{context}" == "installing-two-node-fencing"] +Reverse DNS resolution is also required for the Kubernetes API, the bootstrap machine, and the control plane machines. +endif::[] + +ifeval::["{context}" != "installing-two-node-fencing"] Reverse DNS resolution is also required for the Kubernetes API, the bootstrap machine, the control plane machines, and the compute machines. +endif::[] DNS A/AAAA or CNAME records are used for name resolution and PTR records are used for reverse name resolution. The reverse records are important because {op-system-first} uses the reverse records to set the hostnames for all the nodes, unless the hostnames are provided by DHCP. Additionally, the reverse records are used to generate the certificate signing requests (CSR) that {product-title} needs to operate. @@ -84,7 +93,14 @@ names, then proxied API calls can fail, and you cannot retrieve logs from pods. |Routes |`*.apps...` -|A wildcard DNS A/AAAA or CNAME record that refers to the application ingress load balancer. The application ingress load balancer targets the machines that run the Ingress Controller pods. The Ingress Controller pods run on the compute machines by default. 
These records must be resolvable by both clients external to the cluster and from all the nodes within the cluster. +|A wildcard DNS A/AAAA or CNAME record that refers to the application ingress load balancer. The application ingress load balancer targets the machines that run the Ingress Controller pods. +ifeval::["{context}" == "installing-two-node-fencing"] +By default, the Ingress Controller pods run on compute nodes. In cluster topologies without dedicated compute nodes, such as two-node or three-node clusters, the control plane nodes also carry the worker label, so the Ingress pods are scheduled on the control plane nodes. +endif::[] +ifeval::["{context}" != "installing-two-node-fencing"] +The Ingress Controller pods run on the compute machines by default. +endif::[] +These records must be resolvable by both clients external to the cluster and from all the nodes within the cluster. For example, `console-openshift-console.apps..` is used as a wildcard route to the {product-title} console. @@ -98,10 +114,12 @@ machine. These records must be resolvable by the nodes within the cluster. |DNS A/AAAA or CNAME records and DNS PTR records to identify each machine for the control plane nodes. These records must be resolvable by the nodes within the cluster. +ifeval::["{context}" != "installing-two-node-fencing"] |Compute machines |`...` |DNS A/AAAA or CNAME records and DNS PTR records to identify each machine for the worker nodes. These records must be resolvable by the nodes within the cluster. +endif::[] |=== @@ -115,6 +133,7 @@ In {product-title} 4.4 and later, you do not need to specify etcd host and SRV r You can use the `dig` command to verify name and reverse name resolution. See the section on _Validating DNS resolution for user-provisioned infrastructure_ for detailed validation steps. ==== + [id="installation-dns-user-infra-example_{context}"] == Example DNS configuration for user-provisioned clusters @@ -122,6 +141,13 @@ This section provides A and PTR record configuration samples that meet the DNS r In the examples, the cluster name is `ocp4` and the base domain is `example.com`. +ifeval::["{context}" == "installing-two-node-fencing"] +[NOTE] +==== +In a two-node cluster with fencing, the control plane machines are also schedulable worker nodes. The DNS configuration must therefore include only the two control plane nodes. If you later add compute machines, provide corresponding A and PTR records for them as in a standard user-provisioned installation. +==== +endif::[] + .Example DNS A record configuration for a user-provisioned cluster The following example is a BIND zone file that shows sample A records for name resolution in a user-provisioned cluster. @@ -148,35 +174,40 @@ smtp.example.com. IN A 192.168.1.5 helper.example.com. IN A 192.168.1.5 helper.ocp4.example.com. IN A 192.168.1.5 ; -api.ocp4.example.com. IN A 192.168.1.5 <1> -api-int.ocp4.example.com. IN A 192.168.1.5 <2> +api.ocp4.example.com. IN A 192.168.1.5 +api-int.ocp4.example.com. IN A 192.168.1.5 ; -*.apps.ocp4.example.com. IN A 192.168.1.5 <3> +*.apps.ocp4.example.com. IN A 192.168.1.5 ; -bootstrap.ocp4.example.com. IN A 192.168.1.96 <4> +bootstrap.ocp4.example.com. IN A 192.168.1.96 ; -control-plane0.ocp4.example.com. IN A 192.168.1.97 <5> -control-plane1.ocp4.example.com. IN A 192.168.1.98 <5> -control-plane2.ocp4.example.com. IN A 192.168.1.99 <5> +control-plane0.ocp4.example.com. IN A 192.168.1.97 +control-plane1.ocp4.example.com. IN A 192.168.1.98 ; -compute0.ocp4.example.com. 
IN A 192.168.1.11 <6> -compute1.ocp4.example.com. IN A 192.168.1.7 <6> +ifeval::["{context}" != "installing-two-node-fencing"] +control-plane2.ocp4.example.com. IN A 192.168.1.99 +; +compute0.ocp4.example.com. IN A 192.168.1.11 +compute1.ocp4.example.com. IN A 192.168.1.7 +endif::[] ; ;EOF ---- -<1> Provides name resolution for the Kubernetes API. The record refers to the IP address of the API load balancer. -<2> Provides name resolution for the Kubernetes API. The record refers to the IP address of the API load balancer and is used for internal cluster communications. -<3> Provides name resolution for the wildcard routes. The record refers to the IP address of the application ingress load balancer. The application ingress load balancer targets the machines that run the Ingress Controller pods. The Ingress Controller pods run on the compute machines by default. +* `api.ocp4.example.com.`: Provides name resolution for the Kubernetes API. The record refers to the IP address of the API load balancer. +* `api-int.ocp4.example.com.`: Provides name resolution for the Kubernetes API. The record refers to the IP address of the API load balancer and is used for internal cluster communications. +* `*.apps.ocp4.example.com.`: Provides name resolution for the wildcard routes. The record refers to the IP address of the application ingress load balancer. The application ingress load balancer targets the machines that run the Ingress Controller pods. + [NOTE] ===== In the example, the same load balancer is used for the Kubernetes API and application ingress traffic. In production scenarios, you can deploy the API and application ingress load balancers separately so that you can scale the load balancer infrastructure for each in isolation. ===== + -<4> Provides name resolution for the bootstrap machine. -<5> Provides name resolution for the control plane machines. -<6> Provides name resolution for the compute machines. +* `bootstrap.ocp4.example.com.`: Provides name resolution for the bootstrap machine. +* `control-plane0.ocp4.example.com.`: Provides name resolution for the control plane machines. +ifeval::["{context}" != "installing-two-node-fencing"] +* `compute0.ocp4.example.com.`: Provides name resolution for the compute machines. +endif::[] ==== .Example DNS PTR record configuration for a user-provisioned cluster @@ -197,26 +228,31 @@ $TTL 1W 1W ) ; minimum (1 week) IN NS ns1.example.com. ; -5.1.168.192.in-addr.arpa. IN PTR api.ocp4.example.com. <1> -5.1.168.192.in-addr.arpa. IN PTR api-int.ocp4.example.com. <2> +5.1.168.192.in-addr.arpa. IN PTR api.ocp4.example.com. +5.1.168.192.in-addr.arpa. IN PTR api-int.ocp4.example.com. ; -96.1.168.192.in-addr.arpa. IN PTR bootstrap.ocp4.example.com. <3> +96.1.168.192.in-addr.arpa. IN PTR bootstrap.ocp4.example.com. ; -97.1.168.192.in-addr.arpa. IN PTR control-plane0.ocp4.example.com. <4> -98.1.168.192.in-addr.arpa. IN PTR control-plane1.ocp4.example.com. <4> -99.1.168.192.in-addr.arpa. IN PTR control-plane2.ocp4.example.com. <4> +97.1.168.192.in-addr.arpa. IN PTR control-plane0.ocp4.example.com. +98.1.168.192.in-addr.arpa. IN PTR control-plane1.ocp4.example.com. ; -11.1.168.192.in-addr.arpa. IN PTR compute0.ocp4.example.com. <5> -7.1.168.192.in-addr.arpa. IN PTR compute1.ocp4.example.com. <5> +ifeval::["{context}" != "installing-two-node-fencing"] +99.1.168.192.in-addr.arpa. IN PTR control-plane2.ocp4.example.com. +; +11.1.168.192.in-addr.arpa. IN PTR compute0.ocp4.example.com. +7.1.168.192.in-addr.arpa. IN PTR compute1.ocp4.example.com. 
+endif::[]
 ;
 ;EOF
 ----
-<1> Provides reverse DNS resolution for the Kubernetes API. The PTR record refers to the record name of the API load balancer.
-<2> Provides reverse DNS resolution for the Kubernetes API. The PTR record refers to the record name of the API load balancer and is used for internal cluster communications.
-<3> Provides reverse DNS resolution for the bootstrap machine.
-<4> Provides reverse DNS resolution for the control plane machines.
-<5> Provides reverse DNS resolution for the compute machines.
+* `api.ocp4.example.com.`: Provides reverse DNS resolution for the Kubernetes API. The PTR record refers to the record name of the API load balancer.
+* `api-int.ocp4.example.com.`: Provides reverse DNS resolution for the Kubernetes API. The PTR record refers to the record name of the API load balancer and is used for internal cluster communications.
+* `bootstrap.ocp4.example.com.`: Provides reverse DNS resolution for the bootstrap machine.
+* `control-plane0.ocp4.example.com.`: Provides reverse DNS resolution for the control plane machines.
+ifeval::["{context}" != "installing-two-node-fencing"]
+* `compute0.ocp4.example.com.`: Provides reverse DNS resolution for the compute machines.
+endif::[]
 ====
 
 [NOTE]
diff --git a/modules/installation-manual-recovering-when-auto-recovery-is-unavail.adoc b/modules/installation-manual-recovering-when-auto-recovery-is-unavail.adoc
new file mode 100644
index 0000000000..4388e25de8
--- /dev/null
+++ b/modules/installation-manual-recovering-when-auto-recovery-is-unavail.adoc
@@ -0,0 +1,161 @@
+:_mod-docs-content-type: PROCEDURE
+[id="installation-manual-recovering-when-auto-recovery-is-unavail_{context}"]
+= Manually recovering from a disruption event when automated recovery is unavailable
+
+You might need to perform manual recovery steps if a disruption event prevents fencing from functioning correctly. In this case, you can run commands directly on the control plane nodes to recover the cluster. There are four main recovery scenarios, which should be attempted in the following order:
+
+. Update fencing secrets: Refresh the Baseboard Management Controller (BMC) credentials if they are incorrect or outdated.
+. Recover from a single-node failure: Restore functionality when only one control plane node is down.
+. Recover from a complete node failure: Restore functionality when both control plane nodes are down.
+. Replace a control plane node that cannot be recovered: Replace the node to restore cluster functionality.
+
+.Prerequisites
+
+* You have administrative access to the control plane nodes.
+* You can connect to the nodes by using SSH.
+
+[NOTE]
+====
+Back up etcd before proceeding to ensure that you can restore the cluster if any issues occur.
+====
+
+.Procedure
+
+. Update the fencing secrets:
+
+.. If the Cluster API is unavailable, update the fencing secret by running the following command on one of the cluster nodes:
++
+[source,terminal]
+----
+$ sudo pcs stonith update _redfish username= password=
+----
++
+After the Cluster API recovers, or if the Cluster API is already available, update the fencing secret in the cluster to keep it in sync, as described in the following step.
+
+.. Edit the username and password for the existing fencing secret for the control plane node by running the following commands:
++
+[source,terminal]
+----
+$ oc project openshift-etcd
+----
++
+[source,terminal]
+----
+$ oc edit secret -fencing
+----
++
+If the cluster recovers after updating the fencing secrets, no further action is required. If the issue persists, proceed to the next step.
+
+. Recover from a single-node failure:
+
+.. Gather initial diagnostics by running the following command:
++
+[source,terminal]
+----
+$ sudo pcs status --full
+----
++
+This command provides a detailed view of the current cluster and resource states. You can use the output to identify issues with fencing or etcd startup.
+
+.. Run the following additional diagnostic commands, if necessary:
++
+Reset the resources on your cluster and instruct Pacemaker to attempt to start them fresh by running the following command:
++
+[source,terminal]
+----
+$ sudo pcs resource cleanup
+----
++
+Review all Pacemaker activity on the node by running the following command:
++
+[source,terminal]
+----
+$ sudo journalctl -u pacemaker
+----
++
+Diagnose etcd resource startup issues by running the following command:
++
+[source,terminal]
+----
+$ sudo journalctl -u pacemaker | grep podman-etcd
+----
+
+.. View the fencing configuration for the node by running the following command:
++
+[source,terminal]
+----
+$ sudo pcs stonith config _redfish
+----
++
+If fencing is required but is not functioning, ensure that the Redfish fencing endpoint is accessible and verify that the credentials are correct.
+
+.. If etcd is not starting despite fencing being operational, restore etcd from a backup by running the following commands:
++
+[source,terminal]
+----
+$ sudo cp -r /var/lib/etcd-backup/* /var/lib/etcd/
+----
++
+[source,terminal]
+----
+$ sudo chown -R etcd:etcd /var/lib/etcd
+----
++
+If the recovery is successful, no further action is required. If the issue persists, proceed to the next step.
+
+. Recover from a complete node failure:
+
+.. Power on both control plane nodes.
++
+Pacemaker starts automatically and begins the recovery operation when it detects both nodes are online. If the recovery does not start as expected, use the diagnostic commands described in the previous step to investigate the issue.
+
+.. Reset the resources on your cluster and instruct Pacemaker to attempt to start them fresh by running the following command:
++
+[source,terminal]
+----
+$ sudo pcs resource cleanup
+----
+
+.. Check resource start order by running the following command:
++
+[source,terminal]
+----
+$ sudo pcs status --full
+----
+
+.. Inspect the Pacemaker and kubelet service journals if the kubelet fails by running the following commands:
++
+[source,terminal]
+----
+$ sudo journalctl -u pacemaker
+----
++
+[source,terminal]
+----
+$ sudo journalctl -u kubelet
+----
+
+.. Handle out-of-sync etcd.
++
+If one node has a more up-to-date etcd, Pacemaker attempts to fence the lagging node and start it as a learner. If this process stalls, verify the Redfish fencing endpoint and credentials by running the following command:
++
+[source,terminal]
+----
+$ sudo pcs stonith config
+----
++
+If the recovery is successful, no further action is required. If the issue persists, perform manual recovery as described in the next step.
+
+. If you need to manually recover from an event when one of the nodes is not recoverable, follow the procedure in "Replacing control plane nodes in a two-node OpenShift cluster with fencing".
++
+When a cluster loses a single node, it enters degraded mode. In this state, Pacemaker automatically unblocks quorum and allows the cluster to temporarily operate on the remaining node.
++
+If both nodes fail, you must restart both nodes to reestablish quorum so that Pacemaker can resume normal cluster operations.
++
+If only one of the two nodes can be restarted, follow the node replacement procedure to manually reestablish quorum on the surviving node.
++
+If manual recovery is still required and it fails, collect a must-gather and SOS report, and file a bug.
+
+.Verification
+
+For information about verifying that both control plane nodes and etcd are operating correctly, see "Verifying etcd health in a two-node OpenShift cluster with fencing".
\ No newline at end of file
diff --git a/modules/installation-replacing-control-plane-nodes.adoc b/modules/installation-replacing-control-plane-nodes.adoc
new file mode 100644
index 0000000000..5d9fafdf09
--- /dev/null
+++ b/modules/installation-replacing-control-plane-nodes.adoc
@@ -0,0 +1,330 @@
+:_mod-docs-content-type: PROCEDURE
+[id="installation-replacing-control-plane-nodes_{context}"]
+= Replacing control plane nodes in a two-node OpenShift cluster with fencing
+
+You can replace a failed control plane node in a two-node OpenShift cluster. The replacement node must use the same host name and IP address as the failed node.
+
+.Prerequisites
+
+* You have a functioning survivor control plane node.
+* You have verified that either the machine is not running or the node is not ready.
+* You have access to the cluster as a user with the `cluster-admin` role.
+* You know the host name and IP address of the failed node.
+
+[NOTE]
+====
+Back up etcd before proceeding to ensure that you can restore the cluster if any issues occur.
+====
+
+.Procedure
+
+. Check the quorum state by running the following command:
++
+[source,terminal]
+----
+$ sudo pcs quorum status
+----
++
+.Example output
+[source,terminal]
+----
+Quorum information
+------------------
+Date: Fri Oct 3 14:15:31 2025
+Quorum provider: corosync_votequorum
+Nodes: 2
+Node ID: 1
+Ring ID: 1.16
+Quorate: Yes
+
+Votequorum information
+----------------------
+Expected votes: 2
+Highest expected: 2
+Total votes: 2
+Quorum: 1
+Flags: 2Node Quorate WaitForAll
+
+Membership information
+----------------------
+ Nodeid Votes Qdevice Name
+ 1 1 NR master-0 (local)
+ 2 1 NR master-1
+----
+
+.. If quorum is lost and one control plane node is still running, restore quorum manually on the survivor node by running the following command:
++
+[source,terminal]
+----
+$ sudo pcs quorum unblock
+----
+
+.. If only one node failed, verify that etcd is running on the survivor node by running the following command:
++
+[source,terminal]
+----
+$ sudo pcs resource status etcd
+----
+
+.. If etcd is not running, restart etcd by running the following command:
++
+[source,terminal]
+----
+$ sudo pcs resource cleanup etcd
+----
++
+If etcd still does not start, force it manually on the survivor node, skipping fencing:
++
+[IMPORTANT]
+====
+Before running these commands, ensure that the node being replaced is inaccessible. Otherwise, you risk etcd corruption.
+====
++
+[source,terminal]
+----
+$ sudo pcs resource debug-stop etcd
+----
++
+[source,terminal]
+----
+$ sudo OCF_RESKEY_CRM_meta_notify_start_resource='etcd' pcs resource debug-start etcd
+----
++
+After recovery, etcd must be running successfully on the survivor node.
+
+.
Delete etcd secrets for the failed node by running the following commands: ++ +[source,terminal] +---- +$ oc project openshift-etcd +---- ++ +[source,terminal] +---- +$ oc delete secret etcd-peer- +---- ++ +[source,terminal] +---- +$ oc delete secret etcd-serving- +---- ++ +[source,terminal] +---- +$ oc delete secret etcd-serving-metrics- +---- ++ +[NOTE] +==== +To replace the failed node, you must delete its etcd secrets first. When etcd is running, it might take some time for the API server to respond to these commands. +==== + +. Delete resources for the failed node: + +.. If you have the `BareMetalHost` (BMH) objects, list them to identify the host you are replacing by running the following command: ++ +[source,terminal] +---- +$ oc get bmh -n openshift-machine-api +---- + +.. Delete the BMH object for the failed node by running the following command: ++ +[source,terminal] +---- +$ oc delete bmh/ -n openshift-machine-api +---- + +.. List the `Machine` objects to identify the object that maps to the node that you are replacing by running the following command: ++ +[source,terminal] +---- +$ oc get machines.machine.openshift.io -n openshift-machine-api +---- + +.. Get the label with the machine hash value from the `Machine` object by running the following command: ++ +[source,terminal] +---- +$ oc get machines.machine.openshift.io/ -n openshift-machine-api \ + -o jsonpath='Machine hash label: {.metadata.labels.machine\.openshift\.io/cluster-api-cluster}{"\n"}' +---- ++ +Replace `` with the name of a `Machine` object in your cluster. For example, `ostest-bfs7w-ctrlplane-0`. ++ +You need this label to provision a new `Machine` object. + +.. Delete the `Machine` object for the failed node by running the following command: ++ +[source,terminal] +---- +$ oc delete machines.machine.openshift.io/- -n openshift-machine-api +---- ++ +[NOTE] +==== +The node object is deleted automatically after deleting the `Machine` object. +==== + +. Recreate the failed host by using the same name and IP address: ++ +[IMPORTANT] +==== +You must perform this step only if you are using installer-provisioned infrastructure or the Machine API to create the original node. +For information about replacing a failed bare-metal control plane node, see "Replacing an unhealthy etcd member on bare metal". +==== + +.. Remove the BMH and `Machine` objects. The machine controller automatically deletes the node object. + +.. Provision a new machine by using the following sample configuration: ++ +.Example `Machine` object configuration +[source,yaml] +---- +apiVersion: machine.openshift.io/v1beta1 +kind: Machine +metadata: + annotations: + metal3.io/BareMetalHost: openshift-machine-api/{bmh_name} + finalizers: + - machine.machine.openshift.io + labels: + machine.openshift.io/cluster-api-cluster: {machine_hash_label} + machine.openshift.io/cluster-api-machine-role: master + machine.openshift.io/cluster-api-machine-type: master + name: {machine_name} + namespace: openshift-machine-api +spec: + authoritativeAPI: MachineAPI + metadata: {} + providerSpec: + value: + apiVersion: baremetal.cluster.k8s.io/v1alpha1 + customDeploy: + method: install_coreos + hostSelector: {} + image: + checksum: "" + url: "" + kind: BareMetalMachineProviderSpec + metadata: + creationTimestamp: null + userData: + name: master-user-data-managed +---- ++ +* `metadata.annotations.metal3.io/BareMetalHost`: Replace `{bmh_name}` with the name of the BMH object that is associated with the host that you are replacing. 
+* `labels.machine.openshift.io/cluster-api-cluster`: Replace `{machine_hash_label}` with the label that you fetched from the machine you deleted. +* `metadata.name`: Replace `{machine_name}` with the name of the machine you deleted. + +.. Create the new BMH object and the secret to store the BMC credentials by running the following command: ++ +[source,terminal] +---- +cat < + namespace: openshift-machine-api +data: + password: + username: +type: Opaque +--- +apiVersion: metal3.io/v1alpha1 +kind: BareMetalHost +metadata: + name: {bmh_name} + namespace: openshift-machine-api +spec: + automatedCleaningMode: disabled + bmc: + address: /{uuid} + credentialsName: + disableCertificateVerification: true + bootMACAddress: {boot_mac_address} + bootMode: UEFI + externallyProvisioned: false + online: true + rootDeviceHints: + deviceName: /dev/disk/by-id/scsi- + userData: + name: master-user-data-managed + namespace: openshift-machine-api +EOF +---- ++ +* `metadata.name`: Specify the name of the secret. +* `metadata.name`: Replace `{bmh_name}` with the name of the BMH object that you deleted. +* `bmc.address`: Replace `{uuid}` with the UUID of the node that you created. +* `bmc.credentialsName`: Replace `name` with the name of the secret that you created. +* `bootMACAddress`: Specify the MAC address of the provisioning network interface. This is the MAC address the node uses to identify itself when communicating with Ironic during provisioning. + +. Verify that the new node has reached the `Provisioned` state by running the following command: ++ +[source,terminal] +---- +$ oc get bmh -o wide +---- ++ +The value of the `STATUS` column in the output of this command must be `Provisioned`. ++ +[NOTE] +==== +The provisioning process can take 10 to 20 minutes to complete. +==== + +. Verify that both control plane nodes are in the `Ready` state by running the following command: ++ +[source,terminal] +---- +$ oc get nodes +---- ++ +The value of the `STATUS` column in the output of this command must be `Ready` for both nodes. + +. Apply the `detached` annotation to the BMH object to prevent the Machine API from managing it by running the following command: ++ +[source,terminal] +---- +$ oc annotate bmh -n openshift-machine-api baremetalhost.metal3.io/detached='' --overwrite +---- + +. Rejoin the replacement node to the pacemaker cluster by running the following command: ++ +[NOTE] +==== +Run the following command on the survivor control plane node, not the node being replaced. +==== ++ +[source,terminal] +---- +$ sudo pcs cluster node remove +---- ++ +[source,terminal] +---- +$ sudo pcs cluster node add addr= --start --enable +---- + +. Delete stale jobs for the failed node by running the following command: ++ +[source,terminal] +---- +$ oc project openshift-etcd +---- ++ +[source,terminal] +---- +$ oc delete job tnf-auth-job- +---- ++ +[source,terminal] +---- +$ oc delete job tnf-after-setup-job- +---- + +.Verification + +For information about verifying that both control plane nodes and etcd are operating correctly, see "Verifying etcd health in a two-node OpenShift cluster with fencing". 
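+
+For reference, the etcd backup that the prerequisites note recommends can be taken on the surviving control plane node with the documented backup script, as described in "Restoring etcd from a backup"; the output directory shown here is only an example path:
+
+[source,terminal]
+----
+$ sudo /usr/local/bin/cluster-backup.sh /home/core/assets/backup
+----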
diff --git a/modules/installation-sample-install-config-two-node-fencing-ipi.adoc b/modules/installation-sample-install-config-two-node-fencing-ipi.adoc
new file mode 100644
index 0000000000..b690b05eee
--- /dev/null
+++ b/modules/installation-sample-install-config-two-node-fencing-ipi.adoc
@@ -0,0 +1,70 @@
+:_mod-docs-content-type: CONCEPT
+[id="sample-install-config-two-node-fencing-ipi_{context}"]
+= Sample install-config.yaml for a two-node installer-provisioned infrastructure cluster with fencing
+
+You can use the following `install-config.yaml` configuration as a template for deploying a two-node OpenShift cluster with fencing by using the installer-provisioned infrastructure method:
+
+.Sample `install-config.yaml` configuration
+[source,yaml]
+----
+apiVersion: v1
+baseDomain: example.com
+compute:
+- name: worker
+  replicas: 0
+controlPlane:
+  name: master
+  replicas: 2
+  fencing:
+    credentials:
+    - hostname:
+      address: https://
+      username:
+      password:
+      certificateVerification: Disabled
+    - hostname:
+      address: https://
+      username:
+      password:
+      certificateVerification: Enabled
+metadata:
+  name:
+featureSet: TechPreviewNoUpgrade
+platform:
+  baremetal:
+    apiVIPs:
+    -
+    ingressVIPs:
+    -
+    hosts:
+    - name:
+      role: master
+      bmc:
+        address:
+        username:
+        password:
+      bootMACAddress:
+    - name:
+      role: master
+      bmc:
+        address:
+        username:
+        password:
+      bootMACAddress:
+pullSecret: ''
+sshKey: ''
+----
+* `compute.replicas`: Set this field to `0` because a two-node fencing cluster does not include worker nodes.
+* `controlPlane.replicas`: Set this field to `2` for a two-node fencing deployment.
+* `fencing.credentials.hostname`: Provide the Baseboard Management Controller (BMC) credentials for each control plane node. These credentials are required for node fencing and prevent split-brain scenarios.
+* `fencing.credentials.certificateVerification`: Set this field to `Disabled` if your Redfish URL uses self-signed certificates, which is common for internally-hosted endpoints. Set this field to `Enabled` for URLs with valid CA-signed certificates.
+* `metadata.name`: The cluster name is used as a prefix for hostnames and DNS records.
+* `featureSet`: Set this field to `TechPreviewNoUpgrade` to enable two-node OpenShift cluster deployments.
+* `platform.baremetal.apiVIPs` and `platform.baremetal.ingressVIPs`: Virtual IPs for the API and Ingress endpoints. Ensure that they are reachable by all nodes and external clients.
+* `pullSecret`: Contains credentials required to pull container images for the cluster components.
+* `sshKey`: The SSH public key for accessing cluster nodes after installation.
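+
+For reference, after you save an `install-config.yaml` that is based on this sample, an installer-provisioned deployment typically proceeds with the standard installer workflow; the directory name used here is only an example:
+
+[source,terminal]
+----
+$ mkdir tnf-cluster
+$ cp install-config.yaml tnf-cluster/
+$ openshift-install create cluster --dir tnf-cluster --log-level=info
+----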
\ No newline at end of file
diff --git a/modules/installation-sample-install-config-two-node-fencing-upi.adoc b/modules/installation-sample-install-config-two-node-fencing-upi.adoc
new file mode 100644
index 0000000000..ccf8c949a4
--- /dev/null
+++ b/modules/installation-sample-install-config-two-node-fencing-upi.adoc
@@ -0,0 +1,48 @@
+:_mod-docs-content-type: CONCEPT
+[id="sample-install-config-two-node-fencing-upi_{context}"]
+= Sample install-config.yaml for a two-node user-provisioned infrastructure cluster with fencing
+
+You can use the following `install-config.yaml` configuration as a template for deploying a two-node OpenShift cluster with fencing by using the user-provisioned infrastructure method:
+
+.Sample `install-config.yaml` configuration
+[source,yaml]
+----
+apiVersion: v1
+baseDomain: example.com
+compute:
+- name: worker
+  replicas: 0
+controlPlane:
+  name: master
+  replicas: 2
+  fencing:
+    credentials:
+    - hostname:
+      address: https://
+      username:
+      password:
+    - hostname:
+      address: https://
+      username:
+      password:
+metadata:
+  name:
+featureSet: TechPreviewNoUpgrade
+platform:
+  none: {}
+pullSecret: ''
+sshKey: ''
+----
+* `compute.replicas`: Set this field to `0` because a two-node fencing cluster does not include worker nodes.
+* `controlPlane.replicas`: Set this field to `2` for a two-node fencing deployment.
+* `fencing.credentials.hostname`: Provide the Baseboard Management Controller (BMC) credentials for each control plane node.
+* `metadata.name`: The cluster name is used as a prefix for hostnames and DNS records.
+* `featureSet`: Set this field to `TechPreviewNoUpgrade` to enable two-node OpenShift cluster deployments.
+* `platform.none`: Set the platform to `none` for user-provisioned infrastructure deployments. Bare-metal hosts are pre-provisioned outside of the installation program.
+* `pullSecret`: Contains credentials required to pull container images for the cluster components.
+* `sshKey`: The SSH public key for accessing cluster nodes after installation.
\ No newline at end of file
diff --git a/modules/installation-two-node-cluster-min-resource-reqs.adoc b/modules/installation-two-node-cluster-min-resource-reqs.adoc
new file mode 100644
index 0000000000..85298a0143
--- /dev/null
+++ b/modules/installation-two-node-cluster-min-resource-reqs.adoc
@@ -0,0 +1,23 @@
+
+:_mod-docs-content-type: CONCEPT
+[id="installation-two-node-fencing-minimum-resource-requirements_{context}"]
+= Minimum resource requirements for installing the two-node OpenShift cluster with fencing
+
+Each cluster machine must meet the following minimum requirements:
+
+.Minimum resource requirements
+[cols="2,2,2,2,2,2",options="header"]
+
+|===
+
+| Machine | Operating System | CPU ^[1]^ | RAM | Storage | Input/Output Per Second (IOPS) ^[2]^
+| Bootstrap | RHCOS | 4 | 16 GB | 120 GB | 300
+| Control plane | RHCOS | 4 | 16 GB | 120 GB | 300
+
+|===
+
+[.small]
+--
+1. One CPU is equivalent to one physical core when simultaneous multithreading (SMT), or Hyper-Threading, is not enabled. When enabled, use the following formula to calculate the corresponding ratio: (threads per core × cores) × sockets = CPUs.
+2. {product-title} and Kubernetes are sensitive to disk performance, and faster storage is recommended, particularly for etcd on the control plane nodes. Note that on many cloud platforms, storage size and IOPS scale together, so you might need to over-allocate storage volume to obtain sufficient performance.
+--
\ No newline at end of file
diff --git a/modules/installation-two-node-creating-manifest-custom-br-ex.adoc b/modules/installation-two-node-creating-manifest-custom-br-ex.adoc
new file mode 100644
index 0000000000..4a1bd61abf
--- /dev/null
+++ b/modules/installation-two-node-creating-manifest-custom-br-ex.adoc
@@ -0,0 +1,7 @@
+:_mod-docs-content-type: CONCEPT
+[id="creating-manifest-custom-br-ex_{context}"]
+= Creating a manifest object for a customized br-ex bridge
+
+You must create a manifest object to modify the cluster’s network configuration after installation. The manifest configures the br-ex bridge, which manages external network connectivity for the cluster.
+
+For instructions on creating this manifest, see "Creating a manifest file for a customized br-ex bridge".
\ No newline at end of file
diff --git a/modules/installation-two-node-ingress-lb-configuration.adoc b/modules/installation-two-node-ingress-lb-configuration.adoc
new file mode 100644
index 0000000000..d6cfc74c35
--- /dev/null
+++ b/modules/installation-two-node-ingress-lb-configuration.adoc
@@ -0,0 +1,71 @@
+:_mod-docs-content-type: PROCEDURE
+[id="two-node-ingress-lb-configuration_{context}"]
+= Configuring an Ingress load balancer for a two-node cluster with fencing
+
+You must configure an external Ingress load balancer (LB) before you install a two-node OpenShift cluster with fencing. The Ingress LB forwards external application traffic to the Ingress Controller pods that run on the control plane nodes. Both nodes can actively receive traffic.
+
+.Prerequisites
+
+* You have two control plane nodes with fencing enabled.
+* You have network connectivity from the load balancer to both control plane nodes.
+* You created DNS records for `api..` and `*.apps..`.
+* You have an external load balancer that supports health checks on endpoints.
+
+.Procedure
+
+. Configure the load balancer to forward traffic for the following ports:
++
+* `6443`: Kubernetes API server
+* `80` and `443`: Application ingress
++
+You must forward traffic to both control plane nodes.
+
+. Configure health checks on the load balancer. You must monitor the backend endpoints so that the load balancer only sends traffic to nodes that respond.
+
+. Configure the load balancer to forward traffic to both control plane nodes. The following HAProxy example uses separate backends for the API, HTTP, and HTTPS traffic so that each port is forwarded only to servers that listen on that port:
++
+[source,text]
+----
+frontend api_frontend
+    bind *:6443
+    mode tcp
+    default_backend api_backend
+
+backend api_backend
+    mode tcp
+    balance roundrobin
+    server cp0 :6443 check
+    server cp1 :6443 check
+
+frontend ingress_http_frontend
+    bind *:80
+    mode tcp
+    default_backend ingress_http_backend
+
+frontend ingress_https_frontend
+    bind *:443
+    mode tcp
+    default_backend ingress_https_backend
+
+backend ingress_http_backend
+    mode tcp
+    balance roundrobin
+    server cp0 :80 check
+    server cp1 :80 check
+
+backend ingress_https_backend
+    mode tcp
+    balance roundrobin
+    server cp0 :443 check
+    server cp1 :443 check
+----
+
+. Verify the load balancer configuration:
+
+.. From an external client, run the following command:
++
+[source,terminal]
+----
+$ curl -k https://api..:6443/version
+----
+
+.. From an external client, access an application route by running the following command:
++
+[source,terminal]
+----
+$ curl https://..
+----
+
+You can shut down a control plane node and verify that the load balancer stops sending traffic to that node while the other node continues to serve requests.
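+
+For example, one way to observe this behavior from an external client is to poll an application route while you shut down one of the nodes; the console route shown here is a placeholder for any route in your cluster:
+
+[source,terminal]
+----
+$ while true; do curl -k -s -o /dev/null -w "%{http_code}\n" https://console-openshift-console.apps.<cluster_name>.<base_domain>; sleep 2; done
+----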
diff --git a/modules/installation-verifying-etcd-health.adoc b/modules/installation-verifying-etcd-health.adoc new file mode 100644 index 0000000000..0ed919d41d --- /dev/null +++ b/modules/installation-verifying-etcd-health.adoc @@ -0,0 +1,52 @@ +:_mod-docs-content-type: PROCEDURE +[id="installation-verifying-etcd-health_{context}"] += Verifying etcd health in a two-node OpenShift cluster with fencing + +After completing node recovery or maintenance procedures, verify that both control plane nodes and etcd are operating correctly. + +.Prerequisites + +* You have access to the cluster as a user with `cluster-admin` privileges. +* You can access at least one control plane node through SSH. + +.Procedure + +. Check the overall node status by running the following command: ++ +[source,terminal] +---- +$ oc get nodes +---- ++ +This command verifies that both control plane nodes are in the `Ready` state, indicating that they can receive workloads for scheduling. + +. Verify the status of the `cluster-etcd-operator` by running the following command: ++ +[source,terminal] +---- +$ oc describe co/etcd +---- ++ +The `cluster-etcd-operator` manages and reports on the health of your etcd setup. Reviewing its status helps you identify any ongoing issues or degraded conditions. + +. Review the etcd member list by running the following command: ++ +[source,terminal] +---- +$ oc rsh -n openshift-etcd etcdctl member list -w table +---- ++ +This command shows the current etcd members and their roles. Look for any nodes marked as `learner`, which indicates that they are in the process of becoming voting members. + +. Review the Pacemaker resource status by running the following command on either control plane node: ++ +[source,terminal] +---- +$ sudo pcs status --full +---- ++ +This command provides a detailed overview of all resources managed by Pacemaker. You must ensure that the following conditions are met: ++ +** Both nodes are online. +** The `kubelet` and `etcd` resources are running. +** Fencing is correctly configured for both nodes. diff --git a/modules/ipi-install-network-requirements.adoc b/modules/ipi-install-network-requirements.adoc index 61abc99153..a2ea35beee 100644 --- a/modules/ipi-install-network-requirements.adoc +++ b/modules/ipi-install-network-requirements.adoc @@ -74,57 +74,7 @@ When using a VLAN, each NIC must be on a separate VLAN corresponding to the appr [id="network-requirements-dns_{context}"] == DNS requirements -Clients access the {product-title} cluster nodes over the `baremetal` network. A network administrator must configure a subdomain or subzone where the canonical name extension is the cluster name. - -[source,text] ----- -. ----- - -For example: - -[source,text] ----- -test-cluster.example.com ----- - -{product-title} includes functionality that uses cluster membership information to generate A/AAAA records. This resolves the node names to their IP addresses. After the nodes are registered with the API, the cluster can disperse node information without using CoreDNS-mDNS. This eliminates the network traffic associated with multicast DNS. - -CoreDNS requires both TCP and UDP connections to the upstream DNS server to function correctly. Ensure the upstream DNS server can receive both TCP and UDP connections from {product-title} cluster nodes. 
- -In {product-title} deployments, DNS name resolution is required for the following components: - -* The Kubernetes API -* The {product-title} application wildcard ingress API - -A/AAAA records are used for name resolution and PTR records are used for reverse name resolution. {op-system-first} uses the reverse records or DHCP to set the hostnames for all the nodes. - -Installer-provisioned installation includes functionality that uses cluster membership information to generate A/AAAA records. This resolves the node names to their IP addresses. In each record, `` is the cluster name and `` is the base domain that you specify in the `install-config.yaml` file. A complete DNS record takes the form: `...`. - -.Required DNS records -[cols="1a,3a,5a",options="header"] -|=== - -|Component -|Record -|Description - -|Kubernetes API -|`api...` -|An A/AAAA record and a PTR record identify the API load balancer. These records must be resolvable by both clients external to the cluster and from all the nodes within the cluster. - -|Routes -|`*.apps...` -|The wildcard A/AAAA record refers to the application ingress load balancer. The application ingress load balancer targets the nodes that run the Ingress Controller pods. The Ingress Controller pods run on the worker nodes by default. These records must be resolvable by both clients external to the cluster and from all the nodes within the cluster. - -For example, `console-openshift-console.apps..` is used as a wildcard route to the {product-title} console. - -|=== - -[TIP] -==== -You can use the `dig` command to verify DNS resolution. -==== +include::snippets/dns-requirements.adoc[leveloffset=+1] [id="network-requirements-dhcp-reqs_{context}"] == Dynamic Host Configuration Protocol (DHCP) requirements diff --git a/modules/nodes-cluster-enabling-features-about.adoc b/modules/nodes-cluster-enabling-features-about.adoc index 34e62dfe05..6931dc2d7b 100644 --- a/modules/nodes-cluster-enabling-features-about.adoc +++ b/modules/nodes-cluster-enabling-features-about.adoc @@ -108,6 +108,8 @@ The following Technology Preview features are enabled by this feature set: ** `VSphereMixedNodeEnv` ** `VSphereMultiDisk` ** `VSphereMultiNetworks` +** `VSphereMultiVCenters` +** `TwoNodeOpenShiftClusterWithFencing` -- //// diff --git a/snippets/dns-requirements.adoc b/snippets/dns-requirements.adoc new file mode 100644 index 0000000000..68d411ba3d --- /dev/null +++ b/snippets/dns-requirements.adoc @@ -0,0 +1,52 @@ +:_mod-docs-content-type: SNIPPET +Clients access the {product-title} cluster nodes over the `baremetal` network. A network administrator must configure a subdomain or subzone where the canonical name extension is the cluster name. + +[source,text] +---- +. +---- + +For example: + +[source,text] +---- +test-cluster.example.com +---- + +{product-title} includes functionality that uses cluster membership information to generate A/AAAA records. This resolves the node names to their IP addresses. After the nodes are registered with the API, the cluster can disperse node information without using CoreDNS-mDNS. This eliminates the network traffic associated with multicast DNS. + +CoreDNS requires both TCP and UDP connections to the upstream DNS server to function correctly. Ensure the upstream DNS server can receive both TCP and UDP connections from {product-title} cluster nodes. 
+ +In {product-title} deployments, DNS name resolution is required for the following components: + +* The Kubernetes API +* The {product-title} application wildcard ingress API + +A/AAAA records are used for name resolution and PTR records are used for reverse name resolution. {op-system-first} uses the reverse records or DHCP to set the hostnames for all the nodes. + +Installer-provisioned installation includes functionality that uses cluster membership information to generate A/AAAA records. This resolves the node names to their IP addresses. In each record, `` is the cluster name and `` is the base domain that you specify in the `install-config.yaml` file. A complete DNS record takes the form: `...`. + +.Required DNS records +[cols="1a,3a,5a",options="header"] +|=== + +|Component +|Record +|Description + +|Kubernetes API +|`api...` +|An A/AAAA record and a PTR record identify the API load balancer. These records must be resolvable by both clients external to the cluster and from all the nodes within the cluster. + +|Routes +|`*.apps...` +|The wildcard A/AAAA record refers to the application ingress load balancer. The application ingress load balancer targets the nodes that run the Ingress Controller pods. The Ingress Controller pods run on the worker nodes by default. These records must be resolvable by both clients external to the cluster and from all the nodes within the cluster. + +For example, `console-openshift-console.apps..` is used as a wildcard route to the {product-title} console. + +|=== + +[TIP] +==== +You can use the `dig` command to verify DNS resolution. +====
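+
+For example, assuming the `test-cluster.example.com` cluster domain shown earlier, the following queries check forward resolution, reverse resolution, and TCP transport; the IP address is a placeholder for your API VIP:
+
+[source,terminal]
+----
+$ dig +noall +answer api.test-cluster.example.com
+$ dig +noall +answer -x <api_vip_address>
+$ dig +tcp +noall +answer api.test-cluster.example.com
+----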