diff --git a/_topic_maps/_topic_map.yml b/_topic_maps/_topic_map.yml index d4235f5976..37305c0705 100644 --- a/_topic_maps/_topic_map.yml +++ b/_topic_maps/_topic_map.yml @@ -3443,14 +3443,14 @@ Topics: File: telco-troubleshooting-general-troubleshooting - Name: Cluster maintenance File: telco-troubleshooting-cluster-maintenance -# - Name: Security -# File: telco-troubleshooting-security -# - Name: Certificate maintenance -# File: telco-troubleshooting-cert-maintenance -# - Name: Machine Config Operator -# File: telco-troubleshooting-mco -# - Name: Bare-metal node maintenance -# File: telco-troubleshooting-bmn-maintenance + - Name: Security + File: telco-troubleshooting-security + - Name: Certificate maintenance + File: telco-troubleshooting-cert-maintenance + - Name: Machine Config Operator + File: telco-troubleshooting-mco + - Name: Bare-metal node maintenance + File: telco-troubleshooting-bmn-maintenance --- Name: Specialized hardware and driver enablement Dir: hardware_enablement diff --git a/edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-bmn-maintenance.adoc b/edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-bmn-maintenance.adoc new file mode 100644 index 0000000000..fa2bd2d58b --- /dev/null +++ b/edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-bmn-maintenance.adoc @@ -0,0 +1,29 @@ +:_mod-docs-content-type: ASSEMBLY +[id="telco-troubleshooting-bmn-maintenance"] += Bare-metal node maintenance +include::_attributes/common-attributes.adoc[] +:context: telco-troubleshooting-bmn-maintenance + +toc::[] + +You can connect to a node for general troubleshooting. +However, in some cases, you need to perform troubleshooting or maintenance tasks on certain hardware components. +This section discusses topics that you need to perform that hardware maintenance. 
+ +include::modules/telco-troubleshooting-bmn-connect-to-node.adoc[leveloffset=+1] +include::modules/telco-troubleshooting-bmn-move-apps-to-pods.adoc[leveloffset=+1] + +[role="_additional-resources"] +.Additional resources + +* xref:../../../nodes/nodes/nodes-nodes-working.adoc#nodes-nodes-working_nodes-nodes-working[Working with nodes] + +include::modules/telco-troubleshooting-bmn-replace-dimm.adoc[leveloffset=+1] + +[role="_additional-resources"] +.Additional resources + +* xref:../../../storage/index.adoc#storage-overview_storage-overview[{product-title} storage overview] + +include::modules/telco-troubleshooting-bmn-replace-disk.adoc[leveloffset=+1] +include::modules/telco-troubleshooting-bmn-replace-nw-card.adoc[leveloffset=+1] diff --git a/edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-cert-maintenance.adoc b/edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-cert-maintenance.adoc new file mode 100644 index 0000000000..ee1849ed11 --- /dev/null +++ b/edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-cert-maintenance.adoc @@ -0,0 +1,66 @@ +:_mod-docs-content-type: ASSEMBLY +[id="telco-troubleshooting-cert-maintenance"] += Certificate maintenance +include::_attributes/common-attributes.adoc[] +:context: telco-troubleshooting-cert-maintenance + +toc::[] + +Certificate maintenance is required for continuous cluster authentication. +As a cluster administrator, you must manually renew certain certificates, while others are automatically renewed by the cluster. + +Learn about certificates in {product-title} and how to maintain them by using the following resources: + +* link:https://access.redhat.com/solutions/5018231[Which OpenShift certificates do rotate automatically and which do not in Openshift 4.x?] 
+* link:https://access.redhat.com/solutions/7000968[Checking etcd certificate expiry in OpenShift 4] + +include::modules/telco-troubleshooting-certs-manual.adoc[leveloffset=+1] +include::modules/telco-troubleshooting-certs-manual-proxy.adoc[leveloffset=+2] + +[role="_additional-resources"] +.Additional resources + +* xref:../../../security/certificate_types_descriptions/proxy-certificates.adoc#cert-types-proxy-certificates[Proxy certificates] + +include::modules/telco-troubleshooting-certs-manual-user-provisioned.adoc[leveloffset=+2] + +[role="_additional-resources"] +.Additional resources + +* xref:../../../security/certificate_types_descriptions/user-provided-certificates-for-api-server.adoc#cert-types-user-provided-certificates-for-the-api-server[User-provisioned certificates for the API server] + +include::modules/telco-troubleshooting-certs-auto.adoc[leveloffset=+1] + +[role="_additional-resources"] +.Additional resources + +* xref:../../../security/certificate_types_descriptions/service-ca-certificates.adoc#cert-types-service-ca-certificates_cert-types-service-ca-certificates[Service CA certificates] +* xref:../../../security/certificate_types_descriptions/node-certificates.adoc#cert-types-node-certificates_cert-types-node-certificates[Node certificates] +* xref:../../../security/certificate_types_descriptions/bootstrap-certificates.adoc#cert-types-bootstrap-certificates_cert-types-bootstrap-certificates[Bootstrap certificates] +* xref:../../../security/certificate_types_descriptions/etcd-certificates.adoc#cert-types-etcd-certificates_cert-types-etcd-certificates[etcd certificates] +* xref:../../../security/certificate_types_descriptions/olm-certificates.adoc#cert-types-olm-certificates_cert-types-olm-certificates[OLM certificates] +* xref:../../../security/certificate_types_descriptions/machine-config-operator-certificates.adoc#cert-types-machine-config-operator-certificates_cert-types-machine-config-operator-certificates[Machine Config Operator 
certificates] +* xref:../../../security/certificate_types_descriptions/monitoring-and-cluster-logging-operator-component-certificates.adoc#cert-types-monitoring-and-cluster-logging-operator-component-certificates_cert-types-monitoring-and-cluster-logging-operator-component-certificates[Monitoring and cluster logging Operator component certificates] +* xref:../../../security/certificate_types_descriptions/control-plane-certificates.adoc#cert-types-control-plane-certificates_cert-types-control-plane-certificates[Control plane certificates] +* xref:../../../security/certificate_types_descriptions/ingress-certificates.adoc#cert-types-ingress-certificates_cert-types-ingress-certificates[Ingress certificates] + +include::modules/telco-troubleshooting-certs-auto-etcd.adoc[leveloffset=+2] + +[role="_additional-resources"] +.Additional resources + +* xref:../../../security/certificate_types_descriptions/etcd-certificates.adoc#cert-types-etcd-certificates_cert-types-etcd-certificates[etcd certificates] + +include::modules/telco-troubleshooting-certs-auto-node.adoc[leveloffset=+2] + +[role="_additional-resources"] +.Additional resources + +* xref:../../../security/certificate_types_descriptions/node-certificates.adoc#cert-types-node-certificates_cert-types-node-certificates[Node certificates] + +include::modules/telco-troubleshooting-certs-auto-service-ca.adoc[leveloffset=+2] + +[role="_additional-resources"] +.Additional resources + +* xref:../../../security/certificate_types_descriptions/service-ca-certificates.adoc#cert-types-service-ca-certificates_cert-types-service-ca-certificates[Service CA certificates] \ No newline at end of file diff --git a/edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-mco.adoc b/edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-mco.adoc new file mode 100644 index 0000000000..669eeb0bc3 --- /dev/null +++ b/edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-mco.adoc 
@@ -0,0 +1,20 @@ +:_mod-docs-content-type: ASSEMBLY +[id="telco-troubleshooting-mco"] += Machine Config Operator +include::_attributes/common-attributes.adoc[] +:context: telco-troubleshooting-mco + +toc::[] + +The Machine Config Operator provides useful information to cluster administrators and controls what is running directly on the bare-metal host. + +The Machine Config Operator differentiates between different groups of nodes in the cluster, allowing control plane nodes and worker nodes to run with different configurations. +These groups of nodes, which are called `MachineConfigPool` (`mcp`) groups, run worker or application pods. +You can apply the same machine config to all nodes or to only one MCP in the cluster. + +For more information about how and why to apply MCPs in a telco core cluster, see xref:../../../edge_computing/day_2_core_cnf_clusters/updating/telco-update-ocp-update-prep.adoc#telco-update-applying-mcp-labels-to-nodes-before-the-update_ocp-update-prep[Applying MachineConfigPool labels to nodes before the update]. + +For more information about the Machine Config Operator, see xref:../../../operators/operator-reference.adoc#machine-config-operator_cluster-operators-ref[Machine Config Operator]. 
+ +include::modules/telco-troubleshooting-mco-purpose.adoc[leveloffset=+1] +include::modules/telco-troubleshooting-mco-apply-several-mcs.adoc[leveloffset=+1] \ No newline at end of file diff --git a/edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-security.adoc b/edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-security.adoc new file mode 100644 index 0000000000..6ea0cc355b --- /dev/null +++ b/edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-security.adoc @@ -0,0 +1,16 @@ +:_mod-docs-content-type: ASSEMBLY +[id="telco-troubleshooting-security"] += Security +include::_attributes/common-attributes.adoc[] +:context: telco-troubleshooting-security + +toc::[] + +Implementing a robust cluster security profile is important for building resilient telco networks. + +include::modules/telco-troubleshooting-security-authentication.adoc[leveloffset=+1] + +[role="_additional-resources"] +.Additional resources + +* xref:../../../authentication/understanding-identity-provider.adoc#supported-identity-providers[Supported identity providers] \ No newline at end of file diff --git a/modules/telco-troubleshooting-bmn-connect-to-node.adoc b/modules/telco-troubleshooting-bmn-connect-to-node.adoc new file mode 100644 index 0000000000..bf42d137ed --- /dev/null +++ b/modules/telco-troubleshooting-bmn-connect-to-node.adoc @@ -0,0 +1,63 @@ +// Module included in the following assemblies: +// +// * edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-bmn-maintenance.adoc + +:_mod-docs-content-type: PROCEDURE +[id="telco-troubleshooting-bmn-connect-to-node_{context}"] += Connecting to a bare-metal node in your cluster + +You can connect to bare-metal cluster nodes for general maintenance tasks. + +[NOTE] +==== +Configuring the cluster node from the host operating system is not recommended or supported. 
+==== + +To troubleshoot your nodes, you can do the following tasks: + +* Retrieve logs from the node +* Use debugging +* Use SSH to connect to the node + +[IMPORTANT] +==== +Use SSH only if you cannot connect to the node with the `oc debug` command. +==== + +.Procedure + +. Retrieve the logs from a node by running the following command: ++ +[source,terminal] +---- +$ oc adm node-logs <node_name> -u crio +---- + +. Use debugging by running the following command: ++ +[source,terminal] +---- +$ oc debug node/<node_name> +---- + +. Set `/host` as the root directory within the debug shell. The debug pod mounts the host’s root file system in `/host` within the pod. By changing the root directory to `/host`, you can run binaries contained in the host’s executable paths: ++ +-- +[source,terminal] +---- +# chroot /host +---- + +.Output +[source,terminal] +---- +You are now logged in as root on the node +---- +-- + +. Optional: Use SSH to connect to the node by running the following command: ++ +[source,terminal] +---- +$ ssh core@<node_name> +---- \ No newline at end of file diff --git a/modules/telco-troubleshooting-bmn-move-apps-to-pods.adoc b/modules/telco-troubleshooting-bmn-move-apps-to-pods.adoc new file mode 100644 index 0000000000..77710f5531 --- /dev/null +++ b/modules/telco-troubleshooting-bmn-move-apps-to-pods.adoc @@ -0,0 +1,26 @@ +// Module included in the following assemblies: +// +// * edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-bmn-maintenance.adoc + +:_mod-docs-content-type: PROCEDURE +[id="telco-troubleshooting-bmn-move-apps-to-pods_{context}"] += Moving applications to pods within the cluster + +For scheduled hardware maintenance, you need to consider how to move your application pods to other nodes within the cluster without affecting the pod workload. + +.Procedure + +* Mark the node as unschedulable by running the following command: ++ +[source,terminal] +---- +$ oc adm cordon <node_name> +---- + +When the node is unschedulable, no pods can be scheduled on the node. 
+For more information, see "Working with nodes". + +[NOTE] +==== +When moving CNF applications, you might need to verify ahead of time that there are enough additional worker nodes in the cluster due to anti-affinity and pod disruption budget. +==== \ No newline at end of file diff --git a/modules/telco-troubleshooting-bmn-replace-dimm.adoc b/modules/telco-troubleshooting-bmn-replace-dimm.adoc new file mode 100644 index 0000000000..b7e495dacb --- /dev/null +++ b/modules/telco-troubleshooting-bmn-replace-dimm.adoc @@ -0,0 +1,17 @@ +// Module included in the following assemblies: +// +// * edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-bmn-maintenance.adoc + +:_mod-docs-content-type: CONCEPT +[id="telco-troubleshooting-bmn-replace-dimm_{context}"] += DIMM memory replacement + +Dual in-line memory module (DIMM) problems sometimes only appear after a server reboots. +You can check the log files for these problems. + +When you perform a standard reboot and the server does not start, you can see a message in the console that there is a faulty DIMM memory. +In that case, you can acknowledge the faulty DIMM and continue rebooting if the remaining memory is sufficient. +Then, you can schedule a maintenance window to replace the faulty DIMM. + +Sometimes, a message in the event logs indicates a bad memory module. +In these cases, you can schedule the memory replacement before the server is rebooted. 
\ No newline at end of file diff --git a/modules/telco-troubleshooting-bmn-replace-disk.adoc b/modules/telco-troubleshooting-bmn-replace-disk.adoc new file mode 100644 index 0000000000..81af3ef6a2 --- /dev/null +++ b/modules/telco-troubleshooting-bmn-replace-disk.adoc @@ -0,0 +1,14 @@ +// Module included in the following assemblies: +// +// * edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-bmn-maintenance.adoc + +:_mod-docs-content-type: CONCEPT +[id="telco-troubleshooting-bmn-replace-disk_{context}"] += Disk replacement + +If you do not have disk redundancy configured on your node through hardware or software redundant array of independent disks (RAID), you need to check the following: + +* Does the disk contain running pod images? +* Does the disk contain persistent data for pods? + +For more information, see "{product-title} storage overview" in _Storage_. \ No newline at end of file diff --git a/modules/telco-troubleshooting-bmn-replace-nw-card.adoc b/modules/telco-troubleshooting-bmn-replace-nw-card.adoc new file mode 100644 index 0000000000..9daf2e9d05 --- /dev/null +++ b/modules/telco-troubleshooting-bmn-replace-nw-card.adoc @@ -0,0 +1,16 @@ +// Module included in the following assemblies: +// +// * edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-bmn-maintenance.adoc + +:_mod-docs-content-type: CONCEPT +[id="telco-troubleshooting-bmn-replace-nw-card_{context}"] += Cluster network card replacement + +When you replace a network card, the MAC address changes. +The MAC address can be part of the DHCP or SR-IOV Operator configuration, router configuration, firewall rules, or application Cloud-native Network Function (CNF) configuration. +Before you bring a node back online after replacing a network card, you must verify that these configurations are up to date. 
+ +[IMPORTANT] +==== +If you do not have specific procedures for MAC address changes within the network, contact your network administrator or network hardware vendor. +==== \ No newline at end of file diff --git a/modules/telco-troubleshooting-certs-auto-etcd.adoc b/modules/telco-troubleshooting-certs-auto-etcd.adoc new file mode 100644 index 0000000000..af68ff638a --- /dev/null +++ b/modules/telco-troubleshooting-certs-auto-etcd.adoc @@ -0,0 +1,24 @@ +// Module included in the following assemblies: +// +// * edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-cert-maintenance.adoc + +:_mod-docs-content-type: CONCEPT +[id="telco-troubleshooting-certs-auto-etcd_{context}"] += Certificates managed by etcd + +The etcd certificates are used for encrypted communication between etcd member peers as well as encrypted client traffic. +The certificates are renewed automatically within the cluster provided that communication between all nodes and all services is current. +Therefore, if your cluster might lose communication between components during a period of time that is close to the end of the etcd certificate lifetime, renew the certificate in advance. +For example, communication can be lost during an upgrade due to nodes rebooting at different times. + +* You can check the expiration dates of the etcd certificates by running the following command: ++ +[source,terminal] +---- +$ for each in $(oc get secret -n openshift-etcd | grep "kubernetes.io/tls" | grep -e \ +"etcd-peer\|etcd-serving" | awk '{print $1}'); do oc get secret $each -n openshift-etcd -o \ +jsonpath="{.data.tls\.crt}" | base64 -d | openssl x509 -noout -enddate; done +---- + +For more information about updating etcd certificates, see link:https://access.redhat.com/solutions/7000968[Checking etcd certificate expiry in OpenShift 4]. +For more information about etcd certificates, see "etcd certificates" in _Security and compliance_. 
\ No newline at end of file diff --git a/modules/telco-troubleshooting-certs-auto-node.adoc b/modules/telco-troubleshooting-certs-auto-node.adoc new file mode 100644 index 0000000000..ed6f695477 --- /dev/null +++ b/modules/telco-troubleshooting-certs-auto-node.adoc @@ -0,0 +1,13 @@ +// Module included in the following assemblies: +// +// * edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-cert-maintenance.adoc + +:_mod-docs-content-type: CONCEPT +[id="telco-troubleshooting-certs-auto-node_{context}"] += Node certificates + +Node certificates are self-signed certificates, which means that they are signed by the cluster and they originate from an internal certificate authority (CA) that is generated by the bootstrap process. + +After the cluster is installed, the cluster automatically renews the node certificates. + +For more information, see "Node certificates" in _Security and compliance_. \ No newline at end of file diff --git a/modules/telco-troubleshooting-certs-auto-service-ca.adoc b/modules/telco-troubleshooting-certs-auto-service-ca.adoc new file mode 100644 index 0000000000..b3998991f4 --- /dev/null +++ b/modules/telco-troubleshooting-certs-auto-service-ca.adoc @@ -0,0 +1,13 @@ +// Module included in the following assemblies: +// +// * edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-cert-maintenance.adoc + +:_mod-docs-content-type: CONCEPT +[id="telco-troubleshooting-certs-auto-service-ca_{context}"] += Service CA certificates + +The `service-ca` is an Operator that creates a self-signed certificate authority (CA) when an {product-title} cluster is deployed. +This allows users to add certificates to their deployments without manually creating them. +Service CA certificates are self-signed certificates. + +For more information, see "Service CA certificates" in _Security and compliance_. 
\ No newline at end of file diff --git a/modules/telco-troubleshooting-certs-auto.adoc b/modules/telco-troubleshooting-certs-auto.adoc new file mode 100644 index 0000000000..a83d16e70b --- /dev/null +++ b/modules/telco-troubleshooting-certs-auto.adoc @@ -0,0 +1,20 @@ +// Module included in the following assemblies: +// +// * edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-cert-maintenance.adoc + +:_mod-docs-content-type: CONCEPT +[id="telco-troubleshooting-certs-auto_{context}"] += Certificates managed by the cluster + +You only need to check cluster-managed certificates if you detect an issue in the logs. +The following certificates are automatically managed by the cluster: + +* Service CA certificates +* Node certificates +* Bootstrap certificates +* etcd certificates +* OLM certificates +* Machine Config Operator certificates +* Monitoring and cluster logging Operator component certificates +* Control plane certificates +* Ingress certificates \ No newline at end of file diff --git a/modules/telco-troubleshooting-certs-manual-proxy.adoc b/modules/telco-troubleshooting-certs-manual-proxy.adoc new file mode 100644 index 0000000000..a77e6b9673 --- /dev/null +++ b/modules/telco-troubleshooting-certs-manual-proxy.adoc @@ -0,0 +1,29 @@ +// Module included in the following assemblies: +// +// * edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-cert-maintenance.adoc + +:_mod-docs-content-type: PROCEDURE +[id="telco-troubleshooting-certs-manual-proxy_{context}"] += Managing proxy certificates + +Proxy certificates allow users to specify one or more custom certificate authority (CA) certificates that are used by platform components when making egress connections. + +[NOTE] +==== +Certain CAs set expiration dates and you might need to renew these certificates every two years. +==== + +If you did not originally set the requested certificates, you can determine the certificate expiration in several ways. 
+Most Cloud-native Network Functions (CNFs) use certificates that are not specifically designed for browser-based connectivity. +Therefore, you need to pull the certificate from the `ConfigMap` object of your deployment. + +.Procedure + +* To get the expiration date, run the following command against the certificate file: ++ +[source,terminal] +---- +$ openssl x509 -enddate -noout -in <cert_file_name>.pem +---- + +For more information about determining how and when to renew your proxy certificates, see "Proxy certificates" in _Security and compliance_. \ No newline at end of file diff --git a/modules/telco-troubleshooting-certs-manual-user-provisioned.adoc b/modules/telco-troubleshooting-certs-manual-user-provisioned.adoc new file mode 100644 index 0000000000..3b46ccbcd3 --- /dev/null +++ b/modules/telco-troubleshooting-certs-manual-user-provisioned.adoc @@ -0,0 +1,13 @@ +// Module included in the following assemblies: +// +// * edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-cert-maintenance.adoc + +:_mod-docs-content-type: CONCEPT +[id="telco-troubleshooting-certs-manual-user-provisioned_{context}"] += User-provisioned API server certificates + +The API server is accessible by clients that are external to the cluster at `api.<cluster_name>.<base_domain>`. +You might want clients to access the API server at a different hostname or without the need to distribute the cluster-managed certificate authority (CA) certificates to the clients. +You must set a custom default certificate to be used by the API server when serving content. 
+ +For more information, see "User-provisioned certificates for the API server" in _Security and compliance_. \ No newline at end of file diff --git a/modules/telco-troubleshooting-certs-manual.adoc b/modules/telco-troubleshooting-certs-manual.adoc new file mode 100644 index 0000000000..79c25f781c --- /dev/null +++ b/modules/telco-troubleshooting-certs-manual.adoc @@ -0,0 +1,12 @@ +// Module included in the following assemblies: +// +// * edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-cert-maintenance.adoc + +:_mod-docs-content-type: CONCEPT +[id="telco-troubleshooting-certs-manual_{context}"] += Certificates manually managed by the administrator + +The following certificates must be renewed by a cluster administrator: + +* Proxy certificates +* User-provisioned certificates for the API server \ No newline at end of file diff --git a/modules/telco-troubleshooting-mco-apply-several-mcs.adoc b/modules/telco-troubleshooting-mco-apply-several-mcs.adoc new file mode 100644 index 0000000000..308de4116b --- /dev/null +++ b/modules/telco-troubleshooting-mco-apply-several-mcs.adoc @@ -0,0 +1,31 @@ +// Module included in the following assemblies: +// +// * edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-mco.adoc + +:_mod-docs-content-type: PROCEDURE +[id="telco-troubleshooting-mco-apply-several-mcs_{context}"] += Applying several machine config files at the same time + +When you need to change the machine config for a group of nodes in the cluster, also known as machine config pools (MCPs), sometimes the changes must be applied with several different machine config files. +The nodes need to restart for the machine config file to be applied. +After each machine config file is applied to the cluster, all nodes that are affected by the machine config file restart. 
+ +To prevent the nodes from restarting for each machine config file, you can apply all of the changes at the same time by pausing each MCP that is updated by the new machine config file. + +.Procedure + +. Pause the affected MCP by running the following command: ++ +[source,terminal] +---- +$ oc patch mcp/<mcp_name> --type merge --patch '{"spec":{"paused":true}}' +---- + +. After you apply all machine config changes to the cluster, resume the MCP by running the following command: ++ +[source,terminal] +---- +$ oc patch mcp/<mcp_name> --type merge --patch '{"spec":{"paused":false}}' +---- + +This allows the nodes in your MCP to reboot into the new configurations. \ No newline at end of file diff --git a/modules/telco-troubleshooting-mco-purpose.adoc b/modules/telco-troubleshooting-mco-purpose.adoc new file mode 100644 index 0000000000..231e44c8af --- /dev/null +++ b/modules/telco-troubleshooting-mco-purpose.adoc @@ -0,0 +1,19 @@ +// Module included in the following assemblies: +// +// * edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-mco.adoc + +:_mod-docs-content-type: CONCEPT +[id="telco-troubleshooting-mco-purpose_{context}"] += Purpose of the Machine Config Operator + +The Machine Config Operator (MCO) manages and applies configuration and updates of {op-system-first} and container runtime, including everything between the kernel and kubelet. +Managing {op-system} is important since most telecommunications companies run on bare-metal hardware and use some sort of hardware accelerator or kernel modification. +Applying machine configuration to {op-system} manually can cause problems because the MCO monitors each node and what is applied to it. + +You must consider these minor components and how the MCO can help you manage your clusters effectively. + +[IMPORTANT] +==== +You must use the MCO to perform all changes on worker or control plane nodes. +Do not manually make changes to {op-system} or node files. 
+==== \ No newline at end of file diff --git a/modules/telco-troubleshooting-security-authentication.adoc b/modules/telco-troubleshooting-security-authentication.adoc new file mode 100644 index 0000000000..cd948ae6e3 --- /dev/null +++ b/modules/telco-troubleshooting-security-authentication.adoc @@ -0,0 +1,34 @@ +// Module included in the following assemblies: +// +// * edge_computing/day_2_core_cnf_clusters/troubleshooting/telco-troubleshooting-security.adoc +:_mod-docs-content-type: PROCEDURE +[id="telco-troubleshooting-security-authentication_{context}"] += Authentication + +Determine which identity providers are in your cluster. +For more information about supported identity providers, see "Supported identity providers" in _Authentication and authorization_. + +After you know which providers are configured, you can inspect the `openshift-authentication` namespace to determine if there are potential issues. + +.Procedure + +. Check the events in the `openshift-authentication` namespace by running the following command: ++ +[source,terminal] +---- +$ oc get events -n openshift-authentication --sort-by='.metadata.creationTimestamp' +---- + +. Check the pods in the `openshift-authentication` namespace by running the following command: ++ +[source,terminal] +---- +$ oc get pod -n openshift-authentication +---- + +. Optional: If you need more information, check the logs of one of the running pods by running the following command: ++ +[source,terminal] +---- +$ oc logs -n openshift-authentication <pod_name> +---- \ No newline at end of file