From 3f6c31ea95a5142e746dba820abdd2b330fca44a Mon Sep 17 00:00:00 2001
From: Kelly Brown
Date: Fri, 17 Sep 2021 10:34:31 -0400
Subject: [PATCH] BZ:1994596 - Adding Clearing CRI-O storage section

---
 modules/cleaning-crio-storage.adoc       | 123 ++++++++++++++++++
 .../troubleshooting-crio-issues.adoc     |   3 +
 2 files changed, 126 insertions(+)
 create mode 100644 modules/cleaning-crio-storage.adoc

diff --git a/modules/cleaning-crio-storage.adoc b/modules/cleaning-crio-storage.adoc
new file mode 100644
index 0000000000..c934bc8ab7
--- /dev/null
+++ b/modules/cleaning-crio-storage.adoc
@@ -0,0 +1,123 @@
+[id="cleaning-crio-storage"]
+= Cleaning CRI-O storage
+
+You can manually clear the CRI-O ephemeral storage if you experience the following issues:
+
+* A node cannot run any pods and the following error appears:
++
+[source,terminal]
+----
+Failed to create pod sandbox: rpc error: code = Unknown desc = failed to mount container XXX: error recreating the missing symlinks: error reading name of symlink for XXX: open /var/lib/containers/storage/overlay/XXX/link: no such file or directory
+----
+
+* You cannot create a new container on a working node and the "can't stat lower layer" error appears:
++
+[source,terminal]
+----
+can't stat lower layer ... because it does not exist. Going through storage to recreate the missing symlinks.
+----
+
+* Your node is in the `NotReady` state after a cluster upgrade or after you attempt to reboot it.
+
+* The container runtime implementation (`crio`) is not working properly.
+
+* You are unable to start a debug shell on the node by using `oc debug node/<node_name>` because the container runtime instance (`crio`) is not working.
+
+Follow this process to completely wipe the CRI-O storage and resolve the errors.
+
+.Prerequisites
+
+* You have access to the cluster as a user with the `cluster-admin` role.
+* You have installed the OpenShift CLI (`oc`).
+
+.Procedure
+
+. Cordon the node. This prevents any workload from being scheduled on the node if it gets into the `Ready` status. You will know that scheduling is disabled when `SchedulingDisabled` appears in the node status:
++
+[source,terminal]
+----
+$ oc adm cordon <node_name>
+----
+
+. Drain the node as the cluster-admin user:
++
+[source,terminal]
+----
+$ oc adm drain <node_name> --ignore-daemonsets --delete-local-data
+----
+
+. When the node returns, connect back to it over SSH or the console, and then switch to the root user:
++
+[source,terminal]
+----
+$ ssh core@node1.example.com
+$ sudo -i
+----
+
+. Manually stop the kubelet:
++
+[source,terminal]
+----
+# systemctl stop kubelet
+----
+
+. Stop the containers and pods:
++
+[source,terminal]
+----
+# crictl rmp -fa
+----
+
+. Manually stop the `crio` service:
++
+[source,terminal]
+----
+# systemctl stop crio
+----
+
+. After you run these commands, you can completely wipe the ephemeral storage:
++
+[source,terminal]
+----
+# crio wipe -f
+----
+
+. Start the `crio` and `kubelet` services:
++
+[source,terminal]
+----
+# systemctl start crio
+# systemctl start kubelet
+----
+
+. You will know that the cleanup worked if the `crio` and `kubelet` services are started and the node is in the `Ready` status:
++
+[source,terminal]
+----
+$ oc get nodes
+----
++
+.Example output
+[source,terminal]
+----
+NAME                                 STATUS                     ROLES    AGE   VERSION
+ci-ln-tkbxyft-f76d1-nvwhr-master-1   Ready,SchedulingDisabled   master   133m  v1.22.0-rc.0+75ee307
+----
+
+. Mark the node schedulable. You will know that scheduling is enabled when `SchedulingDisabled` is no longer in the node status:
++
+[source,terminal]
+----
+$ oc adm uncordon <node_name>
+----
++
+.Example output
+[source,terminal]
+----
+NAME                                 STATUS   ROLES    AGE   VERSION
+ci-ln-tkbxyft-f76d1-nvwhr-master-1   Ready    master   133m  v1.22.0-rc.0+75ee307
+----
diff --git a/support/troubleshooting/troubleshooting-crio-issues.adoc b/support/troubleshooting/troubleshooting-crio-issues.adoc
index 6ca794a07d..9fedc9f70c 100644
--- a/support/troubleshooting/troubleshooting-crio-issues.adoc
+++ b/support/troubleshooting/troubleshooting-crio-issues.adoc
@@ -13,3 +13,6 @@ include::modules/verifying-crio-status.adoc[leveloffset=+1]
 
 // Gathering CRI-O journald unit logs
 include::modules/gathering-crio-logs.adoc[leveloffset=+1]
+
+// Cleaning CRI-O storage
+include::modules/cleaning-crio-storage.adoc[leveloffset=+1]
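For anyone reviewing or testing this patch, the node-side steps that the new module documents (stop the kubelet, remove pods, stop CRI-O, wipe storage, restart the services) can be collected into one script. This is a minimal sketch only, not part of the patch above; it assumes you are already root on the affected node (for example, after the SSH and `sudo -i` step) and that the node has already been cordoned and drained.

[source,bash]
----
#!/bin/bash
# Minimal sketch: wipe CRI-O ephemeral storage on the local node.
# Run as root on a node that has already been cordoned and drained.
set -euo pipefail

systemctl stop kubelet     # stop the kubelet so it does not restart pods
crictl rmp -fa             # force-remove all pods and their containers
systemctl stop crio        # stop the CRI-O service before wiping its storage
crio wipe -f               # completely wipe the CRI-O ephemeral storage
systemctl start crio       # restart CRI-O
systemctl start kubelet    # restart the kubelet; the node should return to Ready
----

Uncordoning the node afterward (`oc adm uncordon <node_name>`) still has to be done from a cluster-admin shell, as in the final step of the module.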