From 3f6c31ea95a5142e746dba820abdd2b330fca44a Mon Sep 17 00:00:00 2001
From: Kelly Brown
Date: Fri, 17 Sep 2021 10:34:31 -0400
Subject: [PATCH] BZ:1994596 - Adding Clearing CRI-O storage section

---
 modules/cleaning-crio-storage.adoc       | 123 ++++++++++++++++++
 .../troubleshooting-crio-issues.adoc     |   3 +
 2 files changed, 126 insertions(+)
 create mode 100644 modules/cleaning-crio-storage.adoc

diff --git a/modules/cleaning-crio-storage.adoc b/modules/cleaning-crio-storage.adoc
new file mode 100644
index 0000000000..c934bc8ab7
--- /dev/null
+++ b/modules/cleaning-crio-storage.adoc
@@ -0,0 +1,123 @@
+[id="cleaning-crio-storage"]
+= Cleaning CRI-O storage
+
+You can manually clear the CRI-O ephemeral storage if you experience the following issues:
+
+* A node cannot run any pods and the following error appears:
++
+[source,terminal]
+----
+Failed to create pod sandbox: rpc error: code = Unknown desc = failed to mount container XXX: error recreating the missing symlinks: error reading name of symlink for XXX: open /var/lib/containers/storage/overlay/XXX/link: no such file or directory
+----
+
+* You cannot create a new container on a working node and the "can't stat lower layer" error appears:
++
+[source,terminal]
+----
+can't stat lower layer ... because it does not exist. Going through storage to recreate the missing symlinks.
+----
+
+* Your node is in the `NotReady` state after a cluster upgrade or after you attempt to reboot it.
+
+* The container runtime implementation (`crio`) is not working properly.
+
+* You are unable to start a debug shell on the node by using `oc debug node/<node_name>` because the container runtime instance (`crio`) is not working.
+
+Follow this process to completely wipe the CRI-O storage and resolve the errors.
+
+.Prerequisites
+
+* You have access to the cluster as a user with the `cluster-admin` role.
+* You have installed the OpenShift CLI (`oc`).
+
+.Procedure
+
+. Cordon the node. This prevents any workload from being scheduled on the node if it gets into the `Ready` status. You will know that scheduling is disabled when `SchedulingDisabled` appears in the node status:
++
+[source,terminal]
+----
+$ oc adm cordon <node_name>
+----
+
+. Drain the node as the cluster-admin user:
++
+[source,terminal]
+----
+$ oc adm drain <node_name> --ignore-daemonsets --delete-local-data
+----
+
+. When the node returns, connect back to it over SSH or the console, and then switch to the root user:
++
+[source,terminal]
+----
+$ ssh core@node1.example.com
+$ sudo -i
+----
+
+. Manually stop the kubelet:
++
+[source,terminal]
+----
+# systemctl stop kubelet
+----
+
+. Stop the containers and pods:
++
+[source,terminal]
+----
+# crictl rmp -fa
+----
+
+. Manually stop the `crio` service:
++
+[source,terminal]
+----
+# systemctl stop crio
+----
+
+. After you run these commands, you can completely wipe the ephemeral storage:
++
+[source,terminal]
+----
+# crio wipe -f
+----
+
+. Start the `crio` and `kubelet` services:
++
+[source,terminal]
+----
+# systemctl start crio
+# systemctl start kubelet
+----
+
+. You will know that the cleanup worked if the `crio` and `kubelet` services are started and the node is in the `Ready` status:
++
+[source,terminal]
+----
+$ oc get nodes
+----
++
+.Example output
+[source,terminal]
+----
+NAME                                 STATUS                     ROLES    AGE   VERSION
+ci-ln-tkbxyft-f76d1-nvwhr-master-1   Ready,SchedulingDisabled   master   133m  v1.22.0-rc.0+75ee307
+----
+
+. Mark the node schedulable. You will know that scheduling is enabled when `SchedulingDisabled` is no longer in the node status:
++
+[source,terminal]
+----
+$ oc adm uncordon <node_name>
+----
++
+.Example output
+[source,terminal]
+----
+NAME                                 STATUS   ROLES    AGE   VERSION
+ci-ln-tkbxyft-f76d1-nvwhr-master-1   Ready    master   133m  v1.22.0-rc.0+75ee307
+----
diff --git a/support/troubleshooting/troubleshooting-crio-issues.adoc b/support/troubleshooting/troubleshooting-crio-issues.adoc
index 6ca794a07d..9fedc9f70c 100644
--- a/support/troubleshooting/troubleshooting-crio-issues.adoc
+++ b/support/troubleshooting/troubleshooting-crio-issues.adoc
@@ -13,3 +13,6 @@ include::modules/verifying-crio-status.adoc[leveloffset=+1]
 
 // Gathering CRI-O journald unit logs
 include::modules/gathering-crio-logs.adoc[leveloffset=+1]
+
+// Cleaning CRI-O storage
+include::modules/cleaning-crio-storage.adoc[leveloffset=+1]
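For anyone reviewing or testing this patch, the node-side steps that the new module documents (stop the kubelet, remove pods, stop CRI-O, wipe storage, restart the services) can be collected into one script. This is a minimal sketch only, not part of the patch above; it assumes you are already root on the affected node (for example, after the SSH and `sudo -i` step) and that the node has already been cordoned and drained.

[source,bash]
----
#!/bin/bash
# Minimal sketch: wipe CRI-O ephemeral storage on the local node.
# Run as root on a node that has already been cordoned and drained.
set -euo pipefail

systemctl stop kubelet     # stop the kubelet so it does not restart pods
crictl rmp -fa             # force-remove all pods and their containers
systemctl stop crio        # stop the CRI-O service before wiping its storage
crio wipe -f               # completely wipe the CRI-O ephemeral storage
systemctl start crio       # restart CRI-O
systemctl start kubelet    # restart the kubelet; the node should return to Ready
----

Uncordoning the node afterward (`oc adm uncordon <node_name>`) still has to be done from a cluster-admin shell, as in the final step of the module.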