From 07d58eb758f7703641344bb5a5a121f727cf77d3 Mon Sep 17 00:00:00 2001
From: Andrew Burden <aburden@redhat.com>
Date: Wed, 10 Mar 2021 18:50:31 +0100
Subject: [PATCH] New assembly to add the new PromQL metrics for virtualization
 resources. Assembly is also re-using some of the querying metrics modules to
 provide usability context. Edit: additional queries added to reflect changes
 for 4.8 (cnv-11661)

---
 _topic_map.yml                                |   2 +
 ...-for-all-projects-as-an-administrator.adoc |   1 +
 ...-user-defined-projects-as-a-developer.adoc |   1 +
 modules/monitoring-querying-metrics.adoc      |   1 +
 modules/virt-querying-metrics.adoc            | 106 ++++++++++++++++++
 .../virt-prometheus-queries.adoc              |  29 +++++
 6 files changed, 140 insertions(+)
 create mode 100644 modules/virt-querying-metrics.adoc
 create mode 100644 virt/logging_events_monitoring/virt-prometheus-queries.adoc

diff --git a/_topic_map.yml b/_topic_map.yml
index 2c14aaebc8..e564496ddb 100644
--- a/_topic_map.yml
+++ b/_topic_map.yml
@@ -2853,6 +2853,8 @@ Topics:
     File: virt-using-dashboard-to-get-cluster-info
   - Name: OpenShift cluster monitoring, logging, and Telemetry
     File: virt-openshift-cluster-monitoring
+  - Name: Prometheus queries for virtual resources
+    File: virt-prometheus-queries
   - Name: Collecting OpenShift Virtualization data for Red Hat Support
     File: virt-collecting-virt-data
 ---
diff --git a/modules/monitoring-querying-metrics-for-all-projects-as-an-administrator.adoc b/modules/monitoring-querying-metrics-for-all-projects-as-an-administrator.adoc
index 9467199c19..5fa10f5279 100644
--- a/modules/monitoring-querying-metrics-for-all-projects-as-an-administrator.adoc
+++ b/modules/monitoring-querying-metrics-for-all-projects-as-an-administrator.adoc
@@ -1,6 +1,7 @@
 // Module included in the following assemblies:
 //
 // * monitoring/managing-metrics.adoc
+// * virt/logging_events_monitoring/virt-prometheus-queries.adoc
 
 [id="querying-metrics-for-all-projects-as-an-administrator_{context}"]
 = Querying metrics for all projects as a cluster administrator
diff --git a/modules/monitoring-querying-metrics-for-user-defined-projects-as-a-developer.adoc b/modules/monitoring-querying-metrics-for-user-defined-projects-as-a-developer.adoc
index e166131d60..9c434c67c5 100644
--- a/modules/monitoring-querying-metrics-for-user-defined-projects-as-a-developer.adoc
+++ b/modules/monitoring-querying-metrics-for-user-defined-projects-as-a-developer.adoc
@@ -1,6 +1,7 @@
 // Module included in the following assemblies:
 //
 // * monitoring/managing-metrics.adoc
+// * virt/logging_events_monitoring/virt-prometheus-queries.adoc
 
 [id="querying-metrics-for-user-defined-projects-as-a-developer_{context}"]
 = Querying metrics for user-defined projects as a developer
diff --git a/modules/monitoring-querying-metrics.adoc b/modules/monitoring-querying-metrics.adoc
index b584d6e98a..43c400a18c 100644
--- a/modules/monitoring-querying-metrics.adoc
+++ b/modules/monitoring-querying-metrics.adoc
@@ -1,6 +1,7 @@
 // Module included in the following assemblies:
 //
 // * monitoring/managing-metrics.adoc
+// * virt/logging_events_monitoring/virt-prometheus-queries.adoc
 
 [id="querying-metrics_{context}"]
 = Querying metrics
diff --git a/modules/virt-querying-metrics.adoc b/modules/virt-querying-metrics.adoc
new file mode 100644
index 0000000000..c6900179cf
--- /dev/null
+++ b/modules/virt-querying-metrics.adoc
@@ -0,0 +1,106 @@
+// Module included in the following assemblies:
+//
+// * virt/logging_events_monitoring/virt-prometheus-queries.adoc
+
+[id="virt-querying-metrics_{context}"]
+= Virtualization metrics
+
+The following metric descriptions include example Prometheus Query Language (PromQL) queries.
+
+[NOTE]
+====
+These metrics are not an API and might change between versions.
+====
+
+
+[id="virt-promql-vcpu-metrics_{context}"]
+== vCPU metrics
+
+`kubevirt_vmi_vcpu_wait_seconds`::
+Returns the wait time (in seconds) for a virtual machine's vCPU.
+
+A value above '0' means that the vCPU wants to run, but the host scheduler cannot run it yet. This inability to run indicates that there is an issue with I/O.
+
+.Example query
+[source,promql]
+----
+topk(3, sum by (name, namespace) (round(irate(kubevirt_vmi_vcpu_wait_seconds[6m]), 0.1))) > 0
+----
+The above query returns the top 3 VMs waiting for I/O at every given moment in time over a six-minute time period.
+
+[id="virt-promql-network-metrics_{context}"]
+== Network metrics
+
+`kubevirt_vmi_network_receive_bytes_total`::
+Returns the total amount of traffic received (in bytes) on the virtual machine's network.
+
+`kubevirt_vmi_network_transmit_bytes_total`::
+Returns the total amount of traffic transmitted (in bytes) on the virtual machine's network.
+
+These queries can be used to identify virtual machines that are saturating the network.
+
+.Example query
+[source,promql]
+----
+topk(3, sum by (name, namespace) (round(irate(kubevirt_vmi_network_receive_bytes_total[6m]), 0.1)) + sum by (name, namespace) (round(irate(kubevirt_vmi_network_transmit_bytes_total[6m]) , 0.1))) > 0
+----
+The above query returns the top 3 VMs receiving and transmitting the most network traffic at every given moment in time over a six-minute time period.
+
+[id="virt-promql-storage-metrics_{context}"]
+== Storage metrics
+
+`kubevirt_vmi_storage_read_traffic_bytes_total`::
+Returns the total amount of storage reads (in bytes) of the virtual machine's storage-related traffic.
+
+`kubevirt_vmi_storage_write_traffic_bytes_total`::
+Returns the total amount of storage writes (in bytes) of the virtual machine's storage-related traffic.
+
+These queries can be used to identify virtual machines that are writing large amounts of data.
+
+.Example query
+[source,promql]
+----
+topk(3, sum by (name, namespace) (round(irate(kubevirt_vmi_storage_read_traffic_bytes_total[6m]), 0.1))
++ sum by (name, namespace) (round(irate(kubevirt_vmi_storage_write_traffic_bytes_total[6m]), 0.1))) > 0
+----
+The above query returns the top 3 VMs performing the most storage traffic at every given moment in time over a six-minute time period.
+
+`kubevirt_vmi_storage_iops_read_total`::
+Returns the amount of read I/O operations the virtual machine is performing per second.
+
+`kubevirt_vmi_storage_iops_write_total`::
+Returns the amount of write I/O operations the virtual machine is performing per second.
+
+These queries can be used to determine the I/O performance of storage devices.
+
+.Example query
+[source,promql]
+----
+topk(3, sum by (name, namespace) (round(irate(kubevirt_vmi_storage_iops_read_total[6m]), 0.1))
++ sum by (name, namespace) (round(irate(kubevirt_vmi_storage_iops_write_total[6m]) , 0.1))) > 0
+----
+The above query returns the top 3 VMs performing the most I/O operations per second at every given moment in time over a six-minute time period. 
+
+[id="virt-promql-guest-memory-metrics_{context}"]
+== Guest memory swapping metrics
+`kubevirt_vmi_memory_swap_in_traffic_bytes_total`::
+Returns the total amount (in bytes) of memory the virtual guest is swapping in.
+
+`kubevirt_vmi_memory_swap_out_traffic_bytes_total`::
+Returns the total amount (in bytes) of memory the virtual guest is swapping out. 
+
+Memory swapping indicates that the virtual machine is under memory pressure. Increasing the memory allocation of the virtual machine can mitigate this issue. 
+
+[NOTE]
+====
+These queries only return data for virtual guests that have memory swapping enabled.
+====
+
+.Example query
+[source,promql]
+----
+topk(3, sum by (name, namespace) (round(irate(kubevirt_vmi_memory_swap_in_traffic_bytes_total[6m]), 0.1))
++ sum by (name, namespace) (round(irate(kubevirt_vmi_memory_swap_out_traffic_bytes_total[6m]), 0.1))) > 0
+----
+The above query returns the top 3 VMs where the guest is performing the most memory swapping at every given moment in time over a six-minute time period. 
+
diff --git a/virt/logging_events_monitoring/virt-prometheus-queries.adoc b/virt/logging_events_monitoring/virt-prometheus-queries.adoc
new file mode 100644
index 0000000000..c0cea6c6eb
--- /dev/null
+++ b/virt/logging_events_monitoring/virt-prometheus-queries.adoc
@@ -0,0 +1,29 @@
+[id="virt-prometheus-queries"]
+= Prometheus queries for virtual resources
+include::modules/virt-document-attributes.adoc[]
+:context: virt-prometheus-queries
+toc::[]
+
+{VirtProductName} provides metrics for monitoring how infrastructure resources are consumed in the cluster. The metrics cover the following resources:
+
+* vCPU
+* Network
+* Storage
+* Guest memory swapping
+
+Use the {product-title} monitoring dashboard to query virtualization metrics.
+
+.Prerequisites
+
+* To use the vCPU metric, the `schedstats=enable` kernel argument must be applied to the `MachineConfig` object. This kernel argument enables scheduler statistics used for debugging and performance tuning and adds a minor additional load to the scheduler. See the xref:../../post_installation_configuration/machine-configuration-tasks.adoc#nodes-nodes-kernel-arguments_post-install-machine-configuration-tasks[{product-title} machine configuration tasks] documentation for more information on applying a kernel argument.
+
+include::modules/monitoring-querying-metrics.adoc[leveloffset=+1]
+include::modules/monitoring-querying-metrics-for-all-projects-as-an-administrator.adoc[leveloffset=+2]
+include::modules/monitoring-querying-metrics-for-user-defined-projects-as-a-developer.adoc[leveloffset=+2]
+
+include::modules/virt-querying-metrics.adoc[leveloffset=+1]
+
+[id="{context}-additional-resources"]
+== Additional resources
+
+* xref:../../monitoring/understanding-the-monitoring-stack.adoc#understanding-the-monitoring-stack[Understanding the {product-title} monitoring stack]