From 18548331da38020c5b90b484f4ea2a4ecdefdc01 Mon Sep 17 00:00:00 2001 From: Jeana Routh Date: Thu, 5 Dec 2024 14:47:34 -0500 Subject: [PATCH] OSDOCS-12784: Explain CAS and machineset GPU setup --- machine_management/applying-autoscaling.adoc | 5 ++ .../creating-machineset-aws.adoc | 7 +++ .../creating-machineset-azure-stack-hub.adoc | 7 +++ .../creating-machineset-azure.adoc | 7 +++ .../creating-machineset-bare-metal.adoc | 7 +++ .../creating-machineset-gcp.adoc | 7 +++ .../creating-machineset-ibm-cloud.adoc | 7 +++ .../creating-machineset-ibm-power-vs.adoc | 9 +++- .../creating-machineset-nutanix.adoc | 7 +++ .../creating-machineset-osp.adoc | 7 +++ .../creating-machineset-vsphere.adoc | 7 +++ modules/cluster-autoscaler-cr.adoc | 20 ++++---- modules/machineset-label-gpu-autoscaler.adoc | 50 +++++++++++++++++++ .../cluster-tasks.adoc | 15 ++---- 14 files changed, 140 insertions(+), 22 deletions(-) create mode 100644 modules/machineset-label-gpu-autoscaler.adoc diff --git a/machine_management/applying-autoscaling.adoc b/machine_management/applying-autoscaling.adoc index 182839a679..573e35f57d 100644 --- a/machine_management/applying-autoscaling.adoc +++ b/machine_management/applying-autoscaling.adoc @@ -25,10 +25,15 @@ First, deploy the cluster autoscaler to manage automatic resource scaling in you Because the cluster autoscaler is scoped to the entire cluster, you can make only one cluster autoscaler for the cluster. 
==== +//Cluster autoscaler resource definition include::modules/cluster-autoscaler-cr.adoc[leveloffset=+3] +//Configuring a priority expander for the cluster autoscaler include::modules/cluster-autoscaler-config-priority-expander.adoc[leveloffset=+3] +//Labeling GPU machine sets for the cluster autoscaler +include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+3] + :FeatureName: cluster autoscaler :FeatureResourceName: ClusterAutoscaler include::modules/deploying-resource.adoc[leveloffset=+2] diff --git a/machine_management/creating_machinesets/creating-machineset-aws.adoc b/machine_management/creating_machinesets/creating-machineset-aws.adoc index b066a33ea3..3ff49cd0c5 100644 --- a/machine_management/creating_machinesets/creating-machineset-aws.adoc +++ b/machine_management/creating_machinesets/creating-machineset-aws.adoc @@ -17,6 +17,13 @@ include::modules/machineset-yaml-aws.adoc[leveloffset=+1] //Creating a compute machine set include::modules/machineset-creating.adoc[leveloffset=+1] +//Labeling GPU machine sets for the cluster autoscaler +include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1] + +[role="_additional-resources"] +.Additional resources +* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition] + //Assigning machines to placement groups by using machine sets include::modules/machineset-aws-existing-placement-group.adoc[leveloffset=+1] diff --git a/machine_management/creating_machinesets/creating-machineset-azure-stack-hub.adoc b/machine_management/creating_machinesets/creating-machineset-azure-stack-hub.adoc index 20884178cb..dc8fbc04ce 100644 --- a/machine_management/creating_machinesets/creating-machineset-azure-stack-hub.adoc +++ b/machine_management/creating_machinesets/creating-machineset-azure-stack-hub.adoc @@ -17,6 +17,13 @@ include::modules/machineset-yaml-azure-stack-hub.adoc[leveloffset=+1] //Creating a compute machine set 
include::modules/machineset-creating.adoc[leveloffset=+1] +//Labeling GPU machine sets for the cluster autoscaler +include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1] + +[role="_additional-resources"] +.Additional resources +* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition] + //Enabling Azure boot diagnostics on compute machines include::modules/machineset-azure-boot-diagnostics.adoc[leveloffset=+1] diff --git a/machine_management/creating_machinesets/creating-machineset-azure.adoc b/machine_management/creating_machinesets/creating-machineset-azure.adoc index 7e71b17a22..c0e676a0f4 100644 --- a/machine_management/creating_machinesets/creating-machineset-azure.adoc +++ b/machine_management/creating_machinesets/creating-machineset-azure.adoc @@ -17,6 +17,13 @@ include::modules/machineset-yaml-azure.adoc[leveloffset=+1] //Creating a compute machine set include::modules/machineset-creating.adoc[leveloffset=+1] +//Labeling GPU machine sets for the cluster autoscaler +include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1] + +[role="_additional-resources"] +.Additional resources +* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition] + //Selecting an Azure Marketplace image include::modules/installation-azure-marketplace-subscribe.adoc[leveloffset=+1] diff --git a/machine_management/creating_machinesets/creating-machineset-bare-metal.adoc b/machine_management/creating_machinesets/creating-machineset-bare-metal.adoc index e68bdbfecc..07bdf0d360 100644 --- a/machine_management/creating_machinesets/creating-machineset-bare-metal.adoc +++ b/machine_management/creating_machinesets/creating-machineset-bare-metal.adoc @@ -14,5 +14,12 @@ include::modules/machineset-yaml-baremetal.adoc[leveloffset=+1] include::modules/machineset-creating.adoc[leveloffset=+1] +//Labeling GPU 
machine sets for the cluster autoscaler +include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1] + +[role="_additional-resources"] +.Additional resources +* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition] + // Mothballed - re-add when available // include::modules/machineset-osp-adding-bare-metal.adoc[leveloffset=+1] diff --git a/machine_management/creating_machinesets/creating-machineset-gcp.adoc b/machine_management/creating_machinesets/creating-machineset-gcp.adoc index 0aaf38c6f4..8223a19c9e 100644 --- a/machine_management/creating_machinesets/creating-machineset-gcp.adoc +++ b/machine_management/creating_machinesets/creating-machineset-gcp.adoc @@ -17,6 +17,13 @@ include::modules/machineset-yaml-gcp.adoc[leveloffset=+1] //Creating a compute machine set include::modules/machineset-creating.adoc[leveloffset=+1] +//Labeling GPU machine sets for the cluster autoscaler +include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1] + +[role="_additional-resources"] +.Additional resources +* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition] + //Configuring persistent disk types by using compute machine sets include::modules/machineset-gcp-pd-disk-types.adoc[leveloffset=+1] diff --git a/machine_management/creating_machinesets/creating-machineset-ibm-cloud.adoc b/machine_management/creating_machinesets/creating-machineset-ibm-cloud.adoc index 99682603c7..55334614e6 100644 --- a/machine_management/creating_machinesets/creating-machineset-ibm-cloud.adoc +++ b/machine_management/creating_machinesets/creating-machineset-ibm-cloud.adoc @@ -16,3 +16,10 @@ include::modules/machineset-yaml-ibm-cloud.adoc[leveloffset=+1] //Creating a compute machine set include::modules/machineset-creating.adoc[leveloffset=+1] + +//Labeling GPU machine sets for the cluster autoscaler 
+include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1] + +[role="_additional-resources"] +.Additional resources +* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition] diff --git a/machine_management/creating_machinesets/creating-machineset-ibm-power-vs.adoc b/machine_management/creating_machinesets/creating-machineset-ibm-power-vs.adoc index 5672dd4879..348bb068b9 100644 --- a/machine_management/creating_machinesets/creating-machineset-ibm-power-vs.adoc +++ b/machine_management/creating_machinesets/creating-machineset-ibm-power-vs.adoc @@ -14,5 +14,12 @@ include::modules/machine-user-provisioned-limitations.adoc[leveloffset=+1] //Sample YAML for a machine set custom resource on {ibm-cloud-title} include::modules/machineset-yaml-ibm-power-vs.adoc[leveloffset=+1] -//Creating a machine set +//Creating a compute machine set include::modules/machineset-creating.adoc[leveloffset=+1] + +//Labeling GPU machine sets for the cluster autoscaler +include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1] + +[role="_additional-resources"] +.Additional resources +* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition] \ No newline at end of file diff --git a/machine_management/creating_machinesets/creating-machineset-nutanix.adoc b/machine_management/creating_machinesets/creating-machineset-nutanix.adoc index 8e7a1bb6e0..042a4c1888 100644 --- a/machine_management/creating_machinesets/creating-machineset-nutanix.adoc +++ b/machine_management/creating_machinesets/creating-machineset-nutanix.adoc @@ -17,6 +17,13 @@ include::modules/machineset-yaml-nutanix.adoc[leveloffset=+1] //Creating a compute machine set include::modules/machineset-creating.adoc[leveloffset=+1] +//Labeling GPU machine sets for the cluster autoscaler 
+include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1] + +[role="_additional-resources"] +.Additional resources +* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition] + //Failure domains for Nutanix clusters include::modules/mapi-failure-domain-nutanix.adoc[leveloffset=+1] [role="_additional-resources"] diff --git a/machine_management/creating_machinesets/creating-machineset-osp.adoc b/machine_management/creating_machinesets/creating-machineset-osp.adoc index eecd1c5db4..db359a9821 100644 --- a/machine_management/creating_machinesets/creating-machineset-osp.adoc +++ b/machine_management/creating_machinesets/creating-machineset-osp.adoc @@ -23,5 +23,12 @@ include::modules/machineset-yaml-osp-sr-iov-port-security.adoc[leveloffset=+1] include::modules/machineset-creating.adoc[leveloffset=+1] +//Labeling GPU machine sets for the cluster autoscaler +include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1] + +[role="_additional-resources"] +.Additional resources +* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition] + // Mothballed - re-add when available // include::modules/machineset-osp-adding-bare-metal.adoc[leveloffset=+1] diff --git a/machine_management/creating_machinesets/creating-machineset-vsphere.adoc b/machine_management/creating_machinesets/creating-machineset-vsphere.adoc index 7ed435ae6d..ba6138a8b4 100644 --- a/machine_management/creating_machinesets/creating-machineset-vsphere.adoc +++ b/machine_management/creating_machinesets/creating-machineset-vsphere.adoc @@ -38,5 +38,12 @@ include::modules/machineset-upi-reqs-ignition-config.adoc[leveloffset=+2] //Creating a compute machine set include::modules/machineset-creating.adoc[leveloffset=+1] +//Labeling GPU machine sets for the cluster autoscaler 
+include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1] + +[role="_additional-resources"] +.Additional resources +* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition] + //Adding tags to machines by using machine sets include::modules/machine-api-vmw-add-tags.adoc[leveloffset=+1,tag=!controlplane] \ No newline at end of file diff --git a/modules/cluster-autoscaler-cr.adoc b/modules/cluster-autoscaler-cr.adoc index cfc24d91e4..8e6084ff47 100644 --- a/modules/cluster-autoscaler-cr.adoc +++ b/modules/cluster-autoscaler-cr.adoc @@ -9,6 +9,11 @@ This `ClusterAutoscaler` resource definition shows the parameters and sample values for the cluster autoscaler. +[NOTE] +==== +When you change the configuration of an existing cluster autoscaler, it restarts. +==== + [source,yaml] ---- apiVersion: "autoscaling.openshift.io/v1" @@ -45,17 +50,10 @@ spec: <4> Specify the maximum number of cores to deploy in the cluster. <5> Specify the minimum amount of memory, in GiB, in the cluster. <6> Specify the maximum amount of memory, in GiB, in the cluster. -<7> Optional: To configure the cluster autoscaler to deploy GPU-enabled nodes, specify a `type` value that represents the GPU type to use. -For example, you might use `nvidia-t4` to represent Nvidia T4 GPUs, or `nvidia-a10g` for A10G GPUs. -+ --- -[NOTE] -==== -The `type` value must match the value of the `spec.template.spec.metadata.labels[cluster-api/accelerator]` label in the machine set that manages the GPU-enabled nodes of that type. -Because you use this value as a label on the machine set, it must consist of alphanumeric characters, `-`, `_`, or `.` and must start and end with an alphanumeric character. -==== --- -+ +<7> Optional: To configure the cluster autoscaler to deploy GPU-enabled nodes, specify a `type` value. 
+This value must match the value of the `spec.template.spec.metadata.labels[cluster-api/accelerator]` label in the machine set that manages the GPU-enabled nodes of that type. +For example, this value might be `nvidia-t4` to represent Nvidia T4 GPUs, or `nvidia-a10g` for A10G GPUs. +For more information, see "Labeling GPU machine sets for the cluster autoscaler". <8> Specify the minimum number of GPUs of the specified type to deploy in the cluster. <9> Specify the maximum number of GPUs of the specified type to deploy in the cluster. <10> Specify the logging verbosity level between `0` and `10`. The following log level thresholds are provided for guidance: diff --git a/modules/machineset-label-gpu-autoscaler.adoc b/modules/machineset-label-gpu-autoscaler.adoc new file mode 100644 index 0000000000..539a4d93c2 --- /dev/null +++ b/modules/machineset-label-gpu-autoscaler.adoc @@ -0,0 +1,50 @@ +// Module included in the following assemblies: +// +// * machine_management/applying-autoscaling.adoc +// * machine_management/creating_machinesets/creating-machineset-aws.adoc +// * machine_management/creating_machinesets/creating-machineset-azure.adoc +// * machine_management/creating_machinesets/creating-machineset-azure-stack-hub.adoc +// * machine_management/creating_machinesets/creating-machineset-bare-metal.adoc +// * machine_management/creating_machinesets/creating-machineset-gcp.adoc +// * machine_management/creating_machinesets/creating-machineset-ibm-cloud.adoc +// * machine_management/creating_machinesets/creating-machineset-ibm-power-vs.adoc +// * machine_management/creating_machinesets/creating-machineset-nutanix.adoc +// * machine_management/creating_machinesets/creating-machineset-osp.adoc +// * machine_management/creating_machinesets/creating-machineset-vsphere.adoc + +:_mod-docs-content-type: PROCEDURE +[id="machineset-label-gpu-autoscaler_{context}"] += Labeling GPU machine sets for the cluster autoscaler + +You can use a machine set label to indicate which 
machines the cluster autoscaler can use to deploy GPU-enabled nodes. + +.Prerequisites +* Your cluster uses a cluster autoscaler. + +.Procedure + +* On the machine set that creates the machines that you want the cluster autoscaler to use to deploy GPU-enabled nodes, add a `cluster-api/accelerator` label: ++ +-- +[source,yaml] +---- +apiVersion: machine.openshift.io/v1beta1 +kind: MachineSet +metadata: + name: machine-set-name +spec: + template: + spec: + metadata: + labels: + cluster-api/accelerator: nvidia-t4 <1> +---- +<1> Specify a label value of your choice that consists of alphanumeric characters, `-`, `_`, or `.` and starts and ends with an alphanumeric character. +For example, you might use `nvidia-t4` to represent Nvidia T4 GPUs, or `nvidia-a10g` for A10G GPUs. ++ +[NOTE] +==== +You must specify the value of this label for the `spec.resourceLimits.gpus.type` parameter in your `ClusterAutoscaler` CR. +For more information, see "Cluster autoscaler resource definition". +==== +-- \ No newline at end of file diff --git a/post_installation_configuration/cluster-tasks.adoc b/post_installation_configuration/cluster-tasks.adoc index 8b302bfe1b..1f98e29828 100644 --- a/post_installation_configuration/cluster-tasks.adoc +++ b/post_installation_configuration/cluster-tasks.adoc @@ -304,17 +304,12 @@ For information about moving {logging} resources, see: * xref:../observability/logging/scheduling_resources/logging-node-selectors.adoc#logging-node-selectors[Using node selectors to move logging resources] * xref:../observability/logging/scheduling_resources/logging-taints-tolerations.adoc#cluster-logging-logstore-tolerations_logging-taints-tolerations[Using taints and tolerations to control logging pod placement] -include::modules/cluster-autoscaler-about.adoc[leveloffset=+1] -include::modules/cluster-autoscaler-cr.adoc[leveloffset=+2] -:FeatureName: cluster autoscaler -:FeatureResourceName: ClusterAutoscaler -include::modules/deploying-resource.adoc[leveloffset=+2] 
+[id="cluster-tasks-applying-autoscaling"] +== Applying autoscaling to your cluster -include::modules/machine-autoscaler-about.adoc[leveloffset=+1] -include::modules/machine-autoscaler-cr.adoc[leveloffset=+2] -:FeatureName: machine autoscaler -:FeatureResourceName: MachineAutoscaler -include::modules/deploying-resource.adoc[leveloffset=+2] +Applying autoscaling to an {product-title} cluster involves deploying a cluster autoscaler and then deploying machine autoscalers for each machine type in your cluster. + +For more information, see xref:../machine_management/applying-autoscaling.adoc#applying-autoscaling[Applying autoscaling to an {product-title} cluster]. include::modules/nodes-clusters-cgroups-2.adoc[leveloffset=+1]