1
0
mirror of https://github.com/openshift/openshift-docs.git synced 2026-02-05 12:46:18 +01:00

OSDOCS-12784: Explain CAS and machineset GPU setup

This commit is contained in:
Jeana Routh
2024-12-05 14:47:34 -05:00
parent eba5e01382
commit 18548331da
14 changed files with 140 additions and 22 deletions

View File

@@ -25,10 +25,15 @@ First, deploy the cluster autoscaler to manage automatic resource scaling in you
Because the cluster autoscaler is scoped to the entire cluster, you can make only one cluster autoscaler for the cluster.
====
//Cluster autoscaler resource definition
include::modules/cluster-autoscaler-cr.adoc[leveloffset=+3]
//Configuring a priority expander for the cluster autoscaler
include::modules/cluster-autoscaler-config-priority-expander.adoc[leveloffset=+3]
//Labeling GPU machine sets for the cluster autoscaler
include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+3]
:FeatureName: cluster autoscaler
:FeatureResourceName: ClusterAutoscaler
include::modules/deploying-resource.adoc[leveloffset=+2]

View File

@@ -17,6 +17,13 @@ include::modules/machineset-yaml-aws.adoc[leveloffset=+1]
//Creating a compute machine set
include::modules/machineset-creating.adoc[leveloffset=+1]
//Labeling GPU machine sets for the cluster autoscaler
include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1]
[role="_additional-resources"]
.Additional resources
* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition]
//Assigning machines to placement groups by using machine sets
include::modules/machineset-aws-existing-placement-group.adoc[leveloffset=+1]

View File

@@ -17,6 +17,13 @@ include::modules/machineset-yaml-azure-stack-hub.adoc[leveloffset=+1]
//Creating a compute machine set
include::modules/machineset-creating.adoc[leveloffset=+1]
//Labeling GPU machine sets for the cluster autoscaler
include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1]
[role="_additional-resources"]
.Additional resources
* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition]
//Enabling Azure boot diagnostics on compute machines
include::modules/machineset-azure-boot-diagnostics.adoc[leveloffset=+1]

View File

@@ -17,6 +17,13 @@ include::modules/machineset-yaml-azure.adoc[leveloffset=+1]
//Creating a compute machine set
include::modules/machineset-creating.adoc[leveloffset=+1]
//Labeling GPU machine sets for the cluster autoscaler
include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1]
[role="_additional-resources"]
.Additional resources
* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition]
//Selecting an Azure Marketplace image
include::modules/installation-azure-marketplace-subscribe.adoc[leveloffset=+1]

View File

@@ -14,5 +14,12 @@ include::modules/machineset-yaml-baremetal.adoc[leveloffset=+1]
include::modules/machineset-creating.adoc[leveloffset=+1]
//Labeling GPU machine sets for the cluster autoscaler
include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1]
[role="_additional-resources"]
.Additional resources
* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition]
// Mothballed - re-add when available
// include::modules/machineset-osp-adding-bare-metal.adoc[leveloffset=+1]

View File

@@ -17,6 +17,13 @@ include::modules/machineset-yaml-gcp.adoc[leveloffset=+1]
//Creating a compute machine set
include::modules/machineset-creating.adoc[leveloffset=+1]
//Labeling GPU machine sets for the cluster autoscaler
include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1]
[role="_additional-resources"]
.Additional resources
* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition]
//Configuring persistent disk types by using compute machine sets
include::modules/machineset-gcp-pd-disk-types.adoc[leveloffset=+1]

View File

@@ -16,3 +16,10 @@ include::modules/machineset-yaml-ibm-cloud.adoc[leveloffset=+1]
//Creating a compute machine set
include::modules/machineset-creating.adoc[leveloffset=+1]
//Labeling GPU machine sets for the cluster autoscaler
include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1]
[role="_additional-resources"]
.Additional resources
* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition]

View File

@@ -14,5 +14,12 @@ include::modules/machine-user-provisioned-limitations.adoc[leveloffset=+1]
//Sample YAML for a machine set custom resource on {ibm-cloud-title}
include::modules/machineset-yaml-ibm-power-vs.adoc[leveloffset=+1]
//Creating a machine set
//Creating a compute machine set
include::modules/machineset-creating.adoc[leveloffset=+1]
//Labeling GPU machine sets for the cluster autoscaler
include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1]
[role="_additional-resources"]
.Additional resources
* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition]

View File

@@ -17,6 +17,13 @@ include::modules/machineset-yaml-nutanix.adoc[leveloffset=+1]
//Creating a compute machine set
include::modules/machineset-creating.adoc[leveloffset=+1]
//Labeling GPU machine sets for the cluster autoscaler
include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1]
[role="_additional-resources"]
.Additional resources
* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition]
//Failure domains for Nutanix clusters
include::modules/mapi-failure-domain-nutanix.adoc[leveloffset=+1]
[role="_additional-resources"]

View File

@@ -23,5 +23,12 @@ include::modules/machineset-yaml-osp-sr-iov-port-security.adoc[leveloffset=+1]
include::modules/machineset-creating.adoc[leveloffset=+1]
//Labeling GPU machine sets for the cluster autoscaler
include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1]
[role="_additional-resources"]
.Additional resources
* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition]
// Mothballed - re-add when available
// include::modules/machineset-osp-adding-bare-metal.adoc[leveloffset=+1]

View File

@@ -38,5 +38,12 @@ include::modules/machineset-upi-reqs-ignition-config.adoc[leveloffset=+2]
//Creating a compute machine set
include::modules/machineset-creating.adoc[leveloffset=+1]
//Labeling GPU machine sets for the cluster autoscaler
include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1]
[role="_additional-resources"]
.Additional resources
* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition]
//Adding tags to machines by using machine sets
include::modules/machine-api-vmw-add-tags.adoc[leveloffset=+1,tag=!controlplane]

View File

@@ -9,6 +9,11 @@
This `ClusterAutoscaler` resource definition shows the parameters and sample values for the cluster autoscaler.
[NOTE]
====
When you change the configuration of an existing cluster autoscaler, it restarts.
====
[source,yaml]
----
apiVersion: "autoscaling.openshift.io/v1"
@@ -45,17 +50,10 @@ spec:
<4> Specify the maximum number of cores to deploy in the cluster.
<5> Specify the minimum amount of memory, in GiB, in the cluster.
<6> Specify the maximum amount of memory, in GiB, in the cluster.
<7> Optional: To configure the cluster autoscaler to deploy GPU-enabled nodes, specify a `type` value that represents the GPU type to use.
For example, you might use `nvidia-t4` to represent Nvidia T4 GPUs, or `nvidia-a10g` for A10G GPUs.
+
--
[NOTE]
====
The `type` value must match the value of the `spec.template.spec.metadata.labels[cluster-api/accelerator]` label in the machine set that manages the GPU-enabled nodes of that type.
Because you use this value as a label on the machine set, it must consist of alphanumeric characters, `-`, `_`, or `.` and must start and end with an alphanumeric character.
====
--
+
<7> Optional: To configure the cluster autoscaler to deploy GPU-enabled nodes, specify a `type` value.
This value must match the value of the `spec.template.spec.metadata.labels[cluster-api/accelerator]` label in the machine set that manages the GPU-enabled nodes of that type.
For example, this value might be `nvidia-t4` to represent Nvidia T4 GPUs, or `nvidia-a10g` for A10G GPUs.
For more information, see "Labeling GPU machine sets for the cluster autoscaler".
<8> Specify the minimum number of GPUs of the specified type to deploy in the cluster.
<9> Specify the maximum number of GPUs of the specified type to deploy in the cluster.
<10> Specify the logging verbosity level between `0` and `10`. The following log level thresholds are provided for guidance:

View File

@@ -0,0 +1,50 @@
// Module included in the following assemblies:
//
// * machine_management/applying-autoscaling.adoc
// * machine_management/creating_machinesets/creating-machineset-aws.adoc
// * machine_management/creating_machinesets/creating-machineset-azure.adoc
// * machine_management/creating_machinesets/creating-machineset-azure-stack-hub.adoc
// * machine_management/creating_machinesets/creating-machineset-bare-metal.adoc
// * machine_management/creating_machinesets/creating-machineset-gcp.adoc
// * machine_management/creating_machinesets/creating-machineset-ibm-cloud.adoc
// * machine_management/creating_machinesets/creating-machineset-ibm-power-vs.adoc
// * machine_management/creating_machinesets/creating-machineset-nutanix.adoc
// * machine_management/creating_machinesets/creating-machineset-osp.adoc
// * machine_management/creating_machinesets/creating-machineset-vsphere.adoc
:_mod-docs-content-type: PROCEDURE
[id="machineset-label-gpu-autoscaler_{context}"]
= Labeling GPU machine sets for the cluster autoscaler
You can use a machine set label to indicate which machines the cluster autoscaler can use to deploy GPU-enabled nodes.
.Prerequisites
* Your cluster uses a cluster autoscaler.
.Procedure
* On the machine set that creates the machines that you want the cluster autoscaler to use to deploy GPU-enabled nodes, add a `cluster-api/accelerator` label:
+
--
[source,yaml]
----
apiVersion: machine.openshift.io/v1beta1
kind: MachineSet
metadata:
  name: machine-set-name
spec:
  template:
    spec:
      metadata:
        labels:
          cluster-api/accelerator: nvidia-t4 <1>
----
<1> Specify a label value of your choice that consists of alphanumeric characters, `-`, `_`, or `.` and starts and ends with an alphanumeric character.
For example, you might use `nvidia-t4` to represent Nvidia T4 GPUs, or `nvidia-a10g` for A10G GPUs.
+
[NOTE]
====
You must specify the value of this label for the `spec.resourceLimits.gpus.type` parameter in your `ClusterAutoscaler` CR.
For more information, see "Cluster autoscaler resource definition".
====
--

View File

@@ -304,17 +304,12 @@ For information about moving {logging} resources, see:
* xref:../observability/logging/scheduling_resources/logging-node-selectors.adoc#logging-node-selectors[Using node selectors to move logging resources]
* xref:../observability/logging/scheduling_resources/logging-taints-tolerations.adoc#cluster-logging-logstore-tolerations_logging-taints-tolerations[Using taints and tolerations to control logging pod placement]
include::modules/cluster-autoscaler-about.adoc[leveloffset=+1]
include::modules/cluster-autoscaler-cr.adoc[leveloffset=+2]
:FeatureName: cluster autoscaler
:FeatureResourceName: ClusterAutoscaler
include::modules/deploying-resource.adoc[leveloffset=+2]
[id="cluster-tasks-applying-autoscaling"]
== Applying autoscaling to your cluster
include::modules/machine-autoscaler-about.adoc[leveloffset=+1]
include::modules/machine-autoscaler-cr.adoc[leveloffset=+2]
:FeatureName: machine autoscaler
:FeatureResourceName: MachineAutoscaler
include::modules/deploying-resource.adoc[leveloffset=+2]
Applying autoscaling to an {product-title} cluster involves deploying a cluster autoscaler and then deploying machine autoscalers for each machine type in your cluster.
For more information, see xref:../machine_management/applying-autoscaling.adoc#applying-autoscaling[Applying autoscaling to an {product-title} cluster].
include::modules/nodes-clusters-cgroups-2.adoc[leveloffset=+1]