mirror of
https://github.com/openshift/openshift-docs.git
synced 2026-02-05 12:46:18 +01:00
OSDOCS-12784: Explain CAS and machineset GPU setup
This commit is contained in:
@@ -25,10 +25,15 @@ First, deploy the cluster autoscaler to manage automatic resource scaling in you
|
||||
Because the cluster autoscaler is scoped to the entire cluster, you can make only one cluster autoscaler for the cluster.
|
||||
====
|
||||
|
||||
//Cluster autoscaler resource definition
|
||||
include::modules/cluster-autoscaler-cr.adoc[leveloffset=+3]
|
||||
|
||||
//Configuring a priority expander for the cluster autoscaler
|
||||
include::modules/cluster-autoscaler-config-priority-expander.adoc[leveloffset=+3]
|
||||
|
||||
//Labeling GPU machine sets for the cluster autoscaler
|
||||
include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+3]
|
||||
|
||||
:FeatureName: cluster autoscaler
|
||||
:FeatureResourceName: ClusterAutoscaler
|
||||
include::modules/deploying-resource.adoc[leveloffset=+2]
|
||||
|
||||
@@ -17,6 +17,13 @@ include::modules/machineset-yaml-aws.adoc[leveloffset=+1]
|
||||
//Creating a compute machine set
|
||||
include::modules/machineset-creating.adoc[leveloffset=+1]
|
||||
|
||||
//Labeling GPU machine sets for the cluster autoscaler
|
||||
include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1]
|
||||
|
||||
[role="_additional-resources"]
|
||||
.Additional resources
|
||||
* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition]
|
||||
|
||||
//Assigning machines to placement groups by using machine sets
|
||||
include::modules/machineset-aws-existing-placement-group.adoc[leveloffset=+1]
|
||||
|
||||
|
||||
@@ -17,6 +17,13 @@ include::modules/machineset-yaml-azure-stack-hub.adoc[leveloffset=+1]
|
||||
//Creating a compute machine set
|
||||
include::modules/machineset-creating.adoc[leveloffset=+1]
|
||||
|
||||
//Labeling GPU machine sets for the cluster autoscaler
|
||||
include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1]
|
||||
|
||||
[role="_additional-resources"]
|
||||
.Additional resources
|
||||
* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition]
|
||||
|
||||
//Enabling Azure boot diagnostics on compute machines
|
||||
include::modules/machineset-azure-boot-diagnostics.adoc[leveloffset=+1]
|
||||
|
||||
|
||||
@@ -17,6 +17,13 @@ include::modules/machineset-yaml-azure.adoc[leveloffset=+1]
|
||||
//Creating a compute machine set
|
||||
include::modules/machineset-creating.adoc[leveloffset=+1]
|
||||
|
||||
//Labeling GPU machine sets for the cluster autoscaler
|
||||
include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1]
|
||||
|
||||
[role="_additional-resources"]
|
||||
.Additional resources
|
||||
* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition]
|
||||
|
||||
//Selecting an Azure Marketplace image
|
||||
include::modules/installation-azure-marketplace-subscribe.adoc[leveloffset=+1]
|
||||
|
||||
|
||||
@@ -14,5 +14,12 @@ include::modules/machineset-yaml-baremetal.adoc[leveloffset=+1]
|
||||
|
||||
include::modules/machineset-creating.adoc[leveloffset=+1]
|
||||
|
||||
//Labeling GPU machine sets for the cluster autoscaler
|
||||
include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1]
|
||||
|
||||
[role="_additional-resources"]
|
||||
.Additional resources
|
||||
* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition]
|
||||
|
||||
// Mothballed - re-add when available
|
||||
// include::modules/machineset-osp-adding-bare-metal.adoc[leveloffset=+1]
|
||||
|
||||
@@ -17,6 +17,13 @@ include::modules/machineset-yaml-gcp.adoc[leveloffset=+1]
|
||||
//Creating a compute machine set
|
||||
include::modules/machineset-creating.adoc[leveloffset=+1]
|
||||
|
||||
//Labeling GPU machine sets for the cluster autoscaler
|
||||
include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1]
|
||||
|
||||
[role="_additional-resources"]
|
||||
.Additional resources
|
||||
* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition]
|
||||
|
||||
//Configuring persistent disk types by using compute machine sets
|
||||
include::modules/machineset-gcp-pd-disk-types.adoc[leveloffset=+1]
|
||||
|
||||
|
||||
@@ -16,3 +16,10 @@ include::modules/machineset-yaml-ibm-cloud.adoc[leveloffset=+1]
|
||||
|
||||
//Creating a compute machine set
|
||||
include::modules/machineset-creating.adoc[leveloffset=+1]
|
||||
|
||||
//Labeling GPU machine sets for the cluster autoscaler
|
||||
include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1]
|
||||
|
||||
[role="_additional-resources"]
|
||||
.Additional resources
|
||||
* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition]
|
||||
|
||||
@@ -14,5 +14,12 @@ include::modules/machine-user-provisioned-limitations.adoc[leveloffset=+1]
|
||||
//Sample YAML for a machine set custom resource on {ibm-power-server-title}
|
||||
include::modules/machineset-yaml-ibm-power-vs.adoc[leveloffset=+1]
|
||||
|
||||
//Creating a machine set
|
||||
//Creating a compute machine set
|
||||
include::modules/machineset-creating.adoc[leveloffset=+1]
|
||||
|
||||
//Labeling GPU machine sets for the cluster autoscaler
|
||||
include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1]
|
||||
|
||||
[role="_additional-resources"]
|
||||
.Additional resources
|
||||
* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition]
|
||||
@@ -17,6 +17,13 @@ include::modules/machineset-yaml-nutanix.adoc[leveloffset=+1]
|
||||
//Creating a compute machine set
|
||||
include::modules/machineset-creating.adoc[leveloffset=+1]
|
||||
|
||||
//Labeling GPU machine sets for the cluster autoscaler
|
||||
include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1]
|
||||
|
||||
[role="_additional-resources"]
|
||||
.Additional resources
|
||||
* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition]
|
||||
|
||||
//Failure domains for Nutanix clusters
|
||||
include::modules/mapi-failure-domain-nutanix.adoc[leveloffset=+1]
|
||||
[role="_additional-resources"]
|
||||
|
||||
@@ -23,5 +23,12 @@ include::modules/machineset-yaml-osp-sr-iov-port-security.adoc[leveloffset=+1]
|
||||
|
||||
include::modules/machineset-creating.adoc[leveloffset=+1]
|
||||
|
||||
//Labeling GPU machine sets for the cluster autoscaler
|
||||
include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1]
|
||||
|
||||
[role="_additional-resources"]
|
||||
.Additional resources
|
||||
* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition]
|
||||
|
||||
// Mothballed - re-add when available
|
||||
// include::modules/machineset-osp-adding-bare-metal.adoc[leveloffset=+1]
|
||||
|
||||
@@ -38,5 +38,12 @@ include::modules/machineset-upi-reqs-ignition-config.adoc[leveloffset=+2]
|
||||
//Creating a compute machine set
|
||||
include::modules/machineset-creating.adoc[leveloffset=+1]
|
||||
|
||||
//Labeling GPU machine sets for the cluster autoscaler
|
||||
include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+1]
|
||||
|
||||
[role="_additional-resources"]
|
||||
.Additional resources
|
||||
* xref:../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition]
|
||||
|
||||
//Adding tags to machines by using machine sets
|
||||
include::modules/machine-api-vmw-add-tags.adoc[leveloffset=+1,tag=!controlplane]
|
||||
@@ -9,6 +9,11 @@
|
||||
|
||||
This `ClusterAutoscaler` resource definition shows the parameters and sample values for the cluster autoscaler.
|
||||
|
||||
[NOTE]
|
||||
====
|
||||
When you change the configuration of an existing cluster autoscaler, it restarts.
|
||||
====
|
||||
|
||||
[source,yaml]
|
||||
----
|
||||
apiVersion: "autoscaling.openshift.io/v1"
|
||||
@@ -45,17 +50,10 @@ spec:
|
||||
<4> Specify the maximum number of cores to deploy in the cluster.
|
||||
<5> Specify the minimum amount of memory, in GiB, in the cluster.
|
||||
<6> Specify the maximum amount of memory, in GiB, in the cluster.
|
||||
<7> Optional: To configure the cluster autoscaler to deploy GPU-enabled nodes, specify a `type` value that represents the GPU type to use.
|
||||
For example, you might use `nvidia-t4` to represent NVIDIA T4 GPUs, or `nvidia-a10g` for NVIDIA A10G GPUs.
|
||||
+
|
||||
--
|
||||
[NOTE]
|
||||
====
|
||||
The `type` value must match the value of the `spec.template.spec.metadata.labels[cluster-api/accelerator]` label in the machine set that manages the GPU-enabled nodes of that type.
|
||||
Because you use this value as a label on the machine set, it must consist of alphanumeric characters, `-`, `_`, or `.` and must start and end with an alphanumeric character.
|
||||
====
|
||||
--
|
||||
+
|
||||
<7> Optional: To configure the cluster autoscaler to deploy GPU-enabled nodes, specify a `type` value.
|
||||
This value must match the value of the `spec.template.spec.metadata.labels[cluster-api/accelerator]` label in the machine set that manages the GPU-enabled nodes of that type.
|
||||
For example, this value might be `nvidia-t4` to represent NVIDIA T4 GPUs, or `nvidia-a10g` for NVIDIA A10G GPUs.
|
||||
For more information, see "Labeling GPU machine sets for the cluster autoscaler".
|
||||
<8> Specify the minimum number of GPUs of the specified type to deploy in the cluster.
|
||||
<9> Specify the maximum number of GPUs of the specified type to deploy in the cluster.
|
||||
<10> Specify the logging verbosity level between `0` and `10`. The following log level thresholds are provided for guidance:
|
||||
|
||||
50
modules/machineset-label-gpu-autoscaler.adoc
Normal file
50
modules/machineset-label-gpu-autoscaler.adoc
Normal file
@@ -0,0 +1,50 @@
|
||||
// Module included in the following assemblies:
|
||||
//
|
||||
// * machine_management/applying-autoscaling.adoc
|
||||
// * machine_management/creating_machinesets/creating-machineset-aws.adoc
|
||||
// * machine_management/creating_machinesets/creating-machineset-azure.adoc
|
||||
// * machine_management/creating_machinesets/creating-machineset-azure-stack-hub.adoc
|
||||
// * machine_management/creating_machinesets/creating-machineset-bare-metal.adoc
|
||||
// * machine_management/creating_machinesets/creating-machineset-gcp.adoc
|
||||
// * machine_management/creating_machinesets/creating-machineset-ibm-cloud.adoc
|
||||
// * machine_management/creating_machinesets/creating-machineset-ibm-power-vs.adoc
|
||||
// * machine_management/creating_machinesets/creating-machineset-nutanix.adoc
|
||||
// * machine_management/creating_machinesets/creating-machineset-osp.adoc
|
||||
// * machine_management/creating_machinesets/creating-machineset-vsphere.adoc
|
||||
|
||||
:_mod-docs-content-type: PROCEDURE
|
||||
[id="machineset-label-gpu-autoscaler_{context}"]
|
||||
= Labeling GPU machine sets for the cluster autoscaler
|
||||
|
||||
You can use a machine set label to indicate which machines the cluster autoscaler can use to deploy GPU-enabled nodes.
|
||||
|
||||
.Prerequisites
|
||||
* Your cluster uses a cluster autoscaler.
|
||||
|
||||
.Procedure
|
||||
|
||||
* On the machine set that creates the machines that the cluster autoscaler uses to deploy GPU-enabled nodes, add a `cluster-api/accelerator` label:
|
||||
+
|
||||
--
|
||||
[source,yaml]
|
||||
----
|
||||
apiVersion: machine.openshift.io/v1beta1
|
||||
kind: MachineSet
|
||||
metadata:
|
||||
  name: machine-set-name
|
||||
spec:
|
||||
  template:
|
||||
    spec:
|
||||
      metadata:
|
||||
        labels:
|
||||
          cluster-api/accelerator: nvidia-t4 <1>
|
||||
----
|
||||
<1> Specify a label value of your choice that consists of alphanumeric characters, `-`, `_`, or `.` and starts and ends with an alphanumeric character.
|
||||
For example, you might use `nvidia-t4` to represent NVIDIA T4 GPUs, or `nvidia-a10g` for NVIDIA A10G GPUs.
|
||||
+
|
||||
[NOTE]
|
||||
====
|
||||
You must specify the value of this label for the `spec.resourceLimits.gpus.type` parameter in your `ClusterAutoscaler` CR.
|
||||
For more information, see "Cluster autoscaler resource definition".
|
||||
====
|
||||
--
|
||||
@@ -304,17 +304,12 @@ For information about moving {logging} resources, see:
|
||||
* xref:../observability/logging/scheduling_resources/logging-node-selectors.adoc#logging-node-selectors[Using node selectors to move logging resources]
|
||||
* xref:../observability/logging/scheduling_resources/logging-taints-tolerations.adoc#cluster-logging-logstore-tolerations_logging-taints-tolerations[Using taints and tolerations to control logging pod placement]
|
||||
|
||||
include::modules/cluster-autoscaler-about.adoc[leveloffset=+1]
|
||||
include::modules/cluster-autoscaler-cr.adoc[leveloffset=+2]
|
||||
:FeatureName: cluster autoscaler
|
||||
:FeatureResourceName: ClusterAutoscaler
|
||||
include::modules/deploying-resource.adoc[leveloffset=+2]
|
||||
[id="custer-tasks-applying-autoscaling"]
|
||||
== Applying autoscaling to your cluster
|
||||
|
||||
include::modules/machine-autoscaler-about.adoc[leveloffset=+1]
|
||||
include::modules/machine-autoscaler-cr.adoc[leveloffset=+2]
|
||||
:FeatureName: machine autoscaler
|
||||
:FeatureResourceName: MachineAutoscaler
|
||||
include::modules/deploying-resource.adoc[leveloffset=+2]
|
||||
Applying autoscaling to an {product-title} cluster involves deploying a cluster autoscaler and then deploying machine autoscalers for each machine type in your cluster.
|
||||
|
||||
For more information, see xref:../machine_management/applying-autoscaling.adoc#applying-autoscaling[Applying autoscaling to an {product-title} cluster].
|
||||
|
||||
include::modules/nodes-clusters-cgroups-2.adoc[leveloffset=+1]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user