diff --git a/machine_management/applying-autoscaling.adoc b/machine_management/applying-autoscaling.adoc index 784036e7ee..573e35f57d 100644 --- a/machine_management/applying-autoscaling.adoc +++ b/machine_management/applying-autoscaling.adoc @@ -32,7 +32,7 @@ include::modules/cluster-autoscaler-cr.adoc[leveloffset=+3] include::modules/cluster-autoscaler-config-priority-expander.adoc[leveloffset=+3] //Labeling GPU machine sets for the cluster autoscaler -include::modules/machine-feature-agnostic-options-label-gpu-autoscaler.adoc[leveloffset=+3] +include::modules/machineset-label-gpu-autoscaler.adoc[leveloffset=+3] :FeatureName: cluster autoscaler :FeatureResourceName: ClusterAutoscaler diff --git a/machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-aws.adoc b/machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-aws.adoc index b222ba1794..f30fbd79a4 100644 --- a/machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-aws.adoc +++ b/machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-aws.adoc @@ -22,9 +22,45 @@ include::modules/capi-yaml-machine-template-aws.adoc[leveloffset=+2] //Sample YAML for a CAPI AWS compute machine set resource include::modules/capi-yaml-machine-set-aws.adoc[leveloffset=+2] -// [id="cluster-api-supported-features-aws_{context}"] -// == Enabling {aws-full} features with the Cluster API +[id="cluster-api-supported-features-aws_{context}"] +== Enabling {aws-full} features with the Cluster API -// You can enable the following features by updating values in the Cluster API custom resource manifests. +You can enable the following features by updating values in the Cluster API custom resource manifests. -//Not sure what, if anything, we can add here at this time. 
\ No newline at end of file +//// +//Not yet supported, relies on Cluster API CAS support +// Cluster autoscaler GPU labels +include::modules/machine-feature-agnostic-options-label-gpu-autoscaler.adoc[leveloffset=+2] + +[role="_additional-resources"] +.Additional resources +* xref:../../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition] +//// + +// Elastic Fabric Adapter instances and placement group options +include::modules/machine-feature-aws-existing-placement-group.adoc[leveloffset=+2] + +// Amazon EC2 Instance Metadata Service configuration options +include::modules/machine-feature-aws-imds-options.adoc[leveloffset=+2] + +//// +//This link is for a note that does not apply to TP clusters, reassess for Cluster API GA +[role="_additional-resources"] +.Additional resources +* xref:../../../machine_configuration/mco-update-boot-images.adoc#mco-update-boot-images[Updated boot images] +//// + +// Dedicated Instances configuration options +include::modules/machine-feature-aws-dedicated-instances.adoc[leveloffset=+2] + +// Non-guaranteed Spot Instances and hourly cost limits +include::modules/machine-feature-agnostic-nonguaranteed-instances.adoc[leveloffset=+2] + +// Capacity Reservation configuration options +include::modules/machine-feature-agnostic-capacity-reservation.adoc[leveloffset=+2] + +//Adding a GPU node to a machine set (stesmith) +include::modules/machine-feature-aws-add-nvidia-gpu-node.adoc[leveloffset=+2] + +// //Deploying the Node Feature Discovery Operator (stesmith) +// include::modules/nvidia-gpu-aws-deploying-the-node-feature-discovery-operator.adoc[leveloffset=+1] \ No newline at end of file diff --git a/machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-bare-metal.adoc b/machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-bare-metal.adoc index 
009614952f..e2f1fd0171 100644 --- a/machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-bare-metal.adoc +++ b/machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-bare-metal.adoc @@ -22,6 +22,8 @@ include::modules/capi-yaml-machine-template-bare-metal.adoc[leveloffset=+2] //Sample YAML for a CAPI bare metal compute machine set resource include::modules/capi-yaml-machine-set-bare-metal.adoc[leveloffset=+2] +//// +//Section depends on migration support [id="cluster-api-supported-features-bare-metal_{context}"] == Enabling bare metal features with the Cluster API @@ -33,3 +35,4 @@ include::modules/machine-feature-agnostic-options-label-gpu-autoscaler.adoc[leve [role="_additional-resources"] .Additional resources * xref:../../../machine_management/applying-autoscaling.adoc#cluster-autoscaler-cr_applying-autoscaling[Cluster autoscaler resource definition] +//// \ No newline at end of file diff --git a/modules/capi-yaml-machine-template-aws.adoc b/modules/capi-yaml-machine-template-aws.adoc index d1e8edece2..c0fb5cac7c 100644 --- a/modules/capi-yaml-machine-template-aws.adoc +++ b/modules/capi-yaml-machine-template-aws.adoc @@ -19,12 +19,11 @@ metadata: spec: template: spec: # <3> - uncompressedUserData: true iamInstanceProfile: # ... instanceType: m5.large ignition: storageType: UnencryptedUserData - version: "3.2" + version: "3.4" ami: id: # ... 
subnet: diff --git a/modules/machine-feature-agnostic-capacity-reservation.adoc b/modules/machine-feature-agnostic-capacity-reservation.adoc new file mode 100644 index 0000000000..93bf411659 --- /dev/null +++ b/modules/machine-feature-agnostic-capacity-reservation.adoc @@ -0,0 +1,70 @@ +// Module included in the following assemblies: +// +// * machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-aws.adoc +// There are parallel features in Azure so this module is set up for reuse. + +ifeval::["{context}" == "cluster-api-config-options-aws"] +:aws: +endif::[] + +:_mod-docs-content-type: CONCEPT +[id="machine-feature-agnostic-capacity-reservation_{context}"] += Capacity Reservation configuration options + +{product-title} version {product-version} and later supports +ifdef::azure[on-demand Capacity Reservation with Capacity Reservation groups on {azure-full} clusters.] +ifdef::aws[Capacity Reservations on {aws-full} clusters, including On-Demand Capacity Reservations and Capacity Blocks for ML.] + +You can deploy machines on any available resources that match the parameters of a capacity request that you define. +These parameters specify the +ifdef::azure[VM size,] +ifdef::aws[instance type,] +region, and number of instances that you want to reserve. +If your +ifdef::azure[{azure-short} subscription quota] +ifdef::aws[Capacity Reservation] +can accommodate the capacity request, the deployment succeeds. + +include::snippets/apply-machine-configuration-method.adoc[tag=method-machine-template] + +ifdef::azure[] +[NOTE] +==== +You cannot change an existing Capacity Reservation configuration for a machine set. +To use a different Capacity Reservation group, you must replace the machine set and the machines that the previous machine set deployed. 
+==== +endif::azure[] + +.Sample Capacity Reservation configuration +[source,yaml] +---- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta2 +kind: AWSMachineTemplate +# ... +spec: + template: + spec: + capacityReservationId: # <1> + marketType: # <2> +# ... +---- +<1> Specify the ID of the +ifdef::azure[Capacity Reservation group] +ifdef::aws[Capacity Block for ML or On-Demand Capacity Reservation] +that you want to deploy machines on. +ifdef::aws[] +<2> Specify the market type to use. +The following values are valid: +`CapacityBlock`:: Use this market type with Capacity Blocks for ML. +`OnDemand`:: Use this market type with On-Demand Capacity Reservations. +`Spot`:: Use this market type with Spot Instances. +This option is not compatible with Capacity Reservations. +endif::aws[] + +For more information, including limitations and suggested use cases for this offering, see +ifdef::azure[link:https://learn.microsoft.com/en-us/azure/virtual-machines/capacity-reservation-overview[On-demand Capacity Reservation] in the {azure-full} documentation.] +ifdef::aws[link:https://docs.aws.amazon.com/en_us/AWSEC2/latest/UserGuide/capacity-reservation-overview.html[On-Demand Capacity Reservations and Capacity Blocks for ML] in the {aws-short} documentation.] + +ifeval::["{context}" == "cluster-api-config-options-aws"] +:!aws: +endif::[] diff --git a/modules/machine-feature-agnostic-nonguaranteed-instances.adoc b/modules/machine-feature-agnostic-nonguaranteed-instances.adoc new file mode 100644 index 0000000000..c56e4710cc --- /dev/null +++ b/modules/machine-feature-agnostic-nonguaranteed-instances.adoc @@ -0,0 +1,67 @@ +// Module included in the following assemblies: +// +// * machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-aws.adoc +// There are parallel features in Azure and GCP so this module is set up for reuse. 
+ +ifeval::["{context}" == "cluster-api-config-options-aws"] +:aws: +endif::[] + +:_mod-docs-content-type: CONCEPT +[id="machine-feature-agnostic-nonguaranteed-instances_{context}"] +ifdef::aws[= Non-guaranteed Spot Instances and hourly cost limits] + +ifdef::aws[] +You can deploy machines as non-guaranteed Spot Instances on {aws-first}. +Spot Instances use spare AWS EC2 capacity and are less expensive than On-Demand Instances. +You can use Spot Instances for workloads that can tolerate interruptions, such as batch or stateless, horizontally scalable workloads. +endif::aws[] + +include::snippets/apply-machine-configuration-method.adoc[tag=method-machine-template] + +ifdef::aws[] +[IMPORTANT] +==== +AWS EC2 can reclaim the capacity for a Spot Instance at any time. +==== + +.Sample Spot Instance configuration +[source,yaml] +---- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta2 +kind: AWSMachineTemplate +# ... +spec: + template: + spec: + spotMarketOptions: <1> + maxPrice: <2> +# ... +---- +<1> Specifies the use of Spot Instances. +<2> Optional: Specifies an hourly cost limit in US dollars for the Spot Instance. +For example, setting the `maxPrice` value to `2.50` limits the cost of the Spot Instance to USD 2.50 per hour. +When this value is not set, the maximum price defaults to the On-Demand Instance price. ++ +[WARNING] +==== +Setting a specific `maxPrice` value might increase the frequency of interruptions compared to using the default On-Demand Instance price. +It is strongly recommended to use the default On-Demand Instance price and to not set the maximum price for Spot Instances. +==== + +Interruptions can occur when using Spot Instances for the following reasons: + +* The instance price exceeds your maximum price +* The demand for Spot Instances increases +* The supply of Spot Instances decreases + +AWS gives a two-minute warning to the user when an interruption occurs. 
+{product-title} begins to remove the workloads from the affected instances when AWS issues the termination warning. + +When AWS terminates an instance, a termination handler running on the Spot Instance node deletes the machine resource. +To satisfy the compute machine set `replicas` quantity, the compute machine set creates a machine that requests a Spot Instance. +endif::aws[] + +ifeval::["{context}" == "cluster-api-config-options-aws"] +:!aws: +endif::[] diff --git a/modules/machine-feature-agnostic-options-label-gpu-autoscaler.adoc b/modules/machine-feature-agnostic-options-label-gpu-autoscaler.adoc index bea9be9863..3f891598b5 100644 --- a/modules/machine-feature-agnostic-options-label-gpu-autoscaler.adoc +++ b/modules/machine-feature-agnostic-options-label-gpu-autoscaler.adoc @@ -1,7 +1,5 @@ // Module included in the following assemblies: // -// * machine_management/applying-autoscaling.adoc -// * machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-aws.adoc :_mod-docs-content-type: CONCEPT [id="machine-feature-agnostic-options-label-gpu-autoscaler_{context}"] diff --git a/modules/machine-feature-aws-add-nvidia-gpu-node.adoc b/modules/machine-feature-aws-add-nvidia-gpu-node.adoc new file mode 100644 index 0000000000..de387470ed --- /dev/null +++ b/modules/machine-feature-aws-add-nvidia-gpu-node.adoc @@ -0,0 +1,65 @@ +// Module included in the following assemblies: +// +// * machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-aws.adoc + +:_mod-docs-content-type: CONCEPT +[id="machine-feature-aws-add-nvidia-gpu-node_{context}"] += GPU-enabled machine options + +You can deploy GPU-enabled compute machines on {aws-first}. +The following sample configuration uses an link:https://aws.amazon.com/ec2/instance-types/#Accelerated_Computing[{aws-short} G4dn instance type], which includes an NVIDIA Tesla T4 Tensor Core GPU, as an example. 
+ +For more information about supported instance types, see the following pages in the NVIDIA documentation: + +* link:https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/platform-support.html[NVIDIA GPU Operator Community support matrix] + +* link:https://docs.nvidia.com/ai-enterprise/latest/product-support-matrix/index.html[NVIDIA AI Enterprise support matrix] + +include::snippets/apply-machine-configuration-method.adoc[tag=method-machine-template-and-machine-set] + +// Cluster API machine template spec +.Sample GPU-enabled machine template configuration +[source,yaml] +---- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta2 +kind: AWSMachineTemplate +# ... +spec: + template: + spec: + instanceType: g4dn.xlarge <1> +# ... +---- +<1> Specifies a G4dn instance type. + +// Cluster API machine set spec +.Sample GPU-enabled machine set configuration +[source,yaml] +---- +apiVersion: cluster.x-k8s.io/v1beta1 +kind: MachineSet +metadata: + name: -gpu- <1> + namespace: openshift-cluster-api + labels: + cluster.x-k8s.io/cluster-name: +spec: + clusterName: + replicas: 1 + selector: + matchLabels: + test: example + cluster.x-k8s.io/cluster-name: + cluster.x-k8s.io/set-name: -gpu- <2> + template: + metadata: + labels: + test: example + cluster.x-k8s.io/cluster-name: + cluster.x-k8s.io/set-name: -gpu- <3> + node-role.kubernetes.io/: "" +# ... +---- +<1> Specifies a name that includes the `gpu` role. The name includes the cluster ID as a prefix and the region as a suffix. +<2> Specifies a selector label that matches the machine set name. +<3> Specifies a template label that matches the machine set name. 
diff --git a/modules/machine-feature-aws-dedicated-instances.adoc b/modules/machine-feature-aws-dedicated-instances.adoc new file mode 100644 index 0000000000..7b90eea764 --- /dev/null +++ b/modules/machine-feature-aws-dedicated-instances.adoc @@ -0,0 +1,33 @@ +// Module included in the following assemblies: +// +// * machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-aws.adoc + +:_mod-docs-content-type: CONCEPT +[id="machine-feature-aws-dedicated-instances_{context}"] += Dedicated Instance configuration options + +You can deploy machines that are backed by Dedicated Instances on {aws-first} clusters. + +Dedicated Instances run in a virtual private cloud (VPC) on hardware that is dedicated to a single customer. +These Amazon EC2 instances are physically isolated at the host hardware level. +The isolation of Dedicated Instances occurs even if the instances belong to different AWS accounts that are linked to a single payer account. +However, other instances that are not dedicated can share hardware with Dedicated Instances if they belong to the same AWS account. + +{product-title} supports instances with public or dedicated tenancy. + +include::snippets/apply-machine-configuration-method.adoc[tag=method-machine-template] + +.Sample Dedicated Instances configuration +[source,yaml] +---- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta2 +kind: AWSMachineTemplate +# ... +spec: + template: + spec: + tenancy: dedicated <1> +# ... +---- +<1> Specifies using instances with dedicated tenancy that run on single-tenant hardware. +If you do not specify this value, instances with public tenancy that run on shared hardware are used by default. 
\ No newline at end of file diff --git a/modules/machine-feature-aws-existing-placement-group.adoc b/modules/machine-feature-aws-existing-placement-group.adoc new file mode 100644 index 0000000000..a9efa9f6cb --- /dev/null +++ b/modules/machine-feature-aws-existing-placement-group.adoc @@ -0,0 +1,58 @@ +// Module included in the following assemblies: +// +// * machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-aws.adoc + +:_mod-docs-content-type: CONCEPT +[id="machine-feature-aws-existing-placement-group_{context}"] += Elastic Fabric Adapter instances and placement group options + +You can deploy compute machines on link:https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html[Elastic Fabric Adapter] (EFA) instances within an existing AWS placement group. + +EFA instances do not require placement groups, and you can use placement groups for purposes other than configuring an EFA. +The following example uses an EFA and placement group together to demonstrate a configuration that can improve network performance for machines within the specified placement group. + +include::snippets/apply-machine-configuration-method.adoc[tag=method-machine-template] + +.Sample EFA instance and placement group configuration +[source,yaml] +---- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta2 +kind: AWSMachineTemplate +# ... +spec: + template: + spec: + instanceType: # <1> + networkInterfaceType: efa # <2> + placementGroupName: # <3> + placementGroupPartition: # <4> +# ... +---- +<1> Specifies an instance type that link:https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html#efa-instance-types[supports EFAs]. +<2> Specifies the `efa` network interface type. +<3> Specifies the name of the existing AWS placement group to deploy machines in. +<4> Optional: Specifies the partition number of the existing AWS placement group where you want your machines deployed. 
+ +[NOTE] +==== +Ensure that the link:https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/placement-groups.html#limitations-placement-groups[rules and limitations] for the type of placement group that you create are compatible with your intended use case. +==== + +//// +The MAPI version of this has additional parameters in the providerSpec: + +---- +placement: + availabilityZone: # <3> + region: # <4> +---- +<3> Specifies the zone, for example, `us-east-1a`. +<4> Specifies the region, for example, `us-east-1`. + +Do we need to say anything specific about this, or is this just redundant with the failure domain? + +Note: +CAPI has networkInterfaceType: efa +MAPI has networkInterfaceType: EFA +Capitalization matters! +//// \ No newline at end of file diff --git a/modules/machine-feature-aws-imds-options.adoc b/modules/machine-feature-aws-imds-options.adoc new file mode 100644 index 0000000000..e43e4a3757 --- /dev/null +++ b/modules/machine-feature-aws-imds-options.adoc @@ -0,0 +1,60 @@ +// Module included in the following assemblies: +// +// * machine_management/cluster_api_machine_management/cluster_api_provider_configurations/cluster-api-config-options-aws.adoc + +:_mod-docs-content-type: CONCEPT +[id="machine-feature-aws-imds-options_{context}"] += Amazon EC2 Instance Metadata Service configuration options + +You can restrict the version of the Amazon EC2 Instance Metadata Service (IMDS) that machines on {aws-first} clusters use. +Machines can require the use of link:https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html[IMDSv2] (AWS documentation), or allow the use of IMDSv1 in addition to IMDSv2. + +//// +This is true but does not apply to TP clusters, reassess for Cluster API GA +[NOTE] +==== +To use IMDSv2 on AWS clusters that were created with {product-title} version 4.6 or earlier, you must update your boot image. +For more information, see "Updated boot images". 
+==== +//// + +include::snippets/apply-machine-configuration-method.adoc[tag=method-machine-template] + +[IMPORTANT] +==== +Before creating machines that require IMDSv2, ensure that any workloads that interact with the IMDS support IMDSv2. +==== + +.Sample IMDS configuration +[source,yaml] +---- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta2 +kind: AWSMachineTemplate +# ... +spec: + template: + spec: + instanceMetadataOptions: + httpEndpoint: enabled + httpPutResponseHopLimit: 1 <1> + httpTokens: optional <2> + instanceMetadataTags: disabled +# ... +---- +<1> Specifies the number of network hops allowed for IMDSv2 calls. +If no value is specified, this parameter is set to `1` by default. +<2> Specifies whether to require the use of IMDSv2. +If no value is specified, this parameter is set to `optional` by default. +The following values are valid: +`optional`:: Allow the use of both IMDSv1 and IMDSv2. +`required`:: Require IMDSv2. + +[NOTE] +==== +The Machine API does not support the `httpEndpoint`, `httpPutResponseHopLimit`, and `instanceMetadataTags` fields. +If you migrate a Cluster API machine template that uses this feature to a Machine API compute machine set, any Machine API machines that it creates will not have these fields and the underlying instances will not use these settings. +Any existing machines that the migrated machine set manages will retain these fields and the underlying instances will continue to use these settings. +==== + +Requiring the use of IMDSv2 might cause timeouts. +For more information, including mitigation strategies, see link:https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html#imds-considerations[Instance metadata access considerations] (AWS documentation). 
diff --git a/snippets/apply-machine-configuration-method.adoc b/snippets/apply-machine-configuration-method.adoc new file mode 100644 index 0000000000..0d5109df49 --- /dev/null +++ b/snippets/apply-machine-configuration-method.adoc @@ -0,0 +1,39 @@ +// Text snippet included in the following modules: +// +// * machine_management/cluster_api_machine_management/cluster_api_provider_configurations/modules/machine-feature-aws-existing-placement-group.adoc +// * machine_management/cluster_api_machine_management/cluster_api_provider_configurations/modules/machine-feature-aws-imds-options.adoc +// * machine_management/cluster_api_machine_management/cluster_api_provider_configurations/modules/machine-feature-agnostic-nonguaranteed-instances.adoc +// * +// * +// * +// * +// * +// * +// * +// * + +:_mod-docs-content-type: SNIPPET + +//Cluster API machine template +tag::method-machine-template[] +To deploy compute machines with your configuration, configure the appropriate values in a machine template YAML file. +Then, configure a machine set YAML file to reference the machine template when it deploys machines. +end::method-machine-template[] + +//Cluster API or Machine API machine set +tag::method-compute-machine-set[] +To deploy compute machines with your configuration, configure the appropriate values in a machine set YAML file to use when it deploys machines. +end::method-compute-machine-set[] + +//Cluster API machine template and machine set +tag::method-machine-template-and-machine-set[] +To deploy compute machines with your configuration, configure the appropriate values in a machine template YAML file and a machine set YAML file that references the machine template when it deploys machines. +end::method-machine-template-and-machine-set[] + +//Control plane machine set +tag::method-control-plane-machine-set[] +To deploy control plane machines with your configuration, configure the appropriate values in your control plane machine set YAML file. 
+ +* For clusters that use the default `RollingUpdate` update strategy, the control plane machine set propagates changes to your control plane configuration automatically. +* For clusters that use the `OnDelete` update strategy, you must replace your control plane machines manually. +end::method-control-plane-machine-set[]