// Module included in the following assemblies:
//
// * operators/user/das-dynamic-accelerator-slicer-operator.adoc

:_mod-docs-content-type: PROCEDURE
[id="das-operator-installing-web-console_{context}"]
= Installing the Dynamic Accelerator Slicer Operator using the web console

As a cluster administrator, you can install the Dynamic Accelerator Slicer (DAS) Operator using the {product-title} web console.

.Prerequisites

* You have access to an {product-title} cluster using an account with `cluster-admin` permissions.
* You have installed the required prerequisites:
** cert-manager Operator for Red Hat OpenShift
** Node Feature Discovery (NFD) Operator
** NVIDIA GPU Operator
** NodeFeatureDiscovery CR
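
If you want to confirm these prerequisites from the CLI, the following commands are a rough sketch; the `grep` pattern is illustrative and depends on the exact names of the Operators installed in your cluster:

[source,terminal]
----
$ oc get csv -A | grep -iE 'cert-manager|node-feature-discovery|gpu-operator'
$ oc get nodefeaturediscovery -A
----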

.Procedure

. Configure the NVIDIA GPU Operator for MIG support:
.. In the {product-title} web console, navigate to *Ecosystem* -> *Installed Operators*.
.. Select the *NVIDIA GPU Operator* from the list of installed operators.
.. Click the *ClusterPolicy* tab and then click *Create ClusterPolicy*.
.. In the YAML editor, replace the default content with the following cluster policy configuration to disable the default NVIDIA device plugin and enable MIG support:
+
[source,yaml]
----
apiVersion: nvidia.com/v1
kind: ClusterPolicy
metadata:
  name: gpu-cluster-policy
spec:
  daemonsets:
    rollingUpdate:
      maxUnavailable: "1"
    updateStrategy: RollingUpdate
  dcgm:
    enabled: true
  dcgmExporter:
    config:
      name: ""
    enabled: true
    serviceMonitor:
      enabled: true
  devicePlugin:
    config:
      default: ""
      name: ""
    enabled: false
    mps:
      root: /run/nvidia/mps
  driver:
    certConfig:
      name: ""
    enabled: true
    kernelModuleConfig:
      name: ""
    licensingConfig:
      configMapName: ""
      nlsEnabled: true
    repoConfig:
      configMapName: ""
    upgradePolicy:
      autoUpgrade: true
      drain:
        deleteEmptyDir: false
        enable: false
        force: false
        timeoutSeconds: 300
      maxParallelUpgrades: 1
      maxUnavailable: 25%
      podDeletion:
        deleteEmptyDir: false
        force: false
        timeoutSeconds: 300
      waitForCompletion:
        timeoutSeconds: 0
    useNvidiaDriverCRD: false
    useOpenKernelModules: false
    virtualTopology:
      config: ""
  gdrcopy:
    enabled: false
  gds:
    enabled: false
  gfd:
    enabled: true
  mig:
    strategy: mixed
  migManager:
    config:
      default: ""
      name: default-mig-parted-config
    enabled: true
    env:
      - name: WITH_REBOOT
        value: 'true'
      - name: MIG_PARTED_MODE_CHANGE_ONLY
        value: 'true'
  nodeStatusExporter:
    enabled: true
  operator:
    defaultRuntime: crio
    initContainer: {}
    runtimeClass: nvidia
    use_ocp_driver_toolkit: true
  sandboxDevicePlugin:
    enabled: true
  sandboxWorkloads:
    defaultWorkload: container
    enabled: false
  toolkit:
    enabled: true
    installDir: /usr/local/nvidia
  validator:
    plugin:
      env:
        - name: WITH_WORKLOAD
          value: "false"
    cuda:
      env:
        - name: WITH_WORKLOAD
          value: "false"
  vfioManager:
    enabled: true
  vgpuDeviceManager:
    enabled: true
  vgpuManager:
    enabled: false
----
.. Click *Create* to apply the cluster policy.
.. Navigate to *Workloads* -> *Pods* and select the `nvidia-gpu-operator` namespace to monitor the cluster policy deployment.
.. Wait for the NVIDIA GPU Operator cluster policy to reach the `Ready` state. You can monitor this by:
+
... Navigating to *Ecosystem* -> *Installed Operators* -> *NVIDIA GPU Operator*.
... Clicking the *ClusterPolicy* tab and checking that the status shows `ready`.
.. Verify that all pods in the NVIDIA GPU Operator namespace are running by selecting the `nvidia-gpu-operator` namespace and navigating to *Workloads* -> *Pods*.
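+
You can perform the same checks from the CLI. The following commands are a sketch; they assume the cluster policy is named `gpu-cluster-policy`, as in the preceding example:
+
[source,terminal]
----
$ oc get clusterpolicy gpu-cluster-policy -o jsonpath='{.status.state}{"\n"}'
$ oc get pods -n nvidia-gpu-operator
----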
.. Label nodes with MIG-capable GPUs to enable MIG mode:
+
... Navigate to *Compute* -> *Nodes*.
... Select a node that has MIG-capable GPUs.
... Click *Actions* -> *Edit Labels*.
... Add the label `nvidia.com/mig.config=all-enabled`.
... Click *Save*.
... Repeat for each node with MIG-capable GPUs.
+
[IMPORTANT]
====
After you apply the MIG label, the labeled nodes reboot to enable MIG mode. Wait for the nodes to come back online before proceeding.
====
.. Verify that MIG mode is enabled on the GPU nodes by checking that the `nvidia.com/mig.config=all-enabled` label appears in the *Labels* section. To locate the label, navigate to *Compute* -> *Nodes*, select the GPU node, and click the *Details* tab.
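+
If you prefer the CLI, the following commands are a sketch of the same labeling and verification; `<node_name>` is a placeholder for each node that has MIG-capable GPUs:
+
[source,terminal]
----
$ oc label node <node_name> nvidia.com/mig.config=all-enabled --overwrite
$ oc get nodes -l nvidia.com/mig.config=all-enabled
----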
. In the {product-title} web console, click *Ecosystem* -> *Software Catalog*.
. Search for *Dynamic Accelerator Slicer* or *DAS* in the filter box to locate the DAS Operator.
. Select the *Dynamic Accelerator Slicer* and click *Install*.
. On the *Install Operator* page:
.. Select *All namespaces on the cluster (default)* for the installation mode.
.. Select *Installed Namespace* -> *Operator recommended Namespace: Project das-operator*.
.. If creating a new namespace, enter `das-operator` as the namespace name.
.. Select an update channel.
.. Select *Automatic* or *Manual* for the approval strategy.
. Click *Install*.
. In the {product-title} web console, click *Ecosystem* -> *Installed Operators*.
. Select *DAS Operator* from the list.
. In the *Provided APIs* table column, click *DASOperator*. This takes you to the *DASOperator* tab of the *Operator details* page.
. Click *Create DASOperator*. This takes you to the *Create DASOperator* YAML view.
. In the YAML editor, paste the following example:
+
.Example `DASOperator` CR
[source,yaml]
----
apiVersion: inference.redhat.com/v1alpha1
kind: DASOperator
metadata:
  name: cluster <1>
  namespace: das-operator
spec:
  logLevel: Normal
  operatorLogLevel: Normal
  managementState: Managed
----
<1> The name of the `DASOperator` CR must be `cluster`.

. Click *Create*.

.Verification

To verify that the DAS Operator installed successfully:

. Navigate to the *Ecosystem* -> *Installed Operators* page.
. Ensure that *Dynamic Accelerator Slicer* is listed in the `das-operator` namespace with a *Status* of *Succeeded*.
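
You can perform the same check from the CLI. This is a minimal sketch; it assumes the Operator was installed into the `das-operator` namespace, as in the preceding steps:

[source,terminal]
----
$ oc get csv -n das-operator
----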

To verify that the `DASOperator` CR installed successfully:

* After you create the `DASOperator` CR, the web console brings you to the *DASOperator* list view. The *Status* field of the CR changes to *Available* when all of the components are running.
* Optional: You can verify that the `DASOperator` CR installed successfully by running the following command in the OpenShift CLI:
+
[source,terminal]
----
$ oc get dasoperator -n das-operator
----
+
.Example output
[source,terminal]
----
NAME      STATUS      AGE
cluster   Available   3m
----

[NOTE]
====
During installation, an Operator might display a *Failed* status. If the installation later succeeds with a *Succeeded* message, you can ignore the *Failed* message.
====

You can also verify the installation by checking the pods:

. Navigate to the *Workloads* -> *Pods* page and select the `das-operator` namespace.
. Verify that all DAS Operator component pods are running:
** `das-operator` pods (main operator controllers)
** `das-operator-webhook` pods (webhook servers)
** `das-scheduler` pods (scheduler plugins)
** `das-daemonset` pods (only on nodes with MIG-compatible GPUs)
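
You can also list these pods from the CLI. This is a minimal sketch; it assumes the default `das-operator` namespace used in the preceding steps:

[source,terminal]
----
$ oc get pods -n das-operator
----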

[NOTE]
====
The `das-daemonset` pods appear only on nodes that have MIG-compatible GPU hardware. If you do not see any `das-daemonset` pods, verify that your cluster has nodes with supported GPU hardware and that the NVIDIA GPU Operator is properly configured.
====

.Troubleshooting

Use the following procedure if the Operator does not appear to be installed:

. Navigate to the *Ecosystem* -> *Installed Operators* page and inspect the *Operator Subscriptions* and *Install Plans* tabs for any failures or errors under *Status*.
. Navigate to the *Workloads* -> *Pods* page and check the logs for pods in the `das-operator` namespace.
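+
The following commands are a CLI sketch of the same checks; `<pod_name>` is a placeholder for a pod name in the `das-operator` namespace:
+
[source,terminal]
----
$ oc get subscriptions,installplans -n das-operator
$ oc logs <pod_name> -n das-operator
----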