From 3b4a603037cee2ec6bd7d607002470d0fcac6e3a Mon Sep 17 00:00:00 2001
From: Andrea Hoffer
Date: Mon, 8 Dec 2025 11:22:42 -0500
Subject: [PATCH] OSDOCS-16981: CQA updates for AI workloads book intro and LWS docs

---
 ai_workloads/index.adoc                      |  1 +
 ai_workloads/leader_worker_set/index.adoc    |  3 +
 .../leader_worker_set/lws-managing.adoc      |  1 +
 .../leader_worker_set/lws-release-notes.adoc |  3 +
 .../leader_worker_set/lws-uninstalling.adoc  |  3 +-
 modules/ai-operators.adoc                    |  1 +
 modules/ai-rhoai.adoc                        |  1 +
 modules/lws-about.adoc                       |  3 +
 modules/lws-arch.adoc                        |  5 +-
 modules/lws-config.adoc                      | 80 ++++++++++++-------
 modules/lws-install-operator.adoc            |  3 +-
 modules/lws-remove-resources.adoc            |  3 +-
 modules/lws-rn-1.0.0.adoc                    |  3 +
 modules/lws-uninstall.adoc                   |  3 +-
 14 files changed, 80 insertions(+), 33 deletions(-)

diff --git a/ai_workloads/index.adoc b/ai_workloads/index.adoc
index 65e4157b2d..777bc26787 100644
--- a/ai_workloads/index.adoc
+++ b/ai_workloads/index.adoc
@@ -7,6 +7,7 @@ include::_attributes/common-attributes.adoc[]
 
 toc::[]
 
+[role="_abstract"]
 {product-title} provides a secure, scalable foundation for running artificial intelligence (AI) workloads across training, inference, and data science workflows.
 
 // Operators for running AI workloads
diff --git a/ai_workloads/leader_worker_set/index.adoc b/ai_workloads/leader_worker_set/index.adoc
index 1c3e08b4a2..5eb04d0ebe 100644
--- a/ai_workloads/leader_worker_set/index.adoc
+++ b/ai_workloads/leader_worker_set/index.adoc
@@ -7,6 +7,9 @@ include::_attributes/common-attributes.adoc[]
 
 toc::[]
 
+[role="_abstract"]
+Use the {lws-operator} to manage multi-node AI/ML inference deployments efficiently. The {lws-operator} treats groups of pods as one unit to simplify scaling, recovery, and updates for large workloads.
+
 Using large language models (LLMs) for AI/ML inference often requires significant compute resources, and workloads typically must be sharded across multiple nodes. This can make deployments complex, creating challenges around scaling, recovery from failures, and efficient pod placement.
 
 The {lws-operator} simplifies these multi-node deployments by treating a group of pods as a single, coordinated unit. It manages the lifecycle of each pod in the group, scales the entire group together, and performs updates and failure recovery at the group level to ensure consistency.
diff --git a/ai_workloads/leader_worker_set/lws-managing.adoc b/ai_workloads/leader_worker_set/lws-managing.adoc
index f710d4d84a..d055a5ce68 100644
--- a/ai_workloads/leader_worker_set/lws-managing.adoc
+++ b/ai_workloads/leader_worker_set/lws-managing.adoc
@@ -7,6 +7,7 @@ include::_attributes/common-attributes.adoc[]
 
 toc::[]
 
+[role="_abstract"]
 You can use the {lws-operator} to manage distributed inference workloads and process large-scale inference requests efficiently.
 
 // Installing the {lws-operator}
diff --git a/ai_workloads/leader_worker_set/lws-release-notes.adoc b/ai_workloads/leader_worker_set/lws-release-notes.adoc
index d8d1ec85f7..4cca49e12e 100644
--- a/ai_workloads/leader_worker_set/lws-release-notes.adoc
+++ b/ai_workloads/leader_worker_set/lws-release-notes.adoc
@@ -7,6 +7,9 @@ include::_attributes/common-attributes.adoc[]
 
 toc::[]
 
+[role="_abstract"]
+Review the {lws-operator} release notes to track its development and learn what is new and changed with each release.
+
 You can use the {lws-operator} to manage distributed inference workloads and process large-scale inference requests efficiently. These release notes track the development of the {lws-operator}.
diff --git a/ai_workloads/leader_worker_set/lws-uninstalling.adoc b/ai_workloads/leader_worker_set/lws-uninstalling.adoc
index 40059db45f..35da14e97f 100644
--- a/ai_workloads/leader_worker_set/lws-uninstalling.adoc
+++ b/ai_workloads/leader_worker_set/lws-uninstalling.adoc
@@ -7,7 +7,8 @@ include::_attributes/common-attributes.adoc[]
 
 toc::[]
 
-You can remove the {lws-operator} from {product-title} by uninstalling the Operator and removing its related resources.
+[role="_abstract"]
+If you no longer need the {lws-operator} in your cluster, you can uninstall the Operator and remove its related resources.
 
 // Uninstalling the {lws-operator}
 include::modules/lws-uninstall.adoc[leveloffset=+1]
diff --git a/modules/ai-operators.adoc b/modules/ai-operators.adoc
index 957c33d137..29315d8d45 100644
--- a/modules/ai-operators.adoc
+++ b/modules/ai-operators.adoc
@@ -6,6 +6,7 @@
 [id="ai-operators_{context}"]
 = Operators for running AI workloads
 
+[role="_abstract"]
 You can use Operators to run artificial intelligence (AI) and machine learning (ML) workloads on {product-title}. With Operators, you can build a customized environment that meets your specific AI/ML requirements while continuing to use {product-title} as the core platform for your applications.
 
 {product-title} provides several Operators that can help you run AI workloads:
diff --git a/modules/ai-rhoai.adoc b/modules/ai-rhoai.adoc
index 9b16b705ef..06e43c39a3 100644
--- a/modules/ai-rhoai.adoc
+++ b/modules/ai-rhoai.adoc
@@ -8,6 +8,7 @@
 
 // TODO: This needs approval from RHOAI team before it can be included
 
+[role="_abstract"]
 If your organization requires an integrated environment to develop, train, serve, test, and monitor AI/ML models and applications, consider {rhoai-full}. {rhoai-full} is a platform for data scientists and developers of artificial intelligence and machine learning (AI/ML) applications.
 
 {rhoai-full} builds on {product-title} and provides a preconfigured set of tools, accelerators, and other features to manage the full AI/ML lifecycle. This approach reduces the need to assemble and maintain individual Operators or components for AI workloads.
diff --git a/modules/lws-about.adoc b/modules/lws-about.adoc
index 008f12fb88..b4991a1277 100644
--- a/modules/lws-about.adoc
+++ b/modules/lws-about.adoc
@@ -6,6 +6,9 @@
 [id="lws-about_{context}"]
 = About the {lws-operator}
 
+[role="_abstract"]
+Use the {lws-operator} to deploy groups of pods as a single, manageable unit. This helps you run large AI/ML inference workloads, such as sharded large language models (LLMs).
+
 The {lws-operator} is based on the link:https://lws.sigs.k8s.io/[LeaderWorkerSet] open source project. `LeaderWorkerSet` is a custom Kubernetes API that can be used to deploy a group of pods as a unit. This is useful for artificial intelligence (AI) and machine learning (ML) inference workloads, where large language models (LLMs) are sharded across multiple nodes.
 
 With the `LeaderWorkerSet` API, pods are grouped into units consisting of one leader and multiple workers, all managed together as a single entity. Each pod in a group has a unique pod identity. Pods within a group are created in parallel and share identical lifecycle stages. Rollouts, rolling updates, and pod failure restarts are performed as a group.
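To make the grouping that `modules/lws-about.adoc` describes more concrete, the following minimal `LeaderWorkerSet` sketch shows how one leader and its workers are declared as a single unit. It uses only fields that also appear in the full example in `modules/lws-config.adoc` later in this patch; the resource name, image, and sizes are illustrative placeholders, not content from the patch itself.

[source,yaml]
----
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
  name: example-lws                           # illustrative name, not part of this patch
spec:
  replicas: 2                                 # number of leader-worker groups
  leaderWorkerTemplate:
    size: 3                                   # pods per group: 1 leader + 2 workers
    restartPolicy: RecreateGroupOnPodRestart  # restart the whole group if a pod fails
    leaderTemplate:                           # pod template for the leader pod
      spec:
        containers:
        - name: leader
          image: nginxinc/nginx-unprivileged:1.27
    workerTemplate:                           # pod template for the worker pods
      spec:
        containers:
        - name: worker
          image: nginxinc/nginx-unprivileged:1.27
----

With these values, the Operator would manage two groups of three pods each, and the pods in a group share rollout, update, and failure-recovery behavior. This matches the verification output shown later in `modules/lws-config.adoc`, for example leader `my-lws-0` with workers `my-lws-0-1` and `my-lws-0-2`.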
diff --git a/modules/lws-arch.adoc b/modules/lws-arch.adoc
index 45a814d191..5e268b55fa 100644
--- a/modules/lws-arch.adoc
+++ b/modules/lws-arch.adoc
@@ -6,7 +6,10 @@
 [id="lws-arch_{context}"]
 = LeaderWorkerSet architecture
 
-The following diagram shows how the `LeaderWorkerSet` API organizes groups of pods into a single unit, with one pod as the leader and the rest as the workers, to coordinate distributed workloads:
+[role="_abstract"]
+Review the LeaderWorkerSet architecture to learn how the `LeaderWorkerSet` API organizes groups of pods into a single unit, with one pod as the leader and the rest as the workers, to coordinate distributed workloads.
+
+The following diagram shows the LeaderWorkerSet architecture:
 
 .Leader worker set architecture
 image::587_OpenShift_lws_0925.png[Leader worker set architecture]
diff --git a/modules/lws-config.adoc b/modules/lws-config.adoc
index f666d8470f..791967b32b 100644
--- a/modules/lws-config.adoc
+++ b/modules/lws-config.adoc
@@ -6,6 +6,7 @@
 [id="lws-config_{context}"]
 = Deploying a leader worker set
 
+[role="_abstract"]
 You can use the {lws-operator} to deploy a leader worker set to assist with managing distributed workloads across nodes.
 
 .Prerequisites
@@ -29,20 +30,20 @@ apiVersion: leaderworkerset.x-k8s.io/v1
 kind: LeaderWorkerSet
 metadata:
   generation: 1
-  name: my-lws <1>
-  namespace: my-namespace <2>
+  name: my-lws
+  namespace: my-namespace
 spec:
   leaderWorkerTemplate:
-    leaderTemplate: <3>
+    leaderTemplate:
       metadata: {}
       spec:
         containers:
         - image: nginxinc/nginx-unprivileged:1.27
           name: leader
           resources: {}
-    restartPolicy: RecreateGroupOnPodRestart <4>
-    size: 3 <5>
-    workerTemplate: <6>
+    restartPolicy: RecreateGroupOnPodRestart
+    size: 3
+    workerTemplate:
       metadata: {}
       spec:
         containers:
@@ -53,24 +54,45 @@ spec:
             protocol: TCP
           resources: {}
   networkConfig:
-    subdomainPolicy: Shared <7>
-  replicas: 2 <8>
+    subdomainPolicy: Shared
+  replicas: 2
   rolloutStrategy:
     rollingUpdateConfiguration:
-      maxSurge: 1 <9>
+      maxSurge: 1
       maxUnavailable: 1
     type: RollingUpdate
   startupPolicy: LeaderCreated
 ----
-<1> Specify the name of the leader worker set resource.
-<2> Specify the namespace for the leader worker set to run in.
-<3> Specify the pod template for the leader pods.
-<4> Specify the restart policy for when pod failures occur. Allowed values are `RecreateGroupOnPodRestart` to restart the whole group or `None` to not restart the group.
-<5> Specify the number of pods to create for each group, including the leader pod. For example, a value of `3` creates 1 leader pod and 2 worker pods. The default value is `1`.
-<6> Specify the pod template for the worker pods.
-<7> Specify the policy to use when creating the headless service. Allowed values are `UniquePerReplica` or `Shared`. The default value is `Shared`.
-<8> Specify the number of replicas, or leader-worker groups. The default value is `1`.
-<9> Specify the maximum number of replicas that can be scheduled above the `replicas` value during rolling updates. The value can be specified as an integer or a percentage.
++
+where:
+
+`metadata.name`::
+Specifies the name of the leader worker set resource.
+
+`metadata.namespace`::
+Specifies the namespace for the leader worker set to run in.
+
+`spec.leaderWorkerTemplate.leaderTemplate`::
+Specifies the pod template for the leader pods.
+
+`spec.leaderWorkerTemplate.restartPolicy`::
+Specifies the restart policy to use when pod failures occur. Allowed values are `RecreateGroupOnPodRestart` to restart the whole group or `None` to not restart the group.
+
+`spec.leaderWorkerTemplate.size`::
+Specifies the number of pods to create for each group, including the leader pod. For example, a value of `3` creates 1 leader pod and 2 worker pods. The default value is `1`.
+
+`spec.leaderWorkerTemplate.workerTemplate`::
+Specifies the pod template for the worker pods.
+
+`spec.networkConfig.subdomainPolicy`::
+Specifies the policy to use when creating the headless service. Allowed values are `UniquePerReplica` or `Shared`. The default value is `Shared`.
+
+`spec.replicas`::
+Specifies the number of replicas, or leader-worker groups. The default value is `1`.
+
+`spec.rolloutStrategy.rollingUpdateConfiguration.maxSurge`::
+Specifies the maximum number of replicas that can be scheduled above the `replicas` value during rolling updates. The value can be specified as an integer or a percentage.
+
+
 For more information on all available fields to configure, see link:https://lws.sigs.k8s.io/docs/reference/leaderworkerset.v1/[LeaderWorkerSet API] upstream documentation.
@@ -94,15 +116,16 @@ $ oc get pods -n my-namespace
 [source,terminal]
 ----
 NAME         READY   STATUS    RESTARTS   AGE
-my-lws-0     1/1     Running   0          4s <1>
+my-lws-0     1/1     Running   0          4s
 my-lws-0-1   1/1     Running   0          3s
 my-lws-0-2   1/1     Running   0          3s
-my-lws-1     1/1     Running   0          7s <2>
+my-lws-1     1/1     Running   0          7s
 my-lws-1-1   1/1     Running   0          6s
 my-lws-1-2   1/1     Running   0          6s
 ----
-<1> The leader pod for the first group.
-<2> The leader pod for the second group.
++
+** `my-lws-0` is the leader pod for the first group.
+** `my-lws-1` is the leader pod for the second group.
 
 . Review the stateful sets by running the following command:
 +
 [source,terminal]
 ----
 $ oc get statefulsets
 ----
@@ -115,10 +138,11 @@
 [source,terminal]
 ----
 NAME       READY   AGE
-my-lws     4/4     111s <1>
-my-lws-0   2/2     57s <2>
-my-lws-1   2/2     60s <3>
+my-lws     4/4     111s
+my-lws-0   2/2     57s
+my-lws-1   2/2     60s
 ----
-<1> The leader stateful set for all leader-worker groups.
-<2> The worker stateful set for the first group.
-<3> The worker stateful set for the second group.
++
+** `my-lws` is the leader stateful set for all leader-worker groups.
+** `my-lws-0` is the worker stateful set for the first group.
+** `my-lws-1` is the worker stateful set for the second group.
diff --git a/modules/lws-install-operator.adoc b/modules/lws-install-operator.adoc
index d9a80f095d..687f50e38f 100644
--- a/modules/lws-install-operator.adoc
+++ b/modules/lws-install-operator.adoc
@@ -6,7 +6,8 @@
 [id="lws-install-operator_{context}"]
 = Installing the {lws-operator}
 
-You can use the web console to install the {lws-operator}.
+[role="_abstract"]
+You can install the {lws-operator} through the {product-title} web console to begin managing distributed AI workloads.
 
 .Prerequisites
diff --git a/modules/lws-remove-resources.adoc b/modules/lws-remove-resources.adoc
index 7d55150458..057bae35aa 100644
--- a/modules/lws-remove-resources.adoc
+++ b/modules/lws-remove-resources.adoc
@@ -6,7 +6,8 @@
 [id="lws-remove-resources_{context}"]
 = Uninstalling {lws-operator} resources
 
-Optionally, after uninstalling the {lws-operator}, you can remove its related resources from your cluster.
+[role="_abstract"]
+Optionally, remove custom resources (CRs) and the associated namespace after the {lws-operator} is uninstalled. This cleans up all remaining leader worker set artifacts.
 
 .Prerequisites
diff --git a/modules/lws-rn-1.0.0.adoc b/modules/lws-rn-1.0.0.adoc
index 0893ef03d4..ecd9777bc9 100644
--- a/modules/lws-rn-1.0.0.adoc
+++ b/modules/lws-rn-1.0.0.adoc
@@ -8,6 +8,9 @@
 [id="lws-rn-1.0.0_{context}"]
 = Release notes for {lws-operator} 1.0.0
 
+[role="_abstract"]
+Review the release notes for {lws-operator} 1.0.0 to learn what is new and updated with this release.
+
 Issued: 18 September 2025
 
 The following advisories are available for the {lws-operator} 1.0.0:
diff --git a/modules/lws-uninstall.adoc b/modules/lws-uninstall.adoc
index 42601f1de0..a3d7b3bc6f 100644
--- a/modules/lws-uninstall.adoc
+++ b/modules/lws-uninstall.adoc
@@ -6,7 +6,8 @@
 [id="lws-uninstall_{context}"]
 = Uninstalling the {lws-operator}
 
-You can use the web console to uninstall the {lws-operator}.
+[role="_abstract"]
+You can use the web console to uninstall the {lws-operator} if you no longer need the Operator in your cluster.
 
 .Prerequisites