diff --git a/_attributes/common-attributes.adoc b/_attributes/common-attributes.adoc index 1a189e5a35..f4b4908e61 100644 --- a/_attributes/common-attributes.adoc +++ b/_attributes/common-attributes.adoc @@ -79,6 +79,7 @@ endif::[] :descheduler-operator: Kube Descheduler Operator :cli-manager: CLI Manager Operator :lws-operator: Leader Worker Set Operator +:js-operator: JobSet Operator //Kueue :kueue-name: Red{nbsp}Hat build of Kueue :kueue-op: Red Hat Build of Kueue Operator diff --git a/_topic_maps/_topic_map.yml b/_topic_maps/_topic_map.yml index e546f78c19..feb5b3aeda 100644 --- a/_topic_maps/_topic_map.yml +++ b/_topic_maps/_topic_map.yml @@ -3472,6 +3472,16 @@ Topics: File: lws-managing - Name: Uninstalling the Leader Worker Set Operator File: lws-uninstalling +- Name: JobSet Operator + Dir: jobset_operator + Distros: openshift-enterprise + Topics: + - Name: JobSet Operator overview + File: index + - Name: Installing the JobSet Operator + File: jobset-install + - Name: JobSet Operator release notes + File: jobset-release-notes --- Name: Edge computing Dir: edge_computing diff --git a/ai_workloads/jobset_operator/_attributes b/ai_workloads/jobset_operator/_attributes new file mode 120000 index 0000000000..20cc1dcb77 --- /dev/null +++ b/ai_workloads/jobset_operator/_attributes @@ -0,0 +1 @@ +../../_attributes/ \ No newline at end of file diff --git a/ai_workloads/jobset_operator/images b/ai_workloads/jobset_operator/images new file mode 120000 index 0000000000..847b03ed05 --- /dev/null +++ b/ai_workloads/jobset_operator/images @@ -0,0 +1 @@ +../../images/ \ No newline at end of file diff --git a/ai_workloads/jobset_operator/index.adoc b/ai_workloads/jobset_operator/index.adoc new file mode 100644 index 0000000000..f1b06fcc54 --- /dev/null +++ b/ai_workloads/jobset_operator/index.adoc @@ -0,0 +1,22 @@ +:_mod-docs-content-type: ASSEMBLY +include::_attributes/common-attributes.adoc[] +[id="js-about"] += {js-operator} overview + +:context: js-about + +toc::[] + +Use 
the {js-operator} on {product-title} to easily manage and run large-scale, coordinated workloads like high-performance computing (HPC) and AI training. The {js-operator} can help you gain fast recovery and efficient resource use through features like multi-template job support and stable networking. + +:FeatureName: {js-operator} +include::snippets/technology-preview.adoc[] + +// About the {js-operator} +include::modules/about-jobset.adoc[leveloffset=+1] + +[role="_additional-resources"] +[id="js-about_additional-resources"] +== Additional resources + +* link:https://jobset.sigs.k8s.io/docs/overview/[JobSet documentation (Kubernetes)] \ No newline at end of file diff --git a/ai_workloads/jobset_operator/jobset-install.adoc b/ai_workloads/jobset_operator/jobset-install.adoc new file mode 100644 index 0000000000..915b339cb1 --- /dev/null +++ b/ai_workloads/jobset_operator/jobset-install.adoc @@ -0,0 +1,16 @@ +:_mod-docs-content-type: ASSEMBLY +include::_attributes/common-attributes.adoc[] +[id="js-install"] += Installing the {js-operator} + +:context: js-install + +toc::[] + +Install the {js-operator} on {product-title} to enable management of large-scale, coordinated computing workloads, giving your applications a unified API and failure recovery. 
+ +:FeatureName: {js-operator} +include::snippets/technology-preview.adoc[] + +// Installing the {js-operator} +include::modules/installing-jobset.adoc[leveloffset=+1] \ No newline at end of file diff --git a/ai_workloads/jobset_operator/jobset-release-notes.adoc b/ai_workloads/jobset_operator/jobset-release-notes.adoc new file mode 100644 index 0000000000..6505251139 --- /dev/null +++ b/ai_workloads/jobset_operator/jobset-release-notes.adoc @@ -0,0 +1,18 @@ +:_mod-docs-content-type: ASSEMBLY +include::_attributes/common-attributes.adoc[] +[id="js-release-notes"] += {js-operator} release notes + +:context: js-release-notes + +toc::[] + +Track the development, features, and fixes for the {js-operator}, which manages coordinated, large-scale computing workloads on {product-title}. + +:FeatureName: {js-operator} +include::snippets/technology-preview.adoc[] + +For more information, see xref:../../ai_workloads/jobset_operator/index.adoc#js-about[About the {js-operator}]. + +//Release notes for JobSet Operator 0.1.0 +include::modules/js-rn-initial.adoc[leveloffset=+1] \ No newline at end of file diff --git a/ai_workloads/jobset_operator/modules b/ai_workloads/jobset_operator/modules new file mode 120000 index 0000000000..36719b9de7 --- /dev/null +++ b/ai_workloads/jobset_operator/modules @@ -0,0 +1 @@ +../../modules/ \ No newline at end of file diff --git a/ai_workloads/jobset_operator/snippets b/ai_workloads/jobset_operator/snippets new file mode 120000 index 0000000000..5a3f5add14 --- /dev/null +++ b/ai_workloads/jobset_operator/snippets @@ -0,0 +1 @@ +../../snippets/ \ No newline at end of file diff --git a/modules/about-jobset.adoc b/modules/about-jobset.adoc new file mode 100644 index 0000000000..5483c2a008 --- /dev/null +++ b/modules/about-jobset.adoc @@ -0,0 +1,22 @@ +// Module included in the following assemblies: +// +// * ai_workloads/jobset_operator/index.adoc + +:_mod-docs-content-type: CONCEPT +[id="js-about_{context}"] += About the {js-operator} + 
+[role="_abstract"] +Use the {js-operator} on {product-title} to manage large, distributed, and coordinated computing workloads, such as high-performance computing (HPC) or artificial intelligence (AI) training, and gain automatic stability, coordination, and failure recovery. + +The {js-operator} is based on the link:https://jobset.sigs.k8s.io/docs/overview/[JobSet] open source project. + +The {js-operator} is designed to manage a group of jobs as a single, coordinated unit. This is especially useful for fields like HPC and training massive AI models where you need a team of machines to run for hours or days. + +You can use the {js-operator} to solve problems that are too big or too complex for a standard {product-title} job. The {js-operator} provides coordination, stability, and recovery. + +The {js-operator} automatically sets up a stable headless service to get an IP address so workers can find and communicate with each other, even after a failure and restart. It also provides automatic failure recovery. If one small part of a large training job fails, the Operator can be configured to restart the entire group of workers from a saved checkpoint. This saves time and computing costs. + +The {js-operator} offers startup control, allowing you to define a specific startup sequence to ensure dependencies are met. For example, you can make sure that the leader is running before any workers attempt to connect. + +The {js-operator} makes managing large, distributed, and coordinated computing tasks on {product-title} easier, turning many individual components into one resilient and manageable system. 
\ No newline at end of file diff --git a/modules/installing-jobset.adoc b/modules/installing-jobset.adoc new file mode 100644 index 0000000000..1b002cb3ab --- /dev/null +++ b/modules/installing-jobset.adoc @@ -0,0 +1,62 @@ +// Module included in the following assemblies: +// +// * ai_workloads/jobset_operator/jobset-install.adoc + +:_mod-docs-content-type: PROCEDURE +[id="js-install_{context}"] += Installing the {js-operator} + +[role="_abstract"] +Install the {js-operator} on {product-title} using the web console to begin managing large-scale, coordinated computing workloads. + +.Prerequisites + +* You have access to the cluster with `cluster-admin` privileges. +* You have access to the {product-title} web console. +* You have installed the {cert-manager-operator}. + +.Procedure + +. Log in to the {product-title} web console. + +. Verify that the {cert-manager-operator} is installed. + +. Install the {js-operator}. +.. Navigate to *Ecosystem* -> *Software Catalog*. +.. Search for and select the *`openshift-operators`* project. +.. Enter *{js-operator}* into the filter box. +.. Select the *{js-operator}* and click *Install*. +.. On the *Install Operator* page: +... The *Update channel* is set to *tech-preview-v0.1*, which installs the latest stable release of {js-operator} 0.1. +... Under *Installation mode*, select *A specific namespace on the cluster*. +... Under *Installed Namespace*, select *Operator recommended Namespace: openshift-jobset-operator*. +... Under *Update approval*, select one of the following update strategies: ++ +* The *Automatic* strategy allows {olm-first} to automatically update the Operator when a new version is available. +* The *Manual* strategy requires a user with appropriate credentials to approve the Operator update. +... Click *Install*. + +. Create the custom resource (CR) for the {js-operator}: +.. Navigate to *Installed Operators* -> *{js-operator}*. +.. Navigate to the *Create JobSetOperator* page. +.. Set the name to *cluster*. +.. 
Set the *managementState* to *Managed*. +.. Under *Provided APIs*, click *Create instance* in the *JobSetOperator* pane. +.. Click *Create*. + +.Verification + +* Check that the {js-operator} and operand pods are running by entering the following command: ++ +[source,terminal] +---- +$ oc get pod -n openshift-jobset-operator +---- ++ +.Example output +[source,terminal] +---- +NAME READY STATUS RESTARTS AGE +jobset-controller-manager-5595547fb-b4g2x 1/1 Running 0 48s +jobset-operator-596cb848c6-q2dmp 1/1 Running 0 2m33s +---- diff --git a/modules/js-rn-initial.adoc b/modules/js-rn-initial.adoc new file mode 100644 index 0000000000..44dd3f1b7a --- /dev/null +++ b/modules/js-rn-initial.adoc @@ -0,0 +1,35 @@ +// Module included in the following assemblies: +// +// * ai_workloads/jobset_operator/jobset-release-notes.adoc + +// This release notes module is allowed to contain xrefs. It must only ever be included from one assembly. + +:_mod-docs-content-type: REFERENCE +[id="js-rn-initial_{context}"] += Release notes for {js-operator} 0.1.0 + +[role="_abstract"] +Review the new features and advisories for the initial Technology Preview release of {js-operator} 0.1.0. + +Issued: 4 November 2025 + +The following advisories are available for the {js-operator} 0.1.0: + +* link:https://access.redhat.com/errata/RHBA-2025:19431[RHBA-2025:19431] + +[id="js-rn-initial-new-features_{context}"] +== New features and enhancements + +* This is the initial Technology Preview release of the {js-operator}. + +// No bugs to list since this is the initial release +// [id="js-rn-0.1.0-bug-fixes_{context}"] +// == Bug fixes +// +// * TODO + +// No known issues to list +// [id="js-rn-0.1.0-known-issues_{context}"] +// == Known issues +// +// * TODO \ No newline at end of file