From 04c2e56ad7949b112270c20f69448d0d8aa5198b Mon Sep 17 00:00:00 2001 From: fjcloud Date: Wed, 11 Sep 2024 13:27:50 +0200 Subject: [PATCH 01/11] initial commit ollama+openwebui --- content/_index.md | 1 + content/misc/_index.md | 3 +- content/misc/ollama-openwebui/index.md | 129 +++++++++++++++++++++++++ 3 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 content/misc/ollama-openwebui/index.md diff --git a/content/_index.md b/content/_index.md index a21217317..e5ee8add6 100644 --- a/content/_index.md +++ b/content/_index.md @@ -181,6 +181,7 @@ description: "Step-by-step tutorials from Red Hat experts to help you get the mo * [Deploy OpenShift Advanced Data Protection on a ROSA STS cluster](/experts/misc/oadp/rosa-sts/) * [Azure DevOps with Managed OpenShift](/experts/misc/azure-dev-ops-with-managed-openshift/) * [Configuring OpenShift Dev Spaces to serve Custom Domains](/experts/misc/devspaces-custom-domain) +* [Deploying and Running Ollama and Open WebUI in a ROSA Cluster with GPU inference](./experts/misc/ollama-openwebui) ### Applications diff --git a/content/misc/_index.md b/content/misc/_index.md index c91a0b7e1..3918698de 100644 --- a/content/misc/_index.md +++ b/content/misc/_index.md @@ -19,4 +19,5 @@ skipMetadata: true * [Sharing Common Images](./common-images-namespace) * [Stop Default Router from Serving Custom Domains](./default-router-custom-domain) * [Configuring OpenShift Dev Spaces to serve Custom Domains](./devspaces-custom-domain) -* [Running and Deploying LLMs using Red Hat OpenShift AI on ROSA cluster and Storing the Model in Amazon S3 Bucket](./rhoai-s3) \ No newline at end of file +* [Running and Deploying LLMs using Red Hat OpenShift AI on ROSA cluster and Storing the Model in Amazon S3 Bucket](./rhoai-s3) +* [Deploying and Running Ollama and Open WebUI in a ROSA Cluster with GPU inference](./ollama-openwebui) diff --git a/content/misc/ollama-openwebui/index.md b/content/misc/ollama-openwebui/index.md new file mode 100644 index 000000000..feebd141f --- /dev/null +++ b/content/misc/ollama-openwebui/index.md @@ -0,0 +1,129 @@ +--- +date: '2024-09-11' +title: Deploy Ollama and OpenWebUI on ROSA with Graviton GPUs +tags: ["AWS", "ROSA", "GPU", "Ollama", "OpenWebUI"] +aliases: ["/docs/misc/ollama-openwebui-graviton-gpu"] +authors: + - Florian Jacquin +--- + +Red Hat OpenShift Service on AWS (ROSA) provides a managed OpenShift environment that can leverage AWS's GPU instances. This guide will walk you through deploying Ollama and OpenWebUI on ROSA using Graviton instances with GPU for inferences. + +## Prerequisites + +* A Red Hat OpenShift on AWS (ROSA) 4.16+ cluster +* The OC CLI +* The ROSA CLI + +## Set up GPU-enabled Machine Pool + +First we need to check availability of our instance type used here (g5g.2xlarge), it should be in same region of the cluster. 
+ +```bash +for region in $(aws ec2 describe-regions --query 'Regions[].RegionName' --output text); do + echo "Region: $region" + aws ec2 describe-instance-type-offerings --location-type availability-zone \ + --filters Name=instance-type,Values=g5g.2xlarge --region $region \ + --query 'InstanceTypeOfferings[].Location' --output table + echo "" +done +``` + +And then we can create a machine pool with GPU-enabled instances, in our example i use eu-central-1c AZ; this is the only place where you can find spot instance g5g.2xlarge in EU at the moment: + +```bash +rosa create machine-pool -c $CLUSTER_NAME --name gpu --replicas=1 --availability-zone eu-central-1c --instance-type g5g.2xlarge --use-spot-instances +``` + +This command creates a machine pool named "gpu" with one replica using the g5g.2xlarge spot instance type, which is a Graviton-based CPU instance with Nvidia T4 16GB GPU. A.K.A best performance/price at the moment. (0.2610$/h) + +## Deploy Required Operators + +We'll use kustomize to deploy the necessary operators thanks to this awesome repository provided by Red Hat COP (Communnity of Practices) https://github.com/redhat-cop/gitops-catalog + +1. Node Feature Discovery (NFD) Operator: + ```bash + oc apply -k https://github.com/redhat-cop/gitops-catalog/nfd/operator/overlays/stable + ``` + The NFD Operator detects hardware features and configuration in your cluster. + +2. GPU Operator: + ```bash + oc apply -k https://github.com/redhat-cop/gitops-catalog/gpu-operator-certified/operator/overlays/stable + ``` + The GPU Operator manages NVIDIA GPUs drivers in your cluster. + +## Create Operator Instances + +After the operators are installed, create their instances: + +1. NFD Instance: + ```bash + oc apply -k https://github.com/redhat-cop/gitops-catalog/nfd/instance/overlays/only-nvidia + ``` + This creates an NFD instance configured for NVIDIA GPUs. + +2. GPU Operator Instance: + ```bash + oc apply -k https://github.com/redhat-cop/gitops-catalog/gpu-operator-certified/instance/overlays/aws + ``` + This creates a GPU Operator instance configured for AWS. + +## Deploy Ollama and OpenWebUI + +Now, let's deploy Ollama for inference and OpenWebUI for interacting with the LLM: + +1. Create a new project: + ```bash + oc new-project llm + ``` + +2. Deploy Ollama: + ```bash + oc new-app docker.io/ollama/ollama:0.3.10 --import-mode=PreserveOriginal + oc patch deployment ollama -p '{"spec":{"strategy":{"type":"Recreate"}}}' + oc set volume deployment/ollama --add --type=pvc --claim-size=50Gi --mount-path=/.ollama --name=config + oc set resources deployment/ollama --limits=nvidia.com/gpu=1 + ``` + This deploys Ollama, sets up persistent storage, and allocates a GPU to the deployment. + +3. Deploy OpenWebUI: + ```bash + oc new-app ghcr.io/open-webui/open-webui:0.3.19 -e WEBUI_SECRET_KEY=secret -e OLLAMA_BASE_URL=http://ollama:11434 --import-mode=PreserveOriginal + oc set volume deployment/open-webui --add --type=pvc --claim-size=5Gi --mount-path=/app/backend/data --name=data + oc set volume deployment/open-webui --add --type=emptyDir --mount-path=/app/backend/static --name=static + ``` + This deploys OpenWebUI and sets up the necessary storage and environment variables. + +4. Create a route for OpenWebUI: + ```bash + oc create route edge --service=open-webui + ``` + This creates an edge-terminated route to access OpenWebUI. + +## Accessing OpenWebUI + +After deploying OpenWebUI, follow these steps to access and configure it: + +1. Get the route URL: + ```bash + oc get route open-webui + ``` + +2. 
Open the URL in your web browser. You should see the OpenWebUI login page. + +3. Initial Setup: + - The first time you access OpenWebUI, you'll need to register. + - Choose a strong password for the admin account. + +4. Configuring Models: + - Once logged in, go to the "Models" section to download and configure the LLMs you want to use. + - Start with a smaller model to test your setup before moving to larger, more resource-intensive models. + +5. Testing Your Setup: + - Create a new chat and select one of the models you've configured. + - Try sending a test prompt to ensure everything is working correctly. + +## Conclusion + +You now have Ollama and OpenWebUI deployed on your ROSA cluster, leveraging Graviton GPU instances for inference. This setup allows you to run and interact with large language models efficiently using the power of AWS's GPU instances within a managed OpenShift environment. This approach represents the best of both worlds: the reliability and support of a managed OpenShift service, combined with the innovation and rapid advancement of the open-source AI community. It allows organizations to stay at the forefront of AI technology while maintaining the security, compliance, and operational standards required in enterprise environments. From 6357010a87d23adc2ef814b982ff2ac6c55fbba4 Mon Sep 17 00:00:00 2001 From: fjcloud Date: Wed, 11 Sep 2024 17:15:20 +0200 Subject: [PATCH 02/11] modifications --- content/misc/ollama-openwebui/index.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/content/misc/ollama-openwebui/index.md b/content/misc/ollama-openwebui/index.md index feebd141f..340d362a7 100644 --- a/content/misc/ollama-openwebui/index.md +++ b/content/misc/ollama-openwebui/index.md @@ -1,6 +1,6 @@ --- date: '2024-09-11' -title: Deploy Ollama and OpenWebUI on ROSA with Graviton GPUs +title: Deploying and Running Ollama and Open WebUI in a ROSA Cluster with GPUs tags: ["AWS", "ROSA", "GPU", "Ollama", "OpenWebUI"] aliases: ["/docs/misc/ollama-openwebui-graviton-gpu"] authors: @@ -17,7 +17,7 @@ Red Hat OpenShift Service on AWS (ROSA) provides a managed OpenShift environment ## Set up GPU-enabled Machine Pool -First we need to check availability of our instance type used here (g5g.2xlarge), it should be in same region of the cluster. +First we need to check availability of our instance type used here (g5g.2xlarge), it should be in same region of the cluster. Note you can use also x86_64 based instance like g4dn*. ```bash for region in $(aws ec2 describe-regions --query 'Regions[].RegionName' --output text); do @@ -35,11 +35,13 @@ And then we can create a machine pool with GPU-enabled instances, in our example rosa create machine-pool -c $CLUSTER_NAME --name gpu --replicas=1 --availability-zone eu-central-1c --instance-type g5g.2xlarge --use-spot-instances ``` -This command creates a machine pool named "gpu" with one replica using the g5g.2xlarge spot instance type, which is a Graviton-based CPU instance with Nvidia T4 16GB GPU. A.K.A best performance/price at the moment. (0.2610$/h) +This command creates a machine pool named "gpu" with one replica using the g5g.2xlarge spot instance, which is a Graviton-based CPU instance (ARM64) with Nvidia T4 16GB GPU. A.K.A best performance/price at the moment. (0.2610$/h) + +Note that mixed architecture for nodes is available only on HCP since 4.16. 
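Before moving on, it can be worth confirming that the machine pool was created and that the GPU node has joined the cluster. This is a suggested check rather than part of the original steps; it assumes the `$CLUSTER_NAME` variable used in the `rosa create machine-pool` command above, and filters nodes by the standard `node.kubernetes.io/instance-type` label:

```bash
# Hypothetical verification step (not in the original guide).
# List machine pools and confirm the "gpu" pool reports the expected replica count:
rosa list machine-pools -c $CLUSTER_NAME

# Once the spot instance has booted (usually a few minutes), the new node should
# show up when filtering on the well-known instance-type label:
oc get nodes -l node.kubernetes.io/instance-type=g5g.2xlarge
```

If no node appears after several minutes, spot capacity in the chosen availability zone may be exhausted, and a different zone or on-demand instances may be needed.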
## Deploy Required Operators -We'll use kustomize to deploy the necessary operators thanks to this awesome repository provided by Red Hat COP (Communnity of Practices) https://github.com/redhat-cop/gitops-catalog +We'll use kustomize to deploy the necessary operators thanks to this repository provided by Red Hat COP (Community of Practices) https://github.com/redhat-cop/gitops-catalog 1. Node Feature Discovery (NFD) Operator: ```bash @@ -61,7 +63,7 @@ After the operators are installed, create their instances: ```bash oc apply -k https://github.com/redhat-cop/gitops-catalog/nfd/instance/overlays/only-nvidia ``` - This creates an NFD instance configured for NVIDIA GPUs. + This creates an NFD instance for cluster. 2. GPU Operator Instance: ```bash From 545f89dbebd80dea64c186586fcb853e0681bc1e Mon Sep 17 00:00:00 2001 From: fjcloud Date: Thu, 12 Sep 2024 10:37:12 +0200 Subject: [PATCH 03/11] add verifications steps and scaling --- content/misc/ollama-openwebui/index.md | 87 +++++++++++++++++++++----- 1 file changed, 70 insertions(+), 17 deletions(-) diff --git a/content/misc/ollama-openwebui/index.md b/content/misc/ollama-openwebui/index.md index 340d362a7..e20a28384 100644 --- a/content/misc/ollama-openwebui/index.md +++ b/content/misc/ollama-openwebui/index.md @@ -2,46 +2,45 @@ date: '2024-09-11' title: Deploying and Running Ollama and Open WebUI in a ROSA Cluster with GPUs tags: ["AWS", "ROSA", "GPU", "Ollama", "OpenWebUI"] -aliases: ["/docs/misc/ollama-openwebui-graviton-gpu"] +aliases: ["/docs/misc/ollama-openwebui"] authors: - Florian Jacquin --- -Red Hat OpenShift Service on AWS (ROSA) provides a managed OpenShift environment that can leverage AWS's GPU instances. This guide will walk you through deploying Ollama and OpenWebUI on ROSA using Graviton instances with GPU for inferences. +Red Hat OpenShift Service on AWS (ROSA) provides a managed OpenShift environment that can leverage AWS GPU instances. This guide will walk you through deploying Ollama and OpenWebUI on ROSA using instances with GPU for inferences. ## Prerequisites -* A Red Hat OpenShift on AWS (ROSA) 4.16+ cluster -* The OC CLI -* The ROSA CLI +* A Red Hat OpenShift on AWS (ROSA classic or HCP) 4.14+ cluster +* OC CLI +* ROSA CLI ## Set up GPU-enabled Machine Pool -First we need to check availability of our instance type used here (g5g.2xlarge), it should be in same region of the cluster. Note you can use also x86_64 based instance like g4dn*. +First we need to check availability of our instance type used here (g4dn.xlarge), it should be in same region of the cluster. Note you can use also Graviton based instance (ARM64) like g5g* but only on HCP 4.16+ cluster. 
```bash for region in $(aws ec2 describe-regions --query 'Regions[].RegionName' --output text); do echo "Region: $region" aws ec2 describe-instance-type-offerings --location-type availability-zone \ - --filters Name=instance-type,Values=g5g.2xlarge --region $region \ + --filters Name=instance-type,Values=g4dn.xlarge --region $region \ --query 'InstanceTypeOfferings[].Location' --output table echo "" done ``` -And then we can create a machine pool with GPU-enabled instances, in our example i use eu-central-1c AZ; this is the only place where you can find spot instance g5g.2xlarge in EU at the moment: +And then we can create a machine pool with GPU-enabled instances, in our example i use eu-central-1c AZ: ```bash -rosa create machine-pool -c $CLUSTER_NAME --name gpu --replicas=1 --availability-zone eu-central-1c --instance-type g5g.2xlarge --use-spot-instances +export CLUSTER_NAME=mycluster +rosa create machine-pool -c $CLUSTER_NAME --name gpu --replicas=1 --availability-zone eu-central-1c --instance-type g4dn.xlarge --use-spot-instances ``` -This command creates a machine pool named "gpu" with one replica using the g5g.2xlarge spot instance, which is a Graviton-based CPU instance (ARM64) with Nvidia T4 16GB GPU. A.K.A best performance/price at the moment. (0.2610$/h) - -Note that mixed architecture for nodes is available only on HCP since 4.16. +This command creates a machine pool named "gpu" with one replica using the g4dn.xlarge spot instance, which is x86_64 instance with Nvidia T4 16GB GPU. ## Deploy Required Operators -We'll use kustomize to deploy the necessary operators thanks to this repository provided by Red Hat COP (Community of Practices) https://github.com/redhat-cop/gitops-catalog +We'll use kustomize to deploy the necessary operators thanks to this repository provided by Red Hat COP (Community of Practices) [link](https://github.com/redhat-cop/gitops-catalog) 1. Node Feature Discovery (NFD) Operator: ```bash @@ -83,7 +82,10 @@ Now, let's deploy Ollama for inference and OpenWebUI for interacting with the LL 2. Deploy Ollama: ```bash oc new-app docker.io/ollama/ollama:0.3.10 --import-mode=PreserveOriginal - oc patch deployment ollama -p '{"spec":{"strategy":{"type":"Recreate"}}}' + oc patch deployment ollama --type=json -p '[ + {"op": "remove", "path": "/spec/strategy/rollingUpdate"}, + {"op": "replace", "path": "/spec/strategy/type", "value": "Recreate"} + ]' oc set volume deployment/ollama --add --type=pvc --claim-size=50Gi --mount-path=/.ollama --name=config oc set resources deployment/ollama --limits=nvidia.com/gpu=1 ``` @@ -103,6 +105,37 @@ Now, let's deploy Ollama for inference and OpenWebUI for interacting with the LL ``` This creates an edge-terminated route to access OpenWebUI. +## Verify deployment + +1. All nvidia pods should be running or completed + ```bash + oc get pods -n nvidia-gpu-operator + ``` + +2. All pods of llm namespace should be running + ```bash + oc get pods -n llm + ``` + +3. 
Check logs of ollama, it should detect inference compute card + ```bash + oc logs -l deployment=ollama + time=2024-09-12T07:28:40.446Z level=INFO source=images.go:753 msg="total blobs: 0" + time=2024-09-12T07:28:40.446Z level=INFO source=images.go:760 msg="total unused blobs removed: 0" + time=2024-09-12T07:28:40.446Z level=INFO source=routes.go:1172 msg="Listening on [::]:11434 (version 0.3.10)" + time=2024-09-12T07:28:40.446Z level=INFO source=payload.go:30 msg="extracting embedded files" dir=/tmp/ollama1403693285/runners + time=2024-09-12T07:28:53.779Z level=INFO source=payload.go:44 msg="Dynamic LLM libraries [cuda_v12 rocm_v60102 cpu cpu_avx cpu_avx2 cuda_v11]" + time=2024-09-12T07:28:53.779Z level=INFO source=gpu.go:200 msg="looking for compatible GPUs" + time=2024-09-12T07:28:54.324Z level=INFO source=types.go:107 msg="inference compute" id=GPU-51dedb8e-2306-b077-67c1-774b4206c8da library=cuda variant=v12 compute=7.5 driver=12.4 name="Tesla T4" total="14.6 GiB" available="14.5 GiB" + ``` +## Download a model + +1. Download llama3.1 8B using Ollama CLI + ```bash + oc exec svc/ollama -- ollama pull llama3.1 + ``` + You can check all models available on [https://ollama.com/library](https://ollama.com/library) + ## Accessing OpenWebUI After deploying OpenWebUI, follow these steps to access and configure it: @@ -119,13 +152,33 @@ After deploying OpenWebUI, follow these steps to access and configure it: - Choose a strong password for the admin account. 4. Configuring Models: - - Once logged in, go to the "Models" section to download and configure the LLMs you want to use. - - Start with a smaller model to test your setup before moving to larger, more resource-intensive models. + - Once logged in, go to the "Models" section to choose the LLMs you want to use. 5. Testing Your Setup: - Create a new chat and select one of the models you've configured. - Try sending a test prompt to ensure everything is working correctly. +## Scaling + +If you want to give best experience for multiple users, for improving response time and token/s you can scale Ollama app. + +Note that here you should use EFS (RWX access) instead or EBS (RWO access) for storage of ollama models, you can install EFS operator using [this tutorial](https://cloud.redhat.com/experts/rosa/aws-efs/) + +1. Add new GPU node to machine pool + ```bash + rosa edit machine-pool -c $CLUSTER_NAME gpu --replicas=2 + ``` + +2. Change storage type for ollama app for using EFS + ```bash + oc set volume deployment/ollama --add --claim-class=efs-sc --type=pvc --claim-size=50Gi --mount-path=/.ollama --name=config + ``` + +3. Scale ollama deployment + ```bash + oc scale deployment/ollama --replicas=2 + ``` + ## Conclusion -You now have Ollama and OpenWebUI deployed on your ROSA cluster, leveraging Graviton GPU instances for inference. This setup allows you to run and interact with large language models efficiently using the power of AWS's GPU instances within a managed OpenShift environment. This approach represents the best of both worlds: the reliability and support of a managed OpenShift service, combined with the innovation and rapid advancement of the open-source AI community. It allows organizations to stay at the forefront of AI technology while maintaining the security, compliance, and operational standards required in enterprise environments. +You now have Ollama and OpenWebUI deployed on your ROSA cluster, leveraging AWS GPU instances for inference. 
This setup allows you to run and interact with large language models efficiently using the power of AWS's GPU instances within a managed OpenShift environment. This approach represents the best of both worlds: the reliability and support of a managed OpenShift service and AWS, combined with the innovation and rapid advancement of the open-source AI community. It allows organizations to stay at the forefront of AI technology while maintaining the security, compliance, and operational standards required in enterprise environments. From 32f98b7f85e60b1bc09c01d5e366387192b602d4 Mon Sep 17 00:00:00 2001 From: fjcloud Date: Thu, 12 Sep 2024 10:43:56 +0200 Subject: [PATCH 04/11] add open webui docs / features --- content/misc/ollama-openwebui/index.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/content/misc/ollama-openwebui/index.md b/content/misc/ollama-openwebui/index.md index e20a28384..cafa27c7b 100644 --- a/content/misc/ollama-openwebui/index.md +++ b/content/misc/ollama-openwebui/index.md @@ -145,7 +145,7 @@ After deploying OpenWebUI, follow these steps to access and configure it: oc get route open-webui ``` -2. Open the URL in your web browser. You should see the OpenWebUI login page. +2. Open the URL in your web browser. You should see the OpenWebUI login page. [https://docs.openwebui.com/](https://docs.openwebui.com/) 3. Initial Setup: - The first time you access OpenWebUI, you'll need to register. @@ -158,6 +158,14 @@ After deploying OpenWebUI, follow these steps to access and configure it: - Create a new chat and select one of the models you've configured. - Try sending a test prompt to ensure everything is working correctly. +6. Discover OpenWeb UI! You get lot of feature like : + - Model Builder + - Local and Remote RAG Integration + - Web Browsing Capabilities + - Role-Based Access Control (RBAC) + more here : [https://docs.openwebui.com/features](https://docs.openwebui.com/features) + + ## Scaling If you want to give best experience for multiple users, for improving response time and token/s you can scale Ollama app. 
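Before adding replicas, it can be useful to check whether the existing GPU is actually the bottleneck. The following is a suggested check, not part of the original steps; it assumes the NVIDIA container runtime has injected the `nvidia-smi` utility into the Ollama pod, which is the usual behaviour when the GPU Operator manages the node:

```bash
# Hypothetical pre-scaling check (not in the original guide).
# Inspect GPU memory usage and utilization inside the running Ollama pod
# while a prompt is being processed:
oc exec deployment/ollama -n llm -- nvidia-smi
```

If utilization stays low while responses are slow, the limiting factor may be elsewhere (model size, CPU, or storage) and adding GPU replicas may not help.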
From 146d01c7ec69170eba5036db59cee4e3f0f47507 Mon Sep 17 00:00:00 2001 From: fjcloud Date: Thu, 12 Sep 2024 10:51:37 +0200 Subject: [PATCH 05/11] fix path --- content/_index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/_index.md b/content/_index.md index e5ee8add6..0290d855e 100644 --- a/content/_index.md +++ b/content/_index.md @@ -181,7 +181,7 @@ description: "Step-by-step tutorials from Red Hat experts to help you get the mo * [Deploy OpenShift Advanced Data Protection on a ROSA STS cluster](/experts/misc/oadp/rosa-sts/) * [Azure DevOps with Managed OpenShift](/experts/misc/azure-dev-ops-with-managed-openshift/) * [Configuring OpenShift Dev Spaces to serve Custom Domains](/experts/misc/devspaces-custom-domain) -* [Deploying and Running Ollama and Open WebUI in a ROSA Cluster with GPU inference](./experts/misc/ollama-openwebui) +* [Deploying and Running Ollama and Open WebUI in a ROSA Cluster with GPUs](/experts/misc/ollama-openwebui) ### Applications From 0b18f7cfb7ecebde03136405f9417591a53f41bf Mon Sep 17 00:00:00 2001 From: fjcloud Date: Thu, 12 Sep 2024 11:00:33 +0200 Subject: [PATCH 06/11] add uninstall + fix typo --- content/misc/ollama-openwebui/index.md | 30 ++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/content/misc/ollama-openwebui/index.md b/content/misc/ollama-openwebui/index.md index cafa27c7b..a234342cc 100644 --- a/content/misc/ollama-openwebui/index.md +++ b/content/misc/ollama-openwebui/index.md @@ -12,7 +12,7 @@ Red Hat OpenShift Service on AWS (ROSA) provides a managed OpenShift environment ## Prerequisites * A Red Hat OpenShift on AWS (ROSA classic or HCP) 4.14+ cluster -* OC CLI +* OC CLI (Admin access to cluster) * ROSA CLI ## Set up GPU-enabled Machine Pool @@ -173,20 +173,46 @@ If you want to give best experience for multiple users, for improving response t Note that here you should use EFS (RWX access) instead or EBS (RWO access) for storage of ollama models, you can install EFS operator using [this tutorial](https://cloud.redhat.com/experts/rosa/aws-efs/) 1. Add new GPU node to machine pool + ```bash rosa edit machine-pool -c $CLUSTER_NAME gpu --replicas=2 ``` 2. Change storage type for ollama app for using EFS + ```bash oc set volume deployment/ollama --add --claim-class=efs-sc --type=pvc --claim-size=50Gi --mount-path=/.ollama --name=config ``` 3. Scale ollama deployment + ```bash oc scale deployment/ollama --replicas=2 ``` +## Uninstalling + +1. Delete llm namespace + ```bash + oc delete project llm + ``` + +2. Delete operators + ```bash + oc delete -k https://github.com/redhat-cop/gitops-catalog/nfd/instance/overlays/only-nvidia + oc delete -k https://github.com/redhat-cop/gitops-catalog/gpu-operator-certified/instance/overlays/aws + oc delete -k https://github.com/redhat-cop/gitops-catalog/nfd/operator/overlays/stable + oc delete -k https://github.com/redhat-cop/gitops-catalog/gpu-operator-certified/operator/overlays/stable + ``` + +3. Delete machine pool + ```bash + rosa delete machine-pool -c $CLUSTER_NAME gpu + ```` + ## Conclusion -You now have Ollama and OpenWebUI deployed on your ROSA cluster, leveraging AWS GPU instances for inference. This setup allows you to run and interact with large language models efficiently using the power of AWS's GPU instances within a managed OpenShift environment. 
This approach represents the best of both worlds: the reliability and support of a managed OpenShift service and AWS, combined with the innovation and rapid advancement of the open-source AI community. It allows organizations to stay at the forefront of AI technology while maintaining the security, compliance, and operational standards required in enterprise environments. +You now have Ollama and OpenWebUI deployed on your ROSA cluster, leveraging AWS GPU instances for inference. +This setup allows you to run and interact with large language models efficiently using the power of AWS's GPU instances within a managed OpenShift environment. +This approach represents the best of both worlds: the reliability and support of a managed OpenShift service and AWS, combined with the innovation and rapid advancement of the open-source AI community. +It allows organizations to stay at the forefront of AI technology while maintaining the security, compliance, and operational standards required in enterprise environments. From 53ebf32a91b9d505849dde687fa7dfb79a022b52 Mon Sep 17 00:00:00 2001 From: fjcloud Date: Thu, 12 Sep 2024 11:21:35 +0200 Subject: [PATCH 07/11] fix typo --- content/misc/ollama-openwebui/index.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/content/misc/ollama-openwebui/index.md b/content/misc/ollama-openwebui/index.md index a234342cc..0438b393c 100644 --- a/content/misc/ollama-openwebui/index.md +++ b/content/misc/ollama-openwebui/index.md @@ -151,18 +151,22 @@ After deploying OpenWebUI, follow these steps to access and configure it: - The first time you access OpenWebUI, you'll need to register. - Choose a strong password for the admin account. + 4. Configuring Models: - Once logged in, go to the "Models" section to choose the LLMs you want to use. + 5. Testing Your Setup: - Create a new chat and select one of the models you've configured. - Try sending a test prompt to ensure everything is working correctly. + 6. Discover OpenWeb UI! You get lot of feature like : - Model Builder - Local and Remote RAG Integration - Web Browsing Capabilities - Role-Based Access Control (RBAC) + more here : [https://docs.openwebui.com/features](https://docs.openwebui.com/features) @@ -208,11 +212,11 @@ Note that here you should use EFS (RWX access) instead or EBS (RWO access) for s 3. Delete machine pool ```bash rosa delete machine-pool -c $CLUSTER_NAME gpu - ```` + ``` ## Conclusion -You now have Ollama and OpenWebUI deployed on your ROSA cluster, leveraging AWS GPU instances for inference. -This setup allows you to run and interact with large language models efficiently using the power of AWS's GPU instances within a managed OpenShift environment. -This approach represents the best of both worlds: the reliability and support of a managed OpenShift service and AWS, combined with the innovation and rapid advancement of the open-source AI community. -It allows organizations to stay at the forefront of AI technology while maintaining the security, compliance, and operational standards required in enterprise environments. +- You now have Ollama and OpenWebUI deployed on your ROSA cluster, leveraging AWS GPU instances for inference. +- This setup allows you to run and interact with large language models efficiently using AWS's GPU instances within a managed OpenShift environment. 
+- This approach represents the best of both worlds: the reliability and support of a managed OpenShift service and AWS, combined with the innovation and rapid advancement of the open-source AI community. +- It allows organizations to stay at the forefront of AI technology while maintaining the security, compliance, and operational standards required in enterprise environments. From 5aad94c03f0b34ba2fb2ec92ef8a8d8f21446b7c Mon Sep 17 00:00:00 2001 From: fjcloud Date: Fri, 13 Sep 2024 09:57:24 +0200 Subject: [PATCH 08/11] add new section AI/ML --- content/_index.md | 5 ++++- content/ai-ml/_index.md | 12 ++++++++++++ content/{misc => ai-ml}/ollama-openwebui/index.md | 0 3 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 content/ai-ml/_index.md rename content/{misc => ai-ml}/ollama-openwebui/index.md (100%) diff --git a/content/_index.md b/content/_index.md index 0290d855e..d5baaafa0 100644 --- a/content/_index.md +++ b/content/_index.md @@ -173,6 +173,10 @@ description: "Step-by-step tutorials from Red Hat experts to help you get the mo * [Using Group Sync Operator with Azure Active Directory and ROSA/OSD](/experts/idp/az-ad-grp-sync) * [Using Group Sync Operator with Okta and ROSA/OSD](/experts/idp/okta-grp-sync) +## AI / ML + +* [Deploying and Running Ollama and Open WebUI in a ROSA Cluster with GPUs](/experts/ai-ml/ollama-openwebui) + ## Miscellaneous * [Demonstrating GitOps - ArgoCD](/experts/redhat/gitops/) @@ -181,7 +185,6 @@ description: "Step-by-step tutorials from Red Hat experts to help you get the mo * [Deploy OpenShift Advanced Data Protection on a ROSA STS cluster](/experts/misc/oadp/rosa-sts/) * [Azure DevOps with Managed OpenShift](/experts/misc/azure-dev-ops-with-managed-openshift/) * [Configuring OpenShift Dev Spaces to serve Custom Domains](/experts/misc/devspaces-custom-domain) -* [Deploying and Running Ollama and Open WebUI in a ROSA Cluster with GPUs](/experts/misc/ollama-openwebui) ### Applications diff --git a/content/ai-ml/_index.md b/content/ai-ml/_index.md new file mode 100644 index 000000000..0beb56269 --- /dev/null +++ b/content/ai-ml/_index.md @@ -0,0 +1,12 @@ +--- +title: "AI / ML" +date: 2024-09-13 +description: MOBB Guide related to AI / ML +archetype: chapter +skipMetadata: true +--- + + +## AI / ML Topics: + +* [Deploying and Running Ollama and Open WebUI in a ROSA Cluster with GPUs](./ollama-openwebui) diff --git a/content/misc/ollama-openwebui/index.md b/content/ai-ml/ollama-openwebui/index.md similarity index 100% rename from content/misc/ollama-openwebui/index.md rename to content/ai-ml/ollama-openwebui/index.md From 717b7770ea265aaa7cdf53c1074f32d9fc80833d Mon Sep 17 00:00:00 2001 From: fjcloud Date: Fri, 13 Sep 2024 10:35:01 +0200 Subject: [PATCH 09/11] implement andy suggestions + downscale + typo issues --- content/ai-ml/ollama-openwebui/index.md | 119 ++++++++++++++++++------ 1 file changed, 91 insertions(+), 28 deletions(-) diff --git a/content/ai-ml/ollama-openwebui/index.md b/content/ai-ml/ollama-openwebui/index.md index 0438b393c..af95495c3 100644 --- a/content/ai-ml/ollama-openwebui/index.md +++ b/content/ai-ml/ollama-openwebui/index.md @@ -19,8 +19,10 @@ Red Hat OpenShift Service on AWS (ROSA) provides a managed OpenShift environment First we need to check availability of our instance type used here (g4dn.xlarge), it should be in same region of the cluster. Note you can use also Graviton based instance (ARM64) like g5g* but only on HCP 4.16+ cluster. 
+Here i check availability of instance g4dn.xlarge in eu-* region : + ```bash -for region in $(aws ec2 describe-regions --query 'Regions[].RegionName' --output text); do +for region in $(aws ec2 describe-regions --query 'Regions[?starts_with(RegionName, `eu`)].RegionName' --output text); do echo "Region: $region" aws ec2 describe-instance-type-offerings --location-type availability-zone \ --filters Name=instance-type,Values=g4dn.xlarge --region $region \ @@ -29,14 +31,76 @@ for region in $(aws ec2 describe-regions --query 'Regions[].RegionName' --output done ``` -And then we can create a machine pool with GPU-enabled instances, in our example i use eu-central-1c AZ: +Example output : + +```bash +Region: eu-south-1 +------------------------------- +|DescribeInstanceTypeOfferings| ++-----------------------------+ +| eu-south-1c | +| eu-south-1b | ++-----------------------------+ + +Region: eu-south-2 + +Region: eu-central-1 +------------------------------- +|DescribeInstanceTypeOfferings| ++-----------------------------+ +| eu-central-1a | +| eu-central-1b | +| eu-central-1c | ++-----------------------------+ + +Region: eu-central-2 + +Region: eu-north-1 +------------------------------- +|DescribeInstanceTypeOfferings| ++-----------------------------+ +| eu-north-1c | +| eu-north-1a | +| eu-north-1b | ++-----------------------------+ + +Region: eu-west-3 +------------------------------- +|DescribeInstanceTypeOfferings| ++-----------------------------+ +| eu-west-3b | +| eu-west-3a | +| eu-west-3c | ++-----------------------------+ + +Region: eu-west-2 +------------------------------- +|DescribeInstanceTypeOfferings| ++-----------------------------+ +| eu-west-2c | +| eu-west-2a | +| eu-west-2b | ++-----------------------------+ + +Region: eu-west-1 +------------------------------- +|DescribeInstanceTypeOfferings| ++-----------------------------+ +| eu-west-1c | +| eu-west-1b | +| eu-west-1a | ++-----------------------------+ +``` +> Here we see that this instance is available everywhere in 3 AZ except in eu-south-2 and eu-central-2. + +With the region and zone known, now create a machine pool with GPU Enabled Instances. In this example I have used region eu-central-1c: ```bash export CLUSTER_NAME=mycluster rosa create machine-pool -c $CLUSTER_NAME --name gpu --replicas=1 --availability-zone eu-central-1c --instance-type g4dn.xlarge --use-spot-instances ``` -This command creates a machine pool named "gpu" with one replica using the g4dn.xlarge spot instance, which is x86_64 instance with Nvidia T4 16GB GPU. +This command creates a machine pool named "gpu" with one replica using the g4dn.xlarge spot instance, which is x86_64 instance with Nvidia T4 16GB GPU. It's the cheapest GPU instance you can have at the moment (0.2114$/h at the moment); 16GB of VRAM is enought for running small/medium models. ## Deploy Required Operators @@ -72,14 +136,15 @@ After the operators are installed, create their instances: ## Deploy Ollama and OpenWebUI -Now, let's deploy Ollama for inference and OpenWebUI for interacting with the LLM: +Next, we'll deploy Ollama for model inference and OpenWebUI as the interface for interacting with the language model. 1. Create a new project: ```bash oc new-project llm ``` -2. Deploy Ollama: +2. 
The following command deploys Ollama, sets up persistent storage, and allocates a GPU to the deployment: + ```bash oc new-app docker.io/ollama/ollama:0.3.10 --import-mode=PreserveOriginal oc patch deployment ollama --type=json -p '[ @@ -89,21 +154,14 @@ Now, let's deploy Ollama for inference and OpenWebUI for interacting with the LL oc set volume deployment/ollama --add --type=pvc --claim-size=50Gi --mount-path=/.ollama --name=config oc set resources deployment/ollama --limits=nvidia.com/gpu=1 ``` - This deploys Ollama, sets up persistent storage, and allocates a GPU to the deployment. -3. Deploy OpenWebUI: +3. The following command deploys OpenWebUI and sets up the necessary storage and environment variables and then expose the service with a route: ```bash oc new-app ghcr.io/open-webui/open-webui:0.3.19 -e WEBUI_SECRET_KEY=secret -e OLLAMA_BASE_URL=http://ollama:11434 --import-mode=PreserveOriginal oc set volume deployment/open-webui --add --type=pvc --claim-size=5Gi --mount-path=/app/backend/data --name=data oc set volume deployment/open-webui --add --type=emptyDir --mount-path=/app/backend/static --name=static - ``` - This deploys OpenWebUI and sets up the necessary storage and environment variables. - -4. Create a route for OpenWebUI: - ```bash oc create route edge --service=open-webui ``` - This creates an edge-terminated route to access OpenWebUI. ## Verify deployment @@ -148,29 +206,26 @@ After deploying OpenWebUI, follow these steps to access and configure it: 2. Open the URL in your web browser. You should see the OpenWebUI login page. [https://docs.openwebui.com/](https://docs.openwebui.com/) 3. Initial Setup: - - The first time you access OpenWebUI, you'll need to register. - - Choose a strong password for the admin account. - +- The first time you access OpenWebUI, you'll need to register. +- Choose a strong password for the admin account. 4. Configuring Models: - - Once logged in, go to the "Models" section to choose the LLMs you want to use. - +- Once logged in, go to the "Models" section to choose the LLMs you want to use. 5. Testing Your Setup: - - Create a new chat and select one of the models you've configured. - - Try sending a test prompt to ensure everything is working correctly. +- Create a new chat and select one of the models you've configured. +- Try sending a test prompt to ensure everything is working correctly. +6. Discover OpenWeb UI! You get lot of features like : +- Model Builder +- Local and Remote RAG Integration +- Web Browsing Capabilities +- Role-Based Access Control (RBAC) -6. Discover OpenWeb UI! You get lot of feature like : - - Model Builder - - Local and Remote RAG Integration - - Web Browsing Capabilities - - Role-Based Access Control (RBAC) +More here : [https://docs.openwebui.com/features](https://docs.openwebui.com/features) - more here : [https://docs.openwebui.com/features](https://docs.openwebui.com/features) - -## Scaling +## Scale If you want to give best experience for multiple users, for improving response time and token/s you can scale Ollama app. @@ -194,6 +249,14 @@ Note that here you should use EFS (RWX access) instead or EBS (RWO access) for s oc scale deployment/ollama --replicas=2 ``` +## Downscale + +For cost optimization, you can scale you machine pool of GPU to 0 : + + ```bash + rosa edit machine-pool -c $CLUSTER_NAME gpu --replicas=0 + ``` + ## Uninstalling 1. 
Delete llm namespace From f51ad7843109c636b91789abe782c2cef2f49f3a Mon Sep 17 00:00:00 2001 From: Florian Jacquin Date: Mon, 16 Sep 2024 11:31:02 +0200 Subject: [PATCH 10/11] Apply suggestions from code review Co-authored-by: Andy Repton <10834938+andyrepton@users.noreply.github.com> --- content/ai-ml/ollama-openwebui/index.md | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/content/ai-ml/ollama-openwebui/index.md b/content/ai-ml/ollama-openwebui/index.md index af95495c3..c13e068cf 100644 --- a/content/ai-ml/ollama-openwebui/index.md +++ b/content/ai-ml/ollama-openwebui/index.md @@ -19,7 +19,7 @@ Red Hat OpenShift Service on AWS (ROSA) provides a managed OpenShift environment First we need to check availability of our instance type used here (g4dn.xlarge), it should be in same region of the cluster. Note you can use also Graviton based instance (ARM64) like g5g* but only on HCP 4.16+ cluster. -Here i check availability of instance g4dn.xlarge in eu-* region : +Using the following command, you can check for the availability of the g4dn.xlarge instance type in all eu-* regions: ```bash for region in $(aws ec2 describe-regions --query 'Regions[?starts_with(RegionName, `eu`)].RegionName' --output text); do @@ -93,14 +93,15 @@ Region: eu-west-1 ``` > Here we see that this instance is available everywhere in 3 AZ except in eu-south-2 and eu-central-2. -With the region and zone known, now create a machine pool with GPU Enabled Instances. In this example I have used region eu-central-1c: +With the region and zone known, use the following command to create a machine pool with GPU Enabled Instances. In this example I have used region eu-central-1c: ```bash -export CLUSTER_NAME=mycluster +# Replace $mycluster with the name of your ROSA cluster +export CLUSTER_NAME=$mycluster rosa create machine-pool -c $CLUSTER_NAME --name gpu --replicas=1 --availability-zone eu-central-1c --instance-type g4dn.xlarge --use-spot-instances ``` -This command creates a machine pool named "gpu" with one replica using the g4dn.xlarge spot instance, which is x86_64 instance with Nvidia T4 16GB GPU. It's the cheapest GPU instance you can have at the moment (0.2114$/h at the moment); 16GB of VRAM is enought for running small/medium models. +This command creates a machine pool named "gpu" with one replica using the g4dn.xlarge spot instance, which is an x86_64 instance with Nvidia T4 16GB GPU. It's the cheapest GPU instance you can have at the moment (0.2114$/h at the moment); 16GB of VRAM is enough for running small/medium models. ## Deploy Required Operators @@ -120,7 +121,7 @@ We'll use kustomize to deploy the necessary operators thanks to this repository ## Create Operator Instances -After the operators are installed, create their instances: +After the operators are installed, use the following commands to create their instances: 1. NFD Instance: ```bash @@ -136,7 +137,7 @@ After the operators are installed, create their instances: ## Deploy Ollama and OpenWebUI -Next, we'll deploy Ollama for model inference and OpenWebUI as the interface for interacting with the language model. +Next, use the following commands to deploy Ollama for model inference and OpenWebUI as the interface for interacting with the language model. 1. Create a new project: ```bash @@ -165,7 +166,7 @@ Next, we'll deploy Ollama for model inference and OpenWebUI as the interface for ## Verify deployment -1. All nvidia pods should be running or completed +1. 
Use the following commands to ensure all nvidia pods are either running or completed ```bash oc get pods -n nvidia-gpu-operator ``` @@ -222,14 +223,14 @@ After deploying OpenWebUI, follow these steps to access and configure it: - Web Browsing Capabilities - Role-Based Access Control (RBAC) -More here : [https://docs.openwebui.com/features](https://docs.openwebui.com/features) +You can read more about OpenWebUI here : [https://docs.openwebui.com/features](https://docs.openwebui.com/features) -## Scale +## Implement scaling -If you want to give best experience for multiple users, for improving response time and token/s you can scale Ollama app. +If you would like to give best experience for multiple users, for example to improve response time and token/s you can scale the Ollama app. -Note that here you should use EFS (RWX access) instead or EBS (RWO access) for storage of ollama models, you can install EFS operator using [this tutorial](https://cloud.redhat.com/experts/rosa/aws-efs/) +Note that here you should use the EFS (RWX access) storage class instead of the EBS (RWO access) storage class for the storage of ollama models. For instructions on how to set this up, please see [this tutorial](https://cloud.redhat.com/experts/rosa/aws-efs/) 1. Add new GPU node to machine pool @@ -249,7 +250,7 @@ Note that here you should use EFS (RWX access) instead or EBS (RWO access) for s oc scale deployment/ollama --replicas=2 ``` -## Downscale +## Implement downscaling For cost optimization, you can scale you machine pool of GPU to 0 : From 0f148053bd67aea2fed09f11a50bc37fb9931f29 Mon Sep 17 00:00:00 2001 From: fjcloud Date: Mon, 16 Sep 2024 11:39:30 +0200 Subject: [PATCH 11/11] Add new line before backticks --- content/ai-ml/ollama-openwebui/index.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/content/ai-ml/ollama-openwebui/index.md b/content/ai-ml/ollama-openwebui/index.md index c13e068cf..9f52eb49b 100644 --- a/content/ai-ml/ollama-openwebui/index.md +++ b/content/ai-ml/ollama-openwebui/index.md @@ -108,12 +108,14 @@ This command creates a machine pool named "gpu" with one replica using the g4dn. We'll use kustomize to deploy the necessary operators thanks to this repository provided by Red Hat COP (Community of Practices) [link](https://github.com/redhat-cop/gitops-catalog) 1. Node Feature Discovery (NFD) Operator: + ```bash oc apply -k https://github.com/redhat-cop/gitops-catalog/nfd/operator/overlays/stable ``` The NFD Operator detects hardware features and configuration in your cluster. 2. GPU Operator: + ```bash oc apply -k https://github.com/redhat-cop/gitops-catalog/gpu-operator-certified/operator/overlays/stable ``` @@ -124,12 +126,14 @@ We'll use kustomize to deploy the necessary operators thanks to this repository After the operators are installed, use the following commands to create their instances: 1. NFD Instance: + ```bash oc apply -k https://github.com/redhat-cop/gitops-catalog/nfd/instance/overlays/only-nvidia ``` This creates an NFD instance for cluster. 2. GPU Operator Instance: + ```bash oc apply -k https://github.com/redhat-cop/gitops-catalog/gpu-operator-certified/instance/overlays/aws ``` @@ -140,6 +144,7 @@ After the operators are installed, use the following commands to create their in Next, use the following commands to deploy Ollama for model inference and OpenWebUI as the interface for interacting with the language model. 1. 
Create a new project: + ```bash oc new-project llm ``` @@ -157,6 +162,7 @@ Next, use the following commands to deploy Ollama for model inference and OpenWe ``` 3. The following command deploys OpenWebUI and sets up the necessary storage and environment variables and then expose the service with a route: + ```bash oc new-app ghcr.io/open-webui/open-webui:0.3.19 -e WEBUI_SECRET_KEY=secret -e OLLAMA_BASE_URL=http://ollama:11434 --import-mode=PreserveOriginal oc set volume deployment/open-webui --add --type=pvc --claim-size=5Gi --mount-path=/app/backend/data --name=data @@ -167,16 +173,19 @@ Next, use the following commands to deploy Ollama for model inference and OpenWe ## Verify deployment 1. Use the following commands to ensure all nvidia pods are either running or completed + ```bash oc get pods -n nvidia-gpu-operator ``` 2. All pods of llm namespace should be running + ```bash oc get pods -n llm ``` 3. Check logs of ollama, it should detect inference compute card + ```bash oc logs -l deployment=ollama time=2024-09-12T07:28:40.446Z level=INFO source=images.go:753 msg="total blobs: 0" @@ -190,6 +199,7 @@ Next, use the following commands to deploy Ollama for model inference and OpenWe ## Download a model 1. Download llama3.1 8B using Ollama CLI + ```bash oc exec svc/ollama -- ollama pull llama3.1 ``` @@ -200,6 +210,7 @@ Next, use the following commands to deploy Ollama for model inference and OpenWe After deploying OpenWebUI, follow these steps to access and configure it: 1. Get the route URL: + ```bash oc get route open-webui ``` @@ -261,11 +272,13 @@ For cost optimization, you can scale you machine pool of GPU to 0 : ## Uninstalling 1. Delete llm namespace + ```bash oc delete project llm ``` 2. Delete operators + ```bash oc delete -k https://github.com/redhat-cop/gitops-catalog/nfd/instance/overlays/only-nvidia oc delete -k https://github.com/redhat-cop/gitops-catalog/gpu-operator-certified/instance/overlays/aws @@ -274,6 +287,7 @@ For cost optimization, you can scale you machine pool of GPU to 0 : ``` 3. Delete machine pool + ```bash rosa delete machine-pool -c $CLUSTER_NAME gpu ```
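After the teardown above, a quick way to confirm that nothing GPU-related is left running is to list the remaining machine pools and check for GPU nodes. This is a suggested final check rather than part of the original patches; it assumes the same `$CLUSTER_NAME` variable and the g4dn.xlarge instance type used earlier:

```bash
# Hypothetical post-cleanup check (not in the original guide).
# The "gpu" machine pool should no longer be listed:
rosa list machine-pools -c $CLUSTER_NAME

# No nodes with the GPU instance type should remain once the pool is deleted:
oc get nodes -l node.kubernetes.io/instance-type=g4dn.xlarge
```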