From b5b420a738da2ac1f3402e0240d615781f389ad3 Mon Sep 17 00:00:00 2001
From: Ratnopam Chakrabarti
Date: Tue, 4 Jun 2024 13:29:04 -0700
Subject: [PATCH] Upgrade ray version on mistral7b inference

---
 .../inference/gradio-ui/gradio-app-mistral.py | 66 ---------
 .../mistral-7b-rayserve-inf2/Dockerfile | 19 +--
 .../mistral-7b-rayserve-inf2/gradio-ui.yaml | 137 ++++++++++++++++++
 .../ray-service-mistral.yaml | 17 ++-
 .../docs/gen-ai/inference/Mistral-7b-inf2.md | 61 ++++----
 5 files changed, 186 insertions(+), 114 deletions(-)
 delete mode 100644 gen-ai/inference/gradio-ui/gradio-app-mistral.py
 create mode 100644 gen-ai/inference/mistral-7b-rayserve-inf2/gradio-ui.yaml

diff --git a/gen-ai/inference/gradio-ui/gradio-app-mistral.py b/gen-ai/inference/gradio-ui/gradio-app-mistral.py
deleted file mode 100644
index b77aa5a36..000000000
--- a/gen-ai/inference/gradio-ui/gradio-app-mistral.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import gradio as gr
-import requests
-import os
-
-
-# Constants for model endpoint and service name
-model_endpoint = os.environ.get("MODEL_ENDPOINT", "/infer")
-service_name = os.environ.get("SERVICE_NAME", "http://localhost:8000")
-
-# Function to generate text
-def generate_text(message, history):
-    prompt = message
-
-    # Create the URL for the inference
-    url = f"{service_name}{model_endpoint}"
-
-    try:
-        # Send the request to the model service
-        response = requests.get(url, params={"sentence": prompt}, timeout=180)
-        response.raise_for_status()  # Raise an exception for HTTP errors
-        prompt_to_replace = "[INST]" + prompt + "[/INST]"
-
-        # Removing the original prompt with instruction set from the output
-        text = response.json()[0].replace(prompt_to_replace, "", 1).strip('["]?\n')
-
-        # remove '<s>' strikethrough markdown
-        if text.startswith("<s>"):
-            text = text.replace("<s>", "", 1)
-
-        text = text.replace("</s>", "", 1)
-
-        answer_only = text
-
-        # Safety filter to remove harmful or inappropriate content
-        answer_only = filter_harmful_content(answer_only)
-        return answer_only
-    except requests.exceptions.RequestException as e:
-        # Handle any request exceptions (e.g., connection errors)
-        return f"AI: Error: {str(e)}"
-
-
-# Define the safety filter function (you can implement this as needed)
-def filter_harmful_content(text):
-    # TODO: Implement a safety filter to remove any harmful or inappropriate content from the text
-
-    # For now, simply return the text as-is
-    return text
-
-
-# Define the Gradio ChatInterface
-chat_interface = gr.ChatInterface(
-    generate_text,
-    chatbot=gr.Chatbot(height=300),
-    textbox=gr.Textbox(placeholder="Ask me a question", container=False, scale=7),
-    title="Mistral AI Chat",
-    description="Ask me any question",
-    theme="soft",
-    examples=["How Big Is Observable Universe", "How to kill a linux process"],
-    cache_examples=False,
-    retry_btn=None,
-    undo_btn="Delete Previous",
-    clear_btn="Clear",
-)
-
-# Launch the ChatInterface
-chat_interface.launch(server_name="0.0.0.0")
diff --git a/gen-ai/inference/mistral-7b-rayserve-inf2/Dockerfile b/gen-ai/inference/mistral-7b-rayserve-inf2/Dockerfile
index 370180c25..e4f824cce 100644
--- a/gen-ai/inference/mistral-7b-rayserve-inf2/Dockerfile
+++ b/gen-ai/inference/mistral-7b-rayserve-inf2/Dockerfile
@@ -1,5 +1,5 @@
 # https://hub.docker.com/layers/rayproject/ray/2.11.0-py310/images/sha256-de798e487b76a8f2412c718c43c5f342b3eb05e0705a71325102904cd27c3613?context=explore
-FROM rayproject/ray:2.11.0-py310
+FROM rayproject/ray:2.22.0-py310
 
 # Maintainer label
 LABEL maintainer="DoEKS"
@@ -15,27 +15,22 @@ RUN . /etc/os-release && \
     sudo echo "deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main" > /etc/apt/sources.list.d/neuron.list && \
     sudo wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - && \
     sudo apt-get update -y && \
-    sudo apt-get install git -y && \
-    sudo apt-get install aws-neuronx-dkms=2.* -y && \
-    sudo apt-get install aws-neuronx-collectives=2.* -y && \
-    sudo apt-get install aws-neuronx-runtime-lib=2.* -y && \
-    sudo apt-get install aws-neuronx-tools=2.* -y
-
+    sudo apt-get install aws-neuronx-dkms aws-neuronx-collectives=2.* aws-neuronx-runtime-lib=2.* aws-neuronx-tools=2.* -y && \
+    sudo apt-get clean
 
 # Switch back to a non-root user for the subsequent commands
 USER $USER
 
 # Set pip repository pointing to the Neuron repository and install required Python packages
-# huggingface_hub is needed to login to huggingface repo for the model access
 RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com && \
-    pip install neuronx-cc==2.* torch-neuronx==1.13.* torchvision transformers-neuronx sentencepiece transformers && \
-    pip install starlette==0.34.0 && \
-    pip install huggingface_hub
-
+    pip install wget awscli regex neuronx-cc==2.* torch-neuronx torchvision transformers-neuronx sentencepiece transformers
 
 # Add Neuron path to PATH
 ENV PATH /opt/aws/neuron/bin:$PATH
 
+# Set LD_LIBRARY_PATH to include the directory with libpython3.10.so.1.0
+ENV LD_LIBRARY_PATH /home/ray/anaconda3/lib:$LD_LIBRARY_PATH
+
 WORKDIR /serve_app
 
 COPY ray_serve_mistral.py /serve_app/ray_serve_mistral.py
diff --git a/gen-ai/inference/mistral-7b-rayserve-inf2/gradio-ui.yaml b/gen-ai/inference/mistral-7b-rayserve-inf2/gradio-ui.yaml
new file mode 100644
index 000000000..6e4c2c203
--- /dev/null
+++ b/gen-ai/inference/mistral-7b-rayserve-inf2/gradio-ui.yaml
@@ -0,0 +1,137 @@
+# gradio-ui.yaml
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: gradio-mistral7b-inf2
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: gradio-deployment
+  namespace: gradio-mistral7b-inf2
+  labels:
+    app: gradio
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: gradio
+  template:
+    metadata:
+      labels:
+        app: gradio
+    spec:
+      containers:
+      - name: gradio
+        image: public.ecr.aws/data-on-eks/gradio-web-app-base:latest
+        imagePullPolicy: IfNotPresent
+        ports:
+        - containerPort: 7860
+        resources:
+          requests:
+            cpu: "512m"
+            memory: "2048Mi"
+          limits:
+            cpu: "1"
+            memory: "4096Mi"
+        env:
+        - name: MODEL_ENDPOINT
+          value: "/infer"
+        - name: SERVICE_NAME
+          value: "http://mistral-serve-svc.mistral.svc.cluster.local:8000"
+        volumeMounts:
+        - name: gradio-app-script
+          mountPath: /app/gradio-app.py
+          subPath: gradio-app-mistral7b-inf2.py
+      volumes:
+      - name: gradio-app-script
+        configMap:
+          name: gradio-app-script
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: gradio-service
+  namespace: gradio-mistral7b-inf2
+spec:
+  selector:
+    app: gradio
+  ports:
+  - name: http
+    protocol: TCP
+    port: 7860
+    targetPort: 7860
+  type: ClusterIP
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: gradio-app-script
+  namespace: gradio-mistral7b-inf2
+data:
+  gradio-app-mistral7b-inf2.py: |
+    import gradio as gr
+    import requests
+    import os
+
+
+    # Constants for model endpoint and service name
+    model_endpoint = os.environ.get("MODEL_ENDPOINT", "/infer")
+    service_name = os.environ.get("SERVICE_NAME", "http://localhost:8000")
+
+    # Function to generate text
+    def generate_text(message, history):
+        prompt = message
+
+        # Create the URL for the inference
+        url = f"{service_name}{model_endpoint}"
+
+        try:
+            # Send the request to the model service
+            response = requests.get(url, params={"sentence": prompt}, timeout=180)
+            response.raise_for_status()  # Raise an exception for HTTP errors
+            prompt_to_replace = "[INST]" + prompt + "[/INST]"
+
+            # Removing the original prompt with instruction set from the output
+            text = response.text.replace(prompt_to_replace, "", 1).strip('["]?\n')
+            # remove '<s>' strikethrough markdown
+            if text.startswith("<s>"):
+                text = text.replace("<s>", "", 1)
+
+            text = text.replace("</s>", "", 1)
+
+            answer_only = text
+
+            # Safety filter to remove harmful or inappropriate content
+            answer_only = filter_harmful_content(answer_only)
+            return answer_only
+        except requests.exceptions.RequestException as e:
+            # Handle any request exceptions (e.g., connection errors)
+            return f"AI: Error: {str(e)}"
+
+
+    # Define the safety filter function (you can implement this as needed)
+    def filter_harmful_content(text):
+        # TODO: Implement a safety filter to remove any harmful or inappropriate content from the text
+
+        # For now, simply return the text as-is
+        return text
+
+
+    # Define the Gradio ChatInterface
+    chat_interface = gr.ChatInterface(
+        generate_text,
+        chatbot=gr.Chatbot(height=300),
+        textbox=gr.Textbox(placeholder="Ask me a question", container=False, scale=7),
+        title="Mistral AI Chat",
+        description="Ask me any question",
+        theme="soft",
+        examples=["How Big Is Observable Universe", "How to kill a linux process"],
+        cache_examples=False,
+        retry_btn=None,
+        undo_btn="Delete Previous",
+        clear_btn="Clear",
+    )
+
+    # Launch the ChatInterface
+    chat_interface.launch()
diff --git a/gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral.yaml b/gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral.yaml
index 41421084f..122528644 100644
--- a/gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral.yaml
+++ b/gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral.yaml
@@ -31,6 +31,8 @@ spec:
         env_vars:
           MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.2"
           NEURON_CC_FLAGS: "-O1"
+          LD_LIBRARY_PATH: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
+          NEURON_CORES: "2"
       deployments:
         - name: mistral-7b
           autoscaling_config:
@@ -46,8 +48,11 @@ spec:
           ray_actor_options:
             num_cpus: 10
             resources: {"neuron_cores": 2}
+          runtime_env:
+            env_vars:
+              LD_LIBRARY_PATH: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
   rayClusterConfig:
-    rayVersion: '2.11.0'
+    rayVersion: '2.22.0'
     enableInTreeAutoscaling: true
     headGroupSpec:
       headService:
@@ -60,7 +65,7 @@ spec:
         spec:
           containers:
             - name: head
-              image: public.ecr.aws/data-on-eks/ray2.11.0-py310-mistral7b-neuron:latest
+              image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest
               imagePullPolicy: Always # Ensure the image is always pulled when updated
               lifecycle:
                 preStop:
@@ -68,7 +73,7 @@ spec:
                   command: ["/bin/sh", "-c", "ray stop"]
               ports:
                 - containerPort: 6379
-                  name: gcs-server
+                  name: gcs
                 - containerPort: 8265
                   name: dashboard
                 - containerPort: 10001
@@ -86,6 +91,8 @@ spec:
                   cpu: "2"
                   memory: "20G"
               env:
+                - name: LD_LIBRARY_PATH
+                  value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
                 - name: HUGGING_FACE_HUB_TOKEN
                   valueFrom:
                     secretKeyRef:
@@ -108,7 +115,7 @@ spec:
         spec:
          containers:
            - name: worker
-              image: public.ecr.aws/data-on-eks/ray2.11.0-py310-mistral7b-neuron:latest
+              image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest
               imagePullPolicy: Always # Ensure the image is always pulled when updated
               lifecycle:
                 preStop:
@@ -125,6 +132,8 @@ spec:
                   memory: "360G" # All memory of inf2.24xlarge; 24G for daemonset overhead
                   aws.amazon.com/neuron: "6" # All Neuron cores of inf2.24xlarge
               env:
+                - name: LD_LIBRARY_PATH
+                  value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
                 - name: HUGGING_FACE_HUB_TOKEN
                   valueFrom:
                     secretKeyRef:
diff --git a/website/docs/gen-ai/inference/Mistral-7b-inf2.md b/website/docs/gen-ai/inference/Mistral-7b-inf2.md
index d96e41019..9b4b2b487 100644
--- a/website/docs/gen-ai/inference/Mistral-7b-inf2.md
+++ b/website/docs/gen-ai/inference/Mistral-7b-inf2.md
@@ -115,7 +115,7 @@ To deploy the Mistral-7B-Instruct-v0.2 model, it's essential to configure your H
 
 ```bash
 # set the Hugging Face Hub Token as an environment variable. This variable will be substituted when applying the ray-service-mistral.yaml file
-export HUGGING_FACE_HUB_TOKEN=
+export HUGGING_FACE_HUB_TOKEN=$(echo -n "Your-Hugging-Face-Hub-Token-Value" | base64)
 
 cd ./../gen-ai/inference/mistral-7b-rayserve-inf2
 envsubst < ray-service-mistral.yaml| kubectl apply -f -
@@ -125,7 +125,7 @@ Verify the deployment by running the following commands
 
 :::info
 
-The deployment process may take up to 10 to 12 minutes. The Head Pod is expected to be ready within 2 to 3 minutes, while the Ray Serve worker pod may take up to 10 minutes for image retrieval and Model deployment from Huggingface.
+The deployment process may take up to 10 minutes. The Head Pod is expected to be ready within 2 to 3 minutes, while the Ray Serve worker pod may take up to 10 minutes for image retrieval and Model deployment from Huggingface.
 
 :::
 
@@ -152,6 +152,8 @@ mistral-service-serve-svc NodePort 172.20.109.223 8000:31679
 
 For the Ray dashboard, you can port-forward these ports individually to access the web UI locally using localhost.
 
+
+
 ```bash
 kubectl -n mistral port-forward svc/mistral-service 8265:8265
 ```
@@ -170,42 +172,42 @@ You can monitor Serve deployment and the Ray Cluster deployment including resour
 ![RayServe Cluster](img/ray-serve-inf2-mistral-cluster.png)
 
 ## Deploying the Gradio WebUI App
-Discover how to create a user-friendly chat interface using [Gradio](https://www.gradio.app/) that integrates seamlessly with deployed models.
-Let's move forward with setting up the Gradio app as a Kubernetes deployment, utilizing a Docker container. This setup will enable interaction with the Mistral model, which is deployed using RayServe.
+[Gradio](https://www.gradio.app/) Web UI is used to interact with the Mistral-7B inference service deployed on EKS clusters using Inf2 instances.
+The Gradio UI communicates internally with the Mistral service through its cluster-local service name and port (`mistral-serve-svc.mistral.svc.cluster.local:8000`).
+We have created a base Docker image (`gen-ai/inference/gradio-ui/Dockerfile-gradio-base`) for the Gradio app, which can be reused with any model inference service.
+This image is published on [Public ECR](https://gallery.ecr.aws/data-on-eks/gradio-web-app-base).
 
-### Build the Gradio app docker container
+#### Steps to Deploy a Gradio App:
 
-First, lets build the docker container for the client app.
+The following YAML script (`gen-ai/inference/mistral-7b-rayserve-inf2/gradio-ui.yaml`) creates a dedicated namespace, deployment, service, and a ConfigMap that holds the model client script.
+To deploy this, execute:
 
 ```bash
-cd ../gradio-ui
-docker build --platform=linux/amd64 \
-    -t gradio-app:mistral \
-    --build-arg GRADIO_APP="gradio-app-mistral.py" \
-    .
+cd gen-ai/inference/mistral-7b-rayserve-inf2/ +kubectl apply -f gradio-ui.yaml ``` -### Deploy the Gradio container +**Verification Steps:** +Run the following commands to verify the deployment, service, and ConfigMap: -Deploy the Gradio app as a container on localhost using docker: +```bash +kubectl get deployments -n gradio-mistral7b-inf2 +kubectl get services -n gradio-mistral7b-inf2 -```bash -docker run --rm -it -p 7860:7860 -p 8000:8000 gradio-app:mistral +kubectl get configmaps -n gradio-mistral7b-inf2 ``` -:::info -If you are not running Docker Desktop on your machine and using something like [finch](https://runfinch.com/) instead then you will need to additional flags for a custom host-to-IP mapping inside the container. +**Port-Forward the Service:** + +Run the port-forward command so that you can access the Web UI locally: + +```bash +kubectl port-forward service/gradio-service 7860:7860 -n gradio-mistral7b-inf2 ``` -docker run --rm -it \ - --add-host ray-service: \ - -e "SERVICE_NAME=http://ray-service:8000" \ - -p 7860:7860 gradio-app:mistral -``` -::: #### Invoke the WebUI @@ -233,21 +235,16 @@ Below screenshots provide some examples of the model response based on different Finally, we'll provide instructions for cleaning up and deprovisioning the resources when they are no longer needed. -**Step1:** Delete Gradio Container +**Step1:** Delete Gradio App and mistral Inference deployment -`Ctrl-c` on the localhost terminal window where `docker run` is running to kill the container running the Gradio app. Optionally clean up the docker image - -```bash -docker rmi gradio-app:mistral -``` -**Step2:** Delete Ray Cluster ```bash -cd ../mistral-7b-rayserve-inf2 +cd gen-ai/inference/mistral-7b-rayserve-inf2 +kubectl delete -f gradio-ui.yaml kubectl delete -f ray-service-mistral.yaml ``` -**Step3:** Cleanup the EKS Cluster +**Step2:** Cleanup the EKS Cluster This script will cleanup the environment using `-target` option to ensure all the resources are deleted in correct order. ```bash