From b5b420a738da2ac1f3402e0240d615781f389ad3 Mon Sep 17 00:00:00 2001
From: Ratnopam Chakrabarti
Date: Tue, 4 Jun 2024 13:29:04 -0700
Subject: [PATCH] Upgrade ray version on mistral7b inference

---
 .../inference/gradio-ui/gradio-app-mistral.py | 66 ---------
 .../mistral-7b-rayserve-inf2/Dockerfile | 19 +--
 .../mistral-7b-rayserve-inf2/gradio-ui.yaml | 137 ++++++++++++++++++
 .../ray-service-mistral.yaml | 17 ++-
 .../docs/gen-ai/inference/Mistral-7b-inf2.md | 61 ++++----
 5 files changed, 186 insertions(+), 114 deletions(-)
 delete mode 100644 gen-ai/inference/gradio-ui/gradio-app-mistral.py
 create mode 100644 gen-ai/inference/mistral-7b-rayserve-inf2/gradio-ui.yaml

diff --git a/gen-ai/inference/gradio-ui/gradio-app-mistral.py b/gen-ai/inference/gradio-ui/gradio-app-mistral.py
deleted file mode 100644
index b77aa5a36..000000000
--- a/gen-ai/inference/gradio-ui/gradio-app-mistral.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import gradio as gr
-import requests
-import os
-
-
-# Constants for model endpoint and service name
-model_endpoint = os.environ.get("MODEL_ENDPOINT", "/infer")
-service_name = os.environ.get("SERVICE_NAME", "http://localhost:8000")
-
-# Function to generate text
-def generate_text(message, history):
-    prompt = message
-
-    # Create the URL for the inference
-    url = f"{service_name}{model_endpoint}"
-
-    try:
-        # Send the request to the model service
-        response = requests.get(url, params={"sentence": prompt}, timeout=180)
-        response.raise_for_status()  # Raise an exception for HTTP errors
-        prompt_to_replace = "[INST]" + prompt + "[/INST]"
-
-        # Removing the original prompt with instruction set from the output
-        text = response.json()[0].replace(prompt_to_replace, "", 1).strip('["]?\n')
-
-        # remove '<s>' strikethrough markdown
-        if text.startswith("<s>"):
-            text = text.replace("<s>", "", 1)
-
-        text = text.replace("</s>", "", 1)
-
-        answer_only = text
-
-        # Safety filter to remove harmful or inappropriate content
-        answer_only = filter_harmful_content(answer_only)
-        return answer_only
-    except requests.exceptions.RequestException as e:
-        # Handle any request exceptions (e.g., connection errors)
-        return f"AI: Error: {str(e)}"
-
-
-# Define the safety filter function (you can implement this as needed)
-def filter_harmful_content(text):
-    # TODO: Implement a safety filter to remove any harmful or inappropriate content from the text
-
-    # For now, simply return the text as-is
-    return text
-
-
-# Define the Gradio ChatInterface
-chat_interface = gr.ChatInterface(
-    generate_text,
-    chatbot=gr.Chatbot(height=300),
-    textbox=gr.Textbox(placeholder="Ask me a question", container=False, scale=7),
-    title="Mistral AI Chat",
-    description="Ask me any question",
-    theme="soft",
-    examples=["How Big Is Observable Universe", "How to kill a linux process"],
-    cache_examples=False,
-    retry_btn=None,
-    undo_btn="Delete Previous",
-    clear_btn="Clear",
-)
-
-# Launch the ChatInterface
-chat_interface.launch(server_name="0.0.0.0")
diff --git a/gen-ai/inference/mistral-7b-rayserve-inf2/Dockerfile b/gen-ai/inference/mistral-7b-rayserve-inf2/Dockerfile
index 370180c25..e4f824cce 100644
--- a/gen-ai/inference/mistral-7b-rayserve-inf2/Dockerfile
+++ b/gen-ai/inference/mistral-7b-rayserve-inf2/Dockerfile
@@ -1,5 +1,5 @@
 # https://hub.docker.com/layers/rayproject/ray/2.11.0-py310/images/sha256-de798e487b76a8f2412c718c43c5f342b3eb05e0705a71325102904cd27c3613?context=explore
-FROM rayproject/ray:2.11.0-py310
+FROM rayproject/ray:2.22.0-py310
 
 # Maintainer label
 LABEL maintainer="DoEKS"
@@ -15,27 +15,22 @@ RUN . /etc/os-release && \
     sudo echo "deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main" > /etc/apt/sources.list.d/neuron.list && \
     sudo wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - && \
     sudo apt-get update -y && \
-    sudo apt-get install git -y && \
-    sudo apt-get install aws-neuronx-dkms=2.* -y && \
-    sudo apt-get install aws-neuronx-collectives=2.* -y && \
-    sudo apt-get install aws-neuronx-runtime-lib=2.* -y && \
-    sudo apt-get install aws-neuronx-tools=2.* -y
-
+    sudo apt-get install aws-neuronx-dkms aws-neuronx-collectives=2.* aws-neuronx-runtime-lib=2.* aws-neuronx-tools=2.* -y && \
+    sudo apt-get clean
 
 # Switch back to a non-root user for the subsequent commands
 USER $USER
 
 # Set pip repository pointing to the Neuron repository and install required Python packages
-# huggingface_hub is needed to login to huggingface repo for the model access
 RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com && \
-    pip install neuronx-cc==2.* torch-neuronx==1.13.* torchvision transformers-neuronx sentencepiece transformers && \
-    pip install starlette==0.34.0 && \
-    pip install huggingface_hub
-
+    pip install wget awscli regex neuronx-cc==2.* torch-neuronx torchvision transformers-neuronx sentencepiece transformers
 
 # Add Neuron path to PATH
 ENV PATH /opt/aws/neuron/bin:$PATH
 
+# Set LD_LIBRARY_PATH to include the directory with libpython3.10.so.1.0
+ENV LD_LIBRARY_PATH /home/ray/anaconda3/lib:$LD_LIBRARY_PATH
+
 WORKDIR /serve_app
 
 COPY ray_serve_mistral.py /serve_app/ray_serve_mistral.py
diff --git a/gen-ai/inference/mistral-7b-rayserve-inf2/gradio-ui.yaml b/gen-ai/inference/mistral-7b-rayserve-inf2/gradio-ui.yaml
new file mode 100644
index 000000000..6e4c2c203
--- /dev/null
+++ b/gen-ai/inference/mistral-7b-rayserve-inf2/gradio-ui.yaml
@@ -0,0 +1,137 @@
+# gradio-ui.yaml
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: gradio-mistral7b-inf2
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: gradio-deployment
+  namespace: gradio-mistral7b-inf2
+  labels:
+    app: gradio
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: gradio
+  template:
+    metadata:
+      labels:
+        app: gradio
+    spec:
+      containers:
+      - name: gradio
+        image: public.ecr.aws/data-on-eks/gradio-web-app-base:latest
+        imagePullPolicy: IfNotPresent
+        ports:
+        - containerPort: 7860
+        resources:
+          requests:
+            cpu: "512m"
+            memory: "2048Mi"
+          limits:
+            cpu: "1"
+            memory: "4096Mi"
+        env:
+        - name: MODEL_ENDPOINT
+          value: "/infer"
+        - name: SERVICE_NAME
+          value: "http://mistral-serve-svc.mistral.svc.cluster.local:8000"
+        volumeMounts:
+        - name: gradio-app-script
+          mountPath: /app/gradio-app.py
+          subPath: gradio-app-mistral7b-inf2.py
+      volumes:
+      - name: gradio-app-script
+        configMap:
+          name: gradio-app-script
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: gradio-service
+  namespace: gradio-mistral7b-inf2
+spec:
+  selector:
+    app: gradio
+  ports:
+  - name: http
+    protocol: TCP
+    port: 7860
+    targetPort: 7860
+  type: ClusterIP
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: gradio-app-script
+  namespace: gradio-mistral7b-inf2
+data:
+  gradio-app-mistral7b-inf2.py: |
+    import gradio as gr
+    import requests
+    import os
+
+
+    # Constants for model endpoint and service name
+    model_endpoint = os.environ.get("MODEL_ENDPOINT", "/infer")
+    service_name = os.environ.get("SERVICE_NAME", "http://localhost:8000")
+
+    # Function to generate text
+    def generate_text(message, history):
+        prompt = message
+
+        # Create the URL for the inference
+        url = f"{service_name}{model_endpoint}"
+
+        try:
+            # Send the request to the model service
+            response = requests.get(url, params={"sentence": prompt}, timeout=180)
+            response.raise_for_status()  # Raise an exception for HTTP errors
+            prompt_to_replace = "[INST]" + prompt + "[/INST]"
+
+            # Removing the original prompt with instruction set from the output
+            text = response.text.replace(prompt_to_replace, "", 1).strip('["]?\n')
+            # remove '<s>' strikethrough markdown
+            if text.startswith("<s>"):
+                text = text.replace("<s>", "", 1)
+
+            text = text.replace("</s>", "", 1)
+
+            answer_only = text
+
+            # Safety filter to remove harmful or inappropriate content
+            answer_only = filter_harmful_content(answer_only)
+            return answer_only
+        except requests.exceptions.RequestException as e:
+            # Handle any request exceptions (e.g., connection errors)
+            return f"AI: Error: {str(e)}"
+
+
+    # Define the safety filter function (you can implement this as needed)
+    def filter_harmful_content(text):
+        # TODO: Implement a safety filter to remove any harmful or inappropriate content from the text
+
+        # For now, simply return the text as-is
+        return text
+
+
+    # Define the Gradio ChatInterface
+    chat_interface = gr.ChatInterface(
+        generate_text,
+        chatbot=gr.Chatbot(height=300),
+        textbox=gr.Textbox(placeholder="Ask me a question", container=False, scale=7),
+        title="Mistral AI Chat",
+        description="Ask me any question",
+        theme="soft",
+        examples=["How Big Is Observable Universe", "How to kill a linux process"],
+        cache_examples=False,
+        retry_btn=None,
+        undo_btn="Delete Previous",
+        clear_btn="Clear",
+    )
+
+    # Launch the ChatInterface
+    chat_interface.launch()
diff --git a/gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral.yaml b/gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral.yaml
index 41421084f..122528644 100644
--- a/gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral.yaml
+++ b/gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral.yaml
@@ -31,6 +31,8 @@ spec:
         env_vars:
           MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.2"
           NEURON_CC_FLAGS: "-O1"
+          LD_LIBRARY_PATH: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
+          NEURON_CORES: "2"
       deployments:
         - name: mistral-7b
           autoscaling_config:
@@ -46,8 +48,11 @@ spec:
           ray_actor_options:
             num_cpus: 10
             resources: {"neuron_cores": 2}
+          runtime_env:
+            env_vars:
+              LD_LIBRARY_PATH: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
   rayClusterConfig:
-    rayVersion: '2.11.0'
+    rayVersion: '2.22.0'
     enableInTreeAutoscaling: true
     headGroupSpec:
       headService:
@@ -60,7 +65,7 @@ spec:
         spec:
           containers:
             - name: head
-              image: public.ecr.aws/data-on-eks/ray2.11.0-py310-mistral7b-neuron:latest
+              image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest
               imagePullPolicy: Always # Ensure the image is always pulled when updated
               lifecycle:
                 preStop:
@@ -68,7 +73,7 @@ spec:
                   command: ["/bin/sh", "-c", "ray stop"]
               ports:
                 - containerPort: 6379
-                  name: gcs-server
+                  name: gcs
                 - containerPort: 8265
                   name: dashboard
                 - containerPort: 10001
@@ -86,6 +91,8 @@ spec:
                   cpu: "2"
                   memory: "20G"
               env:
+                - name: LD_LIBRARY_PATH
+                  value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
                 - name: HUGGING_FACE_HUB_TOKEN
                   valueFrom:
                     secretKeyRef:
@@ -108,7 +115,7 @@ spec:
         spec:
          containers:
            - name: worker
-              image: public.ecr.aws/data-on-eks/ray2.11.0-py310-mistral7b-neuron:latest
+              image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest
               imagePullPolicy: Always # Ensure the image is always pulled when updated
               lifecycle:
                 preStop:
@@ -125,6 +132,8 @@ spec:
                   memory: "360G" # All memory of inf2.24xlarge; 24G for daemonset overhead
                   aws.amazon.com/neuron: "6" # All Neuron cores of inf2.24xlarge
               env:
+                - name: LD_LIBRARY_PATH
+                  value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
                 - name: HUGGING_FACE_HUB_TOKEN
                   valueFrom:
                     secretKeyRef:
diff --git a/website/docs/gen-ai/inference/Mistral-7b-inf2.md b/website/docs/gen-ai/inference/Mistral-7b-inf2.md
index d96e41019..9b4b2b487 100644
--- a/website/docs/gen-ai/inference/Mistral-7b-inf2.md
+++ b/website/docs/gen-ai/inference/Mistral-7b-inf2.md
@@ -115,7 +115,7 @@ To deploy the Mistral-7B-Instruct-v0.2 model, it's essential to configure your H
 
 ```bash
 # set the Hugging Face Hub Token as an environment variable. This variable will be substituted when applying the ray-service-mistral.yaml file
-export HUGGING_FACE_HUB_TOKEN=
+export HUGGING_FACE_HUB_TOKEN=$(echo -n "Your-Hugging-Face-Hub-Token-Value" | base64)
 
 cd ./../gen-ai/inference/mistral-7b-rayserve-inf2
 envsubst < ray-service-mistral.yaml| kubectl apply -f -
@@ -125,7 +125,7 @@ Verify the deployment by running the following commands
 
 :::info
 
-The deployment process may take up to 10 to 12 minutes. The Head Pod is expected to be ready within 2 to 3 minutes, while the Ray Serve worker pod may take up to 10 minutes for image retrieval and Model deployment from Huggingface.
+The deployment process may take up to 10 minutes. The Head Pod is expected to be ready within 2 to 3 minutes, while the Ray Serve worker pod may take up to 10 minutes for image retrieval and Model deployment from Huggingface.
 
 :::
 
@@ -152,6 +152,8 @@ mistral-service-serve-svc NodePort 172.20.109.223 8000:31679
 
 For the Ray dashboard, you can port-forward these ports individually to access the web UI locally using localhost.
 
+
+
 ```bash
 kubectl -n mistral port-forward svc/mistral-service 8265:8265
 ```
@@ -170,42 +172,42 @@ You can monitor Serve deployment and the Ray Cluster deployment including resour
 ![RayServe Cluster](img/ray-serve-inf2-mistral-cluster.png)
 
 ## Deploying the Gradio WebUI App
-Discover how to create a user-friendly chat interface using [Gradio](https://www.gradio.app/) that integrates seamlessly with deployed models.
-Let's move forward with setting up the Gradio app as a Kubernetes deployment, utilizing a Docker container. This setup will enable interaction with the Mistral model, which is deployed using RayServe.
+[Gradio](https://www.gradio.app/) Web UI is used to interact with the Mistral-7B inference service deployed on EKS clusters using Inf2 instances.
+The Gradio UI communicates internally with the Mistral service through its cluster-local service name and port (`mistral-serve-svc.mistral.svc.cluster.local:8000`).
+We have created a base Docker image (`gen-ai/inference/gradio-ui/Dockerfile-gradio-base`) for the Gradio app, which can be reused with any model inference service.
+This image is published on [Public ECR](https://gallery.ecr.aws/data-on-eks/gradio-web-app-base).
 
-### Build the Gradio app docker container
+#### Steps to Deploy a Gradio App:
 
-First, lets build the docker container for the client app.
+The following YAML script (`gen-ai/inference/mistral-7b-rayserve-inf2/gradio-ui.yaml`) creates a dedicated namespace, deployment, service, and a ConfigMap that holds the model client script.
+To deploy this, execute:
 
 ```bash
-cd ../gradio-ui
-docker build --platform=linux/amd64 \
-    -t gradio-app:mistral \
-    --build-arg GRADIO_APP="gradio-app-mistral.py" \
-    .
+cd gen-ai/inference/mistral-7b-rayserve-inf2/ +kubectl apply -f gradio-ui.yaml ``` -### Deploy the Gradio container +**Verification Steps:** +Run the following commands to verify the deployment, service, and ConfigMap: -Deploy the Gradio app as a container on localhost using docker: +```bash +kubectl get deployments -n gradio-mistral7b-inf2 +kubectl get services -n gradio-mistral7b-inf2 -```bash -docker run --rm -it -p 7860:7860 -p 8000:8000 gradio-app:mistral +kubectl get configmaps -n gradio-mistral7b-inf2 ``` -:::info -If you are not running Docker Desktop on your machine and using something like [finch](https://runfinch.com/) instead then you will need to additional flags for a custom host-to-IP mapping inside the container. +**Port-Forward the Service:** + +Run the port-forward command so that you can access the Web UI locally: + +```bash +kubectl port-forward service/gradio-service 7860:7860 -n gradio-mistral7b-inf2 ``` -docker run --rm -it \ - --add-host ray-service: \ - -e "SERVICE_NAME=http://ray-service:8000" \ - -p 7860:7860 gradio-app:mistral -``` -::: #### Invoke the WebUI @@ -233,21 +235,16 @@ Below screenshots provide some examples of the model response based on different Finally, we'll provide instructions for cleaning up and deprovisioning the resources when they are no longer needed. -**Step1:** Delete Gradio Container +**Step1:** Delete Gradio App and mistral Inference deployment -`Ctrl-c` on the localhost terminal window where `docker run` is running to kill the container running the Gradio app. Optionally clean up the docker image - -```bash -docker rmi gradio-app:mistral -``` -**Step2:** Delete Ray Cluster ```bash -cd ../mistral-7b-rayserve-inf2 +cd gen-ai/inference/mistral-7b-rayserve-inf2 +kubectl delete -f gradio-ui.yaml kubectl delete -f ray-service-mistral.yaml ``` -**Step3:** Cleanup the EKS Cluster +**Step2:** Cleanup the EKS Cluster This script will cleanup the environment using `-target` option to ensure all the resources are deleted in correct order. ```bash