
feat: Upgrade ray version on mistral7b on inf2 blueprint #550

Merged 1 commit on Jun 4, 2024
66 changes: 0 additions & 66 deletions gen-ai/inference/gradio-ui/gradio-app-mistral.py

This file was deleted.

19 changes: 7 additions & 12 deletions gen-ai/inference/mistral-7b-rayserve-inf2/Dockerfile
@@ -1,5 +1,5 @@
# https://hub.docker.com/layers/rayproject/ray/2.11.0-py310/images/sha256-de798e487b76a8f2412c718c43c5f342b3eb05e0705a71325102904cd27c3613?context=explore
-FROM rayproject/ray:2.11.0-py310
+FROM rayproject/ray:2.22.0-py310

# Maintainer label
LABEL maintainer="DoEKS"
@@ -15,27 +15,22 @@ RUN . /etc/os-release && \
    sudo echo "deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main" > /etc/apt/sources.list.d/neuron.list && \
    sudo wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - && \
    sudo apt-get update -y && \
-   sudo apt-get install git -y && \
-   sudo apt-get install aws-neuronx-dkms=2.* -y && \
-   sudo apt-get install aws-neuronx-collectives=2.* -y && \
-   sudo apt-get install aws-neuronx-runtime-lib=2.* -y && \
-   sudo apt-get install aws-neuronx-tools=2.* -y
+   sudo apt-get install aws-neuronx-dkms aws-neuronx-collectives=2.* aws-neuronx-runtime-lib=2.* aws-neuronx-tools=2.* -y && \
+   sudo apt-get clean

# Switch back to a non-root user for the subsequent commands
USER $USER

# Set pip repository pointing to the Neuron repository and install required Python packages
# huggingface_hub is needed to login to huggingface repo for the model access
RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com && \
-   pip install neuronx-cc==2.* torch-neuronx==1.13.* torchvision transformers-neuronx sentencepiece transformers && \
-   pip install starlette==0.34.0 && \
-   pip install huggingface_hub
+   pip install wget awscli regex neuronx-cc==2.* torch-neuronx torchvision transformers-neuronx sentencepiece transformers

# Add Neuron path to PATH
ENV PATH /opt/aws/neuron/bin:$PATH

# Set LD_LIBRARY_PATH to include the directory with libpython3.10.so.1.0
ENV LD_LIBRARY_PATH /home/ray/anaconda3/lib:$LD_LIBRARY_PATH

WORKDIR /serve_app

COPY ray_serve_mistral.py /serve_app/ray_serve_mistral.py
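
A quick way to sanity-check the rebuilt image is to import the Neuron packages the pip step above installs. This is a minimal sketch, not part of the PR; the image tag is a placeholder for whatever you build:

# check_neuron.py -- minimal import smoke test for the packages installed above.
# Hypothetical usage (tag is a placeholder): docker run --rm <image-tag> python check_neuron.py
import torch                 # installed as a dependency of torch-neuronx
import torch_neuronx         # PyTorch integration for AWS Neuron
import transformers_neuronx  # Neuron-optimized transformer runtime
import sentencepiece         # tokenizer backend used by Mistral
import transformers

print("torch:", torch.__version__)
print("transformers:", transformers.__version__)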
137 changes: 137 additions & 0 deletions gen-ai/inference/mistral-7b-rayserve-inf2/gradio-ui.yaml
@@ -0,0 +1,137 @@
# gradio-deploy.yaml
apiVersion: v1
kind: Namespace
metadata:
  name: gradio-mistral7b-inf2
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gradio-deployment
  namespace: gradio-mistral7b-inf2
  labels:
    app: gradio
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gradio
  template:
    metadata:
      labels:
        app: gradio
    spec:
      containers:
      - name: gradio
        image: public.ecr.aws/data-on-eks/gradio-web-app-base:latest
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 7860
        resources:
          requests:
            cpu: "512m"
            memory: "2048Mi"
          limits:
            cpu: "1"
            memory: "4096Mi"
        env:
        - name: MODEL_ENDPOINT
          value: "/infer"
        - name: SERVICE_NAME
          value: "http://mistral-serve-svc.mistral.svc.cluster.local:8000"
        volumeMounts:
        - name: gradio-app-script
          mountPath: /app/gradio-app.py
          subPath: gradio-app-mistral7b-inf2.py
      volumes:
      - name: gradio-app-script
        configMap:
          name: gradio-app-script
---
apiVersion: v1
kind: Service
metadata:
  name: gradio-service
  namespace: gradio-mistral7b-inf2
spec:
  selector:
    app: gradio
  ports:
  - name: http
    protocol: TCP
    port: 7860
    targetPort: 7860
  type: ClusterIP
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: gradio-app-script
  namespace: gradio-mistral7b-inf2
data:
  gradio-app-mistral7b-inf2.py: |
    import gradio as gr
    import requests
    import os


    # Model endpoint and service name, injected via the Deployment env vars
    model_endpoint = os.environ.get("MODEL_ENDPOINT", "/infer")
    service_name = os.environ.get("SERVICE_NAME", "http://localhost:8000")

    # Function to generate text
    def generate_text(message, history):
        prompt = message

        # Create the URL for the inference request
        url = f"{service_name}{model_endpoint}"

        try:
            # Send the request to the model service
            response = requests.get(url, params={"sentence": prompt}, timeout=180)
            response.raise_for_status()  # Raise an exception for HTTP errors
            prompt_to_replace = "[INST]" + prompt + "[/INST]"

            # Remove the original prompt and instruction tags from the output
            text = response.text.replace(prompt_to_replace, "", 1).strip('["]?\n')
            # Remove the '<s>' and '</s>' sentinel tokens ('<s>' would render as strikethrough markdown)
            if text.startswith("<s>"):
                text = text.replace("<s>", "", 1)

            text = text.replace("</s>", "", 1)

            answer_only = text

            # Safety filter to remove harmful or inappropriate content
            answer_only = filter_harmful_content(answer_only)
            return answer_only
        except requests.exceptions.RequestException as e:
            # Handle any request exceptions (e.g., connection errors)
            return f"AI: Error: {str(e)}"


    # Define the safety filter function (implement as needed)
    def filter_harmful_content(text):
        # TODO: Implement a safety filter to remove harmful or inappropriate content

        # For now, simply return the text as-is
        return text


    # Define the Gradio ChatInterface
    chat_interface = gr.ChatInterface(
        generate_text,
        chatbot=gr.Chatbot(height=300),
        textbox=gr.Textbox(placeholder="Ask me a question", container=False, scale=7),
        title="Mistral AI Chat",
        description="Ask me any question",
        theme="soft",
        examples=["How Big Is Observable Universe", "How to kill a linux process"],
        cache_examples=False,
        retry_btn=None,
        undo_btn="Delete Previous",
        clear_btn="Clear",
    )

    # Launch the ChatInterface
    chat_interface.launch()
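
For reference, the Serve endpoint this app talks to can also be exercised directly from any pod with cluster DNS access. This sketch mirrors the request shape generate_text() uses, with the URL assembled from the SERVICE_NAME and MODEL_ENDPOINT values above; the prompt string is just an example:

import requests

# Same request shape as generate_text() in the ConfigMap above.
url = "http://mistral-serve-svc.mistral.svc.cluster.local:8000/infer"
response = requests.get(url, params={"sentence": "What is Amazon EKS?"}, timeout=180)
response.raise_for_status()
print(response.text)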
17 changes: 13 additions & 4 deletions gen-ai/inference/mistral-7b-rayserve-inf2/ray-service-mistral.yaml
@@ -31,6 +31,8 @@ spec:
        env_vars:
          MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.2"
          NEURON_CC_FLAGS: "-O1"
+         LD_LIBRARY_PATH: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
+         NEURON_CORES: "2"
        deployments:
        - name: mistral-7b
          autoscaling_config:
@@ -46,8 +48,11 @@
          ray_actor_options:
            num_cpus: 10
            resources: {"neuron_cores": 2}
+           runtime_env:
+             env_vars:
+               LD_LIBRARY_PATH: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
  rayClusterConfig:
-   rayVersion: '2.11.0'
+   rayVersion: '2.22.0'
    enableInTreeAutoscaling: true
    headGroupSpec:
      headService:
@@ -60,15 +65,15 @@
        spec:
          containers:
          - name: head
-           image: public.ecr.aws/data-on-eks/ray2.11.0-py310-mistral7b-neuron:latest
+           image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest
            imagePullPolicy: Always # Ensure the image is always pulled when updated
            lifecycle:
              preStop:
                exec:
                  command: ["/bin/sh", "-c", "ray stop"]
            ports:
            - containerPort: 6379
-             name: gcs-server
+             name: gcs
            - containerPort: 8265
              name: dashboard
            - containerPort: 10001
@@ -86,6 +91,8 @@
                cpu: "2"
                memory: "20G"
            env:
+           - name: LD_LIBRARY_PATH
+             value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
@@ -108,7 +115,7 @@ spec:
        spec:
          containers:
          - name: worker
-           image: public.ecr.aws/data-on-eks/ray2.11.0-py310-mistral7b-neuron:latest
+           image: public.ecr.aws/data-on-eks/ray2.22.0-py310-mistral7b-neuron:latest
            imagePullPolicy: Always # Ensure the image is always pulled when updated
            lifecycle:
              preStop:
@@ -125,6 +132,8 @@
                memory: "360G" # All memory of inf2.24xlarge; 24G for daemonset overhead
                aws.amazon.com/neuron: "6" # All Neuron cores of inf2.24xlarge
            env:
+           - name: LD_LIBRARY_PATH
+             value: "/home/ray/anaconda3/lib:$LD_LIBRARY_PATH"
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
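
A note on the new env vars: ray_serve_mistral.py itself is not part of this diff, but the MODEL_ID, NEURON_CORES, and NEURON_CC_FLAGS values added above suggest the serve script consumes them roughly along these lines. This is a hypothetical sketch; names and defaults are assumptions, not the actual code:

import os

# Hypothetical sketch only; the real ray_serve_mistral.py is not shown in this PR.
model_id = os.environ.get("MODEL_ID", "mistralai/Mistral-7B-Instruct-v0.2")
neuron_cores = int(os.environ.get("NEURON_CORES", "2"))  # Neuron cores per replica
cc_flags = os.environ.get("NEURON_CC_FLAGS", "-O1")      # Neuron compiler optimization level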