diff --git a/.github/actions/spelling/allow.txt b/.github/actions/spelling/allow.txt
index e1b8afde54..27f029ca16 100644
--- a/.github/actions/spelling/allow.txt
+++ b/.github/actions/spelling/allow.txt
@@ -555,6 +555,7 @@ alloydb
 antiword
 apikey
 apikeys
+apk
 appspot
 appuser
 apredict
@@ -948,6 +949,7 @@ newaxis
 newaxisngram
 nfcorpus
 nfl
+nginx
 ngram
 ngrams
 nlp
diff --git a/gemini/multimodal-live-api/websocket-demo-app/Dockerfile b/gemini/multimodal-live-api/websocket-demo-app/Dockerfile
new file mode 100644
index 0000000000..5fc1ef8267
--- /dev/null
+++ b/gemini/multimodal-live-api/websocket-demo-app/Dockerfile
@@ -0,0 +1,20 @@
+FROM nginx:alpine
+
+# install Python 3, pip and supervisor (supervisord is the container's CMD below)
+RUN apk add --no-cache python3=3.12.8-r1 py3-pip=24.0-r2 supervisor=4.2.5-r5
+
+# copy the front end into the directory nginx serves static files from
+COPY frontend/. /usr/share/nginx/html
+
+# copy backend
+COPY backend/. /app
+
+# install the backend's Python dependencies (absolute path, so this does not depend on the implicit WORKDIR)
+RUN pip3 install --no-cache-dir --break-system-packages -r /app/requirements.txt
+
+COPY supervisord.conf /etc/supervisor/supervisord.conf
+COPY nginx.conf /etc/nginx/nginx.conf
+
+EXPOSE 8000
+
+CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"]
\ No newline at end of file
diff --git a/gemini/multimodal-live-api/websocket-demo-app/README.md b/gemini/multimodal-live-api/websocket-demo-app/README.md
index c635e0423e..6213d27088 100644
--- a/gemini/multimodal-live-api/websocket-demo-app/README.md
+++ b/gemini/multimodal-live-api/websocket-demo-app/README.md
@@ -165,3 +165,55 @@ You can set up this app locally or via Cloud Shell.
    - Text input: You can write a text prompt to send to the model by entering your message in the box and pressing the send arrow. The model will then respond via audio (turn up your volume!).
    - Voice input: Press the pink microphone button and start speaking. The model will respond via audio. If you would like to mute your microphone, press the button with a slash through the microphone.
- Video input: The model will also capture your camera input and send it to Gemini. You can ask questions about current or previous video footage. For more details on how this works, visit the [documentation page for the Multimodal Live API](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/multimodal-live). + +### Setup in Cloud Run + +1. Clone the repository and `cd` into the app directory: + + ```sh + git clone https://github.com/GoogleCloudPlatform/generative-ai.git + cd generative-ai/gemini/multimodal-live-api/websocket-demo-app + ``` + +1. Modify the frontend code to point the WebSocket endpoint to the same container: + + - Open the `script.js` file and, on line 9 (`const PROXY_URL = "wss://[THE_URL_YOU_COPIED_WITHOUT_HTTP]";`), replace the `PROXY_URL` value with `/ws`. It should look like: `const PROXY_URL = "/ws";`. Note the absence of the second "s" in "wss": "ws" indicates a non-secure WebSocket connection. There is also no host part, because the frontend and the backend run in the same container. + - Right below, on line 10, update `PROJECT_ID` with your Google Cloud project ID. + - Save the changes you've made to `script.js`. + +1. Deploy the code to Cloud Run using the following `gcloud` command: + + ```sh + gcloud run deploy --project=YOUR-PROJECT-ID \ + --region=us-central1 \ + --source=./ \ + --allow-unauthenticated \ + --port=8000 \ + gemini-live-demo + ``` + +1. If it runs successfully, the previous command will output a link to the deployment. Copy the link into your browser and navigate to the demo app UI. + +1. Get your Google Cloud access token: Run the following commands in a terminal with gcloud installed to set your project and to retrieve your access token. + + ```sh + gcloud components update + gcloud components install beta + gcloud config set project YOUR-PROJECT-ID + gcloud auth print-access-token + ``` + +1. Copy the access token from the previous step into the UI that you have open in your browser. + +1. 
Enter the model ID in the UI: + Replace `YOUR-PROJECT-ID` in the input with your Google Cloud Project ID. + +1. Connect and interact with the demo: + + - After entering your Access Token and Model ID, press the connect button to connect your web app. Now you should be able to interact with Gemini 2.0 with the Multimodal Live API. + +1. To interact with the app, you can do the following: + + - Text input: You can write a text prompt to send to the model by entering your message in the box and pressing the send arrow. The model will then respond via audio (turn up your volume!). + - Voice input: Press the pink microphone button and start speaking. The model will respond via audio. If you would like to mute your microphone, press the button with a slash through the microphone. + - Video input: The model will also capture your camera input and send it to Gemini. You can ask questions about current or previous video footage. For more details on how this works, visit the [documentation page for the Multimodal Live API](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/multimodal-live). 
diff --git a/gemini/multimodal-live-api/websocket-demo-app/nginx.conf b/gemini/multimodal-live-api/websocket-demo-app/nginx.conf
new file mode 100644
index 0000000000..a2e689a6f4
--- /dev/null
+++ b/gemini/multimodal-live-api/websocket-demo-app/nginx.conf
@@ -0,0 +1,48 @@
+worker_processes auto;
+
+events {
+    worker_connections 1024;
+}
+
+http {
+    include mime.types;
+    default_type application/octet-stream;
+
+    sendfile on;
+
+    keepalive_timeout 65;
+
+    server {
+        # single public port of the container; keep in sync with EXPOSE in the Dockerfile
+        listen 8000;
+        server_name localhost;
+        # static frontend; requests that match no file or directory fall back to /index.html
+        location / {
+            root /usr/share/nginx/html;
+            index index.html index.htm;
+            try_files $uri $uri/ /index.html;
+        }
+
+        # WebSocket endpoint, proxied to 127.0.0.1:8080 (presumably where /app/main.py listens - confirm)
+        location /ws {
+            proxy_pass http://127.0.0.1:8080;
+            # the Upgrade/Connection headers below carry the WebSocket handshake, which requires HTTP/1.1
+            proxy_http_version 1.1;
+            proxy_set_header Upgrade $http_upgrade;
+            proxy_set_header Connection "upgrade";
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+
+            # allow up to an hour of silence on the proxied connection before nginx closes it
+            proxy_read_timeout 3600s;
+            proxy_send_timeout 3600s;
+        }
+
+        error_page 500 502 503 504 /50x.html;
+        location = /50x.html {
+            root /usr/share/nginx/html;
+        }
+    }
+}
\ No newline at end of file
diff --git a/gemini/multimodal-live-api/websocket-demo-app/supervisord.conf b/gemini/multimodal-live-api/websocket-demo-app/supervisord.conf
new file mode 100644
index 0000000000..3c272c5212
--- /dev/null
+++ b/gemini/multimodal-live-api/websocket-demo-app/supervisord.conf
@@ -0,0 +1,11 @@
+[supervisord]
+; run in the foreground instead of daemonizing
+nodaemon=true
+
+[program:nginx]
+command=nginx -g "daemon off;"
+
+[program:websocket]
+; use python3 command (kept on its own line: text after the command would otherwise risk being passed to main.py as arguments)
+command=python3 /app/main.py
+directory=/app
\ No newline at end of file