Merge pull request #58 from valentinfrlch/dev-custom-openai
Custom Provider, delete tmp folder, convert static GIFs
valentinfrlch authored Aug 21, 2024
2 parents 497fc64 + d247e63 commit 80862e4
Showing 11 changed files with 169 additions and 91 deletions.
17 changes: 8 additions & 9 deletions README.md
@@ -3,7 +3,7 @@
</p>
<p align=center>
<img src="https://img.shields.io/badge/HACS-Custom-orange.svg?style=for-the-badge">
<img src="https://img.shields.io/badge/version-1.0.3-blue">
<img src="https://img.shields.io/badge/version-1.1.0-blue">
<a href="https://github.com/valentinfrlch/ha-llmvision/issues">
<img src="https://img.shields.io/maintenance/yes/2024.svg">
<img alt="Issues" src="https://img.shields.io/github/issues/valentinfrlch/ha-llmvision?color=0088ff"/>
@@ -31,20 +31,19 @@
<br>

**LLM Vision** is a Home Assistant integration to analyze images, videos and camera feeds using the vision capabilities of multimodal LLMs.
Supported providers are OpenAI, Anthropic, Google Gemini, [LocalAI](https://github.com/mudler/LocalAI) and [Ollama](https://ollama.com/).
Supported providers are OpenAI, Anthropic, Google Gemini, [LocalAI](https://github.com/mudler/LocalAI), [Ollama](https://ollama.com/) and any OpenAI compatible API.

## Features
- Compatible with OpenAI, Anthropic Claude, Google Gemini, [LocalAI](https://github.com/mudler/LocalAI) and [Ollama](https://ollama.com/)
- Compatible with OpenAI, Anthropic Claude, Google Gemini, [LocalAI](https://github.com/mudler/LocalAI), [Ollama](https://ollama.com/) and custom OpenAI compatible APIs
- Takes images and video from camera entities as input
- Takes local image and video files as input
- Images can be downscaled for faster processing
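
For example, the analyzer can be invoked like any other Home Assistant service. A minimal sketch in Python (parameter names assumed from `services.yaml`; entity IDs and values are illustrative, see the docs for the full schema):

```python
# Sketch: call image_analyzer from Python and read back the analysis text
response = await hass.services.async_call(
    "llmvision",
    "image_analyzer",
    {
        "provider": "OpenAI",                   # any provider you configured
        "model": "gpt-4o-mini",                 # optional; provider default otherwise
        "image_entity": ["camera.front_door"],  # camera/image entities as input
        "message": "Describe what you see.",
        "target_width": 1280,                   # downscale for faster processing
    },
    blocking=True,
    return_response=True,
)
```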

## Resources
Check the docs for detailed instructions on how to set up LLM Vision and each of the supported providers as well as usage examples and service call parameters:
Check the docs for detailed instructions on how to set up LLM Vision and each of the supported providers, get inspiration from examples, or join the discussion on the Home Assistant Community.

<a href="https://llm-vision.gitbook.io/getting-started"><img src="https://img.shields.io/badge/Documentation-blue?style=for-the-badge&logo=gitbook&logoColor=white&color=18bcf2"/></a>
<a href="https://llm-vision.gitbook.io/getting-started"><img src="https://img.shields.io/badge/Documentation-blue?style=for-the-badge&logo=gitbook&logoColor=white&color=18bcf2"/> </a><a href="https://llm-vision.gitbook.io/examples/"><img src="https://img.shields.io/badge/Examples-blue?style=for-the-badge&logo=gitbook&logoColor=black&color=39ffc2"/></a> </a><a href="https://llm-vision.gitbook.io/examples/"><img src="https://img.shields.io/badge/Community-blue?style=for-the-badge&logo=homeassistant&logoColor=white&color=03a9f4"/></a>

Check [📖 Examples](https://llm-vision.gitbook.io/examples/) on how you can integrate llmvision into your Home Assistant setup or join the [🗨️ discussion](https://community.home-assistant.io/t/gpt-4o-vision-capabilities-in-home-assistant/729241) on the Home Assistant Community.

## Installation
[![Open a repository inside the Home Assistant Community Store.](https://my.home-assistant.io/badges/hacs_repository.svg)](https://my.home-assistant.io/redirect/hacs_repository/?owner=valentinfrlch&repository=ha-llmvision&category=Integration)
@@ -69,9 +68,9 @@ logger:
> These are planned features and ideas. They are subject to change and may not be implemented in the order listed or at all.
1. **New Provider**: NVIDIA ChatRTX
2. **New Provider**: Custom (OpenAI API compatible) Providers
3. **Animation Support**: Support for animated GIFs
4. **HACS**: Include in HACS default
2. **Animation Support**: Support for animated GIFs
3. **HACS**: Include in HACS default
4. [x] ~~**New Provider**: Custom (OpenAI API compatible) Providers~~
5. [x] ~~**Feature**: HTTPS support for LocalAI and Ollama~~
6. [x] ~~**Feature**: Support for video files~~
7. [x] ~~**Feature**: Analyze Frigate Recordings using frigate's `event_id`~~
8 changes: 8 additions & 0 deletions custom_components/llmvision/__init__.py
@@ -10,6 +10,8 @@
CONF_OLLAMA_IP_ADDRESS,
CONF_OLLAMA_PORT,
CONF_OLLAMA_HTTPS,
CONF_CUSTOM_OPENAI_ENDPOINT,
CONF_CUSTOM_OPENAI_API_KEY,
MODEL,
PROVIDER,
MAXTOKENS,
@@ -45,6 +47,8 @@ async def async_setup_entry(hass, entry):
ollama_ip_address = entry.data.get(CONF_OLLAMA_IP_ADDRESS)
ollama_port = entry.data.get(CONF_OLLAMA_PORT)
ollama_https = entry.data.get(CONF_OLLAMA_HTTPS)
custom_openai_endpoint = entry.data.get(CONF_CUSTOM_OPENAI_ENDPOINT)
custom_openai_api_key = entry.data.get(CONF_CUSTOM_OPENAI_API_KEY)

# Ensure DOMAIN exists in hass.data
if DOMAIN not in hass.data:
@@ -63,6 +67,8 @@ async def async_setup_entry(hass, entry):
CONF_OLLAMA_IP_ADDRESS: ollama_ip_address,
CONF_OLLAMA_PORT: ollama_port,
CONF_OLLAMA_HTTPS: ollama_https,
CONF_CUSTOM_OPENAI_ENDPOINT: custom_openai_endpoint,
CONF_CUSTOM_OPENAI_API_KEY: custom_openai_api_key
}.items()
if value is not None
})
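
The `if value is not None` filter above is the standard pattern for persisting only the options the user actually configured. In isolation (hypothetical data for illustration):

```python
# Drop unset (None) entries before storing the provider settings
settings = {"openai_api_key": "sk-...", "ollama_port": None}
filtered = {key: value for key, value in settings.items() if value is not None}
assert filtered == {"openai_api_key": "sk-..."}
```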
@@ -104,6 +110,8 @@ def _default_model(self, provider):
return "gpt-4-vision-preview"
elif provider == "Ollama":
return "llava"
elif provider == "Custom OpenAI":
return "gpt-4o-mini"


def setup(hass, config):
68 changes: 67 additions & 1 deletion custom_components/llmvision/config_flow.py
@@ -13,6 +13,8 @@
CONF_OLLAMA_IP_ADDRESS,
CONF_OLLAMA_PORT,
CONF_OLLAMA_HTTPS,
CONF_CUSTOM_OPENAI_API_KEY,
CONF_CUSTOM_OPENAI_ENDPOINT,
VERSION_ANTHROPIC,
)
import voluptuous as vol
@@ -120,6 +122,39 @@ async def openai(self):
_LOGGER.error("Could not connect to OpenAI server.")
raise ServiceValidationError("handshake_failed")

async def custom_openai(self):
self._validate_provider()
_LOGGER.debug(f"Splits: {len(self.user_input[CONF_CUSTOM_OPENAI_ENDPOINT].split(":"))}")
# URL with port
try:
if len(self.user_input[CONF_CUSTOM_OPENAI_ENDPOINT].split(":")) > 2:
protocol = self.user_input[CONF_CUSTOM_OPENAI_ENDPOINT].split(
"://")[0]
base_url = self.user_input[CONF_CUSTOM_OPENAI_ENDPOINT].split(
"://")[1].split("/")[0]
port = ":" + self.user_input[CONF_CUSTOM_OPENAI_ENDPOINT].split(":")[
1].split("/")[0]
# URL without port
else:
protocol = self.user_input[CONF_CUSTOM_OPENAI_ENDPOINT].split(
"://")[0]
base_url = self.user_input[CONF_CUSTOM_OPENAI_ENDPOINT].split(
"://")[1].split("/")[0]
port = ""
endpoint = "/v1/models"
header = {'Content-type': 'application/json',
'Authorization': 'Bearer ' + self.user_input[CONF_CUSTOM_OPENAI_API_KEY]}
except Exception as e:
_LOGGER.error(f"Could not parse endpoint: {e}")
raise ServiceValidationError("endpoint_parse_failed")

_LOGGER.debug(
f"Connecting to: [protocol: {protocol}, base_url: {base_url}, port: {port}, endpoint: {endpoint}]")

if not await self._handshake(base_url=base_url, port=port, protocol=protocol, endpoint=endpoint, header=header):
_LOGGER.error("Could not connect to Custom OpenAI server.")
raise ServiceValidationError("handshake_failed")

async def anthropic(self):
self._validate_provider()
if not await self._validate_api_key(self.user_input[CONF_ANTHROPIC_API_KEY]):
@@ -149,6 +184,8 @@ def get_configured_providers(self):
providers.append("LocalAI")
if CONF_OLLAMA_IP_ADDRESS in self.hass.data[DOMAIN] and CONF_OLLAMA_PORT in self.hass.data[DOMAIN]:
providers.append("Ollama")
if CONF_CUSTOM_OPENAI_ENDPOINT in self.hass.data[DOMAIN]:
providers.append("Custom OpenAI")
return providers


@@ -167,6 +204,7 @@ async def handle_provider(self, provider, configured_providers):
"Google": self.async_step_google,
"Ollama": self.async_step_ollama,
"LocalAI": self.async_step_localai,
"Custom OpenAI": self.async_step_custom_openai,
}

step_method = provider_steps.get(provider)
Expand All @@ -180,7 +218,7 @@ async def async_step_user(self, user_input=None):
data_schema = vol.Schema({
vol.Required("provider", default="OpenAI"): selector({
"select": {
"options": ["OpenAI", "Anthropic", "Google", "Ollama", "LocalAI"],
"options": ["OpenAI", "Anthropic", "Google", "Ollama", "LocalAI", "Custom OpenAI"],
"mode": "dropdown",
"sort": False,
"custom_value": False
@@ -339,3 +377,31 @@ async def async_step_google(self, user_input=None):
step_id="google",
data_schema=data_schema,
)

async def async_step_custom_openai(self, user_input=None):
data_schema = vol.Schema({
vol.Required(CONF_CUSTOM_OPENAI_ENDPOINT): str,
vol.Optional(CONF_CUSTOM_OPENAI_API_KEY): str,
})

if user_input is not None:
# save provider to user_input
user_input["provider"] = self.init_info["provider"]
validator = Validator(self.hass, user_input)
try:
await validator.custom_openai()
# add the mode to user_input
user_input["provider"] = self.init_info["provider"]
return self.async_create_entry(title="LLM Vision Custom OpenAI", data=user_input)
except ServiceValidationError as e:
_LOGGER.error(f"Validation failed: {e}")
return self.async_show_form(
step_id="custom_openai",
data_schema=data_schema,
errors={"base": "handshake_failed"}
)

return self.async_show_form(
step_id="custom_openai",
data_schema=data_schema,
)
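
For reference, the hand-rolled endpoint parsing in `custom_openai()` can be reproduced with the standard library, assuming the endpoint always carries a scheme as the form text requires (`http(s)://baseURL(:port)/some/endpoint`). A sketch, not the committed implementation:

```python
from urllib.parse import urlparse

def split_endpoint(endpoint: str) -> tuple[str, str, str]:
    """'https://llm.local:8080/v1' -> ('https', 'llm.local', ':8080')"""
    parsed = urlparse(endpoint)
    protocol = parsed.scheme
    base_url = parsed.hostname or ""
    port = f":{parsed.port}" if parsed.port else ""
    return protocol, base_url, port
```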
4 changes: 2 additions & 2 deletions custom_components/llmvision/manifest.json
@@ -6,5 +6,5 @@
"documentation": "https://github.com/valentinfrlch/ha-llmvision",
"iot_class": "cloud_polling",
"issue_tracker": "https://github.com/valentinfrlch/ha-llmvision/issues",
"version": "1.0.4"
}
"version": "1.1.0"
}
18 changes: 15 additions & 3 deletions custom_components/llmvision/media_handlers.py
@@ -37,6 +37,7 @@ async def resize_image(self, target_width, image_path=None, image_data=None, img
img = await self.hass.loop.run_in_executor(None, Image.open, image_path)
with img:
# Check if the image is a GIF and convert if necessary
_LOGGER.debug(f"Image format: {img.format}")
if img.format == 'GIF':
# Convert GIF to RGB
img = img.convert('RGB')
@@ -58,6 +59,10 @@
img_byte_arr.write(image_data)
img = await self.hass.loop.run_in_executor(None, Image.open, img_byte_arr)
with img:
_LOGGER.debug(f"Image format: {img.format}")
if img.format == 'GIF':
# Convert GIF to RGB
img = img.convert('RGB')
# calculate new height based on aspect ratio
width, height = img.size
aspect_ratio = width / height
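
The GIF branches added above rely on a Pillow detail: `convert('RGB')` flattens only the currently loaded frame, which for a freshly opened GIF is frame 0, so animated GIFs become a static first-frame image. In isolation:

```python
from io import BytesIO
from PIL import Image

def first_frame_as_rgb(gif_bytes: bytes) -> Image.Image:
    # Sketch: static RGB image from a (possibly animated) GIF
    img = Image.open(BytesIO(gif_bytes))
    img.seek(0)  # make frame 0 current; a no-op for static GIFs
    return img.convert("RGB")
```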
@@ -131,8 +136,8 @@ async def add_images(self, image_entities, image_paths, target_width, include_fi
return self.client

async def add_videos(self, video_paths, event_ids, interval, target_width, include_filename):
tmp_clips_dir = f"config/custom_components/{DOMAIN}/tmp_clips"
tmp_frames_dir = f"config/custom_components/{DOMAIN}/tmp_frames"
tmp_clips_dir = f"/config/custom_components/{DOMAIN}/tmp_clips"
tmp_frames_dir = f"/config/custom_components/{DOMAIN}/tmp_frames"
if not video_paths:
video_paths = []
"""Wrapper for client.add_frame for videos"""
@@ -196,7 +201,14 @@ async def add_videos(self, video_paths, event_ids, interval, target_width, inclu
# Clean up tmp dirs
try:
await self.hass.loop.run_in_executor(None, shutil.rmtree, tmp_clips_dir)
_LOGGER.info(
f"Deleted tmp folder: {tmp_clips_dir}")
except FileNotFoundError as e:
_LOGGER.error(f"Failed to delete tmp folder: {e}")
try:
await self.hass.loop.run_in_executor(None, shutil.rmtree, tmp_frames_dir)
_LOGGER.info(
f"Deleted tmp folder: {tmp_frames_dir}")
except FileNotFoundError as e:
pass
_LOGGER.error(f"Failed to delete tmp folders: {e}")
return self.client
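
On the cleanup change: both `try`/`except` blocks above now log instead of crashing when a tmp folder is already gone. A more compact alternative (not what this commit does) is to let `shutil` swallow the error:

```python
import shutil

# Sketch: delete a tmp dir, tolerating its absence (path is illustrative)
shutil.rmtree("/config/custom_components/llmvision/tmp_clips", ignore_errors=True)
```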
29 changes: 26 additions & 3 deletions custom_components/llmvision/request_handlers.py
@@ -13,12 +13,16 @@
CONF_OLLAMA_IP_ADDRESS,
CONF_OLLAMA_PORT,
CONF_OLLAMA_HTTPS,
CONF_CUSTOM_OPENAI_ENDPOINT,
CONF_CUSTOM_OPENAI_API_KEY,
VERSION_ANTHROPIC,
ENDPOINT_OPENAI,
ERROR_OPENAI_NOT_CONFIGURED,
ERROR_ANTHROPIC_NOT_CONFIGURED,
ERROR_GOOGLE_NOT_CONFIGURED,
ERROR_LOCALAI_NOT_CONFIGURED,
ERROR_OLLAMA_NOT_CONFIGURED,
ERROR_CUSTOM_OPENAI_NOT_CONFIGURED,
ERROR_NO_IMAGE_INPUT
)

@@ -103,16 +107,29 @@ async def make_request(self, call):
ip_address=ip_address,
port=port,
https=https)
elif call.provider == 'Custom OpenAI':
api_key = self.hass.data.get(DOMAIN).get(
CONF_CUSTOM_OPENAI_API_KEY, "")
endpoint = self.hass.data.get(DOMAIN).get(CONF_CUSTOM_OPENAI_ENDPOINT)

# Additional debug logging
_LOGGER.debug(f"Data from DOMAIN: {self.hass.data.get(DOMAIN)}")
_LOGGER.debug(f"API Key: {api_key}")
_LOGGER.debug(f"Endpoint: {endpoint}")

model = call.model
self._validate_call(provider=call.provider,
api_key=api_key,
base64_images=self.base64_images)
response_text = await self.openai(model=model, api_key=api_key, endpoint=endpoint)
return {"response_text": response_text}

def add_frame(self, base64_image, filename):
self.base64_images.append(base64_image)
self.filenames.append(filename)

# Request Handlers
async def openai(self, model, api_key):
from .const import ENDPOINT_OPENAI
async def openai(self, model, api_key, endpoint=ENDPOINT_OPENAI):
# Set headers and payload
headers = {'Content-type': 'application/json',
'Authorization': 'Bearer ' + api_key}
Expand All @@ -138,7 +155,7 @@ async def openai(self, model, api_key):
)

response = await self._post(
url=ENDPOINT_OPENAI, headers=headers, data=data)
url=endpoint, headers=headers, data=data)

response_text = response.get(
"choices")[0].get("message").get("content")
@@ -301,6 +318,9 @@ async def ollama(self, model, ip_address, port, https):
async def _post(self, url, headers, data):
"""Post data to url and return response data"""
_LOGGER.info(f"Request data: {sanitize_data(data)}")
_LOGGER.debug(
f"URL type: {type(url)}, Headers type: {type(headers)}, Data type: {type(data)}")

try:
response = await self.session.post(url, headers=headers, json=data)
except Exception as e:
@@ -352,6 +372,9 @@ def _validate_call(self, provider, api_key, base64_images, ip_address=None, port
elif provider == 'Ollama':
if not ip_address or not port:
raise ServiceValidationError(ERROR_OLLAMA_NOT_CONFIGURED)
elif provider == 'Custom OpenAI':
if not api_key:
raise ServiceValidationError(ERROR_CUSTOM_OPENAI_NOT_CONFIGURED)
# Check media input
if base64_images == []:
raise ServiceValidationError(ERROR_NO_IMAGE_INPUT)
1 change: 1 addition & 0 deletions custom_components/llmvision/services.yaml
@@ -15,6 +15,7 @@ image_analyzer:
- 'Google'
- 'Ollama'
- 'LocalAI'
- 'Custom OpenAI'
model:
name: Model
required: false
8 changes: 8 additions & 0 deletions custom_components/llmvision/strings.json
@@ -43,6 +43,14 @@
"data": {
"google_api_key": "Your API key"
}
},
"custom_openai": {
"title": "Configure Custom OpenAI provider",
"description": "**Important**: Only works if the API is compatible with OpenAI's API. If the API doesn't require an API key, leave it empty. The endpoint should have the following format: `http(s)://baseURL(:port)/some/endpoint`",
"data": {
"custom_openai_endpoint": "Custom Endpoint",
"custom_openai_api_key": "Your API key"
}
}
},
"error": {