diff --git a/README.md b/README.md index 886d185..6592b31 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@

- + Issues @@ -31,20 +31,19 @@
**LLM Vision** is a Home Assistant integration to analyze images, videos and camera feeds using the vision capabilities of multimodal LLMs. -Supported providers are OpenAI, Anthropic, Google Gemini, [LocalAI](https://github.com/mudler/LocalAI) and [Ollama](https://ollama.com/). +Supported providers are OpenAI, Anthropic, Google Gemini, [LocalAI](https://github.com/mudler/LocalAI), [Ollama](https://ollama.com/) and any OpenAI compatible API. ## Features -- Compatible with OpenAI, Anthropic Claude, Google Gemini, [LocalAI](https://github.com/mudler/LocalAI) and [Ollama](https://ollama.com/) +- Compatible with OpenAI, Anthropic Claude, Google Gemini, [LocalAI](https://github.com/mudler/LocalAI), [Ollama](https://ollama.com/) and custom OpenAI compatible APIs - Takes images and video from camera entities as input - Takes local image and video files as input - Images can be downscaled for faster processing ## Resources -Check the docs for detailed instructions on how to set up LLM Vision and each of the supported providers as well as usage examples and service call parameters: +Check the docs for detailed instructions on how to set up LLM Vision and each of the supported providers, get inspiration from examples or join the discussion on the Home Assistant Community. -
+ -Check [📖 Examples](https://llm-vision.gitbook.io/examples/) on how you can integrate llmvision into your Home Assistant setup or join the [🗨️ discussion](https://community.home-assistant.io/t/gpt-4o-vision-capabilities-in-home-assistant/729241) on the Home Assistant Community. ## Installation [![Open a repository inside the Home Assistant Community Store.](https://my.home-assistant.io/badges/hacs_repository.svg)](https://my.home-assistant.io/redirect/hacs_repository/?owner=valentinfrlch&repository=ha-llmvision&category=Integration) @@ -69,9 +68,9 @@ logger: > These are planned features and ideas. They are subject to change and may not be implemented in the order listed or at all. 1. **New Provider**: NVIDIA ChatRTX -2. **New Provider**: Custom (OpenAI API compatible) Providers -3. **Animation Support**: Support for animated GIFs -4. **HACS**: Include in HACS default +2. **Animation Support**: Support for animated GIFs +3. **HACS**: Include in HACS default +4. [x] ~~**New Provider**: Custom (OpenAI API compatible) Providers~~ 5. [x] ~~**Feature**: HTTPS support for LocalAI and Ollama~~ 6. [x] ~~**Feature**: Support for video files~~ 7. [x] ~~**Feature**: Analyze Frigate Recordings using frigate's `event_id`~~ diff --git a/custom_components/llmvision/__init__.py b/custom_components/llmvision/__init__.py index bf806dc..e81bfc5 100644 --- a/custom_components/llmvision/__init__.py +++ b/custom_components/llmvision/__init__.py @@ -10,6 +10,8 @@ CONF_OLLAMA_IP_ADDRESS, CONF_OLLAMA_PORT, CONF_OLLAMA_HTTPS, + CONF_CUSTOM_OPENAI_ENDPOINT, + CONF_CUSTOM_OPENAI_API_KEY, MODEL, PROVIDER, MAXTOKENS, @@ -45,6 +47,8 @@ async def async_setup_entry(hass, entry): ollama_ip_address = entry.data.get(CONF_OLLAMA_IP_ADDRESS) ollama_port = entry.data.get(CONF_OLLAMA_PORT) ollama_https = entry.data.get(CONF_OLLAMA_HTTPS) + custom_openai_endpoint = entry.data.get(CONF_CUSTOM_OPENAI_ENDPOINT) + custom_openai_api_key = entry.data.get(CONF_CUSTOM_OPENAI_API_KEY) # Ensure DOMAIN exists in hass.data if DOMAIN not in hass.data: @@ -63,6 +67,8 @@ async def async_setup_entry(hass, entry): CONF_OLLAMA_IP_ADDRESS: ollama_ip_address, CONF_OLLAMA_PORT: ollama_port, CONF_OLLAMA_HTTPS: ollama_https, + CONF_CUSTOM_OPENAI_ENDPOINT: custom_openai_endpoint, + CONF_CUSTOM_OPENAI_API_KEY: custom_openai_api_key }.items() if value is not None }) @@ -104,6 +110,8 @@ def _default_model(self, provider): return "gpt-4-vision-preview" elif provider == "Ollama": return "llava" + elif provider == "Custom OpenAI": + return "gpt-4o-mini" def setup(hass, config): diff --git a/custom_components/llmvision/config_flow.py b/custom_components/llmvision/config_flow.py index e41df68..f69a9fd 100644 --- a/custom_components/llmvision/config_flow.py +++ b/custom_components/llmvision/config_flow.py @@ -13,6 +13,8 @@ CONF_OLLAMA_IP_ADDRESS, CONF_OLLAMA_PORT, CONF_OLLAMA_HTTPS, + CONF_CUSTOM_OPENAI_API_KEY, + CONF_CUSTOM_OPENAI_ENDPOINT, VERSION_ANTHROPIC, ) import voluptuous as vol @@ -120,6 +122,39 @@ async def openai(self): _LOGGER.error("Could not connect to OpenAI server.") raise ServiceValidationError("handshake_failed") + async def custom_openai(self): + self._validate_provider() + _LOGGER.debug(f"Splits: {len(self.user_input[CONF_CUSTOM_OPENAI_ENDPOINT].split(":"))}") + # URL with port + try: + if len(self.user_input[CONF_CUSTOM_OPENAI_ENDPOINT].split(":")) > 2: + protocol = self.user_input[CONF_CUSTOM_OPENAI_ENDPOINT].split( + "://")[0] + base_url = self.user_input[CONF_CUSTOM_OPENAI_ENDPOINT].split( + "://")[1].split("/")[0] + port = ":" + self.user_input[CONF_CUSTOM_OPENAI_ENDPOINT].split(":")[ + 1].split("/")[0] + # URL without port + else: + protocol = self.user_input[CONF_CUSTOM_OPENAI_ENDPOINT].split( + "://")[0] + base_url = self.user_input[CONF_CUSTOM_OPENAI_ENDPOINT].split( + "://")[1].split("/")[0] + port = "" + endpoint = "/v1/models" + header = {'Content-type': 'application/json', + 'Authorization': 'Bearer ' + self.user_input[CONF_CUSTOM_OPENAI_API_KEY]} + except Exception as e: + _LOGGER.error(f"Could not parse endpoint: {e}") + raise ServiceValidationError("endpoint_parse_failed") + + _LOGGER.debug( + f"Connecting to: [protocol: {protocol}, base_url: {base_url}, port: {port}, endpoint: {endpoint}]") + + if not await self._handshake(base_url=base_url, port=port, protocol=protocol, endpoint=endpoint, header=header): + _LOGGER.error("Could not connect to Custom OpenAI server.") + raise ServiceValidationError("handshake_failed") + async def anthropic(self): self._validate_provider() if not await self._validate_api_key(self.user_input[CONF_ANTHROPIC_API_KEY]): @@ -149,6 +184,8 @@ def get_configured_providers(self): providers.append("LocalAI") if CONF_OLLAMA_IP_ADDRESS in self.hass.data[DOMAIN] and CONF_OLLAMA_PORT in self.hass.data[DOMAIN]: providers.append("Ollama") + if CONF_CUSTOM_OPENAI_ENDPOINT in self.hass.data[DOMAIN]: + providers.append("Custom OpenAI") return providers @@ -167,6 +204,7 @@ async def handle_provider(self, provider, configured_providers): "Google": self.async_step_google, "Ollama": self.async_step_ollama, "LocalAI": self.async_step_localai, + "Custom OpenAI": self.async_step_custom_openai, } step_method = provider_steps.get(provider) @@ -180,7 +218,7 @@ async def async_step_user(self, user_input=None): data_schema = vol.Schema({ vol.Required("provider", default="OpenAI"): selector({ "select": { - "options": ["OpenAI", "Anthropic", "Google", "Ollama", "LocalAI"], + "options": ["OpenAI", "Anthropic", "Google", "Ollama", "LocalAI", "Custom OpenAI"], "mode": "dropdown", "sort": False, "custom_value": False @@ -339,3 +377,31 @@ async def async_step_google(self, user_input=None): step_id="google", data_schema=data_schema, ) + + async def async_step_custom_openai(self, user_input=None): + data_schema = vol.Schema({ + vol.Required(CONF_CUSTOM_OPENAI_ENDPOINT): str, + vol.Optional(CONF_CUSTOM_OPENAI_API_KEY): str, + }) + + if user_input is not None: + # save provider to user_input + user_input["provider"] = self.init_info["provider"] + validator = Validator(self.hass, user_input) + try: + await validator.custom_openai() + # add the mode to user_input + user_input["provider"] = self.init_info["provider"] + return self.async_create_entry(title="LLM Vision Custom OpenAI", data=user_input) + except ServiceValidationError as e: + _LOGGER.error(f"Validation failed: {e}") + return self.async_show_form( + step_id="custom_openai", + data_schema=data_schema, + errors={"base": "handshake_failed"} + ) + + return self.async_show_form( + step_id="custom_openai", + data_schema=data_schema, + ) \ No newline at end of file diff --git a/custom_components/llmvision/manifest.json b/custom_components/llmvision/manifest.json index 28f11a1..220c447 100644 --- a/custom_components/llmvision/manifest.json +++ b/custom_components/llmvision/manifest.json @@ -6,5 +6,5 @@ "documentation": "https://github.com/valentinfrlch/ha-llmvision", "iot_class": "cloud_polling", "issue_tracker": "https://github.com/valentinfrlch/ha-llmvision/issues", - "version": "1.0.4" -} + "version": "1.1.0" +} \ No newline at end of file diff --git a/custom_components/llmvision/media_handlers.py b/custom_components/llmvision/media_handlers.py index 688274b..9cb8bc3 100644 --- a/custom_components/llmvision/media_handlers.py +++ b/custom_components/llmvision/media_handlers.py @@ -37,6 +37,7 @@ async def resize_image(self, target_width, image_path=None, image_data=None, img img = await self.hass.loop.run_in_executor(None, Image.open, image_path) with img: # Check if the image is a GIF and convert if necessary + _LOGGER.debug(f"Image format: {img.format}") if img.format == 'GIF': # Convert GIF to RGB img = img.convert('RGB') @@ -58,6 +59,10 @@ async def resize_image(self, target_width, image_path=None, image_data=None, img img_byte_arr.write(image_data) img = await self.hass.loop.run_in_executor(None, Image.open, img_byte_arr) with img: + _LOGGER.debug(f"Image format: {img.format}") + if img.format == 'GIF': + # Convert GIF to RGB + img = img.convert('RGB') # calculate new height based on aspect ratio width, height = img.size aspect_ratio = width / height @@ -131,8 +136,8 @@ async def add_images(self, image_entities, image_paths, target_width, include_fi return self.client async def add_videos(self, video_paths, event_ids, interval, target_width, include_filename): - tmp_clips_dir = f"config/custom_components/{DOMAIN}/tmp_clips" - tmp_frames_dir = f"config/custom_components/{DOMAIN}/tmp_frames" + tmp_clips_dir = f"/config/custom_components/{DOMAIN}/tmp_clips" + tmp_frames_dir = f"/config/custom_components/{DOMAIN}/tmp_frames" if not video_paths: video_paths = [] """Wrapper for client.add_frame for videos""" @@ -196,7 +201,14 @@ async def add_videos(self, video_paths, event_ids, interval, target_width, inclu # Clean up tmp dirs try: await self.hass.loop.run_in_executor(None, shutil.rmtree, tmp_clips_dir) + _LOGGER.info( + f"Deleted tmp folder: {tmp_clips_dir}") + except FileNotFoundError as e: + _LOGGER.error(f"Failed to delete tmp folder: {e}") + try: await self.hass.loop.run_in_executor(None, shutil.rmtree, tmp_frames_dir) + _LOGGER.info( + f"Deleted tmp folder: {tmp_frames_dir}") except FileNotFoundError as e: - pass + _LOGGER.error(f"Failed to delete tmp folders: {e}") return self.client \ No newline at end of file diff --git a/custom_components/llmvision/request_handlers.py b/custom_components/llmvision/request_handlers.py index bf14fdc..fccc486 100644 --- a/custom_components/llmvision/request_handlers.py +++ b/custom_components/llmvision/request_handlers.py @@ -13,12 +13,16 @@ CONF_OLLAMA_IP_ADDRESS, CONF_OLLAMA_PORT, CONF_OLLAMA_HTTPS, + CONF_CUSTOM_OPENAI_ENDPOINT, + CONF_CUSTOM_OPENAI_API_KEY, VERSION_ANTHROPIC, + ENDPOINT_OPENAI, ERROR_OPENAI_NOT_CONFIGURED, ERROR_ANTHROPIC_NOT_CONFIGURED, ERROR_GOOGLE_NOT_CONFIGURED, ERROR_LOCALAI_NOT_CONFIGURED, ERROR_OLLAMA_NOT_CONFIGURED, + ERROR_CUSTOM_OPENAI_NOT_CONFIGURED, ERROR_NO_IMAGE_INPUT ) @@ -103,7 +107,21 @@ async def make_request(self, call): ip_address=ip_address, port=port, https=https) + elif call.provider == 'Custom OpenAI': + api_key = self.hass.data.get(DOMAIN).get( + CONF_CUSTOM_OPENAI_API_KEY, "") + endpoint = self.hass.data.get(DOMAIN).get(CONF_CUSTOM_OPENAI_ENDPOINT) + # Additional debug logging + _LOGGER.debug(f"Data from DOMAIN: {self.hass.data.get(DOMAIN)}") + _LOGGER.debug(f"API Key: {api_key}") + _LOGGER.debug(f"Endpoint: {endpoint}") + + model = call.model + self._validate_call(provider=call.provider, + api_key=api_key, + base64_images=self.base64_images) + response_text = await self.openai(model=model, api_key=api_key, endpoint=endpoint) return {"response_text": response_text} def add_frame(self, base64_image, filename): @@ -111,8 +129,7 @@ def add_frame(self, base64_image, filename): self.filenames.append(filename) # Request Handlers - async def openai(self, model, api_key): - from .const import ENDPOINT_OPENAI + async def openai(self, model, api_key, endpoint=ENDPOINT_OPENAI): # Set headers and payload headers = {'Content-type': 'application/json', 'Authorization': 'Bearer ' + api_key} @@ -138,7 +155,7 @@ async def openai(self, model, api_key): ) response = await self._post( - url=ENDPOINT_OPENAI, headers=headers, data=data) + url=endpoint, headers=headers, data=data) response_text = response.get( "choices")[0].get("message").get("content") @@ -301,6 +318,9 @@ async def ollama(self, model, ip_address, port, https): async def _post(self, url, headers, data): """Post data to url and return response data""" _LOGGER.info(f"Request data: {sanitize_data(data)}") + _LOGGER.debug( + f"URL type: {type(url)}, Headers type: {type(headers)}, Data type: {type(data)}") + try: response = await self.session.post(url, headers=headers, json=data) except Exception as e: @@ -352,6 +372,9 @@ def _validate_call(self, provider, api_key, base64_images, ip_address=None, port elif provider == 'Ollama': if not ip_address or not port: raise ServiceValidationError(ERROR_OLLAMA_NOT_CONFIGURED) + elif provider == 'Custom OpenAI': + if not api_key: + raise ServiceValidationError(ERROR_CUSTOM_OPENAI_NOT_CONFIGURED) # Check media input if base64_images == []: raise ServiceValidationError(ERROR_NO_IMAGE_INPUT) diff --git a/custom_components/llmvision/services.yaml b/custom_components/llmvision/services.yaml index c9855f6..a2d09e5 100644 --- a/custom_components/llmvision/services.yaml +++ b/custom_components/llmvision/services.yaml @@ -15,6 +15,7 @@ image_analyzer: - 'Google' - 'Ollama' - 'LocalAI' + - 'Custom OpenAI' model: name: Model required: false diff --git a/custom_components/llmvision/strings.json b/custom_components/llmvision/strings.json index b0bec17..d4500af 100644 --- a/custom_components/llmvision/strings.json +++ b/custom_components/llmvision/strings.json @@ -43,6 +43,14 @@ "data": { "google_api_key": "Your API key" } + }, + "custom_openai": { + "title": "Configure Custom OpenAI provider", + "description": "**Important**: Only works if the API is compatible with OpenAI's API. If the API doesn't require an API key, leave it empty. The endpoint should have the following format: `http(s)://baseURL(:port)/some/endpoint`", + "data": { + "custom_openai_endpoint": "Custom Endpoint", + "custom_openai_api_key": "Your API key" + } } }, "error": { diff --git a/custom_components/llmvision/translations/de.json b/custom_components/llmvision/translations/de.json index 0533116..ed49ed4 100644 --- a/custom_components/llmvision/translations/de.json +++ b/custom_components/llmvision/translations/de.json @@ -2,53 +2,61 @@ "config": { "step": { "user": { - "title": "Wählen Sie Ihren Provider", - "description": "Wählen Sie den Anbieter den verwenden möchten." + "title": "Anbieter auswählen", + "description": "Wähle den Anbieter den du konfigurieren möchtest." }, "localai": { - "title": "Verbinden Sie sich mit Ihrem LocalAI-Server", - "description": "Geben Sie die IP-Adresse und den Port Ihres LocalAI-Servers an.", + "title": "Mit LocalAI-Server verbinden", + "description": "Gib die IP-Adresse und den Port deines LocalAI-Servers an.", "data": { "localai_ip": "IP-Adresse", "localai_port": "Port" } }, "ollama": { - "title": "Verbinden Sie sich mit Ihrem Ollama-Server", - "description": "Geben Sie die IP-Adresse und den Port Ihres Ollama-Servers an.", + "title": "Mit Ollama-Server verbinden", + "description": "Gib die IP-Adresse und den Port deines Ollama-Servers an.", "data": { "localai_ip": "IP-Addresse", "localai_port": "Port" } }, "openai": { - "title": "Fügen Sie den OpenAI-API-Schlüssel hinzu", - "description": "Geben Sie einen gültigen OpenAI-API-Schlüssel an.", + "title": "OpenAI", + "description": "Gib einen gültigen OpenAI API-key ein.", "data": { - "api_key": "Ihr API-Schlüssel" + "api_key": "Dein API-key" } }, "anthropic": { - "title": "Fügen Sie den Anthropic-API-Schlüssel hinzu", - "description": "Geben Sie einen gültigen Anthropic-API-Schlüssel an.", + "title": "Anthropic", + "description": "Gib einen gültigen Anthropic API-key ein.", "data": { - "api_key": "Ihr API-Schlüssel" + "api_key": "Dein API-key" } }, "google": { - "title": "Fügen Sie den Google Gemini-API-Schlüssel hinzu", - "description": "Geben Sie einen gültigen Google Gemini-API-Schlüssel an.", + "title": "Google Gemini", + "description": "Gib einen gültigen Google Gemini API-key ein.", "data": { - "api_key": "Ihr API-Schlüssel" + "api_key": "Dein API-key" + } + }, + "custom_openai": { + "title": "OpenAI-kompatiblen Provider konfigurieren", + "description": "**Wichtig**: Funktioniert nur mit OpenAI API kompatiblen APIs. 'Custom Endpoint' muss das folgende Format haben: `http(s)://baseURL(:port)/some/endpoint`", + "data": { + "custom_openai_endpoint": "Custom Endpoint", + "custom_openai_api_key": "Dein API-key" } } }, "error": { - "handshake_failed": "Verbindung zum Server konnte nicht hergestellt werden. Überprüfen Sie Ihren API-Schlüssel oder die IP und den Port", - "empty_api_key": "Ungültiger API-Key" + "handshake_failed": "Verbindung zum Server konnte nicht hergestellt werden. Überprüfe deinen API-key oder IP und Port", + "empty_api_key": "Ungültiger API-key" }, "abort": { - "already_configured": "Anbieter ist bereits konfiguriert. Löschen Sie die vorhandene Konfiguration, um eine neue hinzuzufügen.", + "already_configured": "Anbieter ist bereits konfiguriert. Lösche die vorhandene Konfiguration, um eine neue hinzuzufügen.", "unknown_provider": "Unbekannter Anbieter" } } diff --git a/custom_components/llmvision/translations/en.json b/custom_components/llmvision/translations/en.json index b0bec17..d4500af 100644 --- a/custom_components/llmvision/translations/en.json +++ b/custom_components/llmvision/translations/en.json @@ -43,6 +43,14 @@ "data": { "google_api_key": "Your API key" } + }, + "custom_openai": { + "title": "Configure Custom OpenAI provider", + "description": "**Important**: Only works if the API is compatible with OpenAI's API. If the API doesn't require an API key, leave it empty. The endpoint should have the following format: `http(s)://baseURL(:port)/some/endpoint`", + "data": { + "custom_openai_endpoint": "Custom Endpoint", + "custom_openai_api_key": "Your API key" + } } }, "error": { diff --git a/custom_components/llmvision/translations/fr.json b/custom_components/llmvision/translations/fr.json deleted file mode 100644 index 274a5f1..0000000 --- a/custom_components/llmvision/translations/fr.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "config": { - "step": { - "user": { - "title": "Choisissez votre fournisseur", - "description": "Sélectionnez le fournisseur que vous souhaitez utiliser pour votre IA." - }, - "localai": { - "title": "Connectez-vous à votre serveur LocalAI", - "description": "Fournissez l'adresse IP et le port de votre serveur LocalAI.", - "data": { - "localai_ip": "Adresse IP", - "localai_port": "Port" - } - }, - "ollama": { - "title": "Connectez-vous à votre serveur Ollama", - "description": "Fournissez l'adresse IP et le port de votre serveur Ollama.", - "data": { - "localai_ip": "Adresse IP", - "localai_port": "Port" - } - }, - "openai": { - "title": "Ajoutez la clé API OpenAI", - "description": "Fournissez une clé API OpenAI valide.", - "data": { - "api_key": "Votre clé API" - } - }, - "anthropic": { - "title": "Ajoutez la clé API OpenAI", - "description": "Fournissez une clé API Anthropic valide.", - "data": { - "api_key": "Votre clé API" - } - }, - "google": { - "title": "Ajoutez la clé API Google Gemini", - "description": "Fournissez une clé API Google Gemini valide.", - "data": { - "api_key": "Votre clé API" - } - } - }, - "error": { - "handshake_failed": "Impossible de se connecter au serveur. Vérifiez votre clé API ou l'adresse IP et le port.", - "empty_api_key": "clé API invalide" - }, - "abort": { - "already_configured": "Le fournisseur est déjà configuré. Supprimez la configuration existante pour en ajouter une nouvelle.", - "unknown_provider": "Fournisseur inconnu" - } - } -} \ No newline at end of file