From 8eb09d3eed111ac4a26d79aab36591460fd4a4f7 Mon Sep 17 00:00:00 2001
From: awinml <97467100+awinml@users.noreply.github.com>
Date: Sat, 21 Oct 2023 12:15:07 +0530
Subject: [PATCH 01/13] Migrate RemoteWhisperTranscriber to OpenAI SDK

---
 .../components/audio/whisper_remote.py        | 155 ++++-----
 .../components/audio/test_whisper_remote.py   | 318 ++++++++----------
 2 files changed, 220 insertions(+), 253 deletions(-)

diff --git a/haystack/preview/components/audio/whisper_remote.py b/haystack/preview/components/audio/whisper_remote.py
index 3317e251e6..cdc04d7b44 100644
--- a/haystack/preview/components/audio/whisper_remote.py
+++ b/haystack/preview/components/audio/whisper_remote.py
@@ -6,15 +6,15 @@
 from pathlib import Path

 from haystack.preview.utils import request_with_retry
-from haystack.preview import component, Document, default_to_dict
+from haystack.preview import component, Document, default_to_dict, default_from_dict

-logger = logging.getLogger(__name__)
+import openai

-OPENAI_TIMEOUT = float(os.environ.get("HAYSTACK_OPENAI_TIMEOUT_SEC", 600))
+logger = logging.getLogger(__name__)

-WhisperRemoteModel = Literal["whisper-1"]
+API_BASE_URL = "https://api.openai.com/v1"


 @component
@@ -30,52 +30,86 @@ class RemoteWhisperTranscriber:

     def __init__(
         self,
-        api_key: str,
-        model_name: WhisperRemoteModel = "whisper-1",
-        api_base: str = "https://api.openai.com/v1",
-        whisper_params: Optional[Dict[str, Any]] = None,
+        api_key: Optional[str] = None,
+        model_name: str = "whisper-1",
+        organization: Optional[str] = None,
+        api_base_url: str = API_BASE_URL,
+        **kwargs,
     ):
         """
         Transcribes a list of audio files into a list of Documents.

         :param api_key: OpenAI API key.
         :param model_name: Name of the model to use. It now accepts only `whisper-1`.
+        :param organization: The OpenAI-Organization ID, defaults to `None`. For more details, see OpenAI
+            [documentation](https://platform.openai.com/docs/api-reference/requesting-organization).
-        :param api_base: OpenAI base URL, defaults to `"https://api.openai.com/v1"`.
+        :param api_base_url: OpenAI base URL, defaults to `"https://api.openai.com/v1"`.
+        :param kwargs: Other parameters to use for the model. These parameters are all sent directly to the OpenAI
+            endpoint. See OpenAI [documentation](https://platform.openai.com/docs/api-reference/audio) for more details.
+            Some of the supported parameters:
+            - `language`: The language of the input audio.
+              Supplying the input language in ISO-639-1 format
+              will improve accuracy and latency.
+            - `prompt`: An optional text to guide the model's
+              style or continue a previous audio segment.
+              The prompt should match the audio language.
+            - `response_format`: The format of the transcript
+              output, in one of these options: json, text, srt,
+              verbose_json, or vtt. Defaults to "json". Currently only "json" is supported.
+            - `temperature`: The sampling temperature, between 0
+              and 1. Higher values like 0.8 will make the output more
+              random, while lower values like 0.2 will make it more
+              focused and deterministic. If set to 0, the model will
+              use log probability to automatically increase the
+              temperature until certain thresholds are hit.
         """
-        if model_name not in get_args(WhisperRemoteModel):
-            raise ValueError(
-                f"Model name not recognized. Choose one among: " f"{', '.join(get_args(WhisperRemoteModel))}."
-            )
-        if not api_key:
-            raise ValueError("API key is None.")

-        self.model_name = model_name
+        if api_key is None:
+            try:
+                api_key = os.environ["OPENAI_API_KEY"]
+            except KeyError as e:
+                raise ValueError(
+                    "RemoteWhisperTranscriber expects an OpenAI API key. "
" + "Set the OPENAI_API_KEY environment variable (recommended) or pass it explicitly." + ) from e + self.api_key = api_key - self.api_base = api_base - self.whisper_params = whisper_params or {} + self.organization = organization + self.model_name = model_name + self.api_base_url = api_base_url - @component.output_types(documents=List[Document]) - def run(self, audio_files: List[Path], whisper_params: Optional[Dict[str, Any]] = None): - """ - Transcribe the audio files into a list of Documents, one for each input file. + # Only response_format = "json" is supported + whisper_params = kwargs + whisper_params["response_format"] = "json" + self.whisper_params = whisper_params - For the supported audio formats, languages, and other parameters, see the - [Whisper API documentation](https://platform.openai.com/docs/guides/speech-to-text) and the official Whisper - [github repo](https://github.com/openai/whisper). + openai.api_key = api_key + if organization is not None: + openai.organization = organization - :param audio_files: a list of paths or binary streams to transcribe - :returns: a list of Documents, one for each file. The content of the document is the transcription text, - while the document's metadata contains all the other values returned by the Whisper model, such as the - alignment data. Another key called `audio_file` contains the path to the audio file used for the - transcription. + def to_dict(self) -> Dict[str, Any]: """ - if whisper_params is None: - whisper_params = self.whisper_params + Serialize this component to a dictionary. + This method overrides the default serializer in order to + avoid leaking the `api_key` value passed to the constructor. + """ + return default_to_dict( + self, + model_name=self.model_name, + organization=self.organization, + api_base_url=self.api_base_url, + **self.whisper_params, + ) - documents = self.transcribe(audio_files, **whisper_params) - return {"documents": documents} + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "RemoteWhisperTranscriber": + """ + Deserialize this component from a dictionary. + """ + return default_from_dict(cls, data) - def transcribe(self, audio_files: Sequence[Union[str, Path, BinaryIO]], **kwargs) -> List[Document]: + @component.output_types(documents=List[Document]) + def run(self, audio_files: Sequence[Union[str, Path, BinaryIO]]): """ Transcribe the audio files into a list of Documents, one for each input file. @@ -84,54 +118,21 @@ def transcribe(self, audio_files: Sequence[Union[str, Path, BinaryIO]], **kwargs [github repo](https://github.com/openai/whisper). :param audio_files: a list of paths or binary streams to transcribe - :returns: a list of transcriptions. + :returns: a list of Documents, one for each file. The content of the document is the transcription text, while the document's metadata contains a key called `audio_file`, which contains the path to the + audio file used for the transcription. """ - transcriptions = self._raw_transcribe(audio_files=audio_files, **kwargs) documents = [] - for audio, transcript in zip(audio_files, transcriptions): - content = transcript.pop("text") - if not isinstance(audio, (str, Path)): - audio = "<>" - doc = Document(text=content, metadata={"audio_file": audio, **transcript}) - documents.append(doc) - return documents - - def _raw_transcribe(self, audio_files: Sequence[Union[str, Path, BinaryIO]], **kwargs) -> List[Dict[str, Any]]: - """ - Transcribe the given audio files. Returns a list of strings. 
-
-        For the supported audio formats, languages, and other parameters, see the
-        [Whisper API documentation](https://platform.openai.com/docs/guides/speech-to-text) and the official Whisper
-        [github repo](https://github.com/openai/whisper).
-
-        :param audio_files: a list of paths or binary streams to transcribe.
-        :param kwargs: any other parameters that Whisper API can understand.
-        :returns: a list of transcriptions as they are produced by the Whisper API (JSON).
-        """
-        translate = kwargs.pop("translate", False)
-        url = f"{self.api_base}/audio/{'translations' if translate else 'transcriptions'}"
-        data = {"model": self.model_name, **kwargs}
-        headers = {"Authorization": f"Bearer {self.api_key}"}
-
-        transcriptions = []
+
         for audio_file in audio_files:
             if isinstance(audio_file, (str, Path)):
                 audio_file = open(audio_file, "rb")

-            request_files = ("file", (audio_file.name, audio_file, "application/octet-stream"))
-            response = request_with_retry(
-                method="post", url=url, data=data, headers=headers, files=[request_files], timeout=OPENAI_TIMEOUT
-            )
-            transcription = json.loads(response.content)
+            content = openai.Audio.transcribe(file=audio_file, model=self.model_name, **self.whisper_params)

-            transcriptions.append(transcription)
-        return transcriptions
+            # Set the audio file to <<binary stream>> for document metadata if the path is not present
+            if not isinstance(audio_file, (str, Path)):
+                audio_file = "<<binary stream>>"

-    def to_dict(self) -> Dict[str, Any]:
-        """
-        This method overrides the default serializer in order to avoid leaking the `api_key` value passed
-        to the constructor.
-        """
-        return default_to_dict(
-            self, model_name=self.model_name, api_base=self.api_base, whisper_params=self.whisper_params
-        )
+            doc = Document(text=content["text"], metadata={"audio_file": audio_file})
+            documents.append(doc)
+
+        return {"documents": documents}

diff --git a/test/preview/components/audio/test_whisper_remote.py b/test/preview/components/audio/test_whisper_remote.py
index 0c8cb235db..2b6cc1c62f 100644
--- a/test/preview/components/audio/test_whisper_remote.py
+++ b/test/preview/components/audio/test_whisper_remote.py
@@ -1,226 +1,192 @@
-import os
+from typing import Union, BinaryIO
+from pathlib import Path
 from unittest.mock import MagicMock, patch
+from copy import deepcopy
+
 import pytest

+import openai
+from openai.util import convert_to_openai_object
+
 from haystack.preview.dataclasses import Document
-from haystack.preview.components.audio.whisper_remote import RemoteWhisperTranscriber, OPENAI_TIMEOUT
+from haystack.preview.components.audio.whisper_remote import RemoteWhisperTranscriber
+
+
+def mock_openai_response(
+    file: Union[str, Path, BinaryIO], model: str = "whisper-1", response_format="json", **kwargs
+) -> openai.openai_object.OpenAIObject:
+    if isinstance(file, (str, Path)):
+        file_path = str(file)
+    else:
+        file_path = file.name
+    if response_format == "json":
+        dict_response = {"text": f"model: {model}, file: {file_path}, test transcription"}
+    else:
+        dict_response = {}
+
+    return convert_to_openai_object(dict_response)


 class TestRemoteWhisperTranscriber:
     @pytest.mark.unit
-    def test_init_unknown_model(self):
-        with pytest.raises(ValueError, match="not recognized"):
-            RemoteWhisperTranscriber(model_name="anything", api_key="something")
+    def test_init_no_key(self, monkeypatch):
+        monkeypatch.delenv("OPENAI_API_KEY", raising=False)
+        error_msg = "RemoteWhisperTranscriber expects an OpenAI API key."
+        with pytest.raises(ValueError, match=error_msg):
+            RemoteWhisperTranscriber(api_key=None)

     @pytest.mark.unit
     def test_init_default(self):
-        transcriber = RemoteWhisperTranscriber(api_key="just a test")
+        transcriber = RemoteWhisperTranscriber(api_key="test_api_key")
+
+        assert transcriber.api_key == "test_api_key"
         assert transcriber.model_name == "whisper-1"
-        assert transcriber.api_key == "just a test"
-        assert transcriber.api_base == "https://api.openai.com/v1"
+        assert transcriber.organization is None
+        assert transcriber.api_base_url == "https://api.openai.com/v1"
+        assert transcriber.whisper_params == {"response_format": "json"}

     @pytest.mark.unit
-    def test_init_no_key(self):
-        with pytest.raises(ValueError, match="API key is None"):
-            RemoteWhisperTranscriber(api_key=None)
+    def test_init_custom_parameters(self):
+        transcriber = RemoteWhisperTranscriber(
+            api_key="test_api_key",
+            model_name="whisper-1",
+            organization="test-org",
+            api_base_url="test_api_url",
+            language="en",
+            prompt="test-prompt",
+            response_format="json",
+            temperature="0.5",
+        )
+
+        assert transcriber.api_key == "test_api_key"
+        assert transcriber.model_name == "whisper-1"
+        assert transcriber.organization == "test-org"
+        assert transcriber.api_base_url == "test_api_url"
+        assert transcriber.whisper_params == {
+            "language": "en",
+            "prompt": "test-prompt",
+            "response_format": "json",
+            "temperature": "0.5",
+        }

     @pytest.mark.unit
-    def test_to_dict(self):
-        transcriber = RemoteWhisperTranscriber(api_key="test")
+    def test_to_dict_default_parameters(self):
+        transcriber = RemoteWhisperTranscriber(api_key="test_api_key")
         data = transcriber.to_dict()
         assert data == {
             "type": "RemoteWhisperTranscriber",
             "init_parameters": {
                 "model_name": "whisper-1",
-                "api_base": "https://api.openai.com/v1",
-                "whisper_params": {},
+                "api_base_url": "https://api.openai.com/v1",
+                "organization": None,
+                "response_format": "json",
             },
         }

     @pytest.mark.unit
     def test_to_dict_with_custom_init_parameters(self):
         transcriber = RemoteWhisperTranscriber(
-            api_key="test",
+            api_key="test_api_key",
             model_name="whisper-1",
-            api_base="https://my.api.base/something_else/v3",
-            whisper_params={"return_segments": True, "temperature": [0.1, 0.6, 0.8]},
+            organization="test-org",
+            api_base_url="test_api_url",
+            language="en",
+            prompt="test-prompt",
+            response_format="json",
+            temperature="0.5",
         )
         data = transcriber.to_dict()
         assert data == {
             "type": "RemoteWhisperTranscriber",
             "init_parameters": {
                 "model_name": "whisper-1",
-                "api_base": "https://my.api.base/something_else/v3",
-                "whisper_params": {"return_segments": True, "temperature": [0.1, 0.6, 0.8]},
+                "organization": "test-org",
+                "api_base_url": "test_api_url",
+                "language": "en",
+                "prompt": "test-prompt",
+                "response_format": "json",
+                "temperature": "0.5",
             },
         }

-    @pytest.mark.unit
-    def test_run_with_path(self, preview_samples_path):
-        mock_response = MagicMock()
-        mock_response.status_code = 200
-        mock_response.content = '{"text": "test transcription", "other_metadata": ["other", "meta", "data"]}'
-        comp = RemoteWhisperTranscriber(api_key="whatever")
-
-        with patch("haystack.preview.utils.requests_utils.requests") as mocked_requests:
-            mocked_requests.request.return_value = mock_response
-
-            result = comp.run(audio_files=[preview_samples_path / "audio" / "this is the content of the document.wav"])
-            expected = Document(
-                text="test transcription",
-                metadata={
-                    "audio_file": preview_samples_path / "audio" / "this is the content of the document.wav",
-                    "other_metadata": ["other", "meta", "data"],
"meta", "data"], - }, - ) - assert result["documents"] == [expected] + def test_from_dict_with_defualt_parameters(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test_api_key") - @pytest.mark.unit - def test_run_with_str(self, preview_samples_path): - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.content = '{"text": "test transcription", "other_metadata": ["other", "meta", "data"]}' - comp = RemoteWhisperTranscriber(api_key="whatever") - - with patch("haystack.preview.utils.requests_utils.requests") as mocked_requests: - mocked_requests.request.return_value = mock_response - - result = comp.run( - audio_files=[ - str((preview_samples_path / "audio" / "this is the content of the document.wav").absolute()) - ] - ) - expected = Document( - text="test transcription", - metadata={ - "audio_file": str( - (preview_samples_path / "audio" / "this is the content of the document.wav").absolute() - ), - "other_metadata": ["other", "meta", "data"], - }, - ) - assert result["documents"] == [expected] + data = { + "type": "RemoteWhisperTranscriber", + "init_parameters": { + "model_name": "whisper-1", + "api_base_url": "https://api.openai.com/v1", + "organization": None, + "response_format": "json", + }, + } - @pytest.mark.unit - def test_transcribe_with_stream(self, preview_samples_path): - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.content = '{"text": "test transcription", "other_metadata": ["other", "meta", "data"]}' - comp = RemoteWhisperTranscriber(api_key="whatever") - - with patch("haystack.preview.utils.requests_utils.requests") as mocked_requests: - mocked_requests.request.return_value = mock_response - - with open(preview_samples_path / "audio" / "this is the content of the document.wav", "rb") as audio_stream: - result = comp.transcribe(audio_files=[audio_stream]) - expected = Document( - text="test transcription", - metadata={"audio_file": "<>", "other_metadata": ["other", "meta", "data"]}, - ) - assert result == [expected] + transcriber = RemoteWhisperTranscriber.from_dict(data) - @pytest.mark.unit - def test_api_transcription(self, preview_samples_path): - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.content = '{"text": "test transcription", "other_metadata": ["other", "meta", "data"]}' - comp = RemoteWhisperTranscriber(api_key="whatever") - - with patch("haystack.preview.utils.requests_utils.requests") as mocked_requests: - mocked_requests.request.return_value = mock_response - - comp.run(audio_files=[preview_samples_path / "audio" / "this is the content of the document.wav"]) - requests_params = mocked_requests.request.call_args.kwargs - requests_params.pop("files") - assert requests_params == { - "method": "post", - "url": "https://api.openai.com/v1/audio/transcriptions", - "data": {"model": "whisper-1"}, - "headers": {"Authorization": "Bearer whatever"}, - "timeout": OPENAI_TIMEOUT, - } + assert transcriber.api_key == "test_api_key" + assert transcriber.model_name == "whisper-1" + assert transcriber.organization is None + assert transcriber.api_base_url == "https://api.openai.com/v1" + assert transcriber.whisper_params == {"response_format": "json"} - @pytest.mark.unit - def test_api_translation(self, preview_samples_path): - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.content = '{"text": "test transcription", "other_metadata": ["other", "meta", "data"]}' - comp = RemoteWhisperTranscriber(api_key="whatever") - - with 
patch("haystack.preview.utils.requests_utils.requests") as mocked_requests: - mocked_requests.request.return_value = mock_response - - comp.run( - audio_files=[preview_samples_path / "audio" / "this is the content of the document.wav"], - whisper_params={"translate": True}, - ) - requests_params = mocked_requests.request.call_args.kwargs - requests_params.pop("files") - assert requests_params == { - "method": "post", - "url": "https://api.openai.com/v1/audio/translations", - "data": {"model": "whisper-1"}, - "headers": {"Authorization": "Bearer whatever"}, - "timeout": OPENAI_TIMEOUT, - } + def test_from_dict_with_custom_init_parameters(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test_api_key") - @pytest.mark.unit - @patch("haystack.preview.components.audio.whisper_remote.request_with_retry") - def test_default_api_base(self, mock_request, preview_samples_path): - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.content = '{"text": "test transcription", "other_metadata": ["other", "meta", "data"]}' - mock_request.return_value = mock_response + data = { + "type": "RemoteWhisperTranscriber", + "init_parameters": { + "model_name": "whisper-1", + "organization": "test-org", + "api_base_url": "test_api_url", + "language": "en", + "prompt": "test-prompt", + "response_format": "json", + "temperature": "0.5", + }, + } + transcriber = RemoteWhisperTranscriber.from_dict(data) - transcriber = RemoteWhisperTranscriber(api_key="just a test") - assert transcriber.api_base == "https://api.openai.com/v1" + assert transcriber.api_key == "test_api_key" + assert transcriber.model_name == "whisper-1" + assert transcriber.organization == "test-org" + assert transcriber.api_base_url == "test_api_url" + assert transcriber.whisper_params == { + "language": "en", + "prompt": "test-prompt", + "response_format": "json", + "temperature": "0.5", + } - transcriber.transcribe(audio_files=[preview_samples_path / "audio" / "this is the content of the document.wav"]) - assert mock_request.call_args.kwargs["url"] == "https://api.openai.com/v1/audio/transcriptions" + def test_from_dict_with_defualt_parameters_no_env_var(self, monkeypatch): + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + + data = { + "type": "RemoteWhisperTranscriber", + "init_parameters": { + "model_name": "whisper-1", + "api_base_url": "https://api.openai.com/v1", + "organization": None, + "response_format": "json", + }, + } + + with pytest.raises(ValueError, match="RemoteWhisperTranscriber expects an OpenAI API key."): + RemoteWhisperTranscriber.from_dict(data) @pytest.mark.unit - @patch("haystack.preview.components.audio.whisper_remote.request_with_retry") - def test_custom_api_base(self, mock_request, preview_samples_path): - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.content = '{"text": "test transcription", "other_metadata": ["other", "meta", "data"]}' - mock_request.return_value = mock_response - - transcriber = RemoteWhisperTranscriber(api_key="just a test", api_base="https://fake_api_base.com") - assert transcriber.api_base == "https://fake_api_base.com" - - transcriber.transcribe(audio_files=[preview_samples_path / "audio" / "this is the content of the document.wav"]) - assert mock_request.call_args.kwargs["url"] == "https://fake_api_base.com/audio/transcriptions" - - @pytest.mark.skipif( - not os.environ.get("OPENAI_API_KEY", None), - reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.", - ) - @pytest.mark.integration 
-    def test_whisper_remote_transcriber(self, preview_samples_path):
-        comp = RemoteWhisperTranscriber(api_key=os.environ.get("OPENAI_API_KEY"))
-
-        output = comp.run(
-            audio_files=[
-                preview_samples_path / "audio" / "this is the content of the document.wav",
-                str((preview_samples_path / "audio" / "the context for this answer is here.wav").absolute()),
-                open(preview_samples_path / "audio" / "answer.wav", "rb"),
-            ]
-        )
-        docs = output["documents"]
-        assert len(docs) == 3
+    def test_run_with_path(self, preview_samples_path):
+        model = "whisper-1"
+        file_path = preview_samples_path / "audio" / "this is the content of the document.wav"
+        with patch("haystack.preview.components.audio.whisper_remote.openai.Audio") as openai_audio_patch:
+            openai_audio_patch.transcribe.side_effect = mock_openai_response

-        assert docs[0].text.strip().lower() == "this is the content of the document."
-        assert (
-            preview_samples_path / "audio" / "this is the content of the document.wav" == docs[0].metadata["audio_file"]
-        )
+            transcriber = RemoteWhisperTranscriber(api_key="test_api_key", model_name=model, response_format="json")

-        assert docs[1].text.strip().lower() == "the context for this answer is here."
-        assert (
-            str((preview_samples_path / "audio" / "the context for this answer is here.wav").absolute())
-            == docs[1].metadata["audio_file"]
-        )
+            result = transcriber.run(audio_files=[file_path])
+
+            assert result["documents"][0].text == f"model: {model}, file: {file_path}, test transcription"

-        assert docs[2].text.strip().lower() == "answer."
-        assert docs[2].metadata["audio_file"] == "<<binary stream>>"
+            open_file = open(file_path, "rb")
+            openai_audio_patch.transcribe.assert_called_once_with(file=open_file, model=model, response_format="json")
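
[Reviewer note, not part of the patch series: a minimal usage sketch of the component as it stands after PATCH 01, assuming OPENAI_API_KEY is exported; "meeting.wav" is a placeholder path, not a file in the repo.

    from haystack.preview.components.audio.whisper_remote import RemoteWhisperTranscriber

    # Extra kwargs such as `language` are forwarded verbatim to the OpenAI
    # audio endpoint; the constructor always forces `response_format` to "json".
    transcriber = RemoteWhisperTranscriber(language="en")
    result = transcriber.run(audio_files=["meeting.wav"])
    print(result["documents"][0].text)
]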
""" documents = [] + for audio_file in audio_files: if isinstance(audio_file, (str, Path)): - audio_file = open(audio_file, "rb") - - content = openai.Audio.transcribe(file=audio_file, model=self.model_name, **self.whisper_params) - - # Set the audio file to <> for document metadata if the path is not present - if not isinstance(audio_file, (str, Path)): - audio_file = "<>" - - doc = Document(text=content["text"], metadata={"audio_file": audio_file}) + if isinstance(audio_file, str): + file_name = audio_file + else: + file_name = str(audio_file.absolute()) + with open(audio_file, "rb") as file: + content = openai.Audio.transcribe(file=file, model=self.model_name, **self.whisper_params) + else: + file_name = "<>" + content = openai.Audio.transcribe(file=audio_file, model=self.model_name, **self.whisper_params) + + doc = Document(text=content["text"], metadata={"audio_file": file_name}) documents.append(doc) return {"documents": documents} diff --git a/test/preview/components/audio/test_whisper_remote.py b/test/preview/components/audio/test_whisper_remote.py index 2b6cc1c62f..3a1a04b6ea 100644 --- a/test/preview/components/audio/test_whisper_remote.py +++ b/test/preview/components/audio/test_whisper_remote.py @@ -1,7 +1,5 @@ -from typing import Union, BinaryIO -from pathlib import Path -from unittest.mock import MagicMock, patch -from copy import deepcopy +from unittest.mock import patch +import os import pytest @@ -13,15 +11,10 @@ from haystack.preview.components.audio.whisper_remote import RemoteWhisperTranscriber -def mock_openai_response( - file: Union[str, Path, BinaryIO], model: str = "whisper-1", response_format="json", **kwargs -) -> openai.openai_object.OpenAIObject: - if isinstance(file, (str, Path)): - file_path = str(file) - else: - file_path = file.name +def mock_openai_response(response_format="json", **kwargs) -> openai.openai_object.OpenAIObject: if response_format == "json": - dict_response = {"text": f"model: {model}, file: str{file_path}, test transcription"} + dict_response = {"text": "test transcription"} + # Currently only "json" is supported. 
     else:
         dict_response = {}

     return convert_to_openai_object(dict_response)


 class TestRemoteWhisperTranscriber:
     @pytest.mark.unit
@@ -175,18 +168,98 @@ def test_from_dict_with_default_parameters_no_env_var(self, monkeypatch):
         with pytest.raises(ValueError, match="RemoteWhisperTranscriber expects an OpenAI API key."):
             RemoteWhisperTranscriber.from_dict(data)

+    @pytest.mark.unit
+    def test_run(self, preview_samples_path):
+        with patch("haystack.preview.components.audio.whisper_remote.openai.Audio") as openai_audio_patch:
+            model = "whisper-1"
+            openai_audio_patch.transcribe.side_effect = mock_openai_response
+
+            transcriber = RemoteWhisperTranscriber(api_key="test_api_key", model_name=model, response_format="json")
+            with open(preview_samples_path / "audio" / "this is the content of the document.wav", "rb") as audio_stream:
+                result = transcriber.run(audio_files=[audio_stream])
+
+            assert result["documents"][0].text == "test transcription"
+
+            openai_audio_patch.transcribe.assert_called_once_with(
+                file=audio_stream, model=model, response_format="json"
+            )
+
     @pytest.mark.unit
     def test_run_with_path(self, preview_samples_path):
-        model = "whisper-1"
-        file_path = preview_samples_path / "audio" / "this is the content of the document.wav"
         with patch("haystack.preview.components.audio.whisper_remote.openai.Audio") as openai_audio_patch:
+            model = "whisper-1"
             openai_audio_patch.transcribe.side_effect = mock_openai_response

             transcriber = RemoteWhisperTranscriber(api_key="test_api_key", model_name=model, response_format="json")

-            result = transcriber.run(audio_files=[file_path])
+            result = transcriber.run(
+                audio_files=[preview_samples_path / "audio" / "this is the content of the document.wav"]
+            )
+
+            expected = Document(
+                text="test transcription",
+                metadata={
+                    "audio_file": str(
+                        (preview_samples_path / "audio" / "this is the content of the document.wav").absolute()
+                    )
+                },
+            )
+            assert result["documents"][0].text == expected.text
+            assert result["documents"][0].metadata == expected.metadata
+
+    @pytest.mark.unit
+    def test_run_with_str(self, preview_samples_path):
+        with patch("haystack.preview.components.audio.whisper_remote.openai.Audio") as openai_audio_patch:
+            model = "whisper-1"
+            openai_audio_patch.transcribe.side_effect = mock_openai_response
+
+            transcriber = RemoteWhisperTranscriber(api_key="test_api_key", model_name=model, response_format="json")
+
+            result = transcriber.run(
+                audio_files=[
+                    str((preview_samples_path / "audio" / "this is the content of the document.wav").absolute())
+                ]
+            )
+
+            expected = Document(
+                text="test transcription",
+                metadata={
+                    "audio_file": str(
+                        (preview_samples_path / "audio" / "this is the content of the document.wav").absolute()
+                    )
+                },
+            )
+            assert result["documents"][0].text == expected.text
+            assert result["documents"][0].metadata == expected.metadata
+
+    @pytest.mark.skipif(
+        not os.environ.get("OPENAI_API_KEY", None),
+        reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
+    )
+    @pytest.mark.integration
+    def test_whisper_remote_transcriber(self, preview_samples_path):
+        comp = RemoteWhisperTranscriber(api_key=os.environ.get("OPENAI_API_KEY"))
+
+        output = comp.run(
+            audio_files=[
+                preview_samples_path / "audio" / "this is the content of the document.wav",
+                str((preview_samples_path / "audio" / "the context for this answer is here.wav").absolute()),
+                open(preview_samples_path / "audio" / "answer.wav", "rb"),
+            ]
+        )
+        docs = output["documents"]
+        assert len(docs) == 3
+
+        assert docs[0].text.strip().lower() == "this is the content of the document."
+        assert (
+            preview_samples_path / "audio" / "this is the content of the document.wav" == docs[0].metadata["audio_file"]
+        )
+
+        assert docs[1].text.strip().lower() == "the context for this answer is here."
+        assert (
+            str((preview_samples_path / "audio" / "the context for this answer is here.wav").absolute())
+            == docs[1].metadata["audio_file"]
+        )

-            open_file = open(file_path, "rb")
-            openai_audio_patch.transcribe.assert_called_once_with(file=open_file, model=model, response_format="json")
+        assert docs[2].text.strip().lower() == "answer."
+        assert docs[2].metadata["audio_file"] == "<<binary stream>>"
From 0deb5124740fb6d0027be742c56cfbfaaa98ee83 Mon Sep 17 00:00:00 2001
From: awinml <97467100+awinml@users.noreply.github.com>
Date: Mon, 23 Oct 2023 01:59:43 +0530
Subject: [PATCH 03/13] Remove unnecessary imports

---
 haystack/preview/components/audio/whisper_remote.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/haystack/preview/components/audio/whisper_remote.py b/haystack/preview/components/audio/whisper_remote.py
index 16632b53e3..e62cbf7b3e 100644
--- a/haystack/preview/components/audio/whisper_remote.py
+++ b/haystack/preview/components/audio/whisper_remote.py
@@ -1,7 +1,6 @@
-from typing import List, Optional, Dict, Any, Union, BinaryIO, Literal, get_args, Sequence
+from typing import List, Optional, Dict, Any, Union, BinaryIO

 import os
-import json
 import logging
 from pathlib import Path

From 10f6ba2d7f4dd84e90c8d34f7c8708d04010e421 Mon Sep 17 00:00:00 2001
From: awinml <97467100+awinml@users.noreply.github.com>
Date: Mon, 23 Oct 2023 02:12:49 +0530
Subject: [PATCH 04/13] Add release notes

---
 ...te-whisper-transcriber-to-openai-sdk-980ae6f54ddfd7df.yaml | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 releasenotes/notes/migrate-remote-whisper-transcriber-to-openai-sdk-980ae6f54ddfd7df.yaml

diff --git a/releasenotes/notes/migrate-remote-whisper-transcriber-to-openai-sdk-980ae6f54ddfd7df.yaml b/releasenotes/notes/migrate-remote-whisper-transcriber-to-openai-sdk-980ae6f54ddfd7df.yaml
new file mode 100644
index 0000000000..98e61e5f49
--- /dev/null
+++ b/releasenotes/notes/migrate-remote-whisper-transcriber-to-openai-sdk-980ae6f54ddfd7df.yaml
@@ -0,0 +1,4 @@
+---
+preview:
+  - |
+    Migrate RemoteWhisperTranscriber to OpenAI SDK.

From d04fcb137ea41a5a92d1d9b2864b3069e37842bd Mon Sep 17 00:00:00 2001
From: awinml <97467100+awinml@users.noreply.github.com>
Date: Mon, 23 Oct 2023 02:13:47 +0530
Subject: [PATCH 05/13] Fix api_key serialization

---
 haystack/preview/components/audio/whisper_remote.py  |  1 -
 test/preview/components/audio/test_whisper_remote.py | 11 ++++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/haystack/preview/components/audio/whisper_remote.py b/haystack/preview/components/audio/whisper_remote.py
index e62cbf7b3e..cde0a1c5ae 100644
--- a/haystack/preview/components/audio/whisper_remote.py
+++ b/haystack/preview/components/audio/whisper_remote.py
@@ -72,7 +72,6 @@ def __init__(
                     "Set the OPENAI_API_KEY environment variable (recommended) or pass it explicitly."
                ) from e

-        self.api_key = api_key
         self.organization = organization
         self.model_name = model_name
         self.api_base_url = api_base_url

diff --git a/test/preview/components/audio/test_whisper_remote.py b/test/preview/components/audio/test_whisper_remote.py
index 3a1a04b6ea..514c022e38 100644
--- a/test/preview/components/audio/test_whisper_remote.py
+++ b/test/preview/components/audio/test_whisper_remote.py
@@ -33,7 +33,7 @@ def test_init_default(self):
     def test_init_default(self):
         transcriber = RemoteWhisperTranscriber(api_key="test_api_key")

-        assert transcriber.api_key == "test_api_key"
+        assert openai.api_key == "test_api_key"
         assert transcriber.model_name == "whisper-1"
         assert transcriber.organization is None
         assert transcriber.api_base_url == "https://api.openai.com/v1"
@@ -52,7 +52,7 @@ def test_init_custom_parameters(self):
             temperature="0.5",
         )

-        assert transcriber.api_key == "test_api_key"
+        assert openai.api_key == "test_api_key"
         assert transcriber.model_name == "whisper-1"
         assert transcriber.organization == "test-org"
         assert transcriber.api_base_url == "test_api_url"
@@ -118,7 +118,7 @@ def test_from_dict_with_default_parameters(self, monkeypatch):

         transcriber = RemoteWhisperTranscriber.from_dict(data)

-        assert transcriber.api_key == "test_api_key"
+        assert openai.api_key == "test_api_key"
         assert transcriber.model_name == "whisper-1"
         assert transcriber.organization is None
         assert transcriber.api_base_url == "https://api.openai.com/v1"
@@ -141,7 +141,7 @@ def test_from_dict_with_custom_init_parameters(self, monkeypatch):
         }
         transcriber = RemoteWhisperTranscriber.from_dict(data)

-        assert transcriber.api_key == "test_api_key"
+        assert openai.api_key == "test_api_key"
         assert transcriber.model_name == "whisper-1"
         assert transcriber.organization == "test-org"
         assert transcriber.api_base_url == "test_api_url"
@@ -252,7 +252,8 @@ def test_whisper_remote_transcriber(self, preview_samples_path):

         assert docs[0].text.strip().lower() == "this is the content of the document."
         assert (
-            preview_samples_path / "audio" / "this is the content of the document.wav" == docs[0].metadata["audio_file"]
+            str((preview_samples_path / "audio" / "this is the content of the document.wav").absolute())
+            == docs[0].metadata["audio_file"]
         )

         assert docs[1].text.strip().lower() == "the context for this answer is here."
         assert (
             str((preview_samples_path / "audio" / "the context for this answer is here.wav").absolute())
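
[Reviewer note, not part of the patch series: the serialization round trip this patch is protecting, sketched under the assumption that OPENAI_API_KEY is exported when the component is rebuilt.

    transcriber = RemoteWhisperTranscriber(language="en")

    data = transcriber.to_dict()
    # data["init_parameters"] now carries model_name, organization, api_base_url
    # and the whisper kwargs -- but never the API key, so nothing leaks into
    # serialized pipelines.
    restored = RemoteWhisperTranscriber.from_dict(data)
]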
From c57b7ff5d46d52b1262b121648f6cd65296aedee Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Mon, 23 Oct 2023 14:45:11 +0530 Subject: [PATCH 06/13] Fix linting --- haystack/preview/components/audio/whisper_remote.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/haystack/preview/components/audio/whisper_remote.py b/haystack/preview/components/audio/whisper_remote.py index cde0a1c5ae..56f2e01e07 100644 --- a/haystack/preview/components/audio/whisper_remote.py +++ b/haystack/preview/components/audio/whisper_remote.py @@ -4,11 +4,10 @@ import logging from pathlib import Path -from haystack.preview.utils import request_with_retry -from haystack.preview import component, Document, default_to_dict, default_from_dict - import openai +from haystack.preview import component, Document, default_to_dict, default_from_dict + logger = logging.getLogger(__name__) From bc137032a12cec91bb57ca83309fadd2f503d23d Mon Sep 17 00:00:00 2001 From: Ashwin Mathur <97467100+awinml@users.noreply.github.com> Date: Tue, 24 Oct 2023 18:11:58 +0530 Subject: [PATCH 07/13] Apply suggestions from code review Co-authored-by: ZanSara --- haystack/preview/components/audio/whisper_remote.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/haystack/preview/components/audio/whisper_remote.py b/haystack/preview/components/audio/whisper_remote.py index 56f2e01e07..bfc555d3d1 100644 --- a/haystack/preview/components/audio/whisper_remote.py +++ b/haystack/preview/components/audio/whisper_remote.py @@ -62,6 +62,8 @@ def __init__( temperature until certain thresholds are hit. """ + # if the user does not provide the API key, check if it is set in the module client + api_key = api_key or openai.api_key if api_key is None: try: api_key = os.environ["OPENAI_API_KEY"] @@ -70,6 +72,7 @@ def __init__( "RemoteWhisperTranscriber expects an OpenAI API key. " "Set the OPENAI_API_KEY environment variable (recommended) or pass it explicitly." ) from e + openai.api_key = api_key self.organization = organization self.model_name = model_name @@ -77,10 +80,11 @@ def __init__( # Only response_format = "json" is supported whisper_params = kwargs + if whisper_params.get("response_format") != "json": + logger.warning("RemoteWhisperTranscriber only supports 'response_format: json'. This parameter will be overwritten.") whisper_params["response_format"] = "json" self.whisper_params = whisper_params - openai.api_key = api_key if organization is not None: openai.organization = organization From 8e4eebf74059f555346b1c99cfab7dd86abb49a0 Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Tue, 24 Oct 2023 21:13:00 +0530 Subject: [PATCH 08/13] Add additional tests for api_key --- .../preview/components/audio/whisper_remote.py | 4 +++- .../components/audio/test_whisper_remote.py | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/haystack/preview/components/audio/whisper_remote.py b/haystack/preview/components/audio/whisper_remote.py index bfc555d3d1..14713cb115 100644 --- a/haystack/preview/components/audio/whisper_remote.py +++ b/haystack/preview/components/audio/whisper_remote.py @@ -81,7 +81,9 @@ def __init__( # Only response_format = "json" is supported whisper_params = kwargs if whisper_params.get("response_format") != "json": - logger.warning("RemoteWhisperTranscriber only supports 'response_format: json'. 
         whisper_params["response_format"] = "json"
         self.whisper_params = whisper_params

-        openai.api_key = api_key
         if organization is not None:
             openai.organization = organization

From 8e4eebf74059f555346b1c99cfab7dd86abb49a0 Mon Sep 17 00:00:00 2001
From: awinml <97467100+awinml@users.noreply.github.com>
Date: Tue, 24 Oct 2023 21:13:00 +0530
Subject: [PATCH 08/13] Add additional tests for api_key

---
 .../preview/components/audio/whisper_remote.py  |  4 +++-
 .../components/audio/test_whisper_remote.py     | 15 +++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/haystack/preview/components/audio/whisper_remote.py b/haystack/preview/components/audio/whisper_remote.py
index bfc555d3d1..14713cb115 100644
--- a/haystack/preview/components/audio/whisper_remote.py
+++ b/haystack/preview/components/audio/whisper_remote.py
@@ -81,7 +81,9 @@ def __init__(
         # Only response_format = "json" is supported
         whisper_params = kwargs
         if whisper_params.get("response_format") != "json":
-            logger.warning("RemoteWhisperTranscriber only supports 'response_format: json'. This parameter will be overwritten.")
+            logger.warning(
+                "RemoteWhisperTranscriber only supports 'response_format: json'. This parameter will be overwritten."
+            )
         whisper_params["response_format"] = "json"
         self.whisper_params = whisper_params

diff --git a/test/preview/components/audio/test_whisper_remote.py b/test/preview/components/audio/test_whisper_remote.py
index 514c022e38..e69fee63e2 100644
--- a/test/preview/components/audio/test_whisper_remote.py
+++ b/test/preview/components/audio/test_whisper_remote.py
@@ -24,11 +24,25 @@ class TestRemoteWhisperTranscriber:
     @pytest.mark.unit
     def test_init_no_key(self, monkeypatch):
+        openai.api_key = None
         monkeypatch.delenv("OPENAI_API_KEY", raising=False)
         error_msg = "RemoteWhisperTranscriber expects an OpenAI API key."
         with pytest.raises(ValueError, match=error_msg):
             RemoteWhisperTranscriber(api_key=None)

+    def test_init_key_env_var(self, monkeypatch):
+        openai.api_key = None
+        monkeypatch.setenv("OPENAI_API_KEY", "test_api_key")
+        RemoteWhisperTranscriber(api_key=None)
+        assert openai.api_key == "test_api_key"
+
+    def test_init_key_module_env_and_global_var(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test_api_key_2")
+        openai.api_key = "test_api_key_1"
+        RemoteWhisperTranscriber(api_key=None)
+        # The module global variable takes preference
+        assert openai.api_key == "test_api_key_1"
+
     @pytest.mark.unit
     def test_init_default(self):
         transcriber = RemoteWhisperTranscriber(api_key="test_api_key")
@@ -167,6 +167,7 @@ def test_from_dict_with_default_parameters_no_env_var(self, monkeypatch):
     def test_from_dict_with_default_parameters_no_env_var(self, monkeypatch):
+        openai.api_key = None
         monkeypatch.delenv("OPENAI_API_KEY", raising=False)

         data = {
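
[Reviewer note, not part of the patch series: the key-resolution order these new tests pin down — an explicit `api_key` argument wins, then the `openai.api_key` module global, then the OPENAI_API_KEY environment variable; a ValueError is raised only when all three are unset. A minimal sketch:

    import openai
    openai.api_key = "module-level-key"

    # No explicit argument, so the module global takes precedence
    # over any OPENAI_API_KEY in the environment:
    RemoteWhisperTranscriber(api_key=None)
    assert openai.api_key == "module-level-key"
]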
From 25ecec59d302f5aa9a938415c8c0aad9ea8c50da Mon Sep 17 00:00:00 2001
From: awinml <97467100+awinml@users.noreply.github.com>
Date: Wed, 25 Oct 2023 01:49:43 +0530
Subject: [PATCH 09/13] Adapt .run() to take ByteStream inputs

---
 .../components/audio/whisper_remote.py        | 23 ++---
 .../components/audio/test_whisper_remote.py   | 97 ++++++-------------
 2 files changed, 35 insertions(+), 85 deletions(-)

diff --git a/haystack/preview/components/audio/whisper_remote.py b/haystack/preview/components/audio/whisper_remote.py
index 14713cb115..70ca58d905 100644
--- a/haystack/preview/components/audio/whisper_remote.py
+++ b/haystack/preview/components/audio/whisper_remote.py
@@ -1,13 +1,12 @@
-from typing import List, Optional, Dict, Any, Union, BinaryIO
+from typing import List, Optional, Dict, Any

 import os
 import logging
-from pathlib import Path

 import openai

 from haystack.preview import component, Document, default_to_dict, default_from_dict
-
+from haystack.preview.dataclasses import ByteStream

 logger = logging.getLogger(__name__)

@@ -112,7 +111,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "RemoteWhisperTranscriber":
         return default_from_dict(cls, data)

     @component.output_types(documents=List[Document])
-    def run(self, audio_files: List[Union[str, Path, BinaryIO]]):
+    def run(self, streams: List[ByteStream]):
         """
         Transcribe the audio files into a list of Documents, one for each input file.

@@ -128,19 +127,9 @@ def run(self, streams: List[ByteStream]):
         """
         documents = []

-        for audio_file in audio_files:
-            if isinstance(audio_file, (str, Path)):
-                if isinstance(audio_file, str):
-                    file_name = audio_file
-                else:
-                    file_name = str(audio_file.absolute())
-                with open(audio_file, "rb") as file:
-                    content = openai.Audio.transcribe(file=file, model=self.model_name, **self.whisper_params)
-            else:
-                file_name = "<<binary stream>>"
-                content = openai.Audio.transcribe(file=audio_file, model=self.model_name, **self.whisper_params)
-
-            doc = Document(text=content["text"], metadata={"audio_file": file_name})
+        for stream in streams:
+            content = openai.Audio.transcribe(file=stream.data, model=self.model_name, **self.whisper_params)
+            doc = Document(text=content["text"], metadata=stream.metadata)
             documents.append(doc)

         return {"documents": documents}

diff --git a/test/preview/components/audio/test_whisper_remote.py b/test/preview/components/audio/test_whisper_remote.py
index e69fee63e2..9b799243a5 100644
--- a/test/preview/components/audio/test_whisper_remote.py
+++ b/test/preview/components/audio/test_whisper_remote.py
@@ -1,13 +1,10 @@
 from unittest.mock import patch
 import os
-
-
 import pytest
-
 import openai
 from openai.util import convert_to_openai_object
-
-from haystack.preview.dataclasses import Document
+from haystack.preview.dataclasses import ByteStream
 from haystack.preview.components.audio.whisper_remote import RemoteWhisperTranscriber

     def test_run(self, preview_samples_path):
         with patch("haystack.preview.components.audio.whisper_remote.openai.Audio") as openai_audio_patch:
             model = "whisper-1"
+            file_path = preview_samples_path / "audio" / "this is the content of the document.wav"
             openai_audio_patch.transcribe.side_effect = mock_openai_response

             transcriber = RemoteWhisperTranscriber(api_key="test_api_key", model_name=model, response_format="json")
-            with open(preview_samples_path / "audio" / "this is the content of the document.wav", "rb") as audio_stream:
-                result = transcriber.run(audio_files=[audio_stream])
+            with open(file_path, "rb") as audio_stream:
+                byte_stream = audio_stream.read()
+                audio_file = ByteStream(byte_stream, metadata={"file_path": str(file_path.absolute())})
+                result = transcriber.run(streams=[audio_file])

             assert result["documents"][0].text == "test transcription"
+            assert result["documents"][0].metadata["file_path"] == str(file_path.absolute())

             openai_audio_patch.transcribe.assert_called_once_with(
-                file=audio_stream, model=model, response_format="json"
+                file=audio_file.data, model=model, response_format="json"
             )

-    @pytest.mark.unit
-    def test_run_with_path(self, preview_samples_path):
-        with patch("haystack.preview.components.audio.whisper_remote.openai.Audio") as openai_audio_patch:
-            model = "whisper-1"
-            openai_audio_patch.transcribe.side_effect = mock_openai_response
-
-            transcriber = RemoteWhisperTranscriber(api_key="test_api_key", model_name=model, response_format="json")
-
-            result = transcriber.run(
-                audio_files=[preview_samples_path / "audio" / "this is the content of the document.wav"]
-            )
-
-            expected = Document(
-                text="test transcription",
-                metadata={
-                    "audio_file": str(
-                        (preview_samples_path / "audio" / "this is the content of the document.wav").absolute()
-                    )
-                },
-            )
-            assert result["documents"][0].text == expected.text
-            assert result["documents"][0].metadata == expected.metadata
-
-    @pytest.mark.unit
-    def test_run_with_str(self, preview_samples_path):
-        with patch("haystack.preview.components.audio.whisper_remote.openai.Audio") as openai_audio_patch:
-            model = "whisper-1"
-            openai_audio_patch.transcribe.side_effect = mock_openai_response
-
-            transcriber = RemoteWhisperTranscriber(api_key="test_api_key", model_name=model, response_format="json")
-
-            result = transcriber.run(
-                audio_files=[
-                    str((preview_samples_path / "audio" / "this is the content of the document.wav").absolute())
-                ]
-            )
-
-            expected = Document(
-                text="test transcription",
-                metadata={
-                    "audio_file": str(
-                        (preview_samples_path / "audio" / "this is the content of the document.wav").absolute()
-                    )
-                },
-            )
-            assert result["documents"][0].text == expected.text
-            assert result["documents"][0].metadata == expected.metadata

     @pytest.mark.skipif(
         not os.environ.get("OPENAI_API_KEY", None),
         reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
     )
     @pytest.mark.integration
     def test_whisper_remote_transcriber(self, preview_samples_path):
-        comp = RemoteWhisperTranscriber(api_key=os.environ.get("OPENAI_API_KEY"))
-
-        output = comp.run(
-            audio_files=[
-                preview_samples_path / "audio" / "this is the content of the document.wav",
-                str((preview_samples_path / "audio" / "the context for this answer is here.wav").absolute()),
-                open(preview_samples_path / "audio" / "answer.wav", "rb"),
-            ]
-        )
+        transcriber = RemoteWhisperTranscriber(api_key=os.environ.get("OPENAI_API_KEY"))
+
+        paths = [
+            preview_samples_path / "audio" / "this is the content of the document.wav",
+            preview_samples_path / "audio" / "the context for this answer is here.wav",
+            preview_samples_path / "audio" / "answer.wav",
+        ]
+
+        audio_files = []
+        for file_path in paths:
+            with open(file_path, "rb") as audio_stream:
+                byte_stream = audio_stream.read()
+                audio_file = ByteStream(byte_stream, metadata={"file_path": str(file_path.absolute())})
+                audio_files.append(audio_file)
+
+        output = transcriber.run(streams=audio_files)
+
         docs = output["documents"]
         assert len(docs) == 3
+
         assert docs[0].text.strip().lower() == "this is the content of the document."
         assert (
             str((preview_samples_path / "audio" / "this is the content of the document.wav").absolute())
-            == docs[0].metadata["audio_file"]
+            == docs[0].metadata["file_path"]
         )

         assert docs[1].text.strip().lower() == "the context for this answer is here."
         assert (
             str((preview_samples_path / "audio" / "the context for this answer is here.wav").absolute())
-            == docs[1].metadata["audio_file"]
+            == docs[1].metadata["file_path"]
         )

         assert docs[2].text.strip().lower() == "answer."
-        assert docs[2].metadata["audio_file"] == "<<binary stream>>"
+        assert str((preview_samples_path / "audio" / "answer.wav").absolute()) == docs[2].metadata["file_path"]
From d069035cd462628bb3399c53bc7f427d41228558 Mon Sep 17 00:00:00 2001
From: awinml <97467100+awinml@users.noreply.github.com>
Date: Wed, 25 Oct 2023 02:14:54 +0530
Subject: [PATCH 10/13] Update docstrings

---
 haystack/preview/components/audio/whisper_remote.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/haystack/preview/components/audio/whisper_remote.py b/haystack/preview/components/audio/whisper_remote.py
index 70ca58d905..9d255a1bed 100644
--- a/haystack/preview/components/audio/whisper_remote.py
+++ b/haystack/preview/components/audio/whisper_remote.py
@@ -119,11 +119,8 @@ def run(self, streams: List[ByteStream]):
         [Whisper API documentation](https://platform.openai.com/docs/guides/speech-to-text) and the official Whisper
         [github repo](https://github.com/openai/whisper).

-        :param audio_files: a list of paths or binary streams to transcribe
-        :returns: a list of Documents, one for each file. The content of the
-            document is the transcription text, while the document's metadata
-            contains a key called `audio_file`, which contains the path to the
-            audio file used for the transcription.
+        :param streams: a list of ByteStream objects to transcribe.
+        :returns: a list of Documents, one for each file. The content of the document is the transcription text.
         """

From 212c94fc494ded2a26b710cf8962b8af347fd81e Mon Sep 17 00:00:00 2001
From: awinml <97467100+awinml@users.noreply.github.com>
Date: Thu, 26 Oct 2023 13:31:43 +0530
Subject: [PATCH 11/13] Rework implementation to use io.BytesIO

---
 .../components/audio/whisper_remote.py        | 20 ++++++++++++++-----
 .../components/audio/test_whisper_remote.py   | 12 +++++------
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/haystack/preview/components/audio/whisper_remote.py b/haystack/preview/components/audio/whisper_remote.py
index 9d255a1bed..275a6c478b 100644
--- a/haystack/preview/components/audio/whisper_remote.py
+++ b/haystack/preview/components/audio/whisper_remote.py
@@ -1,11 +1,11 @@
-from typing import List, Optional, Dict, Any
-
+import io
+import logging
 import os
-import logging
+from typing import Any, Dict, List, Optional

 import openai

-from haystack.preview import component, Document, default_to_dict, default_from_dict
+from haystack.preview import Document, component, default_from_dict, default_to_dict
 from haystack.preview.dataclasses import ByteStream

 logger = logging.getLogger(__name__)
@@ -125,7 +125,17 @@ def run(self, streams: List[ByteStream]):
         documents = []

         for stream in streams:
-            content = openai.Audio.transcribe(file=stream.data, model=self.model_name, **self.whisper_params)
+            try:
+                file = io.BytesIO(stream.data)
+                file.name = stream.metadata["file_path"]
+            except Exception as e:
+                logger.warning(
+                    "Could not read audio file. Skipping it. Make sure the 'file_path' is present in the metadata. Error message: %s",
Error message: %s", + e, + ) + continue + + content = openai.Audio.transcribe(file=file, model=self.model_name, **self.whisper_params) doc = Document(text=content["text"], metadata=stream.metadata) documents.append(doc) diff --git a/test/preview/components/audio/test_whisper_remote.py b/test/preview/components/audio/test_whisper_remote.py index 9b799243a5..3fae7fc3e3 100644 --- a/test/preview/components/audio/test_whisper_remote.py +++ b/test/preview/components/audio/test_whisper_remote.py @@ -1,11 +1,12 @@ -from unittest.mock import patch import os -import pytest +from unittest.mock import patch + import openai +import pytest from openai.util import convert_to_openai_object -from haystack.preview.dataclasses import ByteStream from haystack.preview.components.audio.whisper_remote import RemoteWhisperTranscriber +from haystack.preview.dataclasses import ByteStream def mock_openai_response(response_format="json", **kwargs) -> openai.openai_object.OpenAIObject: @@ -191,15 +192,12 @@ def test_run(self, preview_samples_path): with open(file_path, "rb") as audio_stream: byte_stream = audio_stream.read() audio_file = ByteStream(byte_stream, metadata={"file_path": str(file_path.absolute())}) + result = transcriber.run(streams=[audio_file]) assert result["documents"][0].text == "test transcription" assert result["documents"][0].metadata["file_path"] == str(file_path.absolute()) - openai_audio_patch.transcribe.assert_called_once_with( - file=audio_file.data, model=model, response_format="json" - ) - @pytest.mark.skipif( not os.environ.get("OPENAI_API_KEY", None), reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.", From 3714122f91148eb31a7e7ff7631d07156ff2470f Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Thu, 26 Oct 2023 14:07:41 +0530 Subject: [PATCH 12/13] Update error message --- haystack/preview/components/audio/whisper_remote.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/preview/components/audio/whisper_remote.py b/haystack/preview/components/audio/whisper_remote.py index 275a6c478b..65f64df290 100644 --- a/haystack/preview/components/audio/whisper_remote.py +++ b/haystack/preview/components/audio/whisper_remote.py @@ -128,7 +128,7 @@ def run(self, streams: List[ByteStream]): try: file = io.BytesIO(stream.data) file.name = stream.metadata["file_path"] - except Exception as e: + except KeyError as e: logger.warning( "Could not read audio file. Skipping it. Make sure the 'file_path' is present in the metadata. Error message: %s", e, From 5fbbbb5c17fc5dcbce196c49cfd14c562577c63c Mon Sep 17 00:00:00 2001 From: awinml <97467100+awinml@users.noreply.github.com> Date: Thu, 26 Oct 2023 18:33:01 +0530 Subject: [PATCH 13/13] Add default file name --- haystack/preview/components/audio/whisper_remote.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/haystack/preview/components/audio/whisper_remote.py b/haystack/preview/components/audio/whisper_remote.py index 65f64df290..4ef4821b30 100644 --- a/haystack/preview/components/audio/whisper_remote.py +++ b/haystack/preview/components/audio/whisper_remote.py @@ -125,15 +125,11 @@ def run(self, streams: List[ByteStream]): documents = [] for stream in streams: + file = io.BytesIO(stream.data) try: - file = io.BytesIO(stream.data) file.name = stream.metadata["file_path"] - except KeyError as e: - logger.warning( - "Could not read audio file. Skipping it. Make sure the 'file_path' is present in the metadata. 
Error message: %s", - e, - ) - continue + except KeyError: + file.name = "audio_input.wav" content = openai.Audio.transcribe(file=file, model=self.model_name, **self.whisper_params) doc = Document(text=content["text"], metadata=stream.metadata)