From ff10effbedd413dd2ccae38dc744be7465e42c62 Mon Sep 17 00:00:00 2001 From: ftnext Date: Fri, 20 Dec 2024 23:16:45 +0900 Subject: [PATCH 01/14] [test] Add one case of google-cloud-speech --- tests/recognizers/test_google_cloud.py | 51 ++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 tests/recognizers/test_google_cloud.py diff --git a/tests/recognizers/test_google_cloud.py b/tests/recognizers/test_google_cloud.py new file mode 100644 index 00000000..c55a81dc --- /dev/null +++ b/tests/recognizers/test_google_cloud.py @@ -0,0 +1,51 @@ +from unittest.mock import MagicMock, patch + +from google.cloud.speech import ( + RecognitionAudio, + RecognitionConfig, + RecognizeResponse, + SpeechRecognitionAlternative, + SpeechRecognitionResult, +) + +from speech_recognition import Recognizer +from speech_recognition.audio import AudioData + + +@patch("google.cloud.speech.SpeechClient") +def test_transcribe_with_google_cloud_speech(SpeechClient, monkeypatch): + monkeypatch.setenv( + "GOOGLE_APPLICATION_CREDENTIALS", "path/to/credentials.json" + ) + + client = SpeechClient.return_value + # ref: https://cloud.google.com/speech-to-text/docs/transcribe-gcloud?hl=ja#make_an_audio_transcription_request + client.recognize.return_value = RecognizeResponse( + results=[ + SpeechRecognitionResult( + alternatives=[ + SpeechRecognitionAlternative( + transcript="how old is the Brooklyn Bridge", + confidence=0.9840146, + ) + ] + ) + ] + ) + + audio_data = MagicMock(spec=AudioData) + audio_data.sample_rate = 16_000 + audio_data.get_flac_data.return_value = b"flac_data" + + actual = Recognizer().recognize_google_cloud(audio_data) + + assert actual == "how old is the Brooklyn Bridge " + SpeechClient.assert_called_once_with() + client.recognize.assert_called_once_with( + config=RecognitionConfig( + encoding=RecognitionConfig.AudioEncoding.FLAC, + sample_rate_hertz=16_000, + language_code="en-US", + ), + audio=RecognitionAudio(content=b"flac_data"), + ) From 
6b46a9b886f90f1b115ef831adc0278f208130f8 Mon Sep 17 00:00:00 2001 From: ftnext Date: Fri, 20 Dec 2024 23:20:23 +0900 Subject: [PATCH 02/14] [chore] Add google-cloud extra --- .github/workflows/unittests.yml | 6 +++--- setup.cfg | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 8b2a6daa..170b78f8 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -44,16 +44,16 @@ jobs: - name: Install Python dependencies (Ubuntu, <=3.12) if: matrix.os == 'ubuntu-latest' && matrix.python-version != '3.13' run: | - python -m pip install .[dev,audio,pocketsphinx,whisper-local,openai,groq] + python -m pip install .[dev,audio,pocketsphinx,google-cloud,whisper-local,openai,groq] - name: Install Python dependencies (Ubuntu, 3.13) if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.13' run: | python -m pip install standard-aifc setuptools - python -m pip install --no-build-isolation .[dev,audio,pocketsphinx,openai,groq] + python -m pip install --no-build-isolation .[dev,audio,pocketsphinx,google-cloud,openai,groq] - name: Install Python dependencies (Windows) if: matrix.os == 'windows-latest' run: | - python -m pip install .[dev,whisper-local,openai,groq] + python -m pip install .[dev,whisper-local,google-cloud,openai,groq] - name: Test with unittest run: | pytest --doctest-modules -v speech_recognition/recognizers/ tests/ diff --git a/setup.cfg b/setup.cfg index 33ac0595..9d1def40 100644 --- a/setup.cfg +++ b/setup.cfg @@ -7,6 +7,8 @@ audio = PyAudio >= 0.2.11 pocketsphinx = pocketsphinx < 5 +google-cloud = + google-cloud-speech whisper-local = openai-whisper soundfile From 27f6d8588ea405645850768d567a38dd0aead82f Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 21 Dec 2024 15:24:32 +0900 Subject: [PATCH 03/14] [docs] Install with google-cloud extra --- README.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.rst 
b/README.rst index afb69b66..522777b7 100644 --- a/README.rst +++ b/README.rst @@ -151,14 +151,15 @@ You also have to install Vosk Models: `Here `__ are models avaiable for download. You have to place them in models folder of your project, like "your-project-folder/models/your-vosk-model" -Google Cloud Speech Library for Python (for Google Cloud Speech API users) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Google Cloud Speech Library for Python (for Google Cloud Speech-to-Text API users) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -`Google Cloud Speech library for Python `__ is required if and only if you want to use the Google Cloud Speech API (``recognizer_instance.recognize_google_cloud``). +The library `google-cloud-speech `__ is **required if and only if you want to use Google Cloud Speech-to-Text API** (``recognizer_instance.recognize_google_cloud``). -If not installed, everything in the library will still work, except calling ``recognizer_instance.recognize_google_cloud`` will raise an ``RequestError``. +You can install it with :command:`python3 -m pip install SpeechRecognition[google-cloud]`. +(ref: `official installation instructions `__) -According to the `official installation instructions `__, the recommended way to install this is using `Pip `__: execute ``pip install google-cloud-speech`` (replace ``pip`` with ``pip3`` if using Python 3). +Currently only `V1 `__ is supported. 
(`V2 `__ is not supported) FLAC (for some systems) ~~~~~~~~~~~~~~~~~~~~~~~ From 1df198bb81609a317ca4fc1dc0e9bb767552a8ec Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 21 Dec 2024 15:41:02 +0900 Subject: [PATCH 04/14] [refactor] Extract google-cloud-speech recognition --- speech_recognition/__init__.py | 87 +----------------- .../recognizers/google_cloud.py | 90 +++++++++++++++++++ 2 files changed, 92 insertions(+), 85 deletions(-) create mode 100644 speech_recognition/recognizers/google_cloud.py diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index d183c58b..f7c19f11 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -693,90 +693,6 @@ def recognize_sphinx(self, audio_data, language="en-US", keyword_entries=None, g if hypothesis is not None: return hypothesis.hypstr raise UnknownValueError() # no transcriptions available - def recognize_google_cloud(self, audio_data, credentials_json=None, language="en-US", preferred_phrases=None, show_all=False, **api_params): - """ - Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech API. - - This function requires a Google Cloud Platform account; see the `Google Cloud Speech API Quickstart `__ for details and instructions. Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project, and set up Service Account Key credentials for the project. The result is a JSON file containing the API credentials. The text content of this JSON file is specified by ``credentials_json``. If not specified, the library will try to automatically `find the default API credentials JSON file `__. - - The recognition language is determined by ``language``, which is a BCP-47 language tag like ``"en-US"`` (US English). A list of supported language tags can be found in the `Google Cloud Speech API documentation `__. 
- - If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. Note that the API imposes certain `restrictions on the list of phrase strings `__. - - ``api_params`` are Cloud Speech API-specific parameters as dict (optional). For more information see - - The ``use_enhanced`` is a boolean option. If use_enhanced is set to true and the model field is not set, - then an appropriate enhanced model is chosen if an enhanced model exists for the audio. - If use_enhanced is true and an enhanced version of the specified model does not exist, - then the speech is recognized using the standard version of the specified model. - - Furthermore, if the option ``use_enhanced`` has not been set the option ``model`` can be used, which can be used to select the model best - suited to your domain to get best results. If a model is not explicitly specified, - then we auto-select a model based on the other parameters of this method. - - Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary. - - Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection. 
- """ - assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data" - if credentials_json is None: - assert os.environ.get('GOOGLE_APPLICATION_CREDENTIALS') is not None - assert isinstance(language, str), "``language`` must be a string" - assert preferred_phrases is None or all(isinstance(preferred_phrases, (type(""), type(u""))) for preferred_phrases in preferred_phrases), "``preferred_phrases`` must be a list of strings" - - try: - import socket - - from google.api_core.exceptions import GoogleAPICallError - from google.cloud import speech - except ImportError: - raise RequestError('missing google-cloud-speech module: ensure that google-cloud-speech is set up correctly.') - - if credentials_json is not None: - client = speech.SpeechClient.from_service_account_json(credentials_json) - else: - client = speech.SpeechClient() - - flac_data = audio_data.get_flac_data( - convert_rate=None if 8000 <= audio_data.sample_rate <= 48000 else max(8000, min(audio_data.sample_rate, 48000)), # audio sample rate must be between 8 kHz and 48 kHz inclusive - clamp sample rate into this range - convert_width=2 # audio samples must be 16-bit - ) - audio = speech.RecognitionAudio(content=flac_data) - - config = { - 'encoding': speech.RecognitionConfig.AudioEncoding.FLAC, - 'sample_rate_hertz': audio_data.sample_rate, - 'language_code': language, - **api_params, - } - if preferred_phrases is not None: - config['speechContexts'] = [speech.SpeechContext( - phrases=preferred_phrases - )] - if show_all: - config['enableWordTimeOffsets'] = True # some useful extra options for when we want all the output - - opts = {} - if self.operation_timeout and socket.getdefaulttimeout() is None: - opts['timeout'] = self.operation_timeout - - config = speech.RecognitionConfig(**config) - - try: - response = client.recognize(config=config, audio=audio) - except GoogleAPICallError as e: - raise RequestError(e) - except URLError as e: - raise RequestError("recognition connection failed: 
{0}".format(e.reason)) - - if show_all: return response - if len(response.results) == 0: raise UnknownValueError() - - transcript = '' - for result in response.results: - transcript += result.alternatives[0].transcript.strip() + ' ' - return transcript - def recognize_wit(self, audio_data, key, show_all=False): """ Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Wit.ai API. @@ -1518,11 +1434,12 @@ def flush(self, *args, **kwargs): # At this time, the dependencies are not yet installed, resulting in a ModuleNotFoundError. # This is a workaround to resolve this issue try: - from .recognizers import google, openai, groq + from .recognizers import google, google_cloud, openai, groq except (ModuleNotFoundError, ImportError): pass else: Recognizer.recognize_google = google.recognize_legacy + Recognizer.recognize_google_cloud = google_cloud.recognize Recognizer.recognize_openai = openai.recognize Recognizer.recognize_whisper_api = openai.recognize # Deprecated Recognizer.recognize_groq = groq.recognize diff --git a/speech_recognition/recognizers/google_cloud.py b/speech_recognition/recognizers/google_cloud.py new file mode 100644 index 00000000..bf72e72f --- /dev/null +++ b/speech_recognition/recognizers/google_cloud.py @@ -0,0 +1,90 @@ +import os +from urllib.error import URLError + +from speech_recognition.audio import AudioData +from speech_recognition.exceptions import RequestError, UnknownValueError + + +def recognize(recognizer, audio_data, credentials_json=None, language="en-US", preferred_phrases=None, show_all=False, **api_params): + """ + Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech API. + + This function requires a Google Cloud Platform account; see the `Google Cloud Speech API Quickstart `__ for details and instructions. 
Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project, and set up Service Account Key credentials for the project. The result is a JSON file containing the API credentials. The text content of this JSON file is specified by ``credentials_json``. If not specified, the library will try to automatically `find the default API credentials JSON file `__. + + The recognition language is determined by ``language``, which is a BCP-47 language tag like ``"en-US"`` (US English). A list of supported language tags can be found in the `Google Cloud Speech API documentation `__. + + If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. Note that the API imposes certain `restrictions on the list of phrase strings `__. + + ``api_params`` are Cloud Speech API-specific parameters as dict (optional). For more information see + + The ``use_enhanced`` is a boolean option. If use_enhanced is set to true and the model field is not set, + then an appropriate enhanced model is chosen if an enhanced model exists for the audio. + If use_enhanced is true and an enhanced version of the specified model does not exist, + then the speech is recognized using the standard version of the specified model. + + Furthermore, if the option ``use_enhanced`` has not been set the option ``model`` can be used, which can be used to select the model best + suited to your domain to get best results. If a model is not explicitly specified, + then we auto-select a model based on the other parameters of this method. + + Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary. 
+ + Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection. + """ + assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data" + if credentials_json is None: + assert os.environ.get('GOOGLE_APPLICATION_CREDENTIALS') is not None + assert isinstance(language, str), "``language`` must be a string" + assert preferred_phrases is None or all(isinstance(preferred_phrases, (type(""), type(u""))) for preferred_phrases in preferred_phrases), "``preferred_phrases`` must be a list of strings" + + try: + import socket + + from google.api_core.exceptions import GoogleAPICallError + from google.cloud import speech + except ImportError: + raise RequestError('missing google-cloud-speech module: ensure that google-cloud-speech is set up correctly.') + + if credentials_json is not None: + client = speech.SpeechClient.from_service_account_json(credentials_json) + else: + client = speech.SpeechClient() + + flac_data = audio_data.get_flac_data( + convert_rate=None if 8000 <= audio_data.sample_rate <= 48000 else max(8000, min(audio_data.sample_rate, 48000)), # audio sample rate must be between 8 kHz and 48 kHz inclusive - clamp sample rate into this range + convert_width=2 # audio samples must be 16-bit + ) + audio = speech.RecognitionAudio(content=flac_data) + + config = { + 'encoding': speech.RecognitionConfig.AudioEncoding.FLAC, + 'sample_rate_hertz': audio_data.sample_rate, + 'language_code': language, + **api_params, + } + if preferred_phrases is not None: + config['speechContexts'] = [speech.SpeechContext( + phrases=preferred_phrases + )] + if show_all: + config['enableWordTimeOffsets'] = True # some useful extra options for when we want all the output + + opts = {} + if recognizer.operation_timeout and socket.getdefaulttimeout() is None: + opts['timeout'] = 
recognizer.operation_timeout + + config = speech.RecognitionConfig(**config) + + try: + response = client.recognize(config=config, audio=audio) + except GoogleAPICallError as e: + raise RequestError(e) + except URLError as e: + raise RequestError("recognition connection failed: {0}".format(e.reason)) + + if show_all: return response + if len(response.results) == 0: raise UnknownValueError() + + transcript = '' + for result in response.results: + transcript += result.alternatives[0].transcript.strip() + ' ' + return transcript From b8d9f6ee436784d350c34adb678d2be4eada839d Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 21 Dec 2024 15:45:02 +0900 Subject: [PATCH 05/14] [refactor] Remove dead code * opts not used * operation_timeout = None (Recognizer.__init__()) --- speech_recognition/recognizers/google_cloud.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/speech_recognition/recognizers/google_cloud.py b/speech_recognition/recognizers/google_cloud.py index bf72e72f..86af2aeb 100644 --- a/speech_recognition/recognizers/google_cloud.py +++ b/speech_recognition/recognizers/google_cloud.py @@ -37,8 +37,6 @@ def recognize(recognizer, audio_data, credentials_json=None, language="en-US", p assert preferred_phrases is None or all(isinstance(preferred_phrases, (type(""), type(u""))) for preferred_phrases in preferred_phrases), "``preferred_phrases`` must be a list of strings" try: - import socket - from google.api_core.exceptions import GoogleAPICallError from google.cloud import speech except ImportError: @@ -68,10 +66,6 @@ def recognize(recognizer, audio_data, credentials_json=None, language="en-US", p if show_all: config['enableWordTimeOffsets'] = True # some useful extra options for when we want all the output - opts = {} - if recognizer.operation_timeout and socket.getdefaulttimeout() is None: - opts['timeout'] = recognizer.operation_timeout - config = speech.RecognitionConfig(**config) try: response = client.recognize(config=config, audio=audio) except GoogleAPICallError as e: raise RequestError(e) except URLError as e: raise RequestError("recognition connection failed: {0}".format(e.reason)) if show_all: return response if len(response.results) == 0: raise UnknownValueError() transcript = '' for result in response.results: transcript += result.alternatives[0].transcript.strip() + ' ' return transcript From b792f18ac57d1e2dc76550e14c78310b1fccd910 Mon Sep 17 00:00:00 2001
From: ftnext Date: Sat, 21 Dec 2024 15:49:52 +0900 Subject: [PATCH 06/14] [refactor] Tweak sut --- tests/recognizers/test_google_cloud.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/recognizers/test_google_cloud.py b/tests/recognizers/test_google_cloud.py index c55a81dc..3a3c4154 100644 --- a/tests/recognizers/test_google_cloud.py +++ b/tests/recognizers/test_google_cloud.py @@ -10,6 +10,7 @@ from speech_recognition import Recognizer from speech_recognition.audio import AudioData +from speech_recognition.recognizers.google_cloud import recognize @patch("google.cloud.speech.SpeechClient") @@ -37,7 +38,7 @@ def test_transcribe_with_google_cloud_speech(SpeechClient, monkeypatch): audio_data.sample_rate = 16_000 audio_data.get_flac_data.return_value = b"flac_data" - actual = Recognizer().recognize_google_cloud(audio_data) + actual = recognize(MagicMock(spec=Recognizer), audio_data) assert actual == "how old is the Brooklyn Bridge " SpeechClient.assert_called_once_with() From 4c66a7b8f839223c6ab716fab5b31936971b5d8b Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 21 Dec 2024 16:03:18 +0900 Subject: [PATCH 07/14] [docs] Don't Repeat Yourself --- reference/library-reference.rst | 18 +----------------- speech_recognition/recognizers/google_cloud.py | 3 +-- 2 files changed, 2 insertions(+), 19 deletions(-) diff --git a/reference/library-reference.rst b/reference/library-reference.rst index 2c6a47d1..3f410aba 100644 --- a/reference/library-reference.rst +++ b/reference/library-reference.rst @@ -230,23 +230,7 @@ Raises a ``speech_recognition.UnknownValueError`` exception if the speech is uni ``recognizer_instance.recognize_google_cloud(audio_data: AudioData, credentials_json: Union[str, None] = None, language: str = "en-US", preferred_phrases: Union[Iterable[str], None] = None, show_all: bool = False, **api_params) -> Union[str, Dict[str, Any]]`` 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech API. - -This function requires a Google Cloud Platform account; see the `Google Cloud Speech API Quickstart `__ for details and instructions. Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project, and set up Service Account Key credentials for the project. The result is a JSON file containing the API credentials. The text content of this JSON file is specified by ``credentials_json``. If not specified, the library will try to automatically `find the default API credentials JSON file `__. - -The recognition language is determined by ``language``, which is a BCP-47 language tag like ``"en-US"`` (US English). A list of supported language tags can be found in the `Google Cloud Speech API documentation `__. - -If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. Note that the API imposes certain `restrictions on the list of phrase strings `__. - -``api_params`` are Cloud Speech API-specific parameters as dict (optional). For more information see - -The ``use_enhanced`` is a boolean option. If use_enhanced is set to true and the model field is not set, then an appropriate enhanced model is chosen if an enhanced model exists for the audio. If use_enhanced is true and an enhanced version of the specified model does not exist, then the speech is recognized using the standard version of the specified model. 
- -Furthermore, if the option ``use_enhanced`` has not been set the option ``model`` can be used, which can be used to select the model best suited to your domain to get best results. If a model is not explicitly specified, then we auto-select a model based on the other parameters of this method. - -Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary. - -Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection. +.. autofunction:: speech_recognition.recognizers.google_cloud.recognize ``recognizer_instance.recognize_wit(audio_data: AudioData, key: str, show_all: bool = False) -> Union[str, Dict[str, Any]]`` ---------------------------------------------------------------------------------------------------------------------------- diff --git a/speech_recognition/recognizers/google_cloud.py b/speech_recognition/recognizers/google_cloud.py index 86af2aeb..d0988e69 100644 --- a/speech_recognition/recognizers/google_cloud.py +++ b/speech_recognition/recognizers/google_cloud.py @@ -6,8 +6,7 @@ def recognize(recognizer, audio_data, credentials_json=None, language="en-US", preferred_phrases=None, show_all=False, **api_params): - """ - Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech API. + """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech-to-Text V1 API. This function requires a Google Cloud Platform account; see the `Google Cloud Speech API Quickstart `__ for details and instructions. Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project, and set up Service Account Key credentials for the project. 
The result is a JSON file containing the API credentials. The text content of this JSON file is specified by ``credentials_json``. If not specified, the library will try to automatically `find the default API credentials JSON file `__. From 35447a4b1ac8e017c1d9e0c3c9dc2cd6e93c42be Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 21 Dec 2024 16:03:45 +0900 Subject: [PATCH 08/14] [chore] Add Sphinx support to rstcheck --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index cbf1e79c..bd18a867 100644 --- a/Makefile +++ b/Makefile @@ -3,4 +3,4 @@ lint: @pipx run flake8 --ignore=E501,E701,W503 . rstcheck: - @pipx run rstcheck --ignore-directives autofunction README.rst reference/*.rst + @pipx run rstcheck[sphinx] --ignore-directives autofunction README.rst reference/*.rst From 8a2afcf280ddc20ffce126b408c9f1550790282e Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 21 Dec 2024 16:06:50 +0900 Subject: [PATCH 09/14] [style] Format --- .../recognizers/google_cloud.py | 67 +++++++++++++------ 1 file changed, 47 insertions(+), 20 deletions(-) diff --git a/speech_recognition/recognizers/google_cloud.py b/speech_recognition/recognizers/google_cloud.py index d0988e69..8470902c 100644 --- a/speech_recognition/recognizers/google_cloud.py +++ b/speech_recognition/recognizers/google_cloud.py @@ -5,7 +5,15 @@ from speech_recognition.exceptions import RequestError, UnknownValueError -def recognize(recognizer, audio_data, credentials_json=None, language="en-US", preferred_phrases=None, show_all=False, **api_params): +def recognize( + recognizer, + audio_data, + credentials_json=None, + language="en-US", + preferred_phrases=None, + show_all=False, + **api_params +): """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech-to-Text V1 API. This function requires a Google Cloud Platform account; see the `Google Cloud Speech API Quickstart `__ for details and instructions. 
Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project, and set up Service Account Key credentials for the project. The result is a JSON file containing the API credentials. The text content of this JSON file is specified by ``credentials_json``. If not specified, the library will try to automatically `find the default API credentials JSON file `__. @@ -29,41 +37,56 @@ def recognize(recognizer, audio_data, credentials_json=None, language="en-US", p Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection. """ - assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data" + assert isinstance( + audio_data, AudioData + ), "``audio_data`` must be audio data" if credentials_json is None: - assert os.environ.get('GOOGLE_APPLICATION_CREDENTIALS') is not None + assert os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") is not None assert isinstance(language, str), "``language`` must be a string" - assert preferred_phrases is None or all(isinstance(preferred_phrases, (type(""), type(u""))) for preferred_phrases in preferred_phrases), "``preferred_phrases`` must be a list of strings" + assert preferred_phrases is None or all( + isinstance(preferred_phrases, (type(""), type(""))) + for preferred_phrases in preferred_phrases + ), "``preferred_phrases`` must be a list of strings" try: from google.api_core.exceptions import GoogleAPICallError from google.cloud import speech except ImportError: - raise RequestError('missing google-cloud-speech module: ensure that google-cloud-speech is set up correctly.') + raise RequestError( + "missing google-cloud-speech module: ensure that google-cloud-speech is set up correctly." 
+ ) if credentials_json is not None: - client = speech.SpeechClient.from_service_account_json(credentials_json) + client = speech.SpeechClient.from_service_account_json( + credentials_json + ) else: client = speech.SpeechClient() flac_data = audio_data.get_flac_data( - convert_rate=None if 8000 <= audio_data.sample_rate <= 48000 else max(8000, min(audio_data.sample_rate, 48000)), # audio sample rate must be between 8 kHz and 48 kHz inclusive - clamp sample rate into this range - convert_width=2 # audio samples must be 16-bit + # audio sample rate must be between 8 kHz and 48 kHz inclusive - clamp sample rate into this range + convert_rate=( + None + if 8000 <= audio_data.sample_rate <= 48000 + else max(8000, min(audio_data.sample_rate, 48000)) + ), + convert_width=2, # audio samples must be 16-bit ) audio = speech.RecognitionAudio(content=flac_data) config = { - 'encoding': speech.RecognitionConfig.AudioEncoding.FLAC, - 'sample_rate_hertz': audio_data.sample_rate, - 'language_code': language, + "encoding": speech.RecognitionConfig.AudioEncoding.FLAC, + "sample_rate_hertz": audio_data.sample_rate, + "language_code": language, **api_params, } if preferred_phrases is not None: - config['speechContexts'] = [speech.SpeechContext( - phrases=preferred_phrases - )] + config["speechContexts"] = [ + speech.SpeechContext(phrases=preferred_phrases) + ] if show_all: - config['enableWordTimeOffsets'] = True # some useful extra options for when we want all the output + # some useful extra options for when we want all the output + config["enableWordTimeOffsets"] = True config = speech.RecognitionConfig(**config) @@ -72,12 +95,16 @@ def recognize(recognizer, audio_data, credentials_json=None, language="en-US", p except GoogleAPICallError as e: raise RequestError(e) except URLError as e: - raise RequestError("recognition connection failed: {0}".format(e.reason)) + raise RequestError( + "recognition connection failed: {0}".format(e.reason) + ) - if show_all: return response - if 
len(response.results) == 0: raise UnknownValueError() + if show_all: + return response + if len(response.results) == 0: + raise UnknownValueError() - transcript = '' + transcript = "" for result in response.results: - transcript += result.alternatives[0].transcript.strip() + ' ' + transcript += result.alternatives[0].transcript.strip() + " " return transcript From 8d33596b89f44e08aa6e44637a9a9d986c1a9df5 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 21 Dec 2024 16:13:18 +0900 Subject: [PATCH 10/14] [docs] Type hints --- speech_recognition/recognizers/google_cloud.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/speech_recognition/recognizers/google_cloud.py b/speech_recognition/recognizers/google_cloud.py index 8470902c..3c0cfbd0 100644 --- a/speech_recognition/recognizers/google_cloud.py +++ b/speech_recognition/recognizers/google_cloud.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import os from urllib.error import URLError @@ -7,12 +9,12 @@ def recognize( recognizer, - audio_data, - credentials_json=None, - language="en-US", + audio_data: AudioData, + credentials_json: str | None = None, + language: str = "en-US", preferred_phrases=None, - show_all=False, - **api_params + show_all: bool = False, + **api_params, ): """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech-to-Text V1 API. 
From 3e893d56d80e89401e73fdef5d9ebb98847748d4 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 21 Dec 2024 16:25:03 +0900 Subject: [PATCH 11/14] [test] Google Cloud credentials file case --- reference/library-reference.rst | 4 +-- .../recognizers/google_cloud.py | 15 +++++----- tests/recognizers/test_google_cloud.py | 30 +++++++++++++++++++ 3 files changed, 39 insertions(+), 10 deletions(-) diff --git a/reference/library-reference.rst b/reference/library-reference.rst index 3f410aba..82239fd2 100644 --- a/reference/library-reference.rst +++ b/reference/library-reference.rst @@ -227,8 +227,8 @@ Returns the most likely transcription if ``show_all`` is false (the default). Ot Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. -``recognizer_instance.recognize_google_cloud(audio_data: AudioData, credentials_json: Union[str, None] = None, language: str = "en-US", preferred_phrases: Union[Iterable[str], None] = None, show_all: bool = False, **api_params) -> Union[str, Dict[str, Any]]`` -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +``recognizer_instance.recognize_google_cloud(audio_data: AudioData, credentials_json_path: Union[str, None] = None, language: str = "en-US", preferred_phrases: Union[Iterable[str], None] = None, show_all: bool = False, **api_params) -> Union[str, Dict[str, Any]]`` +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ .. 
autofunction:: speech_recognition.recognizers.google_cloud.recognize diff --git a/speech_recognition/recognizers/google_cloud.py b/speech_recognition/recognizers/google_cloud.py index 3c0cfbd0..eacd1594 100644 --- a/speech_recognition/recognizers/google_cloud.py +++ b/speech_recognition/recognizers/google_cloud.py @@ -10,7 +10,7 @@ def recognize( recognizer, audio_data: AudioData, - credentials_json: str | None = None, + credentials_json_path: str | None = None, language: str = "en-US", preferred_phrases=None, show_all: bool = False, @@ -42,7 +42,7 @@ def recognize( assert isinstance( audio_data, AudioData ), "``audio_data`` must be audio data" - if credentials_json is None: + if credentials_json_path is None: assert os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") is not None assert isinstance(language, str), "``language`` must be a string" assert preferred_phrases is None or all( @@ -58,12 +58,11 @@ def recognize( "missing google-cloud-speech module: ensure that google-cloud-speech is set up correctly." 
) - if credentials_json is not None: - client = speech.SpeechClient.from_service_account_json( - credentials_json - ) - else: - client = speech.SpeechClient() + client = ( + speech.SpeechClient.from_service_account_json(credentials_json_path) + if credentials_json_path + else speech.SpeechClient() + ) flac_data = audio_data.get_flac_data( # audio sample rate must be between 8 kHz and 48 kHz inclusive - clamp sample rate into this range diff --git a/tests/recognizers/test_google_cloud.py b/tests/recognizers/test_google_cloud.py index 3a3c4154..fbbdd775 100644 --- a/tests/recognizers/test_google_cloud.py +++ b/tests/recognizers/test_google_cloud.py @@ -50,3 +50,33 @@ def test_transcribe_with_google_cloud_speech(SpeechClient, monkeypatch): ), audio=RecognitionAudio(content=b"flac_data"), ) + + +@patch("google.cloud.speech.SpeechClient") +def test_transcribe_with_specified_credentials(SpeechClient): + client = SpeechClient.from_service_account_json.return_value + client.recognize.return_value = RecognizeResponse( + results=[ + SpeechRecognitionResult( + alternatives=[ + SpeechRecognitionAlternative( + transcript="transcript", confidence=0.9 + ) + ] + ) + ] + ) + + audio_data = MagicMock(spec=AudioData) + audio_data.sample_rate = 16_000 + audio_data.get_flac_data.return_value = b"flac_data" + + _ = recognize( + MagicMock(spec=Recognizer), + audio_data, + credentials_json_path="path/to/credentials.json", + ) + + SpeechClient.from_service_account_json.assert_called_once_with( + "path/to/credentials.json" + ) From 9843278dcde13c6a8f86b699c37f2d6e4ef5099b Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 21 Dec 2024 16:47:06 +0900 Subject: [PATCH 12/14] [test] show_all case --- tests/recognizers/test_google_cloud.py | 68 ++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/tests/recognizers/test_google_cloud.py b/tests/recognizers/test_google_cloud.py index fbbdd775..015fac0b 100644 --- a/tests/recognizers/test_google_cloud.py +++ 
b/tests/recognizers/test_google_cloud.py @@ -6,6 +6,7 @@ RecognizeResponse, SpeechRecognitionAlternative, SpeechRecognitionResult, + WordInfo, ) from speech_recognition import Recognizer @@ -80,3 +81,70 @@ def test_transcribe_with_specified_credentials(SpeechClient): SpeechClient.from_service_account_json.assert_called_once_with( "path/to/credentials.json" ) + + +@patch("google.cloud.speech.SpeechClient") +def test_transcribe_show_all(SpeechClient, monkeypatch): + monkeypatch.setenv( + "GOOGLE_APPLICATION_CREDENTIALS", "path/to/credentials.json" + ) + + client = SpeechClient.return_value + client.recognize.return_value = RecognizeResponse( + results=[ + SpeechRecognitionResult( + alternatives=[ + SpeechRecognitionAlternative( + transcript="transcript", + confidence=0.9, + words=[ + WordInfo( + word="transcript", + start_time="0s", + end_time="0.400s", + ) + ], + ) + ], + language_code="en-US", + result_end_time="0.400s", + ) + ] + ) + + audio_data = MagicMock(spec=AudioData) + audio_data.sample_rate = 16_000 + audio_data.get_flac_data.return_value = b"flac_data" + + actual = recognize(MagicMock(spec=Recognizer), audio_data, show_all=True) + + assert actual == RecognizeResponse( + results=[ + SpeechRecognitionResult( + alternatives=[ + SpeechRecognitionAlternative( + transcript="transcript", + confidence=0.9, + words=[ + WordInfo( + word="transcript", + start_time="0s", + end_time="0.400s", + ) + ], + ) + ], + language_code="en-US", + result_end_time="0.400s", + ) + ] + ) + client.recognize.assert_called_once_with( + config=RecognitionConfig( + encoding=RecognitionConfig.AudioEncoding.FLAC, + sample_rate_hertz=16_000, + language_code="en-US", + enable_word_time_offsets=True, + ), + audio=RecognitionAudio(content=b"flac_data"), + ) From 6477efafd069ac4c49eda086631472f4eebb0cf8 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 21 Dec 2024 16:47:53 +0900 Subject: [PATCH 13/14] [bugfix] enableWordTimeOffsets is not supported; snake_case! 
--- speech_recognition/recognizers/google_cloud.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/speech_recognition/recognizers/google_cloud.py b/speech_recognition/recognizers/google_cloud.py index eacd1594..122104fa 100644 --- a/speech_recognition/recognizers/google_cloud.py +++ b/speech_recognition/recognizers/google_cloud.py @@ -86,8 +86,8 @@ def recognize( speech.SpeechContext(phrases=preferred_phrases) ] if show_all: - # some useful extra options for when we want all the output - config["enableWordTimeOffsets"] = True + # ref: https://cloud.google.com/speech-to-text/docs/async-time-offsets + config["enable_word_time_offsets"] = True config = speech.RecognitionConfig(**config) From 9aa550406c118c3f68ca9e2cc2d463f131e530b3 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 21 Dec 2024 16:54:33 +0900 Subject: [PATCH 14/14] [test] Cloud Speech-to-Text V1 parameters --- tests/recognizers/test_google_cloud.py | 41 ++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/recognizers/test_google_cloud.py b/tests/recognizers/test_google_cloud.py index 015fac0b..340bda94 100644 --- a/tests/recognizers/test_google_cloud.py +++ b/tests/recognizers/test_google_cloud.py @@ -148,3 +148,44 @@ def test_transcribe_show_all(SpeechClient, monkeypatch): ), audio=RecognitionAudio(content=b"flac_data"), ) + + +@patch("google.cloud.speech.SpeechClient") +def test_transcribe_with_specified_api_parameters(SpeechClient, monkeypatch): + monkeypatch.setenv( + "GOOGLE_APPLICATION_CREDENTIALS", "path/to/credentials.json" + ) + + client = SpeechClient.return_value + client.recognize.return_value = RecognizeResponse( + results=[ + SpeechRecognitionResult( + alternatives=[ + SpeechRecognitionAlternative( + transcript="こんにちは", confidence=0.99 + ) + ] + ) + ] + ) + + audio_data = MagicMock(spec=AudioData) + audio_data.sample_rate = 16_000 + audio_data.get_flac_data.return_value = b"flac_data" + + _ = recognize( + MagicMock(spec=Recognizer), + 
audio_data, + language="ja-JP", + use_enhanced=True, + ) + + client.recognize.assert_called_once_with( + config=RecognitionConfig( + encoding=RecognitionConfig.AudioEncoding.FLAC, + sample_rate_hertz=16_000, + language_code="ja-JP", + use_enhanced=True, + ), + audio=RecognitionAudio(content=b"flac_data"), + )