From c84590450d307fe80467e571cc1568c5d8ed078c Mon Sep 17 00:00:00 2001 From: Vitor Hideyoshi Date: Thu, 15 Feb 2024 16:48:48 -0300 Subject: [PATCH 1/7] Adds Parameter use_enhanced and model to GoogleCloudSpeech Adds the parameters use_enhanced and model to the recognize_google_cloud method for more customizable options for the user and better results in specific cases --- speech_recognition/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 852eaeef..b2b365be 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -670,7 +670,7 @@ def recognize_sphinx(self, audio_data, language="en-US", keyword_entries=None, g if hypothesis is not None: return hypothesis.hypstr raise UnknownValueError() # no transcriptions available - def recognize_google_cloud(self, audio_data, credentials_json=None, language="en-US", preferred_phrases=None, show_all=False): + def recognize_google_cloud(self, audio_data, credentials_json=None, language="en-US", preferred_phrases=None, use_enhanced=False, model=None, show_all=False): """ Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech API. 
@@ -689,6 +689,8 @@ def recognize_google_cloud(self, audio_data, credentials_json=None, language="en assert os.environ.get('GOOGLE_APPLICATION_CREDENTIALS') is not None assert isinstance(language, str), "``language`` must be a string" assert preferred_phrases is None or all(isinstance(preferred_phrases, (type(""), type(u""))) for preferred_phrases in preferred_phrases), "``preferred_phrases`` must be a list of strings" + assert isinstance(use_enhanced, bool), "``use_enhanced`` must be a boolean" + assert model is None or model in (None, "latest_long", "latest_short", "command_and_search", "phone_call", "video", "default", "medical_conversation", "medical_dictation"), "``model`` must be None or 'command_and_search', 'phone_call', 'video', or 'default'" try: import socket @@ -712,7 +714,9 @@ def recognize_google_cloud(self, audio_data, credentials_json=None, language="en config = { 'encoding': speech.RecognitionConfig.AudioEncoding.FLAC, 'sample_rate_hertz': audio_data.sample_rate, - 'language_code': language + 'language_code': language, + 'use_enhanced': use_enhanced, + 'model': model, } if preferred_phrases is not None: config['speechContexts'] = [speech.SpeechContext( From 8e0fa407d99b92330d837d45774d01b50af5083b Mon Sep 17 00:00:00 2001 From: Vitor Hideyoshi Date: Fri, 26 Apr 2024 19:12:36 +0000 Subject: [PATCH 2/7] Adds Parameters use_enhanced and model to GoogleSpeechAPI docstring --- speech_recognition/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index b2b365be..2d60f7af 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -680,6 +680,10 @@ def recognize_google_cloud(self, audio_data, credentials_json=None, language="en If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. 
This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. Note that the API imposes certain `restrictions on the list of phrase strings `__. + The ``use_enhanced`` is a boolean option that sets a flag with the same name on the Google Cloud Speech API, it will make the API uses the enhanced version of the model. More information can be found in the `Google Cloud Speech API documentation ` __. + + Furthermore, you can use the option ``model`` to set your desired model, the Python Google Speech API makes available the following options: 'command_and_search', 'phone_call', 'video', 'default', 'medical_conversation', 'medical_dictation'. More information can be found in the `Google Cloud Speech API documentation ` __. + Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary. Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection. 
From daca0004ee7fe87da9881d3772b66d5d689090b9 Mon Sep 17 00:00:00 2001 From: Vitor Hideyoshi Date: Fri, 26 Apr 2024 19:27:41 +0000 Subject: [PATCH 3/7] Adds Missing Models to Docstring and Adds Missing Parameters to Library Reference File --- reference/library-reference.rst | 4 ++++ speech_recognition/__init__.py | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/reference/library-reference.rst b/reference/library-reference.rst index 0aa7a8ce..5268e722 100644 --- a/reference/library-reference.rst +++ b/reference/library-reference.rst @@ -238,6 +238,10 @@ The recognition language is determined by ``language``, which is a BCP-47 langua If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. Note that the API imposes certain `restrictions on the list of phrase strings `__. +The ``use_enhanced`` is a boolean option that sets a flag with the same name on the Google Cloud Speech API, it will make the API uses the enhanced version of the model. More information can be found in the `Google Cloud Speech API documentation ` __. + +Furthermore, you can use the option ``model`` to set your desired model, the Python Google Speech API makes available the following options: 'latest_long', 'latest_short', 'command_and_search', 'phone_call', 'video', 'default', 'medical_conversation', 'medical_dictation'. More information can be found in the `Google Cloud Speech API documentation ` __. + Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary. Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. 
Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection. diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 2d60f7af..6d82babb 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -682,7 +682,7 @@ def recognize_google_cloud(self, audio_data, credentials_json=None, language="en The ``use_enhanced`` is a boolean option that sets a flag with the same name on the Google Cloud Speech API, it will make the API uses the enhanced version of the model. More information can be found in the `Google Cloud Speech API documentation ` __. - Furthermore, you can use the option ``model`` to set your desired model, the Python Google Speech API makes available the following options: 'command_and_search', 'phone_call', 'video', 'default', 'medical_conversation', 'medical_dictation'. More information can be found in the `Google Cloud Speech API documentation ` __. + Furthermore, you can use the option ``model`` to set your desired model, the Python Google Speech API makes available the following options: 'latest_long', 'latest_short', 'command_and_search', 'phone_call', 'video', 'default', 'medical_conversation', 'medical_dictation'. More information can be found in the `Google Cloud Speech API documentation ` __. Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary. 
@@ -694,7 +694,7 @@ def recognize_google_cloud(self, audio_data, credentials_json=None, language="en assert isinstance(language, str), "``language`` must be a string" assert preferred_phrases is None or all(isinstance(preferred_phrases, (type(""), type(u""))) for preferred_phrases in preferred_phrases), "``preferred_phrases`` must be a list of strings" assert isinstance(use_enhanced, bool), "``use_enhanced`` must be a boolean" - assert model is None or model in (None, "latest_long", "latest_short", "command_and_search", "phone_call", "video", "default", "medical_conversation", "medical_dictation"), "``model`` must be None or 'command_and_search', 'phone_call', 'video', or 'default'" + assert model is None or model in (None, "latest_long", "latest_short", "command_and_search", "phone_call", "video", "default", "medical_conversation", "medical_dictation"), "``model`` must be None or 'latest_long', 'latest_short', 'command_and_search', 'phone_call', 'video', or 'default'" try: import socket From abb35fe380c522d135ca7fb345f73ec5a9064490 Mon Sep 17 00:00:00 2001 From: Vitor Hideyoshi Date: Fri, 26 Apr 2024 19:37:09 +0000 Subject: [PATCH 4/7] Fixes Broken Formatting --- speech_recognition/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 6d82babb..02fa14dc 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -681,7 +681,7 @@ def recognize_google_cloud(self, audio_data, credentials_json=None, language="en If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. Note that the API imposes certain `restrictions on the list of phrase strings `__. 
The ``use_enhanced`` is a boolean option that sets a flag with the same name on the Google Cloud Speech API, it will make the API uses the enhanced version of the model. More information can be found in the `Google Cloud Speech API documentation ` __. - + Furthermore, you can use the option ``model`` to set your desired model, the Python Google Speech API makes available the following options: 'latest_long', 'latest_short', 'command_and_search', 'phone_call', 'video', 'default', 'medical_conversation', 'medical_dictation'. More information can be found in the `Google Cloud Speech API documentation ` __. Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary. From 4be80262468d33d78dc79a9d8f8e89b44a817be9 Mon Sep 17 00:00:00 2001 From: Vitor Hideyoshi Date: Wed, 27 Nov 2024 21:27:30 -0300 Subject: [PATCH 5/7] Better Implementation of API Params Configuration This implementation is needed for the configuration of Cloud Speech API-specific parameters. This implementation only validates and creates assertions for the two most used params: use_enhanced and model. --- reference/library-reference.rst | 8 +++++--- speech_recognition/__init__.py | 25 ++++++++++++++++++------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/reference/library-reference.rst b/reference/library-reference.rst index 5268e722..87858464 100644 --- a/reference/library-reference.rst +++ b/reference/library-reference.rst @@ -227,7 +227,7 @@ Returns the most likely transcription if ``show_all`` is false (the default). Ot Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. 
-``recognizer_instance.recognize_google_cloud(audio_data: AudioData, credentials_json: Union[str, None] = None, language: str = "en-US", preferred_phrases: Union[Iterable[str], None] = None, show_all: bool = False) -> Union[str, Dict[str, Any]]`` +``recognizer_instance.recognize_google_cloud(audio_data: AudioData, credentials_json: Union[str, None] = None, language: str = "en-US", preferred_phrases: Union[Iterable[str], None] = None, show_all: bool = False, **api_params) -> Union[str, Dict[str, Any]]`` ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech API. @@ -238,9 +238,11 @@ The recognition language is determined by ``language``, which is a BCP-47 langua If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. Note that the API imposes certain `restrictions on the list of phrase strings `__. -The ``use_enhanced`` is a boolean option that sets a flag with the same name on the Google Cloud Speech API, it will make the API uses the enhanced version of the model. More information can be found in the `Google Cloud Speech API documentation ` __. +``api_params`` are Cloud Speech API-specific parameters as dict (optional). For more information see -Furthermore, you can use the option ``model`` to set your desired model, the Python Google Speech API makes available the following options: 'latest_long', 'latest_short', 'command_and_search', 'phone_call', 'video', 'default', 'medical_conversation', 'medical_dictation'. 
More information can be found in the `Google Cloud Speech API documentation ` __. +The ``use_enhanced`` is a boolean option. If use_enhanced is set to true and the model field is not set, then an appropriate enhanced model is chosen if an enhanced model exists for the audio. If use_enhanced is true and an enhanced version of the specified model does not exist, then the speech is recognized using the standard version of the specified model. + +Furthermore, if the option ``use_enhanced`` has not been set, the option ``model`` can be used to select the model best suited to your domain to get the best results. If a model is not explicitly specified, then we auto-select a model based on the other parameters of this method. Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary. diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 02fa14dc..e0002c3c 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -670,7 +670,7 @@ def recognize_sphinx(self, audio_data, language="en-US", keyword_entries=None, g if hypothesis is not None: return hypothesis.hypstr raise UnknownValueError() # no transcriptions available - def recognize_google_cloud(self, audio_data, credentials_json=None, language="en-US", preferred_phrases=None, use_enhanced=False, model=None, show_all=False): + def recognize_google_cloud(self, audio_data, credentials_json=None, language="en-US", preferred_phrases=None, show_all=False, **api_params): """ Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech API. @@ -680,9 +680,16 @@ def recognize_google_cloud(self, audio_data, credentials_json=None, language="en If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. 
This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. Note that the API imposes certain `restrictions on the list of phrase strings `__. - The ``use_enhanced`` is a boolean option that sets a flag with the same name on the Google Cloud Speech API, it will make the API uses the enhanced version of the model. More information can be found in the `Google Cloud Speech API documentation ` __. + ``api_params`` are Cloud Speech API-specific parameters as dict (optional). For more information see - Furthermore, you can use the option ``model`` to set your desired model, the Python Google Speech API makes available the following options: 'latest_long', 'latest_short', 'command_and_search', 'phone_call', 'video', 'default', 'medical_conversation', 'medical_dictation'. More information can be found in the `Google Cloud Speech API documentation ` __. + The ``use_enhanced`` is a boolean option. If use_enhanced is set to true and the model field is not set, + then an appropriate enhanced model is chosen if an enhanced model exists for the audio. + If use_enhanced is true and an enhanced version of the specified model does not exist, + then the speech is recognized using the standard version of the specified model. + + Furthermore, if the option ``use_enhanced`` has not been set, the option ``model`` can be used to select the model best + suited to your domain to get the best results. If a model is not explicitly specified, + then we auto-select a model based on the other parameters of this method. Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary. 
@@ -693,8 +700,13 @@ def recognize_google_cloud(self, audio_data, credentials_json=None, language="en assert os.environ.get('GOOGLE_APPLICATION_CREDENTIALS') is not None assert isinstance(language, str), "``language`` must be a string" assert preferred_phrases is None or all(isinstance(preferred_phrases, (type(""), type(u""))) for preferred_phrases in preferred_phrases), "``preferred_phrases`` must be a list of strings" - assert isinstance(use_enhanced, bool), "``use_enhanced`` must be a boolean" - assert model is None or model in (None, "latest_long", "latest_short", "command_and_search", "phone_call", "video", "default", "medical_conversation", "medical_dictation"), "``model`` must be None or 'latest_long', 'latest_short', 'command_and_search', 'phone_call', 'video', or 'default'" + + # Implementation of assertions of common api_params + if "use_enhanced" in api_params: + assert isinstance(api_params["use_enhanced"], bool), "``use_enhanced`` must be a boolean when used" + + if "model" in api_params: + assert api_params["model"] in (None, "latest_long", "latest_short", "command_and_search", "phone_call", "video", "default", "medical_conversation", "medical_dictation"), "``model`` must be None or 'latest_long', 'latest_short', 'command_and_search', 'phone_call', 'video', or 'default'" try: import socket @@ -719,8 +731,7 @@ def recognize_google_cloud(self, audio_data, credentials_json=None, language="en 'encoding': speech.RecognitionConfig.AudioEncoding.FLAC, 'sample_rate_hertz': audio_data.sample_rate, 'language_code': language, - 'use_enhanced': use_enhanced, - 'model': model, + **api_params, } if preferred_phrases is not None: config['speechContexts'] = [speech.SpeechContext( From db0da15888c75c28707ce76840b42426922bb0d2 Mon Sep 17 00:00:00 2001 From: Vitor Hideyoshi Date: Wed, 18 Dec 2024 02:34:45 +0000 Subject: [PATCH 6/7] Removes Anti Pattern of Assertions for Data Validation --- speech_recognition/__init__.py | 7 ------- 1 file changed, 7 deletions(-) diff 
--git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index dbfd2a7b..d05a6285 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -724,13 +724,6 @@ def recognize_google_cloud(self, audio_data, credentials_json=None, language="en assert isinstance(language, str), "``language`` must be a string" assert preferred_phrases is None or all(isinstance(preferred_phrases, (type(""), type(u""))) for preferred_phrases in preferred_phrases), "``preferred_phrases`` must be a list of strings" - # Implementation of assertions of common api_params - if "use_enhanced" in api_params: - assert isinstance(api_params["use_enhanced"], bool), "``use_enhanced`` must be a boolean when used" - - if "model" in api_params: - assert api_params["model"] in (None, "latest_long", "latest_short", "command_and_search", "phone_call", "video", "default", "medical_conversation", "medical_dictation"), "``model`` must be None or 'latest_long', 'latest_short', 'command_and_search', 'phone_call', 'video', or 'default'" - try: import socket From 8abafcb0ff0253e7d9973ccb544e22414f14f774 Mon Sep 17 00:00:00 2001 From: nikkie Date: Wed, 18 Dec 2024 23:41:43 +0900 Subject: [PATCH 7/7] style: Fix rstcheck's "Title underline too short." --- reference/library-reference.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reference/library-reference.rst b/reference/library-reference.rst index 7eef4108..296fc250 100644 --- a/reference/library-reference.rst +++ b/reference/library-reference.rst @@ -228,7 +228,7 @@ Returns the most likely transcription if ``show_all`` is false (the default). Ot Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. 
``recognizer_instance.recognize_google_cloud(audio_data: AudioData, credentials_json: Union[str, None] = None, language: str = "en-US", preferred_phrases: Union[Iterable[str], None] = None, show_all: bool = False, **api_params) -> Union[str, Dict[str, Any]]`` ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech API.