From ff10effbedd413dd2ccae38dc744be7465e42c62 Mon Sep 17 00:00:00 2001 From: ftnext Date: Fri, 20 Dec 2024 23:16:45 +0900 Subject: [PATCH 01/14] [test] Add one case of google-cloud-speech --- tests/recognizers/test_google_cloud.py | 51 ++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 tests/recognizers/test_google_cloud.py diff --git a/tests/recognizers/test_google_cloud.py b/tests/recognizers/test_google_cloud.py new file mode 100644 index 00000000..c55a81dc --- /dev/null +++ b/tests/recognizers/test_google_cloud.py @@ -0,0 +1,51 @@ +from unittest.mock import MagicMock, patch + +from google.cloud.speech import ( + RecognitionAudio, + RecognitionConfig, + RecognizeResponse, + SpeechRecognitionAlternative, + SpeechRecognitionResult, +) + +from speech_recognition import Recognizer +from speech_recognition.audio import AudioData + + +@patch("google.cloud.speech.SpeechClient") +def test_transcribe_with_google_cloud_speech(SpeechClient, monkeypatch): + monkeypatch.setenv( + "GOOGLE_APPLICATION_CREDENTIALS", "path/to/credentials.json" + ) + + client = SpeechClient.return_value + # ref: https://cloud.google.com/speech-to-text/docs/transcribe-gcloud?hl=ja#make_an_audio_transcription_request + client.recognize.return_value = RecognizeResponse( + results=[ + SpeechRecognitionResult( + alternatives=[ + SpeechRecognitionAlternative( + transcript="how old is the Brooklyn Bridge", + confidence=0.9840146, + ) + ] + ) + ] + ) + + audio_data = MagicMock(spec=AudioData) + audio_data.sample_rate = 16_000 + audio_data.get_flac_data.return_value = b"flac_data" + + actual = Recognizer().recognize_google_cloud(audio_data) + + assert actual == "how old is the Brooklyn Bridge " + SpeechClient.assert_called_once_with() + client.recognize.assert_called_once_with( + config=RecognitionConfig( + encoding=RecognitionConfig.AudioEncoding.FLAC, + sample_rate_hertz=16_000, + language_code="en-US", + ), + audio=RecognitionAudio(content=b"flac_data"), + ) From 
6b46a9b886f90f1b115ef831adc0278f208130f8 Mon Sep 17 00:00:00 2001 From: ftnext Date: Fri, 20 Dec 2024 23:20:23 +0900 Subject: [PATCH 02/14] [chore] Add google-cloud extra --- .github/workflows/unittests.yml | 6 +++--- setup.cfg | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 8b2a6daa..170b78f8 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -44,16 +44,16 @@ jobs: - name: Install Python dependencies (Ubuntu, <=3.12) if: matrix.os == 'ubuntu-latest' && matrix.python-version != '3.13' run: | - python -m pip install .[dev,audio,pocketsphinx,whisper-local,openai,groq] + python -m pip install .[dev,audio,pocketsphinx,google-cloud,whisper-local,openai,groq] - name: Install Python dependencies (Ubuntu, 3.13) if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.13' run: | python -m pip install standard-aifc setuptools - python -m pip install --no-build-isolation .[dev,audio,pocketsphinx,openai,groq] + python -m pip install --no-build-isolation .[dev,audio,pocketsphinx,google-cloud,openai,groq] - name: Install Python dependencies (Windows) if: matrix.os == 'windows-latest' run: | - python -m pip install .[dev,whisper-local,openai,groq] + python -m pip install .[dev,whisper-local,google-cloud,openai,groq] - name: Test with unittest run: | pytest --doctest-modules -v speech_recognition/recognizers/ tests/ diff --git a/setup.cfg b/setup.cfg index 33ac0595..9d1def40 100644 --- a/setup.cfg +++ b/setup.cfg @@ -7,6 +7,8 @@ audio = PyAudio >= 0.2.11 pocketsphinx = pocketsphinx < 5 +google-cloud = + google-cloud-speech whisper-local = openai-whisper soundfile From 27f6d8588ea405645850768d567a38dd0aead82f Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 21 Dec 2024 15:24:32 +0900 Subject: [PATCH 03/14] [docs] Install with google-cloud extra --- README.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.rst 
b/README.rst index afb69b66..522777b7 100644 --- a/README.rst +++ b/README.rst @@ -151,14 +151,15 @@ You also have to install Vosk Models: `Here `__ are models avaiable for download. You have to place them in models folder of your project, like "your-project-folder/models/your-vosk-model" -Google Cloud Speech Library for Python (for Google Cloud Speech API users) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Google Cloud Speech Library for Python (for Google Cloud Speech-to-Text API users) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -`Google Cloud Speech library for Python `__ is required if and only if you want to use the Google Cloud Speech API (``recognizer_instance.recognize_google_cloud``). +The library `google-cloud-speech `__ is **required if and only if you want to use Google Cloud Speech-to-Text API** (``recognizer_instance.recognize_google_cloud``). -If not installed, everything in the library will still work, except calling ``recognizer_instance.recognize_google_cloud`` will raise an ``RequestError``. +You can install it with :command:`python3 -m pip install SpeechRecognition[google-cloud]`. +(ref: `official installation instructions `__) -According to the `official installation instructions `__, the recommended way to install this is using `Pip `__: execute ``pip install google-cloud-speech`` (replace ``pip`` with ``pip3`` if using Python 3). +Currently only `V1 `__ is supported. 
(`V2 `__ is not supported) FLAC (for some systems) ~~~~~~~~~~~~~~~~~~~~~~~ From 1df198bb81609a317ca4fc1dc0e9bb767552a8ec Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 21 Dec 2024 15:41:02 +0900 Subject: [PATCH 04/14] [refactor] Extract google-cloud-speech recognition --- speech_recognition/__init__.py | 87 +----------------- .../recognizers/google_cloud.py | 90 +++++++++++++++++++ 2 files changed, 92 insertions(+), 85 deletions(-) create mode 100644 speech_recognition/recognizers/google_cloud.py diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index d183c58b..f7c19f11 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -693,90 +693,6 @@ def recognize_sphinx(self, audio_data, language="en-US", keyword_entries=None, g if hypothesis is not None: return hypothesis.hypstr raise UnknownValueError() # no transcriptions available - def recognize_google_cloud(self, audio_data, credentials_json=None, language="en-US", preferred_phrases=None, show_all=False, **api_params): - """ - Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech API. - - This function requires a Google Cloud Platform account; see the `Google Cloud Speech API Quickstart `__ for details and instructions. Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project, and set up Service Account Key credentials for the project. The result is a JSON file containing the API credentials. The text content of this JSON file is specified by ``credentials_json``. If not specified, the library will try to automatically `find the default API credentials JSON file `__. - - The recognition language is determined by ``language``, which is a BCP-47 language tag like ``"en-US"`` (US English). A list of supported language tags can be found in the `Google Cloud Speech API documentation `__. 
- - If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. Note that the API imposes certain `restrictions on the list of phrase strings `__. - - ``api_params`` are Cloud Speech API-specific parameters as dict (optional). For more information see - - The ``use_enhanced`` is a boolean option. If use_enhanced is set to true and the model field is not set, - then an appropriate enhanced model is chosen if an enhanced model exists for the audio. - If use_enhanced is true and an enhanced version of the specified model does not exist, - then the speech is recognized using the standard version of the specified model. - - Furthermore, if the option ``use_enhanced`` has not been set the option ``model`` can be used, which can be used to select the model best - suited to your domain to get best results. If a model is not explicitly specified, - then we auto-select a model based on the other parameters of this method. - - Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary. - - Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection. 
- """ - assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data" - if credentials_json is None: - assert os.environ.get('GOOGLE_APPLICATION_CREDENTIALS') is not None - assert isinstance(language, str), "``language`` must be a string" - assert preferred_phrases is None or all(isinstance(preferred_phrases, (type(""), type(u""))) for preferred_phrases in preferred_phrases), "``preferred_phrases`` must be a list of strings" - - try: - import socket - - from google.api_core.exceptions import GoogleAPICallError - from google.cloud import speech - except ImportError: - raise RequestError('missing google-cloud-speech module: ensure that google-cloud-speech is set up correctly.') - - if credentials_json is not None: - client = speech.SpeechClient.from_service_account_json(credentials_json) - else: - client = speech.SpeechClient() - - flac_data = audio_data.get_flac_data( - convert_rate=None if 8000 <= audio_data.sample_rate <= 48000 else max(8000, min(audio_data.sample_rate, 48000)), # audio sample rate must be between 8 kHz and 48 kHz inclusive - clamp sample rate into this range - convert_width=2 # audio samples must be 16-bit - ) - audio = speech.RecognitionAudio(content=flac_data) - - config = { - 'encoding': speech.RecognitionConfig.AudioEncoding.FLAC, - 'sample_rate_hertz': audio_data.sample_rate, - 'language_code': language, - **api_params, - } - if preferred_phrases is not None: - config['speechContexts'] = [speech.SpeechContext( - phrases=preferred_phrases - )] - if show_all: - config['enableWordTimeOffsets'] = True # some useful extra options for when we want all the output - - opts = {} - if self.operation_timeout and socket.getdefaulttimeout() is None: - opts['timeout'] = self.operation_timeout - - config = speech.RecognitionConfig(**config) - - try: - response = client.recognize(config=config, audio=audio) - except GoogleAPICallError as e: - raise RequestError(e) - except URLError as e: - raise RequestError("recognition connection failed: 
{0}".format(e.reason)) - - if show_all: return response - if len(response.results) == 0: raise UnknownValueError() - - transcript = '' - for result in response.results: - transcript += result.alternatives[0].transcript.strip() + ' ' - return transcript - def recognize_wit(self, audio_data, key, show_all=False): """ Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Wit.ai API. @@ -1518,11 +1434,12 @@ def flush(self, *args, **kwargs): # At this time, the dependencies are not yet installed, resulting in a ModuleNotFoundError. # This is a workaround to resolve this issue try: - from .recognizers import google, openai, groq + from .recognizers import google, google_cloud, openai, groq except (ModuleNotFoundError, ImportError): pass else: Recognizer.recognize_google = google.recognize_legacy + Recognizer.recognize_google_cloud = google_cloud.recognize Recognizer.recognize_openai = openai.recognize Recognizer.recognize_whisper_api = openai.recognize # Deprecated Recognizer.recognize_groq = groq.recognize diff --git a/speech_recognition/recognizers/google_cloud.py b/speech_recognition/recognizers/google_cloud.py new file mode 100644 index 00000000..bf72e72f --- /dev/null +++ b/speech_recognition/recognizers/google_cloud.py @@ -0,0 +1,90 @@ +import os +from urllib.error import URLError + +from speech_recognition.audio import AudioData +from speech_recognition.exceptions import RequestError, UnknownValueError + + +def recognize(recognizer, audio_data, credentials_json=None, language="en-US", preferred_phrases=None, show_all=False, **api_params): + """ + Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech API. + + This function requires a Google Cloud Platform account; see the `Google Cloud Speech API Quickstart `__ for details and instructions. 
Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project, and set up Service Account Key credentials for the project. The result is a JSON file containing the API credentials. The text content of this JSON file is specified by ``credentials_json``. If not specified, the library will try to automatically `find the default API credentials JSON file `__. + + The recognition language is determined by ``language``, which is a BCP-47 language tag like ``"en-US"`` (US English). A list of supported language tags can be found in the `Google Cloud Speech API documentation `__. + + If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. Note that the API imposes certain `restrictions on the list of phrase strings `__. + + ``api_params`` are Cloud Speech API-specific parameters as dict (optional). For more information see + + The ``use_enhanced`` is a boolean option. If use_enhanced is set to true and the model field is not set, + then an appropriate enhanced model is chosen if an enhanced model exists for the audio. + If use_enhanced is true and an enhanced version of the specified model does not exist, + then the speech is recognized using the standard version of the specified model. + + Furthermore, if the option ``use_enhanced`` has not been set the option ``model`` can be used, which can be used to select the model best + suited to your domain to get best results. If a model is not explicitly specified, + then we auto-select a model based on the other parameters of this method. + + Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary. 
+ + Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection. + """ + assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data" + if credentials_json is None: + assert os.environ.get('GOOGLE_APPLICATION_CREDENTIALS') is not None + assert isinstance(language, str), "``language`` must be a string" + assert preferred_phrases is None or all(isinstance(preferred_phrases, (type(""), type(u""))) for preferred_phrases in preferred_phrases), "``preferred_phrases`` must be a list of strings" + + try: + import socket + + from google.api_core.exceptions import GoogleAPICallError + from google.cloud import speech + except ImportError: + raise RequestError('missing google-cloud-speech module: ensure that google-cloud-speech is set up correctly.') + + if credentials_json is not None: + client = speech.SpeechClient.from_service_account_json(credentials_json) + else: + client = speech.SpeechClient() + + flac_data = audio_data.get_flac_data( + convert_rate=None if 8000 <= audio_data.sample_rate <= 48000 else max(8000, min(audio_data.sample_rate, 48000)), # audio sample rate must be between 8 kHz and 48 kHz inclusive - clamp sample rate into this range + convert_width=2 # audio samples must be 16-bit + ) + audio = speech.RecognitionAudio(content=flac_data) + + config = { + 'encoding': speech.RecognitionConfig.AudioEncoding.FLAC, + 'sample_rate_hertz': audio_data.sample_rate, + 'language_code': language, + **api_params, + } + if preferred_phrases is not None: + config['speechContexts'] = [speech.SpeechContext( + phrases=preferred_phrases + )] + if show_all: + config['enableWordTimeOffsets'] = True # some useful extra options for when we want all the output + + opts = {} + if recognizer.operation_timeout and socket.getdefaulttimeout() is None: + opts['timeout'] = 
recognizer.operation_timeout + + config = speech.RecognitionConfig(**config) + + try: + response = client.recognize(config=config, audio=audio) + except GoogleAPICallError as e: + raise RequestError(e) + except URLError as e: + raise RequestError("recognition connection failed: {0}".format(e.reason)) + + if show_all: return response + if len(response.results) == 0: raise UnknownValueError() + + transcript = '' + for result in response.results: + transcript += result.alternatives[0].transcript.strip() + ' ' + return transcript From b8d9f6ee436784d350c34adb678d2be4eada839d Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 21 Dec 2024 15:45:02 +0900 Subject: [PATCH 05/14] [refactor] Remove dead code * opts not used * operation_timeout = None (Recognizer.__init__()) --- speech_recognition/recognizers/google_cloud.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/speech_recognition/recognizers/google_cloud.py b/speech_recognition/recognizers/google_cloud.py index bf72e72f..86af2aeb 100644 --- a/speech_recognition/recognizers/google_cloud.py +++ b/speech_recognition/recognizers/google_cloud.py @@ -37,8 +37,6 @@ def recognize(recognizer, audio_data, credentials_json=None, language="en-US", p assert preferred_phrases is None or all(isinstance(preferred_phrases, (type(""), type(u""))) for preferred_phrases in preferred_phrases), "``preferred_phrases`` must be a list of strings" try: - import socket - from google.api_core.exceptions import GoogleAPICallError from google.cloud import speech except ImportError: @@ -68,10 +66,6 @@ def recognize(recognizer, audio_data, credentials_json=None, language="en-US", p if show_all: config['enableWordTimeOffsets'] = True # some useful extra options for when we want all the output - opts = {} - if recognizer.operation_timeout and socket.getdefaulttimeout() is None: - opts['timeout'] = recognizer.operation_timeout - config = speech.RecognitionConfig(**config) try: response = client.recognize(config=config, audio=audio) except GoogleAPICallError as e: raise RequestError(e) except URLError as e: raise RequestError("recognition connection failed: {0}".format(e.reason)) if show_all: return response if len(response.results) == 0: raise UnknownValueError() transcript = '' for result in response.results: transcript += result.alternatives[0].transcript.strip() + ' ' return transcript From b792f18ac57d1e2dc76550e14c78310b1fccd910 Mon Sep 17 00:00:00 2001
From: ftnext Date: Sat, 21 Dec 2024 15:49:52 +0900 Subject: [PATCH 06/14] [refactor] Tweak sut --- tests/recognizers/test_google_cloud.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/recognizers/test_google_cloud.py b/tests/recognizers/test_google_cloud.py index c55a81dc..3a3c4154 100644 --- a/tests/recognizers/test_google_cloud.py +++ b/tests/recognizers/test_google_cloud.py @@ -10,6 +10,7 @@ from speech_recognition import Recognizer from speech_recognition.audio import AudioData +from speech_recognition.recognizers.google_cloud import recognize @patch("google.cloud.speech.SpeechClient") @@ -37,7 +38,7 @@ def test_transcribe_with_google_cloud_speech(SpeechClient, monkeypatch): audio_data.sample_rate = 16_000 audio_data.get_flac_data.return_value = b"flac_data" - actual = Recognizer().recognize_google_cloud(audio_data) + actual = recognize(MagicMock(spec=Recognizer), audio_data) assert actual == "how old is the Brooklyn Bridge " SpeechClient.assert_called_once_with() From 4c66a7b8f839223c6ab716fab5b31936971b5d8b Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 21 Dec 2024 16:03:18 +0900 Subject: [PATCH 07/14] [docs] Don't Repeat Yourself --- reference/library-reference.rst | 18 +----------------- speech_recognition/recognizers/google_cloud.py | 3 +-- 2 files changed, 2 insertions(+), 19 deletions(-) diff --git a/reference/library-reference.rst b/reference/library-reference.rst index 2c6a47d1..3f410aba 100644 --- a/reference/library-reference.rst +++ b/reference/library-reference.rst @@ -230,23 +230,7 @@ Raises a ``speech_recognition.UnknownValueError`` exception if the speech is uni ``recognizer_instance.recognize_google_cloud(audio_data: AudioData, credentials_json: Union[str, None] = None, language: str = "en-US", preferred_phrases: Union[Iterable[str], None] = None, show_all: bool = False, **api_params) -> Union[str, Dict[str, Any]]`` 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech API. - -This function requires a Google Cloud Platform account; see the `Google Cloud Speech API Quickstart `__ for details and instructions. Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project, and set up Service Account Key credentials for the project. The result is a JSON file containing the API credentials. The text content of this JSON file is specified by ``credentials_json``. If not specified, the library will try to automatically `find the default API credentials JSON file `__. - -The recognition language is determined by ``language``, which is a BCP-47 language tag like ``"en-US"`` (US English). A list of supported language tags can be found in the `Google Cloud Speech API documentation `__. - -If ``preferred_phrases`` is an iterable of phrase strings, those given phrases will be more likely to be recognized over similar-sounding alternatives. This is useful for things like keyword/command recognition or adding new phrases that aren't in Google's vocabulary. Note that the API imposes certain `restrictions on the list of phrase strings `__. - -``api_params`` are Cloud Speech API-specific parameters as dict (optional). For more information see - -The ``use_enhanced`` is a boolean option. If use_enhanced is set to true and the model field is not set, then an appropriate enhanced model is chosen if an enhanced model exists for the audio. If use_enhanced is true and an enhanced version of the specified model does not exist, then the speech is recognized using the standard version of the specified model. 
- -Furthermore, if the option ``use_enhanced`` has not been set the option ``model`` can be used, which can be used to select the model best suited to your domain to get best results. If a model is not explicitly specified, then we auto-select a model based on the other parameters of this method. - -Returns the most likely transcription if ``show_all`` is False (the default). Otherwise, returns the raw API response as a JSON dictionary. - -Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection. +.. autofunction:: speech_recognition.recognizers.google_cloud.recognize ``recognizer_instance.recognize_wit(audio_data: AudioData, key: str, show_all: bool = False) -> Union[str, Dict[str, Any]]`` ---------------------------------------------------------------------------------------------------------------------------- diff --git a/speech_recognition/recognizers/google_cloud.py b/speech_recognition/recognizers/google_cloud.py index 86af2aeb..d0988e69 100644 --- a/speech_recognition/recognizers/google_cloud.py +++ b/speech_recognition/recognizers/google_cloud.py @@ -6,8 +6,7 @@ def recognize(recognizer, audio_data, credentials_json=None, language="en-US", preferred_phrases=None, show_all=False, **api_params): - """ - Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech API. + """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech-to-Text V1 API. This function requires a Google Cloud Platform account; see the `Google Cloud Speech API Quickstart `__ for details and instructions. Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project, and set up Service Account Key credentials for the project. 
The result is a JSON file containing the API credentials. The text content of this JSON file is specified by ``credentials_json``. If not specified, the library will try to automatically `find the default API credentials JSON file `__. From 35447a4b1ac8e017c1d9e0c3c9dc2cd6e93c42be Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 21 Dec 2024 16:03:45 +0900 Subject: [PATCH 08/14] [chore] Add Sphinx support to rstcheck --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index cbf1e79c..bd18a867 100644 --- a/Makefile +++ b/Makefile @@ -3,4 +3,4 @@ lint: @pipx run flake8 --ignore=E501,E701,W503 . rstcheck: - @pipx run rstcheck --ignore-directives autofunction README.rst reference/*.rst + @pipx run rstcheck[sphinx] --ignore-directives autofunction README.rst reference/*.rst From 8a2afcf280ddc20ffce126b408c9f1550790282e Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 21 Dec 2024 16:06:50 +0900 Subject: [PATCH 09/14] [style] Format --- .../recognizers/google_cloud.py | 67 +++++++++++++------ 1 file changed, 47 insertions(+), 20 deletions(-) diff --git a/speech_recognition/recognizers/google_cloud.py b/speech_recognition/recognizers/google_cloud.py index d0988e69..8470902c 100644 --- a/speech_recognition/recognizers/google_cloud.py +++ b/speech_recognition/recognizers/google_cloud.py @@ -5,7 +5,15 @@ from speech_recognition.exceptions import RequestError, UnknownValueError -def recognize(recognizer, audio_data, credentials_json=None, language="en-US", preferred_phrases=None, show_all=False, **api_params): +def recognize( + recognizer, + audio_data, + credentials_json=None, + language="en-US", + preferred_phrases=None, + show_all=False, + **api_params +): """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech-to-Text V1 API. This function requires a Google Cloud Platform account; see the `Google Cloud Speech API Quickstart `__ for details and instructions. 
Basically, create a project, enable billing for the project, enable the Google Cloud Speech API for the project, and set up Service Account Key credentials for the project. The result is a JSON file containing the API credentials. The text content of this JSON file is specified by ``credentials_json``. If not specified, the library will try to automatically `find the default API credentials JSON file `__. @@ -29,41 +37,56 @@ def recognize(recognizer, audio_data, credentials_json=None, language="en-US", p Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the credentials aren't valid, or if there is no Internet connection. """ - assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data" + assert isinstance( + audio_data, AudioData + ), "``audio_data`` must be audio data" if credentials_json is None: - assert os.environ.get('GOOGLE_APPLICATION_CREDENTIALS') is not None + assert os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") is not None assert isinstance(language, str), "``language`` must be a string" - assert preferred_phrases is None or all(isinstance(preferred_phrases, (type(""), type(u""))) for preferred_phrases in preferred_phrases), "``preferred_phrases`` must be a list of strings" + assert preferred_phrases is None or all( + isinstance(preferred_phrases, (type(""), type(""))) + for preferred_phrases in preferred_phrases + ), "``preferred_phrases`` must be a list of strings" try: from google.api_core.exceptions import GoogleAPICallError from google.cloud import speech except ImportError: - raise RequestError('missing google-cloud-speech module: ensure that google-cloud-speech is set up correctly.') + raise RequestError( + "missing google-cloud-speech module: ensure that google-cloud-speech is set up correctly." 
+ ) if credentials_json is not None: - client = speech.SpeechClient.from_service_account_json(credentials_json) + client = speech.SpeechClient.from_service_account_json( + credentials_json + ) else: client = speech.SpeechClient() flac_data = audio_data.get_flac_data( - convert_rate=None if 8000 <= audio_data.sample_rate <= 48000 else max(8000, min(audio_data.sample_rate, 48000)), # audio sample rate must be between 8 kHz and 48 kHz inclusive - clamp sample rate into this range - convert_width=2 # audio samples must be 16-bit + # audio sample rate must be between 8 kHz and 48 kHz inclusive - clamp sample rate into this range + convert_rate=( + None + if 8000 <= audio_data.sample_rate <= 48000 + else max(8000, min(audio_data.sample_rate, 48000)) + ), + convert_width=2, # audio samples must be 16-bit ) audio = speech.RecognitionAudio(content=flac_data) config = { - 'encoding': speech.RecognitionConfig.AudioEncoding.FLAC, - 'sample_rate_hertz': audio_data.sample_rate, - 'language_code': language, + "encoding": speech.RecognitionConfig.AudioEncoding.FLAC, + "sample_rate_hertz": audio_data.sample_rate, + "language_code": language, **api_params, } if preferred_phrases is not None: - config['speechContexts'] = [speech.SpeechContext( - phrases=preferred_phrases - )] + config["speechContexts"] = [ + speech.SpeechContext(phrases=preferred_phrases) + ] if show_all: - config['enableWordTimeOffsets'] = True # some useful extra options for when we want all the output + # some useful extra options for when we want all the output + config["enableWordTimeOffsets"] = True config = speech.RecognitionConfig(**config) @@ -72,12 +95,16 @@ def recognize(recognizer, audio_data, credentials_json=None, language="en-US", p except GoogleAPICallError as e: raise RequestError(e) except URLError as e: - raise RequestError("recognition connection failed: {0}".format(e.reason)) + raise RequestError( + "recognition connection failed: {0}".format(e.reason) + ) - if show_all: return response - if 
len(response.results) == 0: raise UnknownValueError() + if show_all: + return response + if len(response.results) == 0: + raise UnknownValueError() - transcript = '' + transcript = "" for result in response.results: - transcript += result.alternatives[0].transcript.strip() + ' ' + transcript += result.alternatives[0].transcript.strip() + " " return transcript From 8d33596b89f44e08aa6e44637a9a9d986c1a9df5 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 21 Dec 2024 16:13:18 +0900 Subject: [PATCH 10/14] [docs] Type hints --- speech_recognition/recognizers/google_cloud.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/speech_recognition/recognizers/google_cloud.py b/speech_recognition/recognizers/google_cloud.py index 8470902c..3c0cfbd0 100644 --- a/speech_recognition/recognizers/google_cloud.py +++ b/speech_recognition/recognizers/google_cloud.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import os from urllib.error import URLError @@ -7,12 +9,12 @@ def recognize( recognizer, - audio_data, - credentials_json=None, - language="en-US", + audio_data: AudioData, + credentials_json: str | None = None, + language: str = "en-US", preferred_phrases=None, - show_all=False, - **api_params + show_all: bool = False, + **api_params, ): """Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Google Cloud Speech-to-Text V1 API. 
From 3e893d56d80e89401e73fdef5d9ebb98847748d4 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 21 Dec 2024 16:25:03 +0900 Subject: [PATCH 11/14] [test] Google Cloud credentials file case --- reference/library-reference.rst | 4 +-- .../recognizers/google_cloud.py | 15 +++++----- tests/recognizers/test_google_cloud.py | 30 +++++++++++++++++++ 3 files changed, 39 insertions(+), 10 deletions(-) diff --git a/reference/library-reference.rst b/reference/library-reference.rst index 3f410aba..82239fd2 100644 --- a/reference/library-reference.rst +++ b/reference/library-reference.rst @@ -227,8 +227,8 @@ Returns the most likely transcription if ``show_all`` is false (the default). Ot Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. -``recognizer_instance.recognize_google_cloud(audio_data: AudioData, credentials_json: Union[str, None] = None, language: str = "en-US", preferred_phrases: Union[Iterable[str], None] = None, show_all: bool = False, **api_params) -> Union[str, Dict[str, Any]]`` -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +``recognizer_instance.recognize_google_cloud(audio_data: AudioData, credentials_json_path: Union[str, None] = None, language: str = "en-US", preferred_phrases: Union[Iterable[str], None] = None, show_all: bool = False, **api_params) -> Union[str, Dict[str, Any]]`` +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ .. 
autofunction:: speech_recognition.recognizers.google_cloud.recognize diff --git a/speech_recognition/recognizers/google_cloud.py b/speech_recognition/recognizers/google_cloud.py index 3c0cfbd0..eacd1594 100644 --- a/speech_recognition/recognizers/google_cloud.py +++ b/speech_recognition/recognizers/google_cloud.py @@ -10,7 +10,7 @@ def recognize( recognizer, audio_data: AudioData, - credentials_json: str | None = None, + credentials_json_path: str | None = None, language: str = "en-US", preferred_phrases=None, show_all: bool = False, @@ -42,7 +42,7 @@ def recognize( assert isinstance( audio_data, AudioData ), "``audio_data`` must be audio data" - if credentials_json is None: + if credentials_json_path is None: assert os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") is not None assert isinstance(language, str), "``language`` must be a string" assert preferred_phrases is None or all( @@ -58,12 +58,11 @@ def recognize( "missing google-cloud-speech module: ensure that google-cloud-speech is set up correctly." 
) - if credentials_json is not None: - client = speech.SpeechClient.from_service_account_json( - credentials_json - ) - else: - client = speech.SpeechClient() + client = ( + speech.SpeechClient.from_service_account_json(credentials_json_path) + if credentials_json_path + else speech.SpeechClient() + ) flac_data = audio_data.get_flac_data( # audio sample rate must be between 8 kHz and 48 kHz inclusive - clamp sample rate into this range diff --git a/tests/recognizers/test_google_cloud.py b/tests/recognizers/test_google_cloud.py index 3a3c4154..fbbdd775 100644 --- a/tests/recognizers/test_google_cloud.py +++ b/tests/recognizers/test_google_cloud.py @@ -50,3 +50,33 @@ def test_transcribe_with_google_cloud_speech(SpeechClient, monkeypatch): ), audio=RecognitionAudio(content=b"flac_data"), ) + + +@patch("google.cloud.speech.SpeechClient") +def test_transcribe_with_specified_credentials(SpeechClient): + client = SpeechClient.from_service_account_json.return_value + client.recognize.return_value = RecognizeResponse( + results=[ + SpeechRecognitionResult( + alternatives=[ + SpeechRecognitionAlternative( + transcript="transcript", confidence=0.9 + ) + ] + ) + ] + ) + + audio_data = MagicMock(spec=AudioData) + audio_data.sample_rate = 16_000 + audio_data.get_flac_data.return_value = b"flac_data" + + _ = recognize( + MagicMock(spec=Recognizer), + audio_data, + credentials_json_path="path/to/credentials.json", + ) + + SpeechClient.from_service_account_json.assert_called_once_with( + "path/to/credentials.json" + ) From 9843278dcde13c6a8f86b699c37f2d6e4ef5099b Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 21 Dec 2024 16:47:06 +0900 Subject: [PATCH 12/14] [test] show_all case --- tests/recognizers/test_google_cloud.py | 68 ++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/tests/recognizers/test_google_cloud.py b/tests/recognizers/test_google_cloud.py index fbbdd775..015fac0b 100644 --- a/tests/recognizers/test_google_cloud.py +++ 
b/tests/recognizers/test_google_cloud.py @@ -6,6 +6,7 @@ RecognizeResponse, SpeechRecognitionAlternative, SpeechRecognitionResult, + WordInfo, ) from speech_recognition import Recognizer @@ -80,3 +81,70 @@ def test_transcribe_with_specified_credentials(SpeechClient): SpeechClient.from_service_account_json.assert_called_once_with( "path/to/credentials.json" ) + + +@patch("google.cloud.speech.SpeechClient") +def test_transcribe_show_all(SpeechClient, monkeypatch): + monkeypatch.setenv( + "GOOGLE_APPLICATION_CREDENTIALS", "path/to/credentials.json" + ) + + client = SpeechClient.return_value + client.recognize.return_value = RecognizeResponse( + results=[ + SpeechRecognitionResult( + alternatives=[ + SpeechRecognitionAlternative( + transcript="transcript", + confidence=0.9, + words=[ + WordInfo( + word="transcript", + start_time="0s", + end_time="0.400s", + ) + ], + ) + ], + language_code="en-US", + result_end_time="0.400s", + ) + ] + ) + + audio_data = MagicMock(spec=AudioData) + audio_data.sample_rate = 16_000 + audio_data.get_flac_data.return_value = b"flac_data" + + actual = recognize(MagicMock(spec=Recognizer), audio_data, show_all=True) + + assert actual == RecognizeResponse( + results=[ + SpeechRecognitionResult( + alternatives=[ + SpeechRecognitionAlternative( + transcript="transcript", + confidence=0.9, + words=[ + WordInfo( + word="transcript", + start_time="0s", + end_time="0.400s", + ) + ], + ) + ], + language_code="en-US", + result_end_time="0.400s", + ) + ] + ) + client.recognize.assert_called_once_with( + config=RecognitionConfig( + encoding=RecognitionConfig.AudioEncoding.FLAC, + sample_rate_hertz=16_000, + language_code="en-US", + enable_word_time_offsets=True, + ), + audio=RecognitionAudio(content=b"flac_data"), + ) From 6477efafd069ac4c49eda086631472f4eebb0cf8 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 21 Dec 2024 16:47:53 +0900 Subject: [PATCH 13/14] [bugfix] enableWordTimeOffsets is not supported; snake_case! 
--- speech_recognition/recognizers/google_cloud.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/speech_recognition/recognizers/google_cloud.py b/speech_recognition/recognizers/google_cloud.py index eacd1594..122104fa 100644 --- a/speech_recognition/recognizers/google_cloud.py +++ b/speech_recognition/recognizers/google_cloud.py @@ -86,8 +86,8 @@ def recognize( speech.SpeechContext(phrases=preferred_phrases) ] if show_all: - # some useful extra options for when we want all the output - config["enableWordTimeOffsets"] = True + # ref: https://cloud.google.com/speech-to-text/docs/async-time-offsets + config["enable_word_time_offsets"] = True config = speech.RecognitionConfig(**config) From 9aa550406c118c3f68ca9e2cc2d463f131e530b3 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 21 Dec 2024 16:54:33 +0900 Subject: [PATCH 14/14] [test] Cloud Speech-to-Text V1 parameters --- tests/recognizers/test_google_cloud.py | 41 ++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/recognizers/test_google_cloud.py b/tests/recognizers/test_google_cloud.py index 015fac0b..340bda94 100644 --- a/tests/recognizers/test_google_cloud.py +++ b/tests/recognizers/test_google_cloud.py @@ -148,3 +148,44 @@ def test_transcribe_show_all(SpeechClient, monkeypatch): ), audio=RecognitionAudio(content=b"flac_data"), ) + + +@patch("google.cloud.speech.SpeechClient") +def test_transcribe_with_specified_api_parameters(SpeechClient, monkeypatch): + monkeypatch.setenv( + "GOOGLE_APPLICATION_CREDENTIALS", "path/to/credentials.json" + ) + + client = SpeechClient.return_value + client.recognize.return_value = RecognizeResponse( + results=[ + SpeechRecognitionResult( + alternatives=[ + SpeechRecognitionAlternative( + transcript="こんにちは", confidence=0.99 + ) + ] + ) + ] + ) + + audio_data = MagicMock(spec=AudioData) + audio_data.sample_rate = 16_000 + audio_data.get_flac_data.return_value = b"flac_data" + + _ = recognize( + MagicMock(spec=Recognizer), + 
audio_data, + language="ja-JP", + use_enhanced=True, + ) + + client.recognize.assert_called_once_with( + config=RecognitionConfig( + encoding=RecognitionConfig.AudioEncoding.FLAC, + sample_rate_hertz=16_000, + language_code="ja-JP", + use_enhanced=True, + ), + audio=RecognitionAudio(content=b"flac_data"), + )