docs(samples): Update code samples for adaptation and VAD (#462)

Co-authored-by: AJ Morozoff <[email protected]>
Co-authored-by: Anthonios Partheniou <[email protected]>
GoogleCloudPlatform · Feb 9, 2023 · c0602c1 · c0602c1
1 parent fea464c
commit c0602c1
Show file tree

Hide file tree

Showing 15 changed files with 390 additions and 17 deletions.
diff --git a/speech/snippets/adaptation_v2_custom_class_reference.py b/speech/snippets/adaptation_v2_custom_class_reference.py
@@ -44,7 +44,7 @@ def adaptation_v2_custom_class_reference(project_id, recognizer_id, phrase_set_i
     request = cloud_speech.CreateCustomClassRequest(
         parent=f"projects/{project_id}/locations/global",
         custom_class_id=custom_class_id,
-        custom_class=cloud_speech.CustomClass(items=[{"value": "Keem"}]))
+        custom_class=cloud_speech.CustomClass(items=[{"value": "fare"}]))
 
     operation = client.create_custom_class(request=request)
     custom_class = operation.result()
@@ -70,10 +70,6 @@ def adaptation_v2_custom_class_reference(project_id, recognizer_id, phrase_set_i
         auto_decoding_config={}, adaptation=adaptation
     )
 
-    print(custom_class)
-    print(phrase_set)
-    print(config)
-
     request = cloud_speech.RecognizeRequest(
         recognizer=recognizer.name, config=config, content=content
     )

diff --git a/speech/snippets/adaptation_v2_custom_class_reference_test.py b/speech/snippets/adaptation_v2_custom_class_reference_test.py
@@ -48,11 +48,11 @@ def test_adaptation_v2_custom_class_reference(capsys):
     phrase_set_id = "phrase-set-" + str(uuid4())
     custom_class_id = "custom-class-" + str(uuid4())
     response = adaptation_v2_custom_class_reference.adaptation_v2_custom_class_reference(
-        project_id, recognizer_id, phrase_set_id, custom_class_id, os.path.join(RESOURCES, "baby_keem.wav")
+        project_id, recognizer_id, phrase_set_id, custom_class_id, os.path.join(RESOURCES, "fair.wav")
     )
 
     assert re.search(
-        r"play Baby Keem",
+        r"the word is fare",
         response.results[0].alternatives[0].transcript,
         re.DOTALL | re.I,
     )

diff --git a/speech/snippets/adaptation_v2_inline_custom_class.py b/speech/snippets/adaptation_v2_inline_custom_class.py
@@ -41,8 +41,8 @@ def adaptation_v2_inline_custom_class(project_id, recognizer_id, audio_file):
         content = f.read()
 
     # Build inline phrase set to produce a more accurate transcript
-    phrase_set = cloud_speech.PhraseSet(phrases=[{"value": "${keem}", "boost": 20}])
-    custom_class = cloud_speech.CustomClass(name="keem", items=[{"value": "Keem"}])
+    phrase_set = cloud_speech.PhraseSet(phrases=[{"value": "${fare}", "boost": 20}])
+    custom_class = cloud_speech.CustomClass(name="fare", items=[{"value": "fare"}])
     adaptation = cloud_speech.SpeechAdaptation(
         phrase_sets=[
             cloud_speech.SpeechAdaptation.AdaptationPhraseSet(

diff --git a/speech/snippets/adaptation_v2_inline_custom_class_test.py b/speech/snippets/adaptation_v2_inline_custom_class_test.py
@@ -34,11 +34,11 @@ def test_adaptation_v2_inline_custom_class(capsys):
 
     recognizer_id = "recognizer-" + str(uuid4())
     response = adaptation_v2_inline_custom_class.adaptation_v2_inline_custom_class(
-        project_id, recognizer_id, os.path.join(RESOURCES, "baby_keem.wav")
+        project_id, recognizer_id, os.path.join(RESOURCES, "fair.wav")
     )
 
     assert re.search(
-        r"play Baby Keem",
+        r"the word is fare",
         response.results[0].alternatives[0].transcript,
         re.DOTALL | re.I,
     )

diff --git a/speech/snippets/adaptation_v2_inline_phrase_set.py b/speech/snippets/adaptation_v2_inline_phrase_set.py
@@ -41,7 +41,7 @@ def adaptation_v2_inline_phrase_set(project_id, recognizer_id, audio_file):
         content = f.read()
 
     # Build inline phrase set to produce a more accurate transcript
-    phrase_set = cloud_speech.PhraseSet(phrases=[{"value": "Keem", "boost": 10}])
+    phrase_set = cloud_speech.PhraseSet(phrases=[{"value": "fare", "boost": 10}])
     adaptation = cloud_speech.SpeechAdaptation(
         phrase_sets=[
             cloud_speech.SpeechAdaptation.AdaptationPhraseSet(

diff --git a/speech/snippets/adaptation_v2_inline_phrase_set_test.py b/speech/snippets/adaptation_v2_inline_phrase_set_test.py
@@ -34,11 +34,11 @@ def test_adaptation_v2_inline_phrase_set(capsys):
 
     recognizer_id = "recognizer-" + str(uuid4())
     response = adaptation_v2_inline_phrase_set.adaptation_v2_inline_phrase_set(
-        project_id, recognizer_id, os.path.join(RESOURCES, "baby_keem.wav")
+        project_id, recognizer_id, os.path.join(RESOURCES, "fair.wav")
     )
 
     assert re.search(
-        r"play Baby Keem",
+        r"the word is fare",
         response.results[0].alternatives[0].transcript,
         re.DOTALL | re.I,
     )

diff --git a/speech/snippets/adaptation_v2_phrase_set_reference.py b/speech/snippets/adaptation_v2_phrase_set_reference.py
@@ -44,7 +44,7 @@ def adaptation_v2_phrase_set_reference(project_id, recognizer_id, phrase_set_id,
     request = cloud_speech.CreatePhraseSetRequest(
         parent=f"projects/{project_id}/locations/global",
         phrase_set_id=phrase_set_id,
-        phrase_set=cloud_speech.PhraseSet(phrases=[{"value": "Keem", "boost": 10}]))
+        phrase_set=cloud_speech.PhraseSet(phrases=[{"value": "fare", "boost": 10}]))
 
     operation = client.create_phrase_set(request=request)
     phrase_set = operation.result()

diff --git a/speech/snippets/adaptation_v2_phrase_set_reference_test.py b/speech/snippets/adaptation_v2_phrase_set_reference_test.py
@@ -41,11 +41,11 @@ def test_adaptation_v2_phrase_set_reference(capsys):
     recognizer_id = "recognizer-" + str(uuid4())
     phrase_set_id = "phrase-set-" + str(uuid4())
     response = adaptation_v2_phrase_set_reference.adaptation_v2_phrase_set_reference(
-        project_id, recognizer_id, phrase_set_id, os.path.join(RESOURCES, "baby_keem.wav")
+        project_id, recognizer_id, phrase_set_id, os.path.join(RESOURCES, "fair.wav")
     )
 
     assert re.search(
-        r"play Baby Keem",
+        r"the word is fare",
         response.results[0].alternatives[0].transcript,
         re.DOTALL | re.I,
     )

diff --git a/speech/snippets/resources/audio_silence_padding.wav b/speech/snippets/resources/audio_silence_padding.wav
diff --git a/speech/snippets/resources/baby_keem.wav b/speech/snippets/resources/baby_keem.wav
diff --git a/speech/snippets/resources/fair.wav b/speech/snippets/resources/fair.wav
diff --git a/speech/snippets/transcribe_streaming_voice_activity_events.py b/speech/snippets/transcribe_streaming_voice_activity_events.py
@@ -0,0 +1,108 @@
+# Copyright 2022 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import argparse
+
+# [START speech_transcribe_streaming_voice_activity_events]
+import io
+
+from google.cloud.speech_v2 import SpeechClient
+from google.cloud.speech_v2.types import cloud_speech
+
+
+def transcribe_streaming_voice_activity_events(project_id, recognizer_id, audio_file):
+    """Transcribe a local audio file by streaming, reporting voice activity.
+
+    Creates a recognizer, sends the file's bytes as a series of streaming
+    requests, and prints a line for every speech-activity begin/end event
+    and for every transcript that comes back.
+
+    Args:
+        project_id: Project in which the recognizer is created.
+        recognizer_id: ID to give the newly created recognizer.
+        audio_file: Path of the local audio file to stream.
+
+    Returns:
+        The list of streaming responses, in the order they were received.
+    """
+    # Instantiates a client
+    client = SpeechClient()
+
+    request = cloud_speech.CreateRecognizerRequest(
+        parent=f"projects/{project_id}/locations/global",
+        recognizer_id=recognizer_id,
+        recognizer=cloud_speech.Recognizer(
+            language_codes=["en-US"], model="latest_long"
+        ),
+    )
+
+    # Creates a Recognizer
+    operation = client.create_recognizer(request=request)
+    recognizer = operation.result()
+
+    # Reads a file as bytes
+    with io.open(audio_file, "rb") as f:
+        content = f.read()
+
+    # In practice, stream should be a generator yielding chunks of audio data.
+    # The chunk length is clamped to at least one byte: a file shorter than
+    # five bytes would otherwise produce a step of 0, which range() rejects.
+    chunk_length = max(1, len(content) // 5)
+    stream = [
+        content[start : start + chunk_length]
+        for start in range(0, len(content), chunk_length)
+    ]
+    audio_requests = (
+        cloud_speech.StreamingRecognizeRequest(audio=audio) for audio in stream
+    )
+
+    recognition_config = cloud_speech.RecognitionConfig(auto_decoding_config={})
+
+    # Sets the flag to enable voice activity events
+    streaming_features = cloud_speech.StreamingRecognitionFeatures(
+        enable_voice_activity_events=True
+    )
+    streaming_config = cloud_speech.StreamingRecognitionConfig(
+        config=recognition_config, streaming_features=streaming_features
+    )
+
+    config_request = cloud_speech.StreamingRecognizeRequest(
+        recognizer=recognizer.name, streaming_config=streaming_config
+    )
+
+    def requests(config, audio):
+        # The configuration request goes first, followed by the audio requests.
+        yield config
+        yield from audio
+
+    # Transcribes the audio into text
+    responses_iterator = client.streaming_recognize(
+        requests=requests(config_request, audio_requests)
+    )
+    event_types = cloud_speech.StreamingRecognizeResponse.SpeechEventType
+    responses = []
+    for response in responses_iterator:
+        responses.append(response)
+        # A response carries a single event type, so the checks are exclusive.
+        if response.speech_event_type == event_types.SPEECH_ACTIVITY_BEGIN:
+            print("Speech started.")
+        elif response.speech_event_type == event_types.SPEECH_ACTIVITY_END:
+            print("Speech ended.")
+        for result in response.results:
+            print(f"Transcript: {result.alternatives[0].transcript}")
+
+    return responses
+# [END speech_transcribe_streaming_voice_activity_events]
+
+
+if __name__ == "__main__":
+    # Command-line entry point: three positional arguments, passed straight
+    # through to the sample function.
+    arg_parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__
+    )
+    for arg_name, arg_help in (
+        ("project_id", "project to create recognizer in"),
+        ("recognizer_id", "name of recognizer to create"),
+        ("audio_file", "audio file to stream"),
+    ):
+        arg_parser.add_argument(arg_name, help=arg_help)
+    cli_args = arg_parser.parse_args()
+    transcribe_streaming_voice_activity_events(
+        cli_args.project_id, cli_args.recognizer_id, cli_args.audio_file
+    )
diff --git a/speech/snippets/transcribe_streaming_voice_activity_events_test.py b/speech/snippets/transcribe_streaming_voice_activity_events_test.py
@@ -0,0 +1,58 @@
+# Copyright 2022, Google, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import re
+from uuid import uuid4
+
+from google.cloud.speech_v2 import SpeechClient
+from google.cloud.speech_v2.types import cloud_speech
+
+import transcribe_streaming_voice_activity_events
+
+RESOURCES = os.path.join(os.path.dirname(__file__), "resources")
+
+
+def delete_recognizer(name):
+    """Send a delete request for the recognizer with the given resource name."""
+    speech_client = SpeechClient()
+    speech_client.delete_recognizer(
+        request=cloud_speech.DeleteRecognizerRequest(name=name)
+    )
+
+
+def test_transcribe_streaming_voice_activity_events(capsys):
+    """Stream audio.wav and check the first event type and the transcript."""
+    project_id = os.getenv("GOOGLE_CLOUD_PROJECT")
+
+    recognizer_id = "recognizer-" + str(uuid4())
+    responses = transcribe_streaming_voice_activity_events.transcribe_streaming_voice_activity_events(
+        project_id, recognizer_id, os.path.join(RESOURCES, "audio.wav")
+    )
+
+    # The checks run inside try/finally so that a failed assertion (or an
+    # empty response list) does not leave the recognizer behind.
+    try:
+        transcript = "".join(
+            result.alternatives[0].transcript
+            for response in responses
+            for result in response.results
+        )
+
+        assert (
+            responses[0].speech_event_type
+            == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
+        )
+
+        assert re.search(
+            r"how old is the Brooklyn Bridge",
+            transcript,
+            re.DOTALL | re.I,
+        )
+    finally:
+        delete_recognizer(
+            f"projects/{project_id}/locations/global/recognizers/{recognizer_id}"
+        )