Speech sample using microphone [(#1013)](#1013)

* Add sample for transcribing from microphone.
* Remove error handling that probably won't work.
* Add test.
* Fix lint.
* Increment copyright date.
GoogleCloudPlatform · Mar 13, 2023 · 955916b · 955916b
1 parent 940cce1
commit 955916b
Show file tree

Hide file tree

Showing 3 changed files with 237 additions and 0 deletions.
diff --git a/speech/snippets/resources/quit.raw b/speech/snippets/resources/quit.raw
diff --git a/speech/snippets/transcribe_streaming_mic.py b/speech/snippets/transcribe_streaming_mic.py
@@ -0,0 +1,168 @@
+#!/usr/bin/env python
+
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Google Cloud Speech API sample application using the streaming API.
+
+NOTE: This module requires the additional dependency `pyaudio`. To install
+using pip:
+
+    pip install pyaudio
+
+Example usage:
+    python transcribe_streaming_mic.py
+"""
+
+# [START import_libraries]
+from __future__ import division
+
+import re
+import sys
+
+from google.cloud import speech
+import pyaudio
+from six.moves import queue
+# [END import_libraries]
+
+# Audio recording parameters
+RATE = 16000
+CHUNK = int(RATE / 10)  # 100ms
+
+
+# [START audio_stream]
+class MicAsFile(object):
+    """Opens a recording stream as a file-like object.
+
+    Intended to be used as a context manager. Entering opens a PyAudio input
+    stream whose callback pushes each captured chunk onto an internal queue;
+    `read()` drains that queue; exiting stops the stream and releases the
+    audio interface.
+    """
+    def __init__(self, rate, chunk):
+        """Store the recording parameters; no audio device is opened here.
+
+        Args:
+            rate: Forwarded as `rate` when the PyAudio stream is opened.
+            chunk: Forwarded as `frames_per_buffer` when the PyAudio stream
+                is opened.
+        """
+        self._rate = rate
+        self._chunk = chunk
+
+        # Create a thread-safe buffer of audio data
+        self._buff = queue.Queue()
+        # Stays True until __enter__ has opened the stream.
+        self.closed = True
+
+    def __enter__(self):
+        """Open the input stream in callback mode and return this object."""
+        self._audio_interface = pyaudio.PyAudio()
+        self._audio_stream = self._audio_interface.open(
+            format=pyaudio.paInt16,
+            # The API currently only supports 1-channel (mono) audio
+            # https://goo.gl/z757pE
+            channels=1, rate=self._rate,
+            input=True, frames_per_buffer=self._chunk,
+            # Run the audio stream asynchronously to fill the buffer object.
+            # This is necessary so that the input device's buffer doesn't
+            # overflow while the calling thread makes network requests, etc.
+            stream_callback=self._fill_buffer,
+        )
+
+        self.closed = False
+
+        return self
+
+    def __exit__(self, type, value, traceback):
+        """Stop and close the stream, then unblock any pending `read()`.
+
+        The exception arguments are ignored and nothing is returned, so an
+        exception raised inside the `with` body propagates to the caller.
+        """
+        self._audio_stream.stop_stream()
+        self._audio_stream.close()
+        # Order matters: `closed` is set before the sentinel is queued, so a
+        # read() that wakes up on the sentinel sees closed == True and
+        # returns None instead of trying to join a None into the audio data.
+        self.closed = True
+        # Flush out the read, just in case
+        self._buff.put(None)
+        self._audio_interface.terminate()
+
+    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
+        """Continuously collect data from the audio stream, into the buffer.
+
+        Registered as the `stream_callback` of the PyAudio stream. Only
+        `in_data` is used; the remaining callback arguments are ignored.
+
+        Returns:
+            A `(None, pyaudio.paContinue)` tuple: no output data, and the
+            flag telling PyAudio to keep the stream going.
+        """
+        self._buff.put(in_data)
+        return None, pyaudio.paContinue
+
+    def read(self, chunk_size):
+        """Return all audio buffered so far, blocking until some is available.
+
+        Args:
+            chunk_size: Accepted for file-like compatibility but not used;
+                the amount returned is whatever the queue currently holds.
+
+        Returns:
+            The queued chunks joined into one bytes object, or None if the
+            stream is closed before or during the call.
+        """
+        if self.closed:
+            return
+
+        # Use a blocking get() to ensure there's at least one chunk of data.
+        data = [self._buff.get()]
+
+        # Now consume whatever other data's still buffered.
+        while True:
+            try:
+                data.append(self._buff.get(block=False))
+            except queue.Empty:
+                break
+
+        # Re-check after the (possibly long) blocking get(): if __exit__ ran
+        # in the meantime, `data` may contain the None sentinel, so bail out
+        # rather than joining it.
+        if self.closed:
+            return
+        return b''.join(data)
+# [END audio_stream]
+
+
+def listen_print_loop(results_gen):
+    """Iterates through server responses and prints them.
+
+    The results_gen passed is a generator that will block until a response
+    is provided by the server. When the transcription response comes, print it.
+
+    In this case, responses are provided for interim results as well. If the
+    response is an interim one, print a carriage return at the end of it, to
+    allow the next result to overwrite it, until the response is a final one.
+    For the final one, print a newline to preserve the finalized
+    transcription.
+
+    The loop ends when results_gen is exhausted, or as soon as a final
+    transcript contains the word "exit" or "quit" (case-insensitive).
+
+    Args:
+        results_gen: Iterable of result objects. Each one is read for its
+            `alternatives`, `transcript` and `is_final` attributes.
+    """
+    # Length of the interim transcript currently shown on the line, so a
+    # shorter follow-up can be padded to erase the leftover characters.
+    num_chars_printed = 0
+    for result in results_gen:
+        # Results without any alternatives carry nothing to display.
+        if not result.alternatives:
+            continue
+
+        # Display the top transcription
+        transcript = result.transcript
+
+        # Display interim results, but with a carriage return at the end of the
+        # line, so subsequent lines will overwrite them.
+        #
+        # If the previous result was longer than this one, we need to print
+        # some extra spaces to overwrite the previous result
+        overwrite_chars = ' ' * max(0, num_chars_printed - len(transcript))
+
+        if not result.is_final:
+            sys.stdout.write(transcript + overwrite_chars + '\r')
+            # Flush explicitly: no newline is written, so the interim text
+            # could otherwise sit in the output buffer.
+            sys.stdout.flush()
+
+            num_chars_printed = len(transcript)
+
+        else:
+            print(transcript + overwrite_chars)
+
+            # Exit recognition if any of the transcribed phrases could be
+            # one of our keywords.
+            if re.search(r'\b(exit|quit)\b', transcript, re.I):
+                print('Exiting..')
+                break
+
+            # The final line was terminated with a newline, so the next
+            # interim result starts on a fresh line with nothing to erase.
+            num_chars_printed = 0
+
+
+def main():
+    """Stream microphone audio to the Speech API and print the results.
+
+    Opens the microphone via MicAsFile, starts a streaming recognition
+    request with interim results enabled, and hands the response generator
+    to listen_print_loop. The microphone is released when the `with` block
+    exits.
+    """
+    speech_client = speech.Client()
+
+    with MicAsFile(RATE, CHUNK) as stream:
+        # The sample wraps the file-like microphone stream; the encoding and
+        # rate given here mirror how MicAsFile opens the device (paInt16 at
+        # RATE).
+        audio_sample = speech_client.sample(
+            stream=stream,
+            encoding=speech.encoding.Encoding.LINEAR16,
+            sample_rate_hertz=RATE)
+        # See http://g.co/cloud/speech/docs/languages
+        # for a list of supported languages.
+        language_code = 'en-US'  # a BCP-47 language tag
+        results_gen = audio_sample.streaming_recognize(
+                language_code=language_code, interim_results=True)
+
+        # Now, put the transcription responses to use.
+        listen_print_loop(results_gen)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/speech/snippets/transcribe_streaming_mic_test.py b/speech/snippets/transcribe_streaming_mic_test.py
@@ -0,0 +1,69 @@
+# Copyright 2017, Google, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import re
+import threading
+import time
+
+import mock
+
+RESOURCES = os.path.join(os.path.dirname(__file__), 'resources')
+
+
+class MockPyAudio(object):
+    """Test double for `pyaudio.PyAudio` that replays a raw audio file.
+
+    One instance plays three roles: it is installed in place of the
+    `PyAudio` class (calling it returns itself), it acts as the audio
+    interface (`open`, `terminate`), and it acts as the opened stream
+    (`stop_stream`, `close`).
+    """
+    def __init__(self, audio_filename):
+        """Remember the path of the raw audio file to replay."""
+        self.audio_filename = audio_filename
+
+    def __call__(self, *args):
+        """Stand in for the `PyAudio()` constructor call; returns self."""
+        return self
+
+    def open(self, stream_callback, rate, *args, **kwargs):
+        """Start a background thread that feeds the file to stream_callback.
+
+        Args:
+            stream_callback: Callable invoked with each chunk of audio.
+            rate: Sample rate, used to pace the playback.
+            *args, **kwargs: Other stream options; accepted and ignored.
+
+        Returns:
+            self, which also serves as the stream object.
+        """
+        self.rate = rate
+        self.closed = threading.Event()
+        # NOTE(review): the thread is neither a daemon nor joined anywhere;
+        # it only stops once close() sets the event, so a test that fails
+        # before close() is reached would leave it running.
+        self.stream_thread = threading.Thread(
+            target=self.stream_audio, args=(
+                self.audio_filename, stream_callback, self.closed))
+        self.stream_thread.start()
+        return self
+
+    def close(self):
+        """Signal the feeder thread to stop."""
+        self.closed.set()
+
+    def stop_stream(self):
+        """No-op; present only to satisfy the stream interface."""
+        pass
+
+    def terminate(self):
+        """No-op; present only to satisfy the interface object."""
+        pass
+
+    def stream_audio(self, audio_filename, callback, closed, num_frames=512):
+        """Feed the audio file to `callback` in roughly real time.
+
+        Runs until `closed` is set. Once the file is exhausted, chunks of
+        zero bytes are delivered instead, so the callback keeps receiving
+        data.
+
+        Args:
+            audio_filename: Path of the raw audio file, opened in binary mode.
+            callback: Called as callback(chunk, None, None, None).
+            closed: threading.Event that ends the loop when set.
+            num_frames: Number of frames delivered per callback invocation.
+        """
+        with open(audio_filename, 'rb') as audio_file:
+            while not closed.is_set():
+                # Approximate realtime by sleeping for the appropriate time for
+                # the requested number of frames
+                time.sleep(num_frames / float(self.rate))
+                # audio is 16-bit samples, whereas python byte is 8-bit
+                num_bytes = 2 * num_frames
+                # read() returns b'' at end of file; fall back to zero bytes.
+                chunk = audio_file.read(num_bytes) or b'\0' * num_bytes
+                callback(chunk, None, None, None)
+
+
+# Replace the `pyaudio` module with a mock whose PyAudio attribute is a
+# MockPyAudio replaying resources/quit.raw, so no real microphone is needed.
+@mock.patch.dict('sys.modules', pyaudio=mock.MagicMock(
+        PyAudio=MockPyAudio(os.path.join(RESOURCES, 'quit.raw'))))
+def test_main(capsys):
+    """Run the sample against the recorded audio and check the output.
+
+    Passes if the captured stdout contains "quit" (case-insensitive).
+    """
+    # Imported inside the test so the module's `import pyaudio` resolves to
+    # the patched entry in sys.modules.
+    import transcribe_streaming_mic
+
+    transcribe_streaming_mic.main()
+    out, err = capsys.readouterr()
+
+    assert re.search(r'quit', out, re.DOTALL | re.I)