From 2ce9b5dfedafc010ed1a3bafbce235a4de58cf2b Mon Sep 17 00:00:00 2001 From: Franklin Nunez <69214580+b-loved-dreamer@users.noreply.github.com> Date: Mon, 12 Oct 2020 09:57:52 -0700 Subject: [PATCH] fix: migrated samples to speech 2.0.0 (#78) * I updated the comment on the transcribe_async file to reflect time limitations on local files for the long_running_recognize * I updated the comment on the transcribe_async file to reflect time limitations on local files for the long_running_recognize * docs: I updated the comment on the transcribe_async file to reflect time limitations on local files for the long_running_recognize * chore: I updated the comments on the transcribe_async file to reflect time limitations on local files for the long_running_recognize * fix: resolved conflicts * fix: migrated samples to speech 2.0.0 * fix: migrated to speech 2.0.0 * fix: fixed lint issues --- speech/microphone/requirements.txt | 2 +- .../transcribe_streaming_infinite.py | 80 ++++---- speech/microphone/transcribe_streaming_mic.py | 39 ++-- .../transcribe_streaming_mic_test.py | 19 +- speech/snippets/beta_snippets.py | 171 ++++++++---------- speech/snippets/quickstart.py | 2 +- speech/snippets/requirements.txt | 2 +- speech/snippets/speech_adaptation_beta.py | 12 +- speech/snippets/speech_quickstart_beta.py | 9 +- speech/snippets/transcribe.py | 6 +- speech/snippets/transcribe_async.py | 10 +- .../snippets/transcribe_auto_punctuation.py | 2 +- speech/snippets/transcribe_context_classes.py | 4 +- speech/snippets/transcribe_enhanced_model.py | 2 +- speech/snippets/transcribe_model_selection.py | 6 +- speech/snippets/transcribe_multichannel.py | 4 +- speech/snippets/transcribe_streaming.py | 11 +- .../snippets/transcribe_word_time_offsets.py | 9 +- 18 files changed, 182 insertions(+), 208 deletions(-) diff --git a/speech/microphone/requirements.txt b/speech/microphone/requirements.txt index 40560b796629..275cd9d30f0d 100644 --- a/speech/microphone/requirements.txt +++ b/speech/microphone/requirements.txt @@ -1,4 +1,4 @@ -google-cloud-speech==1.3.2 +google-cloud-speech==2.0.0 pyaudio==0.2.11 six==1.15.0 diff --git a/speech/microphone/transcribe_streaming_infinite.py b/speech/microphone/transcribe_streaming_infinite.py index d6aafde2783d..dedb0d5bf72e 100644 --- a/speech/microphone/transcribe_streaming_infinite.py +++ b/speech/microphone/transcribe_streaming_infinite.py @@ -41,9 +41,9 @@ SAMPLE_RATE = 16000 CHUNK_SIZE = int(SAMPLE_RATE / 10) # 100ms -RED = "\033[0;31m" -GREEN = "\033[0;32m" -YELLOW = "\033[0;33m" +RED = '\033[0;31m' +GREEN = '\033[0;32m' +YELLOW = '\033[0;33m' def get_current_time(): @@ -123,14 +123,12 @@ def generator(self): if self.bridging_offset > self.final_request_end_time: self.bridging_offset = self.final_request_end_time - chunks_from_ms = round( - (self.final_request_end_time - self.bridging_offset) - / chunk_time - ) + chunks_from_ms = round((self.final_request_end_time - + self.bridging_offset) / chunk_time) - self.bridging_offset = round( - (len(self.last_audio_input) - chunks_from_ms) * chunk_time - ) + self.bridging_offset = (round(( + len(self.last_audio_input) - chunks_from_ms) + * chunk_time)) for i in range(chunks_from_ms, len(self.last_audio_input)): data.append(self.last_audio_input[i]) @@ -159,7 +157,7 @@ def generator(self): except queue.Empty: break - yield b"".join(data) + yield b''.join(data) def listen_print_loop(responses, stream): @@ -195,45 +193,42 @@ def listen_print_loop(responses, stream): transcript = result.alternatives[0].transcript result_seconds = 0 - 
result_nanos = 0 + result_micros = 0 if result.result_end_time.seconds: result_seconds = result.result_end_time.seconds - if result.result_end_time.nanos: - result_nanos = result.result_end_time.nanos + if result.result_end_time.microseconds: + result_micros = result.result_end_time.microseconds - stream.result_end_time = int((result_seconds * 1000) + (result_nanos / 1000000)) + stream.result_end_time = int((result_seconds * 1000) + (result_micros / 1000)) - corrected_time = ( - stream.result_end_time - - stream.bridging_offset - + (STREAMING_LIMIT * stream.restart_counter) - ) + corrected_time = (stream.result_end_time - stream.bridging_offset + + (STREAMING_LIMIT * stream.restart_counter)) # Display interim results, but with a carriage return at the end of the # line, so subsequent lines will overwrite them. if result.is_final: sys.stdout.write(GREEN) - sys.stdout.write("\033[K") - sys.stdout.write(str(corrected_time) + ": " + transcript + "\n") + sys.stdout.write('\033[K') + sys.stdout.write(str(corrected_time) + ': ' + transcript + '\n') stream.is_final_end_time = stream.result_end_time stream.last_transcript_was_final = True # Exit recognition if any of the transcribed phrases could be # one of our keywords. - if re.search(r"\b(exit|quit)\b", transcript, re.I): + if re.search(r'\b(exit|quit)\b', transcript, re.I): sys.stdout.write(YELLOW) - sys.stdout.write("Exiting...\n") + sys.stdout.write('Exiting...\n') stream.closed = True break else: sys.stdout.write(RED) - sys.stdout.write("\033[K") - sys.stdout.write(str(corrected_time) + ": " + transcript + "\r") + sys.stdout.write('\033[K') + sys.stdout.write(str(corrected_time) + ': ' + transcript + '\r') stream.last_transcript_was_final = False @@ -245,39 +240,34 @@ def main(): config = speech.RecognitionConfig( encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=SAMPLE_RATE, - language_code="en-US", - max_alternatives=1, - ) + language_code='en-US', + max_alternatives=1) streaming_config = speech.StreamingRecognitionConfig( - config=config, interim_results=True - ) + config=config, + interim_results=True) mic_manager = ResumableMicrophoneStream(SAMPLE_RATE, CHUNK_SIZE) print(mic_manager.chunk_size) sys.stdout.write(YELLOW) sys.stdout.write('\nListening, say "Quit" or "Exit" to stop.\n\n') - sys.stdout.write("End (ms) Transcript Results/Status\n") - sys.stdout.write("=====================================================\n") + sys.stdout.write('End (ms) Transcript Results/Status\n') + sys.stdout.write('=====================================================\n') with mic_manager as stream: while not stream.closed: sys.stdout.write(YELLOW) - sys.stdout.write( - "\n" + str(STREAMING_LIMIT * stream.restart_counter) + ": NEW REQUEST\n" - ) + sys.stdout.write('\n' + str( + STREAMING_LIMIT * stream.restart_counter) + ': NEW REQUEST\n') stream.audio_input = [] audio_generator = stream.generator() - requests = ( - speech.StreamingRecognizeRequest(audio_content=content) - for content in audio_generator - ) + requests = (speech.StreamingRecognizeRequest( + audio_content=content)for content in audio_generator) - responses = client.streaming_recognize( - requests=requests, config=streaming_config - ) + responses = client.streaming_recognize(streaming_config, + requests) # Now, put the transcription responses to use. 
listen_print_loop(responses, stream) @@ -291,11 +281,11 @@ def main(): stream.restart_counter = stream.restart_counter + 1 if not stream.last_transcript_was_final: - sys.stdout.write("\n") + sys.stdout.write('\n') stream.new_stream = True -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/speech/microphone/transcribe_streaming_mic.py b/speech/microphone/transcribe_streaming_mic.py index b484a10e23e3..597e1bd7c1a9 100644 --- a/speech/microphone/transcribe_streaming_mic.py +++ b/speech/microphone/transcribe_streaming_mic.py @@ -32,6 +32,7 @@ import sys from google.cloud import speech + import pyaudio from six.moves import queue @@ -42,7 +43,6 @@ class MicrophoneStream(object): """Opens a recording stream as a generator yielding the audio chunks.""" - def __init__(self, rate, chunk): self._rate = rate self._chunk = chunk @@ -57,10 +57,8 @@ def __enter__(self): format=pyaudio.paInt16, # The API currently only supports 1-channel (mono) audio # https://goo.gl/z757pE - channels=1, - rate=self._rate, - input=True, - frames_per_buffer=self._chunk, + channels=1, rate=self._rate, + input=True, frames_per_buffer=self._chunk, # Run the audio stream asynchronously to fill the buffer object. # This is necessary so that the input device's buffer doesn't # overflow while the calling thread makes network requests, etc. @@ -105,7 +103,7 @@ def generator(self): except queue.Empty: break - yield b"".join(data) + yield b''.join(data) def listen_print_loop(responses): @@ -143,10 +141,10 @@ def listen_print_loop(responses): # # If the previous result was longer than this one, we need to print # some extra spaces to overwrite the previous result - overwrite_chars = " " * (num_chars_printed - len(transcript)) + overwrite_chars = ' ' * (num_chars_printed - len(transcript)) if not result.is_final: - sys.stdout.write(transcript + overwrite_chars + "\r") + sys.stdout.write(transcript + overwrite_chars + '\r') sys.stdout.flush() num_chars_printed = len(transcript) @@ -156,8 +154,8 @@ def listen_print_loop(responses): # Exit recognition if any of the transcribed phrases could be # one of our keywords. - if re.search(r"\b(exit|quit)\b", transcript, re.I): - print("Exiting..") + if re.search(r'\b(exit|quit)\b', transcript, re.I): + print('Exiting..') break num_chars_printed = 0 @@ -166,33 +164,28 @@ def listen_print_loop(responses): def main(): # See http://g.co/cloud/speech/docs/languages # for a list of supported languages. - language_code = "en-US" # a BCP-47 language tag + language_code = 'en-US' # a BCP-47 language tag client = speech.SpeechClient() config = speech.RecognitionConfig( encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=RATE, - language_code=language_code, - ) + language_code=language_code) streaming_config = speech.StreamingRecognitionConfig( - config=config, interim_results=True - ) + config=config, + interim_results=True) with MicrophoneStream(RATE, CHUNK) as stream: audio_generator = stream.generator() - requests = ( - speech.StreamingRecognizeRequest(audio_content=content) - for content in audio_generator - ) + requests = (speech.StreamingRecognizeRequest(audio_content=content) + for content in audio_generator) - responses = client.streaming_recognize( - requests=requests, config=streaming_config - ) + responses = client.streaming_recognize(streaming_config, requests) # Now, put the transcription responses to use. 
listen_print_loop(responses) -if __name__ == "__main__": +if __name__ == '__main__': main() # [END speech_transcribe_streaming_mic] diff --git a/speech/microphone/transcribe_streaming_mic_test.py b/speech/microphone/transcribe_streaming_mic_test.py index f5e08f5d30b2..dd5e7ea6f5e6 100644 --- a/speech/microphone/transcribe_streaming_mic_test.py +++ b/speech/microphone/transcribe_streaming_mic_test.py @@ -18,7 +18,7 @@ import mock -RESOURCES = os.path.join(os.path.dirname(__file__), "resources") +RESOURCES = os.path.join(os.path.dirname(__file__), 'resources') class MockPyAudio(object): @@ -32,9 +32,8 @@ def open(self, stream_callback, rate, *args, **kwargs): self.rate = rate self.closed = threading.Event() self.stream_thread = threading.Thread( - target=self.stream_audio, - args=(self.audio_filename, stream_callback, self.closed), - ) + target=self.stream_audio, args=( + self.audio_filename, stream_callback, self.closed)) self.stream_thread.start() return self @@ -48,25 +47,23 @@ def terminate(self): pass def stream_audio(self, audio_filename, callback, closed, num_frames=512): - with open(audio_filename, "rb") as audio_file: + with open(audio_filename, 'rb') as audio_file: while not closed.is_set(): # Approximate realtime by sleeping for the appropriate time for # the requested number of frames time.sleep(num_frames / float(self.rate)) # audio is 16-bit samples, whereas python byte is 8-bit num_bytes = 2 * num_frames - chunk = audio_file.read(num_bytes) or b"\0" * num_bytes + chunk = audio_file.read(num_bytes) or b'\0' * num_bytes callback(chunk, None, None, None) -@mock.patch.dict( - "sys.modules", - pyaudio=mock.MagicMock(PyAudio=MockPyAudio(os.path.join(RESOURCES, "quit.raw"))), -) +@mock.patch.dict('sys.modules', pyaudio=mock.MagicMock( + PyAudio=MockPyAudio(os.path.join(RESOURCES, 'quit.raw')))) def test_main(capsys): import transcribe_streaming_mic transcribe_streaming_mic.main() out, err = capsys.readouterr() - assert re.search(r"quit", out, re.DOTALL | re.I) + assert re.search(r'quit', out, re.DOTALL | re.I) diff --git a/speech/snippets/beta_snippets.py b/speech/snippets/beta_snippets.py index eaafe3ca978e..8d9571006959 100644 --- a/speech/snippets/beta_snippets.py +++ b/speech/snippets/beta_snippets.py @@ -35,31 +35,29 @@ def transcribe_file_with_enhanced_model(): """Transcribe the given audio file using an enhanced model.""" # [START speech_transcribe_enhanced_model_beta] from google.cloud import speech_v1p1beta1 as speech - client = speech.SpeechClient() - speech_file = "resources/commercial_mono.wav" + speech_file = 'resources/commercial_mono.wav' - with io.open(speech_file, "rb") as audio_file: + with io.open(speech_file, 'rb') as audio_file: content = audio_file.read() audio = speech.RecognitionAudio(content=content) config = speech.RecognitionConfig( encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=8000, - language_code="en-US", + language_code='en-US', use_enhanced=True, # A model must be specified to use enhanced model. 
- model="phone_call", - ) + model='phone_call') - response = client.recognize(request={"config": config, "audio": audio}) + response = client.recognize(config=config, audio=audio) for i, result in enumerate(response.results): alternative = result.alternatives[0] - print("-" * 20) - print(f"First alternative of result {i}") - print(f"Transcript: {alternative.transcript}") + print('-' * 20) + print(u'First alternative of result {}'.format(i)) + print(u'Transcript: {}'.format(alternative.transcript)) # [END speech_transcribe_enhanced_model_beta] @@ -67,25 +65,23 @@ def transcribe_file_with_metadata(): """Send a request that includes recognition metadata.""" # [START speech_transcribe_recognition_metadata_beta] from google.cloud import speech_v1p1beta1 as speech - client = speech.SpeechClient() - speech_file = "resources/commercial_mono.wav" + speech_file = 'resources/commercial_mono.wav' - with io.open(speech_file, "rb") as audio_file: + with io.open(speech_file, 'rb') as audio_file: content = audio_file.read() # Here we construct a recognition metadata object. # Most metadata fields are specified as enums that can be found - # in speech.RecognitionMetadata + # in speech.enums.RecognitionMetadata metadata = speech.RecognitionMetadata() - metadata.interaction_type = speech.RecognitionMetadata.InteractionType.DISCUSSION + metadata.interaction_type = ( + speech.RecognitionMetadata.InteractionType.DISCUSSION) metadata.microphone_distance = ( - speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD - ) + speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD) metadata.recording_device_type = ( - speech.RecognitionMetadata.RecordingDeviceType.SMARTPHONE - ) + speech.RecognitionMetadata.RecordingDeviceType.SMARTPHONE) # Some metadata fields are free form strings metadata.recording_device_name = "Pixel 2 XL" # And some are integers, for instance the 6 digit NAICS code @@ -96,18 +92,17 @@ def transcribe_file_with_metadata(): config = speech.RecognitionConfig( encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=8000, - language_code="en-US", + language_code='en-US', # Add this in the request to send metadata. 
- metadata=metadata, - ) + metadata=metadata) - response = client.recognize(request={"config": config, "audio": audio}) + response = client.recognize(config=config, audio=audio) for i, result in enumerate(response.results): alternative = result.alternatives[0] - print("-" * 20) - print("First alternative of result {}".format(i)) - print("Transcript: {}".format(alternative.transcript)) + print('-' * 20) + print(u'First alternative of result {}'.format(i)) + print(u'Transcript: {}'.format(alternative.transcript)) # [END speech_transcribe_recognition_metadata_beta] @@ -115,30 +110,28 @@ def transcribe_file_with_auto_punctuation(): """Transcribe the given audio file with auto punctuation enabled.""" # [START speech_transcribe_auto_punctuation_beta] from google.cloud import speech_v1p1beta1 as speech - client = speech.SpeechClient() - speech_file = "resources/commercial_mono.wav" + speech_file = 'resources/commercial_mono.wav' - with io.open(speech_file, "rb") as audio_file: + with io.open(speech_file, 'rb') as audio_file: content = audio_file.read() audio = speech.RecognitionAudio(content=content) config = speech.RecognitionConfig( encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=8000, - language_code="en-US", + language_code='en-US', # Enable automatic punctuation - enable_automatic_punctuation=True, - ) + enable_automatic_punctuation=True) - response = client.recognize(request={"config": config, "audio": audio}) + response = client.recognize(config=config, audio=audio) for i, result in enumerate(response.results): alternative = result.alternatives[0] - print("-" * 20) - print("First alternative of result {}".format(i)) - print("Transcript: {}".format(alternative.transcript)) + print('-' * 20) + print(u'First alternative of result {}'.format(i)) + print(u'Transcript: {}'.format(alternative.transcript)) # [END speech_transcribe_auto_punctuation_beta] @@ -146,12 +139,11 @@ def transcribe_file_with_diarization(): """Transcribe the given audio file synchronously with diarization.""" # [START speech_transcribe_diarization_beta] from google.cloud import speech_v1p1beta1 as speech - client = speech.SpeechClient() - speech_file = "resources/commercial_mono.wav" + speech_file = 'resources/commercial_mono.wav' - with open(speech_file, "rb") as audio_file: + with open(speech_file, 'rb') as audio_file: content = audio_file.read() audio = speech.RecognitionAudio(content=content) @@ -159,13 +151,12 @@ def transcribe_file_with_diarization(): config = speech.RecognitionConfig( encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=8000, - language_code="en-US", + language_code='en-US', enable_speaker_diarization=True, - diarization_speaker_count=2, - ) + diarization_speaker_count=2) - print("Waiting for operation to complete...") - response = client.recognize(request={"config": config, "audio": audio}) + print('Waiting for operation to complete...') + response = client.recognize(config=config, audio=audio) # The transcript within each result is separate and sequential per result. 
# However, the words list within an alternative includes all the words @@ -177,7 +168,8 @@ def transcribe_file_with_diarization(): # Printing out the output: for word_info in words_info: - print(f"word: '{word_info.word}', speaker_tag: {word_info.speaker_tag}") + print(u"word: '{}', speaker_tag: {}".format( + word_info.word, word_info.speaker_tag)) # [END speech_transcribe_diarization_beta] @@ -186,12 +178,11 @@ def transcribe_file_with_multichannel(): multi channel.""" # [START speech_transcribe_multichannel_beta] from google.cloud import speech_v1p1beta1 as speech - client = speech.SpeechClient() - speech_file = "resources/Google_Gnome.wav" + speech_file = 'resources/Google_Gnome.wav' - with open(speech_file, "rb") as audio_file: + with open(speech_file, 'rb') as audio_file: content = audio_file.read() audio = speech.RecognitionAudio(content=content) @@ -199,19 +190,18 @@ def transcribe_file_with_multichannel(): config = speech.RecognitionConfig( encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=16000, - language_code="en-US", + language_code='en-US', audio_channel_count=1, - enable_separate_recognition_per_channel=True, - ) + enable_separate_recognition_per_channel=True) - response = client.recognize(request={"config": config, "audio": audio}) + response = client.recognize(config=config, audio=audio) for i, result in enumerate(response.results): alternative = result.alternatives[0] - print("-" * 20) - print("First alternative of result {}".format(i)) - print("Transcript: {}".format(alternative.transcript)) - print("Channel Tag: {}".format(result.channel_tag)) + print('-' * 20) + print('First alternative of result {}'.format(i)) + print(u'Transcript: {}'.format(alternative.transcript)) + print(u'Channel Tag: {}'.format(result.channel_tag)) # [END speech_transcribe_multichannel_beta] @@ -220,14 +210,13 @@ def transcribe_file_with_multilanguage(): multi language.""" # [START speech_transcribe_multilanguage_beta] from google.cloud import speech_v1p1beta1 as speech - client = speech.SpeechClient() - speech_file = "resources/multi.wav" - first_lang = "en-US" - second_lang = "es" + speech_file = 'resources/multi.wav' + first_lang = 'en-US' + second_lang = 'es' - with open(speech_file, "rb") as audio_file: + with open(speech_file, 'rb') as audio_file: content = audio_file.read() audio = speech.RecognitionAudio(content=content) @@ -237,17 +226,16 @@ def transcribe_file_with_multilanguage(): sample_rate_hertz=44100, audio_channel_count=2, language_code=first_lang, - alternative_language_codes=[second_lang], - ) + alternative_language_codes=[second_lang]) - print("Waiting for operation to complete...") - response = client.recognize(request={"config": config, "audio": audio}) + print('Waiting for operation to complete...') + response = client.recognize(config=config, audio=audio) for i, result in enumerate(response.results): alternative = result.alternatives[0] - print("-" * 20) - print("First alternative of result {}: {}".format(i, alternative)) - print("Transcript: {}".format(alternative.transcript)) + print('-' * 20) + print(u'First alternative of result {}: {}'.format(i, alternative)) + print(u'Transcript: {}'.format(alternative.transcript)) # [END speech_transcribe_multilanguage_beta] @@ -256,12 +244,11 @@ def transcribe_file_with_word_level_confidence(): word level confidence.""" # [START speech_transcribe_word_level_confidence_beta] from google.cloud import speech_v1p1beta1 as speech - client = speech.SpeechClient() - speech_file = "resources/Google_Gnome.wav" + 
speech_file = 'resources/Google_Gnome.wav' - with open(speech_file, "rb") as audio_file: + with open(speech_file, 'rb') as audio_file: content = audio_file.read() audio = speech.RecognitionAudio(content=content) @@ -269,44 +256,40 @@ def transcribe_file_with_word_level_confidence(): config = speech.RecognitionConfig( encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=16000, - language_code="en-US", - enable_word_confidence=True, - ) + language_code='en-US', + enable_word_confidence=True) - response = client.recognize(request={"config": config, "audio": audio}) + response = client.recognize(config=config, audio=audio) for i, result in enumerate(response.results): alternative = result.alternatives[0] - print("-" * 20) - print("First alternative of result {}".format(i)) - print("Transcript: {}".format(alternative.transcript)) - print( - "First Word and Confidence: ({}, {})".format( - alternative.words[0].word, alternative.words[0].confidence - ) - ) + print('-' * 20) + print('First alternative of result {}'.format(i)) + print(u'Transcript: {}'.format(alternative.transcript)) + print(u'First Word and Confidence: ({}, {})'.format( + alternative.words[0].word, alternative.words[0].confidence)) # [END speech_transcribe_word_level_confidence_beta] -if __name__ == "__main__": +if __name__ == '__main__': parser = argparse.ArgumentParser( - description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter - ) - parser.add_argument("command") + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('command') args = parser.parse_args() - if args.command == "enhanced-model": + if args.command == 'enhanced-model': transcribe_file_with_enhanced_model() - elif args.command == "metadata": + elif args.command == 'metadata': transcribe_file_with_metadata() - elif args.command == "punctuation": + elif args.command == 'punctuation': transcribe_file_with_auto_punctuation() - elif args.command == "diarization": + elif args.command == 'diarization': transcribe_file_with_diarization() - elif args.command == "multi-channel": + elif args.command == 'multi-channel': transcribe_file_with_multichannel() - elif args.command == "multi-language": + elif args.command == 'multi-language': transcribe_file_with_multilanguage() - elif args.command == "word-level-conf": + elif args.command == 'word-level-conf': transcribe_file_with_word_level_confidence() diff --git a/speech/snippets/quickstart.py b/speech/snippets/quickstart.py index ad0ab3275838..d050c68585d2 100644 --- a/speech/snippets/quickstart.py +++ b/speech/snippets/quickstart.py @@ -46,7 +46,7 @@ def run_quickstart(): ) # Detects speech in the audio file - response = client.recognize(request={"config": config, "audio": audio}) + response = client.recognize(config=config, audio=audio) for result in response.results: print("Transcript: {}".format(result.alternatives[0].transcript)) diff --git a/speech/snippets/requirements.txt b/speech/snippets/requirements.txt index 1601217e5387..1472200c199a 100644 --- a/speech/snippets/requirements.txt +++ b/speech/snippets/requirements.txt @@ -1 +1 @@ -google-cloud-speech==1.3.2 +google-cloud-speech==2.0.0 diff --git a/speech/snippets/speech_adaptation_beta.py b/speech/snippets/speech_adaptation_beta.py index 890bb8ed7284..1f3288ce2da5 100644 --- a/speech/snippets/speech_adaptation_beta.py +++ b/speech/snippets/speech_adaptation_beta.py @@ -25,7 +25,7 @@ # usage: python3 samples/v1p1beta1/speech_adaptation_beta.py [--storage_uri 
"gs://cloud-samples-data/speech/brooklyn_bridge.mp3"] [--phrase "Brooklyn Bridge"] # [START speech_adaptation_beta] -from google.cloud import speech_v1p1beta1 +from google.cloud import speech_v1p1beta1 as speech def sample_recognize(storage_uri, phrase): @@ -37,7 +37,7 @@ def sample_recognize(storage_uri, phrase): phrase Phrase "hints" help recognize the specified phrases from your audio. """ - client = speech_v1p1beta1.SpeechClient() + client = speech.SpeechClient() # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.mp3' # phrase = 'Brooklyn Bridge' @@ -60,8 +60,9 @@ def sample_recognize(storage_uri, phrase): language_code = "en-US" # Encoding of audio data sent. This sample sets this explicitly. - # This field is optional for FLAC and WAV audio formats. - encoding = speech_v1p1beta1.RecognitionConfig.AudioEncoding.MP3 + # This field is optional for FLAC and WAV audio formats + encoding = speech.RecognitionConfig.AudioEncoding.MP3 + config = { "speech_contexts": speech_contexts, "sample_rate_hertz": sample_rate_hertz, @@ -70,7 +71,8 @@ def sample_recognize(storage_uri, phrase): } audio = {"uri": storage_uri} - response = client.recognize(request={"config": config, "audio": audio}) + response = client.recognize(config=config, audio=audio) + for result in response.results: # First alternative is the most probable result alternative = result.alternatives[0] diff --git a/speech/snippets/speech_quickstart_beta.py b/speech/snippets/speech_quickstart_beta.py index ba1efab1a847..d40e6d32f1c8 100644 --- a/speech/snippets/speech_quickstart_beta.py +++ b/speech/snippets/speech_quickstart_beta.py @@ -25,7 +25,7 @@ # usage: python3 samples/v1p1beta1/speech_quickstart_beta.py [--storage_uri "gs://cloud-samples-data/speech/brooklyn_bridge.mp3"] # [START speech_quickstart_beta] -from google.cloud import speech_v1p1beta1 +from google.cloud import speech_v1p1beta1 as speech def sample_recognize(storage_uri): @@ -36,7 +36,7 @@ def sample_recognize(storage_uri): storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE] """ - client = speech_v1p1beta1.SpeechClient() + client = speech.SpeechClient() # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.mp3' @@ -48,7 +48,7 @@ def sample_recognize(storage_uri): # Encoding of audio data sent. This sample sets this explicitly. # This field is optional for FLAC and WAV audio formats. - encoding = speech_v1p1beta1.RecognitionConfig.AudioEncoding.MP3 + encoding = speech.RecognitionConfig.AudioEncoding.MP3 config = { "language_code": language_code, "sample_rate_hertz": sample_rate_hertz, @@ -56,7 +56,8 @@ def sample_recognize(storage_uri): } audio = {"uri": storage_uri} - response = client.recognize(request={"config": config, "audio": audio}) + response = client.recognize(config=config, audio=audio) + for result in response.results: # First alternative is the most probable result alternative = result.alternatives[0] diff --git a/speech/snippets/transcribe.py b/speech/snippets/transcribe.py index 2cd21ddc3194..9243c7963978 100644 --- a/speech/snippets/transcribe.py +++ b/speech/snippets/transcribe.py @@ -47,7 +47,8 @@ def transcribe_file(speech_file): # [END speech_python_migration_config] # [START speech_python_migration_sync_response] - response = client.recognize(request={"config": config, "audio": audio}) + response = client.recognize(config=config, audio=audio) + # [END speech_python_migration_sync_request] # Each result is for a consecutive portion of the audio. 
Iterate through # them to get the transcripts for the entire audio file. @@ -76,7 +77,8 @@ def transcribe_gcs(gcs_uri): ) # [END speech_python_migration_config_gcs] - response = client.recognize(request={"config": config, "audio": audio}) + response = client.recognize(config=config, audio=audio) + # Each result is for a consecutive portion of the audio. Iterate through # them to get the transcripts for the entire audio file. for result in response.results: diff --git a/speech/snippets/transcribe_async.py b/speech/snippets/transcribe_async.py index 789f2f36edc1..56c7fca13d8e 100644 --- a/speech/snippets/transcribe_async.py +++ b/speech/snippets/transcribe_async.py @@ -37,17 +37,23 @@ def transcribe_file(speech_file): with io.open(speech_file, "rb") as audio_file: content = audio_file.read() + """ + Note that transcription is limited to 60 seconds audio. + Use a GCS file for audio longer than 1 minute. + """ audio = speech.RecognitionAudio(content=content) + config = speech.RecognitionConfig( encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=16000, language_code="en-US", ) - # [START speech_python_migration_async_response] + # [START speech_python_migration_async_response operation = client.long_running_recognize( request={"config": config, "audio": audio} ) + operation = client.long_running_recognize(config=config, audio=audio) # [END speech_python_migration_async_request] print("Waiting for operation to complete...") @@ -83,6 +89,8 @@ def transcribe_gcs(gcs_uri): request={"config": config, "audio": audio} ) + operation = client.long_running_recognize(config=config, audio=audio) + print("Waiting for operation to complete...") response = operation.result(timeout=90) diff --git a/speech/snippets/transcribe_auto_punctuation.py b/speech/snippets/transcribe_auto_punctuation.py index 106de0f772a3..490a8d9a1822 100644 --- a/speech/snippets/transcribe_auto_punctuation.py +++ b/speech/snippets/transcribe_auto_punctuation.py @@ -45,7 +45,7 @@ def transcribe_file_with_auto_punctuation(path): enable_automatic_punctuation=True, ) - response = client.recognize(request={"config": config, "audio": audio}) + response = client.recognize(config=config, audio=audio) for i, result in enumerate(response.results): alternative = result.alternatives[0] diff --git a/speech/snippets/transcribe_context_classes.py b/speech/snippets/transcribe_context_classes.py index 69f40fd9c4ce..72b40507614f 100644 --- a/speech/snippets/transcribe_context_classes.py +++ b/speech/snippets/transcribe_context_classes.py @@ -28,7 +28,7 @@ def transcribe_context_classes(storage_uri): # https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext # Full list of supported phrases (class tokens) here: # https://cloud.google.com/speech-to-text/docs/class-tokens - speech_context = speech.SpeechContext(phrases=["$TIME"]) + speech_context = speech.SpeechContext(phrases=['$TIME']) # RecognitionConfig: to configure your encoding and sample_rate_hertz, see: # https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig @@ -39,7 +39,7 @@ def transcribe_context_classes(storage_uri): speech_contexts=[speech_context], ) - response = client.recognize(request={"config": config, "audio": audio}) + response = client.recognize(config=config, audio=audio) for i, result in enumerate(response.results): alternative = result.alternatives[0] diff --git a/speech/snippets/transcribe_enhanced_model.py b/speech/snippets/transcribe_enhanced_model.py index 
6b2862c7c55c..175040350faf 100644 --- a/speech/snippets/transcribe_enhanced_model.py +++ b/speech/snippets/transcribe_enhanced_model.py @@ -49,7 +49,7 @@ def transcribe_file_with_enhanced_model(path): model="phone_call", ) - response = client.recognize(request={"config": config, "audio": audio}) + response = client.recognize(config=config, audio=audio) for i, result in enumerate(response.results): alternative = result.alternatives[0] diff --git a/speech/snippets/transcribe_model_selection.py b/speech/snippets/transcribe_model_selection.py index a25fc1d51472..76db3c9cd731 100644 --- a/speech/snippets/transcribe_model_selection.py +++ b/speech/snippets/transcribe_model_selection.py @@ -47,7 +47,7 @@ def transcribe_model_selection(speech_file, model): model=model, ) - response = client.recognize(request={"config": config, "audio": audio}) + response = client.recognize(config=config, audio=audio) for i, result in enumerate(response.results): alternative = result.alternatives[0] @@ -76,9 +76,7 @@ def transcribe_model_selection_gcs(gcs_uri, model): model=model, ) - operation = client.long_running_recognize( - request={"config": config, "audio": audio} - ) + operation = client.long_running_recognize(config=config, audio=audio) print("Waiting for operation to complete...") response = operation.result(timeout=90) diff --git a/speech/snippets/transcribe_multichannel.py b/speech/snippets/transcribe_multichannel.py index c5b4d5de95c5..68fd013cffbd 100644 --- a/speech/snippets/transcribe_multichannel.py +++ b/speech/snippets/transcribe_multichannel.py @@ -46,7 +46,7 @@ def transcribe_file_with_multichannel(speech_file): enable_separate_recognition_per_channel=True, ) - response = client.recognize(request={"config": config, "audio": audio}) + response = client.recognize(config=config, audio=audio) for i, result in enumerate(response.results): alternative = result.alternatives[0] @@ -75,7 +75,7 @@ def transcribe_gcs_with_multichannel(gcs_uri): enable_separate_recognition_per_channel=True, ) - response = client.recognize(request={"config": config, "audio": audio}) + response = client.recognize(config=config, audio=audio) for i, result in enumerate(response.results): alternative = result.alternatives[0] diff --git a/speech/snippets/transcribe_streaming.py b/speech/snippets/transcribe_streaming.py index d3dc96e5db98..93243171a756 100644 --- a/speech/snippets/transcribe_streaming.py +++ b/speech/snippets/transcribe_streaming.py @@ -37,15 +37,16 @@ def transcribe_streaming(stream_file): # In practice, stream should be a generator yielding chunks of audio data. stream = [content] - requests = ( - speech.StreamingRecognizeRequest(audio_content=chunk) for chunk in stream - ) + + requests = (speech.StreamingRecognizeRequest(audio_content=chunk) + for chunk in stream) config = speech.RecognitionConfig( encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=16000, - language_code="en-US", - ) + + language_code='en-US') + streaming_config = speech.StreamingRecognitionConfig(config=config) # streaming_recognize returns a generator. 
diff --git a/speech/snippets/transcribe_word_time_offsets.py b/speech/snippets/transcribe_word_time_offsets.py index b49f2ecbe8f7..ced4baecf629 100644 --- a/speech/snippets/transcribe_word_time_offsets.py +++ b/speech/snippets/transcribe_word_time_offsets.py @@ -44,7 +44,7 @@ def transcribe_file_with_word_time_offsets(speech_file): enable_word_time_offsets=True, ) - response = client.recognize(request={"config": config, "audio": audio}) + response = client.recognize(config=config, audio=audio) for result in response.results: alternative = result.alternatives[0] @@ -54,6 +54,7 @@ def transcribe_file_with_word_time_offsets(speech_file): word = word_info.word start_time = word_info.start_time end_time = word_info.end_time + print( f"Word: {word}, start_time: {start_time.total_seconds()}, end_time: {end_time.total_seconds()}" ) @@ -75,9 +76,7 @@ def transcribe_gcs_with_word_time_offsets(gcs_uri): enable_word_time_offsets=True, ) - operation = client.long_running_recognize( - request={"config": config, "audio": audio} - ) + operation = client.long_running_recognize(config=config, audio=audio) print("Waiting for operation to complete...") result = operation.result(timeout=90) @@ -91,11 +90,11 @@ def transcribe_gcs_with_word_time_offsets(gcs_uri): word = word_info.word start_time = word_info.start_time end_time = word_info.end_time + print( f"Word: {word}, start_time: {start_time.total_seconds()}, end_time: {end_time.total_seconds()}" ) - # [END speech_transcribe_async_word_time_offsets_gcs]
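
Note on the migrated call pattern: with google-cloud-speech 2.0.0 the samples above pass the recognition config and audio as keyword arguments (client.recognize(config=config, audio=audio), client.long_running_recognize(config=config, audio=audio)) and call the streaming helper as client.streaming_recognize(streaming_config, requests). A minimal sketch of the synchronous pattern follows; the local file name and sample rate are illustrative only, not taken from this patch.

    import io

    from google.cloud import speech

    client = speech.SpeechClient()

    # Illustrative local file; any short LINEAR16 clip under one minute works here.
    with io.open("resources/audio.raw", "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )

    # In 2.0.0, config and audio are accepted as keyword arguments
    # (passing request={"config": config, "audio": audio} also works).
    response = client.recognize(config=config, audio=audio)
    for result in response.results:
        print(result.alternatives[0].transcript)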