diff --git a/speech/beta/pom.xml b/speech/beta/pom.xml new file mode 100644 index 00000000000..623af887158 --- /dev/null +++ b/speech/beta/pom.xml @@ -0,0 +1,139 @@ + + + 4.0.0 + com.example.speech + speech-google-cloud-samples + jar + + + + com.google.cloud.samples + shared-configuration + 1.0.10 + + + + 1.8 + 1.8 + UTF-8 + + + + + + com.google.cloud + google-cloud-speech + 0.56.0-beta + + + + + + junit + junit + 4.12 + test + + + com.google.truth + truth + 0.42 + test + + + + + + + maven-assembly-plugin + + + + com.example.speech.QuickstartSample + + + + jar-with-dependencies + + + + + + + + + Quickstart + + + Quickstart + + + + + + org.codehaus.mojo + exec-maven-plugin + 1.6.0 + + + + java + + + + + com.example.speech.QuickstartSample + false + + + + + + + + + Recognize + + + Recognize + + + + + + org.codehaus.mojo + exec-maven-plugin + 1.6.0 + + + + java + + + + + com.example.speech.Recognize + false + + + + + + + diff --git a/speech/beta/resources/Google_Gnome.wav b/speech/beta/resources/Google_Gnome.wav new file mode 100644 index 00000000000..2f497b7fbe7 Binary files /dev/null and b/speech/beta/resources/Google_Gnome.wav differ diff --git a/speech/beta/resources/audio.raw b/speech/beta/resources/audio.raw new file mode 100644 index 00000000000..5ebf79d3c9c Binary files /dev/null and b/speech/beta/resources/audio.raw differ diff --git a/speech/beta/resources/commercial_mono.wav b/speech/beta/resources/commercial_mono.wav new file mode 100644 index 00000000000..e6b9ed434f9 Binary files /dev/null and b/speech/beta/resources/commercial_mono.wav differ diff --git a/speech/beta/resources/commercial_stereo.wav b/speech/beta/resources/commercial_stereo.wav new file mode 100644 index 00000000000..467f3687702 Binary files /dev/null and b/speech/beta/resources/commercial_stereo.wav differ diff --git a/speech/beta/src/main/java/com/example/speech/Recognize.java b/speech/beta/src/main/java/com/example/speech/Recognize.java new file mode 100644 index 
00000000000..7c8aaccd74a --- /dev/null +++ b/speech/beta/src/main/java/com/example/speech/Recognize.java @@ -0,0 +1,487 @@ +/* + * Copyright 2018 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.example.speech; + +import com.google.api.gax.longrunning.OperationFuture; +import com.google.cloud.speech.v1p1beta1.LongRunningRecognizeMetadata; +import com.google.cloud.speech.v1p1beta1.LongRunningRecognizeResponse; +import com.google.cloud.speech.v1p1beta1.RecognitionAudio; +import com.google.cloud.speech.v1p1beta1.RecognitionConfig; +import com.google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding; +import com.google.cloud.speech.v1p1beta1.RecognitionMetadata; +import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.InteractionType; +import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.MicrophoneDistance; +import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.RecordingDeviceType; +import com.google.cloud.speech.v1p1beta1.RecognizeResponse; +import com.google.cloud.speech.v1p1beta1.SpeechClient; +import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative; +import com.google.cloud.speech.v1p1beta1.SpeechRecognitionResult; +import com.google.protobuf.ByteString; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; + +public class Recognize { + + /** Run speech recognition tasks. 
*/ + public static void main(String... args) throws Exception { + if (args.length < 1) { + System.out.println("Usage:"); + System.out.printf( + "\tjava %s \"\" \"\"\n" + + "Commands:\n" + + "\t metadata | diarization | multi-channel |\n" + + "\t multi-language | word-level-conf\n" + + "Path:\n\tA file path (ex: ./resources/audio.raw) or a URI " + + "for a Cloud Storage resource (gs://...)\n", + Recognize.class.getCanonicalName()); + return; + } + String command = args[0]; + String path = args.length > 1 ? args[1] : ""; + + // Use command and GCS path pattern to invoke transcription. + if (command.equals("metadata")) { + transcribeFileWithMetadata(path); + } else if (command.equals("diarization")) { + if (path.startsWith("gs://")) { + transcribeDiarizationGcs(path); + } else { + transcribeDiarization(path); + } + } else if (command.equals("multi-channel")) { + if (path.startsWith("gs://")) { + transcribeMultiChannelGcs(path); + } else { + transcribeMultiChannel(path); + } + } else if (command.equals("multi-language")) { + if (path.startsWith("gs://")) { + transcribeMultiLanguageGcs(path); + } else { + transcribeMultiLanguage(path); + } + } else if (command.equals("word-level-conf")) { + if (path.startsWith("gs://")) { + transcribeWordLevelConfidenceGcs(path); + } else { + transcribeWordLevelConfidence(path); + } + } + } + + // [START speech_transcribe_recognition_metadata_beta] + /** + * Transcribe the given audio file and include recognition metadata in the request. + * + * @param fileName the path to an audio file. + */ + public static void transcribeFileWithMetadata(String fileName) throws Exception { + Path path = Paths.get(fileName); + byte[] content = Files.readAllBytes(path); + + try (SpeechClient speechClient = SpeechClient.create()) { + // Get the contents of the local audio file + RecognitionAudio recognitionAudio = + RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build(); + + // Construct a recognition metadata object. 
+ // Most metadata fields are specified as enums that can be found + // in speech.enums.RecognitionMetadata + RecognitionMetadata metadata = + RecognitionMetadata.newBuilder() + .setInteractionType(InteractionType.DISCUSSION) + .setMicrophoneDistance(MicrophoneDistance.NEARFIELD) + .setRecordingDeviceType(RecordingDeviceType.SMARTPHONE) + .setRecordingDeviceName("Pixel 2 XL") // Some metadata fields are free form strings + // And some are integers, for instance the 6 digit NAICS code + // https://www.naics.com/search/ + .setIndustryNaicsCodeOfAudio(519190) + .build(); + + // Configure request to enable enhanced models + RecognitionConfig config = + RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.LINEAR16) + .setLanguageCode("en-US") + .setSampleRateHertz(8000) + .setMetadata(metadata) // Add the metadata to the config + .build(); + + // Perform the transcription request + RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio); + + // Print out the results + for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) { + // There can be several alternative transcripts for a given chunk of speech. Just use the + // first (most likely) one here. + SpeechRecognitionAlternative alternative = result.getAlternatives(0); + System.out.format("Transcript: %s\n\n", alternative.getTranscript()); + } + } + } + // [END speech_transcribe_recognition_metadata_beta] + + // [START speech_transcribe_diarization_beta] + /** + * Transcribe the given audio file using speaker diarization. + * + * @param fileName the path to an audio file. 
+ */ + public static void transcribeDiarization(String fileName) throws Exception { + Path path = Paths.get(fileName); + byte[] content = Files.readAllBytes(path); + + try (SpeechClient speechClient = SpeechClient.create()) { + // Get the contents of the local audio file + RecognitionAudio recognitionAudio = + RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build(); + + // Configure request to enable Speaker diarization + RecognitionConfig config = + RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.LINEAR16) + .setLanguageCode("en-US") + .setSampleRateHertz(8000) + .setEnableSpeakerDiarization(true) + .setDiarizationSpeakerCount(2) + .build(); + + // Perform the transcription request + RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio); + + // Print out the results + for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) { + // There can be several alternative transcripts for a given chunk of speech. Just + // use the first (most likely) one here. + SpeechRecognitionAlternative alternative = result.getAlternatives(0); + System.out.format("Transcript : %s\n", alternative.getTranscript()); + // The words array contains the entire transcript up until that point. + // Referencing the last spoken word to get the associated Speaker tag + System.out.format( + "Speaker Tag %s: %s\n", + alternative.getWords((alternative.getWordsCount() - 1)).getSpeakerTag(), + alternative.getTranscript()); + } + } + } + // [END speech_transcribe_diarization_beta] + + // [START speech_transcribe_diarization_gcs_beta] + /** + * Transcribe a remote audio file using speaker diarization. + * + * @param gcsUri the path to an audio file. 
+ */ + public static void transcribeDiarizationGcs(String gcsUri) throws Exception { + try (SpeechClient speechClient = SpeechClient.create()) { + // Configure request to enable Speaker diarization + RecognitionConfig config = + RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.LINEAR16) + .setLanguageCode("en-US") + .setSampleRateHertz(8000) + .setEnableSpeakerDiarization(true) + .setDiarizationSpeakerCount(2) + .build(); + + // Set the remote path for the audio file + RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build(); + + // Use non-blocking call for getting file transcription + OperationFuture response = + speechClient.longRunningRecognizeAsync(config, audio); + + while (!response.isDone()) { + System.out.println("Waiting for response..."); + Thread.sleep(10000); + } + + for (SpeechRecognitionResult result : response.get().getResultsList()) { + // There can be several alternative transcripts for a given chunk of speech. Just + // use the first (most likely) one here. + SpeechRecognitionAlternative alternative = result.getAlternatives(0); + // The words array contains the entire transcript up until that point. 
+ // Referencing the last spoken word to get the associated Speaker tag + System.out.format( + "Speaker Tag %s:%s\n", + alternative.getWords((alternative.getWordsCount() - 1)).getSpeakerTag(), + alternative.getTranscript()); + } + } + } + // [END speech_transcribe_diarization_gcs_beta] + + // [START speech_transcribe_multichannel_beta] + /** + * Transcribe a local audio file with multi-channel recognition + * + * @param fileName the path to local audio file + */ + public static void transcribeMultiChannel(String fileName) throws Exception { + Path path = Paths.get(fileName); + byte[] content = Files.readAllBytes(path); + + try (SpeechClient speechClient = SpeechClient.create()) { + // Get the contents of the local audio file + RecognitionAudio recognitionAudio = + RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build(); + + // Configure request to enable multiple channels + RecognitionConfig config = + RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.LINEAR16) + .setLanguageCode("en-US") + .setSampleRateHertz(44100) + .setAudioChannelCount(2) + .setEnableSeparateRecognitionPerChannel(true) + .build(); + + // Perform the transcription request + RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio); + + // Print out the results + for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) { + // There can be several alternative transcripts for a given chunk of speech. Just use the + // first (most likely) one here. 
+ SpeechRecognitionAlternative alternative = result.getAlternatives(0); + System.out.format("Transcript : %s\n", alternative.getTranscript()); + System.out.printf("Channel Tag : %s\n\n", result.getChannelTag()); + } + } + } + // [END speech_transcribe_multichannel_beta] + + // [START speech_transcribe_multichannel_gcs_beta] + /** + * Transcribe a remote audio file with multi-channel recognition + * + * @param gcsUri the path to the audio file + */ + public static void transcribeMultiChannelGcs(String gcsUri) throws Exception { + + try (SpeechClient speechClient = SpeechClient.create()) { + + // Configure request to enable multiple channels + RecognitionConfig config = + RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.LINEAR16) + .setLanguageCode("en-US") + .setSampleRateHertz(44100) + .setAudioChannelCount(2) + .setEnableSeparateRecognitionPerChannel(true) + .build(); + + // Set the remote path for the audio file + RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build(); + + // Use non-blocking call for getting file transcription + OperationFuture response = + speechClient.longRunningRecognizeAsync(config, audio); + + while (!response.isDone()) { + System.out.println("Waiting for response..."); + Thread.sleep(10000); + } + // Just print the first result here. + for (SpeechRecognitionResult result : response.get().getResultsList()) { + + // There can be several alternative transcripts for a given chunk of speech. Just use the + // first (most likely) one here. 
+ SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); + + // Print out the result + System.out.printf("Transcript : %s\n", alternative.getTranscript()); + System.out.printf("Channel Tag : %s\n\n", result.getChannelTag()); + } + } + } + // [END speech_transcribe_multichannel_gcs_beta] + + // [START speech_transcribe_multilanguage_beta] + /** + * Transcribe a local audio file with multi-language recognition + * + * @param fileName the path to the audio file + */ + public static void transcribeMultiLanguage(String fileName) throws Exception { + Path path = Paths.get(fileName); + // Get the contents of the local audio file + byte[] content = Files.readAllBytes(path); + + try (SpeechClient speechClient = SpeechClient.create()) { + + RecognitionAudio recognitionAudio = + RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build(); + ArrayList languageList = new ArrayList<>(); + languageList.add("es-ES"); + languageList.add("en-US"); + + // Configure request to enable multiple languages + RecognitionConfig config = + RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.LINEAR16) + .setSampleRateHertz(16000) + .setLanguageCode("ja-JP") + .addAllAlternativeLanguageCodes(languageList) + .build(); + // Perform the transcription request + RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio); + + // Print out the results + for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) { + // There can be several alternative transcripts for a given chunk of speech. Just use the + // first (most likely) one here. 
+ SpeechRecognitionAlternative alternative = result.getAlternatives(0); + System.out.format("Transcript : %s\n\n", alternative.getTranscript()); + } + } + } + // [END speech_transcribe_multilanguage_beta] + + // [START speech_transcribe_multilanguage_gcs_beta] + /** + * Transcribe a remote audio file with multi-language recognition + * + * @param gcsUri the path to the remote audio file + */ + public static void transcribeMultiLanguageGcs(String gcsUri) throws Exception { + try (SpeechClient speechClient = SpeechClient.create()) { + + ArrayList languageList = new ArrayList<>(); + languageList.add("es-ES"); + languageList.add("en-US"); + + // Configure request to enable multiple languages + RecognitionConfig config = + RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.LINEAR16) + .setSampleRateHertz(16000) + .setLanguageCode("ja-JP") + .addAllAlternativeLanguageCodes(languageList) + .build(); + + // Set the remote path for the audio file + RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build(); + + // Use non-blocking call for getting file transcription + OperationFuture response = + speechClient.longRunningRecognizeAsync(config, audio); + + while (!response.isDone()) { + System.out.println("Waiting for response..."); + Thread.sleep(10000); + } + + for (SpeechRecognitionResult result : response.get().getResultsList()) { + + // There can be several alternative transcripts for a given chunk of speech. Just use the + // first (most likely) one here. 
+ SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); + + // Print out the result + System.out.printf("Transcript : %s\n\n", alternative.getTranscript()); + } + } + } + // [END speech_transcribe_multilanguage_gcs_beta] + + // [START speech_transcribe_word_level_confidence_beta] + /** + * Transcribe a local audio file with word level confidence + * + * @param fileName the path to the local audio file + */ + public static void transcribeWordLevelConfidence(String fileName) throws Exception { + Path path = Paths.get(fileName); + byte[] content = Files.readAllBytes(path); + + try (SpeechClient speechClient = SpeechClient.create()) { + RecognitionAudio recognitionAudio = + RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build(); + // Configure request to enable word level confidence + RecognitionConfig config = + RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.LINEAR16) + .setSampleRateHertz(16000) + .setLanguageCode("en-US") + .setEnableWordConfidence(true) + .build(); + // Perform the transcription request + RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio); + + // Print out the results + for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) { + // There can be several alternative transcripts for a given chunk of speech. Just use the + // first (most likely) one here. 
+ SpeechRecognitionAlternative alternative = result.getAlternatives(0); + System.out.format("Transcript : %s\n", alternative.getTranscript()); + System.out.format( + "First Word and Confidence : %s %s \n", + alternative.getWords(0).getWord(), alternative.getWords(0).getConfidence()); + } + } + } + // [END speech_transcribe_word_level_confidence_beta] + + // [START speech_transcribe_word_level_confidence_gcs_beta] + /** + * Transcribe a remote audio file with word level confidence + * + * @param gcsUri path to the remote audio file + */ + public static void transcribeWordLevelConfidenceGcs(String gcsUri) throws Exception { + try (SpeechClient speechClient = SpeechClient.create()) { + + // Configure request to enable word level confidence + RecognitionConfig config = + RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.FLAC) + .setSampleRateHertz(16000) + .setLanguageCode("en-US") + .setEnableWordConfidence(true) + .build(); + + // Set the remote path for the audio file + RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build(); + + // Use non-blocking call for getting file transcription + OperationFuture response = + speechClient.longRunningRecognizeAsync(config, audio); + + while (!response.isDone()) { + System.out.println("Waiting for response..."); + Thread.sleep(10000); + } + // Just print the first result here. + SpeechRecognitionResult result = response.get().getResultsList().get(0); + + // There can be several alternative transcripts for a given chunk of speech. Just use the + // first (most likely) one here. 
+ SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); + // Print out the result + System.out.printf("Transcript : %s\n", alternative.getTranscript()); + System.out.format( + "First Word and Confidence : %s %s \n", + alternative.getWords(0).getWord(), alternative.getWords(0).getConfidence()); + } + } + // [END speech_transcribe_word_level_confidence_gcs_beta] +} diff --git a/speech/beta/src/test/java/com/example/speech/RecognizeIT.java b/speech/beta/src/test/java/com/example/speech/RecognizeIT.java new file mode 100644 index 00000000000..5219b58dc79 --- /dev/null +++ b/speech/beta/src/test/java/com/example/speech/RecognizeIT.java @@ -0,0 +1,127 @@ +/* + * Copyright 2018 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.example.speech; + +import static com.google.common.truth.Truth.assertThat; + +import java.io.ByteArrayOutputStream; +import java.io.PrintStream; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Tests for speech recognize sample. 
*/ +@RunWith(JUnit4.class) +@SuppressWarnings("checkstyle:abbreviationaswordinname") +public class RecognizeIT { + private static final String BUCKET = "cloud-samples-tests"; + + private ByteArrayOutputStream bout; + private PrintStream out; + + // The path to the audio file to transcribe + private String audioFileName = "./resources/audio.raw"; + private String multiChannelAudioFileName = "./resources/commercial_stereo.wav"; + private String gcsMultiChannelAudioPath = "gs://" + BUCKET + "/speech/commercial_stereo.wav"; + private String gcsAudioPath = "gs://" + BUCKET + "/speech/brooklyn.flac"; + private String gcsDiarizationAudioPath = "gs://" + BUCKET + "/speech/commercial_mono.wav"; + + // The path to the video file to transcribe + private String videoFileName = "./resources/Google_Gnome.wav"; + private String gcsVideoPath = "gs://" + BUCKET + "/speech/Google_Gnome.wav"; + + private String recognitionAudioFile = "./resources/commercial_mono.wav"; + + @Before + public void setUp() { + bout = new ByteArrayOutputStream(); + out = new PrintStream(bout); + System.setOut(out); + } + + @After + public void tearDown() { + System.setOut(null); + } + + @Test + public void testMetadata() throws Exception { + Recognize.transcribeFileWithMetadata(recognitionAudioFile); + String got = bout.toString(); + assertThat(got).contains("Chrome"); + } + + @Test + public void testTranscribeDiarization() throws Exception { + Recognize.transcribeDiarization(recognitionAudioFile); + String got = bout.toString(); + assertThat(got).contains("Speaker Tag 2:"); + } + + @Test + public void testTranscribeDiarizationGcs() throws Exception { + Recognize.transcribeDiarizationGcs(gcsDiarizationAudioPath); + String got = bout.toString(); + assertThat(got).contains("Speaker Tag 2:"); + } + + @Test + public void testTranscribeMultiChannel() throws Exception { + Recognize.transcribeMultiChannel(multiChannelAudioFileName); + String got = bout.toString(); + assertThat(got).contains("Channel Tag : 1"); + 
} + + @Test + public void testTranscribeMultiChannelGcs() throws Exception { + Recognize.transcribeMultiChannelGcs(gcsMultiChannelAudioPath); + String got = bout.toString(); + assertThat(got).contains("Channel Tag : 1"); + } + + @Test + public void testTranscribeMultiLanguage() throws Exception { + Recognize.transcribeMultiLanguage(videoFileName); + String got = bout.toString(); + assertThat(got).contains("Transcript : OK Google"); + } + + @Test + public void testTranscribeMultiLanguageGcs() throws Exception { + Recognize.transcribeMultiLanguageGcs(gcsVideoPath); + String got = bout.toString(); + assertThat(got).contains("Transcript : OK Google"); + } + + @Test + public void testTranscribeWordLevelConfidence() throws Exception { + Recognize.transcribeWordLevelConfidence(audioFileName); + String got = bout.toString(); + assertThat(got).contains("Transcript : how old is the Brooklyn Bridge"); + assertThat(got).contains("First Word and Confidence : how"); + } + + @Test + public void testTranscribeWordLevelConfidenceGcs() throws Exception { + Recognize.transcribeWordLevelConfidenceGcs(gcsAudioPath); + String got = bout.toString(); + assertThat(got).contains("Transcript : how old is the Brooklyn Bridge"); + assertThat(got).contains("First Word and Confidence : how"); + } +} diff --git a/speech/cloud-client/pom.xml b/speech/cloud-client/pom.xml index 68cd05bd13b..d926b40db03 100644 --- a/speech/cloud-client/pom.xml +++ b/speech/cloud-client/pom.xml @@ -40,7 +40,7 @@ com.google.cloud google-cloud-speech - 0.56.0-beta + 0.61.0-beta diff --git a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java index 3d6bfaecf29..de05ad84185 100644 --- a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java +++ b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java @@ -22,24 +22,20 @@ import com.google.api.gax.rpc.ClientStream; import com.google.api.gax.rpc.ResponseObserver; 
import com.google.api.gax.rpc.StreamController; -import com.google.cloud.speech.v1p1beta1.LongRunningRecognizeMetadata; -import com.google.cloud.speech.v1p1beta1.LongRunningRecognizeResponse; -import com.google.cloud.speech.v1p1beta1.RecognitionAudio; -import com.google.cloud.speech.v1p1beta1.RecognitionConfig; -import com.google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding; -import com.google.cloud.speech.v1p1beta1.RecognitionMetadata; -import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.InteractionType; -import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.MicrophoneDistance; -import com.google.cloud.speech.v1p1beta1.RecognitionMetadata.RecordingDeviceType; -import com.google.cloud.speech.v1p1beta1.RecognizeResponse; -import com.google.cloud.speech.v1p1beta1.SpeechClient; -import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative; -import com.google.cloud.speech.v1p1beta1.SpeechRecognitionResult; -import com.google.cloud.speech.v1p1beta1.StreamingRecognitionConfig; -import com.google.cloud.speech.v1p1beta1.StreamingRecognitionResult; -import com.google.cloud.speech.v1p1beta1.StreamingRecognizeRequest; -import com.google.cloud.speech.v1p1beta1.StreamingRecognizeResponse; -import com.google.cloud.speech.v1p1beta1.WordInfo; +import com.google.cloud.speech.v1.LongRunningRecognizeMetadata; +import com.google.cloud.speech.v1.LongRunningRecognizeResponse; +import com.google.cloud.speech.v1.RecognitionAudio; +import com.google.cloud.speech.v1.RecognitionConfig; +import com.google.cloud.speech.v1.RecognitionConfig.AudioEncoding; +import com.google.cloud.speech.v1.RecognizeResponse; +import com.google.cloud.speech.v1.SpeechClient; +import com.google.cloud.speech.v1.SpeechRecognitionAlternative; +import com.google.cloud.speech.v1.SpeechRecognitionResult; +import com.google.cloud.speech.v1.StreamingRecognitionConfig; +import com.google.cloud.speech.v1.StreamingRecognitionResult; +import 
com.google.cloud.speech.v1.StreamingRecognizeRequest; +import com.google.cloud.speech.v1.StreamingRecognizeResponse; +import com.google.cloud.speech.v1.WordInfo; import com.google.common.util.concurrent.SettableFuture; import com.google.protobuf.ByteString; @@ -67,9 +63,8 @@ public static void main(String... args) throws Exception { "\tjava %s \"\" \"\"\n" + "Commands:\n" + "\tsyncrecognize | asyncrecognize | streamrecognize | micstreamrecognize \n" - + "\t| wordoffsets | model-selection | auto-punctuation | stream-punctuation \n" - + "\t| enhanced-model| metadata | diarization | multi-channel | multi-language \n" - + "\t | word-level-conf" + + "\t| wordoffsets | auto-punctuation | stream-punctuation \n" + + "\t| enhanced-model | model-selection\n" + "Path:\n\tA file path (ex: ./resources/audio.raw) or a URI " + "for a Cloud Storage resource (gs://...)\n", Recognize.class.getCanonicalName()); @@ -101,12 +96,6 @@ public static void main(String... args) throws Exception { streamingRecognizeFile(path); } else if (command.equals("micstreamrecognize")) { streamingMicRecognize(); - } else if (command.equals("model-selection")) { - if (path.startsWith("gs://")) { - transcribeModelSelectionGcs(path); - } else { - transcribeModelSelection(path); - } } else if (command.equals("auto-punctuation")) { if (path.startsWith("gs://")) { transcribeGcsWithAutomaticPunctuation(path); @@ -117,31 +106,11 @@ public static void main(String... 
args) throws Exception { streamingTranscribeWithAutomaticPunctuation(path); } else if (command.equals("enhanced-model")) { transcribeFileWithEnhancedModel(path); - } else if (command.equals("metadata")) { - transcribeFileWithMetadata(path); - } else if (command.equals("diarization")) { - if (path.startsWith("gs://")) { - transcribeDiarizationGcs(path); - } else { - transcribeDiarization(path); - } - } else if (command.equals("multi-channel")) { - if (path.startsWith("gs://")) { - transcribeMultiChannelGcs(path); - } else { - transcribeMultiChannel(path); - } - } else if (command.equals("multi-language")) { - if (path.startsWith("gs://")) { - transcribeMultiLanguageGcs(path); - } else { - transcribeMultiLanguage(path); - } - } else if (command.equals("word-level-conf")) { + } else if (command.equals("model-selection")) { if (path.startsWith("gs://")) { - transcribeWordLevelConfidenceGcs(path); + transcribeModelSelectionGcs(path); } else { - transcribeWordLevelConfidence(path); + transcribeModelSelection(path); } } } @@ -477,87 +446,6 @@ public SettableFuture> future() { } // [END speech_transcribe_streaming] - // [START speech_transcribe_model_selection_beta] - /** - * Performs transcription of the given audio file synchronously with the selected model. 
- * - * @param fileName the path to a audio file to transcribe - */ - public static void transcribeModelSelection(String fileName) throws Exception { - Path path = Paths.get(fileName); - byte[] content = Files.readAllBytes(path); - - try (SpeechClient speech = SpeechClient.create()) { - // Configure request with video media type - RecognitionConfig recConfig = - RecognitionConfig.newBuilder() - // encoding may either be omitted or must match the value in the file header - .setEncoding(AudioEncoding.LINEAR16) - .setLanguageCode("en-US") - // sample rate hertz may be either be omitted or must match the value in the file - // header - .setSampleRateHertz(16000) - .setModel("video") - .build(); - - RecognitionAudio recognitionAudio = - RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build(); - - RecognizeResponse recognizeResponse = speech.recognize(recConfig, recognitionAudio); - // Just print the first result here. - SpeechRecognitionResult result = recognizeResponse.getResultsList().get(0); - // There can be several alternative transcripts for a given chunk of speech. Just use the - // first (most likely) one here. - SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); - System.out.printf("Transcript : %s\n", alternative.getTranscript()); - } - // [END speech_transcribe_model_selection_beta] - } - - // [START speech_transcribe_model_selection_gcs_beta] - /** - * Performs transcription of the remote audio file asynchronously with the selected model. - * - * @param gcsUri the path to the remote audio file to transcribe. 
- */ - public static void transcribeModelSelectionGcs(String gcsUri) throws Exception { - try (SpeechClient speech = SpeechClient.create()) { - - // Configure request with video media type - RecognitionConfig config = - RecognitionConfig.newBuilder() - // encoding may either be omitted or must match the value in the file header - .setEncoding(AudioEncoding.LINEAR16) - .setLanguageCode("en-US") - // sample rate hertz may be either be omitted or must match the value in the file - // header - .setSampleRateHertz(16000) - .setModel("video") - .build(); - - RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build(); - - // Use non-blocking call for getting file transcription - OperationFuture response = - speech.longRunningRecognizeAsync(config, audio); - - while (!response.isDone()) { - System.out.println("Waiting for response..."); - Thread.sleep(10000); - } - - List results = response.get().getResultsList(); - - // Just print the first result here. - SpeechRecognitionResult result = results.get(0); - // There can be several alternative transcripts for a given chunk of speech. Just use the - // first (most likely) one here. - SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); - System.out.printf("Transcript : %s\n", alternative.getTranscript()); - } - // [END speech_transcribe_model_selection_gcs_beta] - } - // [START speech_sync_recognize_punctuation] /** * Performs transcription with automatic punctuation on raw PCM audio data. @@ -598,7 +486,7 @@ public static void transcribeFileWithAutomaticPunctuation(String fileName) throw } // [END speech_sync_recognize_punctuation] - // [START speech_transcribe_auto_punctuation_beta] + // [START speech_transcribe_auto_punctuation] /** * Performs transcription on remote FLAC file and prints the transcription. 
* @@ -638,7 +526,7 @@ public static void transcribeGcsWithAutomaticPunctuation(String gcsUri) throws E System.out.printf("Transcript : %s\n", alternative.getTranscript()); } } - // [END speech_transcribe_auto_punctuation_beta] + // [END speech_transcribe_auto_punctuation] // [START speech_stream_recognize_punctuation] /** @@ -820,7 +708,7 @@ public void onError(Throwable t) { } // [END speech_transcribe_streaming_mic] - // [START speech_transcribe_enhanced_model_beta] + // [START speech_transcribe_enhanced_model] /** * Transcribe the given audio file using an enhanced model. * @@ -860,410 +748,86 @@ public static void transcribeFileWithEnhancedModel(String fileName) throws Excep } } } - // [END speech_transcribe_enhanced_model_beta] + // [END speech_transcribe_enhanced_model] - // [START speech_transcribe_recognition_metadata_beta] + // [START speech_transcribe_model_selection] /** - * Transcribe the given audio file and include recognition metadata in the request. - * - * @param fileName the path to an audio file. - */ - public static void transcribeFileWithMetadata(String fileName) throws Exception { - Path path = Paths.get(fileName); - byte[] content = Files.readAllBytes(path); - - try (SpeechClient speechClient = SpeechClient.create()) { - // Get the contents of the local audio file - RecognitionAudio recognitionAudio = - RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build(); - - // Construct a recognition metadata object. 
- // Most metadata fields are specified as enums that can be found - // in speech.enums.RecognitionMetadata - RecognitionMetadata metadata = - RecognitionMetadata.newBuilder() - .setInteractionType(InteractionType.DISCUSSION) - .setMicrophoneDistance(MicrophoneDistance.NEARFIELD) - .setRecordingDeviceType(RecordingDeviceType.SMARTPHONE) - .setRecordingDeviceName("Pixel 2 XL") // Some metadata fields are free form strings - // And some are integers, for instance the 6 digit NAICS code - // https://www.naics.com/search/ - .setIndustryNaicsCodeOfAudio(519190) - .build(); - - // Configure request to enable enhanced models - RecognitionConfig config = - RecognitionConfig.newBuilder() - .setEncoding(AudioEncoding.LINEAR16) - .setLanguageCode("en-US") - .setSampleRateHertz(8000) - .setMetadata(metadata) // Add the metadata to the config - .build(); - - // Perform the transcription request - RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio); - - // Print out the results - for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) { - // There can be several alternative transcripts for a given chunk of speech. Just use the - // first (most likely) one here. - SpeechRecognitionAlternative alternative = result.getAlternatives(0); - System.out.format("Transcript: %s\n\n", alternative.getTranscript()); - } - } - } - // [END speech_transcribe_recognition_metadata_beta] - - // [START speech_transcribe_diarization_beta] - /** - * Transcribe the given audio file using speaker diarization. + * Performs transcription of the given audio file synchronously with the selected model. * - * @param fileName the path to an audio file. 
+ * @param fileName the path to a audio file to transcribe */ - public static void transcribeDiarization(String fileName) throws Exception { + public static void transcribeModelSelection(String fileName) throws Exception { Path path = Paths.get(fileName); byte[] content = Files.readAllBytes(path); - try (SpeechClient speechClient = SpeechClient.create()) { - // Get the contents of the local audio file - RecognitionAudio recognitionAudio = - RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build(); - - // Configure request to enable Speaker diarization - RecognitionConfig config = - RecognitionConfig.newBuilder() - .setEncoding(AudioEncoding.LINEAR16) - .setLanguageCode("en-US") - .setSampleRateHertz(8000) - .setEnableSpeakerDiarization(true) - .setDiarizationSpeakerCount(2) - .build(); - - // Perform the transcription request - RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio); - - // Print out the results - for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) { - // There can be several alternative transcripts for a given chunk of speech. Just - // use the first (most likely) one here. - SpeechRecognitionAlternative alternative = result.getAlternatives(0); - System.out.format("Transcript : %s\n", alternative.getTranscript()); - // The words array contains the entire transcript up until that point. - // Referencing the last spoken word to get the associated Speaker tag - System.out.format( - "Speaker Tag %s: %s\n", - alternative.getWords((alternative.getWordsCount() - 1)).getSpeakerTag(), - alternative.getTranscript()); - } - } - } - // [END speech_transcribe_diarization_beta] - - // [START speech_transcribe_diarization_gcs_beta] - /** - * Transcribe a remote audio file using speaker diarization. - * - * @param gcsUri the path to an audio file. 
- */ - public static void transcribeDiarizationGcs(String gcsUri) throws Exception { - try (SpeechClient speechClient = SpeechClient.create()) { - // Configure request to enable Speaker diarization - RecognitionConfig config = + try (SpeechClient speech = SpeechClient.create()) { + // Configure request with video media type + RecognitionConfig recConfig = RecognitionConfig.newBuilder() + // encoding may either be omitted or must match the value in the file header .setEncoding(AudioEncoding.LINEAR16) .setLanguageCode("en-US") - .setSampleRateHertz(8000) - .setEnableSpeakerDiarization(true) - .setDiarizationSpeakerCount(2) + // sample rate hertz may be either be omitted or must match the value in the file + // header + .setSampleRateHertz(16000) + .setModel("video") .build(); - // Set the remote path for the audio file - RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build(); - - // Use non-blocking call for getting file transcription - OperationFuture response = - speechClient.longRunningRecognizeAsync(config, audio); - - while (!response.isDone()) { - System.out.println("Waiting for response..."); - Thread.sleep(10000); - } - - for (SpeechRecognitionResult result : response.get().getResultsList()) { - // There can be several alternative transcripts for a given chunk of speech. Just - // use the first (most likely) one here. - SpeechRecognitionAlternative alternative = result.getAlternatives(0); - // The words array contains the entire transcript up until that point. 
- // Referencing the last spoken word to get the associated Speaker tag - System.out.format( - "Speaker Tag %s:%s\n", - alternative.getWords((alternative.getWordsCount() - 1)).getSpeakerTag(), - alternative.getTranscript()); - } - } - } - - // [END speech_transcribe_diarization_gcs_beta] - - // [START speech_transcribe_multichannel_beta] - - /** - * Transcribe a local audio file with multi-channel recognition - * - * @param fileName the path to local audio file - */ - public static void transcribeMultiChannel(String fileName) throws Exception { - Path path = Paths.get(fileName); - byte[] content = Files.readAllBytes(path); - - try (SpeechClient speechClient = SpeechClient.create()) { - // Get the contents of the local audio file RecognitionAudio recognitionAudio = RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build(); - // Configure request to enable multiple channels - RecognitionConfig config = - RecognitionConfig.newBuilder() - .setEncoding(AudioEncoding.LINEAR16) - .setLanguageCode("en-US") - .setSampleRateHertz(44100) - .setAudioChannelCount(2) - .setEnableSeparateRecognitionPerChannel(true) - .build(); - - // Perform the transcription request - RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio); - - // Print out the results - for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) { - // There can be several alternative transcripts for a given chunk of speech. Just use the - // first (most likely) one here. 
- SpeechRecognitionAlternative alternative = result.getAlternatives(0); - System.out.format("Transcript : %s\n", alternative.getTranscript()); - System.out.printf("Channel Tag : %s\n\n", result.getChannelTag()); - } - } - } - // [END speech_transcribe_multichannel_beta] - - // [START speech_transcribe_multichannel_gcs_beta] - - /** - * Transcribe a remote audio file with multi-channel recognition - * - * @param gcsUri the path to the audio file - */ - public static void transcribeMultiChannelGcs(String gcsUri) throws Exception { - - try (SpeechClient speechClient = SpeechClient.create()) { - - // Configure request to enable multiple channels - RecognitionConfig config = - RecognitionConfig.newBuilder() - .setEncoding(AudioEncoding.LINEAR16) - .setLanguageCode("en-US") - .setSampleRateHertz(44100) - .setAudioChannelCount(2) - .setEnableSeparateRecognitionPerChannel(true) - .build(); - - // Set the remote path for the audio file - RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build(); - - // Use non-blocking call for getting file transcription - OperationFuture response = - speechClient.longRunningRecognizeAsync(config, audio); - - while (!response.isDone()) { - System.out.println("Waiting for response..."); - Thread.sleep(10000); - } + RecognizeResponse recognizeResponse = speech.recognize(recConfig, recognitionAudio); // Just print the first result here. - for (SpeechRecognitionResult result : response.get().getResultsList()) { - - // There can be several alternative transcripts for a given chunk of speech. Just use the - // first (most likely) one here. 
- SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); - - // Print out the result - System.out.printf("Transcript : %s\n", alternative.getTranscript()); - System.out.printf("Channel Tag : %s\n\n", result.getChannelTag()); - } - } - } - // [END speech_transcribe_multichannel_gcs_beta] - - // [START speech_transcribe_multilanguage_beta] - - /** - * Transcribe a local audio file with multi-language recognition - * - * @param fileName the path to the audio file - */ - public static void transcribeMultiLanguage(String fileName) throws Exception { - Path path = Paths.get(fileName); - // Get the contents of the local audio file - byte[] content = Files.readAllBytes(path); - - try (SpeechClient speechClient = SpeechClient.create()) { - - RecognitionAudio recognitionAudio = - RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build(); - ArrayList languageList = new ArrayList<>(); - languageList.add("es-ES"); - languageList.add("en-US"); - - // Configure request to enable multiple languages - RecognitionConfig config = - RecognitionConfig.newBuilder() - .setEncoding(AudioEncoding.LINEAR16) - .setSampleRateHertz(16000) - .setLanguageCode("ja-JP") - .addAllAlternativeLanguageCodes(languageList) - .build(); - // Perform the transcription request - RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio); - - // Print out the results - for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) { - // There can be several alternative transcripts for a given chunk of speech. Just use the - // first (most likely) one here. - SpeechRecognitionAlternative alternative = result.getAlternatives(0); - System.out.format("Transcript : %s\n\n", alternative.getTranscript()); - } + SpeechRecognitionResult result = recognizeResponse.getResultsList().get(0); + // There can be several alternative transcripts for a given chunk of speech. Just use the + // first (most likely) one here. 
+ SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); + System.out.printf("Transcript : %s\n", alternative.getTranscript()); } } - // [END speech_transcribe_multilanguage_beta] - - // [START speech_transcribe_multilanguage_gcs_beta] + // [END speech_transcribe_model_selection] + // [START speech_transcribe_model_selection_gcs] /** - * Transcribe a remote audio file with multi-language recognition + * Performs transcription of the remote audio file asynchronously with the selected model. * - * @param gcsUri the path to the remote audio file + * @param gcsUri the path to the remote audio file to transcribe. */ - public static void transcribeMultiLanguageGcs(String gcsUri) throws Exception { - try (SpeechClient speechClient = SpeechClient.create()) { - - ArrayList languageList = new ArrayList<>(); - languageList.add("es-ES"); - languageList.add("en-US"); + public static void transcribeModelSelectionGcs(String gcsUri) throws Exception { + try (SpeechClient speech = SpeechClient.create()) { - // Configure request to enable multiple languages + // Configure request with video media type RecognitionConfig config = RecognitionConfig.newBuilder() + // encoding may either be omitted or must match the value in the file header .setEncoding(AudioEncoding.LINEAR16) + .setLanguageCode("en-US") + // sample rate hertz may be either be omitted or must match the value in the file + // header .setSampleRateHertz(16000) - .setLanguageCode("ja-JP") - .addAllAlternativeLanguageCodes(languageList) + .setModel("video") .build(); - // Set the remote path for the audio file RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build(); // Use non-blocking call for getting file transcription OperationFuture response = - speechClient.longRunningRecognizeAsync(config, audio); + speech.longRunningRecognizeAsync(config, audio); while (!response.isDone()) { System.out.println("Waiting for response..."); Thread.sleep(10000); } - for 
(SpeechRecognitionResult result : response.get().getResultsList()) { - - // There can be several alternative transcripts for a given chunk of speech. Just use the - // first (most likely) one here. - SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); - - // Print out the result - System.out.printf("Transcript : %s\n\n", alternative.getTranscript()); - } - } - } - // [END speech_transcribe_multilanguage_gcs_beta] - - // [START speech_transcribe_word_level_confidence_beta] - - /** - * Transcribe a local audio file with word level confidence - * - * @param fileName the path to the local audio file - */ - public static void transcribeWordLevelConfidence(String fileName) throws Exception { - Path path = Paths.get(fileName); - byte[] content = Files.readAllBytes(path); - - try (SpeechClient speechClient = SpeechClient.create()) { - RecognitionAudio recognitionAudio = - RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build(); - // Configure request to enable word level confidence - RecognitionConfig config = - RecognitionConfig.newBuilder() - .setEncoding(AudioEncoding.LINEAR16) - .setSampleRateHertz(16000) - .setLanguageCode("en-US") - .setEnableWordConfidence(true) - .build(); - // Perform the transcription request - RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio); - - // Print out the results - for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) { - // There can be several alternative transcripts for a given chunk of speech. Just use the - // first (most likely) one here. 
- SpeechRecognitionAlternative alternative = result.getAlternatives(0); - System.out.format("Transcript : %s\n", alternative.getTranscript()); - System.out.format( - "First Word and Confidence : %s %s \n", - alternative.getWords(0).getWord(), alternative.getWords(0).getConfidence()); - } - } - } - // [END speech_transcribe_word_level_confidence_beta] - - // [START speech_transcribe_word_level_confidence_gcs_beta] - - /** - * Transcribe a remote audio file with word level confidence - * - * @param gcsUri path to the remote audio file - */ - public static void transcribeWordLevelConfidenceGcs(String gcsUri) throws Exception { - try (SpeechClient speechClient = SpeechClient.create()) { - - // Configure request to enable word level confidence - RecognitionConfig config = - RecognitionConfig.newBuilder() - .setEncoding(AudioEncoding.FLAC) - .setSampleRateHertz(16000) - .setLanguageCode("en-US") - .setEnableWordConfidence(true) - .build(); - - // Set the remote path for the audio file - RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build(); - - // Use non-blocking call for getting file transcription - OperationFuture response = - speechClient.longRunningRecognizeAsync(config, audio); + List results = response.get().getResultsList(); - while (!response.isDone()) { - System.out.println("Waiting for response..."); - Thread.sleep(10000); - } // Just print the first result here. - SpeechRecognitionResult result = response.get().getResultsList().get(0); - + SpeechRecognitionResult result = results.get(0); // There can be several alternative transcripts for a given chunk of speech. Just use the // first (most likely) one here. 
SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0); - // Print out the result System.out.printf("Transcript : %s\n", alternative.getTranscript()); - System.out.format( - "First Word and Confidence : %s %s \n", - alternative.getWords(0).getWord(), alternative.getWords(0).getConfidence()); } } - // [END speech_transcribe_word_level_confidence_gcs_beta] + // [END speech_transcribe_model_selection_gcs] } diff --git a/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java b/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java index 2a36ac3922a..3bc3d5f1611 100644 --- a/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java +++ b/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java @@ -37,17 +37,13 @@ public class RecognizeIT { // The path to the audio file to transcribe private String audioFileName = "./resources/audio.raw"; - private String multiChannelAudioFileName = "./resources/commercial_stereo.wav"; - private String gcsMultiChannelAudioPath = "gs://" + BUCKET + "/speech/commercial_stereo.wav"; private String gcsAudioPath = "gs://" + BUCKET + "/speech/brooklyn.flac"; - private String gcsDiarizationAudioPath = "gs://" + BUCKET + "/speech/commercial_mono.wav"; + private String recognitionAudioFile = "./resources/commercial_mono.wav"; // The path to the video file to transcribe private String videoFileName = "./resources/Google_Gnome.wav"; private String gcsVideoPath = "gs://" + BUCKET + "/speech/Google_Gnome.wav"; - private String recognitionAudioFile = "./resources/commercial_mono.wav"; - @Before public void setUp() { bout = new ByteArrayOutputStream(); @@ -111,22 +107,6 @@ public void testStreamRecognize() throws Exception { assertThat(got).contains("how old is the Brooklyn Bridge"); } - @Test - public void testModelSelection() throws Exception { - Recognize.transcribeModelSelection(videoFileName); - String got = bout.toString(); - assertThat(got).contains("OK Google"); - 
assertThat(got).contains("the weather outside is sunny"); - } - - @Test - public void testGcsModelSelection() throws Exception { - Recognize.transcribeModelSelectionGcs(gcsVideoPath); - String got = bout.toString(); - assertThat(got).contains("OK Google"); - assertThat(got).contains("the weather outside is sunny"); - } - @Test public void testAutoPunctuation() throws Exception { Recognize.transcribeFileWithAutomaticPunctuation(audioFileName); @@ -156,67 +136,18 @@ public void testEnhancedModel() throws Exception { } @Test - public void testMetadata() throws Exception { - Recognize.transcribeFileWithMetadata(recognitionAudioFile); - String got = bout.toString(); - assertThat(got).contains("Chrome"); - } - - @Test - public void testTranscribeDiarization() throws Exception { - Recognize.transcribeDiarization(recognitionAudioFile); - String got = bout.toString(); - assertThat(got).contains("Speaker Tag 2:"); - } - - @Test - public void testTranscribeDiarizationGcs() throws Exception { - Recognize.transcribeDiarizationGcs(gcsDiarizationAudioPath); - String got = bout.toString(); - assertThat(got).contains("Speaker Tag 2:"); - } - - @Test - public void testTranscribeMultiChannel() throws Exception { - Recognize.transcribeMultiChannel(multiChannelAudioFileName); - String got = bout.toString(); - assertThat(got).contains("Channel Tag : 1"); - } - - @Test - public void testTranscribeMultiChannelGcs() throws Exception { - Recognize.transcribeMultiChannelGcs(gcsMultiChannelAudioPath); - String got = bout.toString(); - assertThat(got).contains("Channel Tag : 1"); - } - - @Test - public void testTranscribeMultiLanguage() throws Exception { - Recognize.transcribeMultiLanguage(videoFileName); - String got = bout.toString(); - assertThat(got).contains("Transcript : OK Google"); - } - - @Test - public void testTranscribeMultiLanguageGcs() throws Exception { - Recognize.transcribeMultiLanguageGcs(gcsVideoPath); - String got = bout.toString(); - assertThat(got).contains("Transcript : 
OK Google"); - } - - @Test - public void testTranscribeWordLevelConfidence() throws Exception { - Recognize.transcribeWordLevelConfidence(audioFileName); + public void testModelSelection() throws Exception { + Recognize.transcribeModelSelection(videoFileName); String got = bout.toString(); - assertThat(got).contains("Transcript : how old is the Brooklyn Bridge"); - assertThat(got).contains("First Word and Confidence : how"); + assertThat(got).contains("OK Google"); + assertThat(got).contains("the weather outside is sunny"); } @Test - public void testTranscribeWordLevelConfidenceGcs() throws Exception { - Recognize.transcribeWordLevelConfidenceGcs(gcsAudioPath); + public void testGcsModelSelection() throws Exception { + Recognize.transcribeModelSelectionGcs(gcsVideoPath); String got = bout.toString(); - assertThat(got).contains("Transcript : how old is the Brooklyn Bridge"); - assertThat(got).contains("First Word and Confidence : how"); + assertThat(got).contains("OK Google"); + assertThat(got).contains("the weather outside is sunny"); } }